Skip to content

Commit c2433e6

Browse files
authoredAug 13, 2024
fix: Make fewer copies when building a string array (#5503)
* fix: Make fewer copies when building a string array "In the last few months a performance degradation has crept in when processing flux. One possible cause was the changes that were made to the way string arrays were built, which always built a full array even if it would ultimately not be needed. This change ensures that a full arrow array is only created when necessary. * chore: update comment for review
1 parent 6ffcb71 commit c2433e6

File tree

2 files changed

+145
-33
lines changed

2 files changed

+145
-33
lines changed
 

‎array/array_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ func TestString(t *testing.T) {
2626
b.Append("abcdefghij")
2727
}
2828
},
29-
bsz: 256, // 64 bytes nulls + 192 bytes data.
30-
sz: 64, // The minimum size of a buffer is 64 bytes
29+
bsz: 64, // 64 bytes data.
30+
sz: 64, // The minimum size of a buffer is 64 bytes
3131
want: []interface{}{
3232
"abcdefghij",
3333
"abcdefghij",

‎array/builder.go

+143-31
Original file line numberDiff line numberDiff line change
@@ -2,53 +2,100 @@ package array
22

33
import (
44
"bytes"
5+
"sync/atomic"
6+
"unsafe"
57

8+
"github.com/apache/arrow/go/v7/arrow"
69
"github.com/apache/arrow/go/v7/arrow/array"
710
"github.com/apache/arrow/go/v7/arrow/bitutil"
811
"github.com/apache/arrow/go/v7/arrow/memory"
912
)
1013

1114
type StringBuilder struct {
12-
mem memory.Allocator
13-
builder *array.BinaryBuilder
14-
constant bool
15+
mem memory.Allocator
16+
len int
17+
cap int
18+
reserveData int
19+
buffer *memory.Buffer
20+
builder *array.BinaryBuilder
21+
refCount int64
1522
}
1623

1724
func NewStringBuilder(mem memory.Allocator) *StringBuilder {
1825
return &StringBuilder{
19-
mem: mem,
20-
builder: array.NewBinaryBuilder(mem, StringType),
21-
constant: true,
26+
mem: mem,
27+
len: 0,
28+
cap: 0,
29+
reserveData: 0,
30+
buffer: nil,
31+
builder: nil,
32+
refCount: 1,
2233
}
2334
}
2435

2536
func (b *StringBuilder) Retain() {
26-
b.builder.Retain()
37+
atomic.AddInt64(&b.refCount, 1)
2738
}
2839
func (b *StringBuilder) Release() {
29-
b.builder.Release()
40+
if atomic.AddInt64(&b.refCount, -1) == 0 {
41+
if b.buffer != nil {
42+
b.buffer.Release()
43+
}
44+
if b.builder != nil {
45+
b.builder.Release()
46+
}
47+
}
3048
}
3149
func (b *StringBuilder) Len() int {
32-
return b.builder.Len()
50+
if b.builder != nil {
51+
return b.builder.Len()
52+
}
53+
return b.len
3354
}
3455
func (b *StringBuilder) Cap() int {
35-
return b.builder.Cap()
56+
if b.builder != nil {
57+
return b.builder.Cap()
58+
}
59+
if b.cap > b.len {
60+
return b.cap
61+
}
62+
return b.len
3663
}
3764
func (b *StringBuilder) NullN() int {
38-
return b.builder.NullN()
65+
if b.builder != nil {
66+
return b.builder.NullN()
67+
}
68+
return 0
3969
}
4070

4171
func (b *StringBuilder) AppendBytes(buf []byte) {
42-
if b.builder.Len() > 0 {
43-
b.constant = b.constant && bytes.Equal(buf, b.builder.Value(0))
72+
if b.builder != nil {
73+
b.builder.Append(buf)
74+
return
75+
}
76+
if b.len == 0 {
77+
b.buffer = memory.NewResizableBuffer(b.mem)
78+
b.buffer.Resize(len(buf))
79+
copy(b.buffer.Bytes(), buf)
80+
b.len = 1
81+
return
4482
}
45-
b.builder.Append(buf)
83+
if bytes.Equal(b.buffer.Bytes(), buf) {
84+
b.len++
85+
return
86+
}
87+
b.makeBuilder(buf)
88+
4689
}
4790

48-
// Append appends a string to the array being built. The input string
49-
// will always be copied.
91+
// Append appends a string to the array being built. A reference
92+
// to the input string will not be retained by the builder. The
93+
// string will be copied, if necessary.
5094
func (b *StringBuilder) Append(v string) {
51-
b.AppendBytes([]byte(v))
95+
// Avoid copying the input string as AppendBytes
96+
// will never keep a reference or modify the input.
97+
bytes := unsafe.Slice(unsafe.StringData(v), len(v))
98+
b.AppendBytes(bytes)
5299
}
53100

54101
func (b *StringBuilder) AppendValues(v []string, valid []bool) {
@@ -61,38 +108,66 @@ func (b *StringBuilder) AppendValues(v []string, valid []bool) {
61108
}
62109
}
63110
func (b *StringBuilder) AppendNull() {
64-
b.constant = false
111+
if b.builder == nil {
112+
b.makeBuilder(nil)
113+
}
65114
b.builder.AppendNull()
66115
}
67116

68-
func (b *StringBuilder) UnsafeAppendBoolToBitmap(isValid bool) {
69-
b.builder.UnsafeAppendBoolToBitmap(isValid)
70-
}
71-
72117
func (b *StringBuilder) Reserve(n int) {
73-
b.builder.Reserve(n)
118+
if b.builder != nil {
119+
b.builder.Reserve(n)
120+
return
121+
}
122+
if b.len+n > b.cap {
123+
b.cap = b.len + n
124+
}
74125
}
75126

76127
func (b *StringBuilder) ReserveData(n int) {
77-
b.builder.ReserveData(n)
128+
if b.builder != nil {
129+
b.builder.ReserveData(n)
130+
return
131+
}
132+
b.reserveData = n
78133
}
79134

80135
func (b *StringBuilder) Resize(n int) {
81-
b.builder.Resize(n)
136+
if b.builder != nil {
137+
b.builder.Resize(n)
138+
}
139+
b.cap = n
140+
if b.len > n {
141+
b.len = n
142+
}
82143
}
83144

84145
func (b *StringBuilder) NewArray() Array {
85146
return b.NewStringArray()
86147
}
87148

88149
func (b *StringBuilder) NewStringArray() *String {
89-
arr := b.builder.NewBinaryArray()
90-
if !b.constant || arr.Len() < 1 {
91-
b.constant = true
92-
return &String{arr}
150+
if b.builder != nil {
151+
arr := &String{b.builder.NewBinaryArray()}
152+
b.builder.Release()
153+
b.builder = nil
154+
return arr
155+
}
156+
if b.buffer != nil {
157+
arr := &String{&repeatedBinary{
158+
len: b.len,
159+
buf: b.buffer,
160+
}}
161+
b.buffer = nil
162+
b.len = 0
163+
b.cap = 0
164+
return arr
93165
}
94-
defer arr.Release()
95-
return StringRepeat(arr.ValueString(0), arr.Len(), b.mem)
166+
// getting this far means we have an empty array.
167+
arr := StringRepeat("", b.len, b.mem)
168+
b.len = 0
169+
b.cap = 0
170+
return arr
96171
}
97172

98173
func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
@@ -110,6 +185,43 @@ func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
110185
}
111186
}
112187

188+
func (b *StringBuilder) makeBuilder(value []byte) {
189+
bufferLen := 0
190+
if b.buffer != nil {
191+
bufferLen = b.buffer.Len()
192+
}
193+
size := b.len
194+
if b.cap > b.len {
195+
size = b.cap
196+
}
197+
dataSize := b.len * bufferLen
198+
if value != nil {
199+
if b.cap <= b.len {
200+
size++
201+
}
202+
dataSize += len(value)
203+
}
204+
if b.reserveData > dataSize {
205+
dataSize = b.reserveData
206+
}
207+
b.builder = array.NewBinaryBuilder(b.mem, arrow.BinaryTypes.String)
208+
b.builder.Resize(size)
209+
b.builder.ReserveData(dataSize)
210+
for i := 0; i < b.len; i++ {
211+
b.builder.Append(b.buffer.Bytes())
212+
}
213+
if value != nil {
214+
b.builder.Append(value)
215+
}
216+
if b.buffer != nil {
217+
b.buffer.Release()
218+
b.buffer = nil
219+
}
220+
b.len = 0
221+
b.cap = 0
222+
b.reserveData = 0
223+
}
224+
113225
// Copy of Array.IsValid from arrow, allowing the IsValid check to be done without going through an interface
114226
func isValid(nullBitmapBytes []byte, offset int, i int) bool {
115227
return len(nullBitmapBytes) == 0 || bitutil.BitIsSet(nullBitmapBytes, offset+i)

0 commit comments

Comments
 (0)