diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts index b6e3fbce..997a4ee5 100644 --- a/src/Arrow.dom.ts +++ b/src/Arrow.dom.ts @@ -48,7 +48,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, + Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -97,5 +97,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, LargeUtf8Builder + Utf8Builder, Utf8ViewBuilder, LargeUtf8Builder } from './Arrow.js'; diff --git a/src/Arrow.ts b/src/Arrow.ts index f31f91a7..1de9f1d9 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -36,7 +36,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, + Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -78,6 +78,7 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { Utf8ViewBuilder } from './builder/utf8view.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; diff --git a/src/builder.ts b/src/builder.ts index 5ae43a88..c0ad184e 100644 --- a/src/builder.ts +++ b/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, LargeUtf8, Binary, LargeBinary, List, Map_, + Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -357,7 +357,7 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; diff --git a/src/builder/utf8view.ts b/src/builder/utf8view.ts new file mode 100644 index 00000000..87cd7518 --- /dev/null +++ b/src/builder/utf8view.ts @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Utf8View } from '../type.js'; +import { encodeUtf8 } from '../util/utf8.js'; +import { BinaryBuilder } from './binary.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class Utf8ViewBuilder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(Uint8Array); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: string) { + return super.setValue(index, encodeUtf8(value) as any); + } + // @ts-ignore + protected _flushPending(pending: Map, pendingLength: number): void { } +} + +(Utf8ViewBuilder.prototype as any)._flushPending = (BinaryBuilder.prototype as any)._flushPending; diff --git a/src/data.ts b/src/data.ts index 45fcc35d..96e1ba1b 100644 --- a/src/data.ts +++ b/src/data.ts @@ -256,7 +256,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -311,6 +311,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitUtf8View(props: Utf8ViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toInt32Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitLargeUtf8(props: LargeUtf8DataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -457,6 +465,7 @@ interface FixedSizeBinaryDataProps extends DataProps_ interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface Utf8ViewDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } @@ -482,6 +491,7 @@ export type DataProps = ( T extends Binary /* */ ? BinaryDataProps : T extends LargeBinary /* */ ? LargeBinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : + T extends Utf8View /* */ ? Utf8ViewDataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : @@ -510,6 +520,7 @@ export function makeData(props: FixedSizeBinaryDataPr export function makeData(props: BinaryDataProps): Data; export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; +export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 73d95538..3154faba 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,7 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + Utf8View = 24, /** Like Utf8 but may point to one of potentially several data buffers or may contain the characters inline **/ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, diff --git a/src/factories.ts b/src/factories.ts index 657ae1b9..d7991ae0 100644 --- a/src/factories.ts +++ b/src/factories.ts @@ -65,6 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; +export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; diff --git a/src/fb/Schema_generated.ts b/src/fb/Schema_generated.ts index 405b33c7..01415934 100644 --- a/src/fb/Schema_generated.ts +++ b/src/fb/Schema_generated.ts @@ -34,3 +34,4 @@ export { Type, unionToType, unionListToType } from './type.js'; export { Union } from './union.js'; export { UnionMode } from './union-mode.js'; export { Utf8 } from './utf8.js'; +export { Utf8View } from './utf8-view.js'; diff --git a/src/fb/type.ts b/src/fb/type.ts index 8eb87042..337e0f6c 100644 --- a/src/fb/type.ts +++ b/src/fb/type.ts @@ -22,6 +22,7 @@ import { Time } from './time.js'; import { Timestamp } from './timestamp.js'; import { Union } from './union.js'; import { Utf8 } from './utf8.js'; +import { Utf8View } from './utf8-view.js'; /** @@ -52,13 +53,14 @@ export enum Type { LargeBinary = 19, LargeUtf8 = 20, LargeList = 21, - RunEndEncoded = 22 + RunEndEncoded = 22, + Utf8View = 24 } export function unionToType( type: Type, - accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { + accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null +): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { case 'NONE': return null; case 'Null': return accessor(new Null())! as Null; @@ -83,15 +85,16 @@ export function unionToType( case 'LargeUtf8': return accessor(new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(new RunEndEncoded())! as RunEndEncoded; + case 'Utf8View': return accessor(new Utf8View())! as Utf8View; default: return null; } } export function unionListToType( type: Type, - accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null, + accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null, index: number -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { +): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { case 'NONE': return null; case 'Null': return accessor(index, new Null())! as Null; @@ -116,6 +119,7 @@ export function unionListToType( case 'LargeUtf8': return accessor(index, new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(index, new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(index, new RunEndEncoded())! as RunEndEncoded; + case 'Utf8View': return accessor(index, new Utf8View())! as Utf8View; default: return null; } } diff --git a/src/fb/utf8-view.ts b/src/fb/utf8-view.ts new file mode 100644 index 00000000..886a9df7 --- /dev/null +++ b/src/fb/utf8-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Utf8, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. + */ +export class Utf8View { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):Utf8View { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startUtf8View(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + Utf8View.startUtf8View(builder); + return Utf8View.endUtf8View(builder); +} +} diff --git a/src/interfaces.ts b/src/interfaces.ts index 0645753b..150d2422 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -41,6 +41,7 @@ import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; import type { MapBuilder } from './builder/map.js'; import type { StructBuilder } from './builder/struct.js'; import type { UnionBuilder, SparseUnionBuilder, DenseUnionBuilder } from './builder/union.js'; +import type { Utf8ViewBuilder } from "./builder/utf8view.js"; /** @ignore */ type FloatArray = Float32Array | Float64Array; /** @ignore */ type IntArray = Int8Array | Int16Array | Int32Array; @@ -209,6 +210,7 @@ export type TypeToDataType = { [Type.Float64]: type.Float64; [Type.Float]: type.Float; [Type.Utf8]: type.Utf8; + [Type.Utf8View]: type.Utf8View; [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.LargeBinary]: type.LargeBinary; @@ -265,6 +267,7 @@ type TypeToBuilder = { [Type.Float64]: Float64Builder; [Type.Float]: FloatBuilder; [Type.Utf8]: Utf8Builder; + [Type.Utf8View]: Utf8ViewBuilder; [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.LargeBinary]: LargeBinaryBuilder; @@ -321,6 +324,7 @@ type DataTypeToBuilder = { [Type.Float64]: T extends type.Float64 ? Float64Builder : never; [Type.Float]: T extends type.Float ? FloatBuilder : never; [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; + [Type.Utf8View]: T extends type.Utf8View ? Utf8ViewBuilder : never; [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; diff --git a/src/ipc/metadata/json.ts b/src/ipc/metadata/json.ts index bb88f0da..c32a6169 100644 --- a/src/ipc/metadata/json.ts +++ b/src/ipc/metadata/json.ts @@ -18,7 +18,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, + Utf8, Utf8View, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -149,6 +149,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'binary': return new Binary(); case 'largebinary': return new LargeBinary(); case 'utf8': return new Utf8(); + case 'utf8view': return new Utf8View(); case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index d3428972..935cd8cd 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import { DataType, Dictionary, TimeBitWidth, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, - Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, Utf8View, } from '../../type.js'; /** @@ -432,6 +432,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Binary']: return new Binary(); case Type['LargeBinary']: return new LargeBinary(); case Type['Utf8']: return new Utf8(); + case Type['Utf8View']: return new Utf8View(); case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); diff --git a/src/type.ts b/src/type.ts index ea5e24fa..b944d815 100644 --- a/src/type.ts +++ b/src/type.ts @@ -60,6 +60,7 @@ export abstract class DataType { })(Utf8.prototype); } +/** @ignore */ +export interface Utf8View extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } +/** @ignore */ +export class Utf8View extends DataType { + constructor() { + super(Type.Utf8View); + } + public toString() { return `Utf8View`; } + protected static [Symbol.toStringTag] = ((proto: Utf8View) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8View'; + })(Utf8View.prototype); +} + /** @ignore */ export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ diff --git a/src/visitor.ts b/src/visitor.ts index 977e0a4e..ab380883 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -36,6 +36,7 @@ export abstract class Visitor { public visitInt(_node: any, ..._args: any[]): any { return null; } public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } + public visitUtf8View(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } @@ -91,6 +92,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.Utf8View: fn = visitor.visitUtf8View; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.LargeBinary: fn = visitor.visitLargeBinary; break; @@ -158,6 +160,7 @@ function inferDType(type: T): Type { case Type.Binary: return Type.Binary; case Type.LargeBinary: return Type.LargeBinary; case Type.Utf8: return Type.Utf8; + case Type.Utf8View: return Type.Utf8View; case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; @@ -237,6 +240,7 @@ export interface Visitor { visitFloat32?(node: any, ...args: any[]): any; visitFloat64?(node: any, ...args: any[]): any; visitUtf8(node: any, ...args: any[]): any; + visitUtf8View(node: any, ...args: any[]): any; visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; visitLargeBinary(node: any, ...args: any[]): any; diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index 791576b0..f7884f1c 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -42,6 +42,7 @@ import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; import { LargeUtf8Builder } from '../builder/largeutf8.js'; +import { Utf8ViewBuilder } from '../builder/utf8view.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -69,6 +70,7 @@ export class GetBuilderCtor extends Visitor { public visitFloat32() { return Float32Builder; } public visitFloat64() { return Float64Builder; } public visitUtf8() { return Utf8Builder; } + public visitUtf8View() { return Utf8ViewBuilder; } public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } public visitLargeBinary() { return LargeBinaryBuilder; } diff --git a/src/visitor/get.ts b/src/visitor/get.ts index a5502dd3..bc303c3f 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -37,7 +37,7 @@ import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, - IntervalMonthDayNano, + IntervalMonthDayNano, Utf8View, } from '../type.js'; /** @ignore */ @@ -62,6 +62,7 @@ export interface GetVisitor extends Visitor { visitFloat32(data: Data, index: number): T['TValue'] | null; visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; + visitUtf8View(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitLargeBinary(data: Data, index: number): T['TValue'] | null; @@ -149,7 +150,7 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ -const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { +const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; @@ -331,6 +332,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitUtf8View = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 3a4d1171..a88d537a 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -33,7 +33,7 @@ import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, - IntervalMonthDayNano, + IntervalMonthDayNano, Utf8View, } from '../type.js'; /** @ignore */ @@ -58,6 +58,7 @@ export interface IndexOfVisitor extends Visitor { visitFloat32(data: Data, value: T['TValue'] | null, index?: number): number; visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitUtf8View(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; @@ -176,6 +177,7 @@ IndexOfVisitor.prototype.visitFloat16 = indexOfValue; IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitUtf8View = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; diff --git a/src/visitor/iterator.ts b/src/visitor/iterator.ts index 9f2844b3..190474d8 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -30,7 +30,7 @@ import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, - IntervalMonthDayNano, + IntervalMonthDayNano, Utf8View, } from '../type.js'; import { ChunkedIterator } from '../util/chunk.js'; @@ -56,6 +56,7 @@ export interface IteratorVisitor extends Visitor { visitFloat32(vector: Vector): IterableIterator; visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; + visitUtf8View(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitLargeBinary(vector: Vector): IterableIterator; @@ -163,6 +164,7 @@ IteratorVisitor.prototype.visitFloat16 = vectorIterator; IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; +IteratorVisitor.prototype.visitUtf8View = vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitLargeBinary = vectorIterator; diff --git a/src/visitor/jsontypeassembler.ts b/src/visitor/jsontypeassembler.ts index 823b1dea..a1e08a81 100644 --- a/src/visitor/jsontypeassembler.ts +++ b/src/visitor/jsontypeassembler.ts @@ -51,6 +51,9 @@ export class JSONTypeAssembler extends Visitor { public visitUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitUtf8View({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitLargeUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 6841b39d..b70831f8 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -28,7 +28,7 @@ import { toIntervalDayTimeObjects, toIntervalMonthDayNanoObjects } from '../util import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, Utf8View } from '../type.js'; /** @ignore */ @@ -43,6 +43,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; + visitUtf8View(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeBinary(data: Data): { DATA: string[]; OFFSET: string[] }; @@ -103,6 +104,9 @@ export class JSONVectorAssembler extends Visitor { public visitUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } + public visitUtf8View(data: Data) { + return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; + } public visitLargeUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 4bf632ba..4641fcae 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -35,7 +35,7 @@ import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, - IntervalMonthDayNano, + IntervalMonthDayNano, Utf8View, } from '../type.js'; /** @ignore */ @@ -60,6 +60,7 @@ export interface SetVisitor extends Visitor { visitFloat32(data: Data, index: number, value: T['TValue']): void; visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; + visitUtf8View(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitLargeBinary(data: Data, index: number, value: T['TValue']): void; @@ -155,7 +156,7 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ -const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -358,6 +359,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitUtf8View = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index 169f3627..9afb66ed 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -42,6 +42,7 @@ import { DictionaryEncoding } from '../fb/dictionary-encoding.js'; import { FixedSizeBinary } from '../fb/fixed-size-binary.js'; import { FixedSizeList } from '../fb/fixed-size-list.js'; import { Map as Map_ } from '../fb/map.js'; +import { Utf8View } from '../fb/utf8-view.js'; /** @ignore */ export interface TypeAssembler extends Visitor { @@ -84,6 +85,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitUtf8View(_node: T, b: Builder) { + Utf8View.startUtf8View(b); + return Utf8View.endUtf8View(b); + } public visitLargeUtf8(_node: T, b: Builder) { LargeUtf8.startLargeUtf8(b); return LargeUtf8.endLargeUtf8(b); diff --git a/src/visitor/typecomparator.ts b/src/visitor/typecomparator.ts index 65413ccd..fe224607 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -30,7 +30,7 @@ import { Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, Union, DenseUnion, SparseUnion, - IntervalMonthDayNano, + IntervalMonthDayNano, Utf8View, } from '../type.js'; /** @ignore */ @@ -54,6 +54,7 @@ export interface TypeComparator extends Visitor { visitFloat32(type: T, other?: DataType | null): other is T; visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; + visitUtf8View(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitLargeBinary(type: T, other?: DataType | null): other is T; @@ -253,6 +254,7 @@ TypeComparator.prototype.visitFloat16 = compareFloat; TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; +TypeComparator.prototype.visitUtf8View = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitLargeBinary = compareAny; diff --git a/src/visitor/typector.ts b/src/visitor/typector.ts index 2aab6d3d..875bca1f 100644 --- a/src/visitor/typector.ts +++ b/src/visitor/typector.ts @@ -49,6 +49,7 @@ export class GetDataTypeConstructor extends Visitor { public visitFloat32() { return type.Float32; } public visitFloat64() { return type.Float64; } public visitUtf8() { return type.Utf8; } + public visitUtf8View() { return type.Utf8View; } public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } public visitLargeBinary() { return type.LargeBinary; } diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index 7dc36955..97379e20 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Utf8View } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -42,6 +42,7 @@ export interface VectorAssembler extends Visitor { visitInt(data: Data): this; visitFloat(data: Data): this; visitUtf8(data: Data): this; + visitUtf8View(data: Data): this; visitLargeUtf8(data: Data): this; visitBinary(data: Data): this; visitLargeBinary(data: Data): this; @@ -204,7 +205,7 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { +function assembleFlatListVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const begin = bigIntToNumber(valueOffsets[0]); const end = bigIntToNumber(valueOffsets[length]); @@ -238,6 +239,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitUtf8View = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; diff --git a/src/visitor/vectorloader.ts b/src/visitor/vectorloader.ts index 198c32ff..cb31ade4 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -72,6 +72,9 @@ export class VectorLoader extends Visitor { public visitUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitUtf8View(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } @@ -177,7 +180,7 @@ export class JSONVectorLoader extends VectorLoader { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); - } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { + } else if (DataType.isUtf8(type) || DataType.isUtf8View(type) || DataType.isLargeUtf8(type)) { return encodeUtf8((sources[offset] as string[]).join('')); } else if (DataType.isInterval(type)) { switch (type.unit) { diff --git a/test/data/tables.ts b/test/data/tables.ts index e9674d9b..d445b2e5 100644 --- a/test/data/tables.ts +++ b/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'largeBinary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'utf8View', 'binary', 'largeBinary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', 'intervalMonthDayNano', diff --git a/test/generate-test-data.ts b/test/generate-test-data.ts index de4a8269..034ff22c 100644 --- a/test/generate-test-data.ts +++ b/test/generate-test-data.ts @@ -39,7 +39,7 @@ import { Map_, DateUnit, TimeUnit, UnionMode, util, - IntervalUnit + IntervalUnit, Utf8View } from 'apache-arrow'; import { randomString } from './random-string.js'; @@ -53,6 +53,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; @@ -78,6 +79,7 @@ interface TestDataVectorGenerator extends Visitor { visitUint64: typeof generateBigInt; visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; + visitUtf8View: typeof generateUtf8View; visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; visitLargeBinary: typeof generateLargeBinary; @@ -105,6 +107,7 @@ TestDataVectorGenerator.prototype.visitInt64 = generateBigInt; TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; +TestDataVectorGenerator.prototype.visitUtf8View = generateUtf8View; TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitLargeBinary = generateLargeBinary; @@ -221,6 +224,7 @@ export const float16 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float32(), length, nullCount); export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); +export const utf8View = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8View(), length, nullCount); export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const largeBinary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeBinary(), length, nullCount); @@ -252,7 +256,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, largeBinary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, utf8View, largeUtf8, binary, largeBinary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -342,6 +346,28 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateUtf8View(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); + const values: string[] = new Array(valueOffsets.length - 1).fill(null); + [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) + .reduce((map, length, i) => { + if (length !== null) { + if (length > 0) { + do { + values[i] = randomString(length); + } while (map.has(values[i])); + return map.set(values[i], i); + } + values[i] = ''; + } + return map; + }, new Map()); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => encodeUtf8(values[i])); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); diff --git a/test/unit/builders/builder-tests.ts b/test/unit/builders/builder-tests.ts index e8684010..31c4f801 100644 --- a/test/unit/builders/builder-tests.ts +++ b/test/unit/builders/builder-tests.ts @@ -44,6 +44,7 @@ describe('Generated Test Data', () => { describe('Float32Builder', () => { validateBuilder(generate.float32); }); describe('Float64Builder', () => { validateBuilder(generate.float64); }); describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); + describe('Utf8ViewBuilder', () => { validateBuilder(generate.utf8View); }); describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); describe('LargeBinaryBuilder', () => { validateBuilder(generate.largeBinary); }); diff --git a/test/unit/builders/utf8view-tests.ts b/test/unit/builders/utf8view-tests.ts new file mode 100644 index 00000000..43f405dd --- /dev/null +++ b/test/unit/builders/utf8view-tests.ts @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import 'web-streams-polyfill'; + +import { validateVector } from './utils.js'; +import { + encodeAll, + encodeEach, + encodeEachDOM, + encodeEachNode, + stringsNoNulls, + stringsWithNAs, + stringsWithNulls, + stringsWithEmpties +} from './utils.js'; + +import { Vector, Utf8View } from 'apache-arrow'; + +const testDOMStreams = process.env.TEST_DOM_STREAMS === 'true'; +const testNodeStreams = process.env.TEST_NODE_STREAMS === 'true'; + +describe('Utf8ViewBuilder', () => { + runTestsWithEncoder('encodeAll', encodeAll(() => new Utf8View())); + runTestsWithEncoder('encodeEach: 5', encodeEach(() => new Utf8View(), 5)); + runTestsWithEncoder('encodeEach: 25', encodeEach(() => new Utf8View(), 25)); + runTestsWithEncoder('encodeEach: undefined', encodeEach(() => new Utf8View(), void 0)); + testDOMStreams && runTestsWithEncoder('encodeEachDOM: 25', encodeEachDOM(() => new Utf8View(), 25)); + testNodeStreams && runTestsWithEncoder('encodeEachNode: 25', encodeEachNode(() => new Utf8View(), 25)); +}); + +function runTestsWithEncoder(name: string, encode: (vals: (string | null)[], nullVals?: any[]) => Promise>) { + describe(`${encode.name} ${name}`, () => { + it(`encodes strings no nulls`, async () => { + const vals = stringsNoNulls(20); + validateVector(vals, await encode(vals, []), []); + }); + it(`encodes strings with nulls`, async () => { + const vals = stringsWithNulls(20); + validateVector(vals, await encode(vals, [null]), [null]); + }); + it(`encodes strings using n/a as the null value rep`, async () => { + const vals = stringsWithNAs(20); + validateVector(vals, await encode(vals, ['n/a']), ['n/a']); + }); + it(`encodes strings using \\0 as the null value rep`, async () => { + const vals = stringsWithEmpties(20); + validateVector(vals, await encode(vals, ['\0']), ['\0']); + }); + }); +} diff --git a/test/unit/generated-data-tests.ts b/test/unit/generated-data-tests.ts index 9affe5f6..6e14ae97 100644 --- a/test/unit/generated-data-tests.ts +++ b/test/unit/generated-data-tests.ts @@ -38,6 +38,7 @@ describe('Generated Test Data', () => { describe('Float32', () => { validateVector(generate.float32()); }); describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); + describe('Utf8View', () => { validateVector(generate.utf8View()); }); describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('LargeBinary', () => { validateVector(generate.largeBinary()); }); diff --git a/test/unit/vector/vector-tests.ts b/test/unit/vector/vector-tests.ts index 73c9cdbb..f373e6a4 100644 --- a/test/unit/vector/vector-tests.ts +++ b/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, Utf8View, } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -234,6 +234,28 @@ describe(`Utf8Vector`, () => { }); }); +describe(`Utf8ViewVector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new Utf8View); + + test(`has utf8view type`, () => { + expect(vector.type).toBeInstanceOf(Utf8View); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + describe(`LargeUtf8Vector`, () => { const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; const vector = vectorFromArray(values, new LargeUtf8); diff --git a/test/unit/visitor-tests.ts b/test/unit/visitor-tests.ts index 6ecb6cca..62eabe30 100644 --- a/test/unit/visitor-tests.ts +++ b/test/unit/visitor-tests.ts @@ -26,7 +26,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, - Union, DenseUnion, SparseUnion, + Union, DenseUnion, SparseUnion, Utf8View, } from 'apache-arrow'; class BasicVisitor extends Visitor { @@ -36,6 +36,7 @@ class BasicVisitor extends Visitor { public visitInt(type: T) { return (this.type = type); } public visitFloat(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitUtf8View(type: T) { return (this.type = type); } public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitLargeBinary(type: T) { return (this.type = type); } @@ -70,6 +71,7 @@ class FeatureVisitor extends Visitor { public visitFloat32(type: T) { return (this.type = type); } public visitFloat64(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitUtf8View(type: T) { return (this.type = type); } public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitLargeBinary(type: T) { return (this.type = type); } @@ -108,6 +110,7 @@ describe('Visitor', () => { test(`visits Int types`, () => validateBasicVisitor(new Int(true, 32))); test(`visits Float types`, () => validateBasicVisitor(new Float(0))); test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); + test(`visits Utf8View types`, () => validateBasicVisitor(new Utf8View())); test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); test(`visits LargeBinary types`, () => validateBasicVisitor(new LargeBinary())); @@ -150,6 +153,7 @@ describe('Visitor', () => { test(`visits Float32 types`, () => validateFeatureVisitor(new Float32())); test(`visits Float64 types`, () => validateFeatureVisitor(new Float64())); test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); + test(`visits Utf8View types`, () => validateFeatureVisitor(new Utf8View())); test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); test(`visits LargeBinary types`, () => validateFeatureVisitor(new LargeBinary()));