// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arrow import ( "errors" "fmt" "strconv" "strings" "github.com/apache/arrow/go/v10/arrow/internal/debug" ) type NestedType interface { DataType Fields() []Field } // ListType describes a nested type in which each array slot contains // a variable-size sequence of values, all having the same relative type. type ListType struct { elem Field } func ListOfField(f Field) *ListType { if f.Type == nil { panic("arrow: nil type for list field") } return &ListType{elem: f} } // ListOf returns the list type with element type t. // For example, if t represents int32, ListOf(t) represents []int32. // // ListOf panics if t is nil or invalid. NullableElem defaults to true func ListOf(t DataType) *ListType { if t == nil { panic("arrow: nil DataType") } return &ListType{elem: Field{Name: "item", Type: t, Nullable: true}} } // ListOfNonNullable is like ListOf but NullableElem defaults to false, indicating // that the child type should be marked as non-nullable. func ListOfNonNullable(t DataType) *ListType { if t == nil { panic("arrow: nil DataType") } return &ListType{elem: Field{Name: "item", Type: t, Nullable: false}} } func (*ListType) ID() Type { return LIST } func (*ListType) Name() string { return "list" } func (t *ListType) String() string { if t.elem.Nullable { return fmt.Sprintf("list<%s: %s, nullable>", t.elem.Name, t.elem.Type) } return fmt.Sprintf("list<%s: %s>", t.elem.Name, t.elem.Type) } func (t *ListType) Fingerprint() string { child := t.elem.Type.Fingerprint() if len(child) > 0 { return typeFingerprint(t) + "{" + child + "}" } return "" } func (t *ListType) SetElemMetadata(md Metadata) { t.elem.Metadata = md } func (t *ListType) SetElemNullable(n bool) { t.elem.Nullable = n } // Elem returns the ListType's element type. func (t *ListType) Elem() DataType { return t.elem.Type } func (t *ListType) ElemField() Field { return t.elem } func (t *ListType) Fields() []Field { return []Field{t.ElemField()} } func (ListType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes)}} } func (ListType) OffsetTypeTraits() OffsetTraits { return Int32Traits } type LargeListType struct { ListType } func (LargeListType) ID() Type { return LARGE_LIST } func (LargeListType) Name() string { return "large_list" } func (t *LargeListType) String() string { return "large_" + t.ListType.String() } func (t *LargeListType) Fingerprint() string { child := t.elem.Type.Fingerprint() if len(child) > 0 { return typeFingerprint(t) + "{" + child + "}" } return "" } func (LargeListType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int64SizeBytes)}} } func (LargeListType) OffsetTypeTraits() OffsetTraits { return Int64Traits } func LargeListOfField(f Field) *LargeListType { if f.Type == nil { panic("arrow: nil type for list field") } return &LargeListType{ListType{elem: f}} } // ListOf returns the list type with element type t. // For example, if t represents int32, ListOf(t) represents []int32. // // ListOf panics if t is nil or invalid. NullableElem defaults to true func LargeListOf(t DataType) *LargeListType { if t == nil { panic("arrow: nil DataType") } return &LargeListType{ListType{elem: Field{Name: "item", Type: t, Nullable: true}}} } // ListOfNonNullable is like ListOf but NullableElem defaults to false, indicating // that the child type should be marked as non-nullable. func LargeListOfNonNullable(t DataType) *LargeListType { if t == nil { panic("arrow: nil DataType") } return &LargeListType{ListType{elem: Field{Name: "item", Type: t, Nullable: false}}} } // FixedSizeListType describes a nested type in which each array slot contains // a fixed-size sequence of values, all having the same relative type. type FixedSizeListType struct { n int32 // number of elements in the list elem Field } func FixedSizeListOfField(n int32, f Field) *FixedSizeListType { if f.Type == nil { panic("arrow: nil DataType") } if n <= 0 { panic("arrow: invalid size") } return &FixedSizeListType{n: n, elem: f} } // FixedSizeListOf returns the list type with element type t. // For example, if t represents int32, FixedSizeListOf(10, t) represents [10]int32. // // FixedSizeListOf panics if t is nil or invalid. // FixedSizeListOf panics if n is <= 0. // NullableElem defaults to true func FixedSizeListOf(n int32, t DataType) *FixedSizeListType { if t == nil { panic("arrow: nil DataType") } if n <= 0 { panic("arrow: invalid size") } return &FixedSizeListType{n: n, elem: Field{Name: "item", Type: t, Nullable: true}} } // FixedSizeListOfNonNullable is like FixedSizeListOf but NullableElem defaults to false // indicating that the child type should be marked as non-nullable. func FixedSizeListOfNonNullable(n int32, t DataType) *FixedSizeListType { if t == nil { panic("arrow: nil DataType") } if n <= 0 { panic("arrow: invalid size") } return &FixedSizeListType{n: n, elem: Field{Name: "item", Type: t, Nullable: false}} } func (*FixedSizeListType) ID() Type { return FIXED_SIZE_LIST } func (*FixedSizeListType) Name() string { return "fixed_size_list" } func (t *FixedSizeListType) String() string { if t.elem.Nullable { return fmt.Sprintf("fixed_size_list<%s: %s, nullable>[%d]", t.elem.Name, t.elem.Type, t.n) } return fmt.Sprintf("fixed_size_list<%s: %s>[%d]", t.elem.Name, t.elem.Type, t.n) } // Elem returns the FixedSizeListType's element type. func (t *FixedSizeListType) Elem() DataType { return t.elem.Type } // Len returns the FixedSizeListType's size. func (t *FixedSizeListType) Len() int32 { return t.n } func (t *FixedSizeListType) ElemField() Field { return t.elem } func (t *FixedSizeListType) Fingerprint() string { child := t.elem.Type.Fingerprint() if len(child) > 0 { return fmt.Sprintf("%s[%d]{%s}", typeFingerprint(t), t.n, child) } return "" } func (t *FixedSizeListType) Fields() []Field { return []Field{t.ElemField()} } func (FixedSizeListType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap()}} } // StructType describes a nested type parameterized by an ordered sequence // of relative types, called its fields. type StructType struct { fields []Field index map[string]int meta Metadata } // StructOf returns the struct type with fields fs. // // StructOf panics if there are duplicated fields. // StructOf panics if there is a field with an invalid DataType. func StructOf(fs ...Field) *StructType { n := len(fs) if n == 0 { return &StructType{} } t := &StructType{ fields: make([]Field, n), index: make(map[string]int, n), } for i, f := range fs { if f.Type == nil { panic("arrow: field with nil DataType") } t.fields[i] = Field{ Name: f.Name, Type: f.Type, Nullable: f.Nullable, Metadata: f.Metadata.clone(), } if _, dup := t.index[f.Name]; dup { panic(fmt.Errorf("arrow: duplicate field with name %q", f.Name)) } t.index[f.Name] = i } return t } func (*StructType) ID() Type { return STRUCT } func (*StructType) Name() string { return "struct" } func (t *StructType) String() string { var o strings.Builder o.WriteString("struct<") for i, f := range t.fields { if i > 0 { o.WriteString(", ") } o.WriteString(fmt.Sprintf("%s: %v", f.Name, f.Type)) } o.WriteString(">") return o.String() } func (t *StructType) Fields() []Field { return t.fields } func (t *StructType) Field(i int) Field { return t.fields[i] } func (t *StructType) FieldByName(name string) (Field, bool) { i, ok := t.index[name] if !ok { return Field{}, false } return t.fields[i], true } func (t *StructType) FieldIdx(name string) (int, bool) { i, ok := t.index[name] return i, ok } func (t *StructType) Fingerprint() string { var b strings.Builder b.WriteString(typeFingerprint(t)) b.WriteByte('{') for _, c := range t.fields { child := c.Fingerprint() if len(child) == 0 { return "" } b.WriteString(child) b.WriteByte(';') } b.WriteByte('}') return b.String() } func (StructType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap()}} } type MapType struct { value *ListType KeysSorted bool } func MapOf(key, item DataType) *MapType { if key == nil || item == nil { panic("arrow: nil key or item type for MapType") } return &MapType{value: ListOf(StructOf(Field{Name: "key", Type: key}, Field{Name: "value", Type: item, Nullable: true}))} } func (*MapType) ID() Type { return MAP } func (*MapType) Name() string { return "map" } func (t *MapType) String() string { var o strings.Builder o.WriteString(fmt.Sprintf("map<%s, %s", t.value.Elem().(*StructType).Field(0).Type, t.value.Elem().(*StructType).Field(1).Type)) if t.KeysSorted { o.WriteString(", keys_sorted") } o.WriteString(">") return o.String() } func (t *MapType) KeyField() Field { return t.value.Elem().(*StructType).Field(0) } func (t *MapType) KeyType() DataType { return t.KeyField().Type } func (t *MapType) ItemField() Field { return t.value.Elem().(*StructType).Field(1) } func (t *MapType) ItemType() DataType { return t.ItemField().Type } func (t *MapType) ValueType() *StructType { return t.value.Elem().(*StructType) } func (t *MapType) ValueField() Field { return Field{ Name: "entries", Type: t.ValueType(), } } func (t *MapType) SetItemNullable(nullable bool) { t.value.Elem().(*StructType).fields[1].Nullable = nullable } func (t *MapType) Fingerprint() string { keyFingerprint := t.KeyType().Fingerprint() itemFingerprint := t.ItemType().Fingerprint() if keyFingerprint == "" || itemFingerprint == "" { return "" } fingerprint := typeFingerprint(t) if t.KeysSorted { fingerprint += "s" } return fingerprint + "{" + keyFingerprint + itemFingerprint + "}" } func (t *MapType) Fields() []Field { return []Field{t.ValueField()} } func (t *MapType) Layout() DataTypeLayout { return t.value.Layout() } func (MapType) OffsetTypeTraits() OffsetTraits { return Int32Traits } type ( // UnionTypeCode is an alias to int8 which is the type of the ids // used for union arrays. UnionTypeCode = int8 UnionMode int8 ) const ( MaxUnionTypeCode UnionTypeCode = 127 InvalidUnionChildID int = -1 SparseMode UnionMode = iota // SPARSE DenseMode // DENSE ) // UnionType is an interface to encompass both Dense and Sparse Union types. // // A UnionType is a nested type where each logical value is taken // from a single child. A buffer of 8-bit type ids (typed as UnionTypeCode) // indicates which child a given logical value is to be taken from. This is // represented as the "child id" or "child index", which is the index into the // list of child fields for a given child. type UnionType interface { NestedType // Mode returns either SparseMode or DenseMode depending on the current // concrete data type. Mode() UnionMode // ChildIDs returns a slice of ints to map UnionTypeCode values to // the index in the Fields that represents the given Type. It is // initialized with all values being InvalidUnionChildID (-1) // before being populated based on the TypeCodes and fields of the type. // The field for a given type can be retrieved by Fields()[ChildIDs()[typeCode]] ChildIDs() []int // TypeCodes returns the list of available type codes for this union type // which will correspond to indexes into the ChildIDs slice to locate the // appropriate child. A union Array contains a buffer of these type codes // which indicate for a given index, which child has the value for that index. TypeCodes() []UnionTypeCode // MaxTypeCode returns the value of the largest TypeCode in the list of typecodes // that are defined by this Union type MaxTypeCode() UnionTypeCode } // UnionOf returns an appropriate union type for the given Mode (Sparse or Dense), // child fields, and type codes. len(fields) == len(typeCodes) must be true, or else // this will panic. len(fields) can be 0. func UnionOf(mode UnionMode, fields []Field, typeCodes []UnionTypeCode) UnionType { switch mode { case SparseMode: return SparseUnionOf(fields, typeCodes) case DenseMode: return DenseUnionOf(fields, typeCodes) default: panic("arrow: invalid union mode") } } type unionType struct { children []Field typeCodes []UnionTypeCode childIDs [int(MaxUnionTypeCode) + 1]int } func (t *unionType) init(fields []Field, typeCodes []UnionTypeCode) { // initialize all child IDs to -1 t.childIDs[0] = InvalidUnionChildID for i := 1; i < len(t.childIDs); i *= 2 { copy(t.childIDs[i:], t.childIDs[:i]) } t.children = fields t.typeCodes = typeCodes for i, tc := range t.typeCodes { t.childIDs[tc] = i } } func (t unionType) Fields() []Field { return t.children } func (t unionType) TypeCodes() []UnionTypeCode { return t.typeCodes } func (t unionType) ChildIDs() []int { return t.childIDs[:] } func (t *unionType) validate(fields []Field, typeCodes []UnionTypeCode, _ UnionMode) error { if len(fields) != len(typeCodes) { return errors.New("arrow: union types should have the same number of fields as type codes") } for _, c := range typeCodes { if c < 0 || c > MaxUnionTypeCode { return errors.New("arrow: union type code out of bounds") } } return nil } func (t *unionType) MaxTypeCode() (max UnionTypeCode) { if len(t.typeCodes) == 0 { return } max = t.typeCodes[0] for _, c := range t.typeCodes[1:] { if c > max { max = c } } return } func (t *unionType) String() string { var b strings.Builder b.WriteByte('<') for i := range t.typeCodes { if i != 0 { b.WriteString(", ") } fmt.Fprintf(&b, "%s=%d", t.children[i], t.typeCodes[i]) } b.WriteByte('>') return b.String() } func (t *unionType) fingerprint() string { var b strings.Builder for _, c := range t.typeCodes { fmt.Fprintf(&b, ":%d", c) } b.WriteString("]{") for _, c := range t.children { fingerprint := c.Fingerprint() if len(fingerprint) == 0 { return "" } b.WriteString(fingerprint) b.WriteByte(';') } b.WriteByte('}') return b.String() } func fieldsFromArrays(arrays []Array, names ...string) (ret []Field) { ret = make([]Field, len(arrays)) if len(names) == 0 { for i, c := range arrays { ret[i] = Field{Name: strconv.Itoa(i), Type: c.DataType(), Nullable: true} } } else { debug.Assert(len(names) == len(arrays), "mismatch of arrays and names") for i, c := range arrays { ret[i] = Field{Name: names[i], Type: c.DataType(), Nullable: true} } } return } // SparseUnionType is the concrete type for Sparse union data. // // A sparse union is a nested type where each logical value is taken // from a single child. A buffer of 8-bit type ids indicates which child // a given logical value is to be taken from. // // In a sparse union, each child array will have the same length as the // union array itself, regardless of the actual number of union values which // refer to it. // // Unlike most other types, unions do not have a top-level validity bitmap. type SparseUnionType struct { unionType } // SparseUnionFromArrays enables creating a union type from a list of Arrays, // field names, and type codes. len(fields) should be either 0 or equal to len(children). // len(codes) should also be either 0, or equal to len(children). // // If len(fields) == 0, then the fields will be named numerically as "0", "1", "2"... // and so on. If len(codes) == 0, then the type codes will be constructed as // [0, 1, 2, ..., n]. func SparseUnionFromArrays(children []Array, fields []string, codes []UnionTypeCode) *SparseUnionType { if len(codes) == 0 { codes = make([]UnionTypeCode, len(children)) for i := range children { codes[i] = UnionTypeCode(i) } } return SparseUnionOf(fieldsFromArrays(children, fields...), codes) } // SparseUnionOf is equivalent to UnionOf(arrow.SparseMode, fields, typeCodes), // constructing a SparseUnionType from a list of fields and type codes. // // If len(fields) != len(typeCodes) this will panic. They are allowed to be // of length 0. func SparseUnionOf(fields []Field, typeCodes []UnionTypeCode) *SparseUnionType { ret := &SparseUnionType{} if err := ret.validate(fields, typeCodes, ret.Mode()); err != nil { panic(err) } ret.init(fields, typeCodes) return ret } func (SparseUnionType) ID() Type { return SPARSE_UNION } func (SparseUnionType) Name() string { return "sparse_union" } func (SparseUnionType) Mode() UnionMode { return SparseMode } func (t *SparseUnionType) Fingerprint() string { return typeFingerprint(t) + "[s" + t.fingerprint() } func (SparseUnionType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecAlwaysNull(), SpecFixedWidth(Uint8SizeBytes)}} } func (t *SparseUnionType) String() string { return t.Name() + t.unionType.String() } // DenseUnionType is the concrete type for dense union data. // // A dense union is a nested type where each logical value is taken from a // single child, at a specific offset. A buffer of 8-bit type ids (typed // as UnionTypeCode) indicates which child a given logical value is to be // taken from and a buffer of 32-bit offsets indicating which physical position // in the given child array has the logical value for that index. // // Unlike a sparse union, a dense union allows encoding only the child values // which are actually referred to by the union array. This is counterbalanced // by the additional footprint of the offsets buffer, and the additional // indirection cost when looking up values. // // Unlike most other types, unions don't have a top-level validity bitmap type DenseUnionType struct { unionType } // DenseUnionFromArrays enables creating a union type from a list of Arrays, // field names, and type codes. len(fields) should be either 0 or equal to len(children). // len(codes) should also be either 0, or equal to len(children). // // If len(fields) == 0, then the fields will be named numerically as "0", "1", "2"... // and so on. If len(codes) == 0, then the type codes will be constructed as // [0, 1, 2, ..., n]. func DenseUnionFromArrays(children []Array, fields []string, codes []UnionTypeCode) *DenseUnionType { if len(codes) == 0 { codes = make([]UnionTypeCode, len(children)) for i := range children { codes[i] = UnionTypeCode(i) } } return DenseUnionOf(fieldsFromArrays(children, fields...), codes) } // DenseUnionOf is equivalent to UnionOf(arrow.DenseMode, fields, typeCodes), // constructing a SparseUnionType from a list of fields and type codes. // // If len(fields) != len(typeCodes) this will panic. They are allowed to be // of length 0. func DenseUnionOf(fields []Field, typeCodes []UnionTypeCode) *DenseUnionType { ret := &DenseUnionType{} if err := ret.validate(fields, typeCodes, ret.Mode()); err != nil { panic(err) } ret.init(fields, typeCodes) return ret } func (DenseUnionType) ID() Type { return DENSE_UNION } func (DenseUnionType) Name() string { return "dense_union" } func (DenseUnionType) Mode() UnionMode { return DenseMode } func (t *DenseUnionType) Fingerprint() string { return typeFingerprint(t) + "[s" + t.fingerprint() } func (DenseUnionType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecAlwaysNull(), SpecFixedWidth(Uint8SizeBytes), SpecFixedWidth(Int32SizeBytes)}} } func (DenseUnionType) OffsetTypeTraits() OffsetTraits { return Int32Traits } func (t *DenseUnionType) String() string { return t.Name() + t.unionType.String() } type Field struct { Name string // Field name Type DataType // The field's data type Nullable bool // Fields can be nullable Metadata Metadata // The field's metadata, if any } func (f Field) Fingerprint() string { typeFingerprint := f.Type.Fingerprint() if typeFingerprint == "" { return "" } var b strings.Builder b.WriteByte('F') if f.Nullable { b.WriteByte('n') } else { b.WriteByte('N') } b.WriteString(f.Name) b.WriteByte('{') b.WriteString(typeFingerprint) b.WriteByte('}') return b.String() } func (f Field) HasMetadata() bool { return f.Metadata.Len() != 0 } func (f Field) Equal(o Field) bool { switch { case f.Name != o.Name: return false case f.Nullable != o.Nullable: return false case !TypeEqual(f.Type, o.Type, CheckMetadata()): return false case !f.Metadata.Equal(o.Metadata): return false } return true } func (f Field) String() string { var o strings.Builder nullable := "" if f.Nullable { nullable = ", nullable" } fmt.Fprintf(&o, "%s: type=%v%v", f.Name, f.Type, nullable) if f.HasMetadata() { fmt.Fprintf(&o, "\n%*.smetadata: %v", len(f.Name)+2, "", f.Metadata) } return o.String() } var ( _ DataType = (*ListType)(nil) _ DataType = (*FixedSizeListType)(nil) _ DataType = (*StructType)(nil) _ DataType = (*MapType)(nil) )