You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

666 lines
16 KiB

package schema
import (
"math/big"
"testing"
"time"
"github.com/parquet-go/parquet-go"
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
)
// TestToParquetValue_BasicTypes is a table-driven check that every scalar
// schema_pb kind (bool, int32, int64, float, double, bytes, string) handed to
// toParquetValue comes back as the parquet value listed in the table. String
// values are expected to be returned as byte arrays.
func TestToParquetValue_BasicTypes(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	cases := []testCase{
		{
			label: "BoolValue true",
			input: &schema_pb.Value{Kind: &schema_pb.Value_BoolValue{BoolValue: true}},
			want:  parquet.BooleanValue(true),
		},
		{
			label: "Int32Value",
			input: &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 42}},
			want:  parquet.Int32Value(42),
		},
		{
			label: "Int64Value",
			input: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234}},
			want:  parquet.Int64Value(12345678901234),
		},
		{
			label: "FloatValue",
			input: &schema_pb.Value{Kind: &schema_pb.Value_FloatValue{FloatValue: 3.14159}},
			want:  parquet.FloatValue(3.14159),
		},
		{
			label: "DoubleValue",
			input: &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: 2.718281828}},
			want:  parquet.DoubleValue(2.718281828),
		},
		{
			label: "BytesValue",
			input: &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: []byte("hello world")}},
			want:  parquet.ByteArrayValue([]byte("hello world")),
		},
		{
			label: "BytesValue empty",
			input: &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: []byte{}}},
			want:  parquet.ByteArrayValue([]byte{}),
		},
		{
			label: "StringValue",
			input: &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: "test string"}},
			want:  parquet.ByteArrayValue([]byte("test string")),
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if parquetValuesEqual(got, tc.want) {
				return
			}
			t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
		})
	}
}
// TestToParquetValue_TimestampValue verifies that timestamp values are
// returned as int64 parquet values carrying the microsecond count unchanged,
// for both IsUtc settings, and that a nil TimestampValue yields a parquet null.
func TestToParquetValue_TimestampValue(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	// timestampOf wraps a (possibly nil) TimestampValue in the Value oneof.
	timestampOf := func(ts *schema_pb.TimestampValue) *schema_pb.Value {
		return &schema_pb.Value{Kind: &schema_pb.Value_TimestampValue{TimestampValue: ts}}
	}

	cases := []testCase{
		{
			label: "Valid TimestampValue UTC",
			input: timestampOf(&schema_pb.TimestampValue{
				TimestampMicros: 1704067200000000, // 2024-01-01 00:00:00 UTC, in microseconds
				IsUtc:           true,
			}),
			want: parquet.Int64Value(1704067200000000),
		},
		{
			label: "Valid TimestampValue local",
			input: timestampOf(&schema_pb.TimestampValue{
				TimestampMicros: 1704067200000000,
				IsUtc:           false,
			}),
			want: parquet.Int64Value(1704067200000000),
		},
		{
			label: "TimestampValue zero",
			input: timestampOf(&schema_pb.TimestampValue{
				TimestampMicros: 0,
				IsUtc:           true,
			}),
			want: parquet.Int64Value(0),
		},
		{
			label: "TimestampValue negative (before epoch)",
			input: timestampOf(&schema_pb.TimestampValue{
				TimestampMicros: -1000000, // one second before the epoch
				IsUtc:           true,
			}),
			want: parquet.Int64Value(-1000000),
		},
		{
			label: "TimestampValue nil pointer",
			input: timestampOf(nil),
			want:  parquet.NullValue(),
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if parquetValuesEqual(got, tc.want) {
				return
			}
			t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
		})
	}
}
// TestToParquetValue_DateValue verifies that date values are returned as int32
// parquet values holding the days-since-epoch count unchanged (positive, zero
// and negative), and that a nil DateValue yields a parquet null.
func TestToParquetValue_DateValue(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	// dateOf wraps a (possibly nil) DateValue in the Value oneof.
	dateOf := func(d *schema_pb.DateValue) *schema_pb.Value {
		return &schema_pb.Value{Kind: &schema_pb.Value_DateValue{DateValue: d}}
	}

	cases := []testCase{
		{
			label: "Valid DateValue (2024-01-01)",
			input: dateOf(&schema_pb.DateValue{
				DaysSinceEpoch: 19723, // 2024-01-01 is 19723 days after 1970-01-01
			}),
			want: parquet.Int32Value(19723),
		},
		{
			label: "DateValue epoch (1970-01-01)",
			input: dateOf(&schema_pb.DateValue{DaysSinceEpoch: 0}),
			want:  parquet.Int32Value(0),
		},
		{
			label: "DateValue before epoch",
			input: dateOf(&schema_pb.DateValue{
				DaysSinceEpoch: -365, // 1969-01-01
			}),
			want: parquet.Int32Value(-365),
		},
		{
			label: "DateValue nil pointer",
			input: dateOf(nil),
			want:  parquet.NullValue(),
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if parquetValuesEqual(got, tc.want) {
				return
			}
			t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
		})
	}
}
// TestToParquetValue_DecimalValue covers decimal conversion. Every non-empty
// payload is expected to come back as the 16-byte fixed-length array built by
// createFixedLenByteArray, whatever precision is declared. A nil DecimalValue
// and nil or empty payload bytes are expected to yield a parquet null. The
// 100-byte payload is expected to be rejected with an error.
func TestToParquetValue_DecimalValue(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	// decimalOf wraps a (possibly nil) DecimalValue in the Value oneof.
	decimalOf := func(d *schema_pb.DecimalValue) *schema_pb.Value {
		return &schema_pb.Value{Kind: &schema_pb.Value_DecimalValue{DecimalValue: d}}
	}

	// Raw payload used by the precision > 18 case.
	rawLarge := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF}

	// A magnitude beyond the int64 range; its encoding is 13 bytes, so it
	// still fits in the 16-byte fixed-length array.
	beyondInt64, _ := new(big.Int).SetString("99999999999999999999999999999", 10)
	beyondInt64Bytes := encodeBigIntToBytes(beyondInt64)

	cases := []testCase{
		{
			label: "Small Decimal (precision <= 9) - positive",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     encodeBigIntToBytes(big.NewInt(12345)), // 123.45 at scale 2
				Precision: 5,
				Scale:     2,
			}),
			want: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(12345))),
		},
		{
			label: "Small Decimal (precision <= 9) - negative",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     encodeBigIntToBytes(big.NewInt(-12345)),
				Precision: 5,
				Scale:     2,
			}),
			want: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(-12345))),
		},
		{
			label: "Medium Decimal (9 < precision <= 18)",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     encodeBigIntToBytes(big.NewInt(123456789012345)),
				Precision: 15,
				Scale:     2,
			}),
			want: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(123456789012345))),
		},
		{
			label: "Large Decimal (precision > 18)",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     rawLarge,
				Precision: 25,
				Scale:     5,
			}),
			want: createFixedLenByteArray(rawLarge),
		},
		{
			label: "Decimal with zero precision",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     encodeBigIntToBytes(big.NewInt(0)),
				Precision: 0,
				Scale:     0,
			}),
			want: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(0))),
		},
		{
			label: "Decimal nil pointer",
			input: decimalOf(nil),
			want:  parquet.NullValue(),
		},
		{
			label: "Decimal with nil Value bytes",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     nil, // regression case: nil bytes were the original panic cause
				Precision: 5,
				Scale:     2,
			}),
			want: parquet.NullValue(),
		},
		{
			label: "Decimal with empty Value bytes",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     []byte{},
				Precision: 5,
				Scale:     2,
			}),
			want: parquet.NullValue(), // empty bytes are treated like missing bytes
		},
		{
			label: "Decimal out of int32 range (stored as binary)",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     encodeBigIntToBytes(big.NewInt(999999999999)), // does not fit in int32
				Precision: 5,                                             // although the precision suggests int32
				Scale:     0,
			}),
			want: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(999999999999))),
		},
		{
			label: "Decimal out of int64 range (stored as binary)",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     beyondInt64Bytes,
				Precision: 15, // suggests int64, but the value does not fit
				Scale:     0,
			}),
			want: createFixedLenByteArray(beyondInt64Bytes),
		},
		{
			label: "Decimal extremely large value (should be rejected)",
			input: decimalOf(&schema_pb.DecimalValue{
				Value:     make([]byte, 100), // 100 bytes, above the 64 byte limit
				Precision: 100,
				Scale:     0,
			}),
			want:      parquet.NullValue(),
			expectErr: true, // an error is expected rather than silently corrupted data
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if parquetValuesEqual(got, tc.want) {
				return
			}
			t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
		})
	}
}
// TestToParquetValue_TimeValue verifies that time-of-day values are returned
// as int64 parquet values holding the microseconds-since-midnight count
// unchanged, and that a nil TimeValue yields a parquet null.
func TestToParquetValue_TimeValue(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	// timeOf wraps a (possibly nil) TimeValue in the Value oneof.
	timeOf := func(tv *schema_pb.TimeValue) *schema_pb.Value {
		return &schema_pb.Value{Kind: &schema_pb.Value_TimeValue{TimeValue: tv}}
	}

	cases := []testCase{
		{
			label: "Valid TimeValue (12:34:56.789)",
			input: timeOf(&schema_pb.TimeValue{
				TimeMicros: 45296789000, // 12:34:56.789 as microseconds since midnight
			}),
			want: parquet.Int64Value(45296789000),
		},
		{
			label: "TimeValue midnight",
			input: timeOf(&schema_pb.TimeValue{TimeMicros: 0}),
			want:  parquet.Int64Value(0),
		},
		{
			label: "TimeValue end of day (23:59:59.999999)",
			input: timeOf(&schema_pb.TimeValue{
				TimeMicros: 86399999999, // 23:59:59.999999
			}),
			want: parquet.Int64Value(86399999999),
		},
		{
			label: "TimeValue nil pointer",
			input: timeOf(nil),
			want:  parquet.NullValue(),
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if parquetValuesEqual(got, tc.want) {
				return
			}
			t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
		})
	}
}
// TestToParquetValue_EdgeCases covers degenerate inputs: a Value with no Kind
// and a nil *Value are both expected to produce an error, while a bytes value
// holding a nil slice is expected to come back as an empty byte array. The
// returned value is only inspected for cases that do not expect an error.
func TestToParquetValue_EdgeCases(t *testing.T) {
	type testCase struct {
		label     string
		input     *schema_pb.Value
		want      parquet.Value
		expectErr bool
	}

	cases := []testCase{
		{
			label:     "Nil value",
			input:     &schema_pb.Value{Kind: nil},
			expectErr: true,
		},
		{
			label:     "Completely nil value",
			input:     nil,
			expectErr: true,
		},
		{
			label: "BytesValue with nil slice",
			input: &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: nil}},
			want:  parquet.ByteArrayValue([]byte{}), // a nil slice should be normalised to an empty one
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got, err := toParquetValue(tc.input)
			if failed := err != nil; failed != tc.expectErr {
				t.Errorf("toParquetValue() error = %v, wantErr %v", err, tc.expectErr)
				return
			}
			if tc.expectErr {
				// The result accompanying an expected error is not checked.
				return
			}
			if !parquetValuesEqual(got, tc.want) {
				t.Errorf("toParquetValue() = %v, want %v", got, tc.want)
			}
		})
	}
}
// encodeBigIntToBytes returns the shortest big-endian two's complement
// encoding of n.
//
// The most significant bit of the result is always the sign bit:
//   - zero encodes as the single byte 0x00;
//   - a positive value whose magnitude already has its top bit set gets a
//     leading 0x00 byte (128 -> 00 80), so it cannot be read back as negative;
//   - a negative value is widened by one byte whenever its magnitude does not
//     fit below the sign bit (-129 -> FF 7F, while -128 -> 80).
//
// The returned slice is freshly allocated, except for positive values that
// need no padding, where it is the slice produced by n.Bytes().
func encodeBigIntToBytes(n *big.Int) []byte {
	switch n.Sign() {
	case 0:
		return []byte{0}
	case 1:
		magnitude := n.Bytes()
		if magnitude[0]&0x80 == 0 {
			return magnitude
		}
		// The top bit is set, which two's complement would interpret as a
		// negative number; prepend an explicit zero sign byte.
		return append([]byte{0}, magnitude...)
	}

	// Negative n: its two's complement bit pattern is the bitwise NOT of
	// (|n| - 1).
	pred := new(big.Int).Neg(n)
	pred.Sub(pred, big.NewInt(1))

	// (|n| - 1) occupies BitLen bits and one more bit is needed for the sign,
	// so the width in bytes is ceil((BitLen+1)/8) == BitLen/8 + 1. Sizing by
	// BitLen(|n|) alone drops the sign bit for values such as -129 or -255.
	encoded := pred.FillBytes(make([]byte, pred.BitLen()/8+1))
	for i := range encoded {
		encoded[i] = ^encoded[i]
	}
	return encoded
}
// createFixedLenByteArray builds the 16-byte fixed-length parquet value the
// decimal tests expect. Input of at most 16 bytes is right-aligned and
// zero-padded on the left (big-endian layout); longer input keeps only its
// last 16 bytes, i.e. the least significant ones.
func createFixedLenByteArray(inputBytes []byte) parquet.Value {
	const width = 16

	out := make([]byte, width)
	if overflow := len(inputBytes) - width; overflow > 0 {
		// Too long: drop the leading (most significant) overflow bytes.
		copy(out, inputBytes[overflow:])
	} else {
		// Fits: -overflow is the number of zero bytes left in front.
		copy(out[-overflow:], inputBytes)
	}
	return parquet.FixedLenByteArrayValue(out)
}
// parquetValuesEqual reports whether two parquet values match for the
// purposes of these tests. Two nulls are equal and a null never equals a
// non-null. Otherwise the kinds must agree and the payloads are compared with
// the accessor for that kind. Kinds not listed in the switch always compare
// as unequal.
func parquetValuesEqual(a, b parquet.Value) bool {
	aNull, bNull := a.IsNull(), b.IsNull()
	if aNull || bNull {
		// Equal only when both sides are null.
		return aNull && bNull
	}

	kind := a.Kind()
	if kind != b.Kind() {
		return false
	}

	switch kind {
	case parquet.Boolean:
		return a.Boolean() == b.Boolean()
	case parquet.Int32:
		return a.Int32() == b.Int32()
	case parquet.Int64:
		return a.Int64() == b.Int64()
	case parquet.Float:
		return a.Float() == b.Float()
	case parquet.Double:
		return a.Double() == b.Double()
	case parquet.ByteArray, parquet.FixedLenByteArray:
		// Fixed-length values are read through ByteArray() as well. The
		// string comparison checks both length and content.
		return string(a.ByteArray()) == string(b.ByteArray())
	}
	return false
}
// Benchmark tests

// BenchmarkToParquetValue_BasicTypes times toParquetValue on a single int64
// value; the input is built once, before the timer is reset.
func BenchmarkToParquetValue_BasicTypes(b *testing.B) {
	input := &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234}}

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		// Result and error are deliberately discarded; only the call is timed.
		_, _ = toParquetValue(input)
	}
}
// BenchmarkToParquetValue_TimestampValue times toParquetValue on a UTC
// timestamp captured from the wall clock before the timer is reset.
func BenchmarkToParquetValue_TimestampValue(b *testing.B) {
	stamp := &schema_pb.TimestampValue{
		TimestampMicros: time.Now().UnixMicro(),
		IsUtc:           true,
	}
	input := &schema_pb.Value{Kind: &schema_pb.Value_TimestampValue{TimestampValue: stamp}}

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		// Result and error are deliberately discarded; only the call is timed.
		_, _ = toParquetValue(input)
	}
}
// BenchmarkToParquetValue_DecimalValue times toParquetValue on a decimal with
// precision 15 and scale 2; the payload is encoded once, before the timer is
// reset.
func BenchmarkToParquetValue_DecimalValue(b *testing.B) {
	decimal := &schema_pb.DecimalValue{
		Value:     encodeBigIntToBytes(big.NewInt(123456789012345)),
		Precision: 15,
		Scale:     2,
	}
	input := &schema_pb.Value{Kind: &schema_pb.Value_DecimalValue{DecimalValue: decimal}}

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		// Result and error are deliberately discarded; only the call is timed.
		_, _ = toParquetValue(input)
	}
}