You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							346 lines
						
					
					
						
							12 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							346 lines
						
					
					
						
							12 KiB
						
					
					
				| package schema | |
| 
 | |
| import ( | |
| 	"fmt" | |
| 	"strconv" | |
| 
 | |
| 	parquet "github.com/parquet-go/parquet-go" | |
| 	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" | |
| ) | |
| 
 | |
| func rowBuilderVisit(rowBuilder *parquet.RowBuilder, fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error) { | |
| 	switch fieldType.Kind.(type) { | |
| 	case *schema_pb.Type_ScalarType: | |
| 		// If value is missing, write NULL at the correct column to keep rows aligned | |
| 		if fieldValue == nil || fieldValue.Kind == nil { | |
| 			rowBuilder.Add(levels.startColumnIndex, parquet.NullValue()) | |
| 			return nil | |
| 		} | |
| 		var parquetValue parquet.Value | |
| 		parquetValue, err = toParquetValueForType(fieldType, fieldValue) | |
| 		if err != nil { | |
| 			return | |
| 		} | |
| 
 | |
| 		// Safety check: prevent nil byte arrays from reaching parquet library | |
| 		if parquetValue.Kind() == parquet.ByteArray { | |
| 			byteData := parquetValue.ByteArray() | |
| 			if byteData == nil { | |
| 				parquetValue = parquet.ByteArrayValue([]byte{}) | |
| 			} | |
| 		} | |
| 
 | |
| 		rowBuilder.Add(levels.startColumnIndex, parquetValue) | |
| 	case *schema_pb.Type_ListType: | |
| 		// Advance to list position even if value is missing | |
| 		rowBuilder.Next(levels.startColumnIndex) | |
| 		if fieldValue == nil || fieldValue.GetListValue() == nil { | |
| 			return nil | |
| 		} | |
| 
 | |
| 		elementType := fieldType.GetListType().ElementType | |
| 		for _, value := range fieldValue.GetListValue().Values { | |
| 			if err = rowBuilderVisit(rowBuilder, elementType, levels, value); err != nil { | |
| 				return | |
| 			} | |
| 		} | |
| 	} | |
| 	return | |
| } | |
| 
 | |
| func AddRecordValue(rowBuilder *parquet.RowBuilder, recordType *schema_pb.RecordType, parquetLevels *ParquetLevels, recordValue *schema_pb.RecordValue) error { | |
| 	visitor := func(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error) { | |
| 		return rowBuilderVisit(rowBuilder, fieldType, levels, fieldValue) | |
| 	} | |
| 	fieldType := &schema_pb.Type{Kind: &schema_pb.Type_RecordType{RecordType: recordType}} | |
| 	fieldValue := &schema_pb.Value{Kind: &schema_pb.Value_RecordValue{RecordValue: recordValue}} | |
| 	return doVisitValue(fieldType, parquetLevels, fieldValue, visitor) | |
| } | |
| 
 | |
| // typeValueVisitor is a function that is called for each value in a schema_pb.Value | |
| // Find the column index. | |
| // intended to be used in RowBuilder.Add(columnIndex, value) | |
| type typeValueVisitor func(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error) | |
| 
 | |
| // endIndex is exclusive | |
| // same logic as RowBuilder.configure in row_builder.go | |
| func doVisitValue(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value, visitor typeValueVisitor) (err error) { | |
| 	switch fieldType.Kind.(type) { | |
| 	case *schema_pb.Type_ScalarType: | |
| 		return visitor(fieldType, levels, fieldValue) | |
| 	case *schema_pb.Type_ListType: | |
| 		return visitor(fieldType, levels, fieldValue) | |
| 	case *schema_pb.Type_RecordType: | |
| 		for _, field := range fieldType.GetRecordType().Fields { | |
| 			var fv *schema_pb.Value | |
| 			if fieldValue != nil && fieldValue.GetRecordValue() != nil { | |
| 				var found bool | |
| 				fv, found = fieldValue.GetRecordValue().Fields[field.Name] | |
| 				if !found { | |
| 					// pass nil so visitor can emit NULL for alignment | |
| 					fv = nil | |
| 				} | |
| 			} | |
| 			fieldLevels := levels.levels[field.Name] | |
| 			err = doVisitValue(field.Type, fieldLevels, fv, visitor) | |
| 			if err != nil { | |
| 				return | |
| 			} | |
| 		} | |
| 		return | |
| 	} | |
| 	return | |
| } | |
| 
 | |
| func toParquetValue(value *schema_pb.Value) (parquet.Value, error) { | |
| 	// Safety check for nil value | |
| 	if value == nil || value.Kind == nil { | |
| 		return parquet.NullValue(), fmt.Errorf("nil value or nil value kind") | |
| 	} | |
| 
 | |
| 	switch value.Kind.(type) { | |
| 	case *schema_pb.Value_BoolValue: | |
| 		return parquet.BooleanValue(value.GetBoolValue()), nil | |
| 	case *schema_pb.Value_Int32Value: | |
| 		return parquet.Int32Value(value.GetInt32Value()), nil | |
| 	case *schema_pb.Value_Int64Value: | |
| 		return parquet.Int64Value(value.GetInt64Value()), nil | |
| 	case *schema_pb.Value_FloatValue: | |
| 		return parquet.FloatValue(value.GetFloatValue()), nil | |
| 	case *schema_pb.Value_DoubleValue: | |
| 		return parquet.DoubleValue(value.GetDoubleValue()), nil | |
| 	case *schema_pb.Value_BytesValue: | |
| 		// Handle nil byte slices to prevent growslice panic in parquet-go | |
| 		byteData := value.GetBytesValue() | |
| 		if byteData == nil { | |
| 			byteData = []byte{} // Use empty slice instead of nil | |
| 		} | |
| 		return parquet.ByteArrayValue(byteData), nil | |
| 	case *schema_pb.Value_StringValue: | |
| 		// Convert string to bytes, ensuring we never pass nil | |
| 		stringData := value.GetStringValue() | |
| 		return parquet.ByteArrayValue([]byte(stringData)), nil | |
| 	// Parquet logical types with safe conversion (preventing commit 7a4aeec60 panic) | |
| 	case *schema_pb.Value_TimestampValue: | |
| 		timestampValue := value.GetTimestampValue() | |
| 		if timestampValue == nil { | |
| 			return parquet.NullValue(), nil | |
| 		} | |
| 		return parquet.Int64Value(timestampValue.TimestampMicros), nil | |
| 	case *schema_pb.Value_DateValue: | |
| 		dateValue := value.GetDateValue() | |
| 		if dateValue == nil { | |
| 			return parquet.NullValue(), nil | |
| 		} | |
| 		return parquet.Int32Value(dateValue.DaysSinceEpoch), nil | |
| 	case *schema_pb.Value_DecimalValue: | |
| 		decimalValue := value.GetDecimalValue() | |
| 		if decimalValue == nil || decimalValue.Value == nil || len(decimalValue.Value) == 0 { | |
| 			return parquet.NullValue(), nil | |
| 		} | |
| 
 | |
| 		// Validate input data - reject unreasonably large values instead of corrupting data | |
| 		if len(decimalValue.Value) > 64 { | |
| 			// Reject extremely large decimal values (>512 bits) as likely corrupted data | |
| 			// Better to fail fast than silently corrupt financial/scientific data | |
| 			return parquet.NullValue(), fmt.Errorf("decimal value too large: %d bytes (max 64)", len(decimalValue.Value)) | |
| 		} | |
| 
 | |
| 		// Convert to FixedLenByteArray to match schema (DECIMAL with FixedLenByteArray physical type) | |
| 		// This accommodates any precision up to 38 digits (16 bytes = 128 bits) | |
|  | |
| 		// Pad or truncate to exactly 16 bytes for FixedLenByteArray | |
| 		fixedBytes := make([]byte, 16) | |
| 		if len(decimalValue.Value) <= 16 { | |
| 			// Right-align the value (big-endian) | |
| 			copy(fixedBytes[16-len(decimalValue.Value):], decimalValue.Value) | |
| 		} else { | |
| 			// Truncate if too large, taking the least significant bytes | |
| 			copy(fixedBytes, decimalValue.Value[len(decimalValue.Value)-16:]) | |
| 		} | |
| 
 | |
| 		return parquet.FixedLenByteArrayValue(fixedBytes), nil | |
| 	case *schema_pb.Value_TimeValue: | |
| 		timeValue := value.GetTimeValue() | |
| 		if timeValue == nil { | |
| 			return parquet.NullValue(), nil | |
| 		} | |
| 		return parquet.Int64Value(timeValue.TimeMicros), nil | |
| 	default: | |
| 		return parquet.NullValue(), fmt.Errorf("unknown value type: %T", value.Kind) | |
| 	} | |
| } | |
| 
 | |
| // toParquetValueForType coerces a schema_pb.Value into a parquet.Value that matches the declared field type. | |
| func toParquetValueForType(fieldType *schema_pb.Type, value *schema_pb.Value) (parquet.Value, error) { | |
| 	switch t := fieldType.Kind.(type) { | |
| 	case *schema_pb.Type_ScalarType: | |
| 		switch t.ScalarType { | |
| 		case schema_pb.ScalarType_BOOL: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_BoolValue: | |
| 				return parquet.BooleanValue(v.BoolValue), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if b, err := strconv.ParseBool(v.StringValue); err == nil { | |
| 					return parquet.BooleanValue(b), nil | |
| 				} | |
| 				return parquet.BooleanValue(false), nil | |
| 			default: | |
| 				return parquet.BooleanValue(false), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_INT32: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_Int32Value: | |
| 				return parquet.Int32Value(v.Int32Value), nil | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.Int32Value(int32(v.Int64Value)), nil | |
| 			case *schema_pb.Value_DoubleValue: | |
| 				return parquet.Int32Value(int32(v.DoubleValue)), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if i, err := strconv.ParseInt(v.StringValue, 10, 32); err == nil { | |
| 					return parquet.Int32Value(int32(i)), nil | |
| 				} | |
| 				return parquet.Int32Value(0), nil | |
| 			default: | |
| 				return parquet.Int32Value(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_INT64: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.Int64Value(v.Int64Value), nil | |
| 			case *schema_pb.Value_Int32Value: | |
| 				return parquet.Int64Value(int64(v.Int32Value)), nil | |
| 			case *schema_pb.Value_DoubleValue: | |
| 				return parquet.Int64Value(int64(v.DoubleValue)), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil { | |
| 					return parquet.Int64Value(i), nil | |
| 				} | |
| 				return parquet.Int64Value(0), nil | |
| 			default: | |
| 				return parquet.Int64Value(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_FLOAT: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_FloatValue: | |
| 				return parquet.FloatValue(v.FloatValue), nil | |
| 			case *schema_pb.Value_DoubleValue: | |
| 				return parquet.FloatValue(float32(v.DoubleValue)), nil | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.FloatValue(float32(v.Int64Value)), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if f, err := strconv.ParseFloat(v.StringValue, 32); err == nil { | |
| 					return parquet.FloatValue(float32(f)), nil | |
| 				} | |
| 				return parquet.FloatValue(0), nil | |
| 			default: | |
| 				return parquet.FloatValue(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_DOUBLE: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_DoubleValue: | |
| 				return parquet.DoubleValue(v.DoubleValue), nil | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.DoubleValue(float64(v.Int64Value)), nil | |
| 			case *schema_pb.Value_Int32Value: | |
| 				return parquet.DoubleValue(float64(v.Int32Value)), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if f, err := strconv.ParseFloat(v.StringValue, 64); err == nil { | |
| 					return parquet.DoubleValue(f), nil | |
| 				} | |
| 				return parquet.DoubleValue(0), nil | |
| 			default: | |
| 				return parquet.DoubleValue(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_BYTES: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_BytesValue: | |
| 				b := v.BytesValue | |
| 				if b == nil { | |
| 					b = []byte{} | |
| 				} | |
| 				return parquet.ByteArrayValue(b), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				return parquet.ByteArrayValue([]byte(v.StringValue)), nil | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.ByteArrayValue([]byte(strconv.FormatInt(v.Int64Value, 10))), nil | |
| 			case *schema_pb.Value_Int32Value: | |
| 				return parquet.ByteArrayValue([]byte(strconv.FormatInt(int64(v.Int32Value), 10))), nil | |
| 			case *schema_pb.Value_DoubleValue: | |
| 				return parquet.ByteArrayValue([]byte(strconv.FormatFloat(v.DoubleValue, 'f', -1, 64))), nil | |
| 			case *schema_pb.Value_FloatValue: | |
| 				return parquet.ByteArrayValue([]byte(strconv.FormatFloat(float64(v.FloatValue), 'f', -1, 32))), nil | |
| 			case *schema_pb.Value_BoolValue: | |
| 				if v.BoolValue { | |
| 					return parquet.ByteArrayValue([]byte("true")), nil | |
| 				} | |
| 				return parquet.ByteArrayValue([]byte("false")), nil | |
| 			default: | |
| 				return parquet.ByteArrayValue([]byte{}), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_STRING: | |
| 			// Same as bytes but semantically string | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_StringValue: | |
| 				return parquet.ByteArrayValue([]byte(v.StringValue)), nil | |
| 			default: | |
| 				// Fallback through bytes coercion | |
| 				b, _ := toParquetValueForType(&schema_pb.Type{Kind: &schema_pb.Type_ScalarType{ScalarType: schema_pb.ScalarType_BYTES}}, value) | |
| 				return b, nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_TIMESTAMP: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.Int64Value(v.Int64Value), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil { | |
| 					return parquet.Int64Value(i), nil | |
| 				} | |
| 				return parquet.Int64Value(0), nil | |
| 			default: | |
| 				return parquet.Int64Value(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_DATE: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_Int32Value: | |
| 				return parquet.Int32Value(v.Int32Value), nil | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.Int32Value(int32(v.Int64Value)), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if i, err := strconv.ParseInt(v.StringValue, 10, 32); err == nil { | |
| 					return parquet.Int32Value(int32(i)), nil | |
| 				} | |
| 				return parquet.Int32Value(0), nil | |
| 			default: | |
| 				return parquet.Int32Value(0), nil | |
| 			} | |
| 
 | |
| 		case schema_pb.ScalarType_DECIMAL: | |
| 			// Reuse existing conversion path (FixedLenByteArray 16) | |
| 			return toParquetValue(value) | |
| 
 | |
| 		case schema_pb.ScalarType_TIME: | |
| 			switch v := value.Kind.(type) { | |
| 			case *schema_pb.Value_Int64Value: | |
| 				return parquet.Int64Value(v.Int64Value), nil | |
| 			case *schema_pb.Value_StringValue: | |
| 				if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil { | |
| 					return parquet.Int64Value(i), nil | |
| 				} | |
| 				return parquet.Int64Value(0), nil | |
| 			default: | |
| 				return parquet.Int64Value(0), nil | |
| 			} | |
| 		} | |
| 	} | |
| 	// Fallback to generic conversion | |
| 	return toParquetValue(value) | |
| }
 |