You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							259 lines
						
					
					
						
							6.9 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							259 lines
						
					
					
						
							6.9 KiB
						
					
					
				| package schema | |
| 
 | |
import (
	"encoding/binary"
	"fmt"
	"strconv"

	"github.com/seaweedfs/seaweedfs/weed/glog"
)
| 
 | |
// Format identifies the schema language a framed payload is encoded with.
type Format int

// Supported schema formats. FormatUnknown is the zero value of Format.
const (
	FormatUnknown Format = iota
	FormatAvro
	FormatProtobuf
	FormatJSONSchema
)

// String returns the upper-case name of the format: "AVRO", "PROTOBUF" or
// "JSON_SCHEMA". FormatUnknown and any value outside the declared constants
// yield "UNKNOWN".
func (f Format) String() string {
	// Name table indexed by the format constant; slot 0 (FormatUnknown) is
	// intentionally left empty and handled by the fallback below.
	names := [...]string{
		FormatAvro:       "AVRO",
		FormatProtobuf:   "PROTOBUF",
		FormatJSONSchema: "JSON_SCHEMA",
	}

	if f > FormatUnknown && int(f) < len(names) {
		return names[f]
	}
	return "UNKNOWN"
}
| 
 | |
| // ConfluentEnvelope represents the parsed Confluent Schema Registry envelope | |
| type ConfluentEnvelope struct { | |
| 	Format        Format | |
| 	SchemaID      uint32 | |
| 	Indexes       []int  // For Protobuf nested message resolution | |
| 	Payload       []byte // The actual encoded data | |
| 	OriginalBytes []byte // The complete original envelope bytes | |
| } | |
| 
 | |
| // ParseConfluentEnvelope parses a Confluent Schema Registry framed message | |
| // Returns the envelope details and whether the message was successfully parsed | |
| func ParseConfluentEnvelope(data []byte) (*ConfluentEnvelope, bool) { | |
| 	if len(data) < 5 { | |
| 		return nil, false // Too short to contain magic byte + schema ID | |
| 	} | |
| 
 | |
| 	// Check for Confluent magic byte (0x00) | |
| 	if data[0] != 0x00 { | |
| 		return nil, false // Not a Confluent-framed message | |
| 	} | |
| 
 | |
| 	// Extract schema ID (big-endian uint32) | |
| 	schemaID := binary.BigEndian.Uint32(data[1:5]) | |
| 
 | |
| 	envelope := &ConfluentEnvelope{ | |
| 		Format:        FormatAvro, // Default assumption; will be refined by schema registry lookup | |
| 		SchemaID:      schemaID, | |
| 		Indexes:       nil, | |
| 		Payload:       data[5:], // Default: payload starts after schema ID | |
| 		OriginalBytes: data,     // Store the complete original envelope | |
| 	} | |
| 
 | |
| 	// Note: Format detection should be done by the schema registry lookup | |
| 	// For now, we'll default to Avro and let the manager determine the actual format | |
| 	// based on the schema registry information | |
|  | |
| 	return envelope, true | |
| } | |
| 
 | |
| // ParseConfluentProtobufEnvelope parses a Confluent Protobuf envelope with indexes | |
| // This is a specialized version for Protobuf that handles message indexes | |
| // | |
| // Note: This function uses heuristics to distinguish between index varints and | |
| // payload data, which may not be 100% reliable in all cases. For production use, | |
| // consider using ParseConfluentProtobufEnvelopeWithIndexCount if you know the | |
| // expected number of indexes. | |
| func ParseConfluentProtobufEnvelope(data []byte) (*ConfluentEnvelope, bool) { | |
| 	// For now, assume no indexes to avoid parsing issues | |
| 	// This can be enhanced later when we have better schema information | |
| 	return ParseConfluentProtobufEnvelopeWithIndexCount(data, 0) | |
| } | |
| 
 | |
| // ParseConfluentProtobufEnvelopeWithIndexCount parses a Confluent Protobuf envelope | |
| // when you know the expected number of indexes | |
| func ParseConfluentProtobufEnvelopeWithIndexCount(data []byte, expectedIndexCount int) (*ConfluentEnvelope, bool) { | |
| 	if len(data) < 5 { | |
| 		return nil, false | |
| 	} | |
| 
 | |
| 	// Check for Confluent magic byte | |
| 	if data[0] != 0x00 { | |
| 		return nil, false | |
| 	} | |
| 
 | |
| 	// Extract schema ID (big-endian uint32) | |
| 	schemaID := binary.BigEndian.Uint32(data[1:5]) | |
| 
 | |
| 	envelope := &ConfluentEnvelope{ | |
| 		Format:        FormatProtobuf, | |
| 		SchemaID:      schemaID, | |
| 		Indexes:       nil, | |
| 		Payload:       data[5:], // Default: payload starts after schema ID | |
| 		OriginalBytes: data, | |
| 	} | |
| 
 | |
| 	// Parse the expected number of indexes | |
| 	offset := 5 | |
| 	for i := 0; i < expectedIndexCount && offset < len(data); i++ { | |
| 		index, bytesRead := readVarint(data[offset:]) | |
| 		if bytesRead == 0 { | |
| 			// Invalid varint, stop parsing | |
| 			break | |
| 		} | |
| 		envelope.Indexes = append(envelope.Indexes, int(index)) | |
| 		offset += bytesRead | |
| 	} | |
| 
 | |
| 	envelope.Payload = data[offset:] | |
| 	return envelope, true | |
| } | |
| 
 | |
// IsSchematized reports whether data looks like a Confluent-framed message:
// at least 5 bytes long and starting with the magic byte 0x00.
//
// This is the same acceptance test ParseConfluentEnvelope applies, evaluated
// directly so that no envelope has to be built just to be discarded; keep the
// two in sync.
func IsSchematized(data []byte) bool {
	const headerLen = 5 // magic byte + 4-byte schema ID

	return len(data) >= headerLen && data[0] == 0x00
}
| 
 | |
// ExtractSchemaID reads the big-endian uint32 schema ID from bytes 1..4 of a
// Confluent-framed message without building an envelope.
//
// It returns (0, false) when data is shorter than 5 bytes or does not start
// with the magic byte 0x00.
func ExtractSchemaID(data []byte) (uint32, bool) {
	const headerLen = 5 // magic byte + 4-byte schema ID

	switch {
	case len(data) < headerLen:
		return 0, false
	case data[0] != 0x00:
		return 0, false
	}

	return binary.BigEndian.Uint32(data[1:headerLen]), true
}
| 
 | |
| // CreateConfluentEnvelope creates a Confluent-framed message from components | |
| // This will be useful for reconstructing messages on the Fetch path | |
| func CreateConfluentEnvelope(format Format, schemaID uint32, indexes []int, payload []byte) []byte { | |
| 	// Start with magic byte + schema ID (5 bytes minimum) | |
| 	// Validate sizes to prevent overflow | |
| 	const maxSize = 1 << 30 // 1 GB limit | |
| 	indexSize := len(indexes) * 4 | |
| 	totalCapacity := 5 + len(payload) + indexSize | |
| 	if len(payload) > maxSize || indexSize > maxSize || totalCapacity < 0 || totalCapacity > maxSize { | |
| 		glog.Errorf("Envelope size too large: payload=%d, indexes=%d", len(payload), len(indexes)) | |
| 		return nil | |
| 	} | |
| 	result := make([]byte, 5, totalCapacity) | |
| 	result[0] = 0x00 // Magic byte | |
| 	binary.BigEndian.PutUint32(result[1:5], schemaID) | |
| 
 | |
| 	// For Protobuf, add indexes as varints | |
| 	if format == FormatProtobuf && len(indexes) > 0 { | |
| 		for _, index := range indexes { | |
| 			varintBytes := encodeVarint(uint64(index)) | |
| 			result = append(result, varintBytes...) | |
| 		} | |
| 	} | |
| 
 | |
| 	// Append the actual payload | |
| 	result = append(result, payload...) | |
| 
 | |
| 	return result | |
| } | |
| 
 | |
| // ValidateEnvelope performs basic validation on a parsed envelope | |
| func (e *ConfluentEnvelope) Validate() error { | |
| 	if e.SchemaID == 0 { | |
| 		return fmt.Errorf("invalid schema ID: 0") | |
| 	} | |
| 
 | |
| 	if len(e.Payload) == 0 { | |
| 		return fmt.Errorf("empty payload") | |
| 	} | |
| 
 | |
| 	// Format-specific validation | |
| 	switch e.Format { | |
| 	case FormatAvro: | |
| 		// Avro payloads should be valid binary data | |
| 		// More specific validation will be done by the Avro decoder | |
| 	case FormatProtobuf: | |
| 		// Protobuf validation will be implemented in Phase 5 | |
| 	case FormatJSONSchema: | |
| 		// JSON Schema validation will be implemented in Phase 6 | |
| 	default: | |
| 		return fmt.Errorf("unsupported format: %v", e.Format) | |
| 	} | |
| 
 | |
| 	return nil | |
| } | |
| 
 | |
| // Metadata returns a map of envelope metadata for storage | |
| func (e *ConfluentEnvelope) Metadata() map[string]string { | |
| 	metadata := map[string]string{ | |
| 		"schema_format": e.Format.String(), | |
| 		"schema_id":     fmt.Sprintf("%d", e.SchemaID), | |
| 	} | |
| 
 | |
| 	if len(e.Indexes) > 0 { | |
| 		// Store indexes for Protobuf reconstruction | |
| 		indexStr := "" | |
| 		for i, idx := range e.Indexes { | |
| 			if i > 0 { | |
| 				indexStr += "," | |
| 			} | |
| 			indexStr += fmt.Sprintf("%d", idx) | |
| 		} | |
| 		metadata["protobuf_indexes"] = indexStr | |
| 	} | |
| 
 | |
| 	return metadata | |
| } | |
| 
 | |
// encodeVarint encodes value as an unsigned base-128 varint: seven bits per
// byte, least-significant group first, with the high bit set on every byte
// except the last. Zero encodes as the single byte 0x00.
//
// The returned slice is freshly allocated and between 1 and
// binary.MaxVarintLen64 (10) bytes long.
func encodeVarint(value uint64) []byte {
	// Size the buffer for the worst case once so the encoder never has to
	// grow it, then trim to the bytes actually written.
	buf := make([]byte, binary.MaxVarintLen64)
	n := binary.PutUvarint(buf, value)
	return buf[:n]
}
| 
 | |
// readVarint decodes one unsigned base-128 varint from the start of data and
// returns the value together with the number of bytes consumed.
//
// It returns (0, 0) when no valid varint can be read: data is empty, the
// varint is incomplete (the last available byte still has its continuation
// bit set), the encoding is longer than binary.MaxVarintLen64 (10) bytes, or
// the encoded value does not fit in 64 bits. A consumed count of 0 is
// therefore the failure signal; a successful read always consumes at least
// one byte.
func readVarint(data []byte) (uint64, int) {
	value, n := binary.Uvarint(data)
	if n <= 0 {
		// binary.Uvarint reports n == 0 for a truncated buffer and n < 0 for
		// a value overflowing 64 bits; both collapse to this function's
		// single failure result.
		return 0, 0
	}
	return value, n
}
 |