You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

414 lines
13 KiB

package s3tables
import (
"context"
"encoding/json"
"errors"
pathpkg "path"
"regexp"
"strings"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
)
// Iceberg file layout validation
// Apache Iceberg tables follow a specific file layout structure:
// - metadata/ directory containing metadata files (*.json, *.avro)
// - data/ directory containing data files (*.parquet, *.orc, *.avro)
//
// Valid file patterns include:
// - metadata/v*.metadata.json (table metadata)
// - metadata/snap-*.avro (snapshot manifest lists)
// - metadata/*.avro (manifest files)
// - data/*.parquet, data/*.orc, data/*.avro (data files)
// uuidPattern is a regular-expression fragment matching a lowercase,
// hyphenated UUID (8-4-4-4-12 hexadecimal digits). It is unanchored so it can
// be embedded in the larger file-name patterns below.
const uuidPattern = `[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}`

var (
	// icebergAllowedDirs is the set of top-level directories permitted
	// directly under an Iceberg table root.
	icebergAllowedDirs = map[string]bool{
		"metadata": true,
		"data":     true,
	}

	// metadataFilePatterns lists the file names accepted directly under
	// metadata/. A name is valid when it matches at least one pattern.
	metadataFilePatterns = []*regexp.Regexp{
		regexp.MustCompile(`^v\d+\.metadata\.json$`),                   // Table metadata: v1.metadata.json, v2.metadata.json
		regexp.MustCompile(`^snap-\d+-\d+-` + uuidPattern + `\.avro$`), // Snapshot manifests: snap-123-1-uuid.avro
		regexp.MustCompile(`^` + uuidPattern + `-m\d+\.avro$`),         // Manifest files: uuid-m0.avro
		regexp.MustCompile(`^` + uuidPattern + `\.avro$`),              // General manifest files
		regexp.MustCompile(`^version-hint\.text$`),                     // Version hint file
		regexp.MustCompile(`^` + uuidPattern + `\.metadata\.json$`),    // UUID-named metadata
		// Versioned metadata as named by the Iceberg spec for catalog-managed
		// tables (<V>-<random-uuid>.metadata.json): 00001-uuid.metadata.json
		regexp.MustCompile(`^\d+-` + uuidPattern + `\.metadata\.json$`),
	}

	// dataFilePatterns lists the file names accepted under data/. The
	// patterns apply to a single path component (no slashes).
	dataFilePatterns = []*regexp.Regexp{
		regexp.MustCompile(`^[^/]+\.parquet$`), // Parquet files
		regexp.MustCompile(`^[^/]+\.orc$`),     // ORC files
		regexp.MustCompile(`^[^/]+\.avro$`),    // Avro files
	}

	// partitionPathPattern matches one partition directory component of the
	// form key=value (e.g. year=2024 or month=01).
	partitionPathPattern = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*=[^/]+$`)

	// validSubdirectoryPattern matches a plain subdirectory name made only of
	// ASCII letters, digits, underscores and hyphens (which also covers
	// UUID-style names).
	validSubdirectoryPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
)
// IcebergLayoutValidator checks table-relative file paths against the Iceberg
// table layout. It carries no state.
type IcebergLayoutValidator struct{}

// NewIcebergLayoutValidator returns a ready-to-use IcebergLayoutValidator.
func NewIcebergLayoutValidator() *IcebergLayoutValidator {
	validator := new(IcebergLayoutValidator)
	return validator
}
// ValidateFilePath reports whether relativePath conforms to the Iceberg table
// layout. The path is interpreted relative to the table root, for example
// "metadata/v1.metadata.json" or "data/file.parquet".
//
// A single leading "/" is ignored. The first path component must be one of the
// allowed top-level directories; a bare top-level name without a trailing
// slash is rejected, while "data/" and "metadata/" themselves are accepted.
// Anything below the top-level directory is delegated to the metadata or data
// specific validator.
//
// It returns nil when the path is acceptable and an *IcebergLayoutError
// otherwise.
func (v *IcebergLayoutValidator) ValidateFilePath(relativePath string) error {
	reject := func(msg string) error {
		return &IcebergLayoutError{
			Code:    ErrCodeInvalidIcebergLayout,
			Message: msg,
		}
	}

	// Only one leading slash is stripped; no other normalization happens here.
	trimmed := strings.TrimPrefix(relativePath, "/")
	if trimmed == "" {
		return reject("empty file path")
	}

	// Separate the top-level directory from whatever follows the first slash.
	topDir, rest := trimmed, ""
	slash := strings.IndexByte(trimmed, '/')
	hasSubpath := slash >= 0
	if hasSubpath {
		topDir, rest = trimmed[:slash], trimmed[slash+1:]
	}

	if allowed := icebergAllowedDirs[topDir]; !allowed {
		return reject("files must be placed in 'metadata/' or 'data/' directories")
	}

	// "data" or "metadata" on its own (no slash at all) is not acceptable.
	if !hasSubpath {
		return reject("must be a directory (use trailing slash) or contain a subpath")
	}

	// "data/" and "metadata/" denote the directories themselves.
	if rest == "" {
		return nil
	}

	if topDir == "metadata" {
		return v.validateMetadataFile(rest)
	}
	if topDir == "data" {
		return v.validateDataFile(rest)
	}
	return nil
}
// validateDirectoryPath checks the directory portion of a path located below
// metadata/ (isMetadata true) or data/ (isMetadata false). normalizedPath must
// not carry a trailing slash.
//
// Under metadata/ any subdirectory is rejected. Under data/ every
// slash-separated segment must be non-empty and be either a key=value
// partition directory or a plain subdirectory name.
func validateDirectoryPath(normalizedPath string, isMetadata bool) error {
	reject := func(msg string) error {
		return &IcebergLayoutError{
			Code:    ErrCodeInvalidIcebergLayout,
			Message: msg,
		}
	}

	// metadata/ is flat: reaching this function at all means a subdirectory.
	if isMetadata {
		return reject("metadata directory does not support subdirectories")
	}

	for _, segment := range strings.Split(normalizedPath, "/") {
		switch {
		case segment == "":
			return reject("invalid partition or subdirectory format in data path: empty segment")
		case partitionPathPattern.MatchString(segment), isValidSubdirectory(segment):
			// Acceptable partition or subdirectory; keep checking the rest.
		default:
			return reject("invalid partition or subdirectory format in data path")
		}
	}
	return nil
}
// validateFilePatterns validates a filename against allowed patterns
// isMetadata indicates if we're validating metadata files (true) or data files (false)
func validateFilePatterns(filename string, isMetadata bool) error {
var patterns []*regexp.Regexp
var errorMsg string
if isMetadata {
patterns = metadataFilePatterns
errorMsg = "invalid metadata file format: must be a valid Iceberg metadata, manifest, or snapshot file"
} else {
patterns = dataFilePatterns
errorMsg = "invalid data file format: must be .parquet, .orc, or .avro"
}
for _, pattern := range patterns {
if pattern.MatchString(filename) {
return nil
}
}
return &IcebergLayoutError{
Code: ErrCodeInvalidIcebergLayout,
Message: errorMsg,
}
}
// validateFile holds the validation logic shared by the metadata/ and data/
// directories. path is relative to that directory; isMetadata selects the
// metadata rules (true) or the data rules (false).
//
// The steps are:
//  1. A path ending in "/" names a directory: only its segments are validated.
//  2. Otherwise any leading directories are validated first.
//  3. The final component must match an allowed file-name pattern.
//  4. For data paths only, a final component that looks like a partition or a
//     plain subdirectory is also accepted, since it may be a directory written
//     without a trailing slash (e.g. "year=2024").
func (v *IcebergLayoutValidator) validateFile(path string, isMetadata bool) error {
	// TrimSuffix changes the string only when the trailing slash was present.
	if dirPath := strings.TrimSuffix(path, "/"); dirPath != path {
		return validateDirectoryPath(dirPath, isMetadata)
	}

	// Everything before the last slash is the chain of parent directories.
	if cut := strings.LastIndex(path, "/"); cut >= 0 {
		if dirErr := validateDirectoryPath(path[:cut], isMetadata); dirErr != nil {
			return dirErr
		}
	}

	leaf := pathpkg.Base(path)
	patternErr := validateFilePatterns(leaf, isMetadata)
	if patternErr == nil || isMetadata {
		// Either the name is valid, or it is an invalid metadata name, which
		// has no directory fallback.
		return patternErr
	}

	// Data path whose last component might be a directory without trailing slash.
	if partitionPathPattern.MatchString(leaf) || isValidSubdirectory(leaf) {
		return nil
	}
	return patternErr
}
// validateMetadataFile validates a path located under the metadata/ directory
// by delegating to validateFile with the metadata rules selected.
func (v *IcebergLayoutValidator) validateMetadataFile(path string) error {
	const isMetadata = true
	return v.validateFile(path, isMetadata)
}
// validateDataFile validates a path located under the data/ directory by
// delegating to validateFile with the data rules selected.
func (v *IcebergLayoutValidator) validateDataFile(path string) error {
	const isMetadata = false
	return v.validateFile(path, isMetadata)
}
// isValidSubdirectory reports whether name is an acceptable plain
// subdirectory name, i.e. whether it matches validSubdirectoryPattern.
func isValidSubdirectory(name string) bool {
	matched := validSubdirectoryPattern.MatchString(name)
	return matched
}
// IcebergLayoutError describes a violation of the Iceberg table layout rules.
type IcebergLayoutError struct {
	// Code is a machine-readable error code.
	Code string
	// Message is the human-readable description returned by Error.
	Message string
}

// Compile-time check that *IcebergLayoutError satisfies the error interface.
var _ error = (*IcebergLayoutError)(nil)

// Error implements the error interface and returns only the Message field.
func (layoutErr *IcebergLayoutError) Error() string {
	return layoutErr.Message
}
// ErrCodeInvalidIcebergLayout is the error code carried by errors that report
// an Iceberg layout violation.
const ErrCodeInvalidIcebergLayout = "InvalidIcebergLayout"
// TableBucketFileValidator validates file uploads that target table buckets.
type TableBucketFileValidator struct {
	// layoutValidator checks the table-relative part of an upload path.
	layoutValidator *IcebergLayoutValidator
}

// NewTableBucketFileValidator returns a TableBucketFileValidator backed by a
// fresh IcebergLayoutValidator.
func NewTableBucketFileValidator() *TableBucketFileValidator {
	validator := new(TableBucketFileValidator)
	validator.layoutValidator = NewIcebergLayoutValidator()
	return validator
}
// ValidateTableBucketUpload checks whether an upload to fullPath conforms to
// the Iceberg layout. fullPath is the complete filer path, in the form
// TablesPath/{bucket}/{namespace}/{table}/{path-within-table}.
//
// It returns nil when:
//   - fullPath is not under TablesPath (no validation applies),
//   - fullPath addresses only a bucket, namespace or table, all of whose
//     segments are non-empty, or
//   - the path within the table is empty or passes the layout validator.
//
// Otherwise it returns an *IcebergLayoutError.
func (v *TableBucketFileValidator) ValidateTableBucketUpload(fullPath string) error {
	prefix := TablesPath + "/"
	if !strings.HasPrefix(fullPath, prefix) {
		return nil // Not a table bucket, no validation needed
	}

	// Split into at most bucket, namespace, table and the remainder, which is
	// kept intact (slashes included) as the path within the table.
	parts := strings.SplitN(strings.TrimPrefix(fullPath, prefix), "/", 4)

	// The bucket/namespace/table segments that are present must be non-empty,
	// whether or not a path within the table follows them.
	named := parts
	if len(named) > 3 {
		named = named[:3]
	}
	for _, segment := range named {
		if segment == "" {
			return &IcebergLayoutError{
				Code:    ErrCodeInvalidIcebergLayout,
				Message: "bucket, namespace, and table segments cannot be empty",
			}
		}
	}

	// Fewer than four parts: only a bucket, namespace or table is addressed.
	if len(parts) < 4 {
		return nil
	}

	// The last part is the path within the table (data/file.parquet or metadata/v1.json)
	tableRelativePath := parts[3]
	if tableRelativePath == "" {
		return nil
	}

	// Reject empty segments (leading or doubled slashes) inside the table path.
	// This is reported separately from the bucket/namespace/table check above,
	// because at this point those three segments are known to be non-empty.
	if strings.HasPrefix(tableRelativePath, "/") || strings.Contains(tableRelativePath, "//") {
		return &IcebergLayoutError{
			Code:    ErrCodeInvalidIcebergLayout,
			Message: "path within table cannot contain empty segments",
		}
	}

	return v.layoutValidator.ValidateFilePath(tableRelativePath)
}
// IsTableBucketPath reports whether fullPath lies below the TablesPath
// directory, i.e. whether it starts with TablesPath followed by a slash.
func IsTableBucketPath(fullPath string) bool {
	tablesPrefix := TablesPath + "/"
	return strings.HasPrefix(fullPath, tablesPrefix)
}
// GetTableInfoFromPath extracts the bucket, namespace and table names from a
// path under TablesPath. Components that are missing from the path come back
// as empty strings; a path outside TablesPath yields three empty strings.
func GetTableInfoFromPath(fullPath string) (bucket, namespace, table string) {
	tablesPrefix := TablesPath + "/"
	if !strings.HasPrefix(fullPath, tablesPrefix) {
		return "", "", ""
	}

	segments := strings.SplitN(fullPath[len(tablesPrefix):], "/", 4)

	// copy fills only as many slots as there are segments (at most three);
	// the remaining slots keep their zero value, the empty string.
	var names [3]string
	copy(names[:], segments)
	return names[0], names[1], names[2]
}
// ValidateTableBucketUploadWithClient validates the layout of fullPath and
// then uses the filer client to confirm that the addressed table exists and
// is stored in ICEBERG format.
//
// It returns nil when fullPath is not a table bucket path, when it does not
// name a bucket, namespace and table, or when all checks pass. Layout
// violations, lookup failures, missing or unparsable table metadata and a
// non-ICEBERG format are all reported as *IcebergLayoutError.
func (v *TableBucketFileValidator) ValidateTableBucketUploadWithClient(
	ctx context.Context,
	client filer_pb.SeaweedFilerClient,
	fullPath string,
) error {
	const TableFormatIceberg = "ICEBERG"

	reject := func(msg string) error {
		return &IcebergLayoutError{
			Code:    ErrCodeInvalidIcebergLayout,
			Message: msg,
		}
	}

	// Structural layout check first; it needs no filer round trip.
	if layoutErr := v.ValidateTableBucketUpload(fullPath); layoutErr != nil {
		return layoutErr
	}

	// Paths outside the table-bucket tree need nothing further.
	if !IsTableBucketPath(fullPath) {
		return nil
	}

	bucket, namespace, table := GetTableInfoFromPath(fullPath)
	if bucket == "" || namespace == "" || table == "" {
		return nil // Not deep enough to need validation
	}

	// Look up the table's own entry to inspect its extended attributes.
	parentDir, entryName := splitPath(GetTablePath(bucket, namespace, table))
	lookup := &filer_pb.LookupDirectoryEntryRequest{
		Directory: parentDir,
		Name:      entryName,
	}
	resp, lookupErr := filer_pb.LookupEntry(ctx, client, lookup)
	switch {
	case lookupErr == nil:
		// Entry found; continue with the format checks below.
	case errors.Is(lookupErr, filer_pb.ErrNotFound):
		return reject("table does not exist")
	default:
		return reject("failed to verify table existence: " + lookupErr.Error())
	}

	entry := resp.Entry
	if entry == nil || entry.Extended == nil {
		return reject("table is not a valid ICEBERG table (missing metadata)")
	}

	rawMetadata, found := entry.Extended[ExtendedKeyMetadata]
	if !found {
		return reject("table is not in ICEBERG format (missing format metadata)")
	}

	var parsed tableMetadataInternal
	if parseErr := json.Unmarshal(rawMetadata, &parsed); parseErr != nil {
		return reject("failed to parse table metadata: " + parseErr.Error())
	}

	if parsed.Format == TableFormatIceberg {
		return nil
	}
	return reject("table is not in " + TableFormatIceberg + " format")
}