Browse Source
iceberg: add delete file rewrite maintenance (#8664)
* iceberg: add delete file rewrite maintenance * iceberg: preserve untouched delete files during rewrites * iceberg: share detection threshold defaults * iceberg: add partition-scoped maintenance filters (#8665) * iceberg: add partition-scoped maintenance filters * iceberg: tighten where-filter partition matching
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 2140 additions and 117 deletions
-
59weed/plugin/worker/iceberg/compact.go
-
139weed/plugin/worker/iceberg/config.go
-
595weed/plugin/worker/iceberg/delete_rewrite.go
-
126weed/plugin/worker/iceberg/detection.go
-
512weed/plugin/worker/iceberg/exec_test.go
-
144weed/plugin/worker/iceberg/handler.go
-
35weed/plugin/worker/iceberg/handler_test.go
-
47weed/plugin/worker/iceberg/operations.go
-
2weed/plugin/worker/iceberg/planning_index.go
-
311weed/plugin/worker/iceberg/where_filter.go
-
287weed/plugin/worker/iceberg/where_filter_test.go
@ -0,0 +1,595 @@ |
|||
package iceberg |
|||
|
|||
import ( |
|||
"bytes" |
|||
"context" |
|||
"fmt" |
|||
"math" |
|||
"path" |
|||
"sort" |
|||
"time" |
|||
|
|||
"github.com/apache/iceberg-go" |
|||
"github.com/apache/iceberg-go/table" |
|||
"github.com/parquet-go/parquet-go" |
|||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/s3api/s3tables" |
|||
) |
|||
|
|||
// deleteRewriteInput is one position-delete file feeding a rewrite, together
// with the single data file it references and its delete positions.
type deleteRewriteInput struct {
	Entry          iceberg.ManifestEntry
	ReferencedPath string  // normalized path of the referenced data file
	Positions      []int64 // row positions, sorted ascending by the collector
}

// deleteRewriteGroup collects all position-delete files that target the same
// data file within one partition of one partition spec.
type deleteRewriteGroup struct {
	SpecID         int32
	Partition      map[int]any
	PartitionKey   string // stable string form of Partition, used in group keys
	ReferencedPath string
	Inputs         []deleteRewriteInput
	TotalSize      int64 // sum of the input delete files' sizes in bytes
}

// positionDeleteRow matches the Iceberg position-delete Parquet schema:
// (file_path, pos).
type positionDeleteRow struct {
	FilePath string `parquet:"file_path"`
	Pos      int64  `parquet:"pos"`
}
|||
|
|||
func hasEligibleDeleteRewrite( |
|||
ctx context.Context, |
|||
filerClient filer_pb.SeaweedFilerClient, |
|||
bucketName, tablePath string, |
|||
manifests []iceberg.ManifestFile, |
|||
config Config, |
|||
meta table.Metadata, |
|||
predicate *partitionPredicate, |
|||
) (bool, error) { |
|||
groups, _, err := collectDeleteRewriteGroups(ctx, filerClient, bucketName, tablePath, manifests) |
|||
if err != nil { |
|||
return false, err |
|||
} |
|||
for _, group := range groups { |
|||
if predicate != nil { |
|||
spec, ok := specByID(meta)[int(group.SpecID)] |
|||
if !ok { |
|||
continue |
|||
} |
|||
match, err := predicate.Matches(spec, group.Partition) |
|||
if err != nil { |
|||
return false, err |
|||
} |
|||
if !match { |
|||
continue |
|||
} |
|||
} |
|||
if groupEligibleForRewrite(group, config) { |
|||
return true, nil |
|||
} |
|||
} |
|||
return false, nil |
|||
} |
|||
|
|||
// collectDeleteRewriteGroups scans every delete manifest in the snapshot and
// groups its position-delete entries by (spec ID, partition, referenced data
// file). It also returns the complete list of position-delete entries it saw
// (grouped or not) so a rewrite can carry untouched delete files forward.
//
// Only delete files that reference exactly one data file are grouped; files
// spanning multiple data files appear in the returned entry list but are
// skipped for rewriting (Phase 1 limitation).
func collectDeleteRewriteGroups(
	ctx context.Context,
	filerClient filer_pb.SeaweedFilerClient,
	bucketName, tablePath string,
	manifests []iceberg.ManifestFile,
) (map[string]*deleteRewriteGroup, []iceberg.ManifestEntry, error) {
	groups := make(map[string]*deleteRewriteGroup)
	var allPositionEntries []iceberg.ManifestEntry

	for _, mf := range manifests {
		if mf.ManifestContent() != iceberg.ManifestContentDeletes {
			continue
		}

		manifestData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, mf.FilePath())
		if err != nil {
			return nil, nil, fmt.Errorf("read delete manifest %s: %w", mf.FilePath(), err)
		}
		entries, err := iceberg.ReadManifest(mf, bytes.NewReader(manifestData), true)
		if err != nil {
			return nil, nil, fmt.Errorf("parse delete manifest %s: %w", mf.FilePath(), err)
		}

		for _, entry := range entries {
			if entry.DataFile().ContentType() != iceberg.EntryContentPosDeletes {
				continue
			}

			// Track every position-delete entry, even ones not grouped below.
			allPositionEntries = append(allPositionEntries, entry)

			fileDeletes, err := readPositionDeleteFile(ctx, filerClient, bucketName, tablePath, entry.DataFile().FilePath())
			if err != nil {
				return nil, nil, fmt.Errorf("read position delete file %s: %w", entry.DataFile().FilePath(), err)
			}
			if len(fileDeletes) != 1 {
				// Phase 1 only rewrites files that target a single data file.
				continue
			}

			// Exactly one key in the map; extract it and its positions.
			var referencedPath string
			var positions []int64
			for fp, pos := range fileDeletes {
				referencedPath = normalizeIcebergPath(fp, bucketName, tablePath)
				positions = append(positions, pos...)
			}
			sort.Slice(positions, func(i, j int) bool { return positions[i] < positions[j] })

			// NUL-separated key keeps spec, partition, and path unambiguous.
			partKey := partitionKey(entry.DataFile().Partition())
			groupKey := fmt.Sprintf("spec%d\x00%s\x00%s", entry.DataFile().SpecID(), partKey, referencedPath)
			group, ok := groups[groupKey]
			if !ok {
				group = &deleteRewriteGroup{
					SpecID:         entry.DataFile().SpecID(),
					Partition:      entry.DataFile().Partition(),
					PartitionKey:   partKey,
					ReferencedPath: referencedPath,
				}
				groups[groupKey] = group
			}
			group.Inputs = append(group.Inputs, deleteRewriteInput{
				Entry:          entry,
				ReferencedPath: referencedPath,
				Positions:      positions,
			})
			group.TotalSize += entry.DataFile().FileSizeBytes()
		}
	}

	return groups, allPositionEntries, nil
}
|||
|
|||
func groupEligibleForRewrite(group *deleteRewriteGroup, config Config) bool { |
|||
if group == nil { |
|||
return false |
|||
} |
|||
if len(group.Inputs) < 2 { |
|||
return false |
|||
} |
|||
if group.TotalSize > config.DeleteMaxFileGroupSizeBytes { |
|||
return false |
|||
} |
|||
target := config.DeleteTargetFileSizeBytes |
|||
if target <= 0 { |
|||
target = defaultDeleteTargetFileSizeMB * 1024 * 1024 |
|||
} |
|||
outputFiles := int64(estimatedDeleteOutputFiles(group.TotalSize, target)) |
|||
if config.DeleteMaxOutputFiles > 0 && outputFiles > config.DeleteMaxOutputFiles { |
|||
return false |
|||
} |
|||
return int64(len(group.Inputs)) >= config.DeleteMinInputFiles |
|||
} |
|||
|
|||
// estimatedDeleteOutputFiles returns how many output files a rewrite of
// totalSize bytes would produce at targetSize bytes per file (ceiling
// division). Non-positive inputs collapse to a single output file.
func estimatedDeleteOutputFiles(totalSize, targetSize int64) int {
	if totalSize <= 0 || targetSize <= 0 {
		return 1
	}
	files := int(math.Ceil(float64(totalSize) / float64(targetSize)))
	if files < 1 {
		files = 1
	}
	return files
}
|||
|
|||
func manifestEntrySeqNum(entry iceberg.ManifestEntry) *int64 { |
|||
seqNum := entry.SequenceNum() |
|||
if seqNum < 0 { |
|||
return nil |
|||
} |
|||
return &seqNum |
|||
} |
|||
|
|||
func manifestEntryFileSeqNum(entry iceberg.ManifestEntry) *int64 { |
|||
if fileSeqNum := entry.FileSequenceNum(); fileSeqNum != nil { |
|||
value := *fileSeqNum |
|||
return &value |
|||
} |
|||
return manifestEntrySeqNum(entry) |
|||
} |
|||
|
|||
// writeManifestWithContent serializes entries into a manifest file and
// returns a ManifestFile descriptor carrying the requested content type,
// together with the serialized bytes. iceberg-go's WriteManifest always
// emits a data-content manifest, so for delete manifests the serialized Avro
// metadata is patched afterwards and the descriptor is rebuilt with the
// correct content type and actual (post-patch) byte length.
func writeManifestWithContent(
	filename string,
	version int,
	spec iceberg.PartitionSpec,
	schema *iceberg.Schema,
	snapshotID int64,
	entries []iceberg.ManifestEntry,
	content iceberg.ManifestContent,
) (iceberg.ManifestFile, []byte, error) {
	var manifestBuf bytes.Buffer
	mf, err := iceberg.WriteManifest(filename, &manifestBuf, version, spec, schema, snapshotID, entries)
	if err != nil {
		return nil, nil, err
	}

	manifestBytes := manifestBuf.Bytes()
	if content == iceberg.ManifestContentDeletes {
		// Rewrite the "content" metadata value from "data" to "deletes".
		manifestBytes, err = patchManifestContentBytesToDeletes(manifestBytes)
		if err != nil {
			return nil, nil, err
		}
	}

	// Rebuild the descriptor: the patch changes the byte length, and the
	// content type must reflect the caller's request. Row/file counts and
	// partition summaries are copied from the original descriptor.
	rebuilt := iceberg.NewManifestFile(version, filename, int64(len(manifestBytes)), int32(spec.ID()), snapshotID).
		Content(content).
		AddedFiles(mf.AddedDataFiles()).
		ExistingFiles(mf.ExistingDataFiles()).
		DeletedFiles(mf.DeletedDataFiles()).
		AddedRows(mf.AddedRows()).
		ExistingRows(mf.ExistingRows()).
		DeletedRows(mf.DeletedRows()).
		Partitions(mf.Partitions()).
		Build()
	return rebuilt, manifestBytes, nil
}
|||
|
|||
// patchManifestContentBytesToDeletes rewrites the Avro header of a serialized
// manifest so its "content" metadata reads "deletes" instead of "data".
// iceberg-go's WriteManifest only emits data-content manifests, so delete
// manifests must be patched after serialization.
//
// The markers are the Avro-encoded key/value pair: a zig-zag varint length
// byte (0x0e == 7 for "content"), the key, then the length-prefixed value
// (0x08 == 4 for "data"; 0x0e == 7 for "deletes"). Exactly one occurrence is
// replaced; a missing marker is an error so an unpatched manifest can never
// be committed silently.
//
// Fix over the previous revision: the replacement slice is no longer named
// `new`, which shadowed the builtin, and the failure check is an explicit
// bytes.Contains guard instead of comparing the result against the input.
func patchManifestContentBytesToDeletes(manifestBytes []byte) ([]byte, error) {
	oldMarker := append([]byte{0x0e}, []byte("content")...)
	oldMarker = append(oldMarker, 0x08)
	oldMarker = append(oldMarker, []byte("data")...)

	newMarker := append([]byte{0x0e}, []byte("content")...)
	newMarker = append(newMarker, 0x0e)
	newMarker = append(newMarker, []byte("deletes")...)

	if !bytes.Contains(manifestBytes, oldMarker) {
		return nil, fmt.Errorf("delete manifest content patch failed")
	}
	return bytes.Replace(manifestBytes, oldMarker, newMarker, 1), nil
}
|||
|
|||
func writePositionDeleteFile(rows []positionDeleteRow) ([]byte, error) { |
|||
var buf bytes.Buffer |
|||
writer := parquet.NewWriter(&buf, parquet.SchemaOf(new(positionDeleteRow))) |
|||
for _, row := range rows { |
|||
if err := writer.Write(&row); err != nil { |
|||
return nil, fmt.Errorf("write position delete row: %w", err) |
|||
} |
|||
} |
|||
if err := writer.Close(); err != nil { |
|||
return nil, fmt.Errorf("close position delete file: %w", err) |
|||
} |
|||
return buf.Bytes(), nil |
|||
} |
|||
|
|||
// rewritePositionDeleteFiles compacts small position-delete files: for every
// (spec, partition, referenced data file) group that passes the eligibility
// thresholds and the optional where filter, it merges the group's delete
// positions into fewer, larger Parquet files, then commits a REPLACE
// snapshot whose delete manifests mark the old files DELETED, the new files
// ADDED, and every untouched delete file (position or equality) EXISTING.
//
// Returns a human-readable summary, a metrics map, and an error. Artifacts
// written before a failed commit are cleaned up on a best-effort basis.
func (h *Handler) rewritePositionDeleteFiles(
	ctx context.Context,
	filerClient filer_pb.SeaweedFilerClient,
	bucketName, tablePath string,
	config Config,
) (string, map[string]int64, error) {
	start := time.Now()
	meta, metadataFileName, err := loadCurrentMetadata(ctx, filerClient, bucketName, tablePath)
	if err != nil {
		return "", nil, fmt.Errorf("load metadata: %w", err)
	}

	currentSnap := meta.CurrentSnapshot()
	if currentSnap == nil || currentSnap.ManifestList == "" {
		return "no current snapshot", nil, nil
	}

	manifestListData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, currentSnap.ManifestList)
	if err != nil {
		return "", nil, fmt.Errorf("read manifest list: %w", err)
	}
	manifests, err := iceberg.ReadManifestList(bytes.NewReader(manifestListData))
	if err != nil {
		return "", nil, fmt.Errorf("parse manifest list: %w", err)
	}

	// Partition the snapshot's manifests: data manifests pass through
	// untouched; equality-delete entries are collected so they can be
	// preserved as EXISTING in the rewritten delete manifests.
	var dataManifests []iceberg.ManifestFile
	var allEqualityEntries []iceberg.ManifestEntry
	for _, mf := range manifests {
		switch mf.ManifestContent() {
		case iceberg.ManifestContentData:
			dataManifests = append(dataManifests, mf)
		case iceberg.ManifestContentDeletes:
			manifestData, readErr := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, mf.FilePath())
			if readErr != nil {
				return "", nil, fmt.Errorf("read delete manifest %s: %w", mf.FilePath(), readErr)
			}
			entries, parseErr := iceberg.ReadManifest(mf, bytes.NewReader(manifestData), true)
			if parseErr != nil {
				return "", nil, fmt.Errorf("parse delete manifest %s: %w", mf.FilePath(), parseErr)
			}
			for _, entry := range entries {
				if entry.DataFile().ContentType() == iceberg.EntryContentEqDeletes {
					allEqualityEntries = append(allEqualityEntries, entry)
				}
			}
		}
	}

	groupMap, allPositionEntries, err := collectDeleteRewriteGroups(ctx, filerClient, bucketName, tablePath, manifests)
	if err != nil {
		return "", nil, err
	}
	if len(groupMap) == 0 {
		return "no position delete files eligible for rewrite", nil, nil
	}

	// Track every file written so a failed run can clean up after itself.
	type artifact struct {
		dir, fileName string
	}
	var writtenArtifacts []artifact
	committed := false
	defer func() {
		if committed || len(writtenArtifacts) == 0 {
			return
		}
		// Fresh context: ctx may already be canceled when cleanup runs.
		cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		for _, a := range writtenArtifacts {
			if err := deleteFilerFile(cleanupCtx, filerClient, a.dir, a.fileName); err != nil {
				glog.Warningf("iceberg delete rewrite: failed to clean up artifact %s/%s: %v", a.dir, a.fileName, err)
			}
		}
	}()

	specByID := specByID(meta)
	predicate, err := parsePartitionPredicate(config.Where, meta)
	if err != nil {
		return "", nil, err
	}

	// New manifest entries accumulate per partition spec: each output
	// manifest is written against exactly one spec.
	type specEntries struct {
		specID  int32
		entries []iceberg.ManifestEntry
	}
	specEntriesMap := make(map[int32]*specEntries)
	addToSpec := func(specID int32, entry iceberg.ManifestEntry) {
		se, ok := specEntriesMap[specID]
		if !ok {
			se = &specEntries{specID: specID}
			specEntriesMap[specID] = se
		}
		se.entries = append(se.entries, entry)
	}

	newSnapID := time.Now().UnixMilli()
	version := meta.Version()
	snapshotID := currentSnap.SnapshotID
	seqNum := currentSnap.SequenceNumber + 1
	metaDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "metadata")
	dataDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "data")
	artifactSuffix := compactRandomSuffix()

	// Delete-file paths superseded by this rewrite (excluded from EXISTING).
	replacedPaths := make(map[string]struct{})
	var rewrittenGroups int64
	var skippedGroups int64
	var deleteFilesRewritten int64
	var deleteFilesWritten int64
	var deleteBytesRewritten int64

	// Deterministic iteration order (map order is random) keeps output file
	// naming and manifest contents reproducible.
	sortedKeys := make([]string, 0, len(groupMap))
	for key := range groupMap {
		sortedKeys = append(sortedKeys, key)
	}
	sort.Strings(sortedKeys)

	for _, key := range sortedKeys {
		group := groupMap[key]
		if predicate != nil {
			spec, ok := specByID[int(group.SpecID)]
			if !ok {
				continue
			}
			match, err := predicate.Matches(spec, group.Partition)
			if err != nil {
				return "", nil, err
			}
			if !match {
				skippedGroups++
				continue
			}
		}
		if !groupEligibleForRewrite(group, config) {
			skippedGroups++
			continue
		}
		// Merge every input's positions and sort by (file_path, pos).
		rows := make([]positionDeleteRow, 0)
		for _, input := range group.Inputs {
			for _, pos := range input.Positions {
				rows = append(rows, positionDeleteRow{FilePath: input.ReferencedPath, Pos: pos})
			}
			replacedPaths[input.Entry.DataFile().FilePath()] = struct{}{}
			deleteFilesRewritten++
			deleteBytesRewritten += input.Entry.DataFile().FileSizeBytes()
		}
		sort.Slice(rows, func(i, j int) bool {
			if rows[i].FilePath != rows[j].FilePath {
				return rows[i].FilePath < rows[j].FilePath
			}
			return rows[i].Pos < rows[j].Pos
		})

		// NOTE(review): unlike groupEligibleForRewrite, this path does not
		// fall back to defaultDeleteTargetFileSizeMB when
		// DeleteTargetFileSizeBytes <= 0 (estimatedDeleteOutputFiles then
		// returns 1, producing a single output file) — confirm intentional.
		outputFiles := estimatedDeleteOutputFiles(group.TotalSize, config.DeleteTargetFileSizeBytes)
		rowsPerFile := (len(rows) + outputFiles - 1) / outputFiles
		if rowsPerFile < 1 {
			rowsPerFile = len(rows)
		}

		// Chunk the merged rows into outputFiles roughly equal Parquet files.
		for startIdx, fileIdx := 0, 0; startIdx < len(rows); startIdx, fileIdx = startIdx+rowsPerFile, fileIdx+1 {
			endIdx := startIdx + rowsPerFile
			if endIdx > len(rows) {
				endIdx = len(rows)
			}
			outputRows := rows[startIdx:endIdx]
			deleteBytes, err := writePositionDeleteFile(outputRows)
			if err != nil {
				return "", nil, err
			}
			fileName := fmt.Sprintf("rewrite-delete-%d-%s-%d.parquet", newSnapID, artifactSuffix, deleteFilesWritten)
			if err := ensureFilerDir(ctx, filerClient, dataDir); err != nil {
				return "", nil, fmt.Errorf("ensure data dir: %w", err)
			}
			if err := saveFilerFile(ctx, filerClient, dataDir, fileName, deleteBytes); err != nil {
				return "", nil, fmt.Errorf("save rewritten delete file: %w", err)
			}
			writtenArtifacts = append(writtenArtifacts, artifact{dir: dataDir, fileName: fileName})

			spec, ok := specByID[int(group.SpecID)]
			if !ok {
				return "", nil, fmt.Errorf("partition spec %d not found", group.SpecID)
			}
			dfBuilder, err := iceberg.NewDataFileBuilder(
				spec,
				iceberg.EntryContentPosDeletes,
				path.Join("data", fileName),
				iceberg.ParquetFile,
				group.Partition,
				nil, nil,
				int64(len(outputRows)),
				int64(len(deleteBytes)),
			)
			if err != nil {
				return "", nil, fmt.Errorf("build rewritten delete file: %w", err)
			}
			entry := iceberg.NewManifestEntry(iceberg.EntryStatusADDED, &newSnapID, nil, nil, dfBuilder.Build())
			addToSpec(group.SpecID, entry)
			deleteFilesWritten++
		}

		// Mark the superseded inputs DELETED, preserving their original
		// sequence numbers so readers order them correctly.
		for _, input := range group.Inputs {
			delEntry := iceberg.NewManifestEntry(
				iceberg.EntryStatusDELETED,
				&newSnapID,
				manifestEntrySeqNum(input.Entry),
				manifestEntryFileSeqNum(input.Entry),
				input.Entry.DataFile(),
			)
			addToSpec(group.SpecID, delEntry)
		}
		rewrittenGroups++
	}

	if rewrittenGroups == 0 {
		return "no position delete files eligible for rewrite", nil, nil
	}

	// Carry equality deletes forward unchanged as EXISTING entries.
	for _, entry := range allEqualityEntries {
		existingEntry := iceberg.NewManifestEntry(
			iceberg.EntryStatusEXISTING,
			func() *int64 { id := entry.SnapshotID(); return &id }(),
			manifestEntrySeqNum(entry),
			manifestEntryFileSeqNum(entry),
			entry.DataFile(),
		)
		addToSpec(entry.DataFile().SpecID(), existingEntry)
	}

	// Carry forward position-delete files that this rewrite did not replace.
	for _, entry := range allPositionEntries {
		if _, replaced := replacedPaths[entry.DataFile().FilePath()]; replaced {
			continue
		}
		existingEntry := iceberg.NewManifestEntry(
			iceberg.EntryStatusEXISTING,
			func() *int64 { id := entry.SnapshotID(); return &id }(),
			manifestEntrySeqNum(entry),
			manifestEntryFileSeqNum(entry),
			entry.DataFile(),
		)
		addToSpec(entry.DataFile().SpecID(), existingEntry)
	}

	sortedSpecIDs := make([]int32, 0, len(specEntriesMap))
	for specID := range specEntriesMap {
		sortedSpecIDs = append(sortedSpecIDs, specID)
	}
	sort.Slice(sortedSpecIDs, func(i, j int) bool { return sortedSpecIDs[i] < sortedSpecIDs[j] })

	allManifests := make([]iceberg.ManifestFile, 0, len(dataManifests)+len(sortedSpecIDs))
	allManifests = append(allManifests, dataManifests...)

	// Write one delete-content manifest per partition spec.
	for _, specID := range sortedSpecIDs {
		spec, ok := specByID[int(specID)]
		if !ok {
			return "", nil, fmt.Errorf("partition spec %d not found", specID)
		}
		manifestName := fmt.Sprintf("rewrite-delete-%d-%s-spec%d.avro", newSnapID, artifactSuffix, specID)
		manifestPath := path.Join("metadata", manifestName)
		mf, manifestBytes, err := writeManifestWithContent(
			manifestPath,
			version,
			spec,
			meta.CurrentSchema(),
			newSnapID,
			specEntriesMap[specID].entries,
			iceberg.ManifestContentDeletes,
		)
		if err != nil {
			return "", nil, fmt.Errorf("write delete manifest for spec %d: %w", specID, err)
		}
		if err := saveFilerFile(ctx, filerClient, metaDir, manifestName, manifestBytes); err != nil {
			return "", nil, fmt.Errorf("save delete manifest for spec %d: %w", specID, err)
		}
		writtenArtifacts = append(writtenArtifacts, artifact{dir: metaDir, fileName: manifestName})
		allManifests = append(allManifests, mf)
	}

	var manifestListBuf bytes.Buffer
	if err := iceberg.WriteManifestList(version, &manifestListBuf, newSnapID, &snapshotID, &seqNum, 0, allManifests); err != nil {
		return "", nil, fmt.Errorf("write delete manifest list: %w", err)
	}
	manifestListName := fmt.Sprintf("snap-%d-%s.avro", newSnapID, artifactSuffix)
	if err := saveFilerFile(ctx, filerClient, metaDir, manifestListName, manifestListBuf.Bytes()); err != nil {
		return "", nil, fmt.Errorf("save delete manifest list: %w", err)
	}
	writtenArtifacts = append(writtenArtifacts, artifact{dir: metaDir, fileName: manifestListName})

	// Commit the REPLACE snapshot; the callback re-checks the current
	// snapshot and returns errStalePlan if another writer moved the table.
	manifestListLocation := path.Join("metadata", manifestListName)
	err = h.commitWithRetry(ctx, filerClient, bucketName, tablePath, metadataFileName, config, func(currentMeta table.Metadata, builder *table.MetadataBuilder) error {
		cs := currentMeta.CurrentSnapshot()
		if cs == nil || cs.SnapshotID != snapshotID {
			return errStalePlan
		}
		newSnapshot := &table.Snapshot{
			SnapshotID:       newSnapID,
			ParentSnapshotID: &snapshotID,
			SequenceNumber:   seqNum,
			TimestampMs:      newSnapID,
			ManifestList:     manifestListLocation,
			Summary: &table.Summary{
				Operation: table.OpReplace,
				Properties: map[string]string{
					"maintenance":            "rewrite_position_delete_files",
					"delete-files-rewritten": fmt.Sprintf("%d", deleteFilesRewritten),
					"delete-files-written":   fmt.Sprintf("%d", deleteFilesWritten),
					"delete-groups":          fmt.Sprintf("%d", rewrittenGroups),
				},
			},
			SchemaID: func() *int {
				id := meta.CurrentSchema().ID
				return &id
			}(),
		}
		if err := builder.AddSnapshot(newSnapshot); err != nil {
			return err
		}
		return builder.SetSnapshotRef(table.MainBranch, newSnapID, table.BranchRef)
	})
	if err != nil {
		return "", nil, fmt.Errorf("commit delete rewrite: %w", err)
	}

	// Success: suppress the cleanup defer and report metrics.
	committed = true
	metrics := map[string]int64{
		MetricDeleteFilesRewritten: deleteFilesRewritten,
		MetricDeleteFilesWritten:   deleteFilesWritten,
		MetricDeleteBytesRewritten: deleteBytesRewritten,
		MetricDeleteGroupsPlanned:  rewrittenGroups,
		MetricDeleteGroupsSkipped:  skippedGroups,
		MetricDurationMs:           time.Since(start).Milliseconds(),
	}
	return fmt.Sprintf(
		"rewrote %d position delete files into %d across %d group(s)",
		deleteFilesRewritten,
		deleteFilesWritten,
		rewrittenGroups,
	), metrics, nil
}
|||
@ -0,0 +1,311 @@ |
|||
package iceberg |
|||
|
|||
import ( |
|||
"fmt" |
|||
"regexp" |
|||
"strconv" |
|||
"strings" |
|||
|
|||
"github.com/apache/iceberg-go" |
|||
"github.com/apache/iceberg-go/table" |
|||
) |
|||
|
|||
var (
	// whereEqualsPattern matches "field = literal"; the literal is captured
	// raw, including any surrounding quotes.
	whereEqualsPattern = regexp.MustCompile(`^([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.+)$`)
	// whereInPattern matches "field IN (lit, lit, ...)" case-insensitively;
	// the parenthesized list body is captured for splitLiteralList.
	whereInPattern = regexp.MustCompile(`^(?i)([A-Za-z_][A-Za-z0-9_]*)\s+IN\s*\((.*)\)$`)
)

// whereClause is one parsed conjunct: Field matches when its partition value
// equals any of Literals (one literal for "=", several for IN).
type whereClause struct {
	Field    string
	Literals []string
}

// partitionPredicate is the conjunction (AND) of parsed where clauses.
type partitionPredicate struct {
	Clauses []whereClause
}
|||
|
|||
// validateWhereOperations rejects a non-blank where filter when any requested
// operation does not support partition-scoped filtering. A blank (or
// whitespace-only) filter is always valid.
func validateWhereOperations(where string, ops []string) error {
	if strings.TrimSpace(where) == "" {
		return nil
	}
	for _, op := range ops {
		switch op {
		case "compact", "rewrite_manifests", "rewrite_position_delete_files":
			// These maintenance operations honor partition predicates.
		default:
			// Name the offending operation so misconfigured jobs are easy to
			// diagnose.
			return fmt.Errorf("where filter is only supported for compact, rewrite_position_delete_files, and rewrite_manifests (got %q)", op)
		}
	}
	return nil
}
|||
|
|||
// parsePartitionPredicate compiles the user-supplied where expression into a
// partitionPredicate. A blank expression yields a nil predicate (match
// everything). It returns an error for nil metadata, unpartitioned tables,
// malformed clauses, or clause fields missing from the current partition
// spec.
func parsePartitionPredicate(where string, meta table.Metadata) (*partitionPredicate, error) {
	where = strings.TrimSpace(where)
	if where == "" {
		return nil, nil
	}
	if meta == nil {
		return nil, fmt.Errorf("where filter requires table metadata")
	}

	specs := meta.PartitionSpecs()
	if len(specs) == 0 || meta.PartitionSpec().IsUnpartitioned() {
		return nil, fmt.Errorf("where filter is not supported for unpartitioned tables")
	}

	// Split on top-level ANDs, then parse each conjunct independently.
	rawClauses := splitWhereConjunction(where)
	clauses := make([]whereClause, 0, len(rawClauses))
	for _, raw := range rawClauses {
		clause, err := parseWhereClause(raw)
		if err != nil {
			return nil, err
		}
		clauses = append(clauses, clause)
	}

	// Validate against the current partition spec only. Historical specs may
	// lack fields added during schema evolution; per-entry matching in Matches()
	// handles those gracefully.
	currentSpec := meta.PartitionSpec()
	for _, clause := range clauses {
		if !specHasFieldByName(currentSpec, clause.Field) {
			return nil, fmt.Errorf("where field %q is not present in current partition spec %d", clause.Field, currentSpec.ID())
		}
	}

	return &partitionPredicate{Clauses: clauses}, nil
}
|||
|
|||
// splitWhereConjunction breaks a where expression into its top-level
// conjuncts, splitting on case-insensitive AND keywords surrounded by
// whitespace. AND inside single- or double-quoted literals is preserved, and
// empty fragments are dropped.
func splitWhereConjunction(where string) []string {
	isSpace := func(r rune) bool {
		return r == ' ' || r == '\t' || r == '\n' || r == '\r'
	}
	var (
		parts []string
		buf   strings.Builder
		quote rune
	)
	flush := func() {
		if p := strings.TrimSpace(buf.String()); p != "" {
			parts = append(parts, p)
		}
		buf.Reset()
	}
	runes := []rune(where)
	for i := 0; i < len(runes); i++ {
		r := runes[i]
		switch {
		case quote != 0:
			// Inside a quoted literal: copy verbatim until the closing quote.
			buf.WriteRune(r)
			if r == quote {
				quote = 0
			}
		case r == '\'' || r == '"':
			quote = r
			buf.WriteRune(r)
		case (r == 'A' || r == 'a') && i+3 < len(runes) &&
			strings.EqualFold(string(runes[i:i+3]), "AND") &&
			i > 0 && isSpace(runes[i-1]) && isSpace(runes[i+3]):
			// Conjunct boundary: flush the fragment and skip "AND"; the
			// trailing space is consumed by the loop increment.
			flush()
			i += 3
		default:
			buf.WriteRune(r)
		}
	}
	flush()
	return parts
}
|||
|
|||
// isWhitespace reports whether r is an ASCII whitespace character
// (space, tab, newline, or carriage return).
func isWhitespace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
|||
|
|||
func parseWhereClause(raw string) (whereClause, error) { |
|||
raw = strings.TrimSpace(raw) |
|||
if raw == "" { |
|||
return whereClause{}, fmt.Errorf("empty where clause") |
|||
} |
|||
if matches := whereInPattern.FindStringSubmatch(raw); matches != nil { |
|||
literals, err := splitLiteralList(matches[2]) |
|||
if err != nil { |
|||
return whereClause{}, err |
|||
} |
|||
if len(literals) == 0 { |
|||
return whereClause{}, fmt.Errorf("empty IN list in where clause %q", raw) |
|||
} |
|||
return whereClause{Field: matches[1], Literals: literals}, nil |
|||
} |
|||
if matches := whereEqualsPattern.FindStringSubmatch(raw); matches != nil { |
|||
return whereClause{Field: matches[1], Literals: []string{strings.TrimSpace(matches[2])}}, nil |
|||
} |
|||
return whereClause{}, fmt.Errorf("unsupported where clause %q", raw) |
|||
} |
|||
|
|||
// splitLiteralList splits the comma-separated body of an IN (...) list while
// respecting single- and double-quoted literals. Quotes are kept in the
// returned strings, empty fragments are dropped, and an unterminated quote
// is an error.
func splitLiteralList(raw string) ([]string, error) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil, nil
	}
	var (
		out   []string
		piece strings.Builder
		open  rune
	)
	emit := func() {
		if lit := strings.TrimSpace(piece.String()); lit != "" {
			out = append(out, lit)
		}
		piece.Reset()
	}
	for _, r := range raw {
		if open != 0 {
			piece.WriteRune(r)
			if r == open {
				open = 0
			}
			continue
		}
		switch r {
		case '\'', '"':
			open = r
			piece.WriteRune(r)
		case ',':
			emit()
		default:
			piece.WriteRune(r)
		}
	}
	if open != 0 {
		return nil, fmt.Errorf("unterminated quoted literal in IN list")
	}
	emit()
	return out, nil
}
|||
|
|||
func specHasFieldByName(spec iceberg.PartitionSpec, fieldName string) bool { |
|||
for field := range spec.Fields() { |
|||
if field.Name == fieldName { |
|||
return true |
|||
} |
|||
} |
|||
return false |
|||
} |
|||
|
|||
func specByID(meta table.Metadata) map[int]iceberg.PartitionSpec { |
|||
result := make(map[int]iceberg.PartitionSpec) |
|||
if meta == nil { |
|||
return result |
|||
} |
|||
for _, spec := range meta.PartitionSpecs() { |
|||
result[spec.ID()] = spec |
|||
} |
|||
return result |
|||
} |
|||
|
|||
// Matches reports whether a partition tuple (keyed by partition field ID)
// satisfies every clause of the predicate under the given spec. A nil
// predicate matches everything. Clauses referencing fields absent from this
// spec (e.g. entries written under an older spec) yield a non-match rather
// than an error. Errors surface only from literal parsing.
func (p *partitionPredicate) Matches(spec iceberg.PartitionSpec, partition map[int]any) (bool, error) {
	if p == nil {
		return true, nil
	}

	// Re-key the partition values by field name so clauses can look them up.
	valuesByName := make(map[string]any)
	for field := range spec.Fields() {
		if value, ok := partition[field.FieldID]; ok {
			valuesByName[field.Name] = value
		}
	}

	for _, clause := range p.Clauses {
		actual, ok := valuesByName[clause.Field]
		if !ok {
			// Field not present in this spec (e.g. older spec before schema
			// evolution). Skip this entry rather than erroring.
			return false, nil
		}
		// OR semantics within a clause: any literal may match (IN lists).
		matched := false
		for _, literal := range clause.Literals {
			ok, err := literalMatchesActual(literal, actual)
			if err != nil {
				return false, fmt.Errorf("where field %q: %w", clause.Field, err)
			}
			if ok {
				matched = true
				break
			}
		}
		if !matched {
			return false, nil
		}
	}
	return true, nil
}
|||
|
|||
func literalMatchesActual(raw string, actual any) (bool, error) { |
|||
raw = strings.TrimSpace(raw) |
|||
if raw == "" { |
|||
return false, fmt.Errorf("empty literal") |
|||
} |
|||
|
|||
switch v := actual.(type) { |
|||
case string: |
|||
value, err := unquoteLiteral(raw) |
|||
if err != nil { |
|||
return false, err |
|||
} |
|||
return v == value, nil |
|||
case bool: |
|||
value, err := strconv.ParseBool(strings.ToLower(strings.TrimSpace(raw))) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse bool literal %q: %w", raw, err) |
|||
} |
|||
return v == value, nil |
|||
case int: |
|||
value, err := strconv.ParseInt(raw, 10, 64) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse int literal %q: %w", raw, err) |
|||
} |
|||
return int64(v) == value, nil |
|||
case int32: |
|||
value, err := strconv.ParseInt(raw, 10, 32) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse int32 literal %q: %w", raw, err) |
|||
} |
|||
return v == int32(value), nil |
|||
case int64: |
|||
value, err := strconv.ParseInt(raw, 10, 64) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse int64 literal %q: %w", raw, err) |
|||
} |
|||
return v == value, nil |
|||
case float32: |
|||
value, err := strconv.ParseFloat(raw, 32) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse float32 literal %q: %w", raw, err) |
|||
} |
|||
return v == float32(value), nil |
|||
case float64: |
|||
value, err := strconv.ParseFloat(raw, 64) |
|||
if err != nil { |
|||
return false, fmt.Errorf("parse float64 literal %q: %w", raw, err) |
|||
} |
|||
return v == value, nil |
|||
default: |
|||
value, err := unquoteLiteral(raw) |
|||
if err != nil { |
|||
return false, err |
|||
} |
|||
return fmt.Sprint(actual) == value, nil |
|||
} |
|||
} |
|||
|
|||
// unquoteLiteral trims surrounding whitespace and strips one matching pair of
// single or double quotes, if present. Unquoted input is returned trimmed but
// otherwise unchanged; the error result is reserved for future validation and
// is currently always nil.
func unquoteLiteral(raw string) (string, error) {
	trimmed := strings.TrimSpace(raw)
	if n := len(trimmed); n >= 2 {
		first, last := trimmed[0], trimmed[n-1]
		if (first == '\'' && last == '\'') || (first == '"' && last == '"') {
			return trimmed[1 : n-1], nil
		}
	}
	return trimmed, nil
}
|||
@ -0,0 +1,287 @@ |
|||
package iceberg |
|||
|
|||
import ( |
|||
"bytes" |
|||
"context" |
|||
"encoding/json" |
|||
"fmt" |
|||
"path" |
|||
"strings" |
|||
"testing" |
|||
"time" |
|||
|
|||
"github.com/apache/iceberg-go" |
|||
"github.com/apache/iceberg-go/table" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/s3api/s3tables" |
|||
) |
|||
|
|||
// partitionedTestFile describes one data file used by partitioned-table
// tests: its object name, its partition tuple (keyed by partition field ID),
// and the rows to write into it.
type partitionedTestFile struct {
	Name      string
	Partition map[int]any
	Rows      []struct {
		ID   int64
		Name string
	}
}
|||
|
|||
// populatePartitionedDataTable builds a complete Iceberg table fixture inside
// the fake filer: parquet data files, one manifest per manifestGroups element,
// a single manifest list ("snap-1.avro"), and v1 table metadata stored on the
// table directory's extended attributes. It returns the committed metadata so
// tests can compare against later table state.
//
// Each inner []partitionedTestFile becomes its own manifest, letting tests
// control how data files are grouped for maintenance planning.
func populatePartitionedDataTable(
	t *testing.T,
	fs *fakeFilerServer,
	setup tableSetup,
	partitionSpec iceberg.PartitionSpec,
	manifestGroups [][]partitionedTestFile,
) table.Metadata {
	t.Helper()

	schema := newTestSchema()
	meta, err := table.NewMetadata(schema, &partitionSpec, table.UnsortedSortOrder, "s3://"+setup.BucketName+"/"+setup.tablePath(), nil)
	if err != nil {
		t.Fatalf("create metadata: %v", err)
	}

	// Filer paths mirroring the S3 Tables bucket/namespace/table layout.
	bucketsPath := s3tables.TablesPath
	bucketPath := path.Join(bucketsPath, setup.BucketName)
	nsPath := path.Join(bucketPath, setup.Namespace)
	tablePath := path.Join(nsPath, setup.TableName)
	metaDir := path.Join(tablePath, "metadata")
	dataDir := path.Join(tablePath, "data")

	version := meta.Version()
	var manifestFiles []iceberg.ManifestFile
	for idx, group := range manifestGroups {
		entries := make([]iceberg.ManifestEntry, 0, len(group))
		for _, file := range group {
			// Write the physical parquet bytes first so the data-file
			// entry can record the real on-disk size.
			data := writeTestParquetFile(t, fs, dataDir, file.Name, file.Rows)
			dfBuilder, err := iceberg.NewDataFileBuilder(
				partitionSpec,
				iceberg.EntryContentData,
				path.Join("data", file.Name), // path relative to table root
				iceberg.ParquetFile,
				file.Partition,
				nil, nil,
				int64(len(file.Rows)),
				int64(len(data)),
			)
			if err != nil {
				t.Fatalf("build data file %s: %v", file.Name, err)
			}
			// All entries are ADDED under snapshot 1.
			snapID := int64(1)
			entries = append(entries, iceberg.NewManifestEntry(iceberg.EntryStatusADDED, &snapID, nil, nil, dfBuilder.Build()))
		}

		manifestName := fmt.Sprintf("where-manifest-%d.avro", idx+1)
		var manifestBuf bytes.Buffer
		mf, err := iceberg.WriteManifest(path.Join("metadata", manifestName), &manifestBuf, version, partitionSpec, schema, 1, entries)
		if err != nil {
			t.Fatalf("write manifest %d: %v", idx+1, err)
		}
		fs.putEntry(metaDir, manifestName, &filer_pb.Entry{
			Name:       manifestName,
			Content:    manifestBuf.Bytes(),
			Attributes: &filer_pb.FuseAttributes{Mtime: time.Now().Unix(), FileSize: uint64(manifestBuf.Len())},
		})
		manifestFiles = append(manifestFiles, mf)
	}

	// One manifest list referencing every manifest written above.
	var manifestListBuf bytes.Buffer
	seqNum := int64(1)
	if err := iceberg.WriteManifestList(version, &manifestListBuf, 1, nil, &seqNum, 0, manifestFiles); err != nil {
		t.Fatalf("write manifest list: %v", err)
	}
	fs.putEntry(metaDir, "snap-1.avro", &filer_pb.Entry{
		Name:       "snap-1.avro",
		Content:    manifestListBuf.Bytes(),
		Attributes: &filer_pb.FuseAttributes{Mtime: time.Now().Unix(), FileSize: uint64(manifestListBuf.Len())},
	})

	// Attach snapshot 1 to the metadata and point the main branch at it.
	builder, err := table.MetadataBuilderFromBase(meta, "s3://"+setup.BucketName+"/"+setup.tablePath())
	if err != nil {
		t.Fatalf("metadata builder: %v", err)
	}
	snapshot := table.Snapshot{SnapshotID: 1, TimestampMs: time.Now().UnixMilli(), ManifestList: "metadata/snap-1.avro", SequenceNumber: 1}
	if err := builder.AddSnapshot(&snapshot); err != nil {
		t.Fatalf("add snapshot: %v", err)
	}
	if err := builder.SetSnapshotRef(table.MainBranch, 1, table.BranchRef); err != nil {
		t.Fatalf("set snapshot ref: %v", err)
	}
	meta, err = builder.Build()
	if err != nil {
		t.Fatalf("build metadata: %v", err)
	}

	// The catalog keeps the full metadata JSON in an extended attribute on
	// the table directory, alongside a version marker.
	fullMetadataJSON, _ := json.Marshal(meta)
	internalMeta := map[string]interface{}{
		"metadataVersion":  1,
		"metadataLocation": "metadata/v1.metadata.json",
		"metadata":         map[string]interface{}{"fullMetadata": json.RawMessage(fullMetadataJSON)},
	}
	xattr, _ := json.Marshal(internalMeta)

	// Register bucket, namespace, and table directories in the fake filer.
	fs.putEntry(bucketsPath, setup.BucketName, &filer_pb.Entry{
		Name:        setup.BucketName,
		IsDirectory: true,
		Extended:    map[string][]byte{s3tables.ExtendedKeyTableBucket: []byte("true")},
	})
	fs.putEntry(bucketPath, setup.Namespace, &filer_pb.Entry{Name: setup.Namespace, IsDirectory: true})
	fs.putEntry(nsPath, setup.TableName, &filer_pb.Entry{
		Name:        setup.TableName,
		IsDirectory: true,
		Extended: map[string][]byte{
			s3tables.ExtendedKeyMetadata:        xattr,
			s3tables.ExtendedKeyMetadataVersion: metadataVersionXattr(1),
		},
	})

	return meta
}
|||
|
|||
func TestValidateWhereOperations(t *testing.T) { |
|||
if err := validateWhereOperations("name = 'us'", []string{"compact", "rewrite_manifests"}); err != nil { |
|||
t.Fatalf("unexpected validation error: %v", err) |
|||
} |
|||
if err := validateWhereOperations("name = 'us'", []string{"expire_snapshots"}); err == nil { |
|||
t.Fatal("expected where validation to reject expire_snapshots") |
|||
} |
|||
} |
|||
|
|||
func TestSplitWhereConjunctionQuoteAware(t *testing.T) { |
|||
cases := []struct { |
|||
input string |
|||
expected []string |
|||
}{ |
|||
{"a = 1 AND b = 2", []string{"a = 1", "b = 2"}}, |
|||
{"a = 'research AND dev'", []string{"a = 'research AND dev'"}}, |
|||
{"a IN ('sales AND marketing', 'eng') AND b = 2", []string{"a IN ('sales AND marketing', 'eng')", "b = 2"}}, |
|||
{"a = 1 and b = 2", []string{"a = 1", "b = 2"}}, |
|||
{"a = 'x' AND b = \"y AND z\"", []string{"a = 'x'", "b = \"y AND z\""}}, |
|||
} |
|||
for _, tc := range cases { |
|||
got := splitWhereConjunction(tc.input) |
|||
if len(got) != len(tc.expected) { |
|||
t.Errorf("splitWhereConjunction(%q) = %v, want %v", tc.input, got, tc.expected) |
|||
continue |
|||
} |
|||
for i := range got { |
|||
if got[i] != tc.expected[i] { |
|||
t.Errorf("splitWhereConjunction(%q)[%d] = %q, want %q", tc.input, i, got[i], tc.expected[i]) |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestPartitionPredicateMatchesUsesPartitionFieldIDs(t *testing.T) { |
|||
spec := iceberg.NewPartitionSpec(iceberg.PartitionField{ |
|||
SourceID: 2, |
|||
FieldID: 1000, |
|||
Name: "name", |
|||
Transform: iceberg.IdentityTransform{}, |
|||
}) |
|||
predicate := &partitionPredicate{Clauses: []whereClause{{Field: "name", Literals: []string{"'us'"}}}} |
|||
|
|||
match, err := predicate.Matches(spec, map[int]any{2: "us"}) |
|||
if err != nil { |
|||
t.Fatalf("unexpected error: %v", err) |
|||
} |
|||
if match { |
|||
t.Fatal("expected source-column key to not match partition predicate") |
|||
} |
|||
} |
|||
|
|||
// TestCompactDataFilesWhereFilter runs a partition-filtered compaction
// (Where: "name = 'us'") over a table containing two "us" files and two "eu"
// files, then verifies that only the "us" files were rewritten into a single
// compacted output while both "eu" files survive untouched.
func TestCompactDataFilesWhereFilter(t *testing.T) {
	fs, client := startFakeFiler(t)

	// Identity partition on the "name" column; partition field ID 1000.
	partitionSpec := iceberg.NewPartitionSpec(iceberg.PartitionField{
		SourceID:  2,
		FieldID:   1000,
		Name:      "name",
		Transform: iceberg.IdentityTransform{},
	})

	setup := tableSetup{BucketName: "tb", Namespace: "ns", TableName: "tbl"}
	// Three manifests: two single-file "us" manifests (giving the "us"
	// partition two compactable inputs, satisfying MinInputFiles=2 below)
	// and one manifest holding both "eu" files.
	populatePartitionedDataTable(t, fs, setup, partitionSpec, [][]partitionedTestFile{
		{
			{Name: "us-1.parquet", Partition: map[int]any{1000: "us"}, Rows: []struct {
				ID   int64
				Name string
			}{{1, "us"}}},
		},
		{
			{Name: "us-2.parquet", Partition: map[int]any{1000: "us"}, Rows: []struct {
				ID   int64
				Name string
			}{{2, "us"}}},
		},
		{
			{Name: "eu-1.parquet", Partition: map[int]any{1000: "eu"}, Rows: []struct {
				ID   int64
				Name string
			}{{3, "eu"}}},
			{Name: "eu-2.parquet", Partition: map[int]any{1000: "eu"}, Rows: []struct {
				ID   int64
				Name string
			}{{4, "eu"}}},
		},
	})

	handler := NewHandler(nil)
	config := Config{
		TargetFileSizeBytes: 256 * 1024 * 1024,
		MinInputFiles:       2,
		MaxCommitRetries:    3,
		Where:               "name = 'us'",
	}

	result, _, err := handler.compactDataFiles(context.Background(), client, setup.BucketName, setup.tablePath(), config, nil)
	if err != nil {
		t.Fatalf("compactDataFiles: %v", err)
	}
	if !strings.Contains(result, "compacted 2 files into 1") {
		t.Fatalf("unexpected result: %q", result)
	}

	// Re-read the committed metadata and walk every data manifest to
	// collect the surviving data-file paths.
	meta, _, err := loadCurrentMetadata(context.Background(), client, setup.BucketName, setup.tablePath())
	if err != nil {
		t.Fatalf("loadCurrentMetadata: %v", err)
	}
	manifests, err := loadCurrentManifests(context.Background(), client, setup.BucketName, setup.tablePath(), meta)
	if err != nil {
		t.Fatalf("loadCurrentManifests: %v", err)
	}

	var liveDataPaths []string
	for _, mf := range manifests {
		// Skip delete manifests; only data files matter here.
		if mf.ManifestContent() != iceberg.ManifestContentData {
			continue
		}
		manifestData, err := loadFileByIcebergPath(context.Background(), client, setup.BucketName, setup.tablePath(), mf.FilePath())
		if err != nil {
			t.Fatalf("load data manifest: %v", err)
		}
		entries, err := iceberg.ReadManifest(mf, bytes.NewReader(manifestData), true)
		if err != nil {
			t.Fatalf("read data manifest: %v", err)
		}
		for _, entry := range entries {
			liveDataPaths = append(liveDataPaths, entry.DataFile().FilePath())
		}
	}

	// Expect the compacted "us" output plus the two untouched "eu" files.
	if len(liveDataPaths) != 3 {
		t.Fatalf("expected 3 live data files after filtered compaction, got %v", liveDataPaths)
	}
	var compactedCount int
	for _, p := range liveDataPaths {
		switch {
		case strings.HasPrefix(p, "data/compact-"):
			compactedCount++
		case p == "data/eu-1.parquet", p == "data/eu-2.parquet":
			// Unfiltered partition's files survive under their original paths.
		default:
			t.Fatalf("unexpected live data file %q", p)
		}
	}
	if compactedCount != 1 {
		t.Fatalf("expected exactly one compacted file, got %d in %v", compactedCount, liveDataPaths)
	}
}
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue