seaweedfs/weed/plugin/worker/iceberg/operations.go


								package iceberg


								import (

									"bytes"

									"context"

									"encoding/json"

									"errors"

									"fmt"

									"math/rand/v2"

									"path"

									"sort"

									"strings"

									"time"


									"github.com/apache/iceberg-go"

									"github.com/apache/iceberg-go/table"

									"github.com/seaweedfs/seaweedfs/weed/glog"

									"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"

									"github.com/seaweedfs/seaweedfs/weed/s3api/s3tables"

								)


								// errStalePlan is returned by a commit mutation when the table head has

								// advanced since planning. The caller should not retry the same plan.

								var errStalePlan = errors.New("stale plan: table head changed since planning")


								// errMetadataVersionConflict is returned when the xattr update detects a

								// concurrent metadata version change (compare-and-swap failure).

								var errMetadataVersionConflict = errors.New("metadata version conflict")


								// ---------------------------------------------------------------------------

								// Operation: Expire Snapshots

								// ---------------------------------------------------------------------------


								// expireSnapshots removes old snapshots from the table metadata and cleans up

								// their manifest list files.

								func (h *Handler) expireSnapshots(

									ctx context.Context,

									filerClient filer_pb.SeaweedFilerClient,

									bucketName, tablePath string,

									config Config,

								) (string, error) {

									// Load current metadata

									meta, metadataFileName, err := loadCurrentMetadata(ctx, filerClient, bucketName, tablePath)

									if err != nil {

										return "", fmt.Errorf("load metadata: %w", err)

									}


									snapshots := meta.Snapshots()

									if len(snapshots) == 0 {

										return "no snapshots", nil

									}


									// Determine which snapshots to expire

									currentSnap := meta.CurrentSnapshot()

									var currentSnapID int64

									if currentSnap != nil {

										currentSnapID = currentSnap.SnapshotID

									}


									retentionMs := config.SnapshotRetentionHours * 3600 * 1000

									nowMs := time.Now().UnixMilli()


									// Sort snapshots by timestamp descending (most recent first) so that

									// the keep-count logic always preserves the newest snapshots.

									sorted := make([]table.Snapshot, len(snapshots))

									copy(sorted, snapshots)

									sort.Slice(sorted, func(i, j int) bool {

										return sorted[i].TimestampMs > sorted[j].TimestampMs

									})


									// Walk from newest to oldest. The current snapshot is always kept.

									// Among the remaining, keep up to MaxSnapshotsToKeep-1 (since current

									// counts toward the quota). Expire the rest only if they exceed the

									// retention window; snapshots within the window are kept regardless.

									var toExpire []int64

									var kept int64

									for _, snap := range sorted {

										if snap.SnapshotID == currentSnapID {

											kept++

											continue

										}

										age := nowMs - snap.TimestampMs

										if kept < config.MaxSnapshotsToKeep {

											kept++

											continue

										}

										if age > retentionMs {

											toExpire = append(toExpire, snap.SnapshotID)

										} else {

											kept++

										}

									}


									if len(toExpire) == 0 {

										return "no snapshots expired", nil

									}


									// Split snapshots into expired and kept sets

									expireSet := make(map[int64]struct{}, len(toExpire))

									for _, id := range toExpire {

										expireSet[id] = struct{}{}

									}

									var expiredSnaps, keptSnaps []table.Snapshot

									for _, snap := range sorted {

										if _, ok := expireSet[snap.SnapshotID]; ok {

											expiredSnaps = append(expiredSnaps, snap)

										} else {

											keptSnaps = append(keptSnaps, snap)

										}

									}


									// Collect all files referenced by each set before modifying metadata.

									// This lets us determine which files become unreferenced.

									expiredFiles, err := collectSnapshotFiles(ctx, filerClient, bucketName, tablePath, expiredSnaps)

									if err != nil {

										return "", fmt.Errorf("collect expired snapshot files: %w", err)

									}

									keptFiles, err := collectSnapshotFiles(ctx, filerClient, bucketName, tablePath, keptSnaps)

									if err != nil {

										return "", fmt.Errorf("collect kept snapshot files: %w", err)

									}


									// Normalize kept file paths for consistent comparison

									normalizedKept := make(map[string]struct{}, len(keptFiles))

									for f := range keptFiles {

										normalizedKept[normalizeIcebergPath(f, bucketName, tablePath)] = struct{}{}

									}


									// Use MetadataBuilder to remove snapshots and create new metadata

									err = h.commitWithRetry(ctx, filerClient, bucketName, tablePath, metadataFileName, config, func(currentMeta table.Metadata, builder *table.MetadataBuilder) error {

										// Guard: verify table head hasn't changed since we planned

										cs := currentMeta.CurrentSnapshot()

										if (cs == nil) != (currentSnapID == 0) || (cs != nil && cs.SnapshotID != currentSnapID) {

											return errStalePlan

										}

										return builder.RemoveSnapshots(toExpire)

									})

									if err != nil {

										return "", fmt.Errorf("commit snapshot expiration: %w", err)

									}


									// Delete files exclusively referenced by expired snapshots (best-effort)

									tableBasePath := path.Join(s3tables.TablesPath, bucketName, tablePath)

									deletedCount := 0

									for filePath := range expiredFiles {

										normalized := normalizeIcebergPath(filePath, bucketName, tablePath)

										if _, stillReferenced := normalizedKept[normalized]; stillReferenced {

											continue

										}

										dir := path.Join(tableBasePath, path.Dir(normalized))

										fileName := path.Base(normalized)

										if delErr := deleteFilerFile(ctx, filerClient, dir, fileName); delErr != nil {

											glog.Warningf("iceberg maintenance: failed to delete unreferenced file %s: %v", filePath, delErr)

										} else {

											deletedCount++

										}

									}


									return fmt.Sprintf("expired %d snapshot(s), deleted %d unreferenced file(s)", len(toExpire), deletedCount), nil

								}


								// collectSnapshotFiles returns all file paths (manifest lists, manifest files,

								// data files) referenced by the given snapshots. It returns an error if any

								// manifest list or manifest cannot be read/parsed, to prevent delete decisions

								// based on incomplete reference data.

								func collectSnapshotFiles(

									ctx context.Context,

									filerClient filer_pb.SeaweedFilerClient,

									bucketName, tablePath string,

									snapshots []table.Snapshot,

								) (map[string]struct{}, error) {

									files := make(map[string]struct{})

									for _, snap := range snapshots {

										if snap.ManifestList == "" {

											continue

										}

										files[snap.ManifestList] = struct{}{}


										manifestListData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, snap.ManifestList)

										if err != nil {

											return nil, fmt.Errorf("read manifest list %s: %w", snap.ManifestList, err)

										}

										manifests, err := iceberg.ReadManifestList(bytes.NewReader(manifestListData))

										if err != nil {

											return nil, fmt.Errorf("parse manifest list %s: %w", snap.ManifestList, err)

										}


										for _, mf := range manifests {

											files[mf.FilePath()] = struct{}{}


											manifestData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, mf.FilePath())

											if err != nil {

												return nil, fmt.Errorf("read manifest %s: %w", mf.FilePath(), err)

											}

											entries, err := iceberg.ReadManifest(mf, bytes.NewReader(manifestData), false)

											if err != nil {

												return nil, fmt.Errorf("parse manifest %s: %w", mf.FilePath(), err)

											}

											for _, entry := range entries {

												files[entry.DataFile().FilePath()] = struct{}{}

											}

										}

									}

									return files, nil

								}


								// ---------------------------------------------------------------------------

								// Operation: Remove Orphans

								// ---------------------------------------------------------------------------


								// removeOrphans finds and deletes unreferenced files from the table's

								// metadata/ and data/ directories.

								func (h *Handler) removeOrphans(

									ctx context.Context,

									filerClient filer_pb.SeaweedFilerClient,

									bucketName, tablePath string,

									config Config,

								) (string, error) {

									// Load current metadata

									meta, metadataFileName, err := loadCurrentMetadata(ctx, filerClient, bucketName, tablePath)

									if err != nil {

										return "", fmt.Errorf("load metadata: %w", err)

									}


									// Collect all referenced files from all snapshots

									referencedFiles, err := collectSnapshotFiles(ctx, filerClient, bucketName, tablePath, meta.Snapshots())

									if err != nil {

										return "", fmt.Errorf("collect referenced files: %w", err)

									}


									// Reference the active metadata file so it is not treated as orphan

									referencedFiles[path.Join("metadata", metadataFileName)] = struct{}{}


									// Also reference the current metadata files

									for mle := range meta.PreviousFiles() {

										referencedFiles[mle.MetadataFile] = struct{}{}

									}


									// Precompute a normalized lookup set so orphan checks are O(1) per file.

									normalizedRefs := make(map[string]struct{}, len(referencedFiles))

									for ref := range referencedFiles {

										normalizedRefs[ref] = struct{}{}

										normalizedRefs[normalizeIcebergPath(ref, bucketName, tablePath)] = struct{}{}

									}


									// List actual files on filer in metadata/ and data/ directories

									tableBasePath := path.Join(s3tables.TablesPath, bucketName, tablePath)

									safetyThreshold := time.Now().Add(-time.Duration(config.OrphanOlderThanHours) * time.Hour)

									orphanCount := 0


									for _, subdir := range []string{"metadata", "data"} {

										dirPath := path.Join(tableBasePath, subdir)

										fileEntries, err := walkFilerEntries(ctx, filerClient, dirPath)

										if err != nil {

											glog.V(2).Infof("iceberg maintenance: cannot walk %s: %v", dirPath, err)

											continue

										}


										for _, fe := range fileEntries {

											entry := fe.Entry

											// Build relative path from the table base (e.g. "data/region=us/file.parquet")

											fullPath := path.Join(fe.Dir, entry.Name)

											relPath := strings.TrimPrefix(fullPath, tableBasePath+"/")


											_, isReferenced := normalizedRefs[relPath]


											if isReferenced {

												continue

											}


											// Check safety window — skip entries with unknown age

											if entry.Attributes == nil {

												continue

											}

											mtime := time.Unix(entry.Attributes.Mtime, 0)

											if mtime.After(safetyThreshold) {

												continue

											}


											// Delete orphan

											if delErr := deleteFilerFile(ctx, filerClient, fe.Dir, entry.Name); delErr != nil {

												glog.Warningf("iceberg maintenance: failed to delete orphan %s/%s: %v", fe.Dir, entry.Name, delErr)

											} else {

												orphanCount++

											}

										}

									}


									return fmt.Sprintf("removed %d orphan file(s)", orphanCount), nil

								}


								// ---------------------------------------------------------------------------

								// Operation: Rewrite Manifests

								// ---------------------------------------------------------------------------


								// rewriteManifests merges small manifests into fewer, larger ones.

								func (h *Handler) rewriteManifests(

									ctx context.Context,

									filerClient filer_pb.SeaweedFilerClient,

									bucketName, tablePath string,

									config Config,

								) (string, error) {

									// Load current metadata

									meta, metadataFileName, err := loadCurrentMetadata(ctx, filerClient, bucketName, tablePath)

									if err != nil {

										return "", fmt.Errorf("load metadata: %w", err)

									}


									currentSnap := meta.CurrentSnapshot()

									if currentSnap == nil || currentSnap.ManifestList == "" {

										return "no current snapshot", nil

									}


									// Read manifest list

									manifestListData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, currentSnap.ManifestList)

									if err != nil {

										return "", fmt.Errorf("read manifest list: %w", err)

									}


									manifests, err := iceberg.ReadManifestList(bytes.NewReader(manifestListData))

									if err != nil {

										return "", fmt.Errorf("parse manifest list: %w", err)

									}


									if int64(len(manifests)) < config.MinInputFiles {

										return fmt.Sprintf("only %d manifests, below threshold of %d", len(manifests), config.MinInputFiles), nil

									}


									// Collect all entries from data manifests, grouped by partition spec ID

									// so we write one merged manifest per spec (required for spec-evolved tables).

									type specEntries struct {

										specID  int32

										spec    iceberg.PartitionSpec

										entries []iceberg.ManifestEntry

									}

									specMap := make(map[int32]*specEntries)


									// Build a lookup from spec ID to PartitionSpec

									specByID := make(map[int]iceberg.PartitionSpec)

									for _, ps := range meta.PartitionSpecs() {

										specByID[ps.ID()] = ps

									}


									for _, mf := range manifests {

										if mf.ManifestContent() != iceberg.ManifestContentData {

											continue

										}

										manifestData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, mf.FilePath())

										if err != nil {

											return "", fmt.Errorf("read manifest %s: %w", mf.FilePath(), err)

										}

										entries, err := iceberg.ReadManifest(mf, bytes.NewReader(manifestData), true)

										if err != nil {

											return "", fmt.Errorf("parse manifest %s: %w", mf.FilePath(), err)

										}


										sid := mf.PartitionSpecID()

										se, ok := specMap[sid]

										if !ok {

											ps, found := specByID[int(sid)]

											if !found {

												return "", fmt.Errorf("partition spec %d not found in table metadata", sid)

											}

											se = &specEntries{specID: sid, spec: ps}

											specMap[sid] = se

										}

										se.entries = append(se.entries, entries...)

									}


									if len(specMap) == 0 {

										return "no data entries to rewrite", nil

									}


									schema := meta.CurrentSchema()

									version := meta.Version()

									snapshotID := currentSnap.SnapshotID

									newSnapshotID := time.Now().UnixMilli()

									newSeqNum := currentSnap.SequenceNumber + 1

									metaDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "metadata")


									// Track written artifacts so we can clean them up if the commit fails.

									type artifact struct {

										dir, fileName string

									}

									var writtenArtifacts []artifact

									committed := false


									defer func() {

										if committed || len(writtenArtifacts) == 0 {

											return

										}

										cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)

										defer cancel()

										for _, a := range writtenArtifacts {

											if err := deleteFilerFile(cleanupCtx, filerClient, a.dir, a.fileName); err != nil {

												glog.Warningf("iceberg rewrite-manifests: failed to clean up artifact %s/%s: %v", a.dir, a.fileName, err)

											}

										}

									}()


									// Write one merged manifest per partition spec

									var newManifests []iceberg.ManifestFile

									totalEntries := 0

									for _, se := range specMap {

										totalEntries += len(se.entries)

										manifestFileName := fmt.Sprintf("merged-%d-spec%d-%d.avro", newSnapshotID, se.specID, time.Now().UnixMilli())

										manifestPath := path.Join("metadata", manifestFileName)


										var manifestBuf bytes.Buffer

										mergedManifest, err := iceberg.WriteManifest(

											manifestPath,

											&manifestBuf,

											version,

											se.spec,

											schema,

											newSnapshotID,

											se.entries,

										)

										if err != nil {

											return "", fmt.Errorf("write merged manifest for spec %d: %w", se.specID, err)

										}


										if err := saveFilerFile(ctx, filerClient, metaDir, manifestFileName, manifestBuf.Bytes()); err != nil {

											return "", fmt.Errorf("save merged manifest for spec %d: %w", se.specID, err)

										}

										writtenArtifacts = append(writtenArtifacts, artifact{dir: metaDir, fileName: manifestFileName})

										newManifests = append(newManifests, mergedManifest)

									}


									// Include any delete manifests that were not rewritten

									for _, mf := range manifests {

										if mf.ManifestContent() != iceberg.ManifestContentData {

											newManifests = append(newManifests, mf)

										}

									}


									var manifestListBuf bytes.Buffer

									err = iceberg.WriteManifestList(version, &manifestListBuf, newSnapshotID, &snapshotID, &newSeqNum, 0, newManifests)

									if err != nil {

										return "", fmt.Errorf("write manifest list: %w", err)

									}


									// Save new manifest list

									manifestListFileName := fmt.Sprintf("snap-%d-%d.avro", newSnapshotID, time.Now().UnixMilli())

									if err := saveFilerFile(ctx, filerClient, metaDir, manifestListFileName, manifestListBuf.Bytes()); err != nil {

										return "", fmt.Errorf("save manifest list: %w", err)

									}

									writtenArtifacts = append(writtenArtifacts, artifact{dir: metaDir, fileName: manifestListFileName})


									// Create new snapshot with the rewritten manifest list

									manifestListLocation := path.Join("metadata", manifestListFileName)


									err = h.commitWithRetry(ctx, filerClient, bucketName, tablePath, metadataFileName, config, func(currentMeta table.Metadata, builder *table.MetadataBuilder) error {

										// Guard: verify table head hasn't advanced since we planned.

										// The merged manifest and manifest list were built against snapshotID;

										// if the head moved, they reference stale state.

										cs := currentMeta.CurrentSnapshot()

										if cs == nil || cs.SnapshotID != snapshotID {

											return errStalePlan

										}


										newSnapshot := &table.Snapshot{

											SnapshotID:       newSnapshotID,

											ParentSnapshotID: &snapshotID,

											SequenceNumber:   cs.SequenceNumber + 1,

											TimestampMs:      time.Now().UnixMilli(),

											ManifestList:     manifestListLocation,

											Summary: &table.Summary{

												Operation:  table.OpReplace,

												Properties: map[string]string{"maintenance": "rewrite_manifests"},

											},

											SchemaID: func() *int {

												id := schema.ID

												return &id

											}(),

										}

										if err := builder.AddSnapshot(newSnapshot); err != nil {

											return err

										}

										return builder.SetSnapshotRef(

											table.MainBranch,

											newSnapshotID,

											table.BranchRef,

										)

									})

									if err != nil {

										return "", fmt.Errorf("commit manifest rewrite: %w", err)

									}


									committed = true

									return fmt.Sprintf("rewrote %d manifests into %d (%d entries)", len(manifests), len(specMap), totalEntries), nil

								}


								// ---------------------------------------------------------------------------

								// Commit Protocol with Retry

								// ---------------------------------------------------------------------------


								// commitWithRetry implements optimistic concurrency for metadata updates.

								// It reads the current metadata, applies the mutation, writes a new metadata

								// file, and updates the table entry. On version conflict, it retries.

								func (h *Handler) commitWithRetry(

									ctx context.Context,

									filerClient filer_pb.SeaweedFilerClient,

									bucketName, tablePath, currentMetadataFileName string,

									config Config,

									mutate func(currentMeta table.Metadata, builder *table.MetadataBuilder) error,

								) error {

									maxRetries := config.MaxCommitRetries

									if maxRetries <= 0 || maxRetries > 20 {

										maxRetries = defaultMaxCommitRetries

									}


									for attempt := int64(0); attempt < maxRetries; attempt++ {

										if attempt > 0 {

											backoff := time.Duration(50*(1<<(attempt-1))) * time.Millisecond // exponential: 50ms, 100ms, 200ms, ...

											const maxBackoff = 5 * time.Second

											if backoff > maxBackoff {

												backoff = maxBackoff

											}

											jitter := time.Duration(rand.Int64N(int64(backoff) / 5)) // 0–20% of backoff

											timer := time.NewTimer(backoff + jitter)

											select {

											case <-timer.C:

											case <-ctx.Done():

												timer.Stop()

												return ctx.Err()

											}

										}


										// Load current metadata

										meta, metaFileName, err := loadCurrentMetadata(ctx, filerClient, bucketName, tablePath)

										if err != nil {

											return fmt.Errorf("load metadata (attempt %d): %w", attempt, err)

										}


										// Build new metadata — pass the current metadata file path so the

										// metadata log correctly records where the previous version lives.

										currentMetaFilePath := path.Join("metadata", metaFileName)

										builder, err := table.MetadataBuilderFromBase(meta, currentMetaFilePath)

										if err != nil {

											return fmt.Errorf("create metadata builder (attempt %d): %w", attempt, err)

										}


										// Apply the mutation

										if err := mutate(meta, builder); err != nil {

											return fmt.Errorf("apply mutation (attempt %d): %w", attempt, err)

										}


										if !builder.HasChanges() {

											return nil // nothing to commit

										}


										newMeta, err := builder.Build()

										if err != nil {

											return fmt.Errorf("build metadata (attempt %d): %w", attempt, err)

										}


										// Serialize

										metadataBytes, err := json.Marshal(newMeta)

										if err != nil {

											return fmt.Errorf("marshal metadata (attempt %d): %w", attempt, err)

										}


										// Determine new metadata file name. Include a timestamp suffix so

										// concurrent writers stage to distinct files instead of clobbering.

										currentVersion := extractMetadataVersion(metaFileName)

										newVersion := currentVersion + 1

										newMetadataFileName := fmt.Sprintf("v%d-%d.metadata.json", newVersion, time.Now().UnixNano())


										// Save new metadata file

										metaDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "metadata")

										if err := saveFilerFile(ctx, filerClient, metaDir, newMetadataFileName, metadataBytes); err != nil {

											return fmt.Errorf("save metadata file (attempt %d): %w", attempt, err)

										}


										// Update the table entry's xattr with new metadata (CAS on version)

										tableDir := path.Join(s3tables.TablesPath, bucketName, tablePath)

										newMetadataLocation := path.Join("metadata", newMetadataFileName)

										err = updateTableMetadataXattr(ctx, filerClient, tableDir, currentVersion, metadataBytes, newMetadataLocation)

										if err != nil {

											// Use a detached context for cleanup so staged files are removed

											// even if the original context was canceled.

											cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 10*time.Second)

											if !errors.Is(err, errMetadataVersionConflict) {

												// Non-conflict error (permissions, transport, etc.): fail immediately.

												_ = deleteFilerFile(cleanupCtx, filerClient, metaDir, newMetadataFileName)

												cleanupCancel()

												return fmt.Errorf("update table xattr (attempt %d): %w", attempt, err)

											}

											// Version conflict: clean up the new metadata file and retry

											_ = deleteFilerFile(cleanupCtx, filerClient, metaDir, newMetadataFileName)

											cleanupCancel()

											if attempt < maxRetries-1 {

												glog.V(1).Infof("iceberg maintenance: version conflict on %s/%s, retrying (attempt %d)", bucketName, tablePath, attempt)

												continue

											}

											return fmt.Errorf("update table xattr (attempt %d): %w", attempt, err)

										}


										return nil

									}


									return fmt.Errorf("exceeded max commit retries (%d)", maxRetries)

								}