From f3c5ba3cd6c8adb037a8001b92bccb021c3d9e30 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Fri, 13 Mar 2026 09:36:54 -0700 Subject: [PATCH] feat(filer): add lazy directory listing for remote mounts (#8615) * feat(filer): add lazy directory listing for remote mounts Directory listings on remote mounts previously only queried the local filer store. With lazy mounts the listing was empty; with eager mounts it went stale over time. Add on-demand directory listing that fetches from remote and caches results with a TTL (fixed at 5 minutes in this commit; a later commit below makes it per-mount and opt-in): - Add `ListDirectory` to `RemoteStorageClient` interface (delimiter-based, single-level listing, separate from recursive `Traverse`) - Implement in S3, GCS, and Azure backends using each platform's hierarchical listing API - Add `maybeLazyListFromRemote` to filer: before each directory listing, check if the directory is under a remote mount with an expired cache, fetch from remote, persist entries to the local store, then let existing listing logic run on the populated store - Use singleflight to deduplicate concurrent requests for the same directory - Skip local-only entries (no RemoteEntry) to avoid overwriting unsynced uploads - Errors are logged and swallowed (availability over consistency) * refactor: extract xattr key to constant xattrRemoteListingSyncedAt * feat: make listing cache TTL configurable per mount via listing_cache_ttl_seconds Add listing_cache_ttl_seconds field to RemoteStorageLocation protobuf. When 0 (default), lazy directory listing is disabled for that mount. When >0, enables on-demand directory listing with the specified TTL. Expose as -listingCacheTTL flag on remote.mount command. 
* refactor: address review feedback for lazy directory listing - Add context.Context to ListDirectory interface and all implementations - Capture startTime before remote call for accurate TTL tracking - Simplify S3 ListDirectory using ListObjectsV2PagesWithContext - Make maybeLazyListFromRemote return void (errors always swallowed) - Remove redundant trailing-slash path manipulation in caller - Update tests to match new signatures * fix(filer): when an existing entry has Remote != nil, merge the remote metadata into it rather than replacing the entry * fix(gcs): wrap ListDirectory iterator error with context The raw iterator error was returned without bucket/path context, making it harder to debug. Wrap it consistently with the S3 pattern. * fix(s3): guard against nil pointer dereference in Traverse and ListDirectory Some S3-compatible backends may return nil for LastModified, Size, or ETag fields. Check for nil before dereferencing to prevent panics. * fix(filer): remove blanket 2-minute timeout from lazy listing context Individual SDK operations (S3, GCS, Azure) already have per-request timeouts and retry policies. The blanket timeout could cut off large directory listings mid-operation even though individual pages were succeeding. * fix(filer): preserve trace context in lazy listing with WithoutCancel Use context.WithoutCancel(ctx) instead of context.Background() so trace/span values from the incoming request are retained for distributed tracing, while still decoupling cancellation. 
* fix(filer): use Store.FindEntry for internal lookups, add Uid/Gid to files, fix updateDirectoryListingSyncedAt - Use f.Store.FindEntry instead of f.FindEntry for staleness check and child lookups to avoid unnecessary lazy-fetch overhead - Set OS_UID/OS_GID on new file entries for consistency with directories - In updateDirectoryListingSyncedAt, use Store.UpdateEntry for existing directories instead of CreateEntry to avoid deleteChunksIfNotNew and NotifyUpdateEvent side effects * fix(filer): distinguish not-found from store errors in lazy listing Previously, any error from Store.FindEntry was treated as "not found," which could cause entry recreation/overwrite on transient DB failures. Now check for filer_pb.ErrNotFound explicitly and skip entries or bail out on real store errors. * refactor(filer): use errors.Is for ErrNotFound comparisons --- weed/filer/filer.go | 3 + weed/filer/filer_lazy_remote_listing.go | 208 ++++++++++++++ weed/filer/filer_lazy_remote_test.go | 271 ++++++++++++++++++ weed/pb/remote.proto | 1 + weed/pb/remote_pb/remote.pb.go | 25 +- .../azure/azure_storage_client.go | 62 ++++ weed/remote_storage/gcs/gcs_storage_client.go | 46 +++ weed/remote_storage/remote_storage.go | 2 + weed/remote_storage/s3/s3_storage_client.go | 78 ++++- weed/shell/command_remote_mount.go | 4 + 10 files changed, 687 insertions(+), 13 deletions(-) create mode 100644 weed/filer/filer_lazy_remote_listing.go diff --git a/weed/filer/filer.go b/weed/filer/filer.go index ecca13f7a..63aae9612 100644 --- a/weed/filer/filer.go +++ b/weed/filer/filer.go @@ -56,6 +56,7 @@ type Filer struct { FilerConf *FilerConf RemoteStorage *FilerRemoteStorage lazyFetchGroup singleflight.Group + lazyListGroup singleflight.Group Dlm *lock_manager.DistributedLockManager MaxFilenameLength uint32 deletionQuit chan struct{} @@ -389,6 +390,8 @@ func (f *Filer) FindEntry(ctx context.Context, p util.FullPath) (entry *Entry, e } func (f *Filer) doListDirectoryEntries(ctx context.Context, p util.FullPath, 
startFileName string, inclusive bool, limit int64, prefix string, eachEntryFunc ListEachEntryFunc) (expiredCount int64, lastFileName string, err error) { + f.maybeLazyListFromRemote(ctx, p) + // Collect expired entries during iteration to avoid deadlock with DB connection pool var expiredEntries []*Entry var s3ExpiredEntries []*Entry diff --git a/weed/filer/filer_lazy_remote_listing.go b/weed/filer/filer_lazy_remote_listing.go new file mode 100644 index 000000000..c6d827157 --- /dev/null +++ b/weed/filer/filer_lazy_remote_listing.go @@ -0,0 +1,208 @@ +package filer + +import ( + "context" + "errors" + "fmt" + "os" + "strconv" + "time" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" + "github.com/seaweedfs/seaweedfs/weed/util" +) + +const xattrRemoteListingSyncedAt = "remote.listing.synced_at" + +type lazyListContextKey struct{} + +// maybeLazyListFromRemote populates the local filer store with entries from the +// remote storage backend for directory p if the following conditions hold: +// - p is under a remote mount with listing_cache_ttl_seconds > 0 +// - the cached listing has expired (based on the per-mount TTL) +// +// When listing_cache_ttl_seconds is 0 (the default), lazy listing is disabled +// for that mount. +// +// On success it updates the directory's xattrRemoteListingSyncedAt extended +// attribute so subsequent calls within the TTL window are no-ops. +// +// Errors are logged and swallowed (availability over consistency). +func (f *Filer) maybeLazyListFromRemote(ctx context.Context, p util.FullPath) { + // Prevent recursion: CreateEntry → FindEntry → doListDirectoryEntries → here + if ctx.Value(lazyListContextKey{}) != nil { + return + } + // Also respect the lazy-fetch guard to prevent mutual recursion + if ctx.Value(lazyFetchContextKey{}) != nil { + return + } + + if f.RemoteStorage == nil { + return + } + + // The ptrie stores mount rules with trailing "/". 
When p is exactly the + // mount directory (e.g. "/buckets/mybucket"), we must also try matching + // with a trailing "/" so the trie recognizes the mount root. + lookupPath := p + mountDir, remoteLoc := f.RemoteStorage.FindMountDirectory(lookupPath) + if remoteLoc == nil { + lookupPath = util.FullPath(string(p) + "/") + mountDir, remoteLoc = f.RemoteStorage.FindMountDirectory(lookupPath) + if remoteLoc == nil { + return + } + } + + // Lazy listing is opt-in: disabled when TTL is 0 + if remoteLoc.ListingCacheTtlSeconds <= 0 { + return + } + cacheTTL := time.Duration(remoteLoc.ListingCacheTtlSeconds) * time.Second + + // Check staleness: read the directory entry's extended attributes. + // Use Store.FindEntry directly — we only need the local xattr, not lazy-fetch. + dirEntry, _ := f.Store.FindEntry(ctx, p) + if dirEntry != nil { + if syncedAtStr, ok := dirEntry.Extended[xattrRemoteListingSyncedAt]; ok { + if syncedAt, err := strconv.ParseInt(string(syncedAtStr), 10, 64); err == nil { + if time.Since(time.Unix(syncedAt, 0)) < cacheTTL { + return + } + } + } + } + + client, _, found := f.RemoteStorage.FindRemoteStorageClient(lookupPath) + if !found { + return + } + + key := "list:" + string(p) + f.lazyListGroup.Do(key, func() (interface{}, error) { + startTime := time.Now() + objectLoc := MapFullPathToRemoteStorageLocation(mountDir, remoteLoc, p) + + // Decouple from the caller's cancellation/deadline while preserving + // trace/span values for distributed tracing. 
+ persistCtx := context.WithValue(context.WithoutCancel(ctx), lazyListContextKey{}, true) + persistCtx = context.WithValue(persistCtx, lazyFetchContextKey{}, true) + + listErr := client.ListDirectory(persistCtx, objectLoc, func(dir string, name string, isDirectory bool, remoteEntry *filer_pb.RemoteEntry) error { + childPath := p.Child(name) + + existingEntry, findErr := f.Store.FindEntry(persistCtx, childPath) + if findErr != nil && !errors.Is(findErr, filer_pb.ErrNotFound) { + glog.Warningf("maybeLazyListFromRemote: find %s: %v", childPath, findErr) + return nil // skip this entry on transient store error + } + + // Skip entries that exist locally without a RemoteEntry (local-only uploads) + if existingEntry != nil && existingEntry.Remote == nil { + return nil + } + + if existingEntry != nil { + // Merge: update remote metadata while preserving local state + // (Chunks, Extended, Uid/Gid/Mode, etc.) + existingEntry.Remote = remoteEntry + if !isDirectory && remoteEntry != nil { + if remoteEntry.RemoteMtime > 0 { + existingEntry.Attr.Mtime = time.Unix(remoteEntry.RemoteMtime, 0) + } + existingEntry.Attr.FileSize = uint64(remoteEntry.RemoteSize) + } + if saveErr := f.Store.UpdateEntry(persistCtx, existingEntry); saveErr != nil { + glog.Warningf("maybeLazyListFromRemote: update %s: %v", childPath, saveErr) + } + } else { + // New entry not yet in local store + var entry *Entry + if isDirectory { + now := time.Now() + entry = &Entry{ + FullPath: childPath, + Attr: Attr{ + Mtime: now, + Crtime: now, + Mode: os.ModeDir | 0755, + Uid: OS_UID, + Gid: OS_GID, + }, + } + } else { + mtime := time.Now() + if remoteEntry != nil && remoteEntry.RemoteMtime > 0 { + mtime = time.Unix(remoteEntry.RemoteMtime, 0) + } + entry = &Entry{ + FullPath: childPath, + Attr: Attr{ + Mtime: mtime, + Crtime: mtime, + Mode: 0644, + Uid: OS_UID, + Gid: OS_GID, + }, + Remote: remoteEntry, + } + if remoteEntry != nil { + entry.Attr.FileSize = uint64(remoteEntry.RemoteSize) + } + } + if saveErr := 
f.CreateEntry(persistCtx, entry, false, false, nil, true, f.MaxFilenameLength); saveErr != nil { + glog.Warningf("maybeLazyListFromRemote: persist %s: %v", childPath, saveErr) + } + } + return nil + }) + if listErr != nil { + glog.Warningf("maybeLazyListFromRemote: list %s: %v", p, listErr) + return nil, nil // swallow error + } + + // Update the synced_at timestamp on the directory entry + f.updateDirectoryListingSyncedAt(persistCtx, p, startTime) + + return nil, nil + }) +} + +func (f *Filer) updateDirectoryListingSyncedAt(ctx context.Context, p util.FullPath, syncTime time.Time) { + dirEntry, findErr := f.Store.FindEntry(ctx, p) + if findErr != nil && !errors.Is(findErr, filer_pb.ErrNotFound) { + glog.Warningf("maybeLazyListFromRemote: find dir %s: %v", p, findErr) + return + } + if errors.Is(findErr, filer_pb.ErrNotFound) { + // Directory doesn't exist yet, create it + now := time.Now() + dirEntry = &Entry{ + FullPath: p, + Attr: Attr{ + Mtime: now, + Crtime: now, + Mode: os.ModeDir | 0755, + Uid: OS_UID, + Gid: OS_GID, + }, + } + if dirEntry.Extended == nil { + dirEntry.Extended = make(map[string][]byte) + } + dirEntry.Extended[xattrRemoteListingSyncedAt] = []byte(fmt.Sprintf("%d", syncTime.Unix())) + if saveErr := f.CreateEntry(ctx, dirEntry, false, false, nil, true, f.MaxFilenameLength); saveErr != nil { + glog.Warningf("maybeLazyListFromRemote: create dir synced_at for %s: %v", p, saveErr) + } + return + } + if dirEntry.Extended == nil { + dirEntry.Extended = make(map[string][]byte) + } + dirEntry.Extended[xattrRemoteListingSyncedAt] = []byte(fmt.Sprintf("%d", syncTime.Unix())) + if saveErr := f.Store.UpdateEntry(ctx, dirEntry); saveErr != nil { + glog.Warningf("maybeLazyListFromRemote: update synced_at for %s: %v", p, saveErr) + } +} diff --git a/weed/filer/filer_lazy_remote_test.go b/weed/filer/filer_lazy_remote_test.go index c911ccfbb..42dae97e6 100644 --- a/weed/filer/filer_lazy_remote_test.go +++ b/weed/filer/filer_lazy_remote_test.go @@ -199,6 +199,9 
@@ type stubRemoteClient struct { deleteCalls []*remote_pb.RemoteStorageLocation removeCalls []*remote_pb.RemoteStorageLocation + + listDirFn func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error + listDirCalls int } func (c *stubRemoteClient) StatFile(*remote_pb.RemoteStorageLocation) (*filer_pb.RemoteEntry, error) { @@ -235,6 +238,13 @@ func (c *stubRemoteClient) DeleteFile(loc *remote_pb.RemoteStorageLocation) erro }) return c.deleteErr } +func (c *stubRemoteClient) ListDirectory(_ context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + c.listDirCalls++ + if c.listDirFn != nil { + return c.listDirFn(loc, visitFn) + } + return nil +} func (c *stubRemoteClient) ListBuckets() ([]*remote_storage.Bucket, error) { return nil, nil } func (c *stubRemoteClient) CreateBucket(string) error { return nil } func (c *stubRemoteClient) DeleteBucket(string) error { return nil } @@ -828,3 +838,264 @@ func TestDeleteEntryMetaAndData_RecursiveFolderDeleteRemotesChildren(t *testing. 
require.Len(t, stub.removeCalls, 1) assert.Equal(t, "/subdir", stub.removeCalls[0].Path) } + +// --- lazy listing tests --- + +func TestMaybeLazyListFromRemote_PopulatesStoreFromRemote(t *testing.T) { + const storageType = "stub_lazy_list_populate" + stub := &stubRemoteClient{ + listDirFn: func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + if err := visitFn("/", "subdir", true, nil); err != nil { + return err + } + if err := visitFn("/", "file.txt", false, &filer_pb.RemoteEntry{ + RemoteMtime: 1700000000, + RemoteSize: 42, + RemoteETag: "abc", + StorageName: "myliststore", + }); err != nil { + return err + } + return nil + }, + } + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "myliststore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "myliststore", + Bucket: "mybucket", + Path: "/", + ListingCacheTtlSeconds: 300, + }) + + store := newStubFilerStore() + f := newTestFiler(t, store, rs) + + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + assert.Equal(t, 1, stub.listDirCalls) + + // Check that the file was persisted + fileEntry := store.getEntry("/buckets/mybucket/file.txt") + require.NotNil(t, fileEntry, "file.txt should be persisted") + assert.Equal(t, uint64(42), fileEntry.FileSize) + assert.NotNil(t, fileEntry.Remote) + + // Check that the subdirectory was persisted + dirEntry := store.getEntry("/buckets/mybucket/subdir") + require.NotNil(t, dirEntry, "subdir should be persisted") + assert.True(t, dirEntry.IsDirectory()) +} + +func TestMaybeLazyListFromRemote_DisabledWhenTTLZero(t *testing.T) { + const storageType = "stub_lazy_list_disabled" + stub := &stubRemoteClient{ + listDirFn: func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + return visitFn("/", "file.txt", false, 
&filer_pb.RemoteEntry{ + RemoteMtime: 1700000000, RemoteSize: 10, + }) + }, + } + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "disabledstore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "disabledstore", + Bucket: "mybucket", + Path: "/", + // ListingCacheTtlSeconds defaults to 0 → disabled + }) + + store := newStubFilerStore() + f := newTestFiler(t, store, rs) + + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + assert.Equal(t, 0, stub.listDirCalls, "should not call remote when TTL is 0") +} + +func TestMaybeLazyListFromRemote_TTLCachePreventsSecondCall(t *testing.T) { + const storageType = "stub_lazy_list_ttl" + stub := &stubRemoteClient{ + listDirFn: func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + return visitFn("/", "file.txt", false, &filer_pb.RemoteEntry{ + RemoteMtime: 1700000000, RemoteSize: 10, + }) + }, + } + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "ttlstore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "ttlstore", + Bucket: "mybucket", + Path: "/", + ListingCacheTtlSeconds: 300, + }) + + store := newStubFilerStore() + f := newTestFiler(t, store, rs) + + // First call should hit remote + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + assert.Equal(t, 1, stub.listDirCalls) + + // Second call within TTL should be a no-op + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + assert.Equal(t, 1, stub.listDirCalls, "should not call remote again within TTL") +} + +func TestMaybeLazyListFromRemote_NotUnderMount(t *testing.T) { + rs := 
NewFilerRemoteStorage() + store := newStubFilerStore() + f := newTestFiler(t, store, rs) + + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/not/a/mount")) +} + +func TestMaybeLazyListFromRemote_SkipsLocalOnlyEntries(t *testing.T) { + const storageType = "stub_lazy_list_skiplocal" + stub := &stubRemoteClient{ + listDirFn: func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + // Remote has a file called "local.txt" too + return visitFn("/", "local.txt", false, &filer_pb.RemoteEntry{ + RemoteMtime: 1700000000, RemoteSize: 99, + }) + }, + } + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "skipstore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "skipstore", + Bucket: "mybucket", + Path: "/", + ListingCacheTtlSeconds: 300, + }) + + store := newStubFilerStore() + // Pre-populate a local-only entry (no Remote field) + store.entries["/buckets/mybucket/local.txt"] = &Entry{ + FullPath: "/buckets/mybucket/local.txt", + Attr: Attr{Mode: 0644, FileSize: 50}, + } + f := newTestFiler(t, store, rs) + + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + + // Local entry should NOT have been overwritten + localEntry := store.getEntry("/buckets/mybucket/local.txt") + require.NotNil(t, localEntry) + assert.Equal(t, uint64(50), localEntry.FileSize, "local-only entry should not be overwritten") + assert.Nil(t, localEntry.Remote, "local-only entry should keep nil Remote") +} + +func TestMaybeLazyListFromRemote_MergesExistingRemoteEntry(t *testing.T) { + const storageType = "stub_lazy_list_merge" + stub := &stubRemoteClient{ + listDirFn: func(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + return visitFn("/", "cached.txt", false, &filer_pb.RemoteEntry{ + RemoteMtime: 1700000099, // updated 
mtime + RemoteSize: 200, // updated size + RemoteETag: "new-etag", + StorageName: "mergestore", + }) + }, + } + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "mergestore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "mergestore", + Bucket: "mybucket", + Path: "/", + ListingCacheTtlSeconds: 300, + }) + + store := newStubFilerStore() + // Pre-populate an existing remote-backed entry with chunks and extended attrs + existingChunks := []*filer_pb.FileChunk{ + {FileId: "1,abc123", Size: 100, Offset: 0}, + } + store.entries["/buckets/mybucket/cached.txt"] = &Entry{ + FullPath: "/buckets/mybucket/cached.txt", + Attr: Attr{ + Mode: 0644, + FileSize: 100, + Uid: 1000, + Gid: 1000, + Mtime: time.Unix(1700000000, 0), + Crtime: time.Unix(1699000000, 0), + }, + Chunks: existingChunks, + Extended: map[string][]byte{ + "user.custom": []byte("myvalue"), + }, + Remote: &filer_pb.RemoteEntry{ + RemoteMtime: 1700000000, + RemoteSize: 100, + RemoteETag: "old-etag", + StorageName: "mergestore", + }, + } + f := newTestFiler(t, store, rs) + + f.maybeLazyListFromRemote(context.Background(), util.FullPath("/buckets/mybucket")) + assert.Equal(t, 1, stub.listDirCalls) + + merged := store.getEntry("/buckets/mybucket/cached.txt") + require.NotNil(t, merged) + + // Remote metadata should be updated + assert.Equal(t, int64(1700000099), merged.Remote.RemoteMtime) + assert.Equal(t, int64(200), merged.Remote.RemoteSize) + assert.Equal(t, "new-etag", merged.Remote.RemoteETag) + assert.Equal(t, uint64(200), merged.FileSize) + assert.Equal(t, time.Unix(1700000099, 0), merged.Mtime) + + // Local state should be preserved + assert.Equal(t, existingChunks, merged.Chunks, "chunks must be preserved") + assert.Equal(t, []byte("myvalue"), merged.Extended["user.custom"], "extended attrs must be preserved") + assert.Equal(t, 
uint32(1000), merged.Uid, "uid must be preserved") + assert.Equal(t, uint32(1000), merged.Gid, "gid must be preserved") + assert.Equal(t, os.FileMode(0644), merged.Mode, "mode must be preserved") + assert.Equal(t, time.Unix(1699000000, 0), merged.Crtime, "crtime must be preserved") +} + +func TestMaybeLazyListFromRemote_ContextGuardPreventsRecursion(t *testing.T) { + const storageType = "stub_lazy_list_guard" + stub := &stubRemoteClient{} + defer registerStubMaker(t, storageType, stub)() + + conf := &remote_pb.RemoteConf{Name: "guardliststore", Type: storageType} + rs := NewFilerRemoteStorage() + rs.storageNameToConf[conf.Name] = conf + rs.mapDirectoryToRemoteStorage("/buckets/mybucket", &remote_pb.RemoteStorageLocation{ + Name: "guardliststore", + Bucket: "mybucket", + Path: "/", + ListingCacheTtlSeconds: 300, + }) + + store := newStubFilerStore() + f := newTestFiler(t, store, rs) + + // With lazyListContextKey set, should be a no-op + guardCtx := context.WithValue(context.Background(), lazyListContextKey{}, true) + f.maybeLazyListFromRemote(guardCtx, util.FullPath("/buckets/mybucket")) + assert.Equal(t, 0, stub.listDirCalls) + + // With lazyFetchContextKey set, should also be a no-op + fetchCtx := context.WithValue(context.Background(), lazyFetchContextKey{}, true) + f.maybeLazyListFromRemote(fetchCtx, util.FullPath("/buckets/mybucket")) + assert.Equal(t, 0, stub.listDirCalls) +} diff --git a/weed/pb/remote.proto b/weed/pb/remote.proto index 9d6d81ff5..f84cd5f8e 100644 --- a/weed/pb/remote.proto +++ b/weed/pb/remote.proto @@ -73,4 +73,5 @@ message RemoteStorageLocation { string name = 1; string bucket = 2; string path = 3; + int32 listing_cache_ttl_seconds = 4; // 0 = disabled; >0 enables on-demand directory listing with this TTL in seconds } diff --git a/weed/pb/remote_pb/remote.pb.go b/weed/pb/remote_pb/remote.pb.go index 877188ffa..f5e18060d 100644 --- a/weed/pb/remote_pb/remote.pb.go +++ b/weed/pb/remote_pb/remote.pb.go @@ -457,12 +457,13 @@ func (x 
*RemoteStorageMapping) GetPrimaryBucketStorageName() string { } type RemoteStorageLocation struct { - state protoimpl.MessageState `protogen:"open.v1"` - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - Bucket string `protobuf:"bytes,2,opt,name=bucket,proto3" json:"bucket,omitempty"` - Path string `protobuf:"bytes,3,opt,name=path,proto3" json:"path,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Bucket string `protobuf:"bytes,2,opt,name=bucket,proto3" json:"bucket,omitempty"` + Path string `protobuf:"bytes,3,opt,name=path,proto3" json:"path,omitempty"` + ListingCacheTtlSeconds int32 `protobuf:"varint,4,opt,name=listing_cache_ttl_seconds,json=listingCacheTtlSeconds,proto3" json:"listing_cache_ttl_seconds,omitempty"` // 0 = disabled; >0 enables on-demand directory listing with this TTL in seconds + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *RemoteStorageLocation) Reset() { @@ -516,6 +517,13 @@ func (x *RemoteStorageLocation) GetPath() string { return "" } +func (x *RemoteStorageLocation) GetListingCacheTtlSeconds() int32 { + if x != nil { + return x.ListingCacheTtlSeconds + } + return 0 +} + var File_remote_proto protoreflect.FileDescriptor const file_remote_proto_rawDesc = "" + @@ -573,11 +581,12 @@ const file_remote_proto_rawDesc = "" + "\x1bprimary_bucket_storage_name\x18\x02 \x01(\tR\x18primaryBucketStorageName\x1a]\n" + "\rMappingsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x126\n" + - "\x05value\x18\x02 \x01(\v2 .remote_pb.RemoteStorageLocationR\x05value:\x028\x01\"W\n" + + "\x05value\x18\x02 \x01(\v2 .remote_pb.RemoteStorageLocationR\x05value:\x028\x01\"\x92\x01\n" + "\x15RemoteStorageLocation\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x16\n" + "\x06bucket\x18\x02 \x01(\tR\x06bucket\x12\x12\n" + - 
"\x04path\x18\x03 \x01(\tR\x04pathBP\n" + + "\x04path\x18\x03 \x01(\tR\x04path\x129\n" + + "\x19listing_cache_ttl_seconds\x18\x04 \x01(\x05R\x16listingCacheTtlSecondsBP\n" + "\x10seaweedfs.clientB\n" + "FilerProtoZ0github.com/seaweedfs/seaweedfs/weed/pb/remote_pbb\x06proto3" diff --git a/weed/remote_storage/azure/azure_storage_client.go b/weed/remote_storage/azure/azure_storage_client.go index 5785a4a0f..b56fff7a8 100644 --- a/weed/remote_storage/azure/azure_storage_client.go +++ b/weed/remote_storage/azure/azure_storage_client.go @@ -127,6 +127,68 @@ type azureRemoteStorageClient struct { var _ = remote_storage.RemoteStorageClient(&azureRemoteStorageClient{}) +func (az *azureRemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) { + pathKey := loc.Path[1:] + if pathKey != "" && !strings.HasSuffix(pathKey, "/") { + pathKey += "/" + } + + containerClient := az.client.ServiceClient().NewContainerClient(loc.Bucket) + pager := containerClient.NewListBlobsHierarchyPager("/", &container.ListBlobsHierarchyOptions{ + Prefix: &pathKey, + }) + + for pager.More() { + resp, pageErr := pager.NextPage(ctx) + if pageErr != nil { + return fmt.Errorf("azure list directory %s%s: %w", loc.Bucket, loc.Path, pageErr) + } + + for _, prefix := range resp.Segment.BlobPrefixes { + if prefix.Name == nil { + continue + } + dirKey := "/" + strings.TrimSuffix(*prefix.Name, "/") + dir, name := util.FullPath(dirKey).DirAndName() + if err = visitFn(dir, name, true, nil); err != nil { + return fmt.Errorf("azure processing directory prefix %s: %w", *prefix.Name, err) + } + } + + for _, blobItem := range resp.Segment.BlobItems { + if blobItem.Name == nil { + continue + } + key := "/" + *blobItem.Name + if strings.HasSuffix(key, "/") { + continue // skip directory markers + } + dir, name := util.FullPath(key).DirAndName() + + remoteEntry := &filer_pb.RemoteEntry{ + StorageName: az.conf.Name, + } + if 
blobItem.Properties != nil { + if blobItem.Properties.LastModified != nil { + remoteEntry.RemoteMtime = blobItem.Properties.LastModified.Unix() + } + if blobItem.Properties.ContentLength != nil { + remoteEntry.RemoteSize = *blobItem.Properties.ContentLength + } + if blobItem.Properties.ETag != nil { + remoteEntry.RemoteETag = string(*blobItem.Properties.ETag) + } + } + + if err = visitFn(dir, name, false, remoteEntry); err != nil { + return fmt.Errorf("azure processing blob %s: %w", *blobItem.Name, err) + } + } + } + + return nil +} + func (az *azureRemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) { key := loc.Path[1:] ctx, cancel := context.WithTimeout(context.Background(), DefaultAzureOpTimeout) diff --git a/weed/remote_storage/gcs/gcs_storage_client.go b/weed/remote_storage/gcs/gcs_storage_client.go index 7a9bc1e31..01053033e 100644 --- a/weed/remote_storage/gcs/gcs_storage_client.go +++ b/weed/remote_storage/gcs/gcs_storage_client.go @@ -131,6 +131,52 @@ func (gcs *gcsRemoteStorageClient) Traverse(loc *remote_pb.RemoteStorageLocation const defaultGCSOpTimeout = 30 * time.Second +func (gcs *gcsRemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) { + pathKey := loc.Path[1:] + if pathKey != "" && !strings.HasSuffix(pathKey, "/") { + pathKey += "/" + } + + objectIterator := gcs.client.Bucket(loc.Bucket).Objects(ctx, &storage.Query{ + Delimiter: "/", + Prefix: pathKey, + Versions: false, + }) + + for { + objectAttr, iterErr := objectIterator.Next() + if iterErr != nil { + if iterErr == iterator.Done { + return nil + } + return fmt.Errorf("list directory %s%s: %w", loc.Bucket, loc.Path, iterErr) + } + + if objectAttr.Prefix != "" { + // Common prefix → subdirectory + dirKey := "/" + strings.TrimSuffix(objectAttr.Prefix, "/") + dir, name := util.FullPath(dirKey).DirAndName() + if err = visitFn(dir, name, true, nil); 
err != nil { + return err + } + } else { + key := "/" + objectAttr.Name + if strings.HasSuffix(key, "/") { + continue // skip directory markers + } + dir, name := util.FullPath(key).DirAndName() + if err = visitFn(dir, name, false, &filer_pb.RemoteEntry{ + RemoteMtime: objectAttr.Updated.Unix(), + RemoteSize: objectAttr.Size, + RemoteETag: objectAttr.Etag, + StorageName: gcs.conf.Name, + }); err != nil { + return err + } + } + } +} + func (gcs *gcsRemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) { key := loc.Path[1:] ctx, cancel := context.WithTimeout(context.Background(), defaultGCSOpTimeout) diff --git a/weed/remote_storage/remote_storage.go b/weed/remote_storage/remote_storage.go index e8f54d944..3c6bc2e6f 100644 --- a/weed/remote_storage/remote_storage.go +++ b/weed/remote_storage/remote_storage.go @@ -1,6 +1,7 @@ package remote_storage import ( + "context" "errors" "fmt" "io" @@ -75,6 +76,7 @@ var ErrRemoteObjectNotFound = errors.New("remote object not found") type RemoteStorageClient interface { Traverse(loc *remote_pb.RemoteStorageLocation, visitFn VisitFunc) error + ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn VisitFunc) error StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) ReadFile(loc *remote_pb.RemoteStorageLocation, offset int64, size int64) (data []byte, err error) WriteDirectory(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry) (err error) diff --git a/weed/remote_storage/s3/s3_storage_client.go b/weed/remote_storage/s3/s3_storage_client.go index 6700fbdd3..7f82808f8 100644 --- a/weed/remote_storage/s3/s3_storage_client.go +++ b/weed/remote_storage/s3/s3_storage_client.go @@ -1,10 +1,12 @@ package s3 import ( + "context" "fmt" "io" "net/http" "reflect" + "strings" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" @@ -98,12 +100,19 @@ func (s *s3RemoteStorageClient) 
Traverse(remote *remote_pb.RemoteStorageLocation key := *content.Key key = "/" + key dir, name := util.FullPath(key).DirAndName() - if err := visitFn(dir, name, false, &filer_pb.RemoteEntry{ - RemoteMtime: (*content.LastModified).Unix(), - RemoteSize: *content.Size, - RemoteETag: *content.ETag, + remoteEntry := &filer_pb.RemoteEntry{ StorageName: s.conf.Name, - }); err != nil { + } + if content.LastModified != nil { + remoteEntry.RemoteMtime = content.LastModified.Unix() + } + if content.Size != nil { + remoteEntry.RemoteSize = *content.Size + } + if content.ETag != nil { + remoteEntry.RemoteETag = *content.ETag + } + if err := visitFn(dir, name, false, remoteEntry); err != nil { localErr = err return false } @@ -122,6 +131,65 @@ func (s *s3RemoteStorageClient) Traverse(remote *remote_pb.RemoteStorageLocation return } +func (s *s3RemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error { + pathKey := loc.Path[1:] + if pathKey != "" && !strings.HasSuffix(pathKey, "/") { + pathKey += "/" + } + + listInput := &s3.ListObjectsV2Input{ + Bucket: aws.String(loc.Bucket), + Prefix: aws.String(pathKey), + Delimiter: aws.String("/"), + } + + var localErr error + listErr := s.conn.ListObjectsV2PagesWithContext(ctx, listInput, func(page *s3.ListObjectsV2Output, lastPage bool) bool { + for _, prefix := range page.CommonPrefixes { + if prefix.Prefix == nil { + continue + } + dirKey := "/" + strings.TrimSuffix(*prefix.Prefix, "/") + dir, name := util.FullPath(dirKey).DirAndName() + if err := visitFn(dir, name, true, nil); err != nil { + localErr = err + return false + } + } + for _, content := range page.Contents { + key := "/" + *content.Key + if strings.HasSuffix(key, "/") { + continue // skip directory markers + } + dir, name := util.FullPath(key).DirAndName() + remoteEntry := &filer_pb.RemoteEntry{ + StorageName: s.conf.Name, + } + if content.LastModified != nil { + remoteEntry.RemoteMtime = 
content.LastModified.Unix() + } + if content.Size != nil { + remoteEntry.RemoteSize = *content.Size + } + if content.ETag != nil { + remoteEntry.RemoteETag = *content.ETag + } + if err := visitFn(dir, name, false, remoteEntry); err != nil { + localErr = err + return false + } + } + return true + }) + if listErr != nil { + return fmt.Errorf("list directory %v: %w", loc, listErr) + } + if localErr != nil { + return fmt.Errorf("process directory %v: %w", loc, localErr) + } + return nil +} + func (s *s3RemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) { resp, err := s.conn.HeadObject(&s3.HeadObjectInput{ Bucket: aws.String(loc.Bucket), diff --git a/weed/shell/command_remote_mount.go b/weed/shell/command_remote_mount.go index 5f995e9f3..8728ca4c0 100644 --- a/weed/shell/command_remote_mount.go +++ b/weed/shell/command_remote_mount.go @@ -48,6 +48,8 @@ func (c *commandRemoteMount) Help() string { remote.mount -dir=/xxx -remote=cloud1/bucket -metadataStrategy=lazy # mount and pull one directory in the bucket remote.mount -dir=/xxx -remote=cloud1/bucket/dir1 + # mount with on-demand directory listing cached for 5 minutes + remote.mount -dir=/xxx -remote=cloud1/bucket -listingCacheTTL=300 # after mount, start a separate process to write updates to remote storage weed filer.remote.sync -filer=: -dir=/xxx @@ -67,6 +69,7 @@ func (c *commandRemoteMount) Do(args []string, commandEnv *CommandEnv, writer io nonEmpty := remoteMountCommand.Bool("nonempty", false, "allows the mounting over a non-empty directory") metadataStrategy := remoteMountCommand.String("metadataStrategy", string(MetadataCacheEager), "lazy: skip upfront metadata pull; eager: full metadata pull (default)") remote := remoteMountCommand.String("remote", "", "a directory in remote storage, ex. 
//path/to/dir") + listingCacheTTL := remoteMountCommand.Int("listingCacheTTL", 0, "seconds to cache remote directory listings (0 = disabled)") if err = remoteMountCommand.Parse(args); err != nil { return nil @@ -87,6 +90,7 @@ func (c *commandRemoteMount) Do(args []string, commandEnv *CommandEnv, writer io if err != nil { return err } + remoteStorageLocation.ListingCacheTtlSeconds = int32(*listingCacheTTL) strategy := MetadataCacheStrategy(strings.ToLower(*metadataStrategy)) if strategy != MetadataCacheLazy && strategy != MetadataCacheEager {