From aad8cc9bb735c901a0aa0e818042ecc84ec1959b Mon Sep 17 00:00:00 2001 From: Sheya Bernstein Date: Sun, 11 Jan 2026 00:15:16 +0000 Subject: [PATCH] fix: add filer fallback after consecutive connection failures --- weed/cluster/lock_client.go | 39 +++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/weed/cluster/lock_client.go b/weed/cluster/lock_client.go index b08e8f458..241c583ea 100644 --- a/weed/cluster/lock_client.go +++ b/weed/cluster/lock_client.go @@ -32,17 +32,18 @@ func NewLockClient(grpcDialOption grpc.DialOption, seedFiler pb.ServerAddress) * } type LiveLock struct { - key string - renewToken string - expireAtNs int64 - hostFiler pb.ServerAddress - cancelCh chan struct{} - grpcDialOption grpc.DialOption - isLocked int32 // 0 = unlocked, 1 = locked; use atomic operations - self string - lc *LockClient - owner string - lockTTL time.Duration + key string + renewToken string + expireAtNs int64 + hostFiler pb.ServerAddress + cancelCh chan struct{} + grpcDialOption grpc.DialOption + isLocked int32 // 0 = unlocked, 1 = locked; use atomic operations + self string + lc *LockClient + owner string + lockTTL time.Duration + consecutiveFailures int // Track connection failures to trigger fallback } // NewShortLivedLock creates a lock with a 5-second duration @@ -213,6 +214,7 @@ func (lock *LiveLock) doLock(lockDuration time.Duration) (errorMessage string, e glog.V(4).Infof("LOCK: DistributedLock response - key=%s err=%v", lock.key, err) if err == nil && resp != nil { lock.renewToken = resp.RenewToken + lock.consecutiveFailures = 0 // Reset failure counter on success glog.V(4).Infof("LOCK: Got renewToken for key=%s", lock.key) } else { //this can be retried. Need to remember the last valid renewToken @@ -225,7 +227,7 @@ func (lock *LiveLock) doLock(lockDuration time.Duration) (errorMessage string, e // Only log if the host actually changed glog.V(2).Infof("LOCK: Host changed from %s to %s for key=%s", previousHostFiler, resp.LockHostMovedTo, lock.key) lock.hostFiler = pb.ServerAddress(resp.LockHostMovedTo) - lock.lc.seedFiler = lock.hostFiler + // Don't update seedFiler - keep original for fallback } else if resp.LockHostMovedTo != "" { lock.hostFiler = pb.ServerAddress(resp.LockHostMovedTo) } @@ -242,6 +244,19 @@ func (lock *LiveLock) doLock(lockDuration time.Duration) (errorMessage string, e } return err }) + + if err != nil && lock.hostFiler != lock.lc.seedFiler { + lock.consecutiveFailures++ + // Fall back to seed filer after 3 consecutive connection failures + if lock.consecutiveFailures >= 3 { + glog.V(0).Infof("LOCK: Connection failed %d times for key=%s filer=%s, falling back to seed filer=%s", + lock.consecutiveFailures, lock.key, lock.hostFiler, lock.lc.seedFiler) + lock.hostFiler = lock.lc.seedFiler + lock.consecutiveFailures = 0 + lock.renewToken = "" + } + } + return }