From 99ae38339d456f90facb2862f4ba394b139f9e90 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Thu, 20 Nov 2025 16:21:50 -0800
Subject: [PATCH] fix: OnPeerUpdate should only process updates for matching FilerGroup

Critical bug: the OnPeerUpdate callback was incorrectly moved outside the
FilerGroup check when restoring observability instrumentation. This caused
clients to process peer updates for ALL filer groups, not just their own.

Problem:
- Before: mc.OnPeerUpdate was only called when update.FilerGroup == mc.FilerGroup
- Bug: mc.OnPeerUpdate was called for ALL updates, regardless of FilerGroup

Impact:
- Multi-tenant deployments with separate filer groups would see cross-group
  updates (e.g., group A clients processing group B updates)
- Could cause incorrect cluster membership tracking
- OnPeerUpdate handlers (like the Filer's DLM ring updates) would receive
  irrelevant updates from other groups

Example scenario: a cluster has two filer groups, "production" and
"staging", and a production filer connects with FilerGroup="production".

Incorrect behavior (bug):
- Receives "staging" group updates
- Incorrectly adds staging filers to the production DLM ring
- Cross-tenant data access issues

Correct behavior (fixed):
- Only receives "production" group updates
- Only adds production filers to the production DLM ring
- Proper isolation between groups

Fix: moved mc.OnPeerUpdate(update, time.Now()) back INSIDE the FilerGroup
check where it belongs, matching the original implementation. The logging
and stats counter were already correctly scoped to the matching FilerGroup,
so they remain inside the if block as intended.
---

Note: two standalone Go sketches (the corrected scoping, and the
circuit-breaker rule touched by the whitespace hunks) are appended after
the diff.

 weed/wdclient/filer_client.go | 10 +++++-----
 weed/wdclient/masterclient.go |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go
index 8065c4dfd..bca28eb2b 100644
--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -37,7 +37,7 @@ type filerHealth struct {
 type FilerClient struct {
 	*vidMapClient
 	filerAddresses []pb.ServerAddress
-	filerIndex     int32 // atomic: current filer index for round-robin 
+	filerIndex     int32 // atomic: current filer index for round-robin
 	filerHealth    []*filerHealth // health status per filer (same order as filerAddresses)
 	grpcDialOption grpc.DialOption
 	urlPreference  UrlPreference
@@ -186,17 +186,17 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	health := fc.filerHealth[index]
 	failureCount := atomic.LoadInt32(&health.failureCount)
-	
+
 	// Allow up to 2 failures before skipping
 	if failureCount < 3 {
 		return false
 	}
-	
+
 	// Re-check unhealthy filers every 30 seconds
 	if time.Since(health.lastFailureTime) > 30*time.Second {
 		return false // Time to re-check
 	}
-	
+
 	return true // Skip this unhealthy filer
 }
 
@@ -244,7 +244,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 	for x := int32(0); x < n; x++ {
 		// Circuit breaker: skip unhealthy filers
 		if fc.shouldSkipUnhealthyFiler(i) {
-			glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)", 
+			glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
 				fc.filerAddresses[i], atomic.LoadInt32(&fc.filerHealth[i].failureCount))
 			i++
 			if i >= n {
diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go
index 5a1b4f57c..5f47a4a03 100644
--- a/weed/wdclient/masterclient.go
+++ b/weed/wdclient/masterclient.go
@@ -238,8 +238,8 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 					glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
 				}
 				stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc()
+				mc.OnPeerUpdate(update, time.Now())
 			}
-			mc.OnPeerUpdate(update, time.Now())
 		}
 		mc.OnPeerUpdateLock.RUnlock()
 	}
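
Reviewer sketch 1 (not part of the patch): the corrected scoping, modeled
as a standalone Go program. ClusterNodeUpdate, MasterClient, and
processUpdate below are simplified stand-ins reconstructed from the
masterclient.go hunk, not the real wdclient/master_pb types; the stats
counter is reduced to a comment.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for master_pb.ClusterNodeUpdate.
type ClusterNodeUpdate struct {
	FilerGroup string
	NodeType   string
	Address    string
}

// Simplified stand-in for wdclient.MasterClient.
type MasterClient struct {
	FilerGroup   string
	OnPeerUpdate func(update *ClusterNodeUpdate, startFrom time.Time)
}

// processUpdate mirrors the fixed control flow: logging, the stats counter,
// and the OnPeerUpdate callback are all gated on the client's own FilerGroup.
// Before the fix, the callback sat after the if block and fired for every group.
func (mc *MasterClient) processUpdate(update *ClusterNodeUpdate) {
	if update.FilerGroup == mc.FilerGroup {
		fmt.Printf("noticed %s.%s %s\n", update.FilerGroup, update.NodeType, update.Address)
		// the stats counter increment would happen here, inside the same check
		if mc.OnPeerUpdate != nil {
			mc.OnPeerUpdate(update, time.Now())
		}
	}
	// An update for another group now falls through with no side effects.
}

func main() {
	prod := &MasterClient{FilerGroup: "production"}
	prod.OnPeerUpdate = func(u *ClusterNodeUpdate, _ time.Time) {
		fmt.Printf("production DLM ring update: %s\n", u.Address)
	}

	prod.processUpdate(&ClusterNodeUpdate{FilerGroup: "production", NodeType: "filer", Address: "filer-a:8888"})
	// With the fix this second call is a no-op; with the bug it would have
	// pushed a staging filer into the production DLM ring.
	prod.processUpdate(&ClusterNodeUpdate{FilerGroup: "staging", NodeType: "filer", Address: "filer-b:8888"})
}

Running it prints one noticed/DLM-ring pair for the production update and
nothing for the staging one; before the fix, the second call would also
have reached the callback.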
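
Reviewer sketch 2 (not part of the patch): the circuit-breaker rule that
the whitespace-only hunks in filer_client.go pass through. The filerHealth
field names and the thresholds (3 consecutive failures, 30-second re-check
window) come from the diff; the surrounding FilerClient plumbing is
omitted, and shouldSkip is a hypothetical free-function stand-in for
shouldSkipUnhealthyFiler.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// filerHealth mirrors the two fields used by shouldSkipUnhealthyFiler.
type filerHealth struct {
	failureCount    int32
	lastFailureTime time.Time
}

// shouldSkip reproduces the rule from the diff: allow up to 2 consecutive
// failures, skip after the 3rd, and re-probe once 30 seconds have passed
// since the last recorded failure.
func shouldSkip(h *filerHealth) bool {
	if atomic.LoadInt32(&h.failureCount) < 3 {
		return false
	}
	if time.Since(h.lastFailureTime) > 30*time.Second {
		return false // window expired: time to re-check the filer
	}
	return true
}

func main() {
	h := &filerHealth{}
	for i := 0; i < 4; i++ {
		fmt.Printf("failures=%d skip=%v\n", atomic.LoadInt32(&h.failureCount), shouldSkip(h))
		atomic.AddInt32(&h.failureCount, 1)
		h.lastFailureTime = time.Now()
	}
	// skip flips to true once failureCount reaches 3 and the last failure
	// is less than 30 seconds old.
}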