
fix: OnPeerUpdate should only process updates for matching FilerGroup

Critical bug: The OnPeerUpdate callback was incorrectly moved outside the
FilerGroup check when restoring observability instrumentation. This caused
clients to process peer updates for ALL filer groups, not just their own.

Problem:
  Before: mc.OnPeerUpdate was only called when update.FilerGroup == mc.FilerGroup
  Bug:    mc.OnPeerUpdate was called for ALL updates, regardless of FilerGroup

Impact:
- Multi-tenant deployments with separate filer groups would see cross-group
  updates (e.g., group A clients processing group B updates)
- Could cause incorrect cluster membership tracking
- OnPeerUpdate handlers (like Filer's DLM ring updates) would receive
  irrelevant updates from other groups

Example scenario:
  Cluster has two filer groups: "production" and "staging"
  Production filer connects with FilerGroup="production"

  Incorrect behavior (bug):
    - Receives "staging" group updates
    - Incorrectly adds staging filers to production DLM ring
    - Cross-tenant data access issues

  Correct behavior (fixed):
    - Only receives "production" group updates
    - Only adds production filers to production DLM ring
    - Proper isolation between groups

Fix:
  Moved mc.OnPeerUpdate(update, time.Now()) back INSIDE the FilerGroup check
  where it belongs, matching the original implementation.

The logging and stats counter were already correctly scoped to the matching
FilerGroup, so they remain inside the if block as intended.
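
For clarity, below is a minimal, self-contained Go sketch of the corrected
control flow. The ClusterNodeUpdate and masterClient types and the
handleUpdates helper are simplified stand-ins for the real MasterClient code,
not the actual SeaweedFS implementation; only the placement of the
OnPeerUpdate call relative to the FilerGroup check mirrors the fix.

  package main

  import (
  	"fmt"
  	"time"
  )

  // Simplified stand-ins for the real SeaweedFS types, used only to
  // illustrate where the OnPeerUpdate call belongs.
  type ClusterNodeUpdate struct {
  	FilerGroup string
  	NodeType   string
  	Address    string
  }

  type masterClient struct {
  	FilerGroup   string
  	OnPeerUpdate func(update *ClusterNodeUpdate, at time.Time)
  }

  // handleUpdates mirrors the fixed logic: the callback fires only when the
  // update's FilerGroup matches the client's own group.
  func (mc *masterClient) handleUpdates(updates []*ClusterNodeUpdate) {
  	for _, update := range updates {
  		if update.FilerGroup == mc.FilerGroup {
  			// Logging and the stats counter were already scoped here;
  			// after the fix, OnPeerUpdate is scoped here as well.
  			mc.OnPeerUpdate(update, time.Now())
  		}
  		// Bug: before the fix, mc.OnPeerUpdate(update, time.Now()) sat here,
  		// outside the group check, so clients saw every group's updates.
  	}
  }

  func main() {
  	mc := &masterClient{
  		FilerGroup: "production",
  		OnPeerUpdate: func(u *ClusterNodeUpdate, _ time.Time) {
  			fmt.Printf("processing %s.%s %s\n", u.FilerGroup, u.NodeType, u.Address)
  		},
  	}
  	mc.handleUpdates([]*ClusterNodeUpdate{
  		{FilerGroup: "production", NodeType: "filer", Address: "10.0.0.1:8888"},
  		{FilerGroup: "staging", NodeType: "filer", Address: "10.0.1.1:8888"}, // filtered out
  	})
  	// Only the "production" update is processed.
  }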
Branch: pull/7518/head
Author: chrislu, 2 weeks ago
Commit: 99ae38339d

Changed files:
  weed/wdclient/filer_client.go   10 lines changed
  weed/wdclient/masterclient.go    2 lines changed

weed/wdclient/filer_client.go  (10 lines changed)

@@ -37,7 +37,7 @@ type filerHealth struct {
 type FilerClient struct {
     *vidMapClient
     filerAddresses []pb.ServerAddress
-    filerIndex int32 // atomic: current filer index for round-robin
+    filerIndex int32 // atomic: current filer index for round-robin
     filerHealth []*filerHealth // health status per filer (same order as filerAddresses)
     grpcDialOption grpc.DialOption
     urlPreference UrlPreference
@@ -186,17 +186,17 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
     health := fc.filerHealth[index]
     failureCount := atomic.LoadInt32(&health.failureCount)
     // Allow up to 2 failures before skipping
     if failureCount < 3 {
         return false
     }
     // Re-check unhealthy filers every 30 seconds
     if time.Since(health.lastFailureTime) > 30*time.Second {
         return false // Time to re-check
     }
     return true // Skip this unhealthy filer
 }
@@ -244,7 +244,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
     for x := int32(0); x < n; x++ {
         // Circuit breaker: skip unhealthy filers
         if fc.shouldSkipUnhealthyFiler(i) {
-            glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
+            glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
                 fc.filerAddresses[i], atomic.LoadInt32(&fc.filerHealth[i].failureCount))
             i++
             if i >= n {
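
As context for the filer_client.go hunks above, here is a minimal,
self-contained sketch of the circuit-breaker check they touch. The
filerHealth field names are taken from the diff; the surrounding wiring
(how failures are recorded) is an assumption for illustration only.

  package main

  import (
  	"fmt"
  	"sync/atomic"
  	"time"
  )

  // Assumed shape of the per-filer health record, based on the fields
  // visible in the diff above.
  type filerHealth struct {
  	failureCount    int32     // consecutive failures, read/written atomically
  	lastFailureTime time.Time // when the most recent failure happened
  }

  // shouldSkip mirrors shouldSkipUnhealthyFiler: tolerate up to 2 consecutive
  // failures, then skip the filer until 30 seconds have passed, after which
  // it becomes eligible for a re-check.
  func shouldSkip(h *filerHealth) bool {
  	if atomic.LoadInt32(&h.failureCount) < 3 {
  		return false
  	}
  	if time.Since(h.lastFailureTime) > 30*time.Second {
  		return false // cool-down elapsed, time to re-check
  	}
  	return true // still considered unhealthy, skip it
  }

  func main() {
  	h := &filerHealth{failureCount: 3, lastFailureTime: time.Now()}
  	fmt.Println(shouldSkip(h)) // true: 3 recent consecutive failures

  	h.lastFailureTime = time.Now().Add(-time.Minute)
  	fmt.Println(shouldSkip(h)) // false: 30s cool-down has passed
  }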

weed/wdclient/masterclient.go  (2 lines changed)

@@ -238,8 +238,8 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
                 glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
             }
             stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc()
+            mc.OnPeerUpdate(update, time.Now())
         }
-        mc.OnPeerUpdate(update, time.Now())
     }
     mc.OnPeerUpdateLock.RUnlock()
 }
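
To make the Impact note about OnPeerUpdate handlers concrete, here is a
hypothetical handler in the spirit of the Filer's DLM ring updates. The ring
type, the IsAdd flag, and the wiring are illustrative assumptions, not the
actual SeaweedFS Filer code; the point is that such a handler stays correct
only if it receives updates for its own filer group, which the fix
guarantees.

  package main

  import (
  	"fmt"
  	"time"
  )

  // Illustrative stand-ins only; not the real SeaweedFS types.
  type ClusterNodeUpdate struct {
  	FilerGroup string
  	Address    string
  	IsAdd      bool // assumed flag: true when a node joins, false when it leaves
  }

  // A toy membership set standing in for the DLM ring.
  type ring struct{ members map[string]bool }

  func newRing() *ring { return &ring{members: map[string]bool{}} }

  // onPeerUpdate is a hypothetical handler. With the fix, it only ever sees
  // updates for its own group, so it can mutate the ring unconditionally;
  // with the bug, a "staging" update could reach a "production" ring here.
  func (r *ring) onPeerUpdate(update *ClusterNodeUpdate, _ time.Time) {
  	if update.IsAdd {
  		r.members[update.Address] = true
  	} else {
  		delete(r.members, update.Address)
  	}
  }

  func main() {
  	r := newRing()
  	r.onPeerUpdate(&ClusterNodeUpdate{FilerGroup: "production", Address: "prod-filer-1:8888", IsAdd: true}, time.Now())
  	fmt.Println(r.members) // the single production member
  }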
