From 99ae38339d456f90facb2862f4ba394b139f9e90 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Thu, 20 Nov 2025 16:21:50 -0800
Subject: [PATCH] fix: OnPeerUpdate should only process updates for matching FilerGroup

Critical bug: the OnPeerUpdate callback was incorrectly moved outside the
FilerGroup check when restoring observability instrumentation. This caused
clients to process peer updates for ALL filer groups, not just their own.

Problem:
- Before: mc.OnPeerUpdate was only called when update.FilerGroup == mc.FilerGroup
- Bug: mc.OnPeerUpdate was called for ALL updates, regardless of FilerGroup

Impact:
- Multi-tenant deployments with separate filer groups would see cross-group
  updates (e.g., group A clients processing group B updates)
- Could cause incorrect cluster membership tracking
- OnPeerUpdate handlers (like the Filer's DLM ring updates) would receive
  irrelevant updates from other groups

Example scenario: a cluster has two filer groups, "production" and
"staging", and a production filer connects with FilerGroup="production".

Incorrect behavior (bug):
- Receives "staging" group updates
- Incorrectly adds staging filers to the production DLM ring
- Cross-tenant data access issues

Correct behavior (fixed):
- Only receives "production" group updates
- Only adds production filers to the production DLM ring
- Proper isolation between groups

Fix: moved mc.OnPeerUpdate(update, time.Now()) back INSIDE the FilerGroup
check where it belongs, matching the original implementation. The logging
and stats counter were already correctly scoped to the matching FilerGroup,
so they remain inside the if block as intended.
---

Note: two standalone Go sketches (the corrected scoping, and the
circuit-breaker rule touched by the whitespace hunks) are appended after
the diff.

 weed/wdclient/filer_client.go | 10 +++++-----
 weed/wdclient/masterclient.go |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go
index 8065c4dfd..bca28eb2b 100644
--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -37,7 +37,7 @@ type filerHealth struct {
 type FilerClient struct {
 	*vidMapClient
 	filerAddresses []pb.ServerAddress
-	filerIndex     int32 // atomic: current filer index for round-robin 
+	filerIndex     int32 // atomic: current filer index for round-robin
 	filerHealth    []*filerHealth // health status per filer (same order as filerAddresses)
 	grpcDialOption grpc.DialOption
 	urlPreference  UrlPreference
@@ -186,17 +186,17 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	health := fc.filerHealth[index]
 	failureCount := atomic.LoadInt32(&health.failureCount)
-	
+
 	// Allow up to 2 failures before skipping
 	if failureCount < 3 {
 		return false
 	}
-	
+
 	// Re-check unhealthy filers every 30 seconds
 	if time.Since(health.lastFailureTime) > 30*time.Second {
 		return false // Time to re-check
 	}
-	
+
 	return true // Skip this unhealthy filer
 }
 
@@ -244,7 +244,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 	for x := int32(0); x < n; x++ {
 		// Circuit breaker: skip unhealthy filers
 		if fc.shouldSkipUnhealthyFiler(i) {
-			glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)", 
+			glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
 				fc.filerAddresses[i], atomic.LoadInt32(&fc.filerHealth[i].failureCount))
 			i++
 			if i >= n {
diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go
index 5a1b4f57c..5f47a4a03 100644
--- a/weed/wdclient/masterclient.go
+++ b/weed/wdclient/masterclient.go
@@ -238,8 +238,8 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 					glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
 				}
 				stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc()
+				mc.OnPeerUpdate(update, time.Now())
 			}
-			mc.OnPeerUpdate(update, time.Now())
 		}
 		mc.OnPeerUpdateLock.RUnlock()
 	}
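
Reviewer sketch 1 (not part of the patch): the corrected scoping, modeled
as a standalone Go program. ClusterNodeUpdate, MasterClient, and
processUpdate below are simplified stand-ins reconstructed from the
masterclient.go hunk, not the real wdclient/master_pb types; the stats
counter is reduced to a comment.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for master_pb.ClusterNodeUpdate.
type ClusterNodeUpdate struct {
	FilerGroup string
	NodeType   string
	Address    string
}

// Simplified stand-in for wdclient.MasterClient.
type MasterClient struct {
	FilerGroup   string
	OnPeerUpdate func(update *ClusterNodeUpdate, startFrom time.Time)
}

// processUpdate mirrors the fixed control flow: logging, the stats counter,
// and the OnPeerUpdate callback are all gated on the client's own FilerGroup.
// Before the fix, the callback sat after the if block and fired for every group.
func (mc *MasterClient) processUpdate(update *ClusterNodeUpdate) {
	if update.FilerGroup == mc.FilerGroup {
		fmt.Printf("noticed %s.%s %s\n", update.FilerGroup, update.NodeType, update.Address)
		// the stats counter increment would happen here, inside the same check
		if mc.OnPeerUpdate != nil {
			mc.OnPeerUpdate(update, time.Now())
		}
	}
	// An update for another group now falls through with no side effects.
}

func main() {
	prod := &MasterClient{FilerGroup: "production"}
	prod.OnPeerUpdate = func(u *ClusterNodeUpdate, _ time.Time) {
		fmt.Printf("production DLM ring update: %s\n", u.Address)
	}

	prod.processUpdate(&ClusterNodeUpdate{FilerGroup: "production", NodeType: "filer", Address: "filer-a:8888"})
	// With the fix this second call is a no-op; with the bug it would have
	// pushed a staging filer into the production DLM ring.
	prod.processUpdate(&ClusterNodeUpdate{FilerGroup: "staging", NodeType: "filer", Address: "filer-b:8888"})
}

Running it prints one noticed/DLM-ring pair for the production update and
nothing for the staging one; before the fix, the second call would also
have reached the callback.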
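
Reviewer sketch 2 (not part of the patch): the circuit-breaker rule that
the whitespace-only hunks in filer_client.go pass through. The filerHealth
field names and the thresholds (3 consecutive failures, 30-second re-check
window) come from the diff; the surrounding FilerClient plumbing is
omitted, and shouldSkip is a hypothetical free-function stand-in for
shouldSkipUnhealthyFiler.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// filerHealth mirrors the two fields used by shouldSkipUnhealthyFiler.
type filerHealth struct {
	failureCount    int32
	lastFailureTime time.Time
}

// shouldSkip reproduces the rule from the diff: allow up to 2 consecutive
// failures, skip after the 3rd, and re-probe once 30 seconds have passed
// since the last recorded failure.
func shouldSkip(h *filerHealth) bool {
	if atomic.LoadInt32(&h.failureCount) < 3 {
		return false
	}
	if time.Since(h.lastFailureTime) > 30*time.Second {
		return false // window expired: time to re-check the filer
	}
	return true
}

func main() {
	h := &filerHealth{}
	for i := 0; i < 4; i++ {
		fmt.Printf("failures=%d skip=%v\n", atomic.LoadInt32(&h.failureCount), shouldSkip(h))
		atomic.AddInt32(&h.failureCount, 1)
		h.lastFailureTime = time.Now()
	}
	// skip flips to true once failureCount reaches 3 and the last failure
	// is less than 30 seconds old.
}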