
fix: OnPeerUpdate should only process updates for matching FilerGroup

Critical bug: The OnPeerUpdate callback was incorrectly moved outside the
FilerGroup check when restoring observability instrumentation. This caused
clients to process peer updates for ALL filer groups, not just their own.

Problem:
  Before: mc.OnPeerUpdate was only called when update.FilerGroup == mc.FilerGroup
  Bug:    mc.OnPeerUpdate was called for ALL updates, regardless of FilerGroup

Impact:
- Multi-tenant deployments with separate filer groups would see cross-group
  updates (e.g., group A clients processing group B updates)
- Could cause incorrect cluster membership tracking
- OnPeerUpdate handlers (like Filer's DLM ring updates) would receive
  irrelevant updates from other groups

Example scenario:
  Cluster has two filer groups: "production" and "staging"
  Production filer connects with FilerGroup="production"

  Incorrect behavior (bug):
    - Receives "staging" group updates
    - Incorrectly adds staging filers to production DLM ring
    - Cross-tenant data access issues

  Correct behavior (fixed):
    - Only receives "production" group updates
    - Only adds production filers to production DLM ring
    - Proper isolation between groups

Fix:
  Moved mc.OnPeerUpdate(update, time.Now()) back INSIDE the FilerGroup check
  where it belongs, matching the original implementation.

The logging and stats counter were already correctly scoped to the matching
FilerGroup, so they remain inside the if block as intended.
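
For clarity, below is a minimal, self-contained Go sketch of the corrected
control flow. The ClusterNodeUpdate and masterClient types and the
handleUpdates helper are simplified stand-ins for the real MasterClient code,
not the actual SeaweedFS implementation; only the placement of the
OnPeerUpdate call relative to the FilerGroup check mirrors the fix.

  package main

  import (
  	"fmt"
  	"time"
  )

  // Simplified stand-ins for the real SeaweedFS types, used only to
  // illustrate where the OnPeerUpdate call belongs.
  type ClusterNodeUpdate struct {
  	FilerGroup string
  	NodeType   string
  	Address    string
  }

  type masterClient struct {
  	FilerGroup   string
  	OnPeerUpdate func(update *ClusterNodeUpdate, at time.Time)
  }

  // handleUpdates mirrors the fixed logic: the callback fires only when the
  // update's FilerGroup matches the client's own group.
  func (mc *masterClient) handleUpdates(updates []*ClusterNodeUpdate) {
  	for _, update := range updates {
  		if update.FilerGroup == mc.FilerGroup {
  			// Logging and the stats counter were already scoped here;
  			// after the fix, OnPeerUpdate is scoped here as well.
  			mc.OnPeerUpdate(update, time.Now())
  		}
  		// Bug: before the fix, mc.OnPeerUpdate(update, time.Now()) sat here,
  		// outside the group check, so clients saw every group's updates.
  	}
  }

  func main() {
  	mc := &masterClient{
  		FilerGroup: "production",
  		OnPeerUpdate: func(u *ClusterNodeUpdate, _ time.Time) {
  			fmt.Printf("processing %s.%s %s\n", u.FilerGroup, u.NodeType, u.Address)
  		},
  	}
  	mc.handleUpdates([]*ClusterNodeUpdate{
  		{FilerGroup: "production", NodeType: "filer", Address: "10.0.0.1:8888"},
  		{FilerGroup: "staging", NodeType: "filer", Address: "10.0.1.1:8888"}, // filtered out
  	})
  	// Only the "production" update is processed.
  }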
Branch: pull/7518/head
Author: chrislu, 2 weeks ago
Commit: 99ae38339d

Changed files:
  weed/wdclient/filer_client.go   10 lines changed
  weed/wdclient/masterclient.go    2 lines changed

weed/wdclient/filer_client.go  (10 lines changed)

@@ -37,7 +37,7 @@ type filerHealth struct {
 type FilerClient struct {
     *vidMapClient
     filerAddresses []pb.ServerAddress
-    filerIndex int32 // atomic: current filer index for round-robin
+    filerIndex int32 // atomic: current filer index for round-robin
     filerHealth []*filerHealth // health status per filer (same order as filerAddresses)
     grpcDialOption grpc.DialOption
     urlPreference UrlPreference
@@ -186,17 +186,17 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
     health := fc.filerHealth[index]
     failureCount := atomic.LoadInt32(&health.failureCount)
     // Allow up to 2 failures before skipping
     if failureCount < 3 {
         return false
     }
     // Re-check unhealthy filers every 30 seconds
     if time.Since(health.lastFailureTime) > 30*time.Second {
         return false // Time to re-check
     }
     return true // Skip this unhealthy filer
 }
@@ -244,7 +244,7 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
     for x := int32(0); x < n; x++ {
         // Circuit breaker: skip unhealthy filers
         if fc.shouldSkipUnhealthyFiler(i) {
-            glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
+            glog.V(2).Infof("FilerClient: skipping unhealthy filer %s (consecutive failures: %d)",
                 fc.filerAddresses[i], atomic.LoadInt32(&fc.filerHealth[i].failureCount))
             i++
             if i >= n {
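
As context for the filer_client.go hunks above, here is a minimal,
self-contained sketch of the circuit-breaker check they touch. The
filerHealth field names are taken from the diff; the surrounding wiring
(how failures are recorded) is an assumption for illustration only.

  package main

  import (
  	"fmt"
  	"sync/atomic"
  	"time"
  )

  // Assumed shape of the per-filer health record, based on the fields
  // visible in the diff above.
  type filerHealth struct {
  	failureCount    int32     // consecutive failures, read/written atomically
  	lastFailureTime time.Time // when the most recent failure happened
  }

  // shouldSkip mirrors shouldSkipUnhealthyFiler: tolerate up to 2 consecutive
  // failures, then skip the filer until 30 seconds have passed, after which
  // it becomes eligible for a re-check.
  func shouldSkip(h *filerHealth) bool {
  	if atomic.LoadInt32(&h.failureCount) < 3 {
  		return false
  	}
  	if time.Since(h.lastFailureTime) > 30*time.Second {
  		return false // cool-down elapsed, time to re-check
  	}
  	return true // still considered unhealthy, skip it
  }

  func main() {
  	h := &filerHealth{failureCount: 3, lastFailureTime: time.Now()}
  	fmt.Println(shouldSkip(h)) // true: 3 recent consecutive failures

  	h.lastFailureTime = time.Now().Add(-time.Minute)
  	fmt.Println(shouldSkip(h)) // false: 30s cool-down has passed
  }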

weed/wdclient/masterclient.go  (2 lines changed)

@@ -238,8 +238,8 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
                 glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
             }
             stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc()
+            mc.OnPeerUpdate(update, time.Now())
         }
-        mc.OnPeerUpdate(update, time.Now())
     }
     mc.OnPeerUpdateLock.RUnlock()
 }
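
To make the Impact note about OnPeerUpdate handlers concrete, here is a
hypothetical handler in the spirit of the Filer's DLM ring updates. The ring
type, the IsAdd flag, and the wiring are illustrative assumptions, not the
actual SeaweedFS Filer code; the point is that such a handler stays correct
only if it receives updates for its own filer group, which the fix
guarantees.

  package main

  import (
  	"fmt"
  	"time"
  )

  // Illustrative stand-ins only; not the real SeaweedFS types.
  type ClusterNodeUpdate struct {
  	FilerGroup string
  	Address    string
  	IsAdd      bool // assumed flag: true when a node joins, false when it leaves
  }

  // A toy membership set standing in for the DLM ring.
  type ring struct{ members map[string]bool }

  func newRing() *ring { return &ring{members: map[string]bool{}} }

  // onPeerUpdate is a hypothetical handler. With the fix, it only ever sees
  // updates for its own group, so it can mutate the ring unconditionally;
  // with the bug, a "staging" update could reach a "production" ring here.
  func (r *ring) onPeerUpdate(update *ClusterNodeUpdate, _ time.Time) {
  	if update.IsAdd {
  		r.members[update.Address] = true
  	} else {
  		delete(r.members, update.Address)
  	}
  }

  func main() {
  	r := newRing()
  	r.onPeerUpdate(&ClusterNodeUpdate{FilerGroup: "production", Address: "prod-filer-1:8888", IsAdd: true}, time.Now())
  	fmt.Println(r.members) // the single production member
  }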
