From 7b264afdb4f5d20d7e12782e70c6f212cd628598 Mon Sep 17 00:00:00 2001
From: chrislu <chris.lu@gmail.com>
Date: Thu, 20 Nov 2025 16:45:23 -0800
Subject: [PATCH] fix: always reset vidMap cache on master reconnection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous refactoring removed the else block that resets vidMap when
the first message from a newly connected master is not a VolumeLocation.

Problem scenario:
  1. Client connects to master-1 and builds vidMap cache
  2. Master-1 fails, client connects to master-2
  3. First message from master-2 is a ClusterNodeUpdate (not VolumeLocation)
  4. Old code: vidMap is reset and updated ✅
  5. New code: vidMap is NOT reset ❌
  6. Result: Client uses stale cache from master-1 → data access errors

Example flow with bug:
  Connect to master-2
  First message: ClusterNodeUpdate {filer.x added}
  → No resetVidMap() call
  → vidMap still has master-1's stale volume locations
  → Client reads from wrong volume servers → 404 errors

Fix:
  Restored the else block that resets vidMap when first message is not
  a VolumeLocation:

    if resp.VolumeLocation != nil {
      // ... check leader, reset, and update ...
    } else {
      // First message is ClusterNodeUpdate or other type
      // Must still reset to avoid stale data
      mc.resetVidMap()
    }

This ensures the cache is always cleared when establishing a new master
connection, regardless of what the first message type is.

Root cause:
  During the vidMapClient refactoring, this else block was accidentally
  dropped, making failover behavior fragile and non-deterministic (depends
  on which message type arrives first from the new master).

Impact:
  - High severity for master failover scenarios
  - Could cause read failures, 404s, or wrong data access
  - Only manifests when first message is not VolumeLocation
---
 weed/wdclient/masterclient.go | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go
index 5f47a4a03..496daad3b 100644
--- a/weed/wdclient/masterclient.go
+++ b/weed/wdclient/masterclient.go
@@ -201,6 +201,10 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 			}
 			mc.resetVidMap()
 			mc.updateVidMap(resp)
+		} else {
+			// First message from master is not VolumeLocation (e.g., ClusterNodeUpdate)
+			// Still need to reset cache to ensure we don't use stale data from previous master
+			mc.resetVidMap()
 		}
 		mc.setCurrentMaster(master)
 
@@ -318,16 +322,17 @@ func (mc *MasterClient) setCurrentMaster(master pb.ServerAddress) {
 }
 
 // GetMaster returns the current master address, blocking until connected.
-// 
+//
 // IMPORTANT: This method blocks until KeepConnectedToMaster successfully establishes
 // a connection to a master server. If KeepConnectedToMaster hasn't been started in a
 // background goroutine, this will block indefinitely (or until ctx is canceled).
 //
 // Typical initialization pattern:
-//   mc := wdclient.NewMasterClient(...)
-//   go mc.KeepConnectedToMaster(ctx)  // Start connection management
-//   // ... later ...
-//   master := mc.GetMaster(ctx)       // Will block until connected
+//
+//	mc := wdclient.NewMasterClient(...)
+//	go mc.KeepConnectedToMaster(ctx)  // Start connection management
+//	// ... later ...
+//	master := mc.GetMaster(ctx)       // Will block until connected
 //
 // If called before KeepConnectedToMaster establishes a connection, this may cause
 // unexpected timeouts in LookupVolumeIds and other operations that depend on it.
@@ -404,4 +409,3 @@ func (mc *MasterClient) FindLeaderFromOtherPeers(myMasterAddress pb.ServerAddres
 	glog.V(0).Infof("No existing leader found!")
 	return
 }
-