From 7b264afdb4f5d20d7e12782e70c6f212cd628598 Mon Sep 17 00:00:00 2001 From: chrislu Date: Thu, 20 Nov 2025 16:45:23 -0800 Subject: [PATCH] fix: always reset vidMap cache on master reconnection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous refactoring removed the else block that resets vidMap when the first message from a newly connected master is not a VolumeLocation. Problem scenario: 1. Client connects to master-1 and builds vidMap cache 2. Master-1 fails, client connects to master-2 3. First message from master-2 is a ClusterNodeUpdate (not VolumeLocation) 4. Old code: vidMap is reset and updated ✅ 5. New code: vidMap is NOT reset ❌ 6. Result: Client uses stale cache from master-1 → data access errors Example flow with bug: Connect to master-2 First message: ClusterNodeUpdate {filer.x added} → No resetVidMap() call → vidMap still has master-1's stale volume locations → Client reads from wrong volume servers → 404 errors Fix: Restored the else block that resets vidMap when first message is not a VolumeLocation: if resp.VolumeLocation != nil { // ... check leader, reset, and update ... } else { // First message is ClusterNodeUpdate or other type // Must still reset to avoid stale data mc.resetVidMap() } This ensures the cache is always cleared when establishing a new master connection, regardless of what the first message type is. Root cause: During the vidMapClient refactoring, this else block was accidentally dropped, making failover behavior fragile and non-deterministic (depends on which message type arrives first from the new master). Impact: - High severity for master failover scenarios - Could cause read failures, 404s, or wrong data access - Only manifests when first message is not VolumeLocation --- weed/wdclient/masterclient.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go index 5f47a4a03..496daad3b 100644 --- a/weed/wdclient/masterclient.go +++ b/weed/wdclient/masterclient.go @@ -201,6 +201,10 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server } mc.resetVidMap() mc.updateVidMap(resp) + } else { + // First message from master is not VolumeLocation (e.g., ClusterNodeUpdate) + // Still need to reset cache to ensure we don't use stale data from previous master + mc.resetVidMap() } mc.setCurrentMaster(master) @@ -318,16 +322,17 @@ func (mc *MasterClient) setCurrentMaster(master pb.ServerAddress) { } // GetMaster returns the current master address, blocking until connected. -// +// // IMPORTANT: This method blocks until KeepConnectedToMaster successfully establishes // a connection to a master server. If KeepConnectedToMaster hasn't been started in a // background goroutine, this will block indefinitely (or until ctx is canceled). // // Typical initialization pattern: -// mc := wdclient.NewMasterClient(...) -// go mc.KeepConnectedToMaster(ctx) // Start connection management -// // ... later ... -// master := mc.GetMaster(ctx) // Will block until connected +// +// mc := wdclient.NewMasterClient(...) +// go mc.KeepConnectedToMaster(ctx) // Start connection management +// // ... later ... +// master := mc.GetMaster(ctx) // Will block until connected // // If called before KeepConnectedToMaster establishes a connection, this may cause // unexpected timeouts in LookupVolumeIds and other operations that depend on it. @@ -404,4 +409,3 @@ func (mc *MasterClient) FindLeaderFromOtherPeers(myMasterAddress pb.ServerAddres glog.V(0).Infof("No existing leader found!") return } -