From 19962b8c561a908a32797971a519ed8ef6fb2e9a Mon Sep 17 00:00:00 2001
From: chrislu <chris.lu@gmail.com>
Date: Thu, 20 Nov 2025 16:35:35 -0800
Subject: [PATCH] fix: data race on filerHealth.lastFailureTime in circuit
 breaker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The circuit breaker tracked lastFailureTime as time.Time, which was
written in recordFilerFailure and read in shouldSkipUnhealthyFiler
without synchronization, causing a data race.

Data race scenario:
  Goroutine 1: recordFilerFailure(0)
    health.lastFailureTime = time.Now()  // ❌ unsynchronized write

  Goroutine 2: shouldSkipUnhealthyFiler(0)
    time.Since(health.lastFailureTime)   // ❌ unsynchronized read

  → RACE DETECTED by -race detector

Fix:
  Changed lastFailureTime from time.Time to int64 (lastFailureTimeNs)
  storing Unix nanoseconds for atomic access:

  Write side (recordFilerFailure):
    atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())

  Read side (shouldSkipUnhealthyFiler):
    lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
    if lastFailureNs == 0 { return false }  // Never failed
    lastFailureTime := time.Unix(0, lastFailureNs)
    time.Since(lastFailureTime) > 30*time.Second

Benefits:
  - Atomic reads/writes (no data race)
  - Efficient (int64 is 8 bytes, always atomic on 64-bit systems)
  - Zero value (0) naturally means "never failed"
  - No mutex needed (lock-free circuit breaker)

Note: sync/atomic was already imported for failureCount, so no new
import needed.
---
 weed/wdclient/filer_client.go | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go
index bb2e9e5a4..7821bf068 100644
--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -26,8 +26,8 @@ const (
 
 // filerHealth tracks the health status of a filer
 type filerHealth struct {
-	failureCount    int32     // atomic: consecutive failures
-	lastFailureTime time.Time // last time this filer failed
+	failureCount       int32 // atomic: consecutive failures
+	lastFailureTimeNs  int64 // atomic: last failure time in Unix nanoseconds
 }
 
 // FilerClient provides volume location services by querying a filer
@@ -198,17 +198,22 @@ func isRetryableGrpcError(err error) bool {
 func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	health := fc.filerHealth[index]
 	failureCount := atomic.LoadInt32(&health.failureCount)
-
+	
 	// Allow up to 2 failures before skipping
 	if failureCount < 3 {
 		return false
 	}
-
+	
 	// Re-check unhealthy filers every 30 seconds
-	if time.Since(health.lastFailureTime) > 30*time.Second {
+	lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
+	if lastFailureNs == 0 {
+		return false // Never failed, shouldn't skip
+	}
+	lastFailureTime := time.Unix(0, lastFailureNs)
+	if time.Since(lastFailureTime) > 30*time.Second {
 		return false // Time to re-check
 	}
-
+	
 	return true // Skip this unhealthy filer
 }
 
@@ -222,7 +227,7 @@ func (fc *FilerClient) recordFilerSuccess(index int32) {
 func (fc *FilerClient) recordFilerFailure(index int32) {
 	health := fc.filerHealth[index]
 	atomic.AddInt32(&health.failureCount, 1)
-	health.lastFailureTime = time.Now()
+	atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())
 }
 
 // LookupVolumeIds queries the filer for volume locations with automatic failover