From 19962b8c561a908a32797971a519ed8ef6fb2e9a Mon Sep 17 00:00:00 2001 From: chrislu Date: Thu, 20 Nov 2025 16:35:35 -0800 Subject: [PATCH] fix: data race on filerHealth.lastFailureTime in circuit breaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The circuit breaker tracked lastFailureTime as time.Time, which was written in recordFilerFailure and read in shouldSkipUnhealthyFiler without synchronization, causing a data race. Data race scenario: Goroutine 1: recordFilerFailure(0) health.lastFailureTime = time.Now() // ❌ unsynchronized write Goroutine 2: shouldSkipUnhealthyFiler(0) time.Since(health.lastFailureTime) // ❌ unsynchronized read → RACE DETECTED by -race detector Fix: Changed lastFailureTime from time.Time to int64 (lastFailureTimeNs) storing Unix nanoseconds for atomic access: Write side (recordFilerFailure): atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano()) Read side (shouldSkipUnhealthyFiler): lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs) if lastFailureNs == 0 { return false } // Never failed lastFailureTime := time.Unix(0, lastFailureNs) time.Since(lastFailureTime) > 30*time.Second Benefits: - Atomic reads/writes (no data race) - Efficient (int64 is 8 bytes, always atomic on 64-bit systems) - Zero value (0) naturally means "never failed" - No mutex needed (lock-free circuit breaker) Note: sync/atomic was already imported for failureCount, so no new import needed. --- weed/wdclient/filer_client.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go index bb2e9e5a4..7821bf068 100644 --- a/weed/wdclient/filer_client.go +++ b/weed/wdclient/filer_client.go @@ -26,8 +26,8 @@ const ( // filerHealth tracks the health status of a filer type filerHealth struct { - failureCount int32 // atomic: consecutive failures - lastFailureTime time.Time // last time this filer failed + failureCount int32 // atomic: consecutive failures + lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds } // FilerClient provides volume location services by querying a filer @@ -198,17 +198,22 @@ func isRetryableGrpcError(err error) bool { func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool { health := fc.filerHealth[index] failureCount := atomic.LoadInt32(&health.failureCount) - + // Allow up to 2 failures before skipping if failureCount < 3 { return false } - + // Re-check unhealthy filers every 30 seconds - if time.Since(health.lastFailureTime) > 30*time.Second { + lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs) + if lastFailureNs == 0 { + return false // Never failed, shouldn't skip + } + lastFailureTime := time.Unix(0, lastFailureNs) + if time.Since(lastFailureTime) > 30*time.Second { return false // Time to re-check } - + return true // Skip this unhealthy filer } @@ -222,7 +227,7 @@ func (fc *FilerClient) recordFilerSuccess(index int32) { func (fc *FilerClient) recordFilerFailure(index int32) { health := fc.filerHealth[index] atomic.AddInt32(&health.failureCount, 1) - health.lastFailureTime = time.Now() + atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano()) } // LookupVolumeIds queries the filer for volume locations with automatic failover