Browse Source

fix: data race on filerHealth.lastFailureTime in circuit breaker

The circuit breaker tracked lastFailureTime as time.Time, which was
written in recordFilerFailure and read in shouldSkipUnhealthyFiler
without synchronization, causing a data race.

Data race scenario:
  Goroutine 1: recordFilerFailure(0)
    health.lastFailureTime = time.Now()  //  unsynchronized write

  Goroutine 2: shouldSkipUnhealthyFiler(0)
    time.Since(health.lastFailureTime)   //  unsynchronized read

  → RACE DETECTED by -race detector

Fix:
  Changed lastFailureTime from time.Time to int64 (lastFailureTimeNs)
  storing Unix nanoseconds for atomic access:

  Write side (recordFilerFailure):
    atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())

  Read side (shouldSkipUnhealthyFiler):
    lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
    if lastFailureNs == 0 { return false }  // Never failed
    lastFailureTime := time.Unix(0, lastFailureNs)
    time.Since(lastFailureTime) > 30*time.Second

Benefits:
  - Atomic reads/writes (no data race)
  - Efficient (int64 is 8 bytes, always atomic on 64-bit systems)
  - Zero value (0) naturally means "never failed"
  - No mutex needed (lock-free circuit breaker)

Note: sync/atomic was already imported for failureCount, so no new
import needed.
pull/7518/head
chrislu 2 weeks ago
parent
commit
19962b8c56
  1. 19
      weed/wdclient/filer_client.go

19
weed/wdclient/filer_client.go

@ -26,8 +26,8 @@ const (
// filerHealth tracks the health status of a filer
type filerHealth struct {
failureCount int32 // atomic: consecutive failures
lastFailureTime time.Time // last time this filer failed
failureCount int32 // atomic: consecutive failures
lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
}
// FilerClient provides volume location services by querying a filer
@ -198,17 +198,22 @@ func isRetryableGrpcError(err error) bool {
func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
health := fc.filerHealth[index]
failureCount := atomic.LoadInt32(&health.failureCount)
// Allow up to 2 failures before skipping
if failureCount < 3 {
return false
}
// Re-check unhealthy filers every 30 seconds
if time.Since(health.lastFailureTime) > 30*time.Second {
lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
if lastFailureNs == 0 {
return false // Never failed, shouldn't skip
}
lastFailureTime := time.Unix(0, lastFailureNs)
if time.Since(lastFailureTime) > 30*time.Second {
return false // Time to re-check
}
return true // Skip this unhealthy filer
}
@ -222,7 +227,7 @@ func (fc *FilerClient) recordFilerSuccess(index int32) {
func (fc *FilerClient) recordFilerFailure(index int32) {
health := fc.filerHealth[index]
atomic.AddInt32(&health.failureCount, 1)
health.lastFailureTime = time.Now()
atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())
}
// LookupVolumeIds queries the filer for volume locations with automatic failover

Loading…
Cancel
Save