
fix: data race on filerHealth.lastFailureTime in circuit breaker

The circuit breaker tracked lastFailureTime as time.Time, which was
written in recordFilerFailure and read in shouldSkipUnhealthyFiler
without synchronization, causing a data race.

Data race scenario:
  Goroutine 1: recordFilerFailure(0)
    health.lastFailureTime = time.Now()  //  unsynchronized write

  Goroutine 2: shouldSkipUnhealthyFiler(0)
    time.Since(health.lastFailureTime)   //  unsynchronized read

  → RACE DETECTED by the Go race detector (-race)
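
For illustration only, a stripped-down standalone program (not the SeaweedFS
code; the health type and loop counts here are made up) that follows the same
write/read pattern and is flagged by `go run -race`:

  package main

  import (
      "sync"
      "time"
  )

  // health models the problematic field: a plain time.Time that two
  // goroutines touch without synchronization.
  type health struct {
      lastFailureTime time.Time
  }

  func main() {
      h := &health{}
      var wg sync.WaitGroup
      wg.Add(2)
      go func() { // writer, analogous to recordFilerFailure
          defer wg.Done()
          for i := 0; i < 1000; i++ {
              h.lastFailureTime = time.Now()
          }
      }()
      go func() { // reader, analogous to shouldSkipUnhealthyFiler
          defer wg.Done()
          for i := 0; i < 1000; i++ {
              _ = time.Since(h.lastFailureTime)
          }
      }()
      wg.Wait()
  }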

Fix:
  Changed lastFailureTime from time.Time to int64 (lastFailureTimeNs)
  storing Unix nanoseconds for atomic access:

  Write side (recordFilerFailure):
    atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())

  Read side (shouldSkipUnhealthyFiler):
    lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
    if lastFailureNs == 0 { return false }  // Never failed
    lastFailureTime := time.Unix(0, lastFailureNs)
    time.Since(lastFailureTime) > 30*time.Second

Benefits:
  - Atomic reads/writes (no data race)
  - Efficient (int64 is 8 bytes, always atomic on 64-bit systems)
  - Zero value (0) naturally means "never failed"
  - No mutex needed (lock-free circuit breaker)

Note: sync/atomic was already imported for failureCount, so no new
import needed.
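
Putting the pieces together, a minimal self-contained sketch of the resulting
lock-free pattern (names simplified and the failureCount threshold check
omitted; the real methods are recordFilerFailure and shouldSkipUnhealthyFiler
on FilerClient):

  package main

  import (
      "fmt"
      "sync/atomic"
      "time"
  )

  // filerHealth models the fixed struct: both fields are only touched
  // through sync/atomic, so no mutex is required.
  type filerHealth struct {
      failureCount      int32 // atomic: consecutive failures
      lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
  }

  // recordFailure is the write side: bump the counter, stamp the time atomically.
  func (h *filerHealth) recordFailure() {
      atomic.AddInt32(&h.failureCount, 1)
      atomic.StoreInt64(&h.lastFailureTimeNs, time.Now().UnixNano())
  }

  // shouldSkip is the read side: skip a filer only if it failed within the
  // last 30 seconds (the real code also checks the failure count first).
  func (h *filerHealth) shouldSkip() bool {
      ns := atomic.LoadInt64(&h.lastFailureTimeNs)
      if ns == 0 {
          return false // zero value means "never failed"
      }
      return time.Since(time.Unix(0, ns)) <= 30*time.Second
  }

  func main() {
      h := &filerHealth{}
      fmt.Println(h.shouldSkip()) // false: never failed
      h.recordFailure()
      fmt.Println(h.shouldSkip()) // true: failed just now
  }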
commit 19962b8c56 (chrislu, 3 weeks ago, pull/7518/head)

--- a/weed/wdclient/filer_client.go
+++ b/weed/wdclient/filer_client.go
@@ -27,7 +27,7 @@ const (
 // filerHealth tracks the health status of a filer
 type filerHealth struct {
 	failureCount int32 // atomic: consecutive failures
-	lastFailureTime time.Time // last time this filer failed
+	lastFailureTimeNs int64 // atomic: last failure time in Unix nanoseconds
 }
 
 // FilerClient provides volume location services by querying a filer
@@ -205,7 +205,12 @@ func (fc *FilerClient) shouldSkipUnhealthyFiler(index int32) bool {
 	}
 
 	// Re-check unhealthy filers every 30 seconds
-	if time.Since(health.lastFailureTime) > 30*time.Second {
+	lastFailureNs := atomic.LoadInt64(&health.lastFailureTimeNs)
+	if lastFailureNs == 0 {
+		return false // Never failed, shouldn't skip
+	}
+	lastFailureTime := time.Unix(0, lastFailureNs)
+	if time.Since(lastFailureTime) > 30*time.Second {
 		return false // Time to re-check
 	}
 
@@ -222,7 +227,7 @@ func (fc *FilerClient) recordFilerSuccess(index int32) {
 func (fc *FilerClient) recordFilerFailure(index int32) {
 	health := fc.filerHealth[index]
 	atomic.AddInt32(&health.failureCount, 1)
-	health.lastFailureTime = time.Now()
+	atomic.StoreInt64(&health.lastFailureTimeNs, time.Now().UnixNano())
 }
 
 // LookupVolumeIds queries the filer for volume locations with automatic failover
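
A possible regression test, sketched here only as an idea (hypothetical, not
part of this commit; package name and placement are assumptions): it models
lastFailureTimeNs with the same atomic pattern and hammers the write and read
paths from multiple goroutines, so `go test -race` fails if either side falls
back to unsynchronized access.

  package wdclient_test

  import (
      "sync"
      "sync/atomic"
      "testing"
      "time"
  )

  func TestLastFailureTimeAtomicAccess(t *testing.T) {
      var lastFailureTimeNs int64 // stands in for filerHealth.lastFailureTimeNs

      var wg sync.WaitGroup
      for i := 0; i < 4; i++ {
          wg.Add(2)
          go func() { // write side, analogous to recordFilerFailure
              defer wg.Done()
              for j := 0; j < 1000; j++ {
                  atomic.StoreInt64(&lastFailureTimeNs, time.Now().UnixNano())
              }
          }()
          go func() { // read side, analogous to shouldSkipUnhealthyFiler
              defer wg.Done()
              for j := 0; j < 1000; j++ {
                  if ns := atomic.LoadInt64(&lastFailureTimeNs); ns != 0 {
                      _ = time.Since(time.Unix(0, ns))
                  }
              }
          }()
      }
      wg.Wait()
  }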
