You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

307 lines
8.5 KiB

package weed_server
import (
"sync/atomic"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
// TestQABlockHeartbeatCollector is the table-driven entry point for the block
// heartbeat collector QA scenarios. Every scenario is executed as its own
// named subtest, sequentially and in the order listed below.
func TestQABlockHeartbeatCollector(t *testing.T) {
	type scenario struct {
		name string
		run  func(t *testing.T)
	}

	scenarios := []scenario{
		// Stop lifecycle.
		{"stop_before_run_deadlocks", testStopBeforeRunDeadlocks},
		{"double_stop_no_panic", testDoubleStopNoPanic},
		{"stop_during_callback", testStopDuringCallback},

		// Interval edge cases.
		{"zero_interval_panics", testZeroIntervalPanics},
		{"very_short_interval", testVeryShortInterval},

		// Callback edge cases.
		{"callback_panic_crashes_goroutine", testCallbackPanicCrashesGoroutine},
		{"callback_slow_blocks_next_tick", testCallbackSlowBlocksNextTick},
		{"callback_set_after_run", testCallbackSetAfterRun},

		// Concurrency.
		{"concurrent_stop_calls", testConcurrentStopCalls},
	}

	for i := range scenarios {
		sc := scenarios[i]
		t.Run(sc.name, sc.run)
	}
}
// ---------------------------------------------------------------------------
// Stop lifecycle
// ---------------------------------------------------------------------------
func testStopBeforeRunDeadlocks(t *testing.T) {
// BUG-CP4B2-1: Stop() before Run() blocks forever on <-c.done.
// done channel is unbuffered and only closed by Run()'s defer.
// If Run() never starts, Stop() hangs.
bs := newTestBlockService(t)
collector := NewBlockVolumeHeartbeatCollector(bs, 100*time.Millisecond)
stopped := make(chan struct{})
go func() {
collector.Stop()
close(stopped)
}()
select {
case <-stopped:
// Good: Stop() returned. Bug is fixed.
case <-time.After(2 * time.Second):
t.Error("BUG-CP4B2-1: Stop() before Run() deadlocked (blocked >2s on <-c.done)")
// We can't recover from this -- the goroutine is leaked.
// Start Run() to unblock.
go collector.Run()
<-stopped
}
}
func testDoubleStopNoPanic(t *testing.T) {
bs := newTestBlockService(t)
collector := NewBlockVolumeHeartbeatCollector(bs, 10*time.Millisecond)
collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {})
go collector.Run()
time.Sleep(30 * time.Millisecond)
// First stop.
collector.Stop()
// Second stop should not panic (sync.Once + closed done channel).
done := make(chan struct{})
go func() {
defer func() {
if r := recover(); r != nil {
t.Errorf("double Stop() panicked: %v", r)
}
close(done)
}()
collector.Stop()
}()
select {
case <-done:
case <-time.After(2 * time.Second):
t.Fatal("double Stop() blocked >2s")
}
}
// testStopDuringCallback calls Stop() while the status callback is parked and
// logs whether Stop() returned before or after the callback was released.
// Neither outcome fails the test; the only hard failure is the callback never
// being invoked within two seconds.
func testStopDuringCallback(t *testing.T) {
	bs := newTestBlockService(t)
	collector := NewBlockVolumeHeartbeatCollector(bs, 10*time.Millisecond)

	// Buffered with capacity 1 so the first "started" signal is retained even
	// if the callback fires before this goroutine reaches the receive below.
	// With an unbuffered channel the non-blocking send would drop the signal,
	// the callback would park on callbackRelease, and the test would fail
	// spuriously with "callback never started".
	callbackStarted := make(chan struct{}, 1)
	callbackRelease := make(chan struct{})
	collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {
		select {
		case callbackStarted <- struct{}{}:
		default:
			// A signal is already pending; never block the callback on it.
		}
		// Park until the test releases the callback (closing the channel
		// also lets every later invocation fall straight through).
		<-callbackRelease
	})
	go collector.Run()

	// Wait for the callback to start.
	select {
	case <-callbackStarted:
	case <-time.After(2 * time.Second):
		// Unpark any callback that might be waiting before failing.
		close(callbackRelease)
		t.Fatal("callback never started")
	}

	// Stop while the callback is parked.
	stopDone := make(chan struct{})
	go func() {
		collector.Stop()
		close(stopDone)
	}()

	select {
	case <-stopDone:
		// Stop() returned although the callback had not been released yet.
		close(callbackRelease)
		t.Log("Stop() returned while callback was blocked (Run exited between ticks)")
	case <-time.After(100 * time.Millisecond):
		// Expected per the original notes: Run() only observes the stop
		// request on its next loop iteration, so the in-flight callback has
		// to finish before Stop() can return.
		close(callbackRelease)
		<-stopDone
		t.Log("Stop() waited for callback to finish before returning (correct)")
	}
}
// ---------------------------------------------------------------------------
// Interval edge cases
// ---------------------------------------------------------------------------
func testZeroIntervalPanics(t *testing.T) {
// BUG-CP4B2-2 (fixed): zero interval is clamped to minHeartbeatInterval.
// Verify: no panic, collector runs normally, callbacks fire.
bs := newTestBlockService(t)
collector := NewBlockVolumeHeartbeatCollector(bs, 0)
var count atomic.Int64
collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {
count.Add(1)
})
go collector.Run()
time.Sleep(30 * time.Millisecond)
collector.Stop()
n := count.Load()
if n < 1 {
t.Errorf("expected at least 1 callback with clamped interval, got %d", n)
}
t.Logf("zero interval (clamped): %d callbacks in 30ms", n)
}
// testVeryShortInterval runs the collector with a 1ms interval for 50ms and
// requires at least five callback invocations in that window.
func testVeryShortInterval(t *testing.T) {
	const (
		interval = 1 * time.Millisecond
		runFor   = 50 * time.Millisecond
		minCalls = 5
	)

	svc := newTestBlockService(t)
	var fired atomic.Int64
	hc := NewBlockVolumeHeartbeatCollector(svc, interval)
	hc.SetStatusCallback(func(_ []blockvol.BlockVolumeInfoMessage) {
		fired.Add(1)
	})

	go hc.Run()
	time.Sleep(runFor)
	hc.Stop()

	got := fired.Load()
	if got < minCalls {
		t.Errorf("expected >= 5 callbacks at 1ms interval over 50ms, got %d", got)
	}
	t.Logf("1ms interval: %d callbacks in 50ms", got)
}
// ---------------------------------------------------------------------------
// Callback edge cases
// ---------------------------------------------------------------------------
func testCallbackPanicCrashesGoroutine(t *testing.T) {
// BUG-CP4B2-3 (fixed): safeCallback recovers panics. Run() continues.
// Verify: panic is logged, collector keeps running, subsequent callbacks fire.
bs := newTestBlockService(t)
collector := NewBlockVolumeHeartbeatCollector(bs, 10*time.Millisecond)
var callCount atomic.Int64
collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {
n := callCount.Add(1)
if n == 1 {
panic("deliberate test panic in callback")
}
})
go collector.Run()
// Wait for multiple callbacks (first panics, subsequent should still fire).
time.Sleep(100 * time.Millisecond)
collector.Stop()
n := callCount.Load()
if n < 2 {
t.Errorf("expected >= 2 callbacks (first panics, rest recover), got %d", n)
}
t.Logf("callback panic recovery: %d callbacks total (first panicked, rest recovered)", n)
}
// testCallbackSlowBlocksNextTick checks that a callback slower than the tick
// interval throttles the collector: with a 10ms interval and a callback that
// sleeps 50ms, a 200ms run should yield roughly 4 invocations (200ms / 50ms)
// rather than 20 (200ms / 10ms). Anything above 8 is reported as an error.
func testCallbackSlowBlocksNextTick(t *testing.T) {
	const (
		interval     = 10 * time.Millisecond
		callbackCost = 50 * time.Millisecond // 5x the interval
		runFor       = 200 * time.Millisecond
		maxCalls     = 8
	)

	svc := newTestBlockService(t)
	var fired atomic.Int64
	hc := NewBlockVolumeHeartbeatCollector(svc, interval)
	hc.SetStatusCallback(func(_ []blockvol.BlockVolumeInfoMessage) {
		fired.Add(1)
		time.Sleep(callbackCost)
	})

	go hc.Run()
	time.Sleep(runFor)
	hc.Stop()

	got := fired.Load()
	if got > maxCalls {
		t.Errorf("expected slow callback to throttle ticks, got %d callbacks", got)
	}
	t.Logf("slow callback: %d callbacks in 200ms (10ms interval, 50ms callback)", got)
}
func testCallbackSetAfterRun(t *testing.T) {
// Setting SetStatusCallback after Run() starts -- now safe with cbMu
// (BUG-CP4B3-2 fix).
bs := newTestBlockService(t)
collector := NewBlockVolumeHeartbeatCollector(bs, 10*time.Millisecond)
// Start with nil callback.
go collector.Run()
// Set callback after Run started. With cbMu, this is race-free.
time.Sleep(5 * time.Millisecond)
var called atomic.Bool
collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {
called.Store(true)
})
time.Sleep(50 * time.Millisecond)
collector.Stop()
t.Logf("callback set after Run: called=%v", called.Load())
}
// ---------------------------------------------------------------------------
// Concurrency
// ---------------------------------------------------------------------------
// testConcurrentStopCalls starts a running collector and then calls Stop()
// from ten goroutines at once. Every call must return within a shared
// five-second deadline.
func testConcurrentStopCalls(t *testing.T) {
	const stoppers = 10

	bs := newTestBlockService(t)
	collector := NewBlockVolumeHeartbeatCollector(bs, 10*time.Millisecond)
	collector.SetStatusCallback(func(msgs []blockvol.BlockVolumeInfoMessage) {})
	go collector.Run()
	time.Sleep(30 * time.Millisecond)

	// Each stopper reports completion on a channel buffered for all of them,
	// so a send never blocks even if the test has already given up. This
	// replaces an atomic counter that was polled every millisecond by a
	// helper goroutine, which would keep spinning after a timeout.
	returned := make(chan struct{}, stoppers)
	for i := 0; i < stoppers; i++ {
		go func() {
			collector.Stop()
			returned <- struct{}{}
		}()
	}

	// One deadline covers all stoppers, not five seconds per stopper.
	deadline := time.After(5 * time.Second)
	for i := 0; i < stoppers; i++ {
		select {
		case <-returned:
			// One more Stop() call came back.
		case <-deadline:
			t.Fatal("concurrent Stop() calls blocked >5s")
		}
	}
}