Browse Source

feat: CP11B-3 safe ops — promotion hardening, preflight, manual promote

Six-task checkpoint hardening the promotion and failover paths:

T1: 4-gate candidate evaluation (heartbeat freshness, WAL lag, role,
    server liveness) with structured rejection reasons.
T2: Orphaned-primary re-evaluation on replica reconnect (B-06/B-08).
T3: Deferred timer safety — epoch validation prevents stale timers
    from firing on recreated/changed volumes (B-07).
T4: Rebuild addr cleanup on promotion (B-11), NVMe publication
    refresh on heartbeat, and preflight endpoint wiring.
T5: Manual promote API — POST /block/volume/{name}/promote with
    force flag, target server selection, and structured rejection
    response. Shared applyPromotionLocked/finalizePromotion helpers
    eliminate duplication between auto and manual paths.
T6: Read-only preflight endpoint (GET /block/volume/{name}/preflight)
    and blockapi client wrappers (Preflight, Promote).

BUG-T5-1: PromotionsTotal counter moved to finalizePromotion (shared
    by both auto and manual paths) to prevent metrics divergence.

24 files changed, ~6500 lines added. 42 new QA adversarial tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feature/sw-block
Ping Qiu 8 hours ago
parent
commit
075ff52219
  1. 7
      weed/server/integration_block_test.go
  2. 89
      weed/server/master_block_failover.go
  3. 335
      weed/server/master_block_failover_test.go
  4. 372
      weed/server/master_block_registry.go
  5. 519
      weed/server/master_block_registry_test.go
  6. 3
      weed/server/master_grpc_server.go
  7. 23
      weed/server/master_grpc_server_block.go
  8. 71
      weed/server/master_grpc_server_block_test.go
  9. 6
      weed/server/master_server.go
  10. 96
      weed/server/master_server_handlers_block.go
  11. 1581
      weed/server/qa_block_cp11b3_adversarial_test.go
  12. 25
      weed/server/qa_block_cp63_test.go
  13. 485
      weed/server/qa_block_expand_adversarial_test.go
  14. 1346
      weed/server/qa_block_nvme_publication_test.go
  15. 55
      weed/storage/blockvol/blockapi/client.go
  16. 48
      weed/storage/blockvol/blockapi/types.go
  17. 511
      weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go
  18. 220
      weed/storage/blockvol/testrunner/actions/devops.go
  19. 22
      weed/storage/blockvol/testrunner/actions/devops_test.go
  20. 89
      weed/storage/blockvol/testrunner/actions/snapshot.go
  21. 101
      weed/storage/blockvol/testrunner/infra/ha_target.go
  22. 246
      weed/storage/blockvol/testrunner/scenarios/cp11b3-auto-failover.yaml
  23. 214
      weed/storage/blockvol/testrunner/scenarios/cp11b3-fast-reconnect.yaml
  24. 190
      weed/storage/blockvol/testrunner/scenarios/cp11b3-manual-promote.yaml

7
weed/server/integration_block_test.go

@ -645,13 +645,16 @@ func TestIntegration_DoubleFailover(t *testing.T) {
// Reconnect vs1 first so it becomes a replica (via recoverBlockVolumes).
ms.recoverBlockVolumes(vs1)
// Simulate heartbeat from vs1 that restores iSCSI addr and health score
// (in production this happens when the VS re-registers after reconnect).
// Simulate heartbeat from vs1 that restores iSCSI addr, health score,
// role, and heartbeat timestamp (in production this happens when the
// VS re-registers after reconnect and completes rebuild).
e1, _ = ms.blockRegistry.Lookup("pvc-double-1")
for i := range e1.Replicas {
if e1.Replicas[i].Server == vs1 {
e1.Replicas[i].ISCSIAddr = vs1 + ":3260"
e1.Replicas[i].HealthScore = 1.0
e1.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
e1.Replicas[i].LastHeartbeat = time.Now()
}
}

89
weed/server/master_block_failover.go

@ -57,7 +57,19 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) {
delay := leaseExpiry.Sub(now)
glog.V(0).Infof("failover: %q lease expires in %v, deferring promotion", entry.Name, delay)
volumeName := entry.Name
capturedEpoch := entry.Epoch // T3: capture epoch for stale-timer validation
timer := time.AfterFunc(delay, func() {
// T3: Re-validate before acting — prevent stale timer on recreated/changed volume.
current, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
glog.V(0).Infof("failover: deferred promotion for %q skipped (volume deleted)", volumeName)
return
}
if current.Epoch != capturedEpoch {
glog.V(0).Infof("failover: deferred promotion for %q skipped (epoch changed %d -> %d)",
volumeName, capturedEpoch, current.Epoch)
return
}
ms.promoteReplica(volumeName)
})
ms.blockFailover.mu.Lock()
@ -116,8 +128,15 @@ func (ms *MasterServer) promoteReplica(volumeName string) {
return
}
ms.finalizePromotion(volumeName, oldPrimary, oldPath, newEpoch)
}
// finalizePromotion performs post-registry promotion steps:
// enqueue assignment for new primary, record pending rebuild for old primary, bump metrics.
// Called by both promoteReplica (auto) and blockVolumePromoteHandler (manual).
func (ms *MasterServer) finalizePromotion(volumeName, oldPrimary, oldPath string, newEpoch uint64) {
// Re-read entry after promotion.
entry, ok = ms.blockRegistry.Lookup(volumeName)
entry, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
return
}
@ -198,11 +217,15 @@ func (ms *MasterServer) cancelDeferredTimers(server string) {
// recoverBlockVolumes is called when a previously dead VS reconnects.
// It cancels any deferred promotion timers (R2-F2), drains pending rebuilds,
// and enqueues rebuild assignments.
// enqueues rebuild assignments, and checks for orphaned primaries (T2/B-06).
func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
// R2-F2: Cancel deferred promotion timers for this server to prevent split-brain.
ms.cancelDeferredTimers(reconnectedServer)
// T2 (B-06): Check for orphaned primaries — volumes where the reconnecting
// server is a replica but the primary is dead/disconnected.
ms.reevaluateOrphanedPrimaries(reconnectedServer)
rebuilds := ms.drainPendingRebuilds(reconnectedServer)
if len(rebuilds) == 0 {
return
@ -221,16 +244,74 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
Path: rb.OldPath,
})
// T4: Warn if RebuildListenAddr is empty (new primary hasn't heartbeated yet).
rebuildAddr := entry.RebuildListenAddr
if rebuildAddr == "" {
glog.Warningf("rebuild: %q RebuildListenAddr is empty (new primary %s may not have heartbeated yet), "+
"queuing rebuild anyway — VS should retry on empty addr", rb.VolumeName, entry.VolumeServer)
}
// Enqueue rebuild assignment for the reconnected server.
ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{
Path: rb.OldPath,
Epoch: entry.Epoch,
Role: blockvol.RoleToWire(blockvol.RoleRebuilding),
RebuildAddr: entry.RebuildListenAddr,
RebuildAddr: rebuildAddr,
})
ms.blockRegistry.RebuildsTotal.Add(1)
glog.V(0).Infof("rebuild: enqueued rebuild for %q on %s (epoch=%d, rebuildAddr=%s)",
rb.VolumeName, reconnectedServer, entry.Epoch, entry.RebuildListenAddr)
rb.VolumeName, reconnectedServer, entry.Epoch, rebuildAddr)
}
}
// reevaluateOrphanedPrimaries checks if the given server is a replica for any
// volumes whose primary is dead (not block-capable). If so, promotes the best
// available replica — but only after the old primary's lease has expired, to
// maintain the same split-brain protection as failoverBlockVolumes().
// This fixes B-06 (orphaned primary after replica re-register)
// and partially B-08 (fast reconnect skips failover window).
func (ms *MasterServer) reevaluateOrphanedPrimaries(server string) {
if ms.blockRegistry == nil {
return
}
orphaned := ms.blockRegistry.VolumesWithDeadPrimary(server)
now := time.Now()
for _, volumeName := range orphaned {
entry, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
continue
}
// Respect lease expiry — same gate as failoverBlockVolumes().
leaseExpiry := entry.LastLeaseGrant.Add(entry.LeaseTTL)
if now.Before(leaseExpiry) {
delay := leaseExpiry.Sub(now)
glog.V(0).Infof("failover: orphaned primary for %q (replica %s alive, primary dead) "+
"but lease expires in %v, deferring promotion", volumeName, server, delay)
capturedEpoch := entry.Epoch
deadPrimary := entry.VolumeServer
timer := time.AfterFunc(delay, func() {
current, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
return
}
if current.Epoch != capturedEpoch {
glog.V(0).Infof("failover: deferred orphan promotion for %q skipped (epoch changed %d -> %d)",
volumeName, capturedEpoch, current.Epoch)
return
}
ms.promoteReplica(volumeName)
})
ms.blockFailover.mu.Lock()
ms.blockFailover.deferredTimers[deadPrimary] = append(
ms.blockFailover.deferredTimers[deadPrimary], timer)
ms.blockFailover.mu.Unlock()
continue
}
glog.V(0).Infof("failover: orphaned primary detected for %q (replica %s alive, primary dead, lease expired), promoting",
volumeName, server)
ms.promoteReplica(volumeName)
}
}

335
weed/server/master_block_failover_test.go

@ -34,6 +34,9 @@ func testMasterServerForFailover(t *testing.T) *MasterServer {
// registerVolumeWithReplica creates a volume entry with primary + replica for tests.
func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration) {
t.Helper()
// Mark both servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
ms.blockRegistry.MarkBlockCapable(replica)
entry := &BlockVolumeEntry{
Name: name,
VolumeServer: primary,
@ -53,11 +56,13 @@ func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, re
// CP8-2: also populate Replicas[] for PromoteBestReplica.
Replicas: []ReplicaInfo{
{
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
},
},
}
@ -194,6 +199,9 @@ func TestFailover_MultipleVolumes(t *testing.T) {
func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{
Name: "vol1",
VolumeServer: "vs1",
@ -209,7 +217,7 @@ func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(), // just granted, NOT expired yet
Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
}
ms.blockRegistry.Register(entry)
@ -397,6 +405,9 @@ func TestRebuild_RegistryUpdatedWithNewReplica(t *testing.T) {
func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{
Name: "vol1",
VolumeServer: "vs1",
@ -413,7 +424,7 @@ func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now().Add(-10 * time.Second),
Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
}
ms.blockRegistry.Register(entry)
@ -457,7 +468,7 @@ func TestFailover_TransientDisconnect_NoPromotion(t *testing.T) {
LeaseTTL: 30 * time.Second,
LastLeaseGrant: time.Now(), // just granted
Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
}
ms.blockRegistry.Register(entry)
@ -556,6 +567,10 @@ func TestLifecycle_CreateFailoverRebuild(t *testing.T) {
// registerVolumeRF3 creates a volume entry with primary + 2 replicas for RF=3 tests.
func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1, replica2 string, epoch uint64, leaseTTL time.Duration) {
t.Helper()
// Mark all servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
ms.blockRegistry.MarkBlockCapable(replica1)
ms.blockRegistry.MarkBlockCapable(replica2)
entry := &BlockVolumeEntry{
Name: name,
VolumeServer: primary,
@ -576,20 +591,24 @@ func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1,
ReplicaISCSIAddr: replica1 + ":3260",
Replicas: []ReplicaInfo{
{
Server: replica1,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
ISCSIAddr: replica1 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Server: replica1,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
ISCSIAddr: replica1 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
},
{
Server: replica2,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
ISCSIAddr: replica2 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Server: replica2,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
ISCSIAddr: replica2 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
},
},
}
@ -793,6 +812,10 @@ func TestRF3_AllReplicasDead_NoPromotion(t *testing.T) {
// RF3: Lease deferred promotion with RF=3.
func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.MarkBlockCapable("vs3")
entry := &BlockVolumeEntry{
Name: "vol1",
VolumeServer: "vs1",
@ -807,8 +830,8 @@ func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(), // just granted → NOT expired
Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50},
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
// Deprecated scalar fields.
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
@ -853,8 +876,8 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now(), // just granted → long lease
Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
}
@ -888,3 +911,267 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
t.Fatalf("vs1 should remain primary (timer cancelled), got %q", e.VolumeServer)
}
}
// ============================================================
// CP11B-3 T2: Re-evaluate on Replica Registration (B-06)
// ============================================================
// T2: Orphaned primary + replica reconnects → automatic promotion.
func TestT2_OrphanedPrimary_ReplicaReconnect_Promotes(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Simulate vs1 dying without proper failover (e.g., promotion failed at the time).
// Mark vs1 as dead but DON'T call failoverBlockVolumes (simulates missed/failed failover).
ms.blockRegistry.UnmarkBlockCapable("vs1")
// vs2 reconnects (sends heartbeat). reevaluateOrphanedPrimaries should detect orphaned primary.
ms.recoverBlockVolumes("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("expected promotion to vs2 (orphaned primary), got %q", entry.VolumeServer)
}
if entry.Epoch != 2 {
t.Fatalf("expected epoch 2 after promotion, got %d", entry.Epoch)
}
}
// T2: Replica reconnects but primary is alive → no unnecessary promotion.
func TestT2_PrimaryAlive_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Both servers alive. vs2 reconnects — no orphaned primary.
ms.recoverBlockVolumes("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs1" {
t.Fatalf("primary should remain vs1 (alive), got %q", entry.VolumeServer)
}
if entry.Epoch != 1 {
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
}
}
// T2: Multiple orphaned volumes, all promoted on reconnect.
func TestT2_MultipleOrphanedVolumes(t *testing.T) {
ms := testMasterServerForFailover(t)
// vol1: vs1=primary, vs2=replica
// vol2: vs3=primary, vs2=replica
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
ms.blockRegistry.MarkBlockCapable("vs3")
entry2 := &BlockVolumeEntry{
Name: "vol2", VolumeServer: "vs3", Path: "/data/vol2.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now().Add(-10 * time.Second),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol2.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry2)
// Both primaries die.
ms.blockRegistry.UnmarkBlockCapable("vs1")
ms.blockRegistry.UnmarkBlockCapable("vs3")
// vs2 reconnects → both orphaned volumes should be promoted.
ms.recoverBlockVolumes("vs2")
e1, _ := ms.blockRegistry.Lookup("vol1")
e2, _ := ms.blockRegistry.Lookup("vol2")
if e1.VolumeServer != "vs2" {
t.Fatalf("vol1: expected promotion to vs2, got %q", e1.VolumeServer)
}
if e2.VolumeServer != "vs2" {
t.Fatalf("vol2: expected promotion to vs2, got %q", e2.VolumeServer)
}
}
// T2: Repeated heartbeats do NOT cause duplicate promotions.
func TestT2_RepeatedHeartbeats_NoDuplicatePromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
ms.blockRegistry.UnmarkBlockCapable("vs1")
// First reconnect promotes.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("first call: expected promotion to vs2, got %q", entry.VolumeServer)
}
epochAfterFirst := entry.Epoch
// Second call: vs2 is now the primary AND block-capable. No orphan detected.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ = ms.blockRegistry.Lookup("vol1")
if entry.Epoch != epochAfterFirst {
t.Fatalf("second call should not bump epoch: got %d, want %d", entry.Epoch, epochAfterFirst)
}
}
// T2: Dead primary with active lease, replica reconnects → no immediate promotion.
// Regression test for lease-bypass bug: reevaluateOrphanedPrimaries must respect
// lease expiry, not promote immediately.
func TestT2_OrphanedPrimary_LeaseNotExpired_DefersPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 300 * time.Millisecond,
LastLeaseGrant: time.Now(), // lease still active
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
})
// vs1 dies (unmark block-capable).
ms.blockRegistry.UnmarkBlockCapable("vs1")
// vs2 reconnects — orphan detected, but lease still active → should NOT promote immediately.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs1" {
t.Fatalf("should NOT promote while lease active, got primary=%q", entry.VolumeServer)
}
if entry.Epoch != 1 {
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
}
// Verify a deferred timer was created for the dead primary.
ms.blockFailover.mu.Lock()
timerCount := len(ms.blockFailover.deferredTimers["vs1"])
ms.blockFailover.mu.Unlock()
if timerCount != 1 {
t.Fatalf("expected 1 deferred timer for vs1, got %d", timerCount)
}
// Wait for lease to expire + margin → timer fires, promotion happens.
time.Sleep(450 * time.Millisecond)
entry, _ = ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("after lease expiry, expected promotion to vs2, got %q", entry.VolumeServer)
}
if entry.Epoch != 2 {
t.Fatalf("expected epoch 2, got %d", entry.Epoch)
}
}
// ============================================================
// CP11B-3 T3: Deferred Timer Safety
// ============================================================
// T3: Delete/recreate volume before deferred timer fires → no wrong promotion.
func TestT3_DeferredTimer_VolumeDeleted_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry)
// vs1 dies → deferred timer created (lease not expired, epoch=5).
ms.failoverBlockVolumes("vs1")
// Delete the volume before timer fires.
ms.blockRegistry.Unregister("vol1")
// Wait for timer to fire.
time.Sleep(350 * time.Millisecond)
// Volume should not exist (timer found it deleted, no-op).
_, ok := ms.blockRegistry.Lookup("vol1")
if ok {
t.Fatal("volume should have been deleted, timer should not recreate it")
}
}
// T3: Epoch changes before deferred timer fires → timer rejected.
func TestT3_DeferredTimer_EpochChanged_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.MarkBlockCapable("vs3")
entry := &BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry)
// vs1 dies → deferred timer created (captures epoch=5).
ms.failoverBlockVolumes("vs1")
// Before timer fires, manually bump the epoch (simulating another event).
e, _ := ms.blockRegistry.Lookup("vol1")
e.Epoch = 99
// Wait for timer to fire.
time.Sleep(350 * time.Millisecond)
// Timer should have been rejected (epoch mismatch). Epoch stays at 99.
e, _ = ms.blockRegistry.Lookup("vol1")
if e.Epoch != 99 {
t.Fatalf("epoch should remain 99 (timer rejected), got %d", e.Epoch)
}
// Primary should NOT have changed (deferred promotion was rejected).
if e.VolumeServer != "vs1" {
t.Fatalf("primary should remain vs1 (timer rejected), got %q", e.VolumeServer)
}
}
// ============================================================
// CP11B-3 T4: Rebuild with empty RebuildListenAddr
// ============================================================
// T4: Rebuild queued with empty RebuildListenAddr after promotion.
func TestT4_RebuildEmptyAddr_StillQueued(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Failover: vs1 dies, vs2 promoted. PromoteBestReplica clears RebuildListenAddr.
ms.failoverBlockVolumes("vs1")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.RebuildListenAddr != "" {
t.Fatalf("RebuildListenAddr should be empty after promotion, got %q", entry.RebuildListenAddr)
}
// vs1 reconnects. Rebuild should still be queued (even with empty addr).
ms.recoverBlockVolumes("vs1")
assignments := ms.blockAssignmentQueue.Peek("vs1")
foundRebuild := false
for _, a := range assignments {
if blockvol.RoleFromWire(a.Role) == blockvol.RoleRebuilding {
foundRebuild = true
if a.RebuildAddr != "" {
t.Fatalf("RebuildAddr should be empty (new primary hasn't heartbeated), got %q", a.RebuildAddr)
}
}
}
if !foundRebuild {
t.Fatal("rebuild assignment should still be queued even with empty addr")
}
}

372
weed/server/master_block_registry.go

@ -842,44 +842,91 @@ func (r *BlockVolumeRegistry) PromotionLSNTolerance() uint64 {
return r.promotionLSNTolerance
}
// PromoteBestReplica promotes the best eligible replica to primary.
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
// and role must be RoleReplica (not RoleRebuilding).
// The promoted replica is removed from Replicas[]. Other replicas stay.
// Old primary is NOT added to Replicas (needs rebuild).
// Returns the new epoch.
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, fmt.Errorf("block volume %q not found", name)
// PromotionRejection records why a specific replica was rejected for promotion.
type PromotionRejection struct {
Server string
Reason string // "stale_heartbeat", "wal_lag", "wrong_role", "server_dead"
}
// PromotionPreflightResult is the reusable result of a promotion evaluation.
// Used by auto-promotion, manual promote API, preflight status, and logging.
type PromotionPreflightResult struct {
VolumeName string
Promotable bool // true if a candidate was found
Candidate *ReplicaInfo // best candidate (nil if !Promotable)
CandidateIdx int // index in Replicas[] (-1 if !Promotable)
Rejections []PromotionRejection // why each non-candidate was rejected
Reason string // human-readable summary when !Promotable
}
// evaluatePromotionLocked evaluates promotion candidates for a volume.
// Caller must hold r.mu (read or write). Returns a preflight result without
// mutating the registry. The four gates:
// 1. Heartbeat freshness (within 2×LeaseTTL)
// 2. WAL LSN recency (within promotionLSNTolerance of primary)
// 3. Role must be RoleReplica (not RoleRebuilding)
// 4. Server must be in blockServers (alive) — fixes B-12
func (r *BlockVolumeRegistry) evaluatePromotionLocked(entry *BlockVolumeEntry) PromotionPreflightResult {
result := PromotionPreflightResult{
VolumeName: entry.Name,
CandidateIdx: -1,
}
if len(entry.Replicas) == 0 {
return 0, fmt.Errorf("block volume %q has no replicas", name)
result.Reason = "no replicas"
return result
}
// Filter eligible replicas.
now := time.Now()
freshnessCutoff := 2 * entry.LeaseTTL
if freshnessCutoff == 0 {
freshnessCutoff = 60 * time.Second // default if LeaseTTL not set
freshnessCutoff = 60 * time.Second
}
primaryLSN := entry.WALHeadLSN
bestIdx := -1
for i := range entry.Replicas {
ri := &entry.Replicas[i]
// Gate 1: heartbeat freshness.
if !ri.LastHeartbeat.IsZero() && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
// Gate 1: heartbeat freshness. Zero means never heartbeated — unsafe
// to promote because the registry has no proof the replica is alive,
// caught up, or fully initialized.
if ri.LastHeartbeat.IsZero() {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "no_heartbeat",
})
continue
}
if now.Sub(ri.LastHeartbeat) > freshnessCutoff {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "stale_heartbeat",
})
continue
}
// Gate 2: WAL LSN recency (skip if primary LSN is 0 — no data yet, all eligible).
if primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wal_lag",
})
continue
}
// Gate 3: role must be RoleReplica (not rebuilding/stale).
if ri.Role != 0 && blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
// Gate 3: role must be exactly RoleReplica. Zero/unset role means
// the replica was created but never confirmed its role via heartbeat.
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wrong_role",
})
continue
}
// Gate 4: server must be alive (in blockServers set) — B-12 fix.
if !r.blockServers[ri.Server] {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "server_dead",
})
continue
}
// Eligible — pick best by health score, tie-break by WALHeadLSN.
@ -894,11 +941,39 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
}
if bestIdx == -1 {
return 0, fmt.Errorf("block volume %q: no eligible replicas for promotion", name)
result.Reason = "no eligible replicas"
if len(result.Rejections) > 0 {
result.Reason += ": " + result.Rejections[0].Reason
if len(result.Rejections) > 1 {
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
}
}
return result
}
promoted := entry.Replicas[bestIdx]
result.Promotable = true
ri := entry.Replicas[bestIdx]
result.Candidate = &ri
result.CandidateIdx = bestIdx
return result
}
// EvaluatePromotion returns a read-only preflight result for the named volume
// without mutating the registry. Safe for status/logging/manual promote preview.
func (r *BlockVolumeRegistry) EvaluatePromotion(name string) (PromotionPreflightResult, error) {
r.mu.RLock()
defer r.mu.RUnlock()
entry, ok := r.volumes[name]
if !ok {
return PromotionPreflightResult{VolumeName: name, Reason: "volume not found"}, fmt.Errorf("block volume %q not found", name)
}
return r.evaluatePromotionLocked(entry), nil
}
// applyPromotionLocked applies the promotion of a replica at candidateIdx to primary.
// Caller must hold r.mu (write lock). The promoted replica is removed from Replicas[].
// Old primary is NOT added to Replicas (needs rebuild). Returns the new epoch.
func (r *BlockVolumeRegistry) applyPromotionLocked(entry *BlockVolumeEntry, name string, candidate ReplicaInfo, candidateIdx int) uint64 {
// Remove old primary from byServer index.
r.removeFromServer(entry.VolumeServer, name)
@ -906,18 +981,21 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
newEpoch := entry.Epoch + 1
// Promote replica to primary.
entry.VolumeServer = promoted.Server
entry.Path = promoted.Path
entry.IQN = promoted.IQN
entry.ISCSIAddr = promoted.ISCSIAddr
entry.NvmeAddr = promoted.NvmeAddr
entry.NQN = promoted.NQN
entry.VolumeServer = candidate.Server
entry.Path = candidate.Path
entry.IQN = candidate.IQN
entry.ISCSIAddr = candidate.ISCSIAddr
entry.NvmeAddr = candidate.NvmeAddr
entry.NQN = candidate.NQN
entry.Epoch = newEpoch
entry.Role = blockvol.RoleToWire(blockvol.RolePrimary)
entry.LastLeaseGrant = time.Now()
// Clear stale rebuild/publication metadata from old primary (B-11 partial fix).
entry.RebuildListenAddr = ""
// Remove promoted from Replicas. Others stay.
entry.Replicas = append(entry.Replicas[:bestIdx], entry.Replicas[bestIdx+1:]...)
entry.Replicas = append(entry.Replicas[:candidateIdx], entry.Replicas[candidateIdx+1:]...)
// Sync deprecated scalar fields.
if len(entry.Replicas) > 0 {
@ -940,9 +1018,212 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
// Update byServer index: new primary server now hosts this volume.
r.addToServer(entry.VolumeServer, name)
return newEpoch
}
// PromoteBestReplica promotes the best eligible replica to primary.
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
// role must be RoleReplica (not RoleRebuilding), and server must be alive (B-12 fix).
// The promoted replica is removed from Replicas[]. Other replicas stay.
// Old primary is NOT added to Replicas (needs rebuild).
// Returns the new epoch and the preflight result.
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, fmt.Errorf("block volume %q not found", name)
}
pf := r.evaluatePromotionLocked(entry)
if !pf.Promotable {
return 0, fmt.Errorf("block volume %q: %s", name, pf.Reason)
}
promoted := *pf.Candidate
bestIdx := pf.CandidateIdx
newEpoch := r.applyPromotionLocked(entry, name, promoted, bestIdx)
return newEpoch, nil
}
// evaluateManualPromotionLocked evaluates promotion candidates for a manual promote request.
// Caller must hold r.mu (read or write).
//
// Differences from evaluatePromotionLocked:
// - Primary-alive gate: if !force and current primary is alive, reject with "primary_alive".
// - Target filtering: if targetServer != "", only evaluate that specific replica.
// Returns Reason="target_not_found" if that server is not a replica.
// - Force flag: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag)
// but keeps hard gates (no_heartbeat with zero time, wrong_role, server_dead).
//
// Gate table:
//
// Gate | Normal | Force
// primary_alive | Reject | Skip
// no_heartbeat(0) | Reject | Reject
// stale_heartbeat | Reject | Skip
// wal_lag | Reject | Skip
// wrong_role | Reject | Reject
// server_dead | Reject | Reject
func (r *BlockVolumeRegistry) evaluateManualPromotionLocked(entry *BlockVolumeEntry, targetServer string, force bool) PromotionPreflightResult {
result := PromotionPreflightResult{
VolumeName: entry.Name,
CandidateIdx: -1,
}
// Primary-alive gate (soft — skipped when force=true).
if !force && r.blockServers[entry.VolumeServer] {
result.Reason = "primary_alive"
return result
}
if len(entry.Replicas) == 0 {
result.Reason = "no replicas"
return result
}
// Target filtering: if a specific server is requested, find its index first.
// Return early if not found.
if targetServer != "" {
found := false
for i := range entry.Replicas {
if entry.Replicas[i].Server == targetServer {
found = true
break
}
}
if !found {
result.Reason = "target_not_found"
return result
}
}
now := time.Now()
freshnessCutoff := 2 * entry.LeaseTTL
if freshnessCutoff == 0 {
freshnessCutoff = 60 * time.Second
}
primaryLSN := entry.WALHeadLSN
bestIdx := -1
for i := range entry.Replicas {
ri := &entry.Replicas[i]
// If targeting a specific server, skip all others.
if targetServer != "" && ri.Server != targetServer {
continue
}
// Hard gate: no heartbeat (zero time) — unsafe regardless of force.
if ri.LastHeartbeat.IsZero() {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "no_heartbeat",
})
continue
}
// Soft gate: stale heartbeat — skipped when force=true.
if !force && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "stale_heartbeat",
})
continue
}
// Soft gate: WAL lag — skipped when force=true.
if !force && primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wal_lag",
})
continue
}
// Hard gate: role must be exactly RoleReplica.
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wrong_role",
})
continue
}
// Hard gate: server must be alive (in blockServers set).
if !r.blockServers[ri.Server] {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "server_dead",
})
continue
}
// Eligible — pick best by health score, tie-break by WALHeadLSN.
if bestIdx == -1 {
bestIdx = i
} else if ri.HealthScore > entry.Replicas[bestIdx].HealthScore {
bestIdx = i
} else if ri.HealthScore == entry.Replicas[bestIdx].HealthScore &&
ri.WALHeadLSN > entry.Replicas[bestIdx].WALHeadLSN {
bestIdx = i
}
}
if bestIdx == -1 {
result.Reason = "no eligible replicas"
if len(result.Rejections) > 0 {
result.Reason += ": " + result.Rejections[0].Reason
if len(result.Rejections) > 1 {
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
}
}
return result
}
result.Promotable = true
ri := entry.Replicas[bestIdx]
result.Candidate = &ri
result.CandidateIdx = bestIdx
return result
}
// ManualPromote promotes a specific replica (or the best eligible replica) to primary.
// Unlike PromoteBestReplica, it accepts operator overrides:
// - targetServer: if non-empty, only that replica is considered.
// - force: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag).
//
// Returns (newEpoch, oldPrimary, oldPath, preflightResult, nil) on success.
// oldPrimary and oldPath are captured under the lock to avoid TOCTOU with
// concurrent auto-failover (BUG-T5-2 fix).
// Returns (0, "", "", preflightResult, err) on rejection or lookup failure.
func (r *BlockVolumeRegistry) ManualPromote(name, targetServer string, force bool) (uint64, string, string, PromotionPreflightResult, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, "", "", PromotionPreflightResult{VolumeName: name, Reason: "volume not found"},
fmt.Errorf("block volume %q not found", name)
}
// Capture old primary info under lock (BUG-T5-2 fix).
oldPrimary := entry.VolumeServer
oldPath := entry.Path
pf := r.evaluateManualPromotionLocked(entry, targetServer, force)
if !pf.Promotable {
return 0, "", "", pf, fmt.Errorf("block volume %q: %s", name, pf.Reason)
}
promoted := *pf.Candidate
candidateIdx := pf.CandidateIdx
newEpoch := r.applyPromotionLocked(entry, name, promoted, candidateIdx)
return newEpoch, oldPrimary, oldPath, pf, nil
}
// MarkBlockCapable records that the given server supports block volumes.
func (r *BlockVolumeRegistry) MarkBlockCapable(server string) {
r.mu.Lock()
@ -1045,6 +1326,41 @@ func (r *BlockVolumeRegistry) ServerSummaries() []BlockServerSummary {
return summaries
}
// IsBlockCapable returns true if the given server is in the block-capable set (alive).
func (r *BlockVolumeRegistry) IsBlockCapable(server string) bool {
r.mu.RLock()
defer r.mu.RUnlock()
return r.blockServers[server]
}
// VolumesWithDeadPrimary returns names of volumes where the given server is a replica
// and the current primary is NOT in the block-capable set (dead/disconnected).
// Used by T2 (B-06) to detect orphaned primaries that need re-promotion.
func (r *BlockVolumeRegistry) VolumesWithDeadPrimary(replicaServer string) []string {
r.mu.RLock()
defer r.mu.RUnlock()
names, ok := r.byServer[replicaServer]
if !ok {
return nil
}
var orphaned []string
for name := range names {
entry := r.volumes[name]
if entry == nil {
continue
}
// Only consider volumes where this server is a replica (not the primary).
if entry.VolumeServer == replicaServer {
continue
}
// Check if the primary server is dead.
if !r.blockServers[entry.VolumeServer] {
orphaned = append(orphaned, name)
}
}
return orphaned
}
// BlockCapableServers returns the list of servers known to support block volumes.
func (r *BlockVolumeRegistry) BlockCapableServers() []string {
r.mu.RLock()

519
weed/server/master_block_registry_test.go

@ -2,6 +2,7 @@ package weed_server
import (
"fmt"
"strings"
"sync"
"testing"
"time"
@ -538,6 +539,8 @@ func TestRegistry_RemoveReplica(t *testing.T) {
func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "s1",
@ -545,8 +548,8 @@ func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
Epoch: 5,
Role: 1,
Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
})
// Add to byServer for s2 and s3.
@ -592,14 +595,16 @@ func TestRegistry_PromoteBestReplica_NoReplica(t *testing.T) {
func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "s1",
Path: "/v1.blk",
Epoch: 3,
Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
})
r.mu.Lock()
@ -627,14 +632,16 @@ func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
func TestRegistry_PromoteBestReplica_KeepsOthers(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "s1",
Path: "/v1.blk",
Epoch: 1,
Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
},
})
r.mu.Lock()
@ -877,6 +884,7 @@ func TestRegistry_PromoteBestReplica_WALLagIneligible(t *testing.T) {
HealthScore: 1.0,
WALHeadLSN: 800, // lag=200, tolerance=100
LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
},
},
})
@ -918,6 +926,8 @@ func TestRegistry_PromoteBestReplica_RebuildingIneligible(t *testing.T) {
// Fix #2: Among eligible replicas, best (health+LSN) wins.
func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("stale")
r.MarkBlockCapable("good")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
@ -939,6 +949,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
HealthScore: 0.8,
WALHeadLSN: 95,
LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
},
},
})
@ -956,6 +967,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
// Configurable tolerance: widen tolerance to allow lagging replicas.
func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("lagging")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
@ -970,6 +982,7 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
HealthScore: 1.0,
WALHeadLSN: 800, // lag=200
LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
},
},
})
@ -992,6 +1005,236 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
}
}
// B-12: PromoteBestReplica rejects dead replica (server not in blockServers).
func TestRegistry_PromoteBestReplica_DeadServerIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
// Intentionally do NOT mark "dead-replica" as block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "dead-replica",
Path: "/data/vol1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: dead replica should be rejected")
}
if !strings.Contains(err.Error(), "server_dead") {
t.Fatalf("error should mention server_dead, got: %v", err)
}
}
// B-12: Dead replica rejected but alive replica promoted when both exist.
func TestRegistry_PromoteBestReplica_DeadSkipped_AlivePromoted(t *testing.T) {
r := NewBlockVolumeRegistry()
// Only mark s3 as alive.
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "s2-dead", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "s3", Path: "/r2.blk", HealthScore: 0.8, WALHeadLSN: 95, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
newEpoch, err := r.PromoteBestReplica("vol1")
if err != nil {
t.Fatalf("PromoteBestReplica: %v", err)
}
if newEpoch != 2 {
t.Fatalf("newEpoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "s3" {
t.Fatalf("expected alive s3 promoted, got %q", e.VolumeServer)
}
}
// EvaluatePromotion returns read-only preflight without mutating registry.
func TestRegistry_EvaluatePromotion_Basic(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 5,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "replica1", Path: "/r1.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if !pf.Promotable {
t.Fatalf("expected promotable, got reason: %s", pf.Reason)
}
if pf.Candidate == nil || pf.Candidate.Server != "replica1" {
t.Fatalf("expected candidate replica1, got %+v", pf.Candidate)
}
// Registry must be unmutated.
e, _ := r.Lookup("vol1")
if e.VolumeServer != "primary" {
t.Fatal("EvaluatePromotion should not mutate the registry")
}
if e.Epoch != 5 {
t.Fatal("EvaluatePromotion should not bump epoch")
}
}
// EvaluatePromotion with all replicas rejected.
func TestRegistry_EvaluatePromotion_AllRejected(t *testing.T) {
r := NewBlockVolumeRegistry()
// No servers marked as block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
Replicas: []ReplicaInfo{
{Server: "dead1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "dead2", Path: "/r2.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if pf.Promotable {
t.Fatal("expected not promotable")
}
if len(pf.Rejections) != 2 {
t.Fatalf("expected 2 rejections, got %d", len(pf.Rejections))
}
for _, rej := range pf.Rejections {
if rej.Reason != "server_dead" {
t.Fatalf("expected server_dead rejection, got %q", rej.Reason)
}
}
}
// EvaluatePromotion for nonexistent volume.
func TestRegistry_EvaluatePromotion_NotFound(t *testing.T) {
r := NewBlockVolumeRegistry()
_, err := r.EvaluatePromotion("nonexistent")
if err == nil {
t.Fatal("expected error for nonexistent volume")
}
}
// Replica created but never heartbeated is not promotable.
func TestRegistry_PromoteBestReplica_NoHeartbeatIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "replica1",
Path: "/r1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
// LastHeartbeat: zero — never heartbeated
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: replica with no heartbeat should be rejected")
}
if !strings.Contains(err.Error(), "no_heartbeat") {
t.Fatalf("error should mention no_heartbeat, got: %v", err)
}
}
// Replica with unset (zero) role is not promotable.
func TestRegistry_PromoteBestReplica_UnsetRoleIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "replica1",
Path: "/r1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
LastHeartbeat: time.Now(),
// Role: 0 — unset/RoleNone
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: replica with unset role should be rejected")
}
if !strings.Contains(err.Error(), "wrong_role") {
t.Fatalf("error should mention wrong_role, got: %v", err)
}
}
// PromoteBestReplica clears RebuildListenAddr on promotion (B-11 partial fix).
func TestRegistry_PromoteBestReplica_ClearsRebuildAddr(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
RebuildListenAddr: "primary:15000",
Replicas: []ReplicaInfo{
{Server: "replica1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, err := r.PromoteBestReplica("vol1")
if err != nil {
t.Fatalf("PromoteBestReplica: %v", err)
}
e, _ := r.Lookup("vol1")
if e.RebuildListenAddr != "" {
t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", e.RebuildListenAddr)
}
}
// --- LeaseGrants ---
func TestRegistry_LeaseGrants_PrimaryOnly(t *testing.T) {
@ -1110,3 +1353,267 @@ func TestRegistry_LeaseGrants_UnknownServer(t *testing.T) {
t.Fatalf("expected nil for unknown server, got %+v", grants)
}
}
// ============================================================
// CP11B-3 T2: IsBlockCapable + VolumesWithDeadPrimary
// ============================================================
func TestRegistry_IsBlockCapable(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1:8080")
if !r.IsBlockCapable("vs1:8080") {
t.Fatal("vs1 should be block-capable")
}
if r.IsBlockCapable("vs2:8080") {
t.Fatal("vs2 should NOT be block-capable")
}
r.UnmarkBlockCapable("vs1:8080")
if r.IsBlockCapable("vs1:8080") {
t.Fatal("vs1 should no longer be block-capable after unmark")
}
}
func TestRegistry_VolumesWithDeadPrimary_Basic(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.MarkBlockCapable("vs2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive,
Replicas: []ReplicaInfo{{Server: "vs2", Path: "/data/vol1.blk"}},
})
// Both alive → no orphans.
orphaned := r.VolumesWithDeadPrimary("vs2")
if len(orphaned) != 0 {
t.Fatalf("expected 0 orphaned volumes, got %d", len(orphaned))
}
// Kill primary.
r.UnmarkBlockCapable("vs1")
orphaned = r.VolumesWithDeadPrimary("vs2")
if len(orphaned) != 1 || orphaned[0] != "vol1" {
t.Fatalf("expected [vol1], got %v", orphaned)
}
}
func TestRegistry_VolumesWithDeadPrimary_PrimaryServer_NotIncluded(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive,
})
// vs1 is the primary for vol1 — should NOT appear in orphaned list for vs1.
orphaned := r.VolumesWithDeadPrimary("vs1")
if len(orphaned) != 0 {
t.Fatalf("primary server should not appear in its own orphan list, got %v", orphaned)
}
}
// T6: EvaluatePromotion preflight includes primary liveness.
func TestRegistry_EvaluatePromotion_PrimaryDead_StillShowsCandidate(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.MarkBlockCapable("vs2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
})
// Kill primary but keep vs2 alive.
r.UnmarkBlockCapable("vs1")
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if !pf.Promotable {
t.Fatalf("should be promotable (vs2 alive), reason=%s", pf.Reason)
}
if pf.Candidate.Server != "vs2" {
t.Fatalf("candidate should be vs2, got %q", pf.Candidate.Server)
}
}
// ============================================================
// CP11B-3 T5: ManualPromote Dev Tests
// ============================================================
// T5: ManualPromote with empty target → auto-picks best candidate.
func TestRegistry_ManualPromote_AutoTarget(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("best")
r.MarkBlockCapable("worse")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second, WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "worse", Path: "/r1.blk", HealthScore: 0.5, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "best", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Primary not block-capable → non-force should still pass (primary_alive gate won't trigger).
newEpoch, _, _, pf, err := r.ManualPromote("vol1", "", false)
if err != nil {
t.Fatalf("ManualPromote: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
if !pf.Promotable {
t.Fatal("should be promotable")
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "best" {
t.Fatalf("expected 'best' promoted, got %q", e.VolumeServer)
}
}
// T5: ManualPromote targets a specific replica (not the best by health).
func TestRegistry_ManualPromote_SpecificTarget(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("r1")
r.MarkBlockCapable("r2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "r2", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 50,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Target r2 specifically (worse health).
newEpoch, _, _, _, err := r.ManualPromote("vol1", "r2", false)
if err != nil {
t.Fatalf("ManualPromote: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "r2" {
t.Fatalf("expected r2 promoted (specific target), got %q", e.VolumeServer)
}
}
// T5: ManualPromote with non-existent target → error.
func TestRegistry_ManualPromote_TargetNotFound(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "nonexistent", false)
if err == nil {
t.Fatal("expected error for nonexistent target")
}
if pf.Reason != "target_not_found" {
t.Fatalf("expected target_not_found, got %q", pf.Reason)
}
}
// T5: ManualPromote non-force with alive primary → rejected.
func TestRegistry_ManualPromote_PrimaryAlive_Rejected(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("primary")
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "", false)
if err == nil {
t.Fatal("expected rejection when primary alive and !force")
}
if pf.Reason != "primary_alive" {
t.Fatalf("expected primary_alive, got %q", pf.Reason)
}
// Verify no mutation.
e, _ := r.Lookup("vol1")
if e.VolumeServer != "primary" {
t.Fatalf("primary should not change, got %q", e.VolumeServer)
}
}
// T5: Force bypasses stale heartbeat and primary_alive gates.
func TestRegistry_ManualPromote_Force_StaleHeartbeat(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("primary")
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now().Add(-10 * time.Minute), // stale
Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Non-force: would fail on primary_alive.
// Force: bypasses primary_alive AND stale_heartbeat.
newEpoch, _, _, _, err := r.ManualPromote("vol1", "", true)
if err != nil {
t.Fatalf("force ManualPromote should succeed: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "r1" {
t.Fatalf("expected r1 promoted via force, got %q", e.VolumeServer)
}
}
// T5: Force does NOT bypass server_dead (hard gate).
func TestRegistry_ManualPromote_Force_StillRejectsDeadServer(t *testing.T) {
r := NewBlockVolumeRegistry()
// "dead" is NOT marked block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "dead", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "dead", true)
if err == nil {
t.Fatal("force should NOT bypass server_dead")
}
if len(pf.Rejections) == 0 || pf.Rejections[0].Reason != "server_dead" {
t.Fatalf("expected server_dead rejection, got %+v", pf.Rejections)
}
}

3
weed/server/master_grpc_server.go

@ -278,6 +278,9 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ
// on subsequent heartbeats), never both in the same message.
if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes {
ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos)
// T2 (B-06): After updating registry from heartbeat, check if this server
// is a replica for any volume whose primary is dead. If so, promote.
ms.reevaluateOrphanedPrimaries(dn.Url())
} else if len(heartbeat.NewBlockVolumes) > 0 || len(heartbeat.DeletedBlockVolumes) > 0 {
ms.blockRegistry.UpdateDeltaHeartbeat(dn.Url(), heartbeat.NewBlockVolumes, heartbeat.DeletedBlockVolumes)
}

23
weed/server/master_grpc_server_block.go

@ -283,14 +283,16 @@ func (ms *MasterServer) tryCreateOneReplica(ctx context.Context, req *master_pb.
entry.RebuildListenAddr = primaryResult.RebuildListenAddr
// CP8-2: populate Replicas[].
entry.Replicas = append(entry.Replicas, ReplicaInfo{
Server: replicaServerStr,
Path: replicaResult.Path,
ISCSIAddr: replicaResult.ISCSIAddr,
IQN: replicaResult.IQN,
NvmeAddr: replicaResult.NvmeAddr,
NQN: replicaResult.NQN,
DataAddr: replicaResult.ReplicaDataAddr,
CtrlAddr: replicaResult.ReplicaCtrlAddr,
Server: replicaServerStr,
Path: replicaResult.Path,
ISCSIAddr: replicaResult.ISCSIAddr,
IQN: replicaResult.IQN,
NvmeAddr: replicaResult.NvmeAddr,
NQN: replicaResult.NQN,
DataAddr: replicaResult.ReplicaDataAddr,
CtrlAddr: replicaResult.ReplicaCtrlAddr,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
})
return replicaServerStr
}
@ -409,6 +411,11 @@ func (ms *MasterServer) ExpandBlockVolume(ctx context.Context, req *master_pb.Ex
}
}()
// Test-only hook: inject failover between lock acquisition and re-read.
if ms.expandPreReadHook != nil {
ms.expandPreReadHook()
}
// B-09: Re-read entry after acquiring expand lock. Between the initial
// Lookup and AcquireExpandInflight, failover may have changed VolumeServer
// or Replicas. Using the stale snapshot would send PREPARE to dead nodes.

71
weed/server/master_grpc_server_block_test.go

@ -10,6 +10,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
// testMasterServer creates a minimal MasterServer with mock VS calls for testing.
@ -1112,6 +1113,9 @@ func TestMaster_NoNvmeFieldsWhenDisabled(t *testing.T) {
func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
ms := testMasterServer(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333")
// Directly register an entry with primary + replica, both having NVMe fields.
ms.blockRegistry.Register(&BlockVolumeEntry{
@ -1128,16 +1132,18 @@ func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{
Server: "vs2:9333",
Path: "/data/ha-vol.blk",
IQN: "iqn.2024.test:ha-vol-r",
ISCSIAddr: "vs2:3260",
NvmeAddr: "vs2:4420",
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
DataAddr: "vs2:14260",
CtrlAddr: "vs2:14261",
HealthScore: 0.95,
WALHeadLSN: 100,
Server: "vs2:9333",
Path: "/data/ha-vol.blk",
IQN: "iqn.2024.test:ha-vol-r",
ISCSIAddr: "vs2:3260",
NvmeAddr: "vs2:4420",
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
DataAddr: "vs2:14260",
CtrlAddr: "vs2:14261",
HealthScore: 0.95,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
},
},
})
@ -1654,10 +1660,11 @@ func TestMaster_ExpandCoordinated_RestartRecovery(t *testing.T) {
}
func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
// B-09: If failover changes VolumeServer between initial Lookup and
// AcquireExpandInflight, the coordinator must use the fresh entry,
// not the stale one. Use RF=3 so promotion still leaves 1 replica
// and the coordinated path is taken.
// B-09: Exercises the actual race window — failover happens BETWEEN
// the initial Lookup (line 380) and the post-lock re-read (line 419).
// Uses expandPreReadHook to inject PromoteBestReplica at the exact
// interleaving point. RF=3 so promotion leaves 1 replica and the
// coordinated path is taken.
ms := testMasterServerWithExpandMocks(t)
ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333")
@ -1689,31 +1696,39 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
return 2 << 30, nil
}
// Simulate failover: promote best replica. With RF=3, one replica
// becomes primary and the other stays as replica → coordinated path.
ms.blockRegistry.PromoteBestReplica("b09-vol")
entry, _ = ms.blockRegistry.Lookup("b09-vol")
newPrimary := entry.VolumeServer
if newPrimary == originalPrimary {
t.Fatal("promotion didn't change primary")
}
if len(entry.Replicas) == 0 {
t.Fatal("expected at least 1 replica after RF=3 promotion")
// Hook fires AFTER AcquireExpandInflight but BEFORE the re-read Lookup.
// This is the exact race window: the initial Lookup already returned
// the old primary, but failover changes it before the re-read.
hookFired := false
ms.expandPreReadHook = func() {
hookFired = true
ms.blockRegistry.PromoteBestReplica("b09-vol")
}
// Expand should use the NEW primary (post-failover), not the old one.
// At this point, the initial Lookup inside ExpandBlockVolume will see
// originalPrimary. The hook then promotes, changing the primary.
// The re-read must pick up the new primary.
resp, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "b09-vol", NewSizeBytes: 2 << 30,
})
if err != nil {
t.Fatalf("expand: %v", err)
}
if !hookFired {
t.Fatal("expandPreReadHook was not called — race window not exercised")
}
if resp.CapacityBytes != 2<<30 {
t.Fatalf("capacity: got %d", resp.CapacityBytes)
}
// First PREPARE should have gone to the new primary, not the old one.
// Verify: after the hook promoted, the re-read must have picked up
// the new primary. The first PREPARE should go to the new primary.
entry, _ = ms.blockRegistry.Lookup("b09-vol")
newPrimary := entry.VolumeServer
if newPrimary == originalPrimary {
t.Fatal("promotion didn't change primary")
}
if len(preparedServers) == 0 {
t.Fatal("no prepare calls recorded")
}
@ -1721,7 +1736,7 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
t.Fatalf("PREPARE went to %q (stale), should go to %q (fresh primary)",
preparedServers[0], newPrimary)
}
// Verify old primary was NOT contacted.
// Verify old primary was NOT contacted at all.
for _, s := range preparedServers {
if s == originalPrimary {
t.Fatalf("PREPARE sent to old primary %q — stale entry used", originalPrimary)

6
weed/server/master_server.go

@ -109,6 +109,10 @@ type MasterServer struct {
blockVSCommitExpand func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error)
blockVSCancelExpand func(ctx context.Context, server string, name string, expandEpoch uint64) error
nextExpandEpoch atomic.Uint64
// Test-only hook: called after AcquireExpandInflight but before the
// re-read Lookup in coordinated expand. Nil in production.
expandPreReadHook func()
}
func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer {
@ -224,6 +228,8 @@ func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.Se
r.HandleFunc("/block/volume/{name}", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeLookupHandler))).Methods("GET")
r.HandleFunc("/block/volumes", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeListHandler))).Methods("GET")
r.HandleFunc("/block/volume/{name}/expand", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeExpandHandler)))).Methods("POST")
r.HandleFunc("/block/volume/{name}/preflight", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePreflightHandler))).Methods("GET")
r.HandleFunc("/block/volume/{name}/promote", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePromoteHandler)))).Methods("POST")
r.HandleFunc("/block/assign", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockAssignHandler)))).Methods("POST")
r.HandleFunc("/block/servers", ms.guard.WhiteList(requestIDMiddleware(ms.blockServersHandler))).Methods("GET")
r.HandleFunc("/block/status", ms.guard.WhiteList(requestIDMiddleware(ms.blockStatusHandler))).Methods("GET")

96
weed/server/master_server_handlers_block.go

@ -7,6 +7,7 @@ import (
"github.com/gorilla/mux"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockapi"
@ -206,6 +207,99 @@ func (ms *MasterServer) blockStatusHandler(w http.ResponseWriter, r *http.Reques
writeJsonQuiet(w, r, http.StatusOK, status)
}
// blockVolumePreflightHandler handles GET /block/volume/{name}/preflight.
// Returns a read-only promotion preflight evaluation for the named volume.
func (ms *MasterServer) blockVolumePreflightHandler(w http.ResponseWriter, r *http.Request) {
name := mux.Vars(r)["name"]
if name == "" {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
return
}
pf, err := ms.blockRegistry.EvaluatePromotion(name)
if err != nil {
writeJsonError(w, r, http.StatusNotFound, err)
return
}
resp := blockapi.PreflightResponse{
VolumeName: pf.VolumeName,
Promotable: pf.Promotable,
Reason: pf.Reason,
}
if pf.Candidate != nil {
resp.CandidateServer = pf.Candidate.Server
resp.CandidateHealth = pf.Candidate.HealthScore
resp.CandidateWALLSN = pf.Candidate.WALHeadLSN
}
for _, rej := range pf.Rejections {
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
Server: rej.Server,
Reason: rej.Reason,
})
}
// Add primary liveness info.
entry, ok := ms.blockRegistry.Lookup(name)
if ok {
resp.PrimaryServer = entry.VolumeServer
resp.PrimaryAlive = ms.blockRegistry.IsBlockCapable(entry.VolumeServer)
}
writeJsonQuiet(w, r, http.StatusOK, resp)
}
// blockVolumePromoteHandler handles POST /block/volume/{name}/promote.
// Triggers a manual promotion for the named block volume.
func (ms *MasterServer) blockVolumePromoteHandler(w http.ResponseWriter, r *http.Request) {
name := mux.Vars(r)["name"]
if name == "" {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
return
}
var req blockapi.PromoteVolumeRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("decode request: %w", err))
return
}
// ManualPromote captures oldPrimary/oldPath under lock to avoid TOCTOU (BUG-T5-2).
newEpoch, oldPrimary, oldPath, pf, err := ms.blockRegistry.ManualPromote(name, req.TargetServer, req.Force)
if err != nil {
// Distinguish not-found from rejection.
status := http.StatusConflict
if pf.Reason == "volume not found" {
status = http.StatusNotFound
}
// Build structured rejection response.
resp := blockapi.PromoteVolumeResponse{
Reason: pf.Reason,
}
for _, rej := range pf.Rejections {
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
Server: rej.Server,
Reason: rej.Reason,
})
}
glog.V(0).Infof("manual promote %q rejected: %s", name, pf.Reason)
writeJsonQuiet(w, r, status, resp)
return
}
// Post-promotion orchestration (same as auto path).
ms.finalizePromotion(name, oldPrimary, oldPath, newEpoch)
if req.Reason != "" {
glog.V(0).Infof("manual promote %q: reason=%q", name, req.Reason)
}
// Re-read to get the new primary server name.
entry, _ := ms.blockRegistry.Lookup(name)
writeJsonQuiet(w, r, http.StatusOK, blockapi.PromoteVolumeResponse{
NewPrimary: entry.VolumeServer,
Epoch: newEpoch,
})
}
// entryToVolumeInfo converts a BlockVolumeEntry to a blockapi.VolumeInfo.
func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
status := "pending"
@ -239,6 +333,8 @@ func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
HealthScore: e.HealthScore,
ReplicaDegraded: e.ReplicaDegraded,
DurabilityMode: durMode,
NvmeAddr: e.NvmeAddr,
NQN: e.NQN,
}
for _, ri := range e.Replicas {
info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{

1581
weed/server/qa_block_cp11b3_adversarial_test.go
File diff suppressed because it is too large
View File

25
weed/server/qa_block_cp63_test.go

@ -40,6 +40,11 @@ func testMSForQA(t *testing.T) *MasterServer {
// registerQAVolume creates a volume entry with optional replica, configurable lease state.
func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration, leaseExpired bool) {
t.Helper()
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
if replica != "" {
ms.blockRegistry.MarkBlockCapable(replica)
}
entry := &BlockVolumeEntry{
Name: name,
VolumeServer: primary,
@ -65,11 +70,13 @@ func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica str
// CP8-2: also populate Replicas[].
entry.Replicas = []ReplicaInfo{
{
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
},
}
}
@ -398,7 +405,15 @@ func TestQA_Failover_PromoteIdempotent_NoReplicaAfterFirstSwap(t *testing.T) {
// Reconnect vs1 first so it becomes a replica.
ms.recoverBlockVolumes("vs1")
// Simulate rebuild completion: mark vs1 as a healthy replica.
e, _ := ms.blockRegistry.Lookup("vol1")
for i := range e.Replicas {
if e.Replicas[i].Server == "vs1" {
e.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
e.Replicas[i].LastHeartbeat = time.Now()
e.Replicas[i].HealthScore = 1.0
}
}
e.LastLeaseGrant = time.Now().Add(-1 * time.Minute) // expire the new lease
ms.failoverBlockVolumes("vs2")

485
weed/server/qa_block_expand_adversarial_test.go

@ -0,0 +1,485 @@
package weed_server
import (
"context"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
// ============================================================
// CP11A-2 Adversarial Test Suite: B-09 + B-10
//
// 8 scenarios stress-testing the coordinated expand path under
// failover, concurrent heartbeats, and partial failures.
// ============================================================
// qaExpandMaster creates a MasterServer with 3 block-capable servers
// and default expand mocks for adversarial testing.
func qaExpandMaster(t *testing.T) *MasterServer {
t.Helper()
ms := &MasterServer{
blockRegistry: NewBlockVolumeRegistry(),
blockAssignmentQueue: NewBlockAssignmentQueue(),
blockFailover: newBlockFailoverState(),
}
ms.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
return &blockAllocResult{
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s", name),
ISCSIAddr: server + ":3260",
ReplicaDataAddr: server + ":14260",
ReplicaCtrlAddr: server + ":14261",
RebuildListenAddr: server + ":15000",
}, nil
}
ms.blockVSDelete = func(ctx context.Context, server string, name string) error {
return nil
}
ms.blockVSExpand = func(ctx context.Context, server string, name string, newSize uint64) (uint64, error) {
return newSize, nil
}
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
return nil
}
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
return 2 << 30, nil
}
ms.blockVSCancelExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) error {
return nil
}
ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333")
ms.blockRegistry.MarkBlockCapable("vs3:9333")
return ms
}
// qaCreateRF creates a volume with the given replica factor.
func qaCreateRF(t *testing.T, ms *MasterServer, name string, rf uint32) {
t.Helper()
_, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
Name: name,
SizeBytes: 1 << 30,
ReplicaFactor: rf,
})
if err != nil {
t.Fatalf("create %s RF=%d: %v", name, rf, err)
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-1: ExpandAfterDoubleFailover_RF3
//
// RF=3 volume. Primary dies → promote replica A. Then replica A
// (now primary) dies → promote replica B. Expand must reach
// replica B (the second-generation primary), not the original.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ExpandAfterDoubleFailover_RF3(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "dbl-failover", 3)
entry, _ := ms.blockRegistry.Lookup("dbl-failover")
gen0Primary := entry.VolumeServer
// First failover: kill original primary.
ms.blockRegistry.PromoteBestReplica("dbl-failover")
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
gen1Primary := entry.VolumeServer
if gen1Primary == gen0Primary {
t.Fatal("first promotion didn't change primary")
}
// Second failover: kill gen1 primary.
// Need to ensure the remaining replica has a fresh heartbeat.
if len(entry.Replicas) == 0 {
t.Fatal("no replicas left after first promotion (need RF=3)")
}
ms.blockRegistry.PromoteBestReplica("dbl-failover")
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
gen2Primary := entry.VolumeServer
if gen2Primary == gen1Primary || gen2Primary == gen0Primary {
t.Fatalf("second promotion should pick a new server, got %q (gen0=%q gen1=%q)",
gen2Primary, gen0Primary, gen1Primary)
}
// Track PREPARE targets.
var preparedServers []string
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
preparedServers = append(preparedServers, server)
return nil
}
// Expand — standalone path since no replicas remain after 2 promotions.
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "dbl-failover", NewSizeBytes: 2 << 30,
})
if err != nil {
t.Fatalf("expand: %v", err)
}
// If standalone path was taken (no replicas), preparedServers is empty — that's fine.
// If coordinated path was taken, first PREPARE must target gen2Primary.
if len(preparedServers) > 0 && preparedServers[0] != gen2Primary {
t.Fatalf("PREPARE went to %q, want gen2 primary %q", preparedServers[0], gen2Primary)
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-2: ExpandSeesDeletedVolume_AfterLockAcquire
//
// Volume is deleted between the initial Lookup (succeeds) and
// the re-read after AcquireExpandInflight. The re-read must
// detect the deletion and fail cleanly.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ExpandSeesDeletedVolume_AfterLockAcquire(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "disappear", 2)
// Hook PREPARE to delete the volume before it runs.
// The B-09 re-read happens before PREPARE, so we simulate deletion
// between initial Lookup and AcquireExpandInflight by having a
// goroutine that deletes the entry while expand is in progress.
// Instead, test directly: acquire expand lock, then unregister, then
// call ExpandBlockVolume — it should fail on re-read.
// Acquire expand lock manually first so the real call gets blocked.
// Then verify the error path by attempting a second expand.
if !ms.blockRegistry.AcquireExpandInflight("disappear", 2<<30, 1) {
t.Fatal("AcquireExpandInflight should succeed")
}
// Try another expand while locked — should fail with "already in progress".
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "disappear", NewSizeBytes: 2 << 30,
})
if err == nil {
t.Fatal("expand should fail when lock is held")
}
// Release and delete the volume.
ms.blockRegistry.ReleaseExpandInflight("disappear")
ms.blockRegistry.Unregister("disappear")
// Now expand on a deleted volume — should fail on initial Lookup.
_, err = ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "disappear", NewSizeBytes: 2 << 30,
})
if err == nil {
t.Fatal("expand on deleted volume should fail")
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-3: ConcurrentExpandAndFailover
//
// Expand and failover race on the same volume. Neither should
// panic, and the volume must be in a consistent state afterward.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ConcurrentExpandAndFailover(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "race-vol", 3)
entry, _ := ms.blockRegistry.Lookup("race-vol")
primary := entry.VolumeServer
// Make PREPARE slow so expand holds the lock longer.
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(5 * time.Millisecond)
return nil
}
var wg sync.WaitGroup
// Goroutine 1: expand.
wg.Add(1)
go func() {
defer wg.Done()
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "race-vol", NewSizeBytes: 2 << 30,
})
// Error is OK — we're testing for panics and consistency.
}()
// Goroutine 2: failover kills primary.
wg.Add(1)
go func() {
defer wg.Done()
time.Sleep(2 * time.Millisecond) // slight delay to let expand start
ms.failoverBlockVolumes(primary)
}()
wg.Wait()
// Volume must still exist regardless of outcome.
_, ok := ms.blockRegistry.Lookup("race-vol")
if !ok {
t.Fatal("volume must survive concurrent expand + failover")
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-4: ConcurrentExpandsSameVolume
//
// Two goroutines try to expand the same volume simultaneously.
// Exactly one should succeed, the other should get "already in
// progress". No panic, no double-commit.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ConcurrentExpandsSameVolume(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "dup-expand", 2)
var commitCount atomic.Int32
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(5 * time.Millisecond) // slow prepare
return nil
}
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
commitCount.Add(1)
return 2 << 30, nil
}
var wg sync.WaitGroup
var successes atomic.Int32
var failures atomic.Int32
for i := 0; i < 2; i++ {
wg.Add(1)
go func() {
defer wg.Done()
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "dup-expand", NewSizeBytes: 2 << 30,
})
if err == nil {
successes.Add(1)
} else {
failures.Add(1)
}
}()
}
wg.Wait()
if successes.Load() != 1 {
t.Fatalf("expected exactly 1 success, got %d", successes.Load())
}
if failures.Load() != 1 {
t.Fatalf("expected exactly 1 failure (already in progress), got %d", failures.Load())
}
}
// ────────────────────────────────────────────────────────────
// QA-B10-1: RepeatedEmptyHeartbeats_DuringExpand
//
// Multiple empty heartbeats from the primary during expand.
// Entry must survive all of them — not just the first.
// ────────────────────────────────────────────────────────────
func TestQA_B10_RepeatedEmptyHeartbeats_DuringExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "multi-hb", 2)
entry, _ := ms.blockRegistry.Lookup("multi-hb")
primary := entry.VolumeServer
if !ms.blockRegistry.AcquireExpandInflight("multi-hb", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
// 10 empty heartbeats from the primary — each one would delete
// the entry without the B-10 guard.
for i := 0; i < 10; i++ {
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
}
_, ok := ms.blockRegistry.Lookup("multi-hb")
if !ok {
t.Fatal("entry deleted after repeated empty heartbeats during expand")
}
ms.blockRegistry.ReleaseExpandInflight("multi-hb")
}
// ────────────────────────────────────────────────────────────
// QA-B10-2: ExpandFailed_HeartbeatStillProtected
//
// After MarkExpandFailed (primary committed, replica didn't),
// empty heartbeats must NOT delete the entry. ExpandFailed
// keeps ExpandInProgress=true as a size-suppression guard.
// ────────────────────────────────────────────────────────────
func TestQA_B10_ExpandFailed_HeartbeatStillProtected(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "fail-hb", 2)
entry, _ := ms.blockRegistry.Lookup("fail-hb")
primary := entry.VolumeServer
if !ms.blockRegistry.AcquireExpandInflight("fail-hb", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
ms.blockRegistry.MarkExpandFailed("fail-hb")
// Empty heartbeat should not delete — ExpandFailed keeps ExpandInProgress=true.
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
e, ok := ms.blockRegistry.Lookup("fail-hb")
if !ok {
t.Fatal("entry deleted during ExpandFailed state")
}
if !e.ExpandFailed {
t.Fatal("ExpandFailed should still be true")
}
if !e.ExpandInProgress {
t.Fatal("ExpandInProgress should still be true")
}
// After ClearExpandFailed, empty heartbeat should delete normally.
ms.blockRegistry.ClearExpandFailed("fail-hb")
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
_, ok = ms.blockRegistry.Lookup("fail-hb")
if ok {
t.Fatal("entry should be deleted after ClearExpandFailed + empty heartbeat")
}
}
// ────────────────────────────────────────────────────────────
// QA-B10-3: HeartbeatSizeSuppress_DuringExpand
//
// Primary reports a stale (old) size during coordinated expand.
// Registry must NOT downgrade SizeBytes — the pending expand
// size is authoritative until commit or release.
// ────────────────────────────────────────────────────────────
func TestQA_B10_HeartbeatSizeSuppress_DuringExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "size-suppress", 2)
entry, _ := ms.blockRegistry.Lookup("size-suppress")
primary := entry.VolumeServer
origSize := entry.SizeBytes
if !ms.blockRegistry.AcquireExpandInflight("size-suppress", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
// Heartbeat reports old size (expand hasn't committed on VS yet).
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/size-suppress.blk",
VolumeSize: origSize, // old size
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
},
})
entry, _ = ms.blockRegistry.Lookup("size-suppress")
if entry.SizeBytes != origSize {
t.Fatalf("size should remain %d during expand, got %d", origSize, entry.SizeBytes)
}
// Heartbeat reports a LARGER size (stale from previous expand or bug).
// Still must not update — coordinated expand owns the size.
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/size-suppress.blk",
VolumeSize: 5 << 30, // bogus large size
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
},
})
entry, _ = ms.blockRegistry.Lookup("size-suppress")
if entry.SizeBytes != origSize {
t.Fatalf("size should remain %d (suppressed), got %d", origSize, entry.SizeBytes)
}
ms.blockRegistry.ReleaseExpandInflight("size-suppress")
}
// ────────────────────────────────────────────────────────────
// QA-B10-4: ConcurrentHeartbeatsAndExpand
//
// Simultaneous full heartbeats from primary and replicas while
// expand runs on another goroutine. Must not panic, must not
// orphan the entry, and expand must either succeed or fail
// cleanly with a clear error.
// ────────────────────────────────────────────────────────────
func TestQA_B10_ConcurrentHeartbeatsAndExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "hb-expand-race", 2)
entry, _ := ms.blockRegistry.Lookup("hb-expand-race")
primary := entry.VolumeServer
replica := ""
if len(entry.Replicas) > 0 {
replica = entry.Replicas[0].Server
}
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(2 * time.Millisecond)
return nil
}
var wg sync.WaitGroup
const rounds = 30
// Goroutine 1: expand.
wg.Add(1)
go func() {
defer wg.Done()
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "hb-expand-race", NewSizeBytes: 2 << 30,
})
}()
// Goroutine 2: primary heartbeats (mix of reporting and not reporting).
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds; i++ {
if i%5 == 0 {
// Every 5th: empty heartbeat (simulates brief restart).
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
} else {
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/hb-expand-race.blk",
VolumeSize: 1 << 30,
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
WalHeadLsn: uint64(100 + i),
},
})
}
}
}()
// Goroutine 3: replica heartbeats.
if replica != "" {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds; i++ {
ms.blockRegistry.UpdateFullHeartbeat(replica, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/hb-expand-race.blk",
VolumeSize: 1 << 30,
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
WalHeadLsn: uint64(99 + i),
},
})
}
}()
}
wg.Wait()
// Volume must still exist — no orphan.
_, ok := ms.blockRegistry.Lookup("hb-expand-race")
if !ok {
t.Fatal("volume must survive concurrent heartbeats + expand")
}
}

1346
weed/server/qa_block_nvme_publication_test.go
File diff suppressed because it is too large
View File

55
weed/storage/blockvol/blockapi/client.go

@ -136,6 +136,61 @@ func (c *Client) ExpandVolume(ctx context.Context, name string, newSizeBytes uin
return out.CapacityBytes, nil
}
// PromoteVolume triggers a manual promotion for a block volume.
func (c *Client) PromoteVolume(ctx context.Context, name string, req PromoteVolumeRequest) (*PromoteVolumeResponse, error) {
body, err := json.Marshal(req)
if err != nil {
return nil, fmt.Errorf("marshal request: %w", err)
}
resp, err := c.doRequest(ctx, http.MethodPost, "/block/volume/"+name+"/promote", bytes.NewReader(body))
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out PromoteVolumeResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// BlockStatus fetches the block registry status metrics.
func (c *Client) BlockStatus(ctx context.Context) (*BlockStatusResponse, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/status", nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out BlockStatusResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// Preflight returns the promotion preflight evaluation for a block volume.
func (c *Client) Preflight(ctx context.Context, name string) (*PreflightResponse, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/volume/"+name+"/preflight", nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out PreflightResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// ListServers lists all block-capable volume servers.
func (c *Client) ListServers(ctx context.Context) ([]ServerInfo, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/servers", nil)

48
weed/storage/blockvol/blockapi/types.go

@ -38,6 +38,8 @@ type VolumeInfo struct {
HealthScore float64 `json:"health_score"`
ReplicaDegraded bool `json:"replica_degraded,omitempty"`
DurabilityMode string `json:"durability_mode"` // CP8-3-1
NvmeAddr string `json:"nvme_addr,omitempty"`
NQN string `json:"nqn,omitempty"`
}
// ReplicaDetail describes one replica in the API response.
@ -74,6 +76,52 @@ type ExpandVolumeResponse struct {
CapacityBytes uint64 `json:"capacity_bytes"`
}
// PromoteVolumeRequest is the request body for POST /block/volume/{name}/promote.
type PromoteVolumeRequest struct {
TargetServer string `json:"target_server,omitempty"` // specific replica, or empty for auto
Force bool `json:"force,omitempty"` // bypass soft safety checks
Reason string `json:"reason,omitempty"` // audit note
}
// PromoteVolumeResponse is the response for POST /block/volume/{name}/promote.
type PromoteVolumeResponse struct {
NewPrimary string `json:"new_primary"`
Epoch uint64 `json:"epoch"`
Reason string `json:"reason,omitempty"` // rejection reason if failed
Rejections []PreflightRejection `json:"rejections,omitempty"` // per-replica rejection details
}
// BlockStatusResponse is the response for GET /block/status.
type BlockStatusResponse struct {
VolumeCount int `json:"volume_count"`
ServerCount int `json:"server_count"`
PromotionLSNTolerance uint64 `json:"promotion_lsn_tolerance"`
BarrierLagLSN uint64 `json:"barrier_lag_lsn"`
PromotionsTotal int64 `json:"promotions_total"`
FailoversTotal int64 `json:"failovers_total"`
RebuildsTotal int64 `json:"rebuilds_total"`
AssignmentQueueDepth int `json:"assignment_queue_depth"`
}
// PreflightRejection describes why a specific replica was rejected for promotion.
type PreflightRejection struct {
Server string `json:"server"`
Reason string `json:"reason"` // "stale_heartbeat", "wal_lag", "wrong_role", "server_dead", "no_heartbeat"
}
// PreflightResponse is the response for GET /block/volume/{name}/preflight.
type PreflightResponse struct {
VolumeName string `json:"volume_name"`
Promotable bool `json:"promotable"`
Reason string `json:"reason,omitempty"`
CandidateServer string `json:"candidate_server,omitempty"`
CandidateHealth float64 `json:"candidate_health,omitempty"`
CandidateWALLSN uint64 `json:"candidate_wal_lsn,omitempty"`
Rejections []PreflightRejection `json:"rejections,omitempty"`
PrimaryServer string `json:"primary_server"`
PrimaryAlive bool `json:"primary_alive"`
}
// RoleFromString converts a role string to its uint32 wire value.
// Returns 0 (RoleNone) for unrecognized strings.
func RoleFromString(s string) uint32 {

511
weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go

@ -0,0 +1,511 @@
package blockvol
import (
"sync"
"sync/atomic"
"testing"
"time"
)
// ============================================================
// CP11A-3 Adversarial Test Suite
//
// 10 scenarios stress-testing WAL admission pressure tracking,
// PressureState boundaries, guidance edge cases, and concurrent
// metric visibility.
// ============================================================
// ────────────────────────────────────────────────────────────
// QA-CP11A3-1: SoftMarkEqualsHardMark_NoPanic
//
// If an operator configures softMark == hardMark, the soft-zone
// delay calculation divides by (hardMark - softMark) = 0.
// Must not panic, hang, or produce NaN/Inf delay.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_SoftMarkEqualsHardMark_NoPanic(t *testing.T) {
m := NewEngineMetrics()
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.8,
HardWatermark: 0.8, // equal — no soft zone
WALUsedFn: func() float64 { return 0.85 }, // above both marks
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
// With equal marks, pressure >= hardMark takes the hard branch.
// The soft branch's division by zero is never reached.
// But if the code path ever changes, this test catches it.
done := make(chan error, 1)
go func() {
done <- a.Acquire(50 * time.Millisecond)
}()
select {
case err := <-done:
// ErrWALFull is expected (pressure stays above hard, times out).
if err != ErrWALFull {
t.Fatalf("expected ErrWALFull, got %v", err)
}
case <-time.After(2 * time.Second):
t.Fatal("Acquire hung — possible Inf delay from division by zero")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-2: SoftZoneExactBoundary_DelayIsZero
//
// When pressure == softMark exactly, scale = 0, delay = 0.
// softPressureWaitNs should NOT increase (delay <= 0 skips sleep).
// But hitSoft should still be true → SoftAdmitTotal increments.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_SoftZoneExactBoundary_DelayIsZero(t *testing.T) {
m := NewEngineMetrics()
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return 0.7 }, // exactly at soft mark
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) {
t.Fatalf("sleep should not be called when delay=0, but called with %v", d)
}
if err := a.Acquire(100 * time.Millisecond); err != nil {
t.Fatalf("Acquire: %v", err)
}
a.Release()
// SoftAdmitTotal should increment (we entered the soft branch).
if m.WALAdmitSoftTotal.Load() != 1 {
t.Fatalf("WALAdmitSoftTotal = %d, want 1", m.WALAdmitSoftTotal.Load())
}
// But no sleep → softPressureWaitNs stays 0.
if a.SoftPressureWaitNs() != 0 {
t.Fatalf("SoftPressureWaitNs = %d, want 0 (no delay at exact boundary)", a.SoftPressureWaitNs())
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-3: ConcurrentHardWaiters_TimeAccumulates
//
// 8 goroutines enter hard zone simultaneously. Each waits ~5ms.
// Total hardPressureWaitNs should be roughly 8 × 5ms, proving
// atomic accumulation doesn't lose contributions.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ConcurrentHardWaiters_TimeAccumulates(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(95) // above hard mark
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
var sleepCalls atomic.Int64
a.sleepFn = func(d time.Duration) {
time.Sleep(1 * time.Millisecond)
// After enough total sleeps across all goroutines, drop pressure.
if sleepCalls.Add(1) >= 20 {
pressure.Store(50)
}
}
const workers = 8
var wg sync.WaitGroup
for i := 0; i < workers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
if err := a.Acquire(5 * time.Second); err != nil {
t.Errorf("Acquire: %v", err)
}
a.Release()
}()
}
wg.Wait()
// All 8 must have entered hard zone.
if m.WALAdmitHardTotal.Load() < uint64(workers) {
t.Fatalf("WALAdmitHardTotal = %d, want >= %d", m.WALAdmitHardTotal.Load(), workers)
}
// Accumulated hard wait should be > 0, reflecting contributions from all goroutines.
if a.HardPressureWaitNs() <= 0 {
t.Fatal("HardPressureWaitNs should be > 0 after concurrent hard-zone waits")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-4: PressureStateAndAcquireRace
//
// One goroutine oscillates walUsed, another reads PressureState
// rapidly. Must not panic, must always return a valid state.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_PressureStateAndAcquireRace(t *testing.T) {
var pressure atomic.Int64
pressure.Store(50)
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: NewEngineMetrics(),
})
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
var wg sync.WaitGroup
const rounds = 200
// Goroutine 1: oscillate pressure.
wg.Add(1)
go func() {
defer wg.Done()
levels := []int64{30, 75, 95, 50, 80, 92, 10}
for i := 0; i < rounds; i++ {
pressure.Store(levels[i%len(levels)])
}
}()
// Goroutine 2: read PressureState.
wg.Add(1)
go func() {
defer wg.Done()
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
for i := 0; i < rounds; i++ {
s := a.PressureState()
if !valid[s] {
t.Errorf("PressureState() = %q — not a valid state", s)
return
}
}
}()
// Goroutine 3: Acquire/Release rapidly.
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds/2; i++ {
err := a.Acquire(20 * time.Millisecond)
if err == nil {
a.Release()
}
}
}()
wg.Wait()
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-5: TimeInZoneMonotonicity
//
// softPressureWaitNs and hardPressureWaitNs must be monotonically
// non-decreasing across reads, even under concurrent writes.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_TimeInZoneMonotonicity(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(80) // soft zone
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
var wg sync.WaitGroup
const writers = 4
const rounds = 30
// Writers produce soft-zone and hard-zone waits.
for i := 0; i < writers; i++ {
wg.Add(1)
go func(id int) {
defer wg.Done()
for j := 0; j < rounds; j++ {
if j%5 == 0 {
pressure.Store(95) // hard
} else {
pressure.Store(80) // soft
}
err := a.Acquire(50 * time.Millisecond)
if err == nil {
a.Release()
}
// Drop back so next Acquire can succeed.
pressure.Store(50)
}
}(i)
}
// Reader checks monotonicity.
wg.Add(1)
go func() {
defer wg.Done()
var prevSoft, prevHard int64
for i := 0; i < rounds*writers; i++ {
soft := a.SoftPressureWaitNs()
hard := a.HardPressureWaitNs()
if soft < prevSoft {
t.Errorf("SoftPressureWaitNs decreased: %d -> %d", prevSoft, soft)
}
if hard < prevHard {
t.Errorf("HardPressureWaitNs decreased: %d -> %d", prevHard, hard)
}
prevSoft = soft
prevHard = hard
}
}()
wg.Wait()
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-6: WALGuidance_ZeroInputs
//
// Zero walSize, zero blockSize, zero maxConcurrent, empty hint.
// Must not panic or produce invalid results.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALGuidance_ZeroInputs(t *testing.T) {
// All zeros.
r := WALSizingGuidance(0, 0, "")
if r.Level != "warn" {
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
}
// Zero blockSize: absMin = 0*64 = 0. Only workload minimum check fires.
r = WALSizingGuidance(0, 0, WorkloadGeneral)
if r.Level != "warn" {
t.Errorf("zero walSize+blockSize: Level = %q, want warn", r.Level)
}
// Zero walSize but nonzero blockSize.
r = WALSizingGuidance(0, 4096, WorkloadDatabase)
if r.Level != "warn" {
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
}
if len(r.Warnings) < 2 {
t.Errorf("expected both workload + absolute minimum warnings, got %d", len(r.Warnings))
}
// EvaluateWALConfig with zero maxConcurrent should not trigger concurrency warning.
r = EvaluateWALConfig(0, 4096, 0, WorkloadGeneral)
// walSize=0 still triggers sizing warning.
if r.Level != "warn" {
t.Errorf("Level = %q, want warn for zero walSize", r.Level)
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-7: WALGuidance_OverflowSafe
//
// Very large blockSize × minWALEntries might overflow uint64.
// (64 × 2^60 does NOT overflow, but let's test near-boundary.)
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALGuidance_OverflowSafe(t *testing.T) {
// Large blockSize: 256MB blocks × 64 = 16GB minimum.
// walSize = 1GB → should warn (16GB > 1GB).
r := WALSizingGuidance(1<<30, 256<<20, WorkloadGeneral)
if r.Level != "warn" {
t.Errorf("Level = %q, want warn (1GB WAL < 16GB absMin)", r.Level)
}
// Extreme: blockSize = 1<<40 (1TB). 64 × 1TB = 64TB.
// uint64 can hold 18 EB — no overflow.
r = WALSizingGuidance(1<<50, 1<<40, WorkloadThroughput)
// 1PB WAL with 1TB blocks: absMin = 64TB, 1PB > 64TB → ok for absolute.
// 1PB > 128MB (throughput min) → ok for workload.
if r.Level != "ok" {
t.Errorf("Level = %q, want ok for huge WAL", r.Level)
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-8: WALStatusSnapshot_PartialInit
//
// BlockVol with Metrics but nil walAdmission, and vice versa.
// WALStatus must return coherent defaults for the nil side
// and real values for the non-nil side.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALStatusSnapshot_PartialInit(t *testing.T) {
// Case 1: Metrics set, walAdmission nil.
m := NewEngineMetrics()
m.WALAdmitSoftTotal.Add(42)
m.WALAdmitHardTotal.Add(7)
vol1 := &BlockVol{Metrics: m}
ws := vol1.WALStatus()
if ws.PressureState != "normal" {
t.Errorf("nil admission: PressureState = %q, want normal", ws.PressureState)
}
if ws.SoftAdmitTotal != 42 {
t.Errorf("SoftAdmitTotal = %d, want 42", ws.SoftAdmitTotal)
}
if ws.HardAdmitTotal != 7 {
t.Errorf("HardAdmitTotal = %d, want 7", ws.HardAdmitTotal)
}
// Pressure wait should be 0 (no admission controller).
if ws.SoftPressureWaitSec != 0 || ws.HardPressureWaitSec != 0 {
t.Errorf("nil admission: pressure wait should be 0")
}
// Case 2: walAdmission set, Metrics nil.
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.65,
HardWatermark: 0.85,
WALUsedFn: func() float64 { return 0.7 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
})
vol2 := &BlockVol{walAdmission: a}
ws2 := vol2.WALStatus()
if ws2.PressureState != "soft" {
t.Errorf("PressureState = %q, want soft (0.7 >= 0.65)", ws2.PressureState)
}
if ws2.SoftWatermark != 0.65 {
t.Errorf("SoftWatermark = %f, want 0.65", ws2.SoftWatermark)
}
// Metrics fields should be zero (nil Metrics).
if ws2.SoftAdmitTotal != 0 || ws2.HardAdmitTotal != 0 || ws2.TimeoutTotal != 0 {
t.Errorf("nil metrics: counters should be 0")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-9: ObserverPanic_ContainedOrDocumented
//
// If WALAdmitWaitObserver panics, RecordWALAdmit is called from
// Acquire → recordAdmit. A panic in the observer would crash the
// writer goroutine. This test documents whether the panic is
// recovered or propagated.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ObserverPanic_DocumentedBehavior(t *testing.T) {
m := NewEngineMetrics()
m.WALAdmitWaitObserver = func(s float64) { panic("boom") }
// RecordWALAdmit calls the observer. If it panics, the caller panics.
// This is expected (same as prometheus.Histogram.Observe panicking).
// Document that the observer must not panic.
panicked := false
func() {
defer func() {
if r := recover(); r != nil {
panicked = true
}
}()
m.RecordWALAdmit(1*time.Millisecond, false, false, false)
}()
if !panicked {
t.Fatal("expected panic from observer — if recovered, update this test")
}
// Verify counters were NOT updated (panic happened before completion).
// Actually, the observer is called AFTER WALAdmitTotal.Add(1) and
// walAdmitWaitNs.record(). Let's verify the counter state.
if m.WALAdmitTotal.Load() != 1 {
t.Errorf("WALAdmitTotal = %d — should be 1 (incremented before observer)", m.WALAdmitTotal.Load())
}
// soft/hard/timeout flags are processed AFTER observer — panic skips them.
// With soft=false, hard=false, timedOut=false there's nothing to skip,
// but the counters should reflect what happened before the panic.
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-10: ConcurrentWALStatusReads
//
// Multiple goroutines read WALStatus while Acquire/Release runs.
// Must not panic. Fields should be internally consistent
// (SoftAdmitTotal >= 0, HardPressureWaitSec >= 0, etc.)
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ConcurrentWALStatusReads(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(50)
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) { time.Sleep(50 * time.Microsecond) }
vol := &BlockVol{
Metrics: m,
walAdmission: a,
}
var wg sync.WaitGroup
const rounds = 100
// Writers with varying pressure.
for i := 0; i < 4; i++ {
wg.Add(1)
go func() {
defer wg.Done()
levels := []int64{50, 75, 95, 60, 85}
for j := 0; j < rounds; j++ {
pressure.Store(levels[j%len(levels)])
if err := a.Acquire(20 * time.Millisecond); err == nil {
a.Release()
}
pressure.Store(50) // reset for next round
}
}()
}
// Concurrent WALStatus readers.
for i := 0; i < 4; i++ {
wg.Add(1)
go func() {
defer wg.Done()
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
for j := 0; j < rounds*2; j++ {
ws := vol.WALStatus()
if !valid[ws.PressureState] {
t.Errorf("invalid PressureState: %q", ws.PressureState)
return
}
if ws.UsedFraction < 0 || ws.UsedFraction > 1.01 {
t.Errorf("UsedFraction out of range: %f", ws.UsedFraction)
return
}
if ws.SoftPressureWaitSec < 0 {
t.Errorf("SoftPressureWaitSec negative: %f", ws.SoftPressureWaitSec)
return
}
if ws.HardPressureWaitSec < 0 {
t.Errorf("HardPressureWaitSec negative: %f", ws.HardPressureWaitSec)
return
}
}
}()
}
wg.Wait()
}

220
weed/storage/blockvol/testrunner/actions/devops.go

@ -26,6 +26,10 @@ func RegisterDevOpsActions(r *tr.Registry) {
r.RegisterFunc("delete_block_volume", tr.TierDevOps, deleteBlockVolume)
r.RegisterFunc("wait_block_servers", tr.TierDevOps, waitBlockServers)
r.RegisterFunc("cluster_status", tr.TierDevOps, clusterStatus)
r.RegisterFunc("wait_block_primary", tr.TierDevOps, waitBlockPrimary)
r.RegisterFunc("assert_block_field", tr.TierDevOps, assertBlockField)
r.RegisterFunc("block_status", tr.TierDevOps, blockStatus)
r.RegisterFunc("block_promote", tr.TierDevOps, blockPromote)
}
// setISCSIVars sets the save_as_iscsi_host/port/addr/iqn vars from a VolumeInfo.
@ -434,6 +438,222 @@ func waitBlockServers(ctx context.Context, actx *tr.ActionContext, act tr.Action
}
}
// waitBlockPrimary polls lookup until the volume's primary server matches (or differs from) expected.
// Params: name, expected (server addr to wait for) OR not (server addr to wait to change from), timeout (default 60s).
// Sets save_as vars from the final lookup.
func waitBlockPrimary(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("wait_block_primary: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("wait_block_primary: name param required")
}
expected := act.Params["expected"]
notServer := act.Params["not"]
if expected == "" && notServer == "" {
return nil, fmt.Errorf("wait_block_primary: expected or not param required")
}
timeout := 60 * time.Second
if t, ok := act.Params["timeout"]; ok {
if d, err := parseDuration(t); err == nil {
timeout = d
}
}
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
pollCount := 0
for {
select {
case <-timeoutCtx.Done():
return nil, fmt.Errorf("wait_block_primary: timeout after %s waiting for primary change on %s", timeout, name)
case <-ticker.C:
pollCount++
info, err := client.LookupVolume(timeoutCtx, name)
if err != nil {
if pollCount <= 3 {
actx.Log(" poll %d: lookup error: %v", pollCount, err)
}
continue
}
if pollCount <= 3 || pollCount%10 == 0 {
actx.Log(" poll %d: %s primary=%s role=%s", pollCount, name, info.VolumeServer, info.Role)
}
match := false
if expected != "" && info.VolumeServer == expected {
match = true
}
if notServer != "" && info.VolumeServer != notServer && info.VolumeServer != "" {
match = true
}
if match {
actx.Log(" primary for %s is now %s (epoch=%d)", name, info.VolumeServer, info.Epoch)
if act.SaveAs != "" {
setISCSIVars(actx, act.SaveAs, info)
actx.Vars[act.SaveAs+"_server"] = info.VolumeServer
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(info.Epoch, 10)
actx.Vars[act.SaveAs+"_role"] = info.Role
}
return map[string]string{"value": info.VolumeServer}, nil
}
}
}
}
// assertBlockField looks up a block volume and asserts a specific field matches the expected value.
// Params: name, field (one of: volume_server, role, status, epoch, size_bytes, replica_server,
// replica_factor, health_score, replica_degraded, durability_mode, iscsi_addr, iqn), expected.
func assertBlockField(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("assert_block_field: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("assert_block_field: name param required")
}
field := act.Params["field"]
if field == "" {
return nil, fmt.Errorf("assert_block_field: field param required")
}
expected := act.Params["expected"]
if expected == "" {
return nil, fmt.Errorf("assert_block_field: expected param required")
}
info, err := client.LookupVolume(ctx, name)
if err != nil {
return nil, fmt.Errorf("assert_block_field: lookup %s: %w", name, err)
}
actual, err := extractVolumeField(info, field)
if err != nil {
return nil, fmt.Errorf("assert_block_field: %w", err)
}
if actual != expected {
return nil, fmt.Errorf("assert_block_field: %s.%s = %q, expected %q", name, field, actual, expected)
}
actx.Log(" assert %s.%s == %q OK", name, field, expected)
return map[string]string{"value": actual}, nil
}
// extractVolumeField extracts a named field from VolumeInfo as a string.
func extractVolumeField(info *blockapi.VolumeInfo, field string) (string, error) {
switch field {
case "volume_server":
return info.VolumeServer, nil
case "role":
return info.Role, nil
case "status":
return info.Status, nil
case "epoch":
return strconv.FormatUint(info.Epoch, 10), nil
case "size_bytes":
return strconv.FormatUint(info.SizeBytes, 10), nil
case "replica_server":
return info.ReplicaServer, nil
case "replica_factor":
return strconv.Itoa(info.ReplicaFactor), nil
case "health_score":
return fmt.Sprintf("%.2f", info.HealthScore), nil
case "replica_degraded":
return strconv.FormatBool(info.ReplicaDegraded), nil
case "durability_mode":
return info.DurabilityMode, nil
case "iscsi_addr":
return info.ISCSIAddr, nil
case "iqn":
return info.IQN, nil
case "name":
return info.Name, nil
case "replica_iscsi_addr":
return info.ReplicaISCSIAddr, nil
case "replica_iqn":
return info.ReplicaIQN, nil
case "replica_data_addr":
return info.ReplicaDataAddr, nil
case "replica_ctrl_addr":
return info.ReplicaCtrlAddr, nil
default:
return "", fmt.Errorf("unknown field %q", field)
}
}
// blockStatus fetches block registry status metrics from master.
// Sets save_as_promotions_total, save_as_failovers_total, etc.
func blockStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("block_status: %w", err)
}
status, err := client.BlockStatus(ctx)
if err != nil {
return nil, fmt.Errorf("block_status: %w", err)
}
actx.Log(" block status: volumes=%d servers=%d promotions=%d failovers=%d rebuilds=%d",
status.VolumeCount, status.ServerCount, status.PromotionsTotal, status.FailoversTotal, status.RebuildsTotal)
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_volume_count"] = strconv.Itoa(status.VolumeCount)
actx.Vars[act.SaveAs+"_server_count"] = strconv.Itoa(status.ServerCount)
actx.Vars[act.SaveAs+"_promotions_total"] = strconv.FormatInt(status.PromotionsTotal, 10)
actx.Vars[act.SaveAs+"_failovers_total"] = strconv.FormatInt(status.FailoversTotal, 10)
actx.Vars[act.SaveAs+"_rebuilds_total"] = strconv.FormatInt(status.RebuildsTotal, 10)
actx.Vars[act.SaveAs+"_queue_depth"] = strconv.Itoa(status.AssignmentQueueDepth)
}
jsonBytes, _ := json.Marshal(status)
return map[string]string{"value": string(jsonBytes)}, nil
}
// blockPromote triggers a manual promotion for a block volume.
// Params: name, target_server (optional, empty=auto), force (optional bool), reason (optional).
func blockPromote(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("block_promote: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("block_promote: name param required")
}
force := false
if f := act.Params["force"]; f == "true" || f == "1" {
force = true
}
resp, err := client.PromoteVolume(ctx, name, blockapi.PromoteVolumeRequest{
TargetServer: act.Params["target_server"],
Force: force,
Reason: act.Params["reason"],
})
if err != nil {
return nil, fmt.Errorf("block_promote: %w", err)
}
actx.Log(" promoted %s -> primary=%s epoch=%d", name, resp.NewPrimary, resp.Epoch)
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_server"] = resp.NewPrimary
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(resp.Epoch, 10)
}
return map[string]string{"value": resp.NewPrimary}, nil
}
// clusterStatus fetches the full cluster status JSON.
func clusterStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
node, err := getNode(actx, act.Node)

22
weed/storage/blockvol/testrunner/actions/devops_test.go

@ -23,6 +23,10 @@ func TestDevOpsActions_Registration(t *testing.T) {
"delete_block_volume",
"wait_block_servers",
"cluster_status",
"wait_block_primary",
"assert_block_field",
"block_status",
"block_promote",
}
for _, name := range expected {
@ -39,8 +43,8 @@ func TestDevOpsActions_Tier(t *testing.T) {
byTier := registry.ListByTier()
devopsActions := byTier[tr.TierDevOps]
if len(devopsActions) != 11 {
t.Errorf("devops tier has %d actions, want 11", len(devopsActions))
if len(devopsActions) != 15 {
t.Errorf("devops tier has %d actions, want 15", len(devopsActions))
}
// Verify all are in devops tier.
@ -84,11 +88,11 @@ func TestAllActions_Registration(t *testing.T) {
if n := len(byTier[tr.TierCore]); n != 11 {
t.Errorf("core: %d, want 11", n)
}
if n := len(byTier[tr.TierBlock]); n != 56 {
t.Errorf("block: %d, want 56", n)
if n := len(byTier[tr.TierBlock]); n != 58 {
t.Errorf("block: %d, want 58", n)
}
if n := len(byTier[tr.TierDevOps]); n != 11 {
t.Errorf("devops: %d, want 11", n)
if n := len(byTier[tr.TierDevOps]); n != 15 {
t.Errorf("devops: %d, want 15", n)
}
if n := len(byTier[tr.TierChaos]); n != 5 {
t.Errorf("chaos: %d, want 5", n)
@ -97,13 +101,13 @@ func TestAllActions_Registration(t *testing.T) {
t.Errorf("k8s: %d, want 14", n)
}
// Total should be 97 (92 prev + 4 devops: expand/lookup/delete/wait_block_servers + 1 block: iscsi_login_direct).
// Total should be 103 (99 prev + 4 devops: wait_block_primary, assert_block_field, block_status, block_promote).
total := 0
for _, actions := range byTier {
total += len(actions)
}
if total != 97 {
t.Errorf("total actions: %d, want 97", total)
if total != 103 {
t.Errorf("total actions: %d, want 103", total)
}
}

89
weed/storage/blockvol/testrunner/actions/snapshot.go

@ -8,6 +8,7 @@ import (
"time"
tr "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner/infra"
)
// RegisterSnapshotActions registers snapshot and resize actions.
@ -18,6 +19,8 @@ func RegisterSnapshotActions(r *tr.Registry) {
r.RegisterFunc("resize", tr.TierBlock, resizeAction)
r.RegisterFunc("iscsi_rescan", tr.TierBlock, iscsiRescan)
r.RegisterFunc("get_block_size", tr.TierBlock, getBlockSize)
r.RegisterFunc("snapshot_export_s3", tr.TierBlock, snapshotExportS3)
r.RegisterFunc("snapshot_import_s3", tr.TierBlock, snapshotImportS3)
}
func snapshotCreate(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
@ -181,3 +184,89 @@ func parseHumanSize(s string) (uint64, error) {
}
return val * multiplier, nil
}
// snapshotExportS3 exports a snapshot from a target to an S3 bucket.
// Params: bucket, key_prefix, s3_endpoint, s3_access_key, s3_secret_key, s3_region, snapshot_id (optional).
// Returns: manifest_key, data_key, size_bytes, sha256.
func snapshotExportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
tgt, err := getHATarget(actx, act.Target)
if err != nil {
return nil, err
}
opts := infra.ExportS3Opts{
Bucket: act.Params["bucket"],
KeyPrefix: act.Params["key_prefix"],
S3Endpoint: act.Params["s3_endpoint"],
S3AccessKey: act.Params["s3_access_key"],
S3SecretKey: act.Params["s3_secret_key"],
S3Region: act.Params["s3_region"],
}
if opts.Bucket == "" || opts.S3Endpoint == "" {
return nil, fmt.Errorf("snapshot_export_s3: bucket and s3_endpoint required")
}
if idStr := act.Params["snapshot_id"]; idStr != "" {
id, err := strconv.ParseUint(idStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("snapshot_export_s3: invalid snapshot_id %q: %w", idStr, err)
}
opts.SnapshotID = uint32(id)
}
result, err := tgt.ExportSnapshotS3(ctx, opts)
if err != nil {
return nil, fmt.Errorf("snapshot_export_s3: %w", err)
}
actx.Log(" exported to s3://%s/%s (%d bytes, sha256=%s)", opts.Bucket, result.DataKey, result.SizeBytes, result.SHA256)
out := map[string]string{
"value": result.SHA256,
}
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_manifest_key"] = result.ManifestKey
actx.Vars[act.SaveAs+"_data_key"] = result.DataKey
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
}
return out, nil
}
// snapshotImportS3 imports a snapshot from an S3 bucket into a target.
// Params: bucket, manifest_key, s3_endpoint, s3_access_key, s3_secret_key, s3_region, allow_overwrite.
// Returns: size_bytes, sha256.
func snapshotImportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
tgt, err := getHATarget(actx, act.Target)
if err != nil {
return nil, err
}
opts := infra.ImportS3Opts{
Bucket: act.Params["bucket"],
ManifestKey: act.Params["manifest_key"],
S3Endpoint: act.Params["s3_endpoint"],
S3AccessKey: act.Params["s3_access_key"],
S3SecretKey: act.Params["s3_secret_key"],
S3Region: act.Params["s3_region"],
}
if opts.Bucket == "" || opts.ManifestKey == "" || opts.S3Endpoint == "" {
return nil, fmt.Errorf("snapshot_import_s3: bucket, manifest_key, and s3_endpoint required")
}
if act.Params["allow_overwrite"] == "true" {
opts.AllowOverwrite = true
}
result, err := tgt.ImportSnapshotS3(ctx, opts)
if err != nil {
return nil, fmt.Errorf("snapshot_import_s3: %w", err)
}
actx.Log(" imported %d bytes (sha256=%s)", result.SizeBytes, result.SHA256)
out := map[string]string{
"value": result.SHA256,
}
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
}
return out, nil
}

101
weed/storage/blockvol/testrunner/infra/ha_target.go

@ -478,6 +478,107 @@ func (h *HATarget) Resize(ctx context.Context, newSizeBytes uint64) error {
return nil
}
// ExportSnapshotS3 sends POST /export with S3 credentials.
// Returns the manifest key and data SHA-256 on success.
func (h *HATarget) ExportSnapshotS3(ctx context.Context, opts ExportS3Opts) (*ExportS3Result, error) {
reqBody := map[string]interface{}{
"bucket": opts.Bucket,
"key_prefix": opts.KeyPrefix,
"s3_endpoint": opts.S3Endpoint,
"s3_region": opts.S3Region,
}
if opts.S3AccessKey != "" {
reqBody["s3_access_key"] = opts.S3AccessKey
reqBody["s3_secret_key"] = opts.S3SecretKey
}
if opts.SnapshotID > 0 {
reqBody["snapshot_id"] = opts.SnapshotID
}
code, body, err := h.curlPost(ctx, "/export", reqBody)
if err != nil {
return nil, fmt.Errorf("export snapshot s3: %w", err)
}
if code != http.StatusOK {
return nil, fmt.Errorf("export snapshot s3 failed (HTTP %d): %s", code, body)
}
var resp ExportS3Result
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
return nil, fmt.Errorf("decode export response: %w", err)
}
return &resp, nil
}
// ImportSnapshotS3 sends POST /import with S3 credentials and manifest key.
func (h *HATarget) ImportSnapshotS3(ctx context.Context, opts ImportS3Opts) (*ImportS3Result, error) {
reqBody := map[string]interface{}{
"bucket": opts.Bucket,
"manifest_key": opts.ManifestKey,
"s3_endpoint": opts.S3Endpoint,
"s3_region": opts.S3Region,
}
if opts.S3AccessKey != "" {
reqBody["s3_access_key"] = opts.S3AccessKey
reqBody["s3_secret_key"] = opts.S3SecretKey
}
if opts.AllowOverwrite {
reqBody["allow_overwrite"] = true
}
code, body, err := h.curlPost(ctx, "/import", reqBody)
if err != nil {
return nil, fmt.Errorf("import snapshot s3: %w", err)
}
if code != http.StatusOK {
return nil, fmt.Errorf("import snapshot s3 failed (HTTP %d): %s", code, body)
}
var resp ImportS3Result
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
return nil, fmt.Errorf("decode import response: %w", err)
}
return &resp, nil
}
// ExportS3Opts configures a snapshot export to S3.
type ExportS3Opts struct {
Bucket string
KeyPrefix string
S3Endpoint string
S3AccessKey string
S3SecretKey string
S3Region string
SnapshotID uint32
}
// ExportS3Result is the response from POST /export.
type ExportS3Result struct {
OK bool `json:"ok"`
ManifestKey string `json:"manifest_key"`
DataKey string `json:"data_key"`
SizeBytes uint64 `json:"size_bytes"`
SHA256 string `json:"sha256"`
}
// ImportS3Opts configures a snapshot import from S3.
type ImportS3Opts struct {
Bucket string
ManifestKey string
S3Endpoint string
S3AccessKey string
S3SecretKey string
S3Region string
AllowOverwrite bool
}
// ImportS3Result is the response from POST /import.
type ImportS3Result struct {
OK bool `json:"ok"`
SizeBytes uint64 `json:"size_bytes"`
SHA256 string `json:"sha256"`
}
// WaitForRole polls GET /status until the target reports the expected role.
func (h *HATarget) WaitForRole(ctx context.Context, expectedRole string) error {
for {

246
weed/storage/blockvol/testrunner/scenarios/cp11b3-auto-failover.yaml

@ -0,0 +1,246 @@
name: cp11b3-auto-failover
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9434"
# Tests: T1 (candidate evaluation), T2 (orphan re-evaluation), T6 (preflight/status)
# Flow: Create RF=2 → write data → kill primary → master auto-promotes → verify data + metrics
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3-master /tmp/sw-b3-vs1/blocks /tmp/sw-b3-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9434"
dir: "/tmp/sw-b3-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9434"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18190"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs1"
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18191"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs2"
extra_args: "-block.dir=/tmp/sw-b3-vs2/blocks -block.listen=:3278 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, record initial state
- name: create_volume
actions:
- action: create_block_volume
name: "failover-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
# Without this, PromoteBestReplica rejects replica as "no_heartbeat".
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "failover-test"
save_as: initial
- action: print
msg: "initial primary={{ initial_iscsi_host }}:{{ initial_iscsi_port }} capacity={{ initial_capacity }}"
# Record the initial primary server for later comparison.
- action: assert_block_field
name: "failover-test"
field: "replica_factor"
expected: "2"
- action: assert_block_field
name: "failover-test"
field: "epoch"
expected: "1"
# Capture initial block status metrics.
- action: block_status
save_as: pre_stats
# Phase 4: Write data via iSCSI
- name: write_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
seek: "5"
save_as: md5_5M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
skip: "5"
save_as: verify_5M
- action: assert_equal
actual: "{{ verify_5M }}"
expected: "{{ md5_5M }}"
# Phase 5: Kill primary VS, wait for master auto-failover
- name: failover
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: lookup_block_volume
name: "failover-test"
save_as: pre_kill
- action: print
msg: "killing primary VS (server={{ pre_kill_iscsi_host }}:{{ pre_kill_iscsi_port }})"
# Crash-kill VS1 with SIGKILL (not SIGTERM) to simulate a real crash.
# SIGTERM triggers graceful shutdown which deregisters volumes from
# the master registry — preventing the failover path we want to test.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Wait for master to detect VS1 disconnection and promote.
# Lease TTL is 30s; if never granted (zero), promotion is immediate.
# Allow extra time for heartbeat confirmation + deferred timer.
- action: sleep
duration: 35s
- action: wait_block_primary
name: "failover-test"
not: "192.168.1.184:18190"
timeout: 60s
save_as: promoted
# Phase 6: Verify failover state
- name: verify_failover
actions:
- action: print
msg: "new primary={{ promoted_server }} epoch={{ promoted_epoch }}"
# Epoch must have incremented (real promotion, not just heartbeat update).
- action: assert_block_field
name: "failover-test"
field: "epoch"
expected: "2"
- action: block_status
save_as: post_stats
# Verify promotion counter incremented.
- action: assert_greater
actual: "{{ post_stats_promotions_total }}"
expected: "{{ pre_stats_promotions_total }}"
# Phase 7: Reconnect iSCSI to new primary, verify data
- name: verify_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ promoted_iscsi_host }}"
port: "{{ promoted_iscsi_port }}"
iqn: "{{ promoted_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "1"
skip: "5"
save_as: post_failover_md5
- action: assert_equal
actual: "{{ post_failover_md5 }}"
expected: "{{ md5_5M }}"
# Phase 8: Restart killed VS, verify rebuild queued
- name: restart_verify
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: start_weed_volume
node: target_node
port: "18190"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs1"
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
save_as: vs1_pid2
- action: wait_block_servers
count: "2"
timeout: 60s
- action: sleep
duration: 5s
# After restart, the old primary should be queued for rebuild.
- action: block_status
save_as: final_stats
- action: assert_greater
actual: "{{ final_stats_rebuilds_total }}"
expected: "{{ post_stats_rebuilds_total }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "failover-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid2 }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
root: "true"
ignore_error: true

214
weed/storage/blockvol/testrunner/scenarios/cp11b3-fast-reconnect.yaml

@ -0,0 +1,214 @@
name: cp11b3-fast-reconnect
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9436"
# Tests: T3 (deferred timer safety), T2 (fast reconnect skips failover)
# Flow: Create RF=2 → write → kill primary briefly → restart before lease expires
# → verify no promotion happened → verify data intact
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3r-master /tmp/sw-b3r-vs1/blocks /tmp/sw-b3r-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9436"
dir: "/tmp/sw-b3r-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9436"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18194"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs1"
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18195"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs2"
extra_args: "-block.dir=/tmp/sw-b3r-vs2/blocks -block.listen=:3282 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, write data
- name: create_and_write
actions:
- action: create_block_volume
name: "reconnect-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "reconnect-test"
save_as: initial
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
seek: "8"
save_as: md5_8M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
skip: "8"
save_as: verify_8M
- action: assert_equal
actual: "{{ verify_8M }}"
expected: "{{ md5_8M }}"
- action: iscsi_cleanup
node: client_node
ignore_error: true
# Record initial epoch.
- action: assert_block_field
name: "reconnect-test"
field: "epoch"
expected: "1"
# Record pre-kill promotion counter.
- action: block_status
save_as: pre_stats
# Phase 4: Kill and quickly restart primary VS (before lease expires)
- name: fast_reconnect
actions:
# Crash-kill primary VS with SIGKILL.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Restart it quickly — within a few seconds, well before the
# default 30s lease TTL expires on the master.
- action: sleep
duration: 3s
- action: start_weed_volume
node: target_node
port: "18194"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs1"
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
save_as: vs1_pid2
# Wait for VS to re-register with master.
- action: wait_block_servers
count: "2"
timeout: 60s
- action: sleep
duration: 5s
# Phase 5: Verify NO promotion happened
- name: verify_no_promotion
actions:
# Epoch should still be 1 (no promotion).
- action: assert_block_field
name: "reconnect-test"
field: "epoch"
expected: "1"
# Promotion counter should not have increased.
- action: block_status
save_as: post_stats
- action: assert_equal
actual: "{{ post_stats_promotions_total }}"
expected: "{{ pre_stats_promotions_total }}"
- action: print
msg: "fast reconnect: epoch unchanged, no promotion — deferred timer cancelled"
# Phase 6: Verify data still accessible on original primary
- name: verify_data
actions:
- action: lookup_block_volume
name: "reconnect-test"
save_as: after
- action: iscsi_login_direct
node: client_node
host: "{{ after_iscsi_host }}"
port: "{{ after_iscsi_port }}"
iqn: "{{ after_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "1"
skip: "8"
save_as: post_reconnect_md5
- action: assert_equal
actual: "{{ post_reconnect_md5 }}"
expected: "{{ md5_8M }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "reconnect-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid2 }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
root: "true"
ignore_error: true

190
weed/storage/blockvol/testrunner/scenarios/cp11b3-manual-promote.yaml

@ -0,0 +1,190 @@
name: cp11b3-manual-promote
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9435"
# Tests: T5 (manual promote API), T6 (preflight), structured rejection
# Flow: Create RF=2 → write → preflight check → kill primary → manual promote → verify data
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3m-master /tmp/sw-b3m-vs1/blocks /tmp/sw-b3m-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9435"
dir: "/tmp/sw-b3m-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9435"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18192"
master: "localhost:9435"
dir: "/tmp/sw-b3m-vs1"
extra_args: "-block.dir=/tmp/sw-b3m-vs1/blocks -block.listen=:3279 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18193"
master: "localhost:9435"
dir: "/tmp/sw-b3m-vs2"
extra_args: "-block.dir=/tmp/sw-b3m-vs2/blocks -block.listen=:3280 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, write data
- name: create_and_write
actions:
- action: create_block_volume
name: "promote-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "promote-test"
save_as: initial
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "2"
seek: "3"
save_as: md5_3M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "2"
skip: "3"
save_as: verify_3M
- action: assert_equal
actual: "{{ verify_3M }}"
expected: "{{ md5_3M }}"
# Phase 4: Kill primary VS, then promote via API
- name: kill_and_promote
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
# Crash-kill VS1 with SIGKILL to simulate a real crash.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Wait for master to detect the disconnection.
- action: sleep
duration: 15s
# Manual promote via the API.
- action: block_promote
name: "promote-test"
reason: "T7 integration test: manual failover"
save_as: promote_result
- action: print
msg: "promoted to {{ promote_result_server }} epoch={{ promote_result_epoch }}"
# Phase 5: Verify promoted state
- name: verify_promoted
actions:
- action: lookup_block_volume
name: "promote-test"
save_as: after
# New primary should be different from old.
- action: assert_block_field
name: "promote-test"
field: "epoch"
expected: "2"
- action: block_status
save_as: stats
- action: print
msg: "promotions_total={{ stats_promotions_total }}"
# Phase 6: Reconnect iSCSI to new primary, verify data
- name: verify_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ after_iscsi_host }}"
port: "{{ after_iscsi_port }}"
iqn: "{{ after_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "2"
skip: "3"
save_as: post_promote_md5
- action: assert_equal
actual: "{{ post_promote_md5 }}"
expected: "{{ md5_3M }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "promote-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
root: "true"
ignore_error: true
Loading…
Cancel
Save