Browse Source

feat: CP11B-3 safe ops — promotion hardening, preflight, manual promote

Six-task checkpoint hardening the promotion and failover paths:

T1: 4-gate candidate evaluation (heartbeat freshness, WAL lag, role,
    server liveness) with structured rejection reasons.
T2: Orphaned-primary re-evaluation on replica reconnect (B-06/B-08).
T3: Deferred timer safety — epoch validation prevents stale timers
    from firing on recreated/changed volumes (B-07).
T4: Rebuild addr cleanup on promotion (B-11), NVMe publication
    refresh on heartbeat, and preflight endpoint wiring.
T5: Manual promote API — POST /block/volume/{name}/promote with
    force flag, target server selection, and structured rejection
    response. Shared applyPromotionLocked/finalizePromotion helpers
    eliminate duplication between auto and manual paths.
T6: Read-only preflight endpoint (GET /block/volume/{name}/preflight)
    and blockapi client wrappers (Preflight, Promote).

BUG-T5-1: PromotionsTotal counter moved to finalizePromotion (shared
    by both auto and manual paths) to prevent metrics divergence.

24 files changed, ~6500 lines added. 42 new QA adversarial tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feature/sw-block
Ping Qiu 2 days ago
parent
commit
075ff52219
  1. 7
      weed/server/integration_block_test.go
  2. 89
      weed/server/master_block_failover.go
  3. 335
      weed/server/master_block_failover_test.go
  4. 372
      weed/server/master_block_registry.go
  5. 519
      weed/server/master_block_registry_test.go
  6. 3
      weed/server/master_grpc_server.go
  7. 23
      weed/server/master_grpc_server_block.go
  8. 71
      weed/server/master_grpc_server_block_test.go
  9. 6
      weed/server/master_server.go
  10. 96
      weed/server/master_server_handlers_block.go
  11. 1581
      weed/server/qa_block_cp11b3_adversarial_test.go
  12. 25
      weed/server/qa_block_cp63_test.go
  13. 485
      weed/server/qa_block_expand_adversarial_test.go
  14. 1346
      weed/server/qa_block_nvme_publication_test.go
  15. 55
      weed/storage/blockvol/blockapi/client.go
  16. 48
      weed/storage/blockvol/blockapi/types.go
  17. 511
      weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go
  18. 220
      weed/storage/blockvol/testrunner/actions/devops.go
  19. 22
      weed/storage/blockvol/testrunner/actions/devops_test.go
  20. 89
      weed/storage/blockvol/testrunner/actions/snapshot.go
  21. 101
      weed/storage/blockvol/testrunner/infra/ha_target.go
  22. 246
      weed/storage/blockvol/testrunner/scenarios/cp11b3-auto-failover.yaml
  23. 214
      weed/storage/blockvol/testrunner/scenarios/cp11b3-fast-reconnect.yaml
  24. 190
      weed/storage/blockvol/testrunner/scenarios/cp11b3-manual-promote.yaml

7
weed/server/integration_block_test.go

@ -645,13 +645,16 @@ func TestIntegration_DoubleFailover(t *testing.T) {
// Reconnect vs1 first so it becomes a replica (via recoverBlockVolumes). // Reconnect vs1 first so it becomes a replica (via recoverBlockVolumes).
ms.recoverBlockVolumes(vs1) ms.recoverBlockVolumes(vs1)
// Simulate heartbeat from vs1 that restores iSCSI addr and health score
// (in production this happens when the VS re-registers after reconnect).
// Simulate heartbeat from vs1 that restores iSCSI addr, health score,
// role, and heartbeat timestamp (in production this happens when the
// VS re-registers after reconnect and completes rebuild).
e1, _ = ms.blockRegistry.Lookup("pvc-double-1") e1, _ = ms.blockRegistry.Lookup("pvc-double-1")
for i := range e1.Replicas { for i := range e1.Replicas {
if e1.Replicas[i].Server == vs1 { if e1.Replicas[i].Server == vs1 {
e1.Replicas[i].ISCSIAddr = vs1 + ":3260" e1.Replicas[i].ISCSIAddr = vs1 + ":3260"
e1.Replicas[i].HealthScore = 1.0 e1.Replicas[i].HealthScore = 1.0
e1.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
e1.Replicas[i].LastHeartbeat = time.Now()
} }
} }

89
weed/server/master_block_failover.go

@ -57,7 +57,19 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) {
delay := leaseExpiry.Sub(now) delay := leaseExpiry.Sub(now)
glog.V(0).Infof("failover: %q lease expires in %v, deferring promotion", entry.Name, delay) glog.V(0).Infof("failover: %q lease expires in %v, deferring promotion", entry.Name, delay)
volumeName := entry.Name volumeName := entry.Name
capturedEpoch := entry.Epoch // T3: capture epoch for stale-timer validation
timer := time.AfterFunc(delay, func() { timer := time.AfterFunc(delay, func() {
// T3: Re-validate before acting — prevent stale timer on recreated/changed volume.
current, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
glog.V(0).Infof("failover: deferred promotion for %q skipped (volume deleted)", volumeName)
return
}
if current.Epoch != capturedEpoch {
glog.V(0).Infof("failover: deferred promotion for %q skipped (epoch changed %d -> %d)",
volumeName, capturedEpoch, current.Epoch)
return
}
ms.promoteReplica(volumeName) ms.promoteReplica(volumeName)
}) })
ms.blockFailover.mu.Lock() ms.blockFailover.mu.Lock()
@ -116,8 +128,15 @@ func (ms *MasterServer) promoteReplica(volumeName string) {
return return
} }
ms.finalizePromotion(volumeName, oldPrimary, oldPath, newEpoch)
}
// finalizePromotion performs post-registry promotion steps:
// enqueue assignment for new primary, record pending rebuild for old primary, bump metrics.
// Called by both promoteReplica (auto) and blockVolumePromoteHandler (manual).
func (ms *MasterServer) finalizePromotion(volumeName, oldPrimary, oldPath string, newEpoch uint64) {
// Re-read entry after promotion. // Re-read entry after promotion.
entry, ok = ms.blockRegistry.Lookup(volumeName)
entry, ok := ms.blockRegistry.Lookup(volumeName)
if !ok { if !ok {
return return
} }
@ -198,11 +217,15 @@ func (ms *MasterServer) cancelDeferredTimers(server string) {
// recoverBlockVolumes is called when a previously dead VS reconnects. // recoverBlockVolumes is called when a previously dead VS reconnects.
// It cancels any deferred promotion timers (R2-F2), drains pending rebuilds, // It cancels any deferred promotion timers (R2-F2), drains pending rebuilds,
// and enqueues rebuild assignments.
// enqueues rebuild assignments, and checks for orphaned primaries (T2/B-06).
func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) { func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
// R2-F2: Cancel deferred promotion timers for this server to prevent split-brain. // R2-F2: Cancel deferred promotion timers for this server to prevent split-brain.
ms.cancelDeferredTimers(reconnectedServer) ms.cancelDeferredTimers(reconnectedServer)
// T2 (B-06): Check for orphaned primaries — volumes where the reconnecting
// server is a replica but the primary is dead/disconnected.
ms.reevaluateOrphanedPrimaries(reconnectedServer)
rebuilds := ms.drainPendingRebuilds(reconnectedServer) rebuilds := ms.drainPendingRebuilds(reconnectedServer)
if len(rebuilds) == 0 { if len(rebuilds) == 0 {
return return
@ -221,16 +244,74 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
Path: rb.OldPath, Path: rb.OldPath,
}) })
// T4: Warn if RebuildListenAddr is empty (new primary hasn't heartbeated yet).
rebuildAddr := entry.RebuildListenAddr
if rebuildAddr == "" {
glog.Warningf("rebuild: %q RebuildListenAddr is empty (new primary %s may not have heartbeated yet), "+
"queuing rebuild anyway — VS should retry on empty addr", rb.VolumeName, entry.VolumeServer)
}
// Enqueue rebuild assignment for the reconnected server. // Enqueue rebuild assignment for the reconnected server.
ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{ ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{
Path: rb.OldPath, Path: rb.OldPath,
Epoch: entry.Epoch, Epoch: entry.Epoch,
Role: blockvol.RoleToWire(blockvol.RoleRebuilding), Role: blockvol.RoleToWire(blockvol.RoleRebuilding),
RebuildAddr: entry.RebuildListenAddr,
RebuildAddr: rebuildAddr,
}) })
ms.blockRegistry.RebuildsTotal.Add(1) ms.blockRegistry.RebuildsTotal.Add(1)
glog.V(0).Infof("rebuild: enqueued rebuild for %q on %s (epoch=%d, rebuildAddr=%s)", glog.V(0).Infof("rebuild: enqueued rebuild for %q on %s (epoch=%d, rebuildAddr=%s)",
rb.VolumeName, reconnectedServer, entry.Epoch, entry.RebuildListenAddr)
rb.VolumeName, reconnectedServer, entry.Epoch, rebuildAddr)
}
}
// reevaluateOrphanedPrimaries checks if the given server is a replica for any
// volumes whose primary is dead (not block-capable). If so, promotes the best
// available replica — but only after the old primary's lease has expired, to
// maintain the same split-brain protection as failoverBlockVolumes().
// This fixes B-06 (orphaned primary after replica re-register)
// and partially B-08 (fast reconnect skips failover window).
func (ms *MasterServer) reevaluateOrphanedPrimaries(server string) {
if ms.blockRegistry == nil {
return
}
orphaned := ms.blockRegistry.VolumesWithDeadPrimary(server)
now := time.Now()
for _, volumeName := range orphaned {
entry, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
continue
}
// Respect lease expiry — same gate as failoverBlockVolumes().
leaseExpiry := entry.LastLeaseGrant.Add(entry.LeaseTTL)
if now.Before(leaseExpiry) {
delay := leaseExpiry.Sub(now)
glog.V(0).Infof("failover: orphaned primary for %q (replica %s alive, primary dead) "+
"but lease expires in %v, deferring promotion", volumeName, server, delay)
capturedEpoch := entry.Epoch
deadPrimary := entry.VolumeServer
timer := time.AfterFunc(delay, func() {
current, ok := ms.blockRegistry.Lookup(volumeName)
if !ok {
return
}
if current.Epoch != capturedEpoch {
glog.V(0).Infof("failover: deferred orphan promotion for %q skipped (epoch changed %d -> %d)",
volumeName, capturedEpoch, current.Epoch)
return
}
ms.promoteReplica(volumeName)
})
ms.blockFailover.mu.Lock()
ms.blockFailover.deferredTimers[deadPrimary] = append(
ms.blockFailover.deferredTimers[deadPrimary], timer)
ms.blockFailover.mu.Unlock()
continue
}
glog.V(0).Infof("failover: orphaned primary detected for %q (replica %s alive, primary dead, lease expired), promoting",
volumeName, server)
ms.promoteReplica(volumeName)
} }
} }

335
weed/server/master_block_failover_test.go

@ -34,6 +34,9 @@ func testMasterServerForFailover(t *testing.T) *MasterServer {
// registerVolumeWithReplica creates a volume entry with primary + replica for tests. // registerVolumeWithReplica creates a volume entry with primary + replica for tests.
func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration) { func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration) {
t.Helper() t.Helper()
// Mark both servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
ms.blockRegistry.MarkBlockCapable(replica)
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: name, Name: name,
VolumeServer: primary, VolumeServer: primary,
@ -53,11 +56,13 @@ func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, re
// CP8-2: also populate Replicas[] for PromoteBestReplica. // CP8-2: also populate Replicas[] for PromoteBestReplica.
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{ {
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}, },
}, },
} }
@ -194,6 +199,9 @@ func TestFailover_MultipleVolumes(t *testing.T) {
func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) { func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
ms := testMasterServerForFailover(t) ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "vs1", VolumeServer: "vs1",
@ -209,7 +217,7 @@ func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
LeaseTTL: 200 * time.Millisecond, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(), // just granted, NOT expired yet LastLeaseGrant: time.Now(), // just granted, NOT expired yet
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
} }
ms.blockRegistry.Register(entry) ms.blockRegistry.Register(entry)
@ -397,6 +405,9 @@ func TestRebuild_RegistryUpdatedWithNewReplica(t *testing.T) {
func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) { func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
ms := testMasterServerForFailover(t) ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "vs1", VolumeServer: "vs1",
@ -413,7 +424,7 @@ func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
LeaseTTL: 5 * time.Second, LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now().Add(-10 * time.Second), LastLeaseGrant: time.Now().Add(-10 * time.Second),
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
} }
ms.blockRegistry.Register(entry) ms.blockRegistry.Register(entry)
@ -457,7 +468,7 @@ func TestFailover_TransientDisconnect_NoPromotion(t *testing.T) {
LeaseTTL: 30 * time.Second, LeaseTTL: 30 * time.Second,
LastLeaseGrant: time.Now(), // just granted LastLeaseGrant: time.Now(), // just granted
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
} }
ms.blockRegistry.Register(entry) ms.blockRegistry.Register(entry)
@ -556,6 +567,10 @@ func TestLifecycle_CreateFailoverRebuild(t *testing.T) {
// registerVolumeRF3 creates a volume entry with primary + 2 replicas for RF=3 tests. // registerVolumeRF3 creates a volume entry with primary + 2 replicas for RF=3 tests.
func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1, replica2 string, epoch uint64, leaseTTL time.Duration) { func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1, replica2 string, epoch uint64, leaseTTL time.Duration) {
t.Helper() t.Helper()
// Mark all servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
ms.blockRegistry.MarkBlockCapable(replica1)
ms.blockRegistry.MarkBlockCapable(replica2)
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: name, Name: name,
VolumeServer: primary, VolumeServer: primary,
@ -576,20 +591,24 @@ func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1,
ReplicaISCSIAddr: replica1 + ":3260", ReplicaISCSIAddr: replica1 + ":3260",
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{ {
Server: replica1,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
ISCSIAddr: replica1 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Server: replica1,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
ISCSIAddr: replica1 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}, },
{ {
Server: replica2,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
ISCSIAddr: replica2 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Server: replica2,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
ISCSIAddr: replica2 + ":3260",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}, },
}, },
} }
@ -793,6 +812,10 @@ func TestRF3_AllReplicasDead_NoPromotion(t *testing.T) {
// RF3: Lease deferred promotion with RF=3. // RF3: Lease deferred promotion with RF=3.
func TestRF3_LeaseDeferred_Promotion(t *testing.T) { func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
ms := testMasterServerForFailover(t) ms := testMasterServerForFailover(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.MarkBlockCapable("vs3")
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "vs1", VolumeServer: "vs1",
@ -807,8 +830,8 @@ func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
LeaseTTL: 200 * time.Millisecond, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(), // just granted → NOT expired LastLeaseGrant: time.Now(), // just granted → NOT expired
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50},
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
// Deprecated scalar fields. // Deprecated scalar fields.
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260", ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
@ -853,8 +876,8 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
LeaseTTL: 5 * time.Second, LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now(), // just granted → long lease LastLeaseGrant: time.Now(), // just granted → long lease
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0},
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260", ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
} }
@ -888,3 +911,267 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
t.Fatalf("vs1 should remain primary (timer cancelled), got %q", e.VolumeServer) t.Fatalf("vs1 should remain primary (timer cancelled), got %q", e.VolumeServer)
} }
} }
// ============================================================
// CP11B-3 T2: Re-evaluate on Replica Registration (B-06)
// ============================================================
// T2: Orphaned primary + replica reconnects → automatic promotion.
func TestT2_OrphanedPrimary_ReplicaReconnect_Promotes(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Simulate vs1 dying without proper failover (e.g., promotion failed at the time).
// Mark vs1 as dead but DON'T call failoverBlockVolumes (simulates missed/failed failover).
ms.blockRegistry.UnmarkBlockCapable("vs1")
// vs2 reconnects (sends heartbeat). reevaluateOrphanedPrimaries should detect orphaned primary.
ms.recoverBlockVolumes("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("expected promotion to vs2 (orphaned primary), got %q", entry.VolumeServer)
}
if entry.Epoch != 2 {
t.Fatalf("expected epoch 2 after promotion, got %d", entry.Epoch)
}
}
// T2: Replica reconnects but primary is alive → no unnecessary promotion.
func TestT2_PrimaryAlive_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Both servers alive. vs2 reconnects — no orphaned primary.
ms.recoverBlockVolumes("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs1" {
t.Fatalf("primary should remain vs1 (alive), got %q", entry.VolumeServer)
}
if entry.Epoch != 1 {
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
}
}
// T2: Multiple orphaned volumes, all promoted on reconnect.
func TestT2_MultipleOrphanedVolumes(t *testing.T) {
ms := testMasterServerForFailover(t)
// vol1: vs1=primary, vs2=replica
// vol2: vs3=primary, vs2=replica
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
ms.blockRegistry.MarkBlockCapable("vs3")
entry2 := &BlockVolumeEntry{
Name: "vol2", VolumeServer: "vs3", Path: "/data/vol2.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 5 * time.Second,
LastLeaseGrant: time.Now().Add(-10 * time.Second),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol2.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry2)
// Both primaries die.
ms.blockRegistry.UnmarkBlockCapable("vs1")
ms.blockRegistry.UnmarkBlockCapable("vs3")
// vs2 reconnects → both orphaned volumes should be promoted.
ms.recoverBlockVolumes("vs2")
e1, _ := ms.blockRegistry.Lookup("vol1")
e2, _ := ms.blockRegistry.Lookup("vol2")
if e1.VolumeServer != "vs2" {
t.Fatalf("vol1: expected promotion to vs2, got %q", e1.VolumeServer)
}
if e2.VolumeServer != "vs2" {
t.Fatalf("vol2: expected promotion to vs2, got %q", e2.VolumeServer)
}
}
// T2: Repeated heartbeats do NOT cause duplicate promotions.
func TestT2_RepeatedHeartbeats_NoDuplicatePromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
ms.blockRegistry.UnmarkBlockCapable("vs1")
// First reconnect promotes.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("first call: expected promotion to vs2, got %q", entry.VolumeServer)
}
epochAfterFirst := entry.Epoch
// Second call: vs2 is now the primary AND block-capable. No orphan detected.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ = ms.blockRegistry.Lookup("vol1")
if entry.Epoch != epochAfterFirst {
t.Fatalf("second call should not bump epoch: got %d, want %d", entry.Epoch, epochAfterFirst)
}
}
// T2: Dead primary with active lease, replica reconnects → no immediate promotion.
// Regression test for lease-bypass bug: reevaluateOrphanedPrimaries must respect
// lease expiry, not promote immediately.
func TestT2_OrphanedPrimary_LeaseNotExpired_DefersPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 300 * time.Millisecond,
LastLeaseGrant: time.Now(), // lease still active
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
})
// vs1 dies (unmark block-capable).
ms.blockRegistry.UnmarkBlockCapable("vs1")
// vs2 reconnects — orphan detected, but lease still active → should NOT promote immediately.
ms.reevaluateOrphanedPrimaries("vs2")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs1" {
t.Fatalf("should NOT promote while lease active, got primary=%q", entry.VolumeServer)
}
if entry.Epoch != 1 {
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
}
// Verify a deferred timer was created for the dead primary.
ms.blockFailover.mu.Lock()
timerCount := len(ms.blockFailover.deferredTimers["vs1"])
ms.blockFailover.mu.Unlock()
if timerCount != 1 {
t.Fatalf("expected 1 deferred timer for vs1, got %d", timerCount)
}
// Wait for lease to expire + margin → timer fires, promotion happens.
time.Sleep(450 * time.Millisecond)
entry, _ = ms.blockRegistry.Lookup("vol1")
if entry.VolumeServer != "vs2" {
t.Fatalf("after lease expiry, expected promotion to vs2, got %q", entry.VolumeServer)
}
if entry.Epoch != 2 {
t.Fatalf("expected epoch 2, got %d", entry.Epoch)
}
}
// ============================================================
// CP11B-3 T3: Deferred Timer Safety
// ============================================================
// T3: Delete/recreate volume before deferred timer fires → no wrong promotion.
func TestT3_DeferredTimer_VolumeDeleted_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
entry := &BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry)
// vs1 dies → deferred timer created (lease not expired, epoch=5).
ms.failoverBlockVolumes("vs1")
// Delete the volume before timer fires.
ms.blockRegistry.Unregister("vol1")
// Wait for timer to fire.
time.Sleep(350 * time.Millisecond)
// Volume should not exist (timer found it deleted, no-op).
_, ok := ms.blockRegistry.Lookup("vol1")
if ok {
t.Fatal("volume should have been deleted, timer should not recreate it")
}
}
// T3: Epoch changes before deferred timer fires → timer rejected.
func TestT3_DeferredTimer_EpochChanged_NoPromotion(t *testing.T) {
ms := testMasterServerForFailover(t)
ms.blockRegistry.MarkBlockCapable("vs1")
ms.blockRegistry.MarkBlockCapable("vs2")
ms.blockRegistry.MarkBlockCapable("vs3")
entry := &BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
LastLeaseGrant: time.Now(),
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
}
ms.blockRegistry.Register(entry)
// vs1 dies → deferred timer created (captures epoch=5).
ms.failoverBlockVolumes("vs1")
// Before timer fires, manually bump the epoch (simulating another event).
e, _ := ms.blockRegistry.Lookup("vol1")
e.Epoch = 99
// Wait for timer to fire.
time.Sleep(350 * time.Millisecond)
// Timer should have been rejected (epoch mismatch). Epoch stays at 99.
e, _ = ms.blockRegistry.Lookup("vol1")
if e.Epoch != 99 {
t.Fatalf("epoch should remain 99 (timer rejected), got %d", e.Epoch)
}
// Primary should NOT have changed (deferred promotion was rejected).
if e.VolumeServer != "vs1" {
t.Fatalf("primary should remain vs1 (timer rejected), got %q", e.VolumeServer)
}
}
// ============================================================
// CP11B-3 T4: Rebuild with empty RebuildListenAddr
// ============================================================
// T4: Rebuild queued with empty RebuildListenAddr after promotion.
func TestT4_RebuildEmptyAddr_StillQueued(t *testing.T) {
ms := testMasterServerForFailover(t)
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
// Failover: vs1 dies, vs2 promoted. PromoteBestReplica clears RebuildListenAddr.
ms.failoverBlockVolumes("vs1")
entry, _ := ms.blockRegistry.Lookup("vol1")
if entry.RebuildListenAddr != "" {
t.Fatalf("RebuildListenAddr should be empty after promotion, got %q", entry.RebuildListenAddr)
}
// vs1 reconnects. Rebuild should still be queued (even with empty addr).
ms.recoverBlockVolumes("vs1")
assignments := ms.blockAssignmentQueue.Peek("vs1")
foundRebuild := false
for _, a := range assignments {
if blockvol.RoleFromWire(a.Role) == blockvol.RoleRebuilding {
foundRebuild = true
if a.RebuildAddr != "" {
t.Fatalf("RebuildAddr should be empty (new primary hasn't heartbeated), got %q", a.RebuildAddr)
}
}
}
if !foundRebuild {
t.Fatal("rebuild assignment should still be queued even with empty addr")
}
}

372
weed/server/master_block_registry.go

@ -842,44 +842,91 @@ func (r *BlockVolumeRegistry) PromotionLSNTolerance() uint64 {
return r.promotionLSNTolerance return r.promotionLSNTolerance
} }
// PromoteBestReplica promotes the best eligible replica to primary.
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
// and role must be RoleReplica (not RoleRebuilding).
// The promoted replica is removed from Replicas[]. Other replicas stay.
// Old primary is NOT added to Replicas (needs rebuild).
// Returns the new epoch.
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, fmt.Errorf("block volume %q not found", name)
// PromotionRejection records why a specific replica was rejected for promotion.
type PromotionRejection struct {
Server string
Reason string // "stale_heartbeat", "wal_lag", "wrong_role", "server_dead"
}
// PromotionPreflightResult is the reusable result of a promotion evaluation.
// Used by auto-promotion, manual promote API, preflight status, and logging.
type PromotionPreflightResult struct {
VolumeName string
Promotable bool // true if a candidate was found
Candidate *ReplicaInfo // best candidate (nil if !Promotable)
CandidateIdx int // index in Replicas[] (-1 if !Promotable)
Rejections []PromotionRejection // why each non-candidate was rejected
Reason string // human-readable summary when !Promotable
}
// evaluatePromotionLocked evaluates promotion candidates for a volume.
// Caller must hold r.mu (read or write). Returns a preflight result without
// mutating the registry. The four gates:
// 1. Heartbeat freshness (within 2×LeaseTTL)
// 2. WAL LSN recency (within promotionLSNTolerance of primary)
// 3. Role must be RoleReplica (not RoleRebuilding)
// 4. Server must be in blockServers (alive) — fixes B-12
func (r *BlockVolumeRegistry) evaluatePromotionLocked(entry *BlockVolumeEntry) PromotionPreflightResult {
result := PromotionPreflightResult{
VolumeName: entry.Name,
CandidateIdx: -1,
} }
if len(entry.Replicas) == 0 { if len(entry.Replicas) == 0 {
return 0, fmt.Errorf("block volume %q has no replicas", name)
result.Reason = "no replicas"
return result
} }
// Filter eligible replicas.
now := time.Now() now := time.Now()
freshnessCutoff := 2 * entry.LeaseTTL freshnessCutoff := 2 * entry.LeaseTTL
if freshnessCutoff == 0 { if freshnessCutoff == 0 {
freshnessCutoff = 60 * time.Second // default if LeaseTTL not set
freshnessCutoff = 60 * time.Second
} }
primaryLSN := entry.WALHeadLSN primaryLSN := entry.WALHeadLSN
bestIdx := -1 bestIdx := -1
for i := range entry.Replicas { for i := range entry.Replicas {
ri := &entry.Replicas[i] ri := &entry.Replicas[i]
// Gate 1: heartbeat freshness.
if !ri.LastHeartbeat.IsZero() && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
// Gate 1: heartbeat freshness. Zero means never heartbeated — unsafe
// to promote because the registry has no proof the replica is alive,
// caught up, or fully initialized.
if ri.LastHeartbeat.IsZero() {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "no_heartbeat",
})
continue
}
if now.Sub(ri.LastHeartbeat) > freshnessCutoff {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "stale_heartbeat",
})
continue continue
} }
// Gate 2: WAL LSN recency (skip if primary LSN is 0 — no data yet, all eligible). // Gate 2: WAL LSN recency (skip if primary LSN is 0 — no data yet, all eligible).
if primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN { if primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wal_lag",
})
continue continue
} }
// Gate 3: role must be RoleReplica (not rebuilding/stale).
if ri.Role != 0 && blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
// Gate 3: role must be exactly RoleReplica. Zero/unset role means
// the replica was created but never confirmed its role via heartbeat.
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wrong_role",
})
continue
}
// Gate 4: server must be alive (in blockServers set) — B-12 fix.
if !r.blockServers[ri.Server] {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "server_dead",
})
continue continue
} }
// Eligible — pick best by health score, tie-break by WALHeadLSN. // Eligible — pick best by health score, tie-break by WALHeadLSN.
@ -894,11 +941,39 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
} }
if bestIdx == -1 { if bestIdx == -1 {
return 0, fmt.Errorf("block volume %q: no eligible replicas for promotion", name)
result.Reason = "no eligible replicas"
if len(result.Rejections) > 0 {
result.Reason += ": " + result.Rejections[0].Reason
if len(result.Rejections) > 1 {
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
}
}
return result
} }
promoted := entry.Replicas[bestIdx]
result.Promotable = true
ri := entry.Replicas[bestIdx]
result.Candidate = &ri
result.CandidateIdx = bestIdx
return result
}
// EvaluatePromotion returns a read-only preflight result for the named volume
// without mutating the registry. Safe for status/logging/manual promote preview.
func (r *BlockVolumeRegistry) EvaluatePromotion(name string) (PromotionPreflightResult, error) {
r.mu.RLock()
defer r.mu.RUnlock()
entry, ok := r.volumes[name]
if !ok {
return PromotionPreflightResult{VolumeName: name, Reason: "volume not found"}, fmt.Errorf("block volume %q not found", name)
}
return r.evaluatePromotionLocked(entry), nil
}
// applyPromotionLocked applies the promotion of a replica at candidateIdx to primary.
// Caller must hold r.mu (write lock). The promoted replica is removed from Replicas[].
// Old primary is NOT added to Replicas (needs rebuild). Returns the new epoch.
func (r *BlockVolumeRegistry) applyPromotionLocked(entry *BlockVolumeEntry, name string, candidate ReplicaInfo, candidateIdx int) uint64 {
// Remove old primary from byServer index. // Remove old primary from byServer index.
r.removeFromServer(entry.VolumeServer, name) r.removeFromServer(entry.VolumeServer, name)
@ -906,18 +981,21 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
newEpoch := entry.Epoch + 1 newEpoch := entry.Epoch + 1
// Promote replica to primary. // Promote replica to primary.
entry.VolumeServer = promoted.Server
entry.Path = promoted.Path
entry.IQN = promoted.IQN
entry.ISCSIAddr = promoted.ISCSIAddr
entry.NvmeAddr = promoted.NvmeAddr
entry.NQN = promoted.NQN
entry.VolumeServer = candidate.Server
entry.Path = candidate.Path
entry.IQN = candidate.IQN
entry.ISCSIAddr = candidate.ISCSIAddr
entry.NvmeAddr = candidate.NvmeAddr
entry.NQN = candidate.NQN
entry.Epoch = newEpoch entry.Epoch = newEpoch
entry.Role = blockvol.RoleToWire(blockvol.RolePrimary) entry.Role = blockvol.RoleToWire(blockvol.RolePrimary)
entry.LastLeaseGrant = time.Now() entry.LastLeaseGrant = time.Now()
// Clear stale rebuild/publication metadata from old primary (B-11 partial fix).
entry.RebuildListenAddr = ""
// Remove promoted from Replicas. Others stay. // Remove promoted from Replicas. Others stay.
entry.Replicas = append(entry.Replicas[:bestIdx], entry.Replicas[bestIdx+1:]...)
entry.Replicas = append(entry.Replicas[:candidateIdx], entry.Replicas[candidateIdx+1:]...)
// Sync deprecated scalar fields. // Sync deprecated scalar fields.
if len(entry.Replicas) > 0 { if len(entry.Replicas) > 0 {
@ -940,9 +1018,212 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
// Update byServer index: new primary server now hosts this volume. // Update byServer index: new primary server now hosts this volume.
r.addToServer(entry.VolumeServer, name) r.addToServer(entry.VolumeServer, name)
return newEpoch
}
// PromoteBestReplica promotes the best eligible replica to primary.
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
// role must be RoleReplica (not RoleRebuilding), and server must be alive (B-12 fix).
// The promoted replica is removed from Replicas[]. Other replicas stay.
// Old primary is NOT added to Replicas (needs rebuild).
// Returns the new epoch and the preflight result.
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, fmt.Errorf("block volume %q not found", name)
}
pf := r.evaluatePromotionLocked(entry)
if !pf.Promotable {
return 0, fmt.Errorf("block volume %q: %s", name, pf.Reason)
}
promoted := *pf.Candidate
bestIdx := pf.CandidateIdx
newEpoch := r.applyPromotionLocked(entry, name, promoted, bestIdx)
return newEpoch, nil return newEpoch, nil
} }
// evaluateManualPromotionLocked evaluates promotion candidates for a manual promote request.
// Caller must hold r.mu (read or write).
//
// Differences from evaluatePromotionLocked:
// - Primary-alive gate: if !force and current primary is alive, reject with "primary_alive".
// - Target filtering: if targetServer != "", only evaluate that specific replica.
// Returns Reason="target_not_found" if that server is not a replica.
// - Force flag: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag)
// but keeps hard gates (no_heartbeat with zero time, wrong_role, server_dead).
//
// Gate table:
//
// Gate | Normal | Force
// primary_alive | Reject | Skip
// no_heartbeat(0) | Reject | Reject
// stale_heartbeat | Reject | Skip
// wal_lag | Reject | Skip
// wrong_role | Reject | Reject
// server_dead | Reject | Reject
func (r *BlockVolumeRegistry) evaluateManualPromotionLocked(entry *BlockVolumeEntry, targetServer string, force bool) PromotionPreflightResult {
result := PromotionPreflightResult{
VolumeName: entry.Name,
CandidateIdx: -1,
}
// Primary-alive gate (soft — skipped when force=true).
if !force && r.blockServers[entry.VolumeServer] {
result.Reason = "primary_alive"
return result
}
if len(entry.Replicas) == 0 {
result.Reason = "no replicas"
return result
}
// Target filtering: if a specific server is requested, find its index first.
// Return early if not found.
if targetServer != "" {
found := false
for i := range entry.Replicas {
if entry.Replicas[i].Server == targetServer {
found = true
break
}
}
if !found {
result.Reason = "target_not_found"
return result
}
}
now := time.Now()
freshnessCutoff := 2 * entry.LeaseTTL
if freshnessCutoff == 0 {
freshnessCutoff = 60 * time.Second
}
primaryLSN := entry.WALHeadLSN
bestIdx := -1
for i := range entry.Replicas {
ri := &entry.Replicas[i]
// If targeting a specific server, skip all others.
if targetServer != "" && ri.Server != targetServer {
continue
}
// Hard gate: no heartbeat (zero time) — unsafe regardless of force.
if ri.LastHeartbeat.IsZero() {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "no_heartbeat",
})
continue
}
// Soft gate: stale heartbeat — skipped when force=true.
if !force && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "stale_heartbeat",
})
continue
}
// Soft gate: WAL lag — skipped when force=true.
if !force && primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wal_lag",
})
continue
}
// Hard gate: role must be exactly RoleReplica.
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "wrong_role",
})
continue
}
// Hard gate: server must be alive (in blockServers set).
if !r.blockServers[ri.Server] {
result.Rejections = append(result.Rejections, PromotionRejection{
Server: ri.Server,
Reason: "server_dead",
})
continue
}
// Eligible — pick best by health score, tie-break by WALHeadLSN.
if bestIdx == -1 {
bestIdx = i
} else if ri.HealthScore > entry.Replicas[bestIdx].HealthScore {
bestIdx = i
} else if ri.HealthScore == entry.Replicas[bestIdx].HealthScore &&
ri.WALHeadLSN > entry.Replicas[bestIdx].WALHeadLSN {
bestIdx = i
}
}
if bestIdx == -1 {
result.Reason = "no eligible replicas"
if len(result.Rejections) > 0 {
result.Reason += ": " + result.Rejections[0].Reason
if len(result.Rejections) > 1 {
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
}
}
return result
}
result.Promotable = true
ri := entry.Replicas[bestIdx]
result.Candidate = &ri
result.CandidateIdx = bestIdx
return result
}
// ManualPromote promotes a specific replica (or the best eligible replica) to primary.
// Unlike PromoteBestReplica, it accepts operator overrides:
// - targetServer: if non-empty, only that replica is considered.
// - force: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag).
//
// Returns (newEpoch, oldPrimary, oldPath, preflightResult, nil) on success.
// oldPrimary and oldPath are captured under the lock to avoid TOCTOU with
// concurrent auto-failover (BUG-T5-2 fix).
// Returns (0, "", "", preflightResult, err) on rejection or lookup failure.
func (r *BlockVolumeRegistry) ManualPromote(name, targetServer string, force bool) (uint64, string, string, PromotionPreflightResult, error) {
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.volumes[name]
if !ok {
return 0, "", "", PromotionPreflightResult{VolumeName: name, Reason: "volume not found"},
fmt.Errorf("block volume %q not found", name)
}
// Capture old primary info under lock (BUG-T5-2 fix).
oldPrimary := entry.VolumeServer
oldPath := entry.Path
pf := r.evaluateManualPromotionLocked(entry, targetServer, force)
if !pf.Promotable {
return 0, "", "", pf, fmt.Errorf("block volume %q: %s", name, pf.Reason)
}
promoted := *pf.Candidate
candidateIdx := pf.CandidateIdx
newEpoch := r.applyPromotionLocked(entry, name, promoted, candidateIdx)
return newEpoch, oldPrimary, oldPath, pf, nil
}
// MarkBlockCapable records that the given server supports block volumes. // MarkBlockCapable records that the given server supports block volumes.
func (r *BlockVolumeRegistry) MarkBlockCapable(server string) { func (r *BlockVolumeRegistry) MarkBlockCapable(server string) {
r.mu.Lock() r.mu.Lock()
@ -1045,6 +1326,41 @@ func (r *BlockVolumeRegistry) ServerSummaries() []BlockServerSummary {
return summaries return summaries
} }
// IsBlockCapable returns true if the given server is in the block-capable set (alive).
func (r *BlockVolumeRegistry) IsBlockCapable(server string) bool {
r.mu.RLock()
defer r.mu.RUnlock()
return r.blockServers[server]
}
// VolumesWithDeadPrimary returns names of volumes where the given server is a replica
// and the current primary is NOT in the block-capable set (dead/disconnected).
// Used by T2 (B-06) to detect orphaned primaries that need re-promotion.
func (r *BlockVolumeRegistry) VolumesWithDeadPrimary(replicaServer string) []string {
r.mu.RLock()
defer r.mu.RUnlock()
names, ok := r.byServer[replicaServer]
if !ok {
return nil
}
var orphaned []string
for name := range names {
entry := r.volumes[name]
if entry == nil {
continue
}
// Only consider volumes where this server is a replica (not the primary).
if entry.VolumeServer == replicaServer {
continue
}
// Check if the primary server is dead.
if !r.blockServers[entry.VolumeServer] {
orphaned = append(orphaned, name)
}
}
return orphaned
}
// BlockCapableServers returns the list of servers known to support block volumes. // BlockCapableServers returns the list of servers known to support block volumes.
func (r *BlockVolumeRegistry) BlockCapableServers() []string { func (r *BlockVolumeRegistry) BlockCapableServers() []string {
r.mu.RLock() r.mu.RLock()

519
weed/server/master_block_registry_test.go

@ -2,6 +2,7 @@ package weed_server
import ( import (
"fmt" "fmt"
"strings"
"sync" "sync"
"testing" "testing"
"time" "time"
@ -538,6 +539,8 @@ func TestRegistry_RemoveReplica(t *testing.T) {
func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) { func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
r := NewBlockVolumeRegistry() r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{ r.Register(&BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "s1", VolumeServer: "s1",
@ -545,8 +548,8 @@ func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
Epoch: 5, Epoch: 5,
Role: 1, Role: 1,
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
}) })
// Add to byServer for s2 and s3. // Add to byServer for s2 and s3.
@ -592,14 +595,16 @@ func TestRegistry_PromoteBestReplica_NoReplica(t *testing.T) {
func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) { func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
r := NewBlockVolumeRegistry() r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{ r.Register(&BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "s1", VolumeServer: "s1",
Path: "/v1.blk", Path: "/v1.blk",
Epoch: 3, Epoch: 3,
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
}) })
r.mu.Lock() r.mu.Lock()
@ -627,14 +632,16 @@ func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
func TestRegistry_PromoteBestReplica_KeepsOthers(t *testing.T) { func TestRegistry_PromoteBestReplica_KeepsOthers(t *testing.T) {
r := NewBlockVolumeRegistry() r := NewBlockVolumeRegistry()
r.MarkBlockCapable("s2")
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{ r.Register(&BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "s1", VolumeServer: "s1",
Path: "/v1.blk", Path: "/v1.blk",
Epoch: 1, Epoch: 1,
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100},
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
}, },
}) })
r.mu.Lock() r.mu.Lock()
@ -877,6 +884,7 @@ func TestRegistry_PromoteBestReplica_WALLagIneligible(t *testing.T) {
HealthScore: 1.0, HealthScore: 1.0,
WALHeadLSN: 800, // lag=200, tolerance=100 WALHeadLSN: 800, // lag=200, tolerance=100
LastHeartbeat: time.Now(), LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
}, },
}, },
}) })
@ -918,6 +926,8 @@ func TestRegistry_PromoteBestReplica_RebuildingIneligible(t *testing.T) {
// Fix #2: Among eligible replicas, best (health+LSN) wins. // Fix #2: Among eligible replicas, best (health+LSN) wins.
func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) { func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
r := NewBlockVolumeRegistry() r := NewBlockVolumeRegistry()
r.MarkBlockCapable("stale")
r.MarkBlockCapable("good")
r.Register(&BlockVolumeEntry{ r.Register(&BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "primary", VolumeServer: "primary",
@ -939,6 +949,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
HealthScore: 0.8, HealthScore: 0.8,
WALHeadLSN: 95, WALHeadLSN: 95,
LastHeartbeat: time.Now(), LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
}, },
}, },
}) })
@ -956,6 +967,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
// Configurable tolerance: widen tolerance to allow lagging replicas. // Configurable tolerance: widen tolerance to allow lagging replicas.
func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) { func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
r := NewBlockVolumeRegistry() r := NewBlockVolumeRegistry()
r.MarkBlockCapable("lagging")
r.Register(&BlockVolumeEntry{ r.Register(&BlockVolumeEntry{
Name: "vol1", Name: "vol1",
VolumeServer: "primary", VolumeServer: "primary",
@ -970,6 +982,7 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
HealthScore: 1.0, HealthScore: 1.0,
WALHeadLSN: 800, // lag=200 WALHeadLSN: 800, // lag=200
LastHeartbeat: time.Now(), LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
}, },
}, },
}) })
@ -992,6 +1005,236 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
} }
} }
// B-12: PromoteBestReplica rejects dead replica (server not in blockServers).
func TestRegistry_PromoteBestReplica_DeadServerIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
// Intentionally do NOT mark "dead-replica" as block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "dead-replica",
Path: "/data/vol1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
LastHeartbeat: time.Now(),
Role: blockvol.RoleToWire(blockvol.RoleReplica),
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: dead replica should be rejected")
}
if !strings.Contains(err.Error(), "server_dead") {
t.Fatalf("error should mention server_dead, got: %v", err)
}
}
// B-12: Dead replica rejected but alive replica promoted when both exist.
func TestRegistry_PromoteBestReplica_DeadSkipped_AlivePromoted(t *testing.T) {
r := NewBlockVolumeRegistry()
// Only mark s3 as alive.
r.MarkBlockCapable("s3")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "s2-dead", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "s3", Path: "/r2.blk", HealthScore: 0.8, WALHeadLSN: 95, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
newEpoch, err := r.PromoteBestReplica("vol1")
if err != nil {
t.Fatalf("PromoteBestReplica: %v", err)
}
if newEpoch != 2 {
t.Fatalf("newEpoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "s3" {
t.Fatalf("expected alive s3 promoted, got %q", e.VolumeServer)
}
}
// EvaluatePromotion returns read-only preflight without mutating registry.
func TestRegistry_EvaluatePromotion_Basic(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 5,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "replica1", Path: "/r1.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if !pf.Promotable {
t.Fatalf("expected promotable, got reason: %s", pf.Reason)
}
if pf.Candidate == nil || pf.Candidate.Server != "replica1" {
t.Fatalf("expected candidate replica1, got %+v", pf.Candidate)
}
// Registry must be unmutated.
e, _ := r.Lookup("vol1")
if e.VolumeServer != "primary" {
t.Fatal("EvaluatePromotion should not mutate the registry")
}
if e.Epoch != 5 {
t.Fatal("EvaluatePromotion should not bump epoch")
}
}
// EvaluatePromotion with all replicas rejected.
func TestRegistry_EvaluatePromotion_AllRejected(t *testing.T) {
r := NewBlockVolumeRegistry()
// No servers marked as block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
Replicas: []ReplicaInfo{
{Server: "dead1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "dead2", Path: "/r2.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if pf.Promotable {
t.Fatal("expected not promotable")
}
if len(pf.Rejections) != 2 {
t.Fatalf("expected 2 rejections, got %d", len(pf.Rejections))
}
for _, rej := range pf.Rejections {
if rej.Reason != "server_dead" {
t.Fatalf("expected server_dead rejection, got %q", rej.Reason)
}
}
}
// EvaluatePromotion for nonexistent volume.
func TestRegistry_EvaluatePromotion_NotFound(t *testing.T) {
r := NewBlockVolumeRegistry()
_, err := r.EvaluatePromotion("nonexistent")
if err == nil {
t.Fatal("expected error for nonexistent volume")
}
}
// Replica created but never heartbeated is not promotable.
func TestRegistry_PromoteBestReplica_NoHeartbeatIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "replica1",
Path: "/r1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
// LastHeartbeat: zero — never heartbeated
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: replica with no heartbeat should be rejected")
}
if !strings.Contains(err.Error(), "no_heartbeat") {
t.Fatalf("error should mention no_heartbeat, got: %v", err)
}
}
// Replica with unset (zero) role is not promotable.
func TestRegistry_PromoteBestReplica_UnsetRoleIneligible(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
LeaseTTL: 30 * time.Second,
WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{
Server: "replica1",
Path: "/r1.blk",
HealthScore: 1.0,
WALHeadLSN: 100,
LastHeartbeat: time.Now(),
// Role: 0 — unset/RoleNone
},
},
})
_, err := r.PromoteBestReplica("vol1")
if err == nil {
t.Fatal("expected error: replica with unset role should be rejected")
}
if !strings.Contains(err.Error(), "wrong_role") {
t.Fatalf("error should mention wrong_role, got: %v", err)
}
}
// PromoteBestReplica clears RebuildListenAddr on promotion (B-11 partial fix).
func TestRegistry_PromoteBestReplica_ClearsRebuildAddr(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("replica1")
r.Register(&BlockVolumeEntry{
Name: "vol1",
VolumeServer: "primary",
Path: "/data/vol1.blk",
Epoch: 1,
RebuildListenAddr: "primary:15000",
Replicas: []ReplicaInfo{
{Server: "replica1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, err := r.PromoteBestReplica("vol1")
if err != nil {
t.Fatalf("PromoteBestReplica: %v", err)
}
e, _ := r.Lookup("vol1")
if e.RebuildListenAddr != "" {
t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", e.RebuildListenAddr)
}
}
// --- LeaseGrants --- // --- LeaseGrants ---
func TestRegistry_LeaseGrants_PrimaryOnly(t *testing.T) { func TestRegistry_LeaseGrants_PrimaryOnly(t *testing.T) {
@ -1110,3 +1353,267 @@ func TestRegistry_LeaseGrants_UnknownServer(t *testing.T) {
t.Fatalf("expected nil for unknown server, got %+v", grants) t.Fatalf("expected nil for unknown server, got %+v", grants)
} }
} }
// ============================================================
// CP11B-3 T2: IsBlockCapable + VolumesWithDeadPrimary
// ============================================================
func TestRegistry_IsBlockCapable(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1:8080")
if !r.IsBlockCapable("vs1:8080") {
t.Fatal("vs1 should be block-capable")
}
if r.IsBlockCapable("vs2:8080") {
t.Fatal("vs2 should NOT be block-capable")
}
r.UnmarkBlockCapable("vs1:8080")
if r.IsBlockCapable("vs1:8080") {
t.Fatal("vs1 should no longer be block-capable after unmark")
}
}
func TestRegistry_VolumesWithDeadPrimary_Basic(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.MarkBlockCapable("vs2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive,
Replicas: []ReplicaInfo{{Server: "vs2", Path: "/data/vol1.blk"}},
})
// Both alive → no orphans.
orphaned := r.VolumesWithDeadPrimary("vs2")
if len(orphaned) != 0 {
t.Fatalf("expected 0 orphaned volumes, got %d", len(orphaned))
}
// Kill primary.
r.UnmarkBlockCapable("vs1")
orphaned = r.VolumesWithDeadPrimary("vs2")
if len(orphaned) != 1 || orphaned[0] != "vol1" {
t.Fatalf("expected [vol1], got %v", orphaned)
}
}
func TestRegistry_VolumesWithDeadPrimary_PrimaryServer_NotIncluded(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive,
})
// vs1 is the primary for vol1 — should NOT appear in orphaned list for vs1.
orphaned := r.VolumesWithDeadPrimary("vs1")
if len(orphaned) != 0 {
t.Fatalf("primary server should not appear in its own orphan list, got %v", orphaned)
}
}
// T6: EvaluatePromotion preflight includes primary liveness.
func TestRegistry_EvaluatePromotion_PrimaryDead_StillShowsCandidate(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("vs1")
r.MarkBlockCapable("vs2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
Status: StatusActive, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{{
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
}},
})
// Kill primary but keep vs2 alive.
r.UnmarkBlockCapable("vs1")
pf, err := r.EvaluatePromotion("vol1")
if err != nil {
t.Fatalf("EvaluatePromotion: %v", err)
}
if !pf.Promotable {
t.Fatalf("should be promotable (vs2 alive), reason=%s", pf.Reason)
}
if pf.Candidate.Server != "vs2" {
t.Fatalf("candidate should be vs2, got %q", pf.Candidate.Server)
}
}
// ============================================================
// CP11B-3 T5: ManualPromote Dev Tests
// ============================================================
// T5: ManualPromote with empty target → auto-picks best candidate.
func TestRegistry_ManualPromote_AutoTarget(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("best")
r.MarkBlockCapable("worse")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second, WALHeadLSN: 100,
Replicas: []ReplicaInfo{
{Server: "worse", Path: "/r1.blk", HealthScore: 0.5, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "best", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Primary not block-capable → non-force should still pass (primary_alive gate won't trigger).
newEpoch, _, _, pf, err := r.ManualPromote("vol1", "", false)
if err != nil {
t.Fatalf("ManualPromote: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
if !pf.Promotable {
t.Fatal("should be promotable")
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "best" {
t.Fatalf("expected 'best' promoted, got %q", e.VolumeServer)
}
}
// T5: ManualPromote targets a specific replica (not the best by health).
func TestRegistry_ManualPromote_SpecificTarget(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("r1")
r.MarkBlockCapable("r2")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
{Server: "r2", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 50,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Target r2 specifically (worse health).
newEpoch, _, _, _, err := r.ManualPromote("vol1", "r2", false)
if err != nil {
t.Fatalf("ManualPromote: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "r2" {
t.Fatalf("expected r2 promoted (specific target), got %q", e.VolumeServer)
}
}
// T5: ManualPromote with non-existent target → error.
func TestRegistry_ManualPromote_TargetNotFound(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "nonexistent", false)
if err == nil {
t.Fatal("expected error for nonexistent target")
}
if pf.Reason != "target_not_found" {
t.Fatalf("expected target_not_found, got %q", pf.Reason)
}
}
// T5: ManualPromote non-force with alive primary → rejected.
func TestRegistry_ManualPromote_PrimaryAlive_Rejected(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("primary")
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "", false)
if err == nil {
t.Fatal("expected rejection when primary alive and !force")
}
if pf.Reason != "primary_alive" {
t.Fatalf("expected primary_alive, got %q", pf.Reason)
}
// Verify no mutation.
e, _ := r.Lookup("vol1")
if e.VolumeServer != "primary" {
t.Fatalf("primary should not change, got %q", e.VolumeServer)
}
}
// T5: Force bypasses stale heartbeat and primary_alive gates.
func TestRegistry_ManualPromote_Force_StaleHeartbeat(t *testing.T) {
r := NewBlockVolumeRegistry()
r.MarkBlockCapable("primary")
r.MarkBlockCapable("r1")
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now().Add(-10 * time.Minute), // stale
Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
// Non-force: would fail on primary_alive.
// Force: bypasses primary_alive AND stale_heartbeat.
newEpoch, _, _, _, err := r.ManualPromote("vol1", "", true)
if err != nil {
t.Fatalf("force ManualPromote should succeed: %v", err)
}
if newEpoch != 2 {
t.Fatalf("epoch: got %d, want 2", newEpoch)
}
e, _ := r.Lookup("vol1")
if e.VolumeServer != "r1" {
t.Fatalf("expected r1 promoted via force, got %q", e.VolumeServer)
}
}
// T5: Force does NOT bypass server_dead (hard gate).
func TestRegistry_ManualPromote_Force_StillRejectsDeadServer(t *testing.T) {
r := NewBlockVolumeRegistry()
// "dead" is NOT marked block-capable.
r.Register(&BlockVolumeEntry{
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
Epoch: 1, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{
{Server: "dead", Path: "/r1.blk", HealthScore: 1.0,
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
},
})
_, _, _, pf, err := r.ManualPromote("vol1", "dead", true)
if err == nil {
t.Fatal("force should NOT bypass server_dead")
}
if len(pf.Rejections) == 0 || pf.Rejections[0].Reason != "server_dead" {
t.Fatalf("expected server_dead rejection, got %+v", pf.Rejections)
}
}

3
weed/server/master_grpc_server.go

@ -278,6 +278,9 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ
// on subsequent heartbeats), never both in the same message. // on subsequent heartbeats), never both in the same message.
if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes { if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes {
ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos) ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos)
// T2 (B-06): After updating registry from heartbeat, check if this server
// is a replica for any volume whose primary is dead. If so, promote.
ms.reevaluateOrphanedPrimaries(dn.Url())
} else if len(heartbeat.NewBlockVolumes) > 0 || len(heartbeat.DeletedBlockVolumes) > 0 { } else if len(heartbeat.NewBlockVolumes) > 0 || len(heartbeat.DeletedBlockVolumes) > 0 {
ms.blockRegistry.UpdateDeltaHeartbeat(dn.Url(), heartbeat.NewBlockVolumes, heartbeat.DeletedBlockVolumes) ms.blockRegistry.UpdateDeltaHeartbeat(dn.Url(), heartbeat.NewBlockVolumes, heartbeat.DeletedBlockVolumes)
} }

23
weed/server/master_grpc_server_block.go

@ -283,14 +283,16 @@ func (ms *MasterServer) tryCreateOneReplica(ctx context.Context, req *master_pb.
entry.RebuildListenAddr = primaryResult.RebuildListenAddr entry.RebuildListenAddr = primaryResult.RebuildListenAddr
// CP8-2: populate Replicas[]. // CP8-2: populate Replicas[].
entry.Replicas = append(entry.Replicas, ReplicaInfo{ entry.Replicas = append(entry.Replicas, ReplicaInfo{
Server: replicaServerStr,
Path: replicaResult.Path,
ISCSIAddr: replicaResult.ISCSIAddr,
IQN: replicaResult.IQN,
NvmeAddr: replicaResult.NvmeAddr,
NQN: replicaResult.NQN,
DataAddr: replicaResult.ReplicaDataAddr,
CtrlAddr: replicaResult.ReplicaCtrlAddr,
Server: replicaServerStr,
Path: replicaResult.Path,
ISCSIAddr: replicaResult.ISCSIAddr,
IQN: replicaResult.IQN,
NvmeAddr: replicaResult.NvmeAddr,
NQN: replicaResult.NQN,
DataAddr: replicaResult.ReplicaDataAddr,
CtrlAddr: replicaResult.ReplicaCtrlAddr,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}) })
return replicaServerStr return replicaServerStr
} }
@ -409,6 +411,11 @@ func (ms *MasterServer) ExpandBlockVolume(ctx context.Context, req *master_pb.Ex
} }
}() }()
// Test-only hook: inject failover between lock acquisition and re-read.
if ms.expandPreReadHook != nil {
ms.expandPreReadHook()
}
// B-09: Re-read entry after acquiring expand lock. Between the initial // B-09: Re-read entry after acquiring expand lock. Between the initial
// Lookup and AcquireExpandInflight, failover may have changed VolumeServer // Lookup and AcquireExpandInflight, failover may have changed VolumeServer
// or Replicas. Using the stale snapshot would send PREPARE to dead nodes. // or Replicas. Using the stale snapshot would send PREPARE to dead nodes.

71
weed/server/master_grpc_server_block_test.go

@ -10,6 +10,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
) )
// testMasterServer creates a minimal MasterServer with mock VS calls for testing. // testMasterServer creates a minimal MasterServer with mock VS calls for testing.
@ -1112,6 +1113,9 @@ func TestMaster_NoNvmeFieldsWhenDisabled(t *testing.T) {
func TestMaster_PromotionCopiesNvmeFields(t *testing.T) { func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
ms := testMasterServer(t) ms := testMasterServer(t)
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333")
// Directly register an entry with primary + replica, both having NVMe fields. // Directly register an entry with primary + replica, both having NVMe fields.
ms.blockRegistry.Register(&BlockVolumeEntry{ ms.blockRegistry.Register(&BlockVolumeEntry{
@ -1128,16 +1132,18 @@ func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
LeaseTTL: 30 * time.Second, LeaseTTL: 30 * time.Second,
Replicas: []ReplicaInfo{ Replicas: []ReplicaInfo{
{ {
Server: "vs2:9333",
Path: "/data/ha-vol.blk",
IQN: "iqn.2024.test:ha-vol-r",
ISCSIAddr: "vs2:3260",
NvmeAddr: "vs2:4420",
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
DataAddr: "vs2:14260",
CtrlAddr: "vs2:14261",
HealthScore: 0.95,
WALHeadLSN: 100,
Server: "vs2:9333",
Path: "/data/ha-vol.blk",
IQN: "iqn.2024.test:ha-vol-r",
ISCSIAddr: "vs2:3260",
NvmeAddr: "vs2:4420",
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
DataAddr: "vs2:14260",
CtrlAddr: "vs2:14261",
HealthScore: 0.95,
WALHeadLSN: 100,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}, },
}, },
}) })
@ -1654,10 +1660,11 @@ func TestMaster_ExpandCoordinated_RestartRecovery(t *testing.T) {
} }
func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) { func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
// B-09: If failover changes VolumeServer between initial Lookup and
// AcquireExpandInflight, the coordinator must use the fresh entry,
// not the stale one. Use RF=3 so promotion still leaves 1 replica
// and the coordinated path is taken.
// B-09: Exercises the actual race window — failover happens BETWEEN
// the initial Lookup (line 380) and the post-lock re-read (line 419).
// Uses expandPreReadHook to inject PromoteBestReplica at the exact
// interleaving point. RF=3 so promotion leaves 1 replica and the
// coordinated path is taken.
ms := testMasterServerWithExpandMocks(t) ms := testMasterServerWithExpandMocks(t)
ms.blockRegistry.MarkBlockCapable("vs1:9333") ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333") ms.blockRegistry.MarkBlockCapable("vs2:9333")
@ -1689,31 +1696,39 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
return 2 << 30, nil return 2 << 30, nil
} }
// Simulate failover: promote best replica. With RF=3, one replica
// becomes primary and the other stays as replica → coordinated path.
ms.blockRegistry.PromoteBestReplica("b09-vol")
entry, _ = ms.blockRegistry.Lookup("b09-vol")
newPrimary := entry.VolumeServer
if newPrimary == originalPrimary {
t.Fatal("promotion didn't change primary")
}
if len(entry.Replicas) == 0 {
t.Fatal("expected at least 1 replica after RF=3 promotion")
// Hook fires AFTER AcquireExpandInflight but BEFORE the re-read Lookup.
// This is the exact race window: the initial Lookup already returned
// the old primary, but failover changes it before the re-read.
hookFired := false
ms.expandPreReadHook = func() {
hookFired = true
ms.blockRegistry.PromoteBestReplica("b09-vol")
} }
// Expand should use the NEW primary (post-failover), not the old one.
// At this point, the initial Lookup inside ExpandBlockVolume will see
// originalPrimary. The hook then promotes, changing the primary.
// The re-read must pick up the new primary.
resp, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{ resp, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "b09-vol", NewSizeBytes: 2 << 30, Name: "b09-vol", NewSizeBytes: 2 << 30,
}) })
if err != nil { if err != nil {
t.Fatalf("expand: %v", err) t.Fatalf("expand: %v", err)
} }
if !hookFired {
t.Fatal("expandPreReadHook was not called — race window not exercised")
}
if resp.CapacityBytes != 2<<30 { if resp.CapacityBytes != 2<<30 {
t.Fatalf("capacity: got %d", resp.CapacityBytes) t.Fatalf("capacity: got %d", resp.CapacityBytes)
} }
// First PREPARE should have gone to the new primary, not the old one.
// Verify: after the hook promoted, the re-read must have picked up
// the new primary. The first PREPARE should go to the new primary.
entry, _ = ms.blockRegistry.Lookup("b09-vol")
newPrimary := entry.VolumeServer
if newPrimary == originalPrimary {
t.Fatal("promotion didn't change primary")
}
if len(preparedServers) == 0 { if len(preparedServers) == 0 {
t.Fatal("no prepare calls recorded") t.Fatal("no prepare calls recorded")
} }
@ -1721,7 +1736,7 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
t.Fatalf("PREPARE went to %q (stale), should go to %q (fresh primary)", t.Fatalf("PREPARE went to %q (stale), should go to %q (fresh primary)",
preparedServers[0], newPrimary) preparedServers[0], newPrimary)
} }
// Verify old primary was NOT contacted.
// Verify old primary was NOT contacted at all.
for _, s := range preparedServers { for _, s := range preparedServers {
if s == originalPrimary { if s == originalPrimary {
t.Fatalf("PREPARE sent to old primary %q — stale entry used", originalPrimary) t.Fatalf("PREPARE sent to old primary %q — stale entry used", originalPrimary)

6
weed/server/master_server.go

@ -109,6 +109,10 @@ type MasterServer struct {
blockVSCommitExpand func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) blockVSCommitExpand func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error)
blockVSCancelExpand func(ctx context.Context, server string, name string, expandEpoch uint64) error blockVSCancelExpand func(ctx context.Context, server string, name string, expandEpoch uint64) error
nextExpandEpoch atomic.Uint64 nextExpandEpoch atomic.Uint64
// Test-only hook: called after AcquireExpandInflight but before the
// re-read Lookup in coordinated expand. Nil in production.
expandPreReadHook func()
} }
func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer { func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer {
@ -224,6 +228,8 @@ func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.Se
r.HandleFunc("/block/volume/{name}", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeLookupHandler))).Methods("GET") r.HandleFunc("/block/volume/{name}", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeLookupHandler))).Methods("GET")
r.HandleFunc("/block/volumes", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeListHandler))).Methods("GET") r.HandleFunc("/block/volumes", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeListHandler))).Methods("GET")
r.HandleFunc("/block/volume/{name}/expand", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeExpandHandler)))).Methods("POST") r.HandleFunc("/block/volume/{name}/expand", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeExpandHandler)))).Methods("POST")
r.HandleFunc("/block/volume/{name}/preflight", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePreflightHandler))).Methods("GET")
r.HandleFunc("/block/volume/{name}/promote", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePromoteHandler)))).Methods("POST")
r.HandleFunc("/block/assign", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockAssignHandler)))).Methods("POST") r.HandleFunc("/block/assign", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockAssignHandler)))).Methods("POST")
r.HandleFunc("/block/servers", ms.guard.WhiteList(requestIDMiddleware(ms.blockServersHandler))).Methods("GET") r.HandleFunc("/block/servers", ms.guard.WhiteList(requestIDMiddleware(ms.blockServersHandler))).Methods("GET")
r.HandleFunc("/block/status", ms.guard.WhiteList(requestIDMiddleware(ms.blockStatusHandler))).Methods("GET") r.HandleFunc("/block/status", ms.guard.WhiteList(requestIDMiddleware(ms.blockStatusHandler))).Methods("GET")

96
weed/server/master_server_handlers_block.go

@ -7,6 +7,7 @@ import (
"github.com/gorilla/mux" "github.com/gorilla/mux"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockapi" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockapi"
@ -206,6 +207,99 @@ func (ms *MasterServer) blockStatusHandler(w http.ResponseWriter, r *http.Reques
writeJsonQuiet(w, r, http.StatusOK, status) writeJsonQuiet(w, r, http.StatusOK, status)
} }
// blockVolumePreflightHandler handles GET /block/volume/{name}/preflight.
// Returns a read-only promotion preflight evaluation for the named volume.
func (ms *MasterServer) blockVolumePreflightHandler(w http.ResponseWriter, r *http.Request) {
name := mux.Vars(r)["name"]
if name == "" {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
return
}
pf, err := ms.blockRegistry.EvaluatePromotion(name)
if err != nil {
writeJsonError(w, r, http.StatusNotFound, err)
return
}
resp := blockapi.PreflightResponse{
VolumeName: pf.VolumeName,
Promotable: pf.Promotable,
Reason: pf.Reason,
}
if pf.Candidate != nil {
resp.CandidateServer = pf.Candidate.Server
resp.CandidateHealth = pf.Candidate.HealthScore
resp.CandidateWALLSN = pf.Candidate.WALHeadLSN
}
for _, rej := range pf.Rejections {
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
Server: rej.Server,
Reason: rej.Reason,
})
}
// Add primary liveness info.
entry, ok := ms.blockRegistry.Lookup(name)
if ok {
resp.PrimaryServer = entry.VolumeServer
resp.PrimaryAlive = ms.blockRegistry.IsBlockCapable(entry.VolumeServer)
}
writeJsonQuiet(w, r, http.StatusOK, resp)
}
// blockVolumePromoteHandler handles POST /block/volume/{name}/promote.
// Triggers a manual promotion for the named block volume.
func (ms *MasterServer) blockVolumePromoteHandler(w http.ResponseWriter, r *http.Request) {
name := mux.Vars(r)["name"]
if name == "" {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
return
}
var req blockapi.PromoteVolumeRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("decode request: %w", err))
return
}
// ManualPromote captures oldPrimary/oldPath under lock to avoid TOCTOU (BUG-T5-2).
newEpoch, oldPrimary, oldPath, pf, err := ms.blockRegistry.ManualPromote(name, req.TargetServer, req.Force)
if err != nil {
// Distinguish not-found from rejection.
status := http.StatusConflict
if pf.Reason == "volume not found" {
status = http.StatusNotFound
}
// Build structured rejection response.
resp := blockapi.PromoteVolumeResponse{
Reason: pf.Reason,
}
for _, rej := range pf.Rejections {
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
Server: rej.Server,
Reason: rej.Reason,
})
}
glog.V(0).Infof("manual promote %q rejected: %s", name, pf.Reason)
writeJsonQuiet(w, r, status, resp)
return
}
// Post-promotion orchestration (same as auto path).
ms.finalizePromotion(name, oldPrimary, oldPath, newEpoch)
if req.Reason != "" {
glog.V(0).Infof("manual promote %q: reason=%q", name, req.Reason)
}
// Re-read to get the new primary server name.
entry, _ := ms.blockRegistry.Lookup(name)
writeJsonQuiet(w, r, http.StatusOK, blockapi.PromoteVolumeResponse{
NewPrimary: entry.VolumeServer,
Epoch: newEpoch,
})
}
// entryToVolumeInfo converts a BlockVolumeEntry to a blockapi.VolumeInfo. // entryToVolumeInfo converts a BlockVolumeEntry to a blockapi.VolumeInfo.
func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo { func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
status := "pending" status := "pending"
@ -239,6 +333,8 @@ func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
HealthScore: e.HealthScore, HealthScore: e.HealthScore,
ReplicaDegraded: e.ReplicaDegraded, ReplicaDegraded: e.ReplicaDegraded,
DurabilityMode: durMode, DurabilityMode: durMode,
NvmeAddr: e.NvmeAddr,
NQN: e.NQN,
} }
for _, ri := range e.Replicas { for _, ri := range e.Replicas {
info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{ info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{

1581
weed/server/qa_block_cp11b3_adversarial_test.go
File diff suppressed because it is too large
View File

25
weed/server/qa_block_cp63_test.go

@ -40,6 +40,11 @@ func testMSForQA(t *testing.T) *MasterServer {
// registerQAVolume creates a volume entry with optional replica, configurable lease state. // registerQAVolume creates a volume entry with optional replica, configurable lease state.
func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration, leaseExpired bool) { func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration, leaseExpired bool) {
t.Helper() t.Helper()
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
ms.blockRegistry.MarkBlockCapable(primary)
if replica != "" {
ms.blockRegistry.MarkBlockCapable(replica)
}
entry := &BlockVolumeEntry{ entry := &BlockVolumeEntry{
Name: name, Name: name,
VolumeServer: primary, VolumeServer: primary,
@ -65,11 +70,13 @@ func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica str
// CP8-2: also populate Replicas[]. // CP8-2: also populate Replicas[].
entry.Replicas = []ReplicaInfo{ entry.Replicas = []ReplicaInfo{
{ {
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Server: replica,
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
ISCSIAddr: replica + ":3260",
HealthScore: 1.0,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
LastHeartbeat: time.Now(),
}, },
} }
} }
@ -398,7 +405,15 @@ func TestQA_Failover_PromoteIdempotent_NoReplicaAfterFirstSwap(t *testing.T) {
// Reconnect vs1 first so it becomes a replica. // Reconnect vs1 first so it becomes a replica.
ms.recoverBlockVolumes("vs1") ms.recoverBlockVolumes("vs1")
// Simulate rebuild completion: mark vs1 as a healthy replica.
e, _ := ms.blockRegistry.Lookup("vol1") e, _ := ms.blockRegistry.Lookup("vol1")
for i := range e.Replicas {
if e.Replicas[i].Server == "vs1" {
e.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
e.Replicas[i].LastHeartbeat = time.Now()
e.Replicas[i].HealthScore = 1.0
}
}
e.LastLeaseGrant = time.Now().Add(-1 * time.Minute) // expire the new lease e.LastLeaseGrant = time.Now().Add(-1 * time.Minute) // expire the new lease
ms.failoverBlockVolumes("vs2") ms.failoverBlockVolumes("vs2")

485
weed/server/qa_block_expand_adversarial_test.go

@ -0,0 +1,485 @@
package weed_server
import (
"context"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
// ============================================================
// CP11A-2 Adversarial Test Suite: B-09 + B-10
//
// 8 scenarios stress-testing the coordinated expand path under
// failover, concurrent heartbeats, and partial failures.
// ============================================================
// qaExpandMaster creates a MasterServer with 3 block-capable servers
// and default expand mocks for adversarial testing.
func qaExpandMaster(t *testing.T) *MasterServer {
t.Helper()
ms := &MasterServer{
blockRegistry: NewBlockVolumeRegistry(),
blockAssignmentQueue: NewBlockAssignmentQueue(),
blockFailover: newBlockFailoverState(),
}
ms.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
return &blockAllocResult{
Path: fmt.Sprintf("/data/%s.blk", name),
IQN: fmt.Sprintf("iqn.2024.test:%s", name),
ISCSIAddr: server + ":3260",
ReplicaDataAddr: server + ":14260",
ReplicaCtrlAddr: server + ":14261",
RebuildListenAddr: server + ":15000",
}, nil
}
ms.blockVSDelete = func(ctx context.Context, server string, name string) error {
return nil
}
ms.blockVSExpand = func(ctx context.Context, server string, name string, newSize uint64) (uint64, error) {
return newSize, nil
}
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
return nil
}
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
return 2 << 30, nil
}
ms.blockVSCancelExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) error {
return nil
}
ms.blockRegistry.MarkBlockCapable("vs1:9333")
ms.blockRegistry.MarkBlockCapable("vs2:9333")
ms.blockRegistry.MarkBlockCapable("vs3:9333")
return ms
}
// qaCreateRF creates a volume with the given replica factor.
func qaCreateRF(t *testing.T, ms *MasterServer, name string, rf uint32) {
t.Helper()
_, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
Name: name,
SizeBytes: 1 << 30,
ReplicaFactor: rf,
})
if err != nil {
t.Fatalf("create %s RF=%d: %v", name, rf, err)
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-1: ExpandAfterDoubleFailover_RF3
//
// RF=3 volume. Primary dies → promote replica A. Then replica A
// (now primary) dies → promote replica B. Expand must reach
// replica B (the second-generation primary), not the original.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ExpandAfterDoubleFailover_RF3(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "dbl-failover", 3)
entry, _ := ms.blockRegistry.Lookup("dbl-failover")
gen0Primary := entry.VolumeServer
// First failover: kill original primary.
ms.blockRegistry.PromoteBestReplica("dbl-failover")
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
gen1Primary := entry.VolumeServer
if gen1Primary == gen0Primary {
t.Fatal("first promotion didn't change primary")
}
// Second failover: kill gen1 primary.
// Need to ensure the remaining replica has a fresh heartbeat.
if len(entry.Replicas) == 0 {
t.Fatal("no replicas left after first promotion (need RF=3)")
}
ms.blockRegistry.PromoteBestReplica("dbl-failover")
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
gen2Primary := entry.VolumeServer
if gen2Primary == gen1Primary || gen2Primary == gen0Primary {
t.Fatalf("second promotion should pick a new server, got %q (gen0=%q gen1=%q)",
gen2Primary, gen0Primary, gen1Primary)
}
// Track PREPARE targets.
var preparedServers []string
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
preparedServers = append(preparedServers, server)
return nil
}
// Expand — standalone path since no replicas remain after 2 promotions.
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "dbl-failover", NewSizeBytes: 2 << 30,
})
if err != nil {
t.Fatalf("expand: %v", err)
}
// If standalone path was taken (no replicas), preparedServers is empty — that's fine.
// If coordinated path was taken, first PREPARE must target gen2Primary.
if len(preparedServers) > 0 && preparedServers[0] != gen2Primary {
t.Fatalf("PREPARE went to %q, want gen2 primary %q", preparedServers[0], gen2Primary)
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-2: ExpandSeesDeletedVolume_AfterLockAcquire
//
// Volume is deleted between the initial Lookup (succeeds) and
// the re-read after AcquireExpandInflight. The re-read must
// detect the deletion and fail cleanly.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ExpandSeesDeletedVolume_AfterLockAcquire(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "disappear", 2)
// Hook PREPARE to delete the volume before it runs.
// The B-09 re-read happens before PREPARE, so we simulate deletion
// between initial Lookup and AcquireExpandInflight by having a
// goroutine that deletes the entry while expand is in progress.
// Instead, test directly: acquire expand lock, then unregister, then
// call ExpandBlockVolume — it should fail on re-read.
// Acquire expand lock manually first so the real call gets blocked.
// Then verify the error path by attempting a second expand.
if !ms.blockRegistry.AcquireExpandInflight("disappear", 2<<30, 1) {
t.Fatal("AcquireExpandInflight should succeed")
}
// Try another expand while locked — should fail with "already in progress".
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "disappear", NewSizeBytes: 2 << 30,
})
if err == nil {
t.Fatal("expand should fail when lock is held")
}
// Release and delete the volume.
ms.blockRegistry.ReleaseExpandInflight("disappear")
ms.blockRegistry.Unregister("disappear")
// Now expand on a deleted volume — should fail on initial Lookup.
_, err = ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "disappear", NewSizeBytes: 2 << 30,
})
if err == nil {
t.Fatal("expand on deleted volume should fail")
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-3: ConcurrentExpandAndFailover
//
// Expand and failover race on the same volume. Neither should
// panic, and the volume must be in a consistent state afterward.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ConcurrentExpandAndFailover(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "race-vol", 3)
entry, _ := ms.blockRegistry.Lookup("race-vol")
primary := entry.VolumeServer
// Make PREPARE slow so expand holds the lock longer.
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(5 * time.Millisecond)
return nil
}
var wg sync.WaitGroup
// Goroutine 1: expand.
wg.Add(1)
go func() {
defer wg.Done()
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "race-vol", NewSizeBytes: 2 << 30,
})
// Error is OK — we're testing for panics and consistency.
}()
// Goroutine 2: failover kills primary.
wg.Add(1)
go func() {
defer wg.Done()
time.Sleep(2 * time.Millisecond) // slight delay to let expand start
ms.failoverBlockVolumes(primary)
}()
wg.Wait()
// Volume must still exist regardless of outcome.
_, ok := ms.blockRegistry.Lookup("race-vol")
if !ok {
t.Fatal("volume must survive concurrent expand + failover")
}
}
// ────────────────────────────────────────────────────────────
// QA-B09-4: ConcurrentExpandsSameVolume
//
// Two goroutines try to expand the same volume simultaneously.
// Exactly one should succeed, the other should get "already in
// progress". No panic, no double-commit.
// ────────────────────────────────────────────────────────────
func TestQA_B09_ConcurrentExpandsSameVolume(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "dup-expand", 2)
var commitCount atomic.Int32
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(5 * time.Millisecond) // slow prepare
return nil
}
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
commitCount.Add(1)
return 2 << 30, nil
}
var wg sync.WaitGroup
var successes atomic.Int32
var failures atomic.Int32
for i := 0; i < 2; i++ {
wg.Add(1)
go func() {
defer wg.Done()
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "dup-expand", NewSizeBytes: 2 << 30,
})
if err == nil {
successes.Add(1)
} else {
failures.Add(1)
}
}()
}
wg.Wait()
if successes.Load() != 1 {
t.Fatalf("expected exactly 1 success, got %d", successes.Load())
}
if failures.Load() != 1 {
t.Fatalf("expected exactly 1 failure (already in progress), got %d", failures.Load())
}
}
// ────────────────────────────────────────────────────────────
// QA-B10-1: RepeatedEmptyHeartbeats_DuringExpand
//
// Multiple empty heartbeats from the primary during expand.
// Entry must survive all of them — not just the first.
// ────────────────────────────────────────────────────────────
func TestQA_B10_RepeatedEmptyHeartbeats_DuringExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "multi-hb", 2)
entry, _ := ms.blockRegistry.Lookup("multi-hb")
primary := entry.VolumeServer
if !ms.blockRegistry.AcquireExpandInflight("multi-hb", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
// 10 empty heartbeats from the primary — each one would delete
// the entry without the B-10 guard.
for i := 0; i < 10; i++ {
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
}
_, ok := ms.blockRegistry.Lookup("multi-hb")
if !ok {
t.Fatal("entry deleted after repeated empty heartbeats during expand")
}
ms.blockRegistry.ReleaseExpandInflight("multi-hb")
}
// ────────────────────────────────────────────────────────────
// QA-B10-2: ExpandFailed_HeartbeatStillProtected
//
// After MarkExpandFailed (primary committed, replica didn't),
// empty heartbeats must NOT delete the entry. ExpandFailed
// keeps ExpandInProgress=true as a size-suppression guard.
// ────────────────────────────────────────────────────────────
func TestQA_B10_ExpandFailed_HeartbeatStillProtected(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "fail-hb", 2)
entry, _ := ms.blockRegistry.Lookup("fail-hb")
primary := entry.VolumeServer
if !ms.blockRegistry.AcquireExpandInflight("fail-hb", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
ms.blockRegistry.MarkExpandFailed("fail-hb")
// Empty heartbeat should not delete — ExpandFailed keeps ExpandInProgress=true.
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
e, ok := ms.blockRegistry.Lookup("fail-hb")
if !ok {
t.Fatal("entry deleted during ExpandFailed state")
}
if !e.ExpandFailed {
t.Fatal("ExpandFailed should still be true")
}
if !e.ExpandInProgress {
t.Fatal("ExpandInProgress should still be true")
}
// After ClearExpandFailed, empty heartbeat should delete normally.
ms.blockRegistry.ClearExpandFailed("fail-hb")
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
_, ok = ms.blockRegistry.Lookup("fail-hb")
if ok {
t.Fatal("entry should be deleted after ClearExpandFailed + empty heartbeat")
}
}
// ────────────────────────────────────────────────────────────
// QA-B10-3: HeartbeatSizeSuppress_DuringExpand
//
// Primary reports a stale (old) size during coordinated expand.
// Registry must NOT downgrade SizeBytes — the pending expand
// size is authoritative until commit or release.
// ────────────────────────────────────────────────────────────
func TestQA_B10_HeartbeatSizeSuppress_DuringExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "size-suppress", 2)
entry, _ := ms.blockRegistry.Lookup("size-suppress")
primary := entry.VolumeServer
origSize := entry.SizeBytes
if !ms.blockRegistry.AcquireExpandInflight("size-suppress", 2<<30, 42) {
t.Fatal("acquire expand lock")
}
// Heartbeat reports old size (expand hasn't committed on VS yet).
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/size-suppress.blk",
VolumeSize: origSize, // old size
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
},
})
entry, _ = ms.blockRegistry.Lookup("size-suppress")
if entry.SizeBytes != origSize {
t.Fatalf("size should remain %d during expand, got %d", origSize, entry.SizeBytes)
}
// Heartbeat reports a LARGER size (stale from previous expand or bug).
// Still must not update — coordinated expand owns the size.
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/size-suppress.blk",
VolumeSize: 5 << 30, // bogus large size
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
},
})
entry, _ = ms.blockRegistry.Lookup("size-suppress")
if entry.SizeBytes != origSize {
t.Fatalf("size should remain %d (suppressed), got %d", origSize, entry.SizeBytes)
}
ms.blockRegistry.ReleaseExpandInflight("size-suppress")
}
// ────────────────────────────────────────────────────────────
// QA-B10-4: ConcurrentHeartbeatsAndExpand
//
// Simultaneous full heartbeats from primary and replicas while
// expand runs on another goroutine. Must not panic, must not
// orphan the entry, and expand must either succeed or fail
// cleanly with a clear error.
// ────────────────────────────────────────────────────────────
func TestQA_B10_ConcurrentHeartbeatsAndExpand(t *testing.T) {
ms := qaExpandMaster(t)
qaCreateRF(t, ms, "hb-expand-race", 2)
entry, _ := ms.blockRegistry.Lookup("hb-expand-race")
primary := entry.VolumeServer
replica := ""
if len(entry.Replicas) > 0 {
replica = entry.Replicas[0].Server
}
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
time.Sleep(2 * time.Millisecond)
return nil
}
var wg sync.WaitGroup
const rounds = 30
// Goroutine 1: expand.
wg.Add(1)
go func() {
defer wg.Done()
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
Name: "hb-expand-race", NewSizeBytes: 2 << 30,
})
}()
// Goroutine 2: primary heartbeats (mix of reporting and not reporting).
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds; i++ {
if i%5 == 0 {
// Every 5th: empty heartbeat (simulates brief restart).
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
} else {
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/hb-expand-race.blk",
VolumeSize: 1 << 30,
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RolePrimary),
WalHeadLsn: uint64(100 + i),
},
})
}
}
}()
// Goroutine 3: replica heartbeats.
if replica != "" {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds; i++ {
ms.blockRegistry.UpdateFullHeartbeat(replica, []*master_pb.BlockVolumeInfoMessage{
{
Path: "/data/hb-expand-race.blk",
VolumeSize: 1 << 30,
Epoch: 1,
Role: blockvol.RoleToWire(blockvol.RoleReplica),
WalHeadLsn: uint64(99 + i),
},
})
}
}()
}
wg.Wait()
// Volume must still exist — no orphan.
_, ok := ms.blockRegistry.Lookup("hb-expand-race")
if !ok {
t.Fatal("volume must survive concurrent heartbeats + expand")
}
}

1346
weed/server/qa_block_nvme_publication_test.go
File diff suppressed because it is too large
View File

55
weed/storage/blockvol/blockapi/client.go

@ -136,6 +136,61 @@ func (c *Client) ExpandVolume(ctx context.Context, name string, newSizeBytes uin
return out.CapacityBytes, nil return out.CapacityBytes, nil
} }
// PromoteVolume triggers a manual promotion for a block volume.
func (c *Client) PromoteVolume(ctx context.Context, name string, req PromoteVolumeRequest) (*PromoteVolumeResponse, error) {
body, err := json.Marshal(req)
if err != nil {
return nil, fmt.Errorf("marshal request: %w", err)
}
resp, err := c.doRequest(ctx, http.MethodPost, "/block/volume/"+name+"/promote", bytes.NewReader(body))
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out PromoteVolumeResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// BlockStatus fetches the block registry status metrics.
func (c *Client) BlockStatus(ctx context.Context) (*BlockStatusResponse, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/status", nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out BlockStatusResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// Preflight returns the promotion preflight evaluation for a block volume.
func (c *Client) Preflight(ctx context.Context, name string) (*PreflightResponse, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/volume/"+name+"/preflight", nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if err := checkStatus(resp, http.StatusOK); err != nil {
return nil, err
}
var out PreflightResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return nil, fmt.Errorf("decode response: %w", err)
}
return &out, nil
}
// ListServers lists all block-capable volume servers. // ListServers lists all block-capable volume servers.
func (c *Client) ListServers(ctx context.Context) ([]ServerInfo, error) { func (c *Client) ListServers(ctx context.Context) ([]ServerInfo, error) {
resp, err := c.doRequest(ctx, http.MethodGet, "/block/servers", nil) resp, err := c.doRequest(ctx, http.MethodGet, "/block/servers", nil)

48
weed/storage/blockvol/blockapi/types.go

@ -38,6 +38,8 @@ type VolumeInfo struct {
HealthScore float64 `json:"health_score"` HealthScore float64 `json:"health_score"`
ReplicaDegraded bool `json:"replica_degraded,omitempty"` ReplicaDegraded bool `json:"replica_degraded,omitempty"`
DurabilityMode string `json:"durability_mode"` // CP8-3-1 DurabilityMode string `json:"durability_mode"` // CP8-3-1
NvmeAddr string `json:"nvme_addr,omitempty"`
NQN string `json:"nqn,omitempty"`
} }
// ReplicaDetail describes one replica in the API response. // ReplicaDetail describes one replica in the API response.
@ -74,6 +76,52 @@ type ExpandVolumeResponse struct {
CapacityBytes uint64 `json:"capacity_bytes"` CapacityBytes uint64 `json:"capacity_bytes"`
} }
// PromoteVolumeRequest is the request body for POST /block/volume/{name}/promote.
type PromoteVolumeRequest struct {
TargetServer string `json:"target_server,omitempty"` // specific replica, or empty for auto
Force bool `json:"force,omitempty"` // bypass soft safety checks
Reason string `json:"reason,omitempty"` // audit note
}
// PromoteVolumeResponse is the response for POST /block/volume/{name}/promote.
type PromoteVolumeResponse struct {
NewPrimary string `json:"new_primary"`
Epoch uint64 `json:"epoch"`
Reason string `json:"reason,omitempty"` // rejection reason if failed
Rejections []PreflightRejection `json:"rejections,omitempty"` // per-replica rejection details
}
// BlockStatusResponse is the response for GET /block/status.
type BlockStatusResponse struct {
VolumeCount int `json:"volume_count"`
ServerCount int `json:"server_count"`
PromotionLSNTolerance uint64 `json:"promotion_lsn_tolerance"`
BarrierLagLSN uint64 `json:"barrier_lag_lsn"`
PromotionsTotal int64 `json:"promotions_total"`
FailoversTotal int64 `json:"failovers_total"`
RebuildsTotal int64 `json:"rebuilds_total"`
AssignmentQueueDepth int `json:"assignment_queue_depth"`
}
// PreflightRejection describes why a specific replica was rejected for promotion.
type PreflightRejection struct {
Server string `json:"server"`
Reason string `json:"reason"` // "stale_heartbeat", "wal_lag", "wrong_role", "server_dead", "no_heartbeat"
}
// PreflightResponse is the response for GET /block/volume/{name}/preflight.
type PreflightResponse struct {
VolumeName string `json:"volume_name"`
Promotable bool `json:"promotable"`
Reason string `json:"reason,omitempty"`
CandidateServer string `json:"candidate_server,omitempty"`
CandidateHealth float64 `json:"candidate_health,omitempty"`
CandidateWALLSN uint64 `json:"candidate_wal_lsn,omitempty"`
Rejections []PreflightRejection `json:"rejections,omitempty"`
PrimaryServer string `json:"primary_server"`
PrimaryAlive bool `json:"primary_alive"`
}
// RoleFromString converts a role string to its uint32 wire value. // RoleFromString converts a role string to its uint32 wire value.
// Returns 0 (RoleNone) for unrecognized strings. // Returns 0 (RoleNone) for unrecognized strings.
func RoleFromString(s string) uint32 { func RoleFromString(s string) uint32 {

511
weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go

@ -0,0 +1,511 @@
package blockvol
import (
"sync"
"sync/atomic"
"testing"
"time"
)
// ============================================================
// CP11A-3 Adversarial Test Suite
//
// 10 scenarios stress-testing WAL admission pressure tracking,
// PressureState boundaries, guidance edge cases, and concurrent
// metric visibility.
// ============================================================
// ────────────────────────────────────────────────────────────
// QA-CP11A3-1: SoftMarkEqualsHardMark_NoPanic
//
// If an operator configures softMark == hardMark, the soft-zone
// delay calculation divides by (hardMark - softMark) = 0.
// Must not panic, hang, or produce NaN/Inf delay.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_SoftMarkEqualsHardMark_NoPanic(t *testing.T) {
m := NewEngineMetrics()
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.8,
HardWatermark: 0.8, // equal — no soft zone
WALUsedFn: func() float64 { return 0.85 }, // above both marks
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
// With equal marks, pressure >= hardMark takes the hard branch.
// The soft branch's division by zero is never reached.
// But if the code path ever changes, this test catches it.
done := make(chan error, 1)
go func() {
done <- a.Acquire(50 * time.Millisecond)
}()
select {
case err := <-done:
// ErrWALFull is expected (pressure stays above hard, times out).
if err != ErrWALFull {
t.Fatalf("expected ErrWALFull, got %v", err)
}
case <-time.After(2 * time.Second):
t.Fatal("Acquire hung — possible Inf delay from division by zero")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-2: SoftZoneExactBoundary_DelayIsZero
//
// When pressure == softMark exactly, scale = 0, delay = 0.
// softPressureWaitNs should NOT increase (delay <= 0 skips sleep).
// But hitSoft should still be true → SoftAdmitTotal increments.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_SoftZoneExactBoundary_DelayIsZero(t *testing.T) {
m := NewEngineMetrics()
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return 0.7 }, // exactly at soft mark
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) {
t.Fatalf("sleep should not be called when delay=0, but called with %v", d)
}
if err := a.Acquire(100 * time.Millisecond); err != nil {
t.Fatalf("Acquire: %v", err)
}
a.Release()
// SoftAdmitTotal should increment (we entered the soft branch).
if m.WALAdmitSoftTotal.Load() != 1 {
t.Fatalf("WALAdmitSoftTotal = %d, want 1", m.WALAdmitSoftTotal.Load())
}
// But no sleep → softPressureWaitNs stays 0.
if a.SoftPressureWaitNs() != 0 {
t.Fatalf("SoftPressureWaitNs = %d, want 0 (no delay at exact boundary)", a.SoftPressureWaitNs())
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-3: ConcurrentHardWaiters_TimeAccumulates
//
// 8 goroutines enter hard zone simultaneously. Each waits ~5ms.
// Total hardPressureWaitNs should be roughly 8 × 5ms, proving
// atomic accumulation doesn't lose contributions.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ConcurrentHardWaiters_TimeAccumulates(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(95) // above hard mark
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
var sleepCalls atomic.Int64
a.sleepFn = func(d time.Duration) {
time.Sleep(1 * time.Millisecond)
// After enough total sleeps across all goroutines, drop pressure.
if sleepCalls.Add(1) >= 20 {
pressure.Store(50)
}
}
const workers = 8
var wg sync.WaitGroup
for i := 0; i < workers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
if err := a.Acquire(5 * time.Second); err != nil {
t.Errorf("Acquire: %v", err)
}
a.Release()
}()
}
wg.Wait()
// All 8 must have entered hard zone.
if m.WALAdmitHardTotal.Load() < uint64(workers) {
t.Fatalf("WALAdmitHardTotal = %d, want >= %d", m.WALAdmitHardTotal.Load(), workers)
}
// Accumulated hard wait should be > 0, reflecting contributions from all goroutines.
if a.HardPressureWaitNs() <= 0 {
t.Fatal("HardPressureWaitNs should be > 0 after concurrent hard-zone waits")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-4: PressureStateAndAcquireRace
//
// One goroutine oscillates walUsed, another reads PressureState
// rapidly. Must not panic, must always return a valid state.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_PressureStateAndAcquireRace(t *testing.T) {
var pressure atomic.Int64
pressure.Store(50)
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: NewEngineMetrics(),
})
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
var wg sync.WaitGroup
const rounds = 200
// Goroutine 1: oscillate pressure.
wg.Add(1)
go func() {
defer wg.Done()
levels := []int64{30, 75, 95, 50, 80, 92, 10}
for i := 0; i < rounds; i++ {
pressure.Store(levels[i%len(levels)])
}
}()
// Goroutine 2: read PressureState.
wg.Add(1)
go func() {
defer wg.Done()
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
for i := 0; i < rounds; i++ {
s := a.PressureState()
if !valid[s] {
t.Errorf("PressureState() = %q — not a valid state", s)
return
}
}
}()
// Goroutine 3: Acquire/Release rapidly.
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < rounds/2; i++ {
err := a.Acquire(20 * time.Millisecond)
if err == nil {
a.Release()
}
}
}()
wg.Wait()
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-5: TimeInZoneMonotonicity
//
// softPressureWaitNs and hardPressureWaitNs must be monotonically
// non-decreasing across reads, even under concurrent writes.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_TimeInZoneMonotonicity(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(80) // soft zone
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
var wg sync.WaitGroup
const writers = 4
const rounds = 30
// Writers produce soft-zone and hard-zone waits.
for i := 0; i < writers; i++ {
wg.Add(1)
go func(id int) {
defer wg.Done()
for j := 0; j < rounds; j++ {
if j%5 == 0 {
pressure.Store(95) // hard
} else {
pressure.Store(80) // soft
}
err := a.Acquire(50 * time.Millisecond)
if err == nil {
a.Release()
}
// Drop back so next Acquire can succeed.
pressure.Store(50)
}
}(i)
}
// Reader checks monotonicity.
wg.Add(1)
go func() {
defer wg.Done()
var prevSoft, prevHard int64
for i := 0; i < rounds*writers; i++ {
soft := a.SoftPressureWaitNs()
hard := a.HardPressureWaitNs()
if soft < prevSoft {
t.Errorf("SoftPressureWaitNs decreased: %d -> %d", prevSoft, soft)
}
if hard < prevHard {
t.Errorf("HardPressureWaitNs decreased: %d -> %d", prevHard, hard)
}
prevSoft = soft
prevHard = hard
}
}()
wg.Wait()
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-6: WALGuidance_ZeroInputs
//
// Zero walSize, zero blockSize, zero maxConcurrent, empty hint.
// Must not panic or produce invalid results.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALGuidance_ZeroInputs(t *testing.T) {
// All zeros.
r := WALSizingGuidance(0, 0, "")
if r.Level != "warn" {
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
}
// Zero blockSize: absMin = 0*64 = 0. Only workload minimum check fires.
r = WALSizingGuidance(0, 0, WorkloadGeneral)
if r.Level != "warn" {
t.Errorf("zero walSize+blockSize: Level = %q, want warn", r.Level)
}
// Zero walSize but nonzero blockSize.
r = WALSizingGuidance(0, 4096, WorkloadDatabase)
if r.Level != "warn" {
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
}
if len(r.Warnings) < 2 {
t.Errorf("expected both workload + absolute minimum warnings, got %d", len(r.Warnings))
}
// EvaluateWALConfig with zero maxConcurrent should not trigger concurrency warning.
r = EvaluateWALConfig(0, 4096, 0, WorkloadGeneral)
// walSize=0 still triggers sizing warning.
if r.Level != "warn" {
t.Errorf("Level = %q, want warn for zero walSize", r.Level)
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-7: WALGuidance_OverflowSafe
//
// Very large blockSize × minWALEntries might overflow uint64.
// (64 × 2^60 does NOT overflow, but let's test near-boundary.)
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALGuidance_OverflowSafe(t *testing.T) {
// Large blockSize: 256MB blocks × 64 = 16GB minimum.
// walSize = 1GB → should warn (16GB > 1GB).
r := WALSizingGuidance(1<<30, 256<<20, WorkloadGeneral)
if r.Level != "warn" {
t.Errorf("Level = %q, want warn (1GB WAL < 16GB absMin)", r.Level)
}
// Extreme: blockSize = 1<<40 (1TB). 64 × 1TB = 64TB.
// uint64 can hold 18 EB — no overflow.
r = WALSizingGuidance(1<<50, 1<<40, WorkloadThroughput)
// 1PB WAL with 1TB blocks: absMin = 64TB, 1PB > 64TB → ok for absolute.
// 1PB > 128MB (throughput min) → ok for workload.
if r.Level != "ok" {
t.Errorf("Level = %q, want ok for huge WAL", r.Level)
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-8: WALStatusSnapshot_PartialInit
//
// BlockVol with Metrics but nil walAdmission, and vice versa.
// WALStatus must return coherent defaults for the nil side
// and real values for the non-nil side.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_WALStatusSnapshot_PartialInit(t *testing.T) {
// Case 1: Metrics set, walAdmission nil.
m := NewEngineMetrics()
m.WALAdmitSoftTotal.Add(42)
m.WALAdmitHardTotal.Add(7)
vol1 := &BlockVol{Metrics: m}
ws := vol1.WALStatus()
if ws.PressureState != "normal" {
t.Errorf("nil admission: PressureState = %q, want normal", ws.PressureState)
}
if ws.SoftAdmitTotal != 42 {
t.Errorf("SoftAdmitTotal = %d, want 42", ws.SoftAdmitTotal)
}
if ws.HardAdmitTotal != 7 {
t.Errorf("HardAdmitTotal = %d, want 7", ws.HardAdmitTotal)
}
// Pressure wait should be 0 (no admission controller).
if ws.SoftPressureWaitSec != 0 || ws.HardPressureWaitSec != 0 {
t.Errorf("nil admission: pressure wait should be 0")
}
// Case 2: walAdmission set, Metrics nil.
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.65,
HardWatermark: 0.85,
WALUsedFn: func() float64 { return 0.7 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
})
vol2 := &BlockVol{walAdmission: a}
ws2 := vol2.WALStatus()
if ws2.PressureState != "soft" {
t.Errorf("PressureState = %q, want soft (0.7 >= 0.65)", ws2.PressureState)
}
if ws2.SoftWatermark != 0.65 {
t.Errorf("SoftWatermark = %f, want 0.65", ws2.SoftWatermark)
}
// Metrics fields should be zero (nil Metrics).
if ws2.SoftAdmitTotal != 0 || ws2.HardAdmitTotal != 0 || ws2.TimeoutTotal != 0 {
t.Errorf("nil metrics: counters should be 0")
}
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-9: ObserverPanic_ContainedOrDocumented
//
// If WALAdmitWaitObserver panics, RecordWALAdmit is called from
// Acquire → recordAdmit. A panic in the observer would crash the
// writer goroutine. This test documents whether the panic is
// recovered or propagated.
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ObserverPanic_DocumentedBehavior(t *testing.T) {
m := NewEngineMetrics()
m.WALAdmitWaitObserver = func(s float64) { panic("boom") }
// RecordWALAdmit calls the observer. If it panics, the caller panics.
// This is expected (same as prometheus.Histogram.Observe panicking).
// Document that the observer must not panic.
panicked := false
func() {
defer func() {
if r := recover(); r != nil {
panicked = true
}
}()
m.RecordWALAdmit(1*time.Millisecond, false, false, false)
}()
if !panicked {
t.Fatal("expected panic from observer — if recovered, update this test")
}
// Verify counters were NOT updated (panic happened before completion).
// Actually, the observer is called AFTER WALAdmitTotal.Add(1) and
// walAdmitWaitNs.record(). Let's verify the counter state.
if m.WALAdmitTotal.Load() != 1 {
t.Errorf("WALAdmitTotal = %d — should be 1 (incremented before observer)", m.WALAdmitTotal.Load())
}
// soft/hard/timeout flags are processed AFTER observer — panic skips them.
// With soft=false, hard=false, timedOut=false there's nothing to skip,
// but the counters should reflect what happened before the panic.
}
// ────────────────────────────────────────────────────────────
// QA-CP11A3-10: ConcurrentWALStatusReads
//
// Multiple goroutines read WALStatus while Acquire/Release runs.
// Must not panic. Fields should be internally consistent
// (SoftAdmitTotal >= 0, HardPressureWaitSec >= 0, etc.)
// ────────────────────────────────────────────────────────────
func TestQA_CP11A3_ConcurrentWALStatusReads(t *testing.T) {
m := NewEngineMetrics()
var pressure atomic.Int64
pressure.Store(50)
a := NewWALAdmission(WALAdmissionConfig{
MaxConcurrent: 16,
SoftWatermark: 0.7,
HardWatermark: 0.9,
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
NotifyFn: func() {},
ClosedFn: func() bool { return false },
Metrics: m,
})
a.sleepFn = func(d time.Duration) { time.Sleep(50 * time.Microsecond) }
vol := &BlockVol{
Metrics: m,
walAdmission: a,
}
var wg sync.WaitGroup
const rounds = 100
// Writers with varying pressure.
for i := 0; i < 4; i++ {
wg.Add(1)
go func() {
defer wg.Done()
levels := []int64{50, 75, 95, 60, 85}
for j := 0; j < rounds; j++ {
pressure.Store(levels[j%len(levels)])
if err := a.Acquire(20 * time.Millisecond); err == nil {
a.Release()
}
pressure.Store(50) // reset for next round
}
}()
}
// Concurrent WALStatus readers.
for i := 0; i < 4; i++ {
wg.Add(1)
go func() {
defer wg.Done()
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
for j := 0; j < rounds*2; j++ {
ws := vol.WALStatus()
if !valid[ws.PressureState] {
t.Errorf("invalid PressureState: %q", ws.PressureState)
return
}
if ws.UsedFraction < 0 || ws.UsedFraction > 1.01 {
t.Errorf("UsedFraction out of range: %f", ws.UsedFraction)
return
}
if ws.SoftPressureWaitSec < 0 {
t.Errorf("SoftPressureWaitSec negative: %f", ws.SoftPressureWaitSec)
return
}
if ws.HardPressureWaitSec < 0 {
t.Errorf("HardPressureWaitSec negative: %f", ws.HardPressureWaitSec)
return
}
}
}()
}
wg.Wait()
}

220
weed/storage/blockvol/testrunner/actions/devops.go

@ -26,6 +26,10 @@ func RegisterDevOpsActions(r *tr.Registry) {
r.RegisterFunc("delete_block_volume", tr.TierDevOps, deleteBlockVolume) r.RegisterFunc("delete_block_volume", tr.TierDevOps, deleteBlockVolume)
r.RegisterFunc("wait_block_servers", tr.TierDevOps, waitBlockServers) r.RegisterFunc("wait_block_servers", tr.TierDevOps, waitBlockServers)
r.RegisterFunc("cluster_status", tr.TierDevOps, clusterStatus) r.RegisterFunc("cluster_status", tr.TierDevOps, clusterStatus)
r.RegisterFunc("wait_block_primary", tr.TierDevOps, waitBlockPrimary)
r.RegisterFunc("assert_block_field", tr.TierDevOps, assertBlockField)
r.RegisterFunc("block_status", tr.TierDevOps, blockStatus)
r.RegisterFunc("block_promote", tr.TierDevOps, blockPromote)
} }
// setISCSIVars sets the save_as_iscsi_host/port/addr/iqn vars from a VolumeInfo. // setISCSIVars sets the save_as_iscsi_host/port/addr/iqn vars from a VolumeInfo.
@ -434,6 +438,222 @@ func waitBlockServers(ctx context.Context, actx *tr.ActionContext, act tr.Action
} }
} }
// waitBlockPrimary polls lookup until the volume's primary server matches (or differs from) expected.
// Params: name, expected (server addr to wait for) OR not (server addr to wait to change from), timeout (default 60s).
// Sets save_as vars from the final lookup.
func waitBlockPrimary(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("wait_block_primary: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("wait_block_primary: name param required")
}
expected := act.Params["expected"]
notServer := act.Params["not"]
if expected == "" && notServer == "" {
return nil, fmt.Errorf("wait_block_primary: expected or not param required")
}
timeout := 60 * time.Second
if t, ok := act.Params["timeout"]; ok {
if d, err := parseDuration(t); err == nil {
timeout = d
}
}
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
pollCount := 0
for {
select {
case <-timeoutCtx.Done():
return nil, fmt.Errorf("wait_block_primary: timeout after %s waiting for primary change on %s", timeout, name)
case <-ticker.C:
pollCount++
info, err := client.LookupVolume(timeoutCtx, name)
if err != nil {
if pollCount <= 3 {
actx.Log(" poll %d: lookup error: %v", pollCount, err)
}
continue
}
if pollCount <= 3 || pollCount%10 == 0 {
actx.Log(" poll %d: %s primary=%s role=%s", pollCount, name, info.VolumeServer, info.Role)
}
match := false
if expected != "" && info.VolumeServer == expected {
match = true
}
if notServer != "" && info.VolumeServer != notServer && info.VolumeServer != "" {
match = true
}
if match {
actx.Log(" primary for %s is now %s (epoch=%d)", name, info.VolumeServer, info.Epoch)
if act.SaveAs != "" {
setISCSIVars(actx, act.SaveAs, info)
actx.Vars[act.SaveAs+"_server"] = info.VolumeServer
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(info.Epoch, 10)
actx.Vars[act.SaveAs+"_role"] = info.Role
}
return map[string]string{"value": info.VolumeServer}, nil
}
}
}
}
// assertBlockField looks up a block volume and asserts a specific field matches the expected value.
// Params: name, field (one of: volume_server, role, status, epoch, size_bytes, replica_server,
// replica_factor, health_score, replica_degraded, durability_mode, iscsi_addr, iqn), expected.
func assertBlockField(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("assert_block_field: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("assert_block_field: name param required")
}
field := act.Params["field"]
if field == "" {
return nil, fmt.Errorf("assert_block_field: field param required")
}
expected := act.Params["expected"]
if expected == "" {
return nil, fmt.Errorf("assert_block_field: expected param required")
}
info, err := client.LookupVolume(ctx, name)
if err != nil {
return nil, fmt.Errorf("assert_block_field: lookup %s: %w", name, err)
}
actual, err := extractVolumeField(info, field)
if err != nil {
return nil, fmt.Errorf("assert_block_field: %w", err)
}
if actual != expected {
return nil, fmt.Errorf("assert_block_field: %s.%s = %q, expected %q", name, field, actual, expected)
}
actx.Log(" assert %s.%s == %q OK", name, field, expected)
return map[string]string{"value": actual}, nil
}
// extractVolumeField extracts a named field from VolumeInfo as a string.
func extractVolumeField(info *blockapi.VolumeInfo, field string) (string, error) {
switch field {
case "volume_server":
return info.VolumeServer, nil
case "role":
return info.Role, nil
case "status":
return info.Status, nil
case "epoch":
return strconv.FormatUint(info.Epoch, 10), nil
case "size_bytes":
return strconv.FormatUint(info.SizeBytes, 10), nil
case "replica_server":
return info.ReplicaServer, nil
case "replica_factor":
return strconv.Itoa(info.ReplicaFactor), nil
case "health_score":
return fmt.Sprintf("%.2f", info.HealthScore), nil
case "replica_degraded":
return strconv.FormatBool(info.ReplicaDegraded), nil
case "durability_mode":
return info.DurabilityMode, nil
case "iscsi_addr":
return info.ISCSIAddr, nil
case "iqn":
return info.IQN, nil
case "name":
return info.Name, nil
case "replica_iscsi_addr":
return info.ReplicaISCSIAddr, nil
case "replica_iqn":
return info.ReplicaIQN, nil
case "replica_data_addr":
return info.ReplicaDataAddr, nil
case "replica_ctrl_addr":
return info.ReplicaCtrlAddr, nil
default:
return "", fmt.Errorf("unknown field %q", field)
}
}
// blockStatus fetches block registry status metrics from master.
// Sets save_as_promotions_total, save_as_failovers_total, etc.
func blockStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("block_status: %w", err)
}
status, err := client.BlockStatus(ctx)
if err != nil {
return nil, fmt.Errorf("block_status: %w", err)
}
actx.Log(" block status: volumes=%d servers=%d promotions=%d failovers=%d rebuilds=%d",
status.VolumeCount, status.ServerCount, status.PromotionsTotal, status.FailoversTotal, status.RebuildsTotal)
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_volume_count"] = strconv.Itoa(status.VolumeCount)
actx.Vars[act.SaveAs+"_server_count"] = strconv.Itoa(status.ServerCount)
actx.Vars[act.SaveAs+"_promotions_total"] = strconv.FormatInt(status.PromotionsTotal, 10)
actx.Vars[act.SaveAs+"_failovers_total"] = strconv.FormatInt(status.FailoversTotal, 10)
actx.Vars[act.SaveAs+"_rebuilds_total"] = strconv.FormatInt(status.RebuildsTotal, 10)
actx.Vars[act.SaveAs+"_queue_depth"] = strconv.Itoa(status.AssignmentQueueDepth)
}
jsonBytes, _ := json.Marshal(status)
return map[string]string{"value": string(jsonBytes)}, nil
}
// blockPromote triggers a manual promotion for a block volume.
// Params: name, target_server (optional, empty=auto), force (optional bool), reason (optional).
func blockPromote(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
client, err := blockAPIClient(actx, act)
if err != nil {
return nil, fmt.Errorf("block_promote: %w", err)
}
name := act.Params["name"]
if name == "" {
return nil, fmt.Errorf("block_promote: name param required")
}
force := false
if f := act.Params["force"]; f == "true" || f == "1" {
force = true
}
resp, err := client.PromoteVolume(ctx, name, blockapi.PromoteVolumeRequest{
TargetServer: act.Params["target_server"],
Force: force,
Reason: act.Params["reason"],
})
if err != nil {
return nil, fmt.Errorf("block_promote: %w", err)
}
actx.Log(" promoted %s -> primary=%s epoch=%d", name, resp.NewPrimary, resp.Epoch)
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_server"] = resp.NewPrimary
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(resp.Epoch, 10)
}
return map[string]string{"value": resp.NewPrimary}, nil
}
// clusterStatus fetches the full cluster status JSON. // clusterStatus fetches the full cluster status JSON.
func clusterStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) { func clusterStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
node, err := getNode(actx, act.Node) node, err := getNode(actx, act.Node)

22
weed/storage/blockvol/testrunner/actions/devops_test.go

@ -23,6 +23,10 @@ func TestDevOpsActions_Registration(t *testing.T) {
"delete_block_volume", "delete_block_volume",
"wait_block_servers", "wait_block_servers",
"cluster_status", "cluster_status",
"wait_block_primary",
"assert_block_field",
"block_status",
"block_promote",
} }
for _, name := range expected { for _, name := range expected {
@ -39,8 +43,8 @@ func TestDevOpsActions_Tier(t *testing.T) {
byTier := registry.ListByTier() byTier := registry.ListByTier()
devopsActions := byTier[tr.TierDevOps] devopsActions := byTier[tr.TierDevOps]
if len(devopsActions) != 11 {
t.Errorf("devops tier has %d actions, want 11", len(devopsActions))
if len(devopsActions) != 15 {
t.Errorf("devops tier has %d actions, want 15", len(devopsActions))
} }
// Verify all are in devops tier. // Verify all are in devops tier.
@ -84,11 +88,11 @@ func TestAllActions_Registration(t *testing.T) {
if n := len(byTier[tr.TierCore]); n != 11 { if n := len(byTier[tr.TierCore]); n != 11 {
t.Errorf("core: %d, want 11", n) t.Errorf("core: %d, want 11", n)
} }
if n := len(byTier[tr.TierBlock]); n != 56 {
t.Errorf("block: %d, want 56", n)
if n := len(byTier[tr.TierBlock]); n != 58 {
t.Errorf("block: %d, want 58", n)
} }
if n := len(byTier[tr.TierDevOps]); n != 11 {
t.Errorf("devops: %d, want 11", n)
if n := len(byTier[tr.TierDevOps]); n != 15 {
t.Errorf("devops: %d, want 15", n)
} }
if n := len(byTier[tr.TierChaos]); n != 5 { if n := len(byTier[tr.TierChaos]); n != 5 {
t.Errorf("chaos: %d, want 5", n) t.Errorf("chaos: %d, want 5", n)
@ -97,13 +101,13 @@ func TestAllActions_Registration(t *testing.T) {
t.Errorf("k8s: %d, want 14", n) t.Errorf("k8s: %d, want 14", n)
} }
// Total should be 97 (92 prev + 4 devops: expand/lookup/delete/wait_block_servers + 1 block: iscsi_login_direct).
// Total should be 103 (99 prev + 4 devops: wait_block_primary, assert_block_field, block_status, block_promote).
total := 0 total := 0
for _, actions := range byTier { for _, actions := range byTier {
total += len(actions) total += len(actions)
} }
if total != 97 {
t.Errorf("total actions: %d, want 97", total)
if total != 103 {
t.Errorf("total actions: %d, want 103", total)
} }
} }

89
weed/storage/blockvol/testrunner/actions/snapshot.go

@ -8,6 +8,7 @@ import (
"time" "time"
tr "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner" tr "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner/infra"
) )
// RegisterSnapshotActions registers snapshot and resize actions. // RegisterSnapshotActions registers snapshot and resize actions.
@ -18,6 +19,8 @@ func RegisterSnapshotActions(r *tr.Registry) {
r.RegisterFunc("resize", tr.TierBlock, resizeAction) r.RegisterFunc("resize", tr.TierBlock, resizeAction)
r.RegisterFunc("iscsi_rescan", tr.TierBlock, iscsiRescan) r.RegisterFunc("iscsi_rescan", tr.TierBlock, iscsiRescan)
r.RegisterFunc("get_block_size", tr.TierBlock, getBlockSize) r.RegisterFunc("get_block_size", tr.TierBlock, getBlockSize)
r.RegisterFunc("snapshot_export_s3", tr.TierBlock, snapshotExportS3)
r.RegisterFunc("snapshot_import_s3", tr.TierBlock, snapshotImportS3)
} }
func snapshotCreate(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) { func snapshotCreate(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
@ -181,3 +184,89 @@ func parseHumanSize(s string) (uint64, error) {
} }
return val * multiplier, nil return val * multiplier, nil
} }
// snapshotExportS3 exports a snapshot from a target to an S3 bucket.
// Params: bucket, key_prefix, s3_endpoint, s3_access_key, s3_secret_key, s3_region, snapshot_id (optional).
// Returns: manifest_key, data_key, size_bytes, sha256.
func snapshotExportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
tgt, err := getHATarget(actx, act.Target)
if err != nil {
return nil, err
}
opts := infra.ExportS3Opts{
Bucket: act.Params["bucket"],
KeyPrefix: act.Params["key_prefix"],
S3Endpoint: act.Params["s3_endpoint"],
S3AccessKey: act.Params["s3_access_key"],
S3SecretKey: act.Params["s3_secret_key"],
S3Region: act.Params["s3_region"],
}
if opts.Bucket == "" || opts.S3Endpoint == "" {
return nil, fmt.Errorf("snapshot_export_s3: bucket and s3_endpoint required")
}
if idStr := act.Params["snapshot_id"]; idStr != "" {
id, err := strconv.ParseUint(idStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("snapshot_export_s3: invalid snapshot_id %q: %w", idStr, err)
}
opts.SnapshotID = uint32(id)
}
result, err := tgt.ExportSnapshotS3(ctx, opts)
if err != nil {
return nil, fmt.Errorf("snapshot_export_s3: %w", err)
}
actx.Log(" exported to s3://%s/%s (%d bytes, sha256=%s)", opts.Bucket, result.DataKey, result.SizeBytes, result.SHA256)
out := map[string]string{
"value": result.SHA256,
}
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_manifest_key"] = result.ManifestKey
actx.Vars[act.SaveAs+"_data_key"] = result.DataKey
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
}
return out, nil
}
// snapshotImportS3 imports a snapshot from an S3 bucket into a target.
// Params: bucket, manifest_key, s3_endpoint, s3_access_key, s3_secret_key, s3_region, allow_overwrite.
// Returns: size_bytes, sha256.
func snapshotImportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
tgt, err := getHATarget(actx, act.Target)
if err != nil {
return nil, err
}
opts := infra.ImportS3Opts{
Bucket: act.Params["bucket"],
ManifestKey: act.Params["manifest_key"],
S3Endpoint: act.Params["s3_endpoint"],
S3AccessKey: act.Params["s3_access_key"],
S3SecretKey: act.Params["s3_secret_key"],
S3Region: act.Params["s3_region"],
}
if opts.Bucket == "" || opts.ManifestKey == "" || opts.S3Endpoint == "" {
return nil, fmt.Errorf("snapshot_import_s3: bucket, manifest_key, and s3_endpoint required")
}
if act.Params["allow_overwrite"] == "true" {
opts.AllowOverwrite = true
}
result, err := tgt.ImportSnapshotS3(ctx, opts)
if err != nil {
return nil, fmt.Errorf("snapshot_import_s3: %w", err)
}
actx.Log(" imported %d bytes (sha256=%s)", result.SizeBytes, result.SHA256)
out := map[string]string{
"value": result.SHA256,
}
if act.SaveAs != "" {
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
}
return out, nil
}

101
weed/storage/blockvol/testrunner/infra/ha_target.go

@ -478,6 +478,107 @@ func (h *HATarget) Resize(ctx context.Context, newSizeBytes uint64) error {
return nil return nil
} }
// ExportSnapshotS3 sends POST /export with S3 credentials.
// Returns the manifest key and data SHA-256 on success.
func (h *HATarget) ExportSnapshotS3(ctx context.Context, opts ExportS3Opts) (*ExportS3Result, error) {
reqBody := map[string]interface{}{
"bucket": opts.Bucket,
"key_prefix": opts.KeyPrefix,
"s3_endpoint": opts.S3Endpoint,
"s3_region": opts.S3Region,
}
if opts.S3AccessKey != "" {
reqBody["s3_access_key"] = opts.S3AccessKey
reqBody["s3_secret_key"] = opts.S3SecretKey
}
if opts.SnapshotID > 0 {
reqBody["snapshot_id"] = opts.SnapshotID
}
code, body, err := h.curlPost(ctx, "/export", reqBody)
if err != nil {
return nil, fmt.Errorf("export snapshot s3: %w", err)
}
if code != http.StatusOK {
return nil, fmt.Errorf("export snapshot s3 failed (HTTP %d): %s", code, body)
}
var resp ExportS3Result
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
return nil, fmt.Errorf("decode export response: %w", err)
}
return &resp, nil
}
// ImportSnapshotS3 sends POST /import with S3 credentials and manifest key.
func (h *HATarget) ImportSnapshotS3(ctx context.Context, opts ImportS3Opts) (*ImportS3Result, error) {
reqBody := map[string]interface{}{
"bucket": opts.Bucket,
"manifest_key": opts.ManifestKey,
"s3_endpoint": opts.S3Endpoint,
"s3_region": opts.S3Region,
}
if opts.S3AccessKey != "" {
reqBody["s3_access_key"] = opts.S3AccessKey
reqBody["s3_secret_key"] = opts.S3SecretKey
}
if opts.AllowOverwrite {
reqBody["allow_overwrite"] = true
}
code, body, err := h.curlPost(ctx, "/import", reqBody)
if err != nil {
return nil, fmt.Errorf("import snapshot s3: %w", err)
}
if code != http.StatusOK {
return nil, fmt.Errorf("import snapshot s3 failed (HTTP %d): %s", code, body)
}
var resp ImportS3Result
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
return nil, fmt.Errorf("decode import response: %w", err)
}
return &resp, nil
}
// ExportS3Opts configures a snapshot export to S3.
type ExportS3Opts struct {
Bucket string
KeyPrefix string
S3Endpoint string
S3AccessKey string
S3SecretKey string
S3Region string
SnapshotID uint32
}
// ExportS3Result is the response from POST /export.
type ExportS3Result struct {
OK bool `json:"ok"`
ManifestKey string `json:"manifest_key"`
DataKey string `json:"data_key"`
SizeBytes uint64 `json:"size_bytes"`
SHA256 string `json:"sha256"`
}
// ImportS3Opts configures a snapshot import from S3.
type ImportS3Opts struct {
Bucket string
ManifestKey string
S3Endpoint string
S3AccessKey string
S3SecretKey string
S3Region string
AllowOverwrite bool
}
// ImportS3Result is the response from POST /import.
type ImportS3Result struct {
OK bool `json:"ok"`
SizeBytes uint64 `json:"size_bytes"`
SHA256 string `json:"sha256"`
}
// WaitForRole polls GET /status until the target reports the expected role. // WaitForRole polls GET /status until the target reports the expected role.
func (h *HATarget) WaitForRole(ctx context.Context, expectedRole string) error { func (h *HATarget) WaitForRole(ctx context.Context, expectedRole string) error {
for { for {

246
weed/storage/blockvol/testrunner/scenarios/cp11b3-auto-failover.yaml

@ -0,0 +1,246 @@
name: cp11b3-auto-failover
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9434"
# Tests: T1 (candidate evaluation), T2 (orphan re-evaluation), T6 (preflight/status)
# Flow: Create RF=2 → write data → kill primary → master auto-promotes → verify data + metrics
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3-master /tmp/sw-b3-vs1/blocks /tmp/sw-b3-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9434"
dir: "/tmp/sw-b3-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9434"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18190"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs1"
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18191"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs2"
extra_args: "-block.dir=/tmp/sw-b3-vs2/blocks -block.listen=:3278 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, record initial state
- name: create_volume
actions:
- action: create_block_volume
name: "failover-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
# Without this, PromoteBestReplica rejects replica as "no_heartbeat".
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "failover-test"
save_as: initial
- action: print
msg: "initial primary={{ initial_iscsi_host }}:{{ initial_iscsi_port }} capacity={{ initial_capacity }}"
# Record the initial primary server for later comparison.
- action: assert_block_field
name: "failover-test"
field: "replica_factor"
expected: "2"
- action: assert_block_field
name: "failover-test"
field: "epoch"
expected: "1"
# Capture initial block status metrics.
- action: block_status
save_as: pre_stats
# Phase 4: Write data via iSCSI
- name: write_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
seek: "5"
save_as: md5_5M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
skip: "5"
save_as: verify_5M
- action: assert_equal
actual: "{{ verify_5M }}"
expected: "{{ md5_5M }}"
# Phase 5: Kill primary VS, wait for master auto-failover
- name: failover
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: lookup_block_volume
name: "failover-test"
save_as: pre_kill
- action: print
msg: "killing primary VS (server={{ pre_kill_iscsi_host }}:{{ pre_kill_iscsi_port }})"
# Crash-kill VS1 with SIGKILL (not SIGTERM) to simulate a real crash.
# SIGTERM triggers graceful shutdown which deregisters volumes from
# the master registry — preventing the failover path we want to test.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Wait for master to detect VS1 disconnection and promote.
# Lease TTL is 30s; if never granted (zero), promotion is immediate.
# Allow extra time for heartbeat confirmation + deferred timer.
- action: sleep
duration: 35s
- action: wait_block_primary
name: "failover-test"
not: "192.168.1.184:18190"
timeout: 60s
save_as: promoted
# Phase 6: Verify failover state
- name: verify_failover
actions:
- action: print
msg: "new primary={{ promoted_server }} epoch={{ promoted_epoch }}"
# Epoch must have incremented (real promotion, not just heartbeat update).
- action: assert_block_field
name: "failover-test"
field: "epoch"
expected: "2"
- action: block_status
save_as: post_stats
# Verify promotion counter incremented.
- action: assert_greater
actual: "{{ post_stats_promotions_total }}"
expected: "{{ pre_stats_promotions_total }}"
# Phase 7: Reconnect iSCSI to new primary, verify data
- name: verify_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ promoted_iscsi_host }}"
port: "{{ promoted_iscsi_port }}"
iqn: "{{ promoted_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "1"
skip: "5"
save_as: post_failover_md5
- action: assert_equal
actual: "{{ post_failover_md5 }}"
expected: "{{ md5_5M }}"
# Phase 8: Restart killed VS, verify rebuild queued
- name: restart_verify
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: start_weed_volume
node: target_node
port: "18190"
master: "localhost:9434"
dir: "/tmp/sw-b3-vs1"
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
save_as: vs1_pid2
- action: wait_block_servers
count: "2"
timeout: 60s
- action: sleep
duration: 5s
# After restart, the old primary should be queued for rebuild.
- action: block_status
save_as: final_stats
- action: assert_greater
actual: "{{ final_stats_rebuilds_total }}"
expected: "{{ post_stats_rebuilds_total }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "failover-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid2 }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
root: "true"
ignore_error: true

214
weed/storage/blockvol/testrunner/scenarios/cp11b3-fast-reconnect.yaml

@ -0,0 +1,214 @@
name: cp11b3-fast-reconnect
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9436"
# Tests: T3 (deferred timer safety), T2 (fast reconnect skips failover)
# Flow: Create RF=2 → write → kill primary briefly → restart before lease expires
# → verify no promotion happened → verify data intact
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3r-master /tmp/sw-b3r-vs1/blocks /tmp/sw-b3r-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9436"
dir: "/tmp/sw-b3r-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9436"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18194"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs1"
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18195"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs2"
extra_args: "-block.dir=/tmp/sw-b3r-vs2/blocks -block.listen=:3282 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, write data
- name: create_and_write
actions:
- action: create_block_volume
name: "reconnect-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "reconnect-test"
save_as: initial
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
seek: "8"
save_as: md5_8M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "1"
skip: "8"
save_as: verify_8M
- action: assert_equal
actual: "{{ verify_8M }}"
expected: "{{ md5_8M }}"
- action: iscsi_cleanup
node: client_node
ignore_error: true
# Record initial epoch.
- action: assert_block_field
name: "reconnect-test"
field: "epoch"
expected: "1"
# Record pre-kill promotion counter.
- action: block_status
save_as: pre_stats
# Phase 4: Kill and quickly restart primary VS (before lease expires)
- name: fast_reconnect
actions:
# Crash-kill primary VS with SIGKILL.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Restart it quickly — within a few seconds, well before the
# default 30s lease TTL expires on the master.
- action: sleep
duration: 3s
- action: start_weed_volume
node: target_node
port: "18194"
master: "localhost:9436"
dir: "/tmp/sw-b3r-vs1"
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
save_as: vs1_pid2
# Wait for VS to re-register with master.
- action: wait_block_servers
count: "2"
timeout: 60s
- action: sleep
duration: 5s
# Phase 5: Verify NO promotion happened
- name: verify_no_promotion
actions:
# Epoch should still be 1 (no promotion).
- action: assert_block_field
name: "reconnect-test"
field: "epoch"
expected: "1"
# Promotion counter should not have increased.
- action: block_status
save_as: post_stats
- action: assert_equal
actual: "{{ post_stats_promotions_total }}"
expected: "{{ pre_stats_promotions_total }}"
- action: print
msg: "fast reconnect: epoch unchanged, no promotion — deferred timer cancelled"
# Phase 6: Verify data still accessible on original primary
- name: verify_data
actions:
- action: lookup_block_volume
name: "reconnect-test"
save_as: after
- action: iscsi_login_direct
node: client_node
host: "{{ after_iscsi_host }}"
port: "{{ after_iscsi_port }}"
iqn: "{{ after_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "1"
skip: "8"
save_as: post_reconnect_md5
- action: assert_equal
actual: "{{ post_reconnect_md5 }}"
expected: "{{ md5_8M }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "reconnect-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid2 }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
root: "true"
ignore_error: true

190
weed/storage/blockvol/testrunner/scenarios/cp11b3-manual-promote.yaml

@ -0,0 +1,190 @@
name: cp11b3-manual-promote
timeout: 10m
env:
repo_dir: "/opt/work/seaweedfs"
master_url: "http://192.168.1.184:9435"
# Tests: T5 (manual promote API), T6 (preflight), structured rejection
# Flow: Create RF=2 → write → preflight check → kill primary → manual promote → verify data
topology:
nodes:
target_node:
host: "192.168.1.184"
user: testdev
key: "/opt/work/testdev_key"
client_node:
host: "192.168.1.181"
user: testdev
key: "/opt/work/testdev_key"
phases:
# Phase 1: Clean slate
- name: setup
actions:
- action: kill_stale
node: target_node
- action: kill_stale
node: client_node
iscsi_cleanup: "true"
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
root: "true"
# Phase 2: Start cluster
- name: start_cluster
actions:
- action: exec
node: target_node
cmd: "mkdir -p /tmp/sw-b3m-master /tmp/sw-b3m-vs1/blocks /tmp/sw-b3m-vs2/blocks"
- action: start_weed_master
node: target_node
port: "9435"
dir: "/tmp/sw-b3m-master"
save_as: master_pid
- action: wait_cluster_ready
node: target_node
master_url: "http://localhost:9435"
timeout: 30s
- action: start_weed_volume
node: target_node
port: "18192"
master: "localhost:9435"
dir: "/tmp/sw-b3m-vs1"
extra_args: "-block.dir=/tmp/sw-b3m-vs1/blocks -block.listen=:3279 -ip=192.168.1.184"
save_as: vs1_pid
- action: start_weed_volume
node: target_node
port: "18193"
master: "localhost:9435"
dir: "/tmp/sw-b3m-vs2"
extra_args: "-block.dir=/tmp/sw-b3m-vs2/blocks -block.listen=:3280 -ip=192.168.1.184"
save_as: vs2_pid
- action: wait_block_servers
count: "2"
timeout: 60s
# Phase 3: Create RF=2 volume, write data
- name: create_and_write
actions:
- action: create_block_volume
name: "promote-test"
size: "50M"
replica_factor: "2"
save_as: vol_info
# Wait for replica to confirm role via heartbeat.
- action: sleep
duration: 10s
- action: lookup_block_volume
name: "promote-test"
save_as: initial
- action: iscsi_login_direct
node: client_node
host: "{{ initial_iscsi_host }}"
port: "{{ initial_iscsi_port }}"
iqn: "{{ initial_iqn }}"
save_as: device
- action: dd_write
node: client_node
device: "{{ device }}"
bs: 1M
count: "2"
seek: "3"
save_as: md5_3M
- action: dd_read_md5
node: client_node
device: "{{ device }}"
bs: 1M
count: "2"
skip: "3"
save_as: verify_3M
- action: assert_equal
actual: "{{ verify_3M }}"
expected: "{{ md5_3M }}"
# Phase 4: Kill primary VS, then promote via API
- name: kill_and_promote
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
# Crash-kill VS1 with SIGKILL to simulate a real crash.
- action: exec
node: target_node
cmd: "kill -9 {{ vs1_pid }}"
root: "true"
# Wait for master to detect the disconnection.
- action: sleep
duration: 15s
# Manual promote via the API.
- action: block_promote
name: "promote-test"
reason: "T7 integration test: manual failover"
save_as: promote_result
- action: print
msg: "promoted to {{ promote_result_server }} epoch={{ promote_result_epoch }}"
# Phase 5: Verify promoted state
- name: verify_promoted
actions:
- action: lookup_block_volume
name: "promote-test"
save_as: after
# New primary should be different from old.
- action: assert_block_field
name: "promote-test"
field: "epoch"
expected: "2"
- action: block_status
save_as: stats
- action: print
msg: "promotions_total={{ stats_promotions_total }}"
# Phase 6: Reconnect iSCSI to new primary, verify data
- name: verify_data
actions:
- action: iscsi_login_direct
node: client_node
host: "{{ after_iscsi_host }}"
port: "{{ after_iscsi_port }}"
iqn: "{{ after_iqn }}"
save_as: device2
- action: dd_read_md5
node: client_node
device: "{{ device2 }}"
bs: 1M
count: "2"
skip: "3"
save_as: post_promote_md5
- action: assert_equal
actual: "{{ post_promote_md5 }}"
expected: "{{ md5_3M }}"
# Cleanup (always runs)
- name: cleanup
always: true
actions:
- action: iscsi_cleanup
node: client_node
ignore_error: true
- action: delete_block_volume
name: "promote-test"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs2_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ vs1_pid }}"
ignore_error: true
- action: stop_weed
node: target_node
pid: "{{ master_pid }}"
ignore_error: true
- action: exec
node: target_node
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
root: "true"
ignore_error: true
Loading…
Cancel
Save