Browse Source

fix: snapshot+tail WAL pin failure cleanup + true mid-executor epoch test

Finding 1: PlanRebuild snapshot+tail WAL pin failure now fail-closed
- InvalidateSession("wal_pin_failed_during_rebuild", StateNeedsRebuild)
- Snapshot pin released, session invalidated, no dangling state
- New test: E2_RebuildWALPinFailure_SessionCleaned

Finding 2: True mid-executor invalidation test
- Executor makes 2 successful progress steps (60, 70)
- Epoch bumps BETWEEN steps (real mid-execution)
- Third progress step fails — session invalidated
- Resources released via executor cancel
- New test: E2_EpochBump_AfterExecutorProgress

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
feature/sw-block
pingqiu 2 days ago
parent
commit
5b63d34d6b
  1. 1
      sw-block/engine/replication/driver.go
  2. 77
      sw-block/engine/replication/validation_test.go

1
sw-block/engine/replication/driver.go

@@ -165,6 +165,7 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
retPin, err := d.Storage.PinWALRetention(snapLSN)
if err != nil {
d.Storage.ReleaseSnapshot(snapPin)
s.InvalidateSession("wal_pin_failed_during_rebuild", StateNeedsRebuild)
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "wal_pin_failed", err.Error())
return nil, fmt.Errorf("WAL retention pin failed: %w", err)
}

77
sw-block/engine/replication/validation_test.go

@@ -99,7 +99,9 @@ func TestP3_E1_ChangedAddress_OldPlanCancelledByDriver(t *testing.T) {
// --- E2 / FC2: Epoch bump during active executor step ---
func TestP3_E2_EpochBump_MidExecutorStep(t *testing.T) {
func TestP3_E2_EpochBump_AfterExecutorProgress(t *testing.T) {
// True mid-execution: executor makes progress, THEN epoch bumps,
// THEN next step fails.
storage := newMockStorage(RetainedHistory{
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100,
})
@@ -114,25 +116,39 @@ func TestP3_E2_EpochBump_MidExecutorStep(t *testing.T) {
})
plan, _ := driver.PlanRecovery("r1", 50)
exec := NewCatchUpExecutor(driver, plan)
s := driver.Orchestrator.Registry.Sender("r1")
sessID := plan.SessionID
// Manually drive the executor steps to place the epoch bump BETWEEN steps.
// Step 1: begin catch-up.
s.BeginCatchUp(sessID, 0)
// Step 2: first progress step succeeds.
s.RecordCatchUpProgress(sessID, 60, 1)
// Step 3: second progress step succeeds.
s.RecordCatchUpProgress(sessID, 70, 2)
// Epoch bumps BEFORE executor runs — simulates bump between plan and execute.
// The executor's mid-step check will detect the invalidation.
// EPOCH BUMPS between progress steps (real mid-execution).
driver.Orchestrator.InvalidateEpoch(2)
driver.Orchestrator.UpdateSenderEpoch("r1", 2)
// Executor detects invalidation at first progress step.
err := exec.Execute([]uint64{60, 70, 80, 90, 100}, 0)
// Step 4: third progress step fails — session invalidated.
err := s.RecordCatchUpProgress(sessID, 80, 3)
if err == nil {
t.Fatal("E2: executor should fail on invalidated session")
t.Fatal("E2: progress after mid-execution epoch bump must fail")
}
// Resources released by executor.
// Executor cancel releases resources.
exec := NewCatchUpExecutor(driver, plan)
exec.Cancel("epoch_bump_after_progress")
// WAL pin released.
if len(storage.pinnedWAL) != 0 {
t.Fatal("E2: WAL pin must be released after epoch-bump invalidation")
t.Fatal("E2: WAL pin must be released after mid-execution epoch bump")
}
// E5: Log shows invalidation + resource release.
// E5: Log shows per-replica invalidation + resource release.
hasInvalidation := false
hasRelease := false
for _, e := range driver.Orchestrator.Log.EventsFor("r1") {
@@ -144,10 +160,47 @@ func TestP3_E2_EpochBump_MidExecutorStep(t *testing.T) {
}
}
if !hasInvalidation {
t.Fatal("E2/E5: log must show session invalidation")
t.Fatal("E2/E5: log must show session invalidation with epoch cause")
}
if !hasRelease {
t.Fatal("E2/E5: log must show resource release")
t.Fatal("E2/E5: log must show resource release on cancellation")
}
}
// TestP3_E2_RebuildWALPinFailure_SessionCleaned verifies the fail-closed path in
// PlanRebuild: when the snapshot pin is acquired but the subsequent WAL retention
// pin is refused, the driver must return an error, invalidate the session back to
// StateNeedsRebuild, and release the snapshot pin so no resource leaks remain.
func TestP3_E2_RebuildWALPinFailure_SessionCleaned(t *testing.T) {
// Snapshot+tail rebuild: snapshot pin succeeds, WAL pin fails.
// History has a trusted checkpoint at LSN 50 with tail retained from LSN 30,
// which routes planning down the snapshot+tail rebuild path.
storage := newMockStorage(RetainedHistory{
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
CheckpointLSN: 50, CheckpointTrusted: true,
})
// Force PinWALRetention to fail while leaving the snapshot pin able to succeed.
storage.failWALPin = true
driver := NewRecoveryDriver(storage)
// Register r1 under epoch 1 with an explicit rebuild recovery target.
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
Replicas: []ReplicaAssignment{
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
},
Epoch: 1,
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
})
_, err := driver.PlanRebuild("r1")
if err == nil {
t.Fatal("should fail when WAL pin refused during snapshot+tail rebuild")
}
// Session must be invalidated — no dangling rebuild session.
s := driver.Orchestrator.Registry.Sender("r1")
if s.HasActiveSession() {
t.Fatal("session must be invalidated after WAL pin failure in rebuild")
}
// Fail-closed: the sender is returned to needs_rebuild so a later pass retries.
if s.State() != StateNeedsRebuild {
t.Fatalf("state=%s, want needs_rebuild", s.State())
}
// Snapshot pin must be released (no leak).
// The WAL pin never succeeded, so only the snapshot pin could have leaked.
if len(storage.pinnedSnaps) != 0 {
t.Fatal("snapshot pin must be released after WAL pin failure")
}
}

Loading…
Cancel
Save