fix: rebuild executor consumes bound plan, fix catch-up timing

Planner/executor contract: - RebuildExecutor.Execute() takes no arguments — consumes plan-bound RebuildSource, RebuildSnapshotLSN, RebuildTargetLSN - RecoveryPlan binds all rebuild targets at plan time - Executor cannot re-derive policy from caller-supplied history Catch-up timing: - Removed unused completeTick parameter from CatchUpExecutor.Execute - Per-step ticks synthesized as startTick + stepIndex + 1 - API shape matches implementation New test: PlanExecuteConsistency_RebuildCannotSwitchSource - Plans snapshot+tail, then mutates storage history - Executor succeeds using plan-bound values (not re-derived) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 day ago · f5c0aab454
3 changed files with 80 additions and 35 deletions
--- a/sw-block/engine/replication/driver.go
+++ b/sw-block/engine/replication/driver.go
@ -23,6 +23,7 @@ import "fmt"
 // a stepwise execution interface.

 // RecoveryPlan represents a planned recovery operation with acquired resources.
+// The executor consumes this plan — it does not re-derive policy.
 type RecoveryPlan struct {
 	ReplicaID    string
 	SessionID    uint64
@ -34,10 +35,14 @@ type RecoveryPlan struct {
 	SnapshotPin  *SnapshotPin  // for snapshot+tail rebuild
 	FullBasePin  *FullBasePin  // for full-base rebuild

-	// Targets.
-	CatchUpTarget uint64 // for catch-up: target LSN
+	// Catch-up targets (bound at plan time).
+	CatchUpTarget uint64 // for catch-up: committed LSN at plan time
 	TruncateLSN   uint64 // non-zero if truncation required
-	RebuildSource RebuildSource
+
+	// Rebuild targets (bound at plan time).
+	RebuildSource     RebuildSource
+	RebuildSnapshotLSN uint64 // for snapshot+tail: the snapshot LSN
+	RebuildTargetLSN   uint64 // committed LSN at plan time
 }

 // RecoveryDriver plans and executes recovery operations using real
@ -138,10 +143,12 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
 	source, snapLSN := history.RebuildSourceDecision()

 	plan := &RecoveryPlan{
-		ReplicaID:     replicaID,
-		SessionID:     sessID,
-		Outcome:       OutcomeNeedsRebuild,
-		RebuildSource: source,
+		ReplicaID:          replicaID,
+		SessionID:          sessID,
+		Outcome:            OutcomeNeedsRebuild,
+		RebuildSource:      source,
+		RebuildSnapshotLSN: snapLSN,
+		RebuildTargetLSN:   history.CommittedLSN,
 	}

 	if source == RebuildSnapshotTail {
--- a/sw-block/engine/replication/executor.go
+++ b/sw-block/engine/replication/executor.go
@ -36,8 +36,10 @@ func NewCatchUpExecutor(driver *RecoveryDriver, plan *RecoveryPlan) *CatchUpExec
 //   4. CompleteSessionByID
 //   5. Release resources
 //
+// progressLSNs are the stepwise LSN targets. startTick is the time when
+// catch-up begins. Each step is assigned tick = startTick + stepIndex + 1.
 // On any failure or cancellation, resources are released before returning.
-func (e *CatchUpExecutor) Execute(progressLSNs []uint64, startTick, completeTick uint64) error {
+func (e *CatchUpExecutor) Execute(progressLSNs []uint64, startTick uint64) error {
 	s := e.driver.Orchestrator.Registry.Sender(e.replicaID)
 	if s == nil {
 		e.release("sender_not_found")
@ -135,39 +137,45 @@ func NewRebuildExecutor(driver *RecoveryDriver, plan *RecoveryPlan) *RebuildExec
 	}
 }

-// Execute runs the full rebuild lifecycle stepwise:
-//   1. BeginConnect + RecordHandshake
-//   2. SelectRebuildFromHistory
-//   3. BeginRebuildTransfer + progress steps
-//   4. BeginRebuildTailReplay + progress steps (snapshot+tail only)
+// Execute runs the full rebuild lifecycle from the bound plan.
+// Does NOT re-derive policy from caller-supplied history — uses
+// plan.RebuildSource, plan.RebuildSnapshotLSN, plan.RebuildTargetLSN.
+//
+// Steps:
+//   1. BeginConnect + RecordHandshake (target from plan)
+//   2. SelectRebuildSource (source/snapshot from plan)
+//   3. BeginRebuildTransfer + progress
+//   4. BeginRebuildTailReplay + progress (snapshot+tail only)
 //   5. CompleteRebuild
 //   6. Release resources
-func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
+func (e *RebuildExecutor) Execute() error {
 	s := e.driver.Orchestrator.Registry.Sender(e.replicaID)
 	if s == nil {
 		e.release("sender_not_found")
 		return fmt.Errorf("sender %q not found", e.replicaID)
 	}

-	// Step 1: connect + handshake.
+	plan := e.plan
+
+	// Step 1: connect + handshake with plan-bound target.
 	if err := s.BeginConnect(e.sessID); err != nil {
 		e.release(fmt.Sprintf("connect_failed: %s", err))
 		return err
 	}
-	if err := s.RecordHandshake(e.sessID, 0, history.CommittedLSN); err != nil {
+	if err := s.RecordHandshake(e.sessID, 0, plan.RebuildTargetLSN); err != nil {
 		e.release(fmt.Sprintf("handshake_failed: %s", err))
 		return err
 	}

-	// Step 2: select source from history.
-	if err := s.SelectRebuildFromHistory(e.sessID, history); err != nil {
+	// Step 2: select source from plan-bound values (not re-derived).
+	valid := plan.RebuildSource == RebuildSnapshotTail
+	if err := s.SelectRebuildSource(e.sessID, plan.RebuildSnapshotLSN, valid, plan.RebuildTargetLSN); err != nil {
 		e.release(fmt.Sprintf("source_select_failed: %s", err))
 		return err
 	}

-	source, snapLSN := history.RebuildSourceDecision()
 	e.driver.Orchestrator.Log.Record(e.replicaID, e.sessID, "exec_rebuild_started",
-		fmt.Sprintf("source=%s", source))
+		fmt.Sprintf("source=%s target=%d", plan.RebuildSource, plan.RebuildTargetLSN))

 	// Step 3: transfer.
 	if err := s.BeginRebuildTransfer(e.sessID); err != nil {
@ -175,9 +183,8 @@ func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
 		return err
 	}

-	if source == RebuildSnapshotTail {
-		// Transfer snapshot base.
-		if err := s.RecordRebuildTransferProgress(e.sessID, snapLSN); err != nil {
+	if plan.RebuildSource == RebuildSnapshotTail {
+		if err := s.RecordRebuildTransferProgress(e.sessID, plan.RebuildSnapshotLSN); err != nil {
 			e.release(fmt.Sprintf("transfer_progress_failed: %s", err))
 			return err
 		}
@ -193,13 +200,12 @@ func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
 			e.release(fmt.Sprintf("tail_replay_failed: %s", err))
 			return err
 		}
-		if err := s.RecordRebuildTailProgress(e.sessID, history.CommittedLSN); err != nil {
+		if err := s.RecordRebuildTailProgress(e.sessID, plan.RebuildTargetLSN); err != nil {
 			e.release(fmt.Sprintf("tail_progress_failed: %s", err))
 			return err
 		}
 	} else {
-		// Full base transfer.
-		if err := s.RecordRebuildTransferProgress(e.sessID, history.CommittedLSN); err != nil {
+		if err := s.RecordRebuildTransferProgress(e.sessID, plan.RebuildTargetLSN); err != nil {
 			e.release(fmt.Sprintf("transfer_progress_failed: %s", err))
 			return err
 		}
--- a/sw-block/engine/replication/executor_test.go
+++ b/sw-block/engine/replication/executor_test.go
@ -79,7 +79,7 @@ func TestExecutor_E1_PartialCatchUp_ProgressFailure_ReleasesWAL(t *testing.T) {
 	exec := NewCatchUpExecutor(driver, plan)

 	// Progress with LSN that will exceed frozen target (target=100, try 101).
-	err := exec.Execute([]uint64{80, 90, 101}, 0, 0)
+	err := exec.Execute([]uint64{80, 90, 101}, 0)
 	if err == nil {
 		t.Fatal("should fail on progress beyond frozen target")
 	}
@ -124,7 +124,7 @@ func TestExecutor_E1_BudgetEscalation_ReleasesWAL(t *testing.T) {
 	exec := NewCatchUpExecutor(driver, plan)

 	// Execute with ticks that exceed budget (3 steps, each +1 tick, budget=3).
-	err := exec.Execute([]uint64{600, 700, 800, 900}, 0, 0)
+	err := exec.Execute([]uint64{600, 700, 800, 900}, 0)
 	if err == nil {
 		t.Fatal("should escalate on budget")
 	}
@ -153,7 +153,7 @@ func TestExecutor_E2_PartialRebuild_TransferFailure_ReleasesAll(t *testing.T) {
 	// Invalidate session before execution → will fail at connect.
 	driver.Orchestrator.Registry.Sender("r1").UpdateEpoch(2)

-	err := exec.Execute(&storage.history)
+	err := exec.Execute()
 	if err == nil {
 		t.Fatal("should fail on invalidated session")
 	}
@ -178,7 +178,7 @@ func TestExecutor_E3_EpochBump_MidCatchUp_ReleasesWAL(t *testing.T) {
 	driver.Orchestrator.UpdateSenderEpoch("r1", 2)

 	exec := NewCatchUpExecutor(driver, plan)
-	err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
+	err := exec.Execute([]uint64{80, 90, 100}, 0)
 	if err == nil {
 		t.Fatal("should fail on stale session")
 	}
@ -228,7 +228,7 @@ func TestExecutor_E4_SuccessfulCatchUp_ReleasesWAL(t *testing.T) {
 	storage := driver.Storage.(*mockStorage)

 	exec := NewCatchUpExecutor(driver, plan)
-	err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
+	err := exec.Execute([]uint64{80, 90, 100}, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
@ -245,7 +245,7 @@ func TestExecutor_E4_SuccessfulRebuild_ReleasesAll(t *testing.T) {
 	driver, plan, storage := setupRebuildDriver(t)

 	exec := NewRebuildExecutor(driver, plan)
-	err := exec.Execute(&storage.history)
+	err := exec.Execute()
 	if err != nil {
 		t.Fatal(err)
 	}
@ -258,13 +258,45 @@ func TestExecutor_E4_SuccessfulRebuild_ReleasesAll(t *testing.T) {
 	}
 }

+// --- Plan/execute consistency ---
+
+func TestExecutor_PlanExecuteConsistency_RebuildCannotSwitchSource(t *testing.T) {
+	// Plan acquires snapshot+tail resources.
+	// Prove the executor uses plan-bound source, not re-derived policy.
+	driver, plan, storage := setupRebuildDriver(t)
+
+	if plan.RebuildSource != RebuildSnapshotTail {
+		t.Fatalf("plan source=%s", plan.RebuildSource)
+	}
+	if plan.RebuildSnapshotLSN != 50 {
+		t.Fatalf("plan snapshot=%d", plan.RebuildSnapshotLSN)
+	}
+	if plan.RebuildTargetLSN != 100 {
+		t.Fatalf("plan target=%d", plan.RebuildTargetLSN)
+	}
+
+	// Even if storage history changes after planning, executor uses plan.
+	storage.history.CheckpointTrusted = false // would cause full_base if re-derived
+	storage.history.TailLSN = 80              // would make snapshot unreplayable
+
+	exec := NewRebuildExecutor(driver, plan)
+	err := exec.Execute() // uses plan-bound values, not storage.history
+	if err != nil {
+		t.Fatalf("execution should succeed from plan-bound values: %v", err)
+	}
+
+	if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync {
+		t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State())
+	}
+}
+
 // --- E5: Executor drives sender APIs stepwise ---

 func TestExecutor_E5_CatchUp_StepwiseNotConvenience(t *testing.T) {
 	driver, plan := setupCatchUpDriver(t, 70)

 	exec := NewCatchUpExecutor(driver, plan)
-	err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
+	err := exec.Execute([]uint64{80, 90, 100}, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
@ -287,10 +319,10 @@ func TestExecutor_E5_CatchUp_StepwiseNotConvenience(t *testing.T) {
 }

 func TestExecutor_E5_Rebuild_StepwiseNotConvenience(t *testing.T) {
-	driver, plan, storage := setupRebuildDriver(t)
+	driver, plan, _ := setupRebuildDriver(t)

 	exec := NewRebuildExecutor(driver, plan)
-	err := exec.Execute(&storage.history)
+	err := exec.Execute()
 	if err != nil {
 		t.Fatal(err)
 	}