Browse Source

fix: rebuild executor consumes bound plan, fix catch-up timing

Planner/executor contract:
- RebuildExecutor.Execute() takes no arguments — consumes plan-bound
  RebuildSource, RebuildSnapshotLSN, RebuildTargetLSN
- RecoveryPlan binds all rebuild targets at plan time
- Executor cannot re-derive policy from caller-supplied history

Catch-up timing:
- Removed unused completeTick parameter from CatchUpExecutor.Execute
- Per-step ticks synthesized as startTick + stepIndex + 1
- API shape matches implementation

New test: PlanExecuteConsistency_RebuildCannotSwitchSource
- Plans snapshot+tail, then mutates storage history
- Executor succeeds using plan-bound values (not re-derived)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
feature/sw-block
pingqiu 1 day ago
parent
commit
f5c0aab454
  1. 21
      sw-block/engine/replication/driver.go
  2. 44
      sw-block/engine/replication/executor.go
  3. 50
      sw-block/engine/replication/executor_test.go

21
sw-block/engine/replication/driver.go

@ -23,6 +23,7 @@ import "fmt"
// a stepwise execution interface.
// RecoveryPlan represents a planned recovery operation with acquired resources.
// The executor consumes this plan — it does not re-derive policy.
type RecoveryPlan struct {
ReplicaID string
SessionID uint64
@ -34,10 +35,14 @@ type RecoveryPlan struct {
SnapshotPin *SnapshotPin // for snapshot+tail rebuild
FullBasePin *FullBasePin // for full-base rebuild
// Targets.
CatchUpTarget uint64 // for catch-up: target LSN
// Catch-up targets (bound at plan time).
CatchUpTarget uint64 // for catch-up: committed LSN at plan time
TruncateLSN uint64 // non-zero if truncation required
RebuildSource RebuildSource
// Rebuild targets (bound at plan time).
RebuildSource RebuildSource
RebuildSnapshotLSN uint64 // for snapshot+tail: the snapshot LSN
RebuildTargetLSN uint64 // committed LSN at plan time
}
// RecoveryDriver plans and executes recovery operations using real
@ -138,10 +143,12 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
source, snapLSN := history.RebuildSourceDecision()
plan := &RecoveryPlan{
ReplicaID: replicaID,
SessionID: sessID,
Outcome: OutcomeNeedsRebuild,
RebuildSource: source,
ReplicaID: replicaID,
SessionID: sessID,
Outcome: OutcomeNeedsRebuild,
RebuildSource: source,
RebuildSnapshotLSN: snapLSN,
RebuildTargetLSN: history.CommittedLSN,
}
if source == RebuildSnapshotTail {

44
sw-block/engine/replication/executor.go

@ -36,8 +36,10 @@ func NewCatchUpExecutor(driver *RecoveryDriver, plan *RecoveryPlan) *CatchUpExec
// 4. CompleteSessionByID
// 5. Release resources
//
// progressLSNs are the stepwise LSN targets. startTick is the time when
// catch-up begins. Each step is assigned tick = startTick + stepIndex + 1.
// On any failure or cancellation, resources are released before returning.
func (e *CatchUpExecutor) Execute(progressLSNs []uint64, startTick, completeTick uint64) error {
func (e *CatchUpExecutor) Execute(progressLSNs []uint64, startTick uint64) error {
s := e.driver.Orchestrator.Registry.Sender(e.replicaID)
if s == nil {
e.release("sender_not_found")
@ -135,39 +137,45 @@ func NewRebuildExecutor(driver *RecoveryDriver, plan *RecoveryPlan) *RebuildExec
}
}
// Execute runs the full rebuild lifecycle stepwise:
// 1. BeginConnect + RecordHandshake
// 2. SelectRebuildFromHistory
// 3. BeginRebuildTransfer + progress steps
// 4. BeginRebuildTailReplay + progress steps (snapshot+tail only)
// Execute runs the full rebuild lifecycle from the bound plan.
// Does NOT re-derive policy from caller-supplied history — uses
// plan.RebuildSource, plan.RebuildSnapshotLSN, plan.RebuildTargetLSN.
//
// Steps:
// 1. BeginConnect + RecordHandshake (target from plan)
// 2. SelectRebuildSource (source/snapshot from plan)
// 3. BeginRebuildTransfer + progress
// 4. BeginRebuildTailReplay + progress (snapshot+tail only)
// 5. CompleteRebuild
// 6. Release resources
func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
func (e *RebuildExecutor) Execute() error {
s := e.driver.Orchestrator.Registry.Sender(e.replicaID)
if s == nil {
e.release("sender_not_found")
return fmt.Errorf("sender %q not found", e.replicaID)
}
// Step 1: connect + handshake.
plan := e.plan
// Step 1: connect + handshake with plan-bound target.
if err := s.BeginConnect(e.sessID); err != nil {
e.release(fmt.Sprintf("connect_failed: %s", err))
return err
}
if err := s.RecordHandshake(e.sessID, 0, history.CommittedLSN); err != nil {
if err := s.RecordHandshake(e.sessID, 0, plan.RebuildTargetLSN); err != nil {
e.release(fmt.Sprintf("handshake_failed: %s", err))
return err
}
// Step 2: select source from history.
if err := s.SelectRebuildFromHistory(e.sessID, history); err != nil {
// Step 2: select source from plan-bound values (not re-derived).
valid := plan.RebuildSource == RebuildSnapshotTail
if err := s.SelectRebuildSource(e.sessID, plan.RebuildSnapshotLSN, valid, plan.RebuildTargetLSN); err != nil {
e.release(fmt.Sprintf("source_select_failed: %s", err))
return err
}
source, snapLSN := history.RebuildSourceDecision()
e.driver.Orchestrator.Log.Record(e.replicaID, e.sessID, "exec_rebuild_started",
fmt.Sprintf("source=%s", source))
fmt.Sprintf("source=%s target=%d", plan.RebuildSource, plan.RebuildTargetLSN))
// Step 3: transfer.
if err := s.BeginRebuildTransfer(e.sessID); err != nil {
@ -175,9 +183,8 @@ func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
return err
}
if source == RebuildSnapshotTail {
// Transfer snapshot base.
if err := s.RecordRebuildTransferProgress(e.sessID, snapLSN); err != nil {
if plan.RebuildSource == RebuildSnapshotTail {
if err := s.RecordRebuildTransferProgress(e.sessID, plan.RebuildSnapshotLSN); err != nil {
e.release(fmt.Sprintf("transfer_progress_failed: %s", err))
return err
}
@ -193,13 +200,12 @@ func (e *RebuildExecutor) Execute(history *RetainedHistory) error {
e.release(fmt.Sprintf("tail_replay_failed: %s", err))
return err
}
if err := s.RecordRebuildTailProgress(e.sessID, history.CommittedLSN); err != nil {
if err := s.RecordRebuildTailProgress(e.sessID, plan.RebuildTargetLSN); err != nil {
e.release(fmt.Sprintf("tail_progress_failed: %s", err))
return err
}
} else {
// Full base transfer.
if err := s.RecordRebuildTransferProgress(e.sessID, history.CommittedLSN); err != nil {
if err := s.RecordRebuildTransferProgress(e.sessID, plan.RebuildTargetLSN); err != nil {
e.release(fmt.Sprintf("transfer_progress_failed: %s", err))
return err
}

50
sw-block/engine/replication/executor_test.go

@ -79,7 +79,7 @@ func TestExecutor_E1_PartialCatchUp_ProgressFailure_ReleasesWAL(t *testing.T) {
exec := NewCatchUpExecutor(driver, plan)
// Progress with LSN that will exceed frozen target (target=100, try 101).
err := exec.Execute([]uint64{80, 90, 101}, 0, 0)
err := exec.Execute([]uint64{80, 90, 101}, 0)
if err == nil {
t.Fatal("should fail on progress beyond frozen target")
}
@ -124,7 +124,7 @@ func TestExecutor_E1_BudgetEscalation_ReleasesWAL(t *testing.T) {
exec := NewCatchUpExecutor(driver, plan)
// Execute with ticks that exceed budget (3 steps, each +1 tick, budget=3).
err := exec.Execute([]uint64{600, 700, 800, 900}, 0, 0)
err := exec.Execute([]uint64{600, 700, 800, 900}, 0)
if err == nil {
t.Fatal("should escalate on budget")
}
@ -153,7 +153,7 @@ func TestExecutor_E2_PartialRebuild_TransferFailure_ReleasesAll(t *testing.T) {
// Invalidate session before execution → will fail at connect.
driver.Orchestrator.Registry.Sender("r1").UpdateEpoch(2)
err := exec.Execute(&storage.history)
err := exec.Execute()
if err == nil {
t.Fatal("should fail on invalidated session")
}
@ -178,7 +178,7 @@ func TestExecutor_E3_EpochBump_MidCatchUp_ReleasesWAL(t *testing.T) {
driver.Orchestrator.UpdateSenderEpoch("r1", 2)
exec := NewCatchUpExecutor(driver, plan)
err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
err := exec.Execute([]uint64{80, 90, 100}, 0)
if err == nil {
t.Fatal("should fail on stale session")
}
@ -228,7 +228,7 @@ func TestExecutor_E4_SuccessfulCatchUp_ReleasesWAL(t *testing.T) {
storage := driver.Storage.(*mockStorage)
exec := NewCatchUpExecutor(driver, plan)
err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
err := exec.Execute([]uint64{80, 90, 100}, 0)
if err != nil {
t.Fatal(err)
}
@ -245,7 +245,7 @@ func TestExecutor_E4_SuccessfulRebuild_ReleasesAll(t *testing.T) {
driver, plan, storage := setupRebuildDriver(t)
exec := NewRebuildExecutor(driver, plan)
err := exec.Execute(&storage.history)
err := exec.Execute()
if err != nil {
t.Fatal(err)
}
@ -258,13 +258,45 @@ func TestExecutor_E4_SuccessfulRebuild_ReleasesAll(t *testing.T) {
}
}
// --- Plan/execute consistency ---
func TestExecutor_PlanExecuteConsistency_RebuildCannotSwitchSource(t *testing.T) {
// Plan acquires snapshot+tail resources.
// Prove the executor uses plan-bound source, not re-derived policy.
driver, plan, storage := setupRebuildDriver(t)
if plan.RebuildSource != RebuildSnapshotTail {
t.Fatalf("plan source=%s", plan.RebuildSource)
}
if plan.RebuildSnapshotLSN != 50 {
t.Fatalf("plan snapshot=%d", plan.RebuildSnapshotLSN)
}
if plan.RebuildTargetLSN != 100 {
t.Fatalf("plan target=%d", plan.RebuildTargetLSN)
}
// Even if storage history changes after planning, executor uses plan.
storage.history.CheckpointTrusted = false // would cause full_base if re-derived
storage.history.TailLSN = 80 // would make snapshot unreplayable
exec := NewRebuildExecutor(driver, plan)
err := exec.Execute() // uses plan-bound values, not storage.history
if err != nil {
t.Fatalf("execution should succeed from plan-bound values: %v", err)
}
if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync {
t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State())
}
}
// --- E5: Executor drives sender APIs stepwise ---
func TestExecutor_E5_CatchUp_StepwiseNotConvenience(t *testing.T) {
driver, plan := setupCatchUpDriver(t, 70)
exec := NewCatchUpExecutor(driver, plan)
err := exec.Execute([]uint64{80, 90, 100}, 0, 0)
err := exec.Execute([]uint64{80, 90, 100}, 0)
if err != nil {
t.Fatal(err)
}
@ -287,10 +319,10 @@ func TestExecutor_E5_CatchUp_StepwiseNotConvenience(t *testing.T) {
}
func TestExecutor_E5_Rebuild_StepwiseNotConvenience(t *testing.T) {
driver, plan, storage := setupRebuildDriver(t)
driver, plan, _ := setupRebuildDriver(t)
exec := NewRebuildExecutor(driver, plan)
err := exec.Execute(&storage.history)
err := exec.Execute()
if err != nil {
t.Fatal(err)
}

Loading…
Cancel
Save