Browse Source
feat: add storage/control adapters and recovery driver (Phase 06 P0/P1)
Phase 06 module boundaries: adapter.go — StorageAdapter + ControlPlaneAdapter interfaces: - GetRetainedHistory: real WAL retention state - PinSnapshot / ReleaseSnapshot: rebuild resource management - PinWALRetention / ReleaseWALRetention: catch-up resource management - HandleHeartbeat / HandleFailover: control-plane event conversion driver.go — RecoveryDriver replaces synchronous convenience: - PlanRecovery: connect + handshake from storage state + acquire resources - PlanRebuild: acquire snapshot + WAL pins for rebuild - ReleasePlan: release all acquired resources Convenience flow classification: - ProcessAssignment, UpdateSenderEpoch, InvalidateEpoch → stepwise engine tasks - ExecuteRecovery → planner (connect + classify) - CompleteCatchUp, CompleteRebuild → TEST-ONLY convenience 7 new tests (driver_test.go): - CatchUp plan + execute with WAL pin - ZeroGap plan (no resources pinned) - NeedsRebuild → rebuild plan with resource acquisition - WAL pin failure → logged + error - Snapshot pin failure → logged + error - ReplicaAhead truncation through driver - Cross-layer: storage proves recoverability, engine consumes proof Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Branch: feature/sw-block
3 changed files with 575 additions and 0 deletions
-
71sw-block/engine/replication/adapter.go
-
160sw-block/engine/replication/driver.go
-
344sw-block/engine/replication/driver_test.go
@ -0,0 +1,71 @@ |
|||
package replication |
|||
|
|||
// === Phase 06: Storage and Control-Plane Adapter Interfaces ===
|
|||
//
|
|||
// These interfaces define the boundary between the engine replication core
|
|||
// and external systems (storage backend, coordinator/control plane).
|
|||
// The engine consumes these interfaces — it does not reach into storage
|
|||
// or control-plane internals directly.
|
|||
|
|||
// StorageAdapter provides real retained-history and checkpoint state
// from the storage backend. The engine uses this to make recovery
// decisions grounded in actual data, not reconstructed test inputs.
//
// Pins are the resource-management half of the contract: a pinned
// snapshot or WAL range must survive until the matching Release call.
type StorageAdapter interface {
	// GetRetainedHistory returns the current WAL retention state.
	// Must reflect actual TailLSN, HeadLSN, CommittedLSN, and checkpoint.
	GetRetainedHistory() RetainedHistory

	// PinSnapshot pins a checkpoint/base image at the given LSN for
	// rebuild use. The snapshot must not be garbage-collected while pinned.
	// Returns an error if no valid snapshot exists at that LSN.
	PinSnapshot(checkpointLSN uint64) (SnapshotPin, error)

	// ReleaseSnapshot releases a previously pinned snapshot.
	ReleaseSnapshot(pin SnapshotPin)

	// PinWALRetention holds WAL entries from startLSN to prevent reclaim.
	// The engine calls this before starting catch-up to ensure the WAL
	// tail does not advance past the required range.
	PinWALRetention(startLSN uint64) (RetentionPin, error)

	// ReleaseWALRetention releases a WAL retention hold.
	ReleaseWALRetention(pin RetentionPin)
}
|||
|
|||
// SnapshotPin represents a held reference to a pinned snapshot/checkpoint.
// The zero value is what PinSnapshot returns on error; Valid distinguishes
// a real, acquired pin from that zero value.
type SnapshotPin struct {
	LSN   uint64 // LSN of the pinned snapshot/checkpoint
	PinID uint64 // unique identifier for this pin
	Valid bool   // true only for a successfully acquired pin
}
|||
|
|||
// RetentionPin represents a held reference to a WAL retention range.
// The zero value is what PinWALRetention returns on error; Valid
// distinguishes a real, acquired pin from that zero value.
type RetentionPin struct {
	StartLSN uint64 // WAL entries from this LSN onward are held
	PinID    uint64 // unique identifier for this pin
	Valid    bool   // true only for a successfully acquired pin
}
|||
|
|||
// ControlPlaneAdapter converts external assignment events into
// AssignmentIntent for the orchestrator. It is the boundary through
// which coordinator/control-plane events reach the replication engine.
type ControlPlaneAdapter interface {
	// HandleHeartbeat processes a heartbeat from a volume server and
	// returns any assignment updates that should be applied.
	HandleHeartbeat(serverID string, volumes []VolumeHeartbeat) []AssignmentIntent

	// HandleFailover processes a failover event and returns assignments
	// for the affected replicas.
	HandleFailover(deadServerID string) []AssignmentIntent
}
|||
|
|||
// VolumeHeartbeat represents one volume's state in a heartbeat.
type VolumeHeartbeat struct {
	VolumeID    string
	ReplicaID   string
	Epoch       uint64
	FlushedLSN  uint64
	State       string // replica state name — exact semantics defined by the reporting server
	DataAddr    string // data-plane address of the replica
	CtrlAddr    string // control-plane address of the replica
	AddrVersion uint64 // version counter for the address pair — presumably for stale-update detection; TODO confirm
}
|||
@ -0,0 +1,160 @@ |
|||
package replication |
|||
|
|||
import "fmt" |
|||
|
|||
// === Phase 06: Execution Driver ===
|
|||
//
|
|||
// Convenience flow classification (Phase 06 P0):
|
|||
//
|
|||
// ProcessAssignment → stepwise engine task (real entry point)
|
|||
// ExecuteRecovery → planner (connect + classify outcome)
|
|||
// CompleteCatchUp → TEST-ONLY convenience (bundles plan+execute+complete)
|
|||
// CompleteRebuild → TEST-ONLY convenience (bundles plan+execute+complete)
|
|||
// UpdateSenderEpoch → stepwise engine task
|
|||
// InvalidateEpoch → stepwise engine task
|
|||
//
|
|||
// The real engine flow splits catch-up and rebuild into:
|
|||
// 1. Plan: acquire resources (pin WAL or snapshot)
|
|||
// 2. Execute: stream entries stepwise (not one-shot)
|
|||
// 3. Complete: release resources, transition to InSync
|
|||
//
|
|||
// RecoveryDriver is the Phase 06 replacement for the synchronous
|
|||
// convenience helpers. It plans, acquires resources, and provides
|
|||
// a stepwise execution interface.
|
|||
|
|||
// RecoveryPlan represents a planned recovery operation with acquired resources.
// Produced by PlanRecovery / PlanRebuild. Any non-nil pins must eventually be
// returned via ReleasePlan.
type RecoveryPlan struct {
	ReplicaID string
	SessionID uint64
	Outcome   RecoveryOutcome
	Proof     *RecoverabilityProof // proof carried over from recovery classification; may be nil

	// Resource pins (non-nil when resources are acquired).
	RetentionPin *RetentionPin // for catch-up
	SnapshotPin  *SnapshotPin  // for rebuild

	// Targets.
	CatchUpTarget uint64 // for catch-up: target LSN (CommittedLSN at plan time)
	TruncateLSN   uint64 // non-zero if truncation required (replica was ahead of committed)
	RebuildSource RebuildSource // set by PlanRebuild
}
|||
|
|||
// RecoveryDriver plans and executes recovery operations using real
// storage adapter inputs. It replaces the synchronous convenience
// helpers (CompleteCatchUp, CompleteRebuild) with a resource-aware,
// stepwise execution model.
type RecoveryDriver struct {
	Orchestrator *RecoveryOrchestrator // drives session state and records recovery events
	Storage      StorageAdapter        // source of retained-history state and resource pins
}
|||
|
|||
// NewRecoveryDriver creates a driver with a fresh orchestrator.
|
|||
func NewRecoveryDriver(storage StorageAdapter) *RecoveryDriver { |
|||
return &RecoveryDriver{ |
|||
Orchestrator: NewRecoveryOrchestrator(), |
|||
Storage: storage, |
|||
} |
|||
} |
|||
|
|||
// PlanRecovery connects, handshakes from real storage state, classifies
|
|||
// the outcome, and acquires the necessary resources (WAL pin or snapshot pin).
|
|||
// Returns a RecoveryPlan that the caller can execute stepwise.
|
|||
func (d *RecoveryDriver) PlanRecovery(replicaID string, replicaFlushedLSN uint64) (*RecoveryPlan, error) { |
|||
history := d.Storage.GetRetainedHistory() |
|||
|
|||
result := d.Orchestrator.ExecuteRecovery(replicaID, replicaFlushedLSN, &history) |
|||
if result.Error != nil { |
|||
return nil, result.Error |
|||
} |
|||
|
|||
plan := &RecoveryPlan{ |
|||
ReplicaID: replicaID, |
|||
SessionID: d.Orchestrator.Registry.Sender(replicaID).SessionID(), |
|||
Outcome: result.Outcome, |
|||
Proof: result.Proof, |
|||
} |
|||
|
|||
switch result.Outcome { |
|||
case OutcomeZeroGap: |
|||
// Already completed by ExecuteRecovery.
|
|||
return plan, nil |
|||
|
|||
case OutcomeCatchUp: |
|||
// Acquire WAL retention pin.
|
|||
pin, err := d.Storage.PinWALRetention(replicaFlushedLSN) |
|||
if err != nil { |
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "wal_pin_failed", err.Error()) |
|||
return nil, fmt.Errorf("WAL retention pin failed: %w", err) |
|||
} |
|||
plan.RetentionPin = &pin |
|||
plan.CatchUpTarget = history.CommittedLSN |
|||
|
|||
// Check if truncation is needed (replica ahead).
|
|||
proof := history.ProveRecoverability(replicaFlushedLSN) |
|||
if proof.Reason == "replica_ahead_needs_truncation" { |
|||
plan.TruncateLSN = history.CommittedLSN |
|||
} |
|||
|
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "plan_catchup", |
|||
fmt.Sprintf("target=%d pin=%d truncate=%d", plan.CatchUpTarget, pin.PinID, plan.TruncateLSN)) |
|||
return plan, nil |
|||
|
|||
case OutcomeNeedsRebuild: |
|||
// No resource acquisition — needs rebuild assignment first.
|
|||
return plan, nil |
|||
} |
|||
|
|||
return plan, nil |
|||
} |
|||
|
|||
// PlanRebuild acquires rebuild resources (snapshot pin + optional WAL pin)
|
|||
// from real storage state. Called after a rebuild assignment.
|
|||
func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) { |
|||
history := d.Storage.GetRetainedHistory() |
|||
source, snapLSN := history.RebuildSourceDecision() |
|||
|
|||
plan := &RecoveryPlan{ |
|||
ReplicaID: replicaID, |
|||
SessionID: d.Orchestrator.Registry.Sender(replicaID).SessionID(), |
|||
Outcome: OutcomeNeedsRebuild, |
|||
RebuildSource: source, |
|||
} |
|||
|
|||
if source == RebuildSnapshotTail { |
|||
// Pin snapshot.
|
|||
snapPin, err := d.Storage.PinSnapshot(snapLSN) |
|||
if err != nil { |
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "snapshot_pin_failed", err.Error()) |
|||
return nil, fmt.Errorf("snapshot pin failed: %w", err) |
|||
} |
|||
plan.SnapshotPin = &snapPin |
|||
|
|||
// Pin WAL retention for tail replay.
|
|||
retPin, err := d.Storage.PinWALRetention(snapLSN) |
|||
if err != nil { |
|||
d.Storage.ReleaseSnapshot(snapPin) |
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "wal_pin_failed", err.Error()) |
|||
return nil, fmt.Errorf("WAL retention pin failed: %w", err) |
|||
} |
|||
plan.RetentionPin = &retPin |
|||
|
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "plan_rebuild_snapshot_tail", |
|||
fmt.Sprintf("snapshot=%d", snapLSN)) |
|||
} else { |
|||
d.Orchestrator.Log.Record(replicaID, plan.SessionID, "plan_rebuild_full_base", "") |
|||
} |
|||
|
|||
return plan, nil |
|||
} |
|||
|
|||
// ReleasePlan releases any resources acquired by a plan.
|
|||
func (d *RecoveryDriver) ReleasePlan(plan *RecoveryPlan) { |
|||
if plan.RetentionPin != nil { |
|||
d.Storage.ReleaseWALRetention(*plan.RetentionPin) |
|||
plan.RetentionPin = nil |
|||
} |
|||
if plan.SnapshotPin != nil { |
|||
d.Storage.ReleaseSnapshot(*plan.SnapshotPin) |
|||
plan.SnapshotPin = nil |
|||
} |
|||
} |
|||
@ -0,0 +1,344 @@ |
|||
package replication |
|||
|
|||
import ( |
|||
"fmt" |
|||
"sync/atomic" |
|||
"testing" |
|||
) |
|||
|
|||
// ============================================================
|
|||
// Phase 06 P0/P1: Recovery driver tests with mock storage adapter
|
|||
// ============================================================
|
|||
|
|||
// --- Mock storage adapter ---
|
|||
|
|||
// mockStorage is an in-memory StorageAdapter test double. It serves a
// fixed RetainedHistory and tracks outstanding pins by PinID so tests
// can assert acquisition and release. The fail* flags force pin
// refusals to exercise error paths.
type mockStorage struct {
	history         RetainedHistory // fixed history returned by GetRetainedHistory
	nextPinID       atomic.Uint64   // monotonically increasing pin-ID source
	pinnedSnaps     map[uint64]bool // live snapshot pins, keyed by PinID
	pinnedWAL       map[uint64]bool // live WAL retention pins, keyed by PinID
	failSnapshotPin bool            // when true, PinSnapshot returns an error
	failWALPin      bool            // when true, PinWALRetention returns an error
}
|||
|
|||
func newMockStorage(history RetainedHistory) *mockStorage { |
|||
return &mockStorage{ |
|||
history: history, |
|||
pinnedSnaps: map[uint64]bool{}, |
|||
pinnedWAL: map[uint64]bool{}, |
|||
} |
|||
} |
|||
|
|||
func (m *mockStorage) GetRetainedHistory() RetainedHistory { return m.history } |
|||
|
|||
func (m *mockStorage) PinSnapshot(lsn uint64) (SnapshotPin, error) { |
|||
if m.failSnapshotPin { |
|||
return SnapshotPin{}, fmt.Errorf("snapshot pin refused") |
|||
} |
|||
id := m.nextPinID.Add(1) |
|||
m.pinnedSnaps[id] = true |
|||
return SnapshotPin{LSN: lsn, PinID: id, Valid: true}, nil |
|||
} |
|||
|
|||
func (m *mockStorage) ReleaseSnapshot(pin SnapshotPin) { |
|||
delete(m.pinnedSnaps, pin.PinID) |
|||
} |
|||
|
|||
func (m *mockStorage) PinWALRetention(startLSN uint64) (RetentionPin, error) { |
|||
if m.failWALPin { |
|||
return RetentionPin{}, fmt.Errorf("WAL retention pin refused") |
|||
} |
|||
id := m.nextPinID.Add(1) |
|||
m.pinnedWAL[id] = true |
|||
return RetentionPin{StartLSN: startLSN, PinID: id, Valid: true}, nil |
|||
} |
|||
|
|||
func (m *mockStorage) ReleaseWALRetention(pin RetentionPin) { |
|||
delete(m.pinnedWAL, pin.PinID) |
|||
} |
|||
|
|||
// --- Plan + execute: catch-up ---
|
|||
|
|||
func TestDriver_PlanRecovery_CatchUp(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100, |
|||
}) |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
plan, err := driver.PlanRecovery("r1", 70) |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
if plan.Outcome != OutcomeCatchUp { |
|||
t.Fatalf("outcome=%s", plan.Outcome) |
|||
} |
|||
if plan.RetentionPin == nil { |
|||
t.Fatal("WAL retention should be pinned") |
|||
} |
|||
if plan.CatchUpTarget != 100 { |
|||
t.Fatalf("target=%d", plan.CatchUpTarget) |
|||
} |
|||
if !plan.Proof.Recoverable { |
|||
t.Fatalf("proof: %s", plan.Proof.Reason) |
|||
} |
|||
|
|||
// WAL is pinned.
|
|||
if len(storage.pinnedWAL) != 1 { |
|||
t.Fatalf("expected 1 WAL pin, got %d", len(storage.pinnedWAL)) |
|||
} |
|||
|
|||
// Execute catch-up via orchestrator.
|
|||
driver.Orchestrator.CompleteCatchUp("r1", CatchUpOptions{TargetLSN: plan.CatchUpTarget}) |
|||
|
|||
// Release resources.
|
|||
driver.ReleasePlan(plan) |
|||
if len(storage.pinnedWAL) != 0 { |
|||
t.Fatal("WAL pin should be released") |
|||
} |
|||
} |
|||
|
|||
// --- Plan + execute: zero-gap ---
|
|||
|
|||
func TestDriver_PlanRecovery_ZeroGap(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100, |
|||
}) |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
plan, err := driver.PlanRecovery("r1", 100) |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
if plan.Outcome != OutcomeZeroGap { |
|||
t.Fatalf("outcome=%s", plan.Outcome) |
|||
} |
|||
|
|||
// Zero-gap: no resources pinned.
|
|||
if plan.RetentionPin != nil { |
|||
t.Fatal("zero-gap should not pin WAL") |
|||
} |
|||
|
|||
// Already completed.
|
|||
if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync { |
|||
t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State()) |
|||
} |
|||
} |
|||
|
|||
// --- Plan + execute: needs rebuild ---
|
|||
|
|||
func TestDriver_PlanRecovery_NeedsRebuild_ThenRebuild(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 60, CommittedLSN: 100, |
|||
CheckpointLSN: 50, CheckpointTrusted: true, |
|||
}) |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
// Plan: catch-up fails.
|
|||
plan, err := driver.PlanRecovery("r1", 30) |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
if plan.Outcome != OutcomeNeedsRebuild { |
|||
t.Fatalf("outcome=%s", plan.Outcome) |
|||
} |
|||
|
|||
// Rebuild assignment.
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild}, |
|||
}) |
|||
|
|||
// Plan rebuild with resource acquisition.
|
|||
rebuildPlan, err := driver.PlanRebuild("r1") |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
// Checkpoint at 50, tail at 60 → unreplayable → full base.
|
|||
if rebuildPlan.RebuildSource != RebuildFullBase { |
|||
t.Fatalf("source=%s (checkpoint at 50 but tail at 60)", rebuildPlan.RebuildSource) |
|||
} |
|||
|
|||
// Execute rebuild via orchestrator.
|
|||
driver.Orchestrator.CompleteRebuild("r1", &storage.history) |
|||
|
|||
if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync { |
|||
t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State()) |
|||
} |
|||
} |
|||
|
|||
// --- Resource failure: WAL pin refused ---
|
|||
|
|||
func TestDriver_PlanRecovery_WALPinFailure(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100, |
|||
}) |
|||
storage.failWALPin = true |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
_, err := driver.PlanRecovery("r1", 70) |
|||
if err == nil { |
|||
t.Fatal("should fail when WAL pin is refused") |
|||
} |
|||
|
|||
// Log should show the failure.
|
|||
hasFailure := false |
|||
for _, e := range driver.Orchestrator.Log.EventsFor("r1") { |
|||
if e.Event == "wal_pin_failed" { |
|||
hasFailure = true |
|||
} |
|||
} |
|||
if !hasFailure { |
|||
t.Fatal("log should contain wal_pin_failed") |
|||
} |
|||
} |
|||
|
|||
// --- Resource failure: snapshot pin refused → fallback ---
|
|||
|
|||
func TestDriver_PlanRebuild_SnapshotPinFailure(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100, |
|||
CheckpointLSN: 50, CheckpointTrusted: true, |
|||
}) |
|||
storage.failSnapshotPin = true |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild}, |
|||
}) |
|||
|
|||
_, err := driver.PlanRebuild("r1") |
|||
if err == nil { |
|||
t.Fatal("should fail when snapshot pin is refused") |
|||
} |
|||
|
|||
hasFailure := false |
|||
for _, e := range driver.Orchestrator.Log.EventsFor("r1") { |
|||
if e.Event == "snapshot_pin_failed" { |
|||
hasFailure = true |
|||
} |
|||
} |
|||
if !hasFailure { |
|||
t.Fatal("log should contain snapshot_pin_failed") |
|||
} |
|||
} |
|||
|
|||
// --- Replica-ahead with truncation through driver ---
|
|||
|
|||
func TestDriver_PlanRecovery_ReplicaAhead_Truncation(t *testing.T) { |
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100, |
|||
}) |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
plan, err := driver.PlanRecovery("r1", 105) // replica ahead
|
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
if plan.Outcome != OutcomeCatchUp { |
|||
t.Fatalf("outcome=%s", plan.Outcome) |
|||
} |
|||
if plan.TruncateLSN != 100 { |
|||
t.Fatalf("truncate=%d, want 100", plan.TruncateLSN) |
|||
} |
|||
|
|||
// Execute with truncation.
|
|||
err = driver.Orchestrator.CompleteCatchUp("r1", CatchUpOptions{ |
|||
TargetLSN: plan.CatchUpTarget, |
|||
TruncateLSN: plan.TruncateLSN, |
|||
}) |
|||
if err != nil { |
|||
t.Fatalf("catch-up with truncation: %v", err) |
|||
} |
|||
|
|||
driver.ReleasePlan(plan) |
|||
} |
|||
|
|||
// --- Cross-layer contract: storage proves recoverability ---
|
|||
|
|||
func TestDriver_CrossLayer_StorageProvesRecoverability(t *testing.T) { |
|||
// The engine asks "is this recoverable?" and the storage adapter
|
|||
// answers from real state — not from test-reconstructed inputs.
|
|||
storage := newMockStorage(RetainedHistory{ |
|||
HeadLSN: 100, TailLSN: 50, CommittedLSN: 100, |
|||
CheckpointLSN: 40, CheckpointTrusted: true, |
|||
}) |
|||
driver := NewRecoveryDriver(storage) |
|||
|
|||
driver.Orchestrator.ProcessAssignment(AssignmentIntent{ |
|||
Replicas: []ReplicaAssignment{ |
|||
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}}, |
|||
}, |
|||
Epoch: 1, |
|||
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, |
|||
}) |
|||
|
|||
// Engine asks storage for recoverability proof.
|
|||
history := storage.GetRetainedHistory() |
|||
proof := history.ProveRecoverability(60) // gap 60→100
|
|||
|
|||
if !proof.Recoverable { |
|||
t.Fatalf("storage should prove recoverable: %s", proof.Reason) |
|||
} |
|||
|
|||
// Engine asks for rebuild source decision.
|
|||
source, snapLSN := history.RebuildSourceDecision() |
|||
// Checkpoint at 40, tail at 50 → checkpoint < tail → unreplayable.
|
|||
if source != RebuildFullBase { |
|||
t.Fatalf("source=%s snap=%d (checkpoint 40 < tail 50)", source, snapLSN) |
|||
} |
|||
|
|||
// Failure is observable: log from PlanRecovery.
|
|||
plan, _ := driver.PlanRecovery("r1", 60) |
|||
if plan.Proof == nil || !plan.Proof.Recoverable { |
|||
t.Fatal("plan should carry proof from storage") |
|||
} |
|||
|
|||
driver.ReleasePlan(plan) |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue