You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
542 lines
15 KiB
542 lines
15 KiB
package replication
|
|
|
|
import (
|
|
"fmt"
|
|
"sync/atomic"
|
|
"testing"
|
|
)
|
|
|
|
// ============================================================
|
|
// Phase 06 P0/P1: Recovery driver tests with mock storage adapter
|
|
// ============================================================
|
|
|
|
// --- Mock storage adapter ---
|
|
|
|
type mockStorage struct {
|
|
history RetainedHistory
|
|
nextPinID atomic.Uint64
|
|
pinnedSnaps map[uint64]bool
|
|
pinnedWAL map[uint64]bool
|
|
pinnedFullBase map[uint64]bool
|
|
failSnapshotPin bool
|
|
failWALPin bool
|
|
failFullBasePin bool
|
|
}
|
|
|
|
func newMockStorage(history RetainedHistory) *mockStorage {
|
|
return &mockStorage{
|
|
history: history,
|
|
pinnedSnaps: map[uint64]bool{},
|
|
pinnedWAL: map[uint64]bool{},
|
|
pinnedFullBase: map[uint64]bool{},
|
|
}
|
|
}
|
|
|
|
// GetRetainedHistory returns the history currently stored on the mock.
func (m *mockStorage) GetRetainedHistory() RetainedHistory {
	return m.history
}
|
|
|
|
func (m *mockStorage) PinSnapshot(lsn uint64) (SnapshotPin, error) {
|
|
if m.failSnapshotPin {
|
|
return SnapshotPin{}, fmt.Errorf("snapshot pin refused")
|
|
}
|
|
id := m.nextPinID.Add(1)
|
|
m.pinnedSnaps[id] = true
|
|
return SnapshotPin{LSN: lsn, PinID: id, Valid: true}, nil
|
|
}
|
|
|
|
func (m *mockStorage) ReleaseSnapshot(pin SnapshotPin) {
|
|
delete(m.pinnedSnaps, pin.PinID)
|
|
}
|
|
|
|
func (m *mockStorage) PinWALRetention(startLSN uint64) (RetentionPin, error) {
|
|
if m.failWALPin {
|
|
return RetentionPin{}, fmt.Errorf("WAL retention pin refused")
|
|
}
|
|
id := m.nextPinID.Add(1)
|
|
m.pinnedWAL[id] = true
|
|
return RetentionPin{StartLSN: startLSN, PinID: id, Valid: true}, nil
|
|
}
|
|
|
|
func (m *mockStorage) ReleaseWALRetention(pin RetentionPin) {
|
|
delete(m.pinnedWAL, pin.PinID)
|
|
}
|
|
|
|
func (m *mockStorage) PinFullBase(committedLSN uint64) (FullBasePin, error) {
|
|
if m.failFullBasePin {
|
|
return FullBasePin{}, fmt.Errorf("full base pin refused")
|
|
}
|
|
id := m.nextPinID.Add(1)
|
|
m.pinnedFullBase[id] = true
|
|
return FullBasePin{CommittedLSN: committedLSN, PinID: id, Valid: true}, nil
|
|
}
|
|
|
|
func (m *mockStorage) ReleaseFullBase(pin FullBasePin) {
|
|
delete(m.pinnedFullBase, pin.PinID)
|
|
}
|
|
|
|
// --- Plan + execute: catch-up ---
|
|
|
|
func TestDriver_PlanRecovery_CatchUp(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
plan, err := driver.PlanRecovery("r1", 70)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if plan.Outcome != OutcomeCatchUp {
|
|
t.Fatalf("outcome=%s", plan.Outcome)
|
|
}
|
|
if plan.RetentionPin == nil {
|
|
t.Fatal("WAL retention should be pinned")
|
|
}
|
|
if plan.CatchUpTarget != 100 {
|
|
t.Fatalf("target=%d", plan.CatchUpTarget)
|
|
}
|
|
if !plan.Proof.Recoverable {
|
|
t.Fatalf("proof: %s", plan.Proof.Reason)
|
|
}
|
|
|
|
// WAL is pinned.
|
|
if len(storage.pinnedWAL) != 1 {
|
|
t.Fatalf("expected 1 WAL pin, got %d", len(storage.pinnedWAL))
|
|
}
|
|
|
|
// Execute catch-up via orchestrator.
|
|
driver.Orchestrator.CompleteCatchUp("r1", CatchUpOptions{TargetLSN: plan.CatchUpTarget})
|
|
|
|
// Release resources.
|
|
driver.ReleasePlan(plan)
|
|
if len(storage.pinnedWAL) != 0 {
|
|
t.Fatal("WAL pin should be released")
|
|
}
|
|
}
|
|
|
|
// --- Plan + execute: zero-gap ---
|
|
|
|
func TestDriver_PlanRecovery_ZeroGap(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
plan, err := driver.PlanRecovery("r1", 100)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if plan.Outcome != OutcomeZeroGap {
|
|
t.Fatalf("outcome=%s", plan.Outcome)
|
|
}
|
|
|
|
// Zero-gap: no resources pinned.
|
|
if plan.RetentionPin != nil {
|
|
t.Fatal("zero-gap should not pin WAL")
|
|
}
|
|
|
|
// Already completed.
|
|
if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync {
|
|
t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State())
|
|
}
|
|
}
|
|
|
|
// --- Plan + execute: needs rebuild ---
|
|
|
|
func TestDriver_PlanRecovery_NeedsRebuild_ThenRebuild(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 60, CommittedLSN: 100,
|
|
CheckpointLSN: 50, CheckpointTrusted: true,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// Plan: catch-up fails.
|
|
plan, err := driver.PlanRecovery("r1", 30)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if plan.Outcome != OutcomeNeedsRebuild {
|
|
t.Fatalf("outcome=%s", plan.Outcome)
|
|
}
|
|
|
|
// Rebuild assignment.
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
// Plan rebuild with resource acquisition.
|
|
rebuildPlan, err := driver.PlanRebuild("r1")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
// Checkpoint at 50, tail at 60 → unreplayable → full base.
|
|
if rebuildPlan.RebuildSource != RebuildFullBase {
|
|
t.Fatalf("source=%s (checkpoint at 50 but tail at 60)", rebuildPlan.RebuildSource)
|
|
}
|
|
|
|
// Execute rebuild via orchestrator.
|
|
driver.Orchestrator.CompleteRebuild("r1", &storage.history)
|
|
|
|
if driver.Orchestrator.Registry.Sender("r1").State() != StateInSync {
|
|
t.Fatalf("state=%s", driver.Orchestrator.Registry.Sender("r1").State())
|
|
}
|
|
}
|
|
|
|
// --- Resource failure: WAL pin refused ---
|
|
|
|
func TestDriver_PlanRecovery_WALPinFailure(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
|
|
})
|
|
storage.failWALPin = true
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
_, err := driver.PlanRecovery("r1", 70)
|
|
if err == nil {
|
|
t.Fatal("should fail when WAL pin is refused")
|
|
}
|
|
|
|
// Log should show the failure.
|
|
hasFailure := false
|
|
for _, e := range driver.Orchestrator.Log.EventsFor("r1") {
|
|
if e.Event == "wal_pin_failed" {
|
|
hasFailure = true
|
|
}
|
|
}
|
|
if !hasFailure {
|
|
t.Fatal("log should contain wal_pin_failed")
|
|
}
|
|
}
|
|
|
|
// --- Resource failure: snapshot pin refused → PlanRebuild fails ---
|
|
|
|
func TestDriver_PlanRebuild_SnapshotPinFailure(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
|
|
CheckpointLSN: 50, CheckpointTrusted: true,
|
|
})
|
|
storage.failSnapshotPin = true
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
_, err := driver.PlanRebuild("r1")
|
|
if err == nil {
|
|
t.Fatal("should fail when snapshot pin is refused")
|
|
}
|
|
|
|
hasFailure := false
|
|
for _, e := range driver.Orchestrator.Log.EventsFor("r1") {
|
|
if e.Event == "snapshot_pin_failed" {
|
|
hasFailure = true
|
|
}
|
|
}
|
|
if !hasFailure {
|
|
t.Fatal("log should contain snapshot_pin_failed")
|
|
}
|
|
}
|
|
|
|
// --- Replica-ahead with truncation through driver ---
|
|
|
|
func TestDriver_PlanRecovery_ReplicaAhead_Truncation(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
plan, err := driver.PlanRecovery("r1", 105) // replica ahead
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if plan.Outcome != OutcomeCatchUp {
|
|
t.Fatalf("outcome=%s", plan.Outcome)
|
|
}
|
|
if plan.TruncateLSN != 100 {
|
|
t.Fatalf("truncate=%d, want 100", plan.TruncateLSN)
|
|
}
|
|
|
|
// Execute with truncation.
|
|
err = driver.Orchestrator.CompleteCatchUp("r1", CatchUpOptions{
|
|
TargetLSN: plan.CatchUpTarget,
|
|
TruncateLSN: plan.TruncateLSN,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("catch-up with truncation: %v", err)
|
|
}
|
|
|
|
driver.ReleasePlan(plan)
|
|
}
|
|
|
|
// --- Truncation-only: no WAL pin needed ---
|
|
|
|
func TestDriver_PlanRecovery_ReplicaAhead_NoWALPin(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 0, CommittedLSN: 100,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// Replica ahead — truncation only, no WAL replay.
|
|
plan, err := driver.PlanRecovery("r1", 105)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if plan.RetentionPin != nil {
|
|
t.Fatal("truncation-only should NOT pin WAL")
|
|
}
|
|
if plan.TruncateLSN != 100 {
|
|
t.Fatalf("truncate=%d, want 100", plan.TruncateLSN)
|
|
}
|
|
if len(storage.pinnedWAL) != 0 {
|
|
t.Fatal("no WAL pins should exist for truncation-only")
|
|
}
|
|
}
|
|
|
|
// --- PlanRebuild precondition checks ---
|
|
|
|
func TestDriver_PlanRebuild_MissingSender(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{CommittedLSN: 100})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
_, err := driver.PlanRebuild("nonexistent")
|
|
if err == nil {
|
|
t.Fatal("should fail for missing sender")
|
|
}
|
|
}
|
|
|
|
func TestDriver_PlanRebuild_NoSession(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{CommittedLSN: 100})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
// No recovery target → no session.
|
|
})
|
|
|
|
_, err := driver.PlanRebuild("r1")
|
|
if err == nil {
|
|
t.Fatal("should fail when no session exists")
|
|
}
|
|
}
|
|
|
|
func TestDriver_PlanRebuild_NonRebuildSession(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{CommittedLSN: 100})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp}, // NOT rebuild
|
|
})
|
|
|
|
_, err := driver.PlanRebuild("r1")
|
|
if err == nil {
|
|
t.Fatal("should fail when session is not rebuild")
|
|
}
|
|
}
|
|
|
|
// --- Full-base rebuild pin ---
|
|
|
|
func TestDriver_PlanRebuild_FullBase_PinsBaseImage(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 60, CommittedLSN: 100,
|
|
CheckpointLSN: 40, CheckpointTrusted: true,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
plan, err := driver.PlanRebuild("r1")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
// Checkpoint at 40, tail at 60 → unreplayable → full base.
|
|
if plan.RebuildSource != RebuildFullBase {
|
|
t.Fatalf("source=%s", plan.RebuildSource)
|
|
}
|
|
if plan.FullBasePin == nil {
|
|
t.Fatal("full_base rebuild must have a pinned base image")
|
|
}
|
|
if len(storage.pinnedFullBase) != 1 {
|
|
t.Fatalf("expected 1 full base pin, got %d", len(storage.pinnedFullBase))
|
|
}
|
|
|
|
driver.ReleasePlan(plan)
|
|
if len(storage.pinnedFullBase) != 0 {
|
|
t.Fatal("full base pin should be released")
|
|
}
|
|
}
|
|
|
|
func TestDriver_PlanRebuild_FullBase_PinFailure(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 60, CommittedLSN: 100,
|
|
CheckpointLSN: 40, CheckpointTrusted: true,
|
|
})
|
|
storage.failFullBasePin = true
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
_, err := driver.PlanRebuild("r1")
|
|
if err == nil {
|
|
t.Fatal("should fail when full base pin is refused")
|
|
}
|
|
|
|
hasFailure := false
|
|
for _, e := range driver.Orchestrator.Log.EventsFor("r1") {
|
|
if e.Event == "full_base_pin_failed" {
|
|
hasFailure = true
|
|
}
|
|
}
|
|
if !hasFailure {
|
|
t.Fatal("log should contain full_base_pin_failed")
|
|
}
|
|
}
|
|
|
|
// --- WAL pin failure cleans up session ---
|
|
|
|
func TestDriver_PlanRecovery_WALPinFailure_CleansUpSession(t *testing.T) {
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
|
|
})
|
|
storage.failWALPin = true
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
_, err := driver.PlanRecovery("r1", 70)
|
|
if err == nil {
|
|
t.Fatal("should fail when WAL pin is refused")
|
|
}
|
|
|
|
// Session must be invalidated — no dangling live session.
|
|
s := driver.Orchestrator.Registry.Sender("r1")
|
|
if s.HasActiveSession() {
|
|
t.Fatal("session should be invalidated after WAL pin failure")
|
|
}
|
|
if s.State() != StateDisconnected {
|
|
t.Fatalf("sender should be disconnected after pin failure, got %s", s.State())
|
|
}
|
|
}
|
|
|
|
// --- Cross-layer contract: storage proves recoverability ---
|
|
|
|
func TestDriver_CrossLayer_StorageProvesRecoverability(t *testing.T) {
|
|
// The engine asks "is this recoverable?" and the storage adapter
|
|
// answers from real state — not from test-reconstructed inputs.
|
|
storage := newMockStorage(RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 50, CommittedLSN: 100,
|
|
CheckpointLSN: 40, CheckpointTrusted: true,
|
|
})
|
|
driver := NewRecoveryDriver(storage)
|
|
|
|
driver.Orchestrator.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// Engine asks storage for recoverability proof.
|
|
history := storage.GetRetainedHistory()
|
|
proof := history.ProveRecoverability(60) // gap 60→100
|
|
|
|
if !proof.Recoverable {
|
|
t.Fatalf("storage should prove recoverable: %s", proof.Reason)
|
|
}
|
|
|
|
// Engine asks for rebuild source decision.
|
|
source, snapLSN := history.RebuildSourceDecision()
|
|
// Checkpoint at 40, tail at 50 → checkpoint < tail → unreplayable.
|
|
if source != RebuildFullBase {
|
|
t.Fatalf("source=%s snap=%d (checkpoint 40 < tail 50)", source, snapLSN)
|
|
}
|
|
|
|
// Failure is observable: log from PlanRecovery.
|
|
plan, _ := driver.PlanRecovery("r1", 60)
|
|
if plan.Proof == nil || !plan.Proof.Recoverable {
|
|
t.Fatal("plan should carry proof from storage")
|
|
}
|
|
|
|
driver.ReleasePlan(plan)
|
|
}
|