You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

336 lines
9.7 KiB

package replication
import "testing"
// ============================================================
// Phase 05 Slice 4: Integration tests via RecoveryOrchestrator
//
// All tests use the orchestrator as the entry path — no direct
// sender API calls for the recovery lifecycle.
// ============================================================
// --- V2 Boundary 1: Changed-address recovery ---
// TestIntegration_ChangedAddress_ViaOrchestrator drives one replica through a
// catch-up recovery, re-assigns it under a different endpoint, and checks that
// the registry keeps the same sender object, that the sender reports the new
// data address, and that a second recovery at the committed LSN is classified
// as zero-gap.
func TestIntegration_ChangedAddress_ViaOrchestrator(t *testing.T) {
	const replicaID = "vol1-r1"

	orch := NewRecoveryOrchestrator()
	history := RetainedHistory{HeadLSN: 100, TailLSN: 30, CommittedLSN: 100}

	// assign submits a single-replica catch-up assignment at epoch 1; the two
	// phases of this test differ only in the endpoint they hand over.
	assign := func(ep Endpoint) {
		orch.ProcessAssignment(AssignmentIntent{
			Replicas:        []ReplicaAssignment{{ReplicaID: replicaID, Endpoint: ep}},
			Epoch:           1,
			RecoveryTargets: map[string]SessionKind{replicaID: SessionCatchUp},
		})
	}

	// Phase 1: initial endpoint, replica reports LSN 80 and must catch up.
	assign(Endpoint{DataAddr: "10.0.0.1:9333", CtrlAddr: "10.0.0.1:9334", Version: 1})

	first := orch.ExecuteRecovery(replicaID, 80, &history)
	if first.Outcome != OutcomeCatchUp {
		t.Fatalf("outcome=%s", first.Outcome)
	}
	orch.CompleteCatchUp(replicaID, 100)

	sender := orch.Registry.Sender(replicaID)
	if state := sender.State(); state != StateInSync {
		t.Fatalf("state=%s", state)
	}

	// Phase 2: same replica ID, new addresses and a bumped endpoint version.
	assign(Endpoint{DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334", Version: 2})

	// The registry must hand back the very same sender after the re-assignment.
	if orch.Registry.Sender(replicaID) != sender {
		t.Fatal("sender identity must survive address change")
	}
	if addr := sender.Endpoint().DataAddr; addr != "10.0.0.2:9333" {
		t.Fatalf("endpoint not updated: %s", addr)
	}

	// Replica is already at LSN 100, so recovery on the new endpoint is zero-gap.
	second := orch.ExecuteRecovery(replicaID, 100, &history)
	if second.Outcome != OutcomeZeroGap {
		t.Fatalf("outcome=%s", second.Outcome)
	}

	// Zero-gap needs no catch-up step; the session is completed directly.
	sender.CompleteSessionByID(sender.SessionID())

	// Both recovery cycles should have left entries in the orchestrator log.
	logged := orch.Log.EventsFor(replicaID)
	if n := len(logged); n < 4 {
		t.Fatalf("expected ≥4 orchestrator events, got %d", n)
	}
	t.Logf("changed-address: %d orchestrator events", len(logged))
}
// --- V2 Boundary 2: NeedsRebuild → rebuild ---
// TestIntegration_NeedsRebuild_Rebuild_ViaOrchestrator checks the escalation
// path: a catch-up attempt for a replica at LSN 30 against a history whose tail
// is at LSN 60 must be reported as needs-rebuild with a non-recoverable proof,
// and a subsequent rebuild assignment completed through the orchestrator must
// bring the sender to in-sync. The recovery log is expected to contain both an
// "escalated" and a "rebuild_completed" event for the replica.
func TestIntegration_NeedsRebuild_Rebuild_ViaOrchestrator(t *testing.T) {
	o := NewRecoveryOrchestrator()
	primary := RetainedHistory{
		HeadLSN: 100, TailLSN: 60, CommittedLSN: 100,
		CheckpointLSN: 40, CheckpointTrusted: true,
	}

	o.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
	})

	// Catch-up fails — gap beyond retention.
	result := o.ExecuteRecovery("r1", 30, &primary)
	if result.Outcome != OutcomeNeedsRebuild {
		t.Fatalf("outcome=%s", result.Outcome)
	}
	// The proof attached to a needs-rebuild outcome must not claim the gap is
	// recoverable. (Previously this branch was empty and asserted nothing.)
	if result.Proof.Recoverable {
		t.Fatal("proof should not be recoverable when outcome is needs-rebuild")
	}
	if result.FinalState != StateNeedsRebuild {
		t.Fatalf("state=%s", result.FinalState)
	}

	// Rebuild assignment for the same replica and endpoint.
	o.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
	})

	// Rebuild via orchestrator.
	if err := o.CompleteRebuild("r1", &primary); err != nil {
		t.Fatalf("rebuild: %v", err)
	}
	s := o.Registry.Sender("r1")
	if s.State() != StateInSync {
		t.Fatalf("state=%s", s.State())
	}

	// Log should show escalation + rebuild events.
	hasEscalation := false
	hasRebuild := false
	for _, e := range o.Log.EventsFor("r1") {
		switch e.Event {
		case "escalated":
			hasEscalation = true
		case "rebuild_completed":
			hasRebuild = true
		}
	}
	if !hasEscalation {
		t.Fatal("log should contain escalation event")
	}
	if !hasRebuild {
		t.Fatal("log should contain rebuild_completed event")
	}
}
// --- V2 Boundary 3: Epoch bump during recovery ---
// TestIntegration_EpochBump_ViaOrchestrator checks that a recovery attempted
// after the epoch has been bumped returns an error, that a fresh assignment at
// the new epoch recovers as zero-gap and ends in-sync, and that the global
// recovery log records an "epoch_invalidation" event.
func TestIntegration_EpochBump_ViaOrchestrator(t *testing.T) {
	const replicaID = "r1"

	orch := NewRecoveryOrchestrator()
	history := RetainedHistory{HeadLSN: 100, TailLSN: 0, CommittedLSN: 100}

	// Session created at epoch 1.
	orch.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: replicaID, Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{replicaID: SessionCatchUp},
	})

	// Bump to epoch 2 before any recovery step has run.
	orch.InvalidateEpoch(2)
	orch.Registry.Sender(replicaID).UpdateEpoch(2)

	// The epoch-1 session must no longer be usable.
	stale := orch.ExecuteRecovery(replicaID, 100, &history)
	if stale.Error == nil {
		t.Fatal("should fail on stale session")
	}

	// Re-assign at epoch 2 and recover again.
	orch.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: replicaID, Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           2,
		RecoveryTargets: map[string]SessionKind{replicaID: SessionCatchUp},
	})
	fresh := orch.ExecuteRecovery(replicaID, 100, &history)
	if fresh.Outcome != OutcomeZeroGap {
		t.Fatalf("epoch 2: %s", fresh.Outcome)
	}

	sender := orch.Registry.Sender(replicaID)
	sender.CompleteSessionByID(sender.SessionID())
	if state := sender.State(); state != StateInSync {
		t.Fatalf("state=%s", state)
	}

	// Scan the full log (not just this replica's events) for the invalidation.
	sawInvalidation := false
	for _, entry := range orch.Log.Events() {
		if entry.Event == "epoch_invalidation" {
			sawInvalidation = true
			break
		}
	}
	if !sawInvalidation {
		t.Fatal("log should contain epoch_invalidation event")
	}
}
// --- V2 Boundary 4: Multi-replica mixed outcomes ---
// TestIntegration_MultiReplica_ViaOrchestrator assigns three replicas in one
// intent and recovers each from a different starting LSN (100, 60, 20) against
// a history retained from LSN 40. It expects zero-gap, catch-up and
// needs-rebuild outcomes respectively, and then checks the registry's
// aggregate status counts.
func TestIntegration_MultiReplica_ViaOrchestrator(t *testing.T) {
	orch := NewRecoveryOrchestrator()
	history := RetainedHistory{
		HeadLSN: 100, TailLSN: 40, CommittedLSN: 100,
		CheckpointLSN: 50, CheckpointTrusted: true,
	}

	intent := AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
			{ReplicaID: "r2", Endpoint: Endpoint{DataAddr: "r2:9333", Version: 1}},
			{ReplicaID: "r3", Endpoint: Endpoint{DataAddr: "r3:9333", Version: 1}},
		},
		Epoch: 1,
		RecoveryTargets: map[string]SessionKind{
			"r1": SessionCatchUp,
			"r2": SessionCatchUp,
			"r3": SessionCatchUp,
		},
	}
	orch.ProcessAssignment(intent)

	// r1 is already at the committed LSN: zero-gap, completed on the sender.
	zeroGap := orch.ExecuteRecovery("r1", 100, &history)
	if zeroGap.Outcome != OutcomeZeroGap {
		t.Fatalf("r1: %s", zeroGap.Outcome)
	}
	r1Sender := orch.Registry.Sender("r1")
	r1Sender.CompleteSessionByID(r1Sender.SessionID())

	// r2 is behind but inside the retained range: catch-up with a recoverable proof.
	catchUp := orch.ExecuteRecovery("r2", 60, &history)
	if !(catchUp.Outcome == OutcomeCatchUp && catchUp.Proof.Recoverable) {
		t.Fatalf("r2: outcome=%s proof=%v", catchUp.Outcome, catchUp.Proof)
	}
	orch.CompleteCatchUp("r2", 100)

	// r3 is behind the retained tail: must escalate to rebuild.
	rebuild := orch.ExecuteRecovery("r3", 20, &history)
	if rebuild.Outcome != OutcomeNeedsRebuild {
		t.Fatalf("r3: %s", rebuild.Outcome)
	}

	// Aggregate view: two replicas in sync, one counted as rebuilding.
	summary := orch.Registry.Status()
	if summary.InSync != 2 {
		t.Fatalf("in_sync=%d", summary.InSync)
	}
	if summary.Rebuilding != 1 {
		t.Fatalf("rebuilding=%d", summary.Rebuilding)
	}
}
// --- Observability ---
// TestIntegration_Observability_SessionSnapshot checks that the sender's
// session snapshot exposes truncation details: after a handshake in which the
// replica reports a flushed LSN (105) above the committed LSN (100), the
// snapshot must flag truncation as required and name LSN 100 as the target.
func TestIntegration_Observability_SessionSnapshot(t *testing.T) {
	o := NewRecoveryOrchestrator()

	o.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
	})

	// Drive the session by hand up to the handshake so the snapshot can be
	// inspected mid-session. Replica ahead of committed → truncation required.
	s := o.Registry.Sender("r1")
	id := s.SessionID()
	s.BeginConnect(id)
	s.RecordHandshakeWithOutcome(id, HandshakeResult{
		ReplicaFlushedLSN: 105, CommittedLSN: 100, RetentionStartLSN: 0,
	})

	snap := s.SessionSnapshot()
	if !snap.TruncateRequired {
		t.Fatal("snapshot should show truncation required")
	}
	if snap.TruncateToLSN != 100 {
		t.Fatalf("truncate to=%d", snap.TruncateToLSN)
	}
}
// TestIntegration_Observability_RebuildSnapshot checks that, for a rebuild
// session whose source has been selected from the retained history, the
// session snapshot reports RebuildSnapshotTail as the source and
// RebuildPhaseSourceSelect as the current phase.
func TestIntegration_Observability_RebuildSnapshot(t *testing.T) {
	orch := NewRecoveryOrchestrator()
	history := RetainedHistory{
		HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
		CheckpointLSN: 50, CheckpointTrusted: true,
	}

	replicas := []ReplicaAssignment{
		{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
	}
	orch.ProcessAssignment(AssignmentIntent{
		Replicas:        replicas,
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
	})

	// Step the rebuild session manually: connect, handshake, pick the source.
	sender := orch.Registry.Sender("r1")
	sessionID := sender.SessionID()
	sender.BeginConnect(sessionID)
	sender.RecordHandshake(sessionID, 0, 100)
	sender.SelectRebuildFromHistory(sessionID, &history)

	view := sender.SessionSnapshot()
	if got := view.RebuildSource; got != RebuildSnapshotTail {
		t.Fatalf("rebuild source=%s", got)
	}
	if got := view.RebuildPhase; got != RebuildPhaseSourceSelect {
		t.Fatalf("rebuild phase=%s", got)
	}
}
// TestIntegration_RecoveryLog_AutoPopulated runs a plain catch-up recovery
// through the orchestrator and checks that the recovery log for the replica
// was filled in without the test writing to it: at least five events must be
// present and every one must carry a non-empty event type.
func TestIntegration_RecoveryLog_AutoPopulated(t *testing.T) {
	const replicaID = "r1"

	orch := NewRecoveryOrchestrator()
	history := RetainedHistory{HeadLSN: 100, TailLSN: 0, CommittedLSN: 100}

	orch.ProcessAssignment(AssignmentIntent{
		Replicas: []ReplicaAssignment{
			{ReplicaID: replicaID, Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
		},
		Epoch:           1,
		RecoveryTargets: map[string]SessionKind{replicaID: SessionCatchUp},
	})

	orch.ExecuteRecovery(replicaID, 80, &history)
	orch.CompleteCatchUp(replicaID, 100)

	// Expected entries include sender_added, session_created, connected,
	// handshake, catchup_started and completed; the check below only enforces
	// a lower bound of five.
	logged := orch.Log.EventsFor(replicaID)
	count := len(logged)
	if count < 5 {
		t.Fatalf("expected ≥5 auto-populated events, got %d", count)
	}

	// Every entry was produced by the orchestrator, so none may be untyped.
	for i := range logged {
		if logged[i].Event == "" {
			t.Fatal("event should have non-empty type")
		}
	}
	t.Logf("auto-populated log: %d events", count)
}