You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
336 lines
9.7 KiB
336 lines
9.7 KiB
package replication
|
|
|
|
import "testing"
|
|
|
|
// ============================================================
|
|
// Phase 05 Slice 4: Integration tests via RecoveryOrchestrator
|
|
//
|
|
// All tests use the orchestrator as the entry path — no direct
|
|
// sender API calls for the recovery lifecycle.
|
|
// ============================================================
|
|
|
|
// --- V2 Boundary 1: Changed-address recovery ---
|
|
|
|
func TestIntegration_ChangedAddress_ViaOrchestrator(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{HeadLSN: 100, TailLSN: 30, CommittedLSN: 100}
|
|
|
|
// Initial assignment + recovery.
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "vol1-r1", Endpoint: Endpoint{DataAddr: "10.0.0.1:9333", CtrlAddr: "10.0.0.1:9334", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"vol1-r1": SessionCatchUp},
|
|
})
|
|
|
|
// Recovery via orchestrator.
|
|
result := o.ExecuteRecovery("vol1-r1", 80, &primary)
|
|
if result.Outcome != OutcomeCatchUp {
|
|
t.Fatalf("outcome=%s", result.Outcome)
|
|
}
|
|
o.CompleteCatchUp("vol1-r1", 100)
|
|
|
|
s := o.Registry.Sender("vol1-r1")
|
|
if s.State() != StateInSync {
|
|
t.Fatalf("state=%s", s.State())
|
|
}
|
|
|
|
// Address changes — new assignment.
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "vol1-r1", Endpoint: Endpoint{DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334", Version: 2}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"vol1-r1": SessionCatchUp},
|
|
})
|
|
|
|
// Sender identity preserved.
|
|
if o.Registry.Sender("vol1-r1") != s {
|
|
t.Fatal("sender identity must survive address change")
|
|
}
|
|
if s.Endpoint().DataAddr != "10.0.0.2:9333" {
|
|
t.Fatalf("endpoint not updated: %s", s.Endpoint().DataAddr)
|
|
}
|
|
|
|
// Zero-gap recovery on new endpoint.
|
|
result2 := o.ExecuteRecovery("vol1-r1", 100, &primary)
|
|
if result2.Outcome != OutcomeZeroGap {
|
|
t.Fatalf("outcome=%s", result2.Outcome)
|
|
}
|
|
// Zero-gap completes in handshake phase.
|
|
s.CompleteSessionByID(s.SessionID())
|
|
|
|
// Verify log has events from both cycles.
|
|
events := o.Log.EventsFor("vol1-r1")
|
|
if len(events) < 4 {
|
|
t.Fatalf("expected ≥4 orchestrator events, got %d", len(events))
|
|
}
|
|
t.Logf("changed-address: %d orchestrator events", len(events))
|
|
}
|
|
|
|
// --- V2 Boundary 2: NeedsRebuild → rebuild ---
|
|
|
|
func TestIntegration_NeedsRebuild_Rebuild_ViaOrchestrator(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 60, CommittedLSN: 100,
|
|
CheckpointLSN: 40, CheckpointTrusted: true,
|
|
}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// Catch-up fails — gap beyond retention.
|
|
result := o.ExecuteRecovery("r1", 30, &primary)
|
|
if result.Outcome != OutcomeNeedsRebuild {
|
|
t.Fatalf("outcome=%s", result.Outcome)
|
|
}
|
|
if !result.Proof.Recoverable == true {
|
|
// Should NOT be recoverable.
|
|
}
|
|
if result.FinalState != StateNeedsRebuild {
|
|
t.Fatalf("state=%s", result.FinalState)
|
|
}
|
|
|
|
// Rebuild assignment.
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
// Rebuild via orchestrator.
|
|
if err := o.CompleteRebuild("r1", &primary); err != nil {
|
|
t.Fatalf("rebuild: %v", err)
|
|
}
|
|
|
|
s := o.Registry.Sender("r1")
|
|
if s.State() != StateInSync {
|
|
t.Fatalf("state=%s", s.State())
|
|
}
|
|
|
|
// Log should show escalation + rebuild events.
|
|
events := o.Log.EventsFor("r1")
|
|
hasEscalation := false
|
|
hasRebuild := false
|
|
for _, e := range events {
|
|
if e.Event == "escalated" {
|
|
hasEscalation = true
|
|
}
|
|
if e.Event == "rebuild_completed" {
|
|
hasRebuild = true
|
|
}
|
|
}
|
|
if !hasEscalation {
|
|
t.Fatal("log should contain escalation event")
|
|
}
|
|
if !hasRebuild {
|
|
t.Fatal("log should contain rebuild_completed event")
|
|
}
|
|
}
|
|
|
|
// --- V2 Boundary 3: Epoch bump during recovery ---
|
|
|
|
func TestIntegration_EpochBump_ViaOrchestrator(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{HeadLSN: 100, TailLSN: 0, CommittedLSN: 100}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// Epoch bumps mid-recovery.
|
|
o.InvalidateEpoch(2)
|
|
o.Registry.Sender("r1").UpdateEpoch(2)
|
|
|
|
// Old session is dead — ExecuteRecovery should fail.
|
|
result := o.ExecuteRecovery("r1", 100, &primary)
|
|
if result.Error == nil {
|
|
t.Fatal("should fail on stale session")
|
|
}
|
|
|
|
// New assignment at epoch 2.
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 2,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
result2 := o.ExecuteRecovery("r1", 100, &primary)
|
|
if result2.Outcome != OutcomeZeroGap {
|
|
t.Fatalf("epoch 2: %s", result2.Outcome)
|
|
}
|
|
o.Registry.Sender("r1").CompleteSessionByID(o.Registry.Sender("r1").SessionID())
|
|
|
|
if o.Registry.Sender("r1").State() != StateInSync {
|
|
t.Fatalf("state=%s", o.Registry.Sender("r1").State())
|
|
}
|
|
|
|
// Log should show epoch invalidation.
|
|
hasInvalidation := false
|
|
for _, e := range o.Log.Events() {
|
|
if e.Event == "epoch_invalidation" {
|
|
hasInvalidation = true
|
|
}
|
|
}
|
|
if !hasInvalidation {
|
|
t.Fatal("log should contain epoch_invalidation event")
|
|
}
|
|
}
|
|
|
|
// --- V2 Boundary 4: Multi-replica mixed outcomes ---
|
|
|
|
func TestIntegration_MultiReplica_ViaOrchestrator(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 40, CommittedLSN: 100,
|
|
CheckpointLSN: 50, CheckpointTrusted: true,
|
|
}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
{ReplicaID: "r2", Endpoint: Endpoint{DataAddr: "r2:9333", Version: 1}},
|
|
{ReplicaID: "r3", Endpoint: Endpoint{DataAddr: "r3:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{
|
|
"r1": SessionCatchUp,
|
|
"r2": SessionCatchUp,
|
|
"r3": SessionCatchUp,
|
|
},
|
|
})
|
|
|
|
// r1: zero-gap.
|
|
r1 := o.ExecuteRecovery("r1", 100, &primary)
|
|
if r1.Outcome != OutcomeZeroGap {
|
|
t.Fatalf("r1: %s", r1.Outcome)
|
|
}
|
|
o.Registry.Sender("r1").CompleteSessionByID(o.Registry.Sender("r1").SessionID())
|
|
|
|
// r2: catch-up.
|
|
r2 := o.ExecuteRecovery("r2", 60, &primary)
|
|
if r2.Outcome != OutcomeCatchUp || !r2.Proof.Recoverable {
|
|
t.Fatalf("r2: outcome=%s proof=%v", r2.Outcome, r2.Proof)
|
|
}
|
|
o.CompleteCatchUp("r2", 100)
|
|
|
|
// r3: needs rebuild.
|
|
r3 := o.ExecuteRecovery("r3", 20, &primary)
|
|
if r3.Outcome != OutcomeNeedsRebuild {
|
|
t.Fatalf("r3: %s", r3.Outcome)
|
|
}
|
|
|
|
// Registry status.
|
|
status := o.Registry.Status()
|
|
if status.InSync != 2 {
|
|
t.Fatalf("in_sync=%d", status.InSync)
|
|
}
|
|
if status.Rebuilding != 1 {
|
|
t.Fatalf("rebuilding=%d", status.Rebuilding)
|
|
}
|
|
}
|
|
|
|
// --- Observability ---
|
|
|
|
func TestIntegration_Observability_SessionSnapshot(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
_ = RetainedHistory{HeadLSN: 100, TailLSN: 0, CommittedLSN: 100}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
|
|
// After handshake with replica ahead → truncation required.
|
|
s := o.Registry.Sender("r1")
|
|
id := s.SessionID()
|
|
s.BeginConnect(id)
|
|
s.RecordHandshakeWithOutcome(id, HandshakeResult{
|
|
ReplicaFlushedLSN: 105, CommittedLSN: 100, RetentionStartLSN: 0,
|
|
})
|
|
|
|
snap := s.SessionSnapshot()
|
|
if !snap.TruncateRequired {
|
|
t.Fatal("snapshot should show truncation required")
|
|
}
|
|
if snap.TruncateToLSN != 100 {
|
|
t.Fatalf("truncate to=%d", snap.TruncateToLSN)
|
|
}
|
|
}
|
|
|
|
func TestIntegration_Observability_RebuildSnapshot(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{
|
|
HeadLSN: 100, TailLSN: 30, CommittedLSN: 100,
|
|
CheckpointLSN: 50, CheckpointTrusted: true,
|
|
}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionRebuild},
|
|
})
|
|
|
|
s := o.Registry.Sender("r1")
|
|
id := s.SessionID()
|
|
s.BeginConnect(id)
|
|
s.RecordHandshake(id, 0, 100)
|
|
s.SelectRebuildFromHistory(id, &primary)
|
|
|
|
snap := s.SessionSnapshot()
|
|
if snap.RebuildSource != RebuildSnapshotTail {
|
|
t.Fatalf("rebuild source=%s", snap.RebuildSource)
|
|
}
|
|
if snap.RebuildPhase != RebuildPhaseSourceSelect {
|
|
t.Fatalf("rebuild phase=%s", snap.RebuildPhase)
|
|
}
|
|
}
|
|
|
|
func TestIntegration_RecoveryLog_AutoPopulated(t *testing.T) {
|
|
o := NewRecoveryOrchestrator()
|
|
primary := RetainedHistory{HeadLSN: 100, TailLSN: 0, CommittedLSN: 100}
|
|
|
|
o.ProcessAssignment(AssignmentIntent{
|
|
Replicas: []ReplicaAssignment{
|
|
{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "r1:9333", Version: 1}},
|
|
},
|
|
Epoch: 1,
|
|
RecoveryTargets: map[string]SessionKind{"r1": SessionCatchUp},
|
|
})
|
|
o.ExecuteRecovery("r1", 80, &primary)
|
|
o.CompleteCatchUp("r1", 100)
|
|
|
|
events := o.Log.EventsFor("r1")
|
|
// Should have: sender_added, session_created, connected, handshake, catchup_started, completed.
|
|
if len(events) < 5 {
|
|
t.Fatalf("expected ≥5 auto-populated events, got %d", len(events))
|
|
}
|
|
|
|
// All events came from the orchestrator, not manual test logging.
|
|
for _, e := range events {
|
|
if e.Event == "" {
|
|
t.Fatal("event should have non-empty type")
|
|
}
|
|
}
|
|
t.Logf("auto-populated log: %d events", len(events))
|
|
}
|