package replication

import "fmt"

// RecoveryOrchestrator drives the recovery lifecycle from assignment intent
// through execution to completion/escalation. It is the integrated entry
// path above raw Sender APIs — callers interact with the orchestrator,
// not with individual sender execution methods.
//
// The orchestrator owns:
//   - assignment processing (reconcile + session creation)
//   - handshake evaluation (from RetainedHistory)
//   - recovery execution (catch-up or rebuild through completion)
//   - automatic event logging at every lifecycle transition
type RecoveryOrchestrator struct {
	Registry *Registry
	Log      *RecoveryLog
}

// NewRecoveryOrchestrator creates an orchestrator with a fresh registry and log.
func NewRecoveryOrchestrator() *RecoveryOrchestrator {
	return &RecoveryOrchestrator{
		Registry: NewRegistry(),
		Log:      NewRecoveryLog(),
	}
}

// ProcessAssignment applies an assignment intent to the registry and records
// one log event per sender added, sender removed, session created, session
// superseded, and session failed, as reported by the registry's result.
//
// It also detects endpoint-change invalidations: for every replica in the
// intent that already had an active session, a "session_invalidated" event
// with reason "endpoint_changed" is recorded when, after the assignment, the
// sender's session ID differs and its endpoint reports a change. These events
// are emitted in the order the replicas appear in intent.Replicas.
//
// The registry's AssignmentResult is returned unchanged.
func (o *RecoveryOrchestrator) ProcessAssignment(intent AssignmentIntent) AssignmentResult {
	// Snapshot pre-assignment state for invalidation detection.
	type preState struct {
		hadSession bool
		sessionID  uint64
		endpoint   Endpoint
	}
	pre := make(map[string]preState, len(intent.Replicas))
	for _, ra := range intent.Replicas {
		if s := o.Registry.Sender(ra.ReplicaID); s != nil {
			pre[ra.ReplicaID] = preState{
				hadSession: s.HasActiveSession(),
				sessionID:  s.SessionID(),
				endpoint:   s.Endpoint(),
			}
		}
	}

	result := o.Registry.ApplyAssignment(intent)

	for _, id := range result.Added {
		o.Log.Record(id, 0, "sender_added", "")
	}
	for _, id := range result.Removed {
		o.Log.Record(id, 0, "sender_removed", "")
	}

	// Detect endpoint-change invalidations: only log "endpoint_changed" when
	// the endpoint actually changed, not on normal session supersede.
	//
	// Walk the intent rather than the snapshot map so the emitted events have
	// a stable order (Go map iteration order is unspecified). Each snapshot is
	// deleted once handled so a replica listed twice is still logged once.
	for _, ra := range intent.Replicas {
		p, ok := pre[ra.ReplicaID]
		if !ok {
			continue
		}
		delete(pre, ra.ReplicaID)
		if !p.hadSession {
			continue
		}
		s := o.Registry.Sender(ra.ReplicaID)
		if s != nil && s.SessionID() != p.sessionID && s.Endpoint().Changed(p.endpoint) {
			o.Log.Record(ra.ReplicaID, p.sessionID, "session_invalidated", "endpoint_changed")
		}
	}

	// currentSessionID tolerates a sender that is no longer registered, in
	// line with the nil checks applied to every other Registry.Sender lookup
	// in this file; such a sender is logged with session ID 0.
	currentSessionID := func(id string) uint64 {
		if s := o.Registry.Sender(id); s != nil {
			return s.SessionID()
		}
		return 0
	}

	for _, id := range result.SessionsCreated {
		o.Log.Record(id, currentSessionID(id), "session_created", "")
	}
	for _, id := range result.SessionsSuperseded {
		o.Log.Record(id, currentSessionID(id), "session_superseded", "")
	}
	for _, id := range result.SessionsFailed {
		o.Log.Record(id, 0, "session_failed", "")
	}

	return result
}

// RecoveryResult captures the outcome of a single replica recovery attempt.
type RecoveryResult struct {
	ReplicaID  string
	Outcome    RecoveryOutcome
	Proof      *RecoverabilityProof
	FinalState ReplicaState
	Error      error
}

// ExecuteRecovery runs the recovery flow for a single replica:
// connect → handshake (from history) → complete, hand off, or escalate.
//
// Outcomes:
//   - Unknown replica or a sender whose session ID is 0: a result carrying
//     only ReplicaID and Error is returned and nothing is logged.
//   - Connect or handshake failure: the failure is logged and returned in
//     Error together with the sender's current state.
//   - OutcomeNeedsRebuild: an "escalated" event is logged and FinalState is
//     reported as StateNeedsRebuild; a separate rebuild assignment is required.
//   - OutcomeZeroGap: the session is completed immediately and FinalState is
//     reported as StateInSync. If completion is rejected, the rejection is
//     logged and the sender's current state is reported instead.
//   - Any other outcome: the result reports the sender's current state and
//     the session is left for the caller to drive (see CompleteCatchUp).
func (o *RecoveryOrchestrator) ExecuteRecovery(replicaID string, replicaFlushedLSN uint64, history *RetainedHistory) RecoveryResult {
	s := o.Registry.Sender(replicaID)
	if s == nil {
		return RecoveryResult{ReplicaID: replicaID, Error: fmt.Errorf("sender not found")}
	}

	sessID := s.SessionID()
	if sessID == 0 {
		return RecoveryResult{ReplicaID: replicaID, Error: fmt.Errorf("no session")}
	}

	// Connect.
	if err := s.BeginConnect(sessID); err != nil {
		o.Log.Record(replicaID, sessID, "connect_failed", err.Error())
		return RecoveryResult{ReplicaID: replicaID, FinalState: s.State(), Error: err}
	}
	o.Log.Record(replicaID, sessID, "connected", "")

	// Handshake from history.
	outcome, proof, err := s.RecordHandshakeFromHistory(sessID, replicaFlushedLSN, history)
	if err != nil {
		o.Log.Record(replicaID, sessID, "handshake_failed", err.Error())
		return RecoveryResult{ReplicaID: replicaID, Outcome: outcome, Proof: proof, FinalState: s.State(), Error: err}
	}
	o.Log.Record(replicaID, sessID, "handshake", fmt.Sprintf("outcome=%s", outcome))

	if outcome == OutcomeNeedsRebuild {
		// proof is a pointer produced by the sender; guard the dereference so
		// a missing proof cannot panic the escalation path.
		detail := "needs_rebuild: proof unavailable"
		if proof != nil {
			detail = fmt.Sprintf("needs_rebuild: %s", proof.Reason)
		}
		o.Log.Record(replicaID, sessID, "escalated", detail)
		return RecoveryResult{ReplicaID: replicaID, Outcome: outcome, Proof: proof, FinalState: StateNeedsRebuild}
	}

	// Zero-gap: complete immediately (no catch-up needed).
	if outcome == OutcomeZeroGap {
		if s.CompleteSessionByID(sessID) {
			o.Log.Record(replicaID, sessID, "completed", "zero_gap")
			return RecoveryResult{ReplicaID: replicaID, Outcome: outcome, Proof: proof, FinalState: StateInSync}
		}
		// Completion was refused: leave a trace, then fall through and report
		// whatever state the sender is actually in.
		o.Log.Record(replicaID, sessID, "completion_rejected", "zero_gap")
	}

	return RecoveryResult{ReplicaID: replicaID, Outcome: outcome, Proof: proof, FinalState: s.State()}
}

// CatchUpOptions configures the orchestrated catch-up flow.
type CatchUpOptions struct {
	TargetLSN    uint64 // required: what to catch up to
	StartTick    uint64 // tick when catch-up begins (for budget tracking)
	CompleteTick uint64 // tick when catch-up is evaluated (for budget checking)
	TruncateLSN  uint64 // if non-zero, record truncation before completion
}

// CompleteCatchUp drives the full catch-up lifecycle:
//  1. BeginCatchUp (freezes target)
//  2. RecordCatchUpProgress (skipped when the session snapshot is absent or
//     already shows RecoveredTo >= opts.TargetLSN)
//  3. CheckBudget (only when opts.CompleteTick > 0)
//  4. RecordTruncation (only when opts.TruncateLSN > 0)
//  5. CompleteSessionByID
//
// Logs causal reason for every rejection, escalation, or completion.
//
// It returns nil once the session has been completed. Otherwise it returns
// the error of the step that failed, "sender not found" for an unknown
// replica, "budget violation: …" when the budget check reports anything other
// than BudgetOK, or "completion rejected" when the final completion is refused.
func (o *RecoveryOrchestrator) CompleteCatchUp(replicaID string, opts CatchUpOptions) error {
	s := o.Registry.Sender(replicaID)
	if s == nil {
		return fmt.Errorf("sender not found")
	}
	sessID := s.SessionID()

	// Step 1: begin catch-up (freezes target, records start tick).
	if err := s.BeginCatchUp(sessID, opts.StartTick); err != nil {
		o.Log.Record(replicaID, sessID, "catchup_begin_failed", err.Error())
		return err
	}
	o.Log.Record(replicaID, sessID, "catchup_started", fmt.Sprintf("target=%d start_tick=%d", opts.TargetLSN, opts.StartTick))

	// Step 2: record progress (skip if already converged, e.g., truncation-only case).
	snap := s.SessionSnapshot()
	if snap != nil && snap.RecoveredTo < opts.TargetLSN {
		// The tick is an optional variadic argument: pass it only when the
		// caller supplied a completion tick.
		var progressTick []uint64
		if opts.CompleteTick > 0 {
			progressTick = []uint64{opts.CompleteTick}
		}
		if err := s.RecordCatchUpProgress(sessID, opts.TargetLSN, progressTick...); err != nil {
			o.Log.Record(replicaID, sessID, "catchup_progress_failed", err.Error())
			return err
		}
	}

	// Step 3: check budget at completion time.
	if opts.CompleteTick > 0 {
		violation, err := s.CheckBudget(sessID, opts.CompleteTick)
		if err != nil {
			// Previously returned silently; log it so every failure path of
			// this method leaves a causal record, as the contract promises.
			o.Log.Record(replicaID, sessID, "budget_check_failed", err.Error())
			return err
		}
		if violation != BudgetOK {
			o.Log.Record(replicaID, sessID, "budget_escalated", string(violation))
			return fmt.Errorf("budget violation: %s", violation)
		}
	}

	// Step 4: handle truncation (if required).
	if opts.TruncateLSN > 0 {
		if err := s.RecordTruncation(sessID, opts.TruncateLSN); err != nil {
			o.Log.Record(replicaID, sessID, "truncation_failed", err.Error())
			return err
		}
		o.Log.Record(replicaID, sessID, "truncation_recorded", fmt.Sprintf("truncated_to=%d", opts.TruncateLSN))
	}

	// Step 5: complete.
	if !s.CompleteSessionByID(sessID) {
		o.Log.Record(replicaID, sessID, "completion_rejected", "session not convergent or truncation missing")
		return fmt.Errorf("completion rejected")
	}
	o.Log.Record(replicaID, sessID, "completed", "in_sync")
	return nil
}

// CompleteRebuild drives the rebuild from history and completes.
// Called after a rebuild assignment when the sender is at NeedsRebuild.
func (o *RecoveryOrchestrator) CompleteRebuild(replicaID string, history *RetainedHistory) error { s := o.Registry.Sender(replicaID) if s == nil { return fmt.Errorf("sender not found") } sessID := s.SessionID() if err := s.BeginConnect(sessID); err != nil { o.Log.Record(replicaID, sessID, "rebuild_connect_failed", err.Error()) return err } o.Log.Record(replicaID, sessID, "rebuild_connected", "") if err := s.RecordHandshake(sessID, 0, history.CommittedLSN); err != nil { return err } if err := s.SelectRebuildFromHistory(sessID, history); err != nil { o.Log.Record(replicaID, sessID, "rebuild_source_failed", err.Error()) return err } snap := s.SessionSnapshot() o.Log.Record(replicaID, sessID, "rebuild_source_selected", fmt.Sprintf("kind=%s", snap.Kind)) if err := s.BeginRebuildTransfer(sessID); err != nil { return err } // Determine transfer target based on rebuild source. source, snapLSN := history.RebuildSourceDecision() if source == RebuildSnapshotTail { s.RecordRebuildTransferProgress(sessID, snapLSN) if err := s.BeginRebuildTailReplay(sessID); err != nil { return err } s.RecordRebuildTailProgress(sessID, history.CommittedLSN) } else { s.RecordRebuildTransferProgress(sessID, history.CommittedLSN) } if err := s.CompleteRebuild(sessID); err != nil { o.Log.Record(replicaID, sessID, "rebuild_failed", err.Error()) return err } o.Log.Record(replicaID, sessID, "rebuild_completed", "in_sync") return nil } // UpdateSenderEpoch advances a specific sender's epoch via the orchestrator. // Logs the transition and any session invalidation. 
func (o *RecoveryOrchestrator) UpdateSenderEpoch(replicaID string, newEpoch uint64) { s := o.Registry.Sender(replicaID) if s == nil { return } hadSession := s.HasActiveSession() oldSessID := s.SessionID() s.UpdateEpoch(newEpoch) if hadSession && !s.HasActiveSession() { o.Log.Record(replicaID, oldSessID, "session_invalidated", fmt.Sprintf("epoch_advanced_to_%d", newEpoch)) } } // InvalidateEpoch invalidates all stale sessions with per-replica logging. func (o *RecoveryOrchestrator) InvalidateEpoch(newEpoch uint64) int { // Collect per-replica state before invalidation. type pre struct { id string sess uint64 had bool } var senders []pre for _, s := range o.Registry.All() { senders = append(senders, pre{ id: s.ReplicaID(), sess: s.SessionID(), had: s.HasActiveSession(), }) } count := o.Registry.InvalidateEpoch(newEpoch) // Log per-replica invalidations. for _, p := range senders { s := o.Registry.Sender(p.id) if s != nil && p.had && !s.HasActiveSession() { o.Log.Record(p.id, p.sess, "session_invalidated", fmt.Sprintf("epoch_bump_to_%d", newEpoch)) } } return count }