@ -6,15 +6,15 @@ import (
)
// Sender owns the replication channel to one replica. It is the authority
// boundary for all execution operations — every API validates the session
// ID before mutating state .
// boundary for all execution operations. All mutable state is unexported —
// external code reads state through accessors and mutates through execution APIs .
type Sender struct {
mu sync . Mutex
R eplicaID string
E ndpoint Endpoint
E poch uint64
S tate ReplicaState
r eplicaID string
e ndpoint Endpoint
e poch uint64
s tate ReplicaState
session * Session
stopped bool
@ -23,26 +23,92 @@ type Sender struct {
// NewSender creates a sender for a replica.
func NewSender ( replicaID string , endpoint Endpoint , epoch uint64 ) * Sender {
return & Sender {
R eplicaID: replicaID ,
E ndpoint: endpoint ,
E poch: epoch ,
S tate: StateDisconnected ,
r eplicaID: replicaID ,
e ndpoint: endpoint ,
e poch: epoch ,
s tate: StateDisconnected ,
}
}
// Read-only accessors.
func ( s * Sender ) ReplicaID ( ) string { s . mu . Lock ( ) ; defer s . mu . Unlock ( ) ; return s . replicaID }
func ( s * Sender ) Endpoint ( ) Endpoint { s . mu . Lock ( ) ; defer s . mu . Unlock ( ) ; return s . endpoint }
func ( s * Sender ) Epoch ( ) uint64 { s . mu . Lock ( ) ; defer s . mu . Unlock ( ) ; return s . epoch }
func ( s * Sender ) State ( ) ReplicaState { s . mu . Lock ( ) ; defer s . mu . Unlock ( ) ; return s . state }
func ( s * Sender ) Stopped ( ) bool { s . mu . Lock ( ) ; defer s . mu . Unlock ( ) ; return s . stopped }
// SessionSnapshot returns a read-only copy of the current session state.
// Returns nil if no session is active. The returned snapshot is disconnected
// from the live session — mutations to the Sender do not affect it.
func ( s * Sender ) SessionSnapshot ( ) * SessionSnapshot {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if s . session == nil {
return nil
}
return & SessionSnapshot {
ID : s . session . id ,
ReplicaID : s . session . replicaID ,
Epoch : s . session . epoch ,
Kind : s . session . kind ,
Phase : s . session . phase ,
InvalidateReason : s . session . invalidateReason ,
StartLSN : s . session . startLSN ,
TargetLSN : s . session . targetLSN ,
FrozenTargetLSN : s . session . frozenTargetLSN ,
RecoveredTo : s . session . recoveredTo ,
Active : s . session . Active ( ) ,
}
}
// SessionSnapshot is a read-only copy of session state for external inspection.
type SessionSnapshot struct {
ID uint64
ReplicaID string
Epoch uint64
Kind SessionKind
Phase SessionPhase
InvalidateReason string
StartLSN uint64
TargetLSN uint64
FrozenTargetLSN uint64
RecoveredTo uint64
Active bool
}
// SessionID returns the current session ID, or 0 if no session.
func ( s * Sender ) SessionID ( ) uint64 {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if s . session == nil {
return 0
}
return s . session . id
}
// HasActiveSession returns true if a session is currently active.
func ( s * Sender ) HasActiveSession ( ) bool {
s . mu . Lock ( )
defer s . mu . Unlock ( )
return s . session != nil && s . session . Active ( )
}
// === Lifecycle APIs ===
// UpdateEpoch advances the sender's epoch. Invalidates stale sessions.
func ( s * Sender ) UpdateEpoch ( epoch uint64 ) {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if s . stopped || epoch <= s . Epoch {
if s . stopped || epoch <= s . e poch {
return
}
oldEpoch := s . Epoch
s . Epoch = epoch
if s . session != nil && s . session . Epoch < epoch {
oldEpoch := s . e poch
s . e poch = epoch
if s . session != nil && s . session . e poch < epoch {
s . session . invalidate ( fmt . Sprintf ( "epoch_advanced_%d_to_%d" , oldEpoch , epoch ) )
s . session = nil
s . State = StateDisconnected
s . s tate = StateDisconnected
}
}
@ -53,59 +119,62 @@ func (s *Sender) UpdateEndpoint(ep Endpoint) {
if s . stopped {
return
}
if s . E ndpoint. Changed ( ep ) && s . session != nil {
if s . e ndpoint. Changed ( ep ) && s . session != nil {
s . session . invalidate ( "endpoint_changed" )
s . session = nil
s . State = StateDisconnected
s . state = StateDisconnected
}
s . endpoint = ep
}
// SessionOption configures a newly created session.
type SessionOption func ( s * Session )
// WithBudget attaches a catch-up budget to the session.
func WithBudget ( budget CatchUpBudget ) SessionOption {
return func ( s * Session ) {
b := budget // copy
s . budget = & b
}
s . Endpoint = ep
}
// AttachSession creates a new recovery session. Epoch must match sender epoch.
func ( s * Sender ) AttachSession ( epoch uint64 , kind SessionKind ) ( * Session , error ) {
func ( s * Sender ) AttachSession ( epoch uint64 , kind SessionKind , opts ... SessionOption ) ( uint64 , error ) {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if s . stopped {
return nil , fmt . Errorf ( "sender stopped" )
return 0 , fmt . Errorf ( "sender stopped" )
}
if epoch != s . Epoch {
return nil , fmt . Errorf ( "epoch mismatch: sender=%d session=%d" , s . E poch, epoch )
if epoch != s . e poch {
return 0 , fmt . Errorf ( "epoch mismatch: sender=%d session=%d" , s . e poch, epoch )
}
if s . session != nil && s . session . Active ( ) {
return nil , fmt . Errorf ( "session already active (id=%d)" , s . session . ID )
return 0 , fmt . Errorf ( "session already active (id=%d)" , s . session . id )
}
sess := newSession ( s . replicaID , epoch , kind )
for _ , opt := range opts {
opt ( sess )
}
sess := newSession ( s . ReplicaID , epoch , kind )
s . session = sess
return sess , nil
return sess . id , nil
}
// SupersedeSession invalidates current session and attaches new at sender epoch.
func ( s * Sender ) SupersedeSession ( kind SessionKind , reason string ) * Session {
func ( s * Sender ) SupersedeSession ( kind SessionKind , reason string , opts ... SessionOption ) uint64 {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if s . stopped {
return nil
return 0
}
if s . session != nil {
s . session . invalidate ( reason )
}
sess := newSession ( s . ReplicaID , s . Epoch , kind )
sess := newSession ( s . replicaID , s . epoch , kind )
for _ , opt := range opts {
opt ( sess )
}
s . session = sess
return sess
}
// Session returns the current session, or nil.
func ( s * Sender ) Session ( ) * Session {
s . mu . Lock ( )
defer s . mu . Unlock ( )
return s . session
}
// Stopped returns true if the sender has been stopped.
func ( s * Sender ) Stopped ( ) bool {
s . mu . Lock ( )
defer s . mu . Unlock ( )
return s . stopped
return sess . id
}
// Stop shuts down the sender.
@ -130,26 +199,24 @@ func (s *Sender) InvalidateSession(reason string, targetState ReplicaState) {
s . session . invalidate ( reason )
s . session = nil
}
s . S tate = targetState
s . s tate = targetState
}
// === Catch-up execution APIs ===
// BeginConnect transitions init → connecting.
func ( s * Sender ) BeginConnect ( sessionID uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if ! s . session . A dvance( PhaseConnecting ) {
return fmt . Errorf ( "cannot begin connect: phase=%s" , s . session . P hase)
if ! s . session . a dvance( PhaseConnecting ) {
return fmt . Errorf ( "cannot begin connect: phase=%s" , s . session . p hase)
}
s . S tate = StateConnecting
s . s tate = StateConnecting
return nil
}
// RecordHandshake records handshake result and sets catch-up range.
func ( s * Sender ) RecordHandshake ( sessionID uint64 , startLSN , targetLSN uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
@ -159,14 +226,13 @@ func (s *Sender) RecordHandshake(sessionID uint64, startLSN, targetLSN uint64) e
if targetLSN < startLSN {
return fmt . Errorf ( "invalid range: target=%d < start=%d" , targetLSN , startLSN )
}
if ! s . session . A dvance( PhaseHandshake ) {
return fmt . Errorf ( "cannot record handshake: phase=%s" , s . session . P hase)
if ! s . session . a dvance( PhaseHandshake ) {
return fmt . Errorf ( "cannot record handshake: phase=%s" , s . session . p hase)
}
s . session . S etRange( startLSN , targetLSN )
s . session . s etRange( startLSN , targetLSN )
return nil
}
// RecordHandshakeWithOutcome records handshake and classifies the recovery outcome.
func ( s * Sender ) RecordHandshakeWithOutcome ( sessionID uint64 , result HandshakeResult ) ( RecoveryOutcome , error ) {
outcome := ClassifyRecoveryOutcome ( result )
s . mu . Lock ( )
@ -174,106 +240,100 @@ func (s *Sender) RecordHandshakeWithOutcome(sessionID uint64, result HandshakeRe
if err := s . checkAuthority ( sessionID ) ; err != nil {
return outcome , err
}
if s . session . P hase != PhaseConnecting {
return outcome , fmt . Errorf ( "handshake requires connecting, got %s" , s . session . P hase)
if s . session . p hase != PhaseConnecting {
return outcome , fmt . Errorf ( "handshake requires connecting, got %s" , s . session . p hase)
}
if outcome == OutcomeNeedsRebuild {
s . session . invalidate ( "gap_exceeds_retention" )
s . session = nil
s . S tate = StateNeedsRebuild
s . s tate = StateNeedsRebuild
return outcome , nil
}
if ! s . session . A dvance( PhaseHandshake ) {
return outcome , fmt . Errorf ( "cannot advance to handshake: phase=%s" , s . session . P hase)
if ! s . session . a dvance( PhaseHandshake ) {
return outcome , fmt . Errorf ( "cannot advance to handshake: phase=%s" , s . session . p hase)
}
switch outcome {
case OutcomeZeroGap :
s . session . S etRange( result . ReplicaFlushedLSN , result . ReplicaFlushedLSN )
s . session . s etRange( result . ReplicaFlushedLSN , result . ReplicaFlushedLSN )
case OutcomeCatchUp :
if result . ReplicaFlushedLSN > result . CommittedLSN {
s . session . T runcateRequired = true
s . session . T runcateToLSN = result . CommittedLSN
s . session . S etRange( result . CommittedLSN , result . CommittedLSN )
s . session . t runcateRequired = true
s . session . t runcateToLSN = result . CommittedLSN
s . session . s etRange( result . CommittedLSN , result . CommittedLSN )
} else {
s . session . S etRange( result . ReplicaFlushedLSN , result . CommittedLSN )
s . session . s etRange( result . ReplicaFlushedLSN , result . CommittedLSN )
}
}
return outcome , nil
}
// BeginCatchUp transitions to catch-up phase. Rejects rebuild sessions.
// Freezes the target unconditionally.
func ( s * Sender ) BeginCatchUp ( sessionID uint64 , startTick ... uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . K ind == SessionRebuild {
if s . session . k ind == SessionRebuild {
return fmt . Errorf ( "rebuild sessions must use rebuild APIs" )
}
if ! s . session . A dvance( PhaseCatchUp ) {
return fmt . Errorf ( "cannot begin catch-up: phase=%s" , s . session . P hase)
if ! s . session . a dvance( PhaseCatchUp ) {
return fmt . Errorf ( "cannot begin catch-up: phase=%s" , s . session . p hase)
}
s . S tate = StateCatchingUp
s . session . F rozenTargetLSN = s . session . T argetLSN
s . s tate = StateCatchingUp
s . session . f rozenTargetLSN = s . session . t argetLSN
if len ( startTick ) > 0 {
s . session . T racker. StartTick = startTick [ 0 ]
s . session . T racker. LastProgressTick = startTick [ 0 ]
s . session . t racker. StartTick = startTick [ 0 ]
s . session . t racker. LastProgressTick = startTick [ 0 ]
}
return nil
}
// RecordCatchUpProgress records catch-up progress. Rejects rebuild sessions.
// Entry counting uses LSN delta. Tick is required when ProgressDeadlineTicks > 0.
func ( s * Sender ) RecordCatchUpProgress ( sessionID uint64 , recoveredTo uint64 , tick ... uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . K ind == SessionRebuild {
if s . session . k ind == SessionRebuild {
return fmt . Errorf ( "rebuild sessions must use rebuild APIs" )
}
if s . session . P hase != PhaseCatchUp {
return fmt . Errorf ( "progress requires catchup phase, got %s" , s . session . P hase)
if s . session . p hase != PhaseCatchUp {
return fmt . Errorf ( "progress requires catchup phase, got %s" , s . session . p hase)
}
if recoveredTo <= s . session . R ecoveredTo {
return fmt . Errorf ( "progress regression: %d <= %d" , recoveredTo , s . session . R ecoveredTo)
if recoveredTo <= s . session . r ecoveredTo {
return fmt . Errorf ( "progress regression: %d <= %d" , recoveredTo , s . session . r ecoveredTo)
}
if s . session . F rozenTargetLSN > 0 && recoveredTo > s . session . F rozenTargetLSN {
return fmt . Errorf ( "progress %d exceeds frozen target %d" , recoveredTo , s . session . F rozenTargetLSN)
if s . session . f rozenTargetLSN > 0 && recoveredTo > s . session . f rozenTargetLSN {
return fmt . Errorf ( "progress %d exceeds frozen target %d" , recoveredTo , s . session . f rozenTargetLSN)
}
if s . session . B udget != nil && s . session . B udget. ProgressDeadlineTicks > 0 && len ( tick ) == 0 {
if s . session . b udget != nil && s . session . b udget. ProgressDeadlineTicks > 0 && len ( tick ) == 0 {
return fmt . Errorf ( "tick required when ProgressDeadlineTicks > 0" )
}
delta := recoveredTo - s . session . R ecoveredTo
s . session . T racker. EntriesReplayed += delta
s . session . U pdateProgress( recoveredTo )
delta := recoveredTo - s . session . r ecoveredTo
s . session . t racker. EntriesReplayed += delta
s . session . u pdateProgress( recoveredTo )
if len ( tick ) > 0 {
s . session . T racker. LastProgressTick = tick [ 0 ]
s . session . t racker. LastProgressTick = tick [ 0 ]
}
return nil
}
// RecordTruncation confirms divergent tail cleanup.
func ( s * Sender ) RecordTruncation ( sessionID uint64 , truncatedToLSN uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if ! s . session . T runcateRequired {
if ! s . session . t runcateRequired {
return fmt . Errorf ( "truncation not required" )
}
if truncatedToLSN != s . session . T runcateToLSN {
return fmt . Errorf ( "truncation LSN mismatch: expected %d, got %d" , s . session . T runcateToLSN, truncatedToLSN )
if truncatedToLSN != s . session . t runcateToLSN {
return fmt . Errorf ( "truncation LSN mismatch: expected %d, got %d" , s . session . t runcateToLSN, truncatedToLSN )
}
s . session . T runcateRecorded = true
s . session . t runcateRecorded = true
return nil
}
// CompleteSessionByID completes catch-up sessions. Rejects rebuild sessions.
func ( s * Sender ) CompleteSessionByID ( sessionID uint64 ) bool {
s . mu . Lock ( )
defer s . mu . Unlock ( )
@ -281,19 +341,19 @@ func (s *Sender) CompleteSessionByID(sessionID uint64) bool {
return false
}
sess := s . session
if sess . K ind == SessionRebuild {
if sess . k ind == SessionRebuild {
return false
}
if sess . T runcateRequired && ! sess . T runcateRecorded {
if sess . t runcateRequired && ! sess . t runcateRecorded {
return false
}
switch sess . P hase {
switch sess . p hase {
case PhaseCatchUp :
if ! sess . Converged ( ) {
return false
}
case PhaseHandshake :
if sess . T argetLSN != sess . S tartLSN {
if sess . t argetLSN != sess . s tartLSN {
return false
}
default :
@ -301,48 +361,46 @@ func (s *Sender) CompleteSessionByID(sessionID uint64) bool {
}
sess . complete ( )
s . session = nil
s . S tate = StateInSync
s . s tate = StateInSync
return true
}
// CheckBudget evaluates catch-up budget. Auto-escalates on violation.
func ( s * Sender ) CheckBudget ( sessionID uint64 , currentTick uint64 ) ( BudgetViolation , error ) {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return BudgetOK , err
}
if s . session . B udget == nil {
if s . session . b udget == nil {
return BudgetOK , nil
}
v := s . session . B udget. Check ( s . session . T racker, currentTick )
v := s . session . b udget. Check ( s . session . t racker, currentTick )
if v != BudgetOK {
s . session . invalidate ( fmt . Sprintf ( "budget_%s" , v ) )
s . session = nil
s . S tate = StateNeedsRebuild
s . s tate = StateNeedsRebuild
}
return v , nil
}
// === Rebuild execution APIs ===
// SelectRebuildSource chooses rebuild source. Requires PhaseHandshake.
func ( s * Sender ) SelectRebuildSource ( sessionID uint64 , snapshotLSN uint64 , snapshotValid bool , committedLSN uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . K ind != SessionRebuild {
if s . session . k ind != SessionRebuild {
return fmt . Errorf ( "not a rebuild session" )
}
if s . session . P hase != PhaseHandshake {
return fmt . Errorf ( "requires PhaseHandshake, got %s" , s . session . P hase)
if s . session . p hase != PhaseHandshake {
return fmt . Errorf ( "requires PhaseHandshake, got %s" , s . session . p hase)
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "rebuild state not initialized" )
}
return s . session . R ebuild. SelectSource ( snapshotLSN , snapshotValid , committedLSN )
return s . session . r ebuild. SelectSource ( snapshotLSN , snapshotValid , committedLSN )
}
func ( s * Sender ) BeginRebuildTransfer ( sessionID uint64 ) error {
@ -351,10 +409,10 @@ func (s *Sender) BeginRebuildTransfer(sessionID uint64) error {
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "no rebuild state" )
}
return s . session . R ebuild. BeginTransfer ( )
return s . session . r ebuild. BeginTransfer ( )
}
func ( s * Sender ) RecordRebuildTransferProgress ( sessionID uint64 , transferredTo uint64 ) error {
@ -363,10 +421,10 @@ func (s *Sender) RecordRebuildTransferProgress(sessionID uint64, transferredTo u
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "no rebuild state" )
}
return s . session . R ebuild. RecordTransferProgress ( transferredTo )
return s . session . r ebuild. RecordTransferProgress ( transferredTo )
}
func ( s * Sender ) BeginRebuildTailReplay ( sessionID uint64 ) error {
@ -375,10 +433,10 @@ func (s *Sender) BeginRebuildTailReplay(sessionID uint64) error {
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "no rebuild state" )
}
return s . session . R ebuild. BeginTailReplay ( )
return s . session . r ebuild. BeginTailReplay ( )
}
func ( s * Sender ) RecordRebuildTailProgress ( sessionID uint64 , replayedTo uint64 ) error {
@ -387,32 +445,30 @@ func (s *Sender) RecordRebuildTailProgress(sessionID uint64, replayedTo uint64)
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "no rebuild state" )
}
return s . session . R ebuild. RecordTailReplayProgress ( replayedTo )
return s . session . r ebuild. RecordTailReplayProgress ( replayedTo )
}
// CompleteRebuild completes a rebuild session. Requires ReadyToComplete.
func ( s * Sender ) CompleteRebuild ( sessionID uint64 ) error {
s . mu . Lock ( )
defer s . mu . Unlock ( )
if err := s . checkAuthority ( sessionID ) ; err != nil {
return err
}
if s . session . R ebuild == nil {
if s . session . r ebuild == nil {
return fmt . Errorf ( "no rebuild state" )
}
if err := s . session . R ebuild. Complete ( ) ; err != nil {
if err := s . session . r ebuild. Complete ( ) ; err != nil {
return err
}
s . session . complete ( )
s . session = nil
s . S tate = StateInSync
s . s tate = StateInSync
return nil
}
// checkAuthority validates session ownership.
func ( s * Sender ) checkAuthority ( sessionID uint64 ) error {
if s . stopped {
return fmt . Errorf ( "sender stopped" )
@ -420,11 +476,11 @@ func (s *Sender) checkAuthority(sessionID uint64) error {
if s . session == nil {
return fmt . Errorf ( "no active session" )
}
if s . session . ID != sessionID {
return fmt . Errorf ( "session ID mismatch: active=%d requested=%d" , s . session . ID , sessionID )
if s . session . id != sessionID {
return fmt . Errorf ( "session ID mismatch: active=%d requested=%d" , s . session . id , sessionID )
}
if ! s . session . Active ( ) {
return fmt . Errorf ( "session %d not active (phase=%s)" , sessionID , s . session . P hase)
return fmt . Errorf ( "session %d not active (phase=%s)" , sessionID , s . session . p hase)
}
return nil
}