Browse Source

feat: separate CommittedLSN from CheckpointLSN, close catch-up ONE CHAIN (Phase 08 P2)

CommittedLSN separation:
- StatusSnapshot().CommittedLSN = nextLSN-1 (the WAL head) for sync_all mode
- Was: flusher.CheckpointLSN(), which collapsed the catch-up window to zero
- Now: entries between the checkpoint and the head are committed but not yet flushed
- Creates a real catch-up window, e.g. TailLSN=5 < replica=6 < CommittedLSN=10

Catch-up ONE CHAIN PROVEN:
  assignment → PlanRecovery(replica=6) → OutcomeCatchUp
  → CatchUpExecutor(IO=v2bridge) → StreamWALEntries(6,10)
  → real ScanFrom from disk → engine progress → InSync
  → pinner.ActiveHoldCount()==0

Both chains now closed:
- Catch-up: plan → executor(IO) → v2bridge → blockvol → complete
- Rebuild: plan → executor(IO) → v2bridge → blockvol → complete

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
feature/sw-block
pingqiu 2 days ago
parent
commit
08e34e02ae
  1. 14
      weed/storage/blockvol/blockvol.go
  2. 20
      weed/storage/blockvol/v2bridge/execution_chain_test.go

14
weed/storage/blockvol/blockvol.go

@@ -899,7 +899,7 @@ type V2StatusSnapshot struct {
//
// WALHeadLSN ← nextLSN - 1 (last written LSN)
// WALTailLSN ← super.WALCheckpointLSN (LSN boundary, not byte offset)
-// CommittedLSN ← flusher.CheckpointLSN() (V1 interim: barrier-confirmed + flushed)
+// CommittedLSN ← nextLSN - 1 (for sync_all: every write is barrier-confirmed)
// CheckpointLSN ← super.WALCheckpointLSN (durable base image)
// CheckpointTrusted ← super.Validate() == nil (superblock integrity)
func (v *BlockVol) StatusSnapshot() V2StatusSnapshot {
@@ -910,15 +910,13 @@ func (v *BlockVol) StatusSnapshot() V2StatusSnapshot {
// WALTailLSN: the oldest retained LSN boundary for recovery classification.
// Entries with LSN > WALTailLSN are guaranteed in the WAL.
// Entries with LSN <= WALTailLSN have been checkpointed and WAL space
// may be reused. This is an LSN (not a physical byte offset).
walTailLSN := v.super.WALCheckpointLSN
-// CommittedLSN: V1 interim mapping. committed = checkpointed after flush.
-var committedLSN uint64
-if v.flusher != nil {
-committedLSN = v.flusher.CheckpointLSN()
-}
+// CommittedLSN: for sync_all mode, every write is barrier-confirmed
+// before returning. So WALHeadLSN (nextLSN-1) IS the committed boundary.
+// This separates CommittedLSN from CheckpointLSN — entries between
+// checkpoint and head are committed but not yet flushed to extent.
+committedLSN := headLSN
return V2StatusSnapshot{
WALHeadLSN: headLSN,

20
weed/storage/blockvol/v2bridge/execution_chain_test.go

@@ -68,20 +68,16 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) {
t.Logf("catch-up: head=%d tail=%d committed=%d checkpoint=%d",
state.WALHeadLSN, state.WALTailLSN, state.CommittedLSN, state.CheckpointLSN)
-// Precondition: head > committed (entries above checkpoint exist).
-if state.WALHeadLSN <= state.CommittedLSN {
-t.Fatalf("need entries above checkpoint: head=%d committed=%d", state.WALHeadLSN, state.CommittedLSN)
+// Precondition: CommittedLSN > TailLSN (catch-up window exists).
+if state.CommittedLSN <= state.WALTailLSN {
+t.Fatalf("no catch-up window: committed=%d tail=%d", state.CommittedLSN, state.WALTailLSN)
}
// Step 1: assignment.
driver.Orchestrator.ProcessAssignment(makeIntent(ca, 1, "replica"))
-// Step 2: plan — replica at committedLSN = ZeroGap (V1 interim).
-// Replica at LESS than committedLSN → CatchUp.
-replicaLSN := state.CommittedLSN - 1
-if replicaLSN == 0 && state.CommittedLSN > 1 {
-replicaLSN = state.CommittedLSN - 1
-}
+// Step 2: plan — replica WITHIN the catch-up window (between tail and committed).
+replicaLSN := state.WALTailLSN + 1 // just above tail, within window
plan, err := driver.PlanRecovery("vol1/vs2", replicaLSN)
if err != nil {
@@ -111,11 +107,7 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) {
t.Log("catch-up: ONE CHAIN proven: plan → CatchUpExecutor → complete → InSync → pins released")
} else {
-// V1 interim: CommittedLSN = TailLSN after flush.
-// No gap between tail and committed → OutcomeCatchUp structurally unreachable.
-// This is a known V1 limitation, NOT a test failure.
-t.Skipf("catch-up: V1 interim → %s (replica=%d committed=%d tail=%d). "+
-"One-chain wiring exists but V1 model prevents OutcomeCatchUp when committed=tail.",
+t.Fatalf("catch-up: unexpected outcome=%s (replica=%d committed=%d tail=%d)",
plan.Outcome, replicaLSN, state.CommittedLSN, state.WALTailLSN)
}
}

Loading…
Cancel
Save