From 08e34e02aeda1df9f3d2e95fe775b007c6ec38c3 Mon Sep 17 00:00:00 2001 From: pingqiu Date: Tue, 31 Mar 2026 15:22:23 -0700 Subject: [PATCH] feat: separate CommittedLSN from CheckpointLSN, close catch-up ONE CHAIN (Phase 08 P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CommittedLSN separation: - StatusSnapshot().CommittedLSN = nextLSN-1 (WAL head) for sync_all - Was: flusher.CheckpointLSN() (collapsed catch-up window to zero) - Now: entries between checkpoint and head are committed but unflushed - Creates real catch-up window: TailLSN=5 < replica=6 < CommittedLSN=10 Catch-up ONE CHAIN PROVEN: assignment → PlanRecovery(replica=6) → OutcomeCatchUp → CatchUpExecutor(IO=v2bridge) → StreamWALEntries(6,10) → real ScanFrom from disk → engine progress → InSync → pinner.ActiveHoldCount()==0 Both chains now closed: - Catch-up: plan → executor(IO) → v2bridge → blockvol → complete - Rebuild: plan → executor(IO) → v2bridge → blockvol → complete Co-Authored-By: Claude Opus 4.6 (1M context) --- weed/storage/blockvol/blockvol.go | 14 ++++++------- .../blockvol/v2bridge/execution_chain_test.go | 20 ++++++------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/weed/storage/blockvol/blockvol.go b/weed/storage/blockvol/blockvol.go index 60b11bf2e..e00950cb7 100644 --- a/weed/storage/blockvol/blockvol.go +++ b/weed/storage/blockvol/blockvol.go @@ -899,7 +899,7 @@ type V2StatusSnapshot struct { // // WALHeadLSN ← nextLSN - 1 (last written LSN) // WALTailLSN ← super.WALCheckpointLSN (LSN boundary, not byte offset) -// CommittedLSN ← flusher.CheckpointLSN() (V1 interim: barrier-confirmed + flushed) +// CommittedLSN ← nextLSN - 1 (for sync_all: every write is barrier-confirmed) // CheckpointLSN ← super.WALCheckpointLSN (durable base image) // CheckpointTrusted ← super.Validate() == nil (superblock integrity) func (v *BlockVol) StatusSnapshot() V2StatusSnapshot { @@ -910,15 +910,13 @@ func (v *BlockVol) StatusSnapshot() V2StatusSnapshot { // WALTailLSN: the oldest retained LSN boundary for recovery classification. // Entries with LSN > WALTailLSN are guaranteed in the WAL. - // Entries with LSN <= WALTailLSN have been checkpointed and WAL space - // may be reused. This is an LSN (not a physical byte offset). walTailLSN := v.super.WALCheckpointLSN - // CommittedLSN: V1 interim mapping. committed = checkpointed after flush. - var committedLSN uint64 - if v.flusher != nil { - committedLSN = v.flusher.CheckpointLSN() - } + // CommittedLSN: for sync_all mode, every write is barrier-confirmed + // before returning. So WALHeadLSN (nextLSN-1) IS the committed boundary. + // This separates CommittedLSN from CheckpointLSN — entries between + // checkpoint and head are committed but not yet flushed to extent. + committedLSN := headLSN return V2StatusSnapshot{ WALHeadLSN: headLSN, diff --git a/weed/storage/blockvol/v2bridge/execution_chain_test.go b/weed/storage/blockvol/v2bridge/execution_chain_test.go index 45cee89f4..5a34b10e7 100644 --- a/weed/storage/blockvol/v2bridge/execution_chain_test.go +++ b/weed/storage/blockvol/v2bridge/execution_chain_test.go @@ -68,20 +68,16 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) { t.Logf("catch-up: head=%d tail=%d committed=%d checkpoint=%d", state.WALHeadLSN, state.WALTailLSN, state.CommittedLSN, state.CheckpointLSN) - // Precondition: head > committed (entries above checkpoint exist). - if state.WALHeadLSN <= state.CommittedLSN { - t.Fatalf("need entries above checkpoint: head=%d committed=%d", state.WALHeadLSN, state.CommittedLSN) + // Precondition: CommittedLSN > TailLSN (catch-up window exists). + if state.CommittedLSN <= state.WALTailLSN { + t.Fatalf("no catch-up window: committed=%d tail=%d", state.CommittedLSN, state.WALTailLSN) } // Step 1: assignment. driver.Orchestrator.ProcessAssignment(makeIntent(ca, 1, "replica")) - // Step 2: plan — replica at committedLSN = ZeroGap (V1 interim). - // Replica at LESS than committedLSN → CatchUp. - replicaLSN := state.CommittedLSN - 1 - if replicaLSN == 0 && state.CommittedLSN > 1 { - replicaLSN = state.CommittedLSN - 1 - } + // Step 2: plan — replica WITHIN the catch-up window (between tail and committed). + replicaLSN := state.WALTailLSN + 1 // just above tail, within window plan, err := driver.PlanRecovery("vol1/vs2", replicaLSN) if err != nil { @@ -111,11 +107,7 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) { t.Log("catch-up: ONE CHAIN proven: plan → CatchUpExecutor → complete → InSync → pins released") } else { - // V1 interim: CommittedLSN = TailLSN after flush. - // No gap between tail and committed → OutcomeCatchUp structurally unreachable. - // This is a known V1 limitation, NOT a test failure. - t.Skipf("catch-up: V1 interim → %s (replica=%d committed=%d tail=%d). "+ - "One-chain wiring exists but V1 model prevents OutcomeCatchUp when committed=tail.", + t.Fatalf("catch-up: unexpected outcome=%s (replica=%d committed=%d tail=%d)", plan.Outcome, replicaLSN, state.CommittedLSN, state.WALTailLSN) } }