From 46ef79ce354e459a469509511849603088fc89bd Mon Sep 17 00:00:00 2001 From: pingqiu Date: Tue, 31 Mar 2026 10:46:17 -0700 Subject: [PATCH] fix: stable ServerID in assignments, fail-closed on missing identity, wire into ProcessAssignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finding 1: Identity no longer address-derived - ReplicaAddr.ServerID field added (stable server identity from registry) - BlockVolumeAssignment.ReplicaServerID field added (scalar RF=2 path) - ControlBridge uses ServerID, NOT address, for ReplicaID - Missing ServerID → replica skipped (fail closed), logged Finding 2: Wired into real ProcessAssignments - BlockService.v2Bridge field initialized in StartBlockService - ProcessAssignments converts each assignment via v2Bridge.ConvertAssignment BEFORE existing V1 processing (parallel, not replacing yet) - Logged at glog V(1) Finding 3: Fail-closed on missing identity - Empty ServerID in ReplicaAddrs → replica skipped with log - Empty ReplicaServerID in scalar path → no replica created - Test: MissingServerID_FailsClosed verifies both paths 7 tests: StableServerID, AddressChange_IdentityPreserved, MultiReplica_StableServerIDs, MissingServerID_FailsClosed, EpochFencing_IntegratedPath, RebuildAssignment, ReplicaAssignment Co-Authored-By: Claude Opus 4.6 (1M context) --- sw-block/.private/phase/phase-04-decisions.md | 105 +- sw-block/.private/phase/phase-04-log.md | 40 +- sw-block/.private/phase/phase-04.md | 81 +- sw-block/.private/phase/phase-05-decisions.md | 94 ++ sw-block/.private/phase/phase-05-log.md | 78 ++ sw-block/.private/phase/phase-05.md | 356 ++++++ sw-block/.private/phase/phase-06-decisions.md | 68 ++ sw-block/.private/phase/phase-06-log.md | 51 + sw-block/.private/phase/phase-06.md | 193 +++ sw-block/.private/phase/phase-07-decisions.md | 119 ++ sw-block/.private/phase/phase-07-log.md | 63 + sw-block/.private/phase/phase-07.md | 220 ++++ sw-block/.private/phase/phase-08-decisions.md | 
78 ++ sw-block/.private/phase/phase-08-log.md | 21 + sw-block/.private/phase/phase-08.md | 254 ++++ .../.private/phase/phase-4.5-decisions.md | 59 + sw-block/.private/phase/phase-4.5-log.md | 33 + sw-block/.private/phase/phase-4.5-reason.md | 397 ++++++ sw-block/.private/phase/phase-4.5.md | 356 ++++++ sw-block/design/README.md | 18 +- sw-block/design/a5-a8-traceability.md | 117 ++ sw-block/design/agent_dev_process.md | 304 +++++ .../design/phase-07-service-slice-plan.md | 403 +++++++ sw-block/design/v2-algorithm-overview.md | 686 +++++++++++ sw-block/design/v2-algorithm-overview.zh.md | 660 ++++++++++ sw-block/design/v2-detailed-algorithm.zh.md | 1068 +++++++++++++++++ sw-block/design/v2-engine-readiness-review.md | 170 +++ sw-block/design/v2-engine-slicing-plan.md | 191 +++ sw-block/design/v2-production-roadmap.md | 199 +++ sw-block/design/v2-protocol-truths.md | 561 +++++++++ sw-block/prototype/distsim/cluster.go | 13 +- sw-block/prototype/distsim/cluster_test.go | 2 +- .../distsim/phase02_candidate_test.go | 6 + .../distsim/phase045_adversarial_test.go | 219 ++++ .../prototype/distsim/phase045_crash_test.go | 334 ++++++ sw-block/prototype/distsim/predicates.go | 160 +++ sw-block/prototype/distsim/simulator.go | 7 +- sw-block/prototype/distsim/storage.go | 242 +++- weed/server/master_block_failover.go | 118 +- weed/server/master_block_registry.go | 56 +- weed/server/master_block_registry_test.go | 60 + weed/server/master_grpc_server.go | 7 +- weed/server/qa_block_edge_cases_test.go | 481 ++++++++ weed/server/volume_grpc_client_to_master.go | 10 + weed/server/volume_server.go | 5 +- weed/server/volume_server_block.go | 32 + weed/server/volume_server_block_debug.go | 77 ++ weed/storage/blockvol/block_heartbeat.go | 1 + weed/storage/blockvol/blockvol.go | 21 + weed/storage/blockvol/shipper_group.go | 11 + .../internal/robust-slow-replica.yaml | 58 +- weed/storage/blockvol/v2bridge/control.go | 77 +- .../storage/blockvol/v2bridge/control_test.go | 220 ++-- 
weed/storage/blockvol/wal_shipper.go | 25 +- weed/storage/store_blockvol.go | 9 + 55 files changed, 9024 insertions(+), 270 deletions(-) create mode 100644 sw-block/.private/phase/phase-05-decisions.md create mode 100644 sw-block/.private/phase/phase-05-log.md create mode 100644 sw-block/.private/phase/phase-05.md create mode 100644 sw-block/.private/phase/phase-06-decisions.md create mode 100644 sw-block/.private/phase/phase-06-log.md create mode 100644 sw-block/.private/phase/phase-06.md create mode 100644 sw-block/.private/phase/phase-07-decisions.md create mode 100644 sw-block/.private/phase/phase-07-log.md create mode 100644 sw-block/.private/phase/phase-07.md create mode 100644 sw-block/.private/phase/phase-08-decisions.md create mode 100644 sw-block/.private/phase/phase-08-log.md create mode 100644 sw-block/.private/phase/phase-08.md create mode 100644 sw-block/.private/phase/phase-4.5-decisions.md create mode 100644 sw-block/.private/phase/phase-4.5-log.md create mode 100644 sw-block/.private/phase/phase-4.5-reason.md create mode 100644 sw-block/.private/phase/phase-4.5.md create mode 100644 sw-block/design/a5-a8-traceability.md create mode 100644 sw-block/design/agent_dev_process.md create mode 100644 sw-block/design/phase-07-service-slice-plan.md create mode 100644 sw-block/design/v2-algorithm-overview.md create mode 100644 sw-block/design/v2-algorithm-overview.zh.md create mode 100644 sw-block/design/v2-detailed-algorithm.zh.md create mode 100644 sw-block/design/v2-engine-readiness-review.md create mode 100644 sw-block/design/v2-engine-slicing-plan.md create mode 100644 sw-block/design/v2-production-roadmap.md create mode 100644 sw-block/design/v2-protocol-truths.md create mode 100644 sw-block/prototype/distsim/phase045_adversarial_test.go create mode 100644 sw-block/prototype/distsim/phase045_crash_test.go create mode 100644 sw-block/prototype/distsim/predicates.go create mode 100644 weed/server/qa_block_edge_cases_test.go create mode 100644 
weed/server/volume_server_block_debug.go diff --git a/sw-block/.private/phase/phase-04-decisions.md b/sw-block/.private/phase/phase-04-decisions.md index 500cbca74..5938aab64 100644 --- a/sw-block/.private/phase/phase-04-decisions.md +++ b/sw-block/.private/phase/phase-04-decisions.md @@ -1,7 +1,7 @@ # Phase 04 Decisions Date: 2026-03-27 -Status: initial +Status: complete ## First Slice Decision @@ -95,3 +95,106 @@ It is: - recovery outcome branching - assignment-intent orchestration - prototype-level end-to-end recovery flow + +## Accepted P2 Refinements + +### Recovery boundary + +Recovery classification must use a lineage-safe boundary, not a raw primary WAL head. + +So: + +- handshake outcome classification uses committed/safe recovery boundary +- stale or divergent extra tail must not be treated as zero-gap by default + +### Stale assignment fencing + +Assignment intent must not create current live sessions from stale epoch input. + +So: + +- stale assignment epoch is rejected +- assignment result distinguishes: + - created + - superseded + - failed + +### Phase discipline on outcome classification + +The outcome API must respect execution entry rules. + +So: + +- handshake-with-outcome requires valid connecting phase before acting + +## P3 Direction + +The next prototype step is: + +- minimal historical-data model +- recoverability proof +- explicit safe-boundary / divergent-tail handling + +## Accepted P3 Refinements + +### Recoverability proof + +The historical-data prototype must prove why catch-up is allowed. + +So: + +- recoverability now checks retained start, end within head, and contiguous coverage +- rebuild fallback is backed by executable unrecoverability + +### Historical state after recycling + +Retained-prefix modeling needs a base state, not only remaining WAL entries. 
+ +So: + +- tail advance captures a base snapshot +- historical state reconstruction uses snapshot + retained WAL + +### Divergent tail handling + +Replica-ahead state must not collapse directly to `InSync`. + +So: + +- divergent tail requires explicit truncation +- completion is gated on recorded truncation when required + +## P4 Direction + +The next prototype step is: + +- prototype scenario closure +- acceptance-criteria to prototype traceability +- explicit expression of the 4 V2-boundary cases against `enginev2` + +## Accepted P4 Refinements + +### Prototype scenario closure + +The prototype must stop being only a set of local mechanisms. + +So: + +- acceptance criteria are mapped to prototype evidence +- key V2-boundary scenarios are expressed directly against `enginev2` +- prototype behavior is reviewable scenario-by-scenario + +### Phase 04 completion decision + +Phase 04 has now met its intended prototype scope: + +- ownership +- execution gating +- outcome branching +- minimal historical-data model +- prototype scenario closure + +So: + +- no broad new Phase 04 work should be added +- next work should move to `Phase 4.5` gate-hardening diff --git a/sw-block/.private/phase/phase-04-log.md b/sw-block/.private/phase/phase-04-log.md index 33d013a23..31b025309 100644 --- a/sw-block/.private/phase/phase-04-log.md +++ b/sw-block/.private/phase/phase-04-log.md @@ -1,7 +1,7 @@ # Phase 04 Log Date: 2026-03-27 -Status: active +Status: complete ## 2026-03-27 @@ -40,7 +40,37 @@ Status: active - attach/supersede now establish ownership only - handshake range validation added - enginev2 tests increased to 46 passing -- Next phase focus narrowed to P2: - - recovery outcome branching - - assignment-intent orchestration - - prototype end-to-end recovery flow +- Phase 04 P2 delivered and accepted: + - outcome branching added: + - `OutcomeZeroGap` + - `OutcomeCatchUp` + - `OutcomeNeedsRebuild` + - assignment-intent orchestration added + - stale assignment epoch now rejected 
+ - assignment result now distinguishes created / superseded / failed + - end-to-end prototype recovery tests added + - zero-gap classification tightened: + - exact equality to committed boundary only + - replica-ahead is not zero-gap + - enginev2 tests increased to 63 passing +- Phase 04 P3 delivered and accepted: + - `WALHistory` added as minimal historical-data model + - recoverability proof strengthened: + - retained start + - end within head + - contiguous coverage + - base snapshot added for correct `StateAt()` after tail advance + - divergent-tail truncation made explicit in sender/session execution + - WAL-backed prototype recovery tests added + - enginev2 tests increased to 83 passing +- Phase 04 P4 delivered and accepted: + - acceptance criteria mapped to prototype evidence + - V2-boundary scenarios expressed against `enginev2` + - prototype scenario closure achieved + - enginev2 tests increased to 95 passing +- Phase 04 is now complete for its intended prototype scope. +- Next recommended phase: + - `Phase 4.5` + - tighten bounded `CatchUp` + - formalize `Rebuild` + - strengthen crash-consistency / recoverability / liveness proof diff --git a/sw-block/.private/phase/phase-04.md b/sw-block/.private/phase/phase-04.md index 407d1f79a..a73e90b76 100644 --- a/sw-block/.private/phase/phase-04.md +++ b/sw-block/.private/phase/phase-04.md @@ -1,7 +1,7 @@ # Phase 04 Date: 2026-03-27 -Status: active +Status: complete Purpose: start the first standalone V2 implementation slice under `sw-block/`, centered on per-replica sender ownership and explicit recovery-session ownership ## Goal @@ -93,6 +93,7 @@ Delivered in this phase so far: - execution APIs implemented: - `BeginConnect` - `RecordHandshake` + - `RecordHandshakeWithOutcome` - `BeginCatchUp` - `RecordCatchUpProgress` - `CompleteSessionByID` @@ -101,15 +102,42 @@ Delivered in this phase so far: - zero-gap handshake fast path allowed - attach/supersede now establish ownership only - sender-group orchestration 
tests added +- recovery outcome branching implemented: + - `OutcomeZeroGap` + - `OutcomeCatchUp` + - `OutcomeNeedsRebuild` +- assignment-intent orchestration implemented: + - reconcile + recovery target session creation + - stale assignment epoch rejected + - created/superseded/failed outcomes distinguished +- P2 data-boundary correction accepted: + - zero-gap now requires exact equality to committed boundary + - replica-ahead is not zero-gap +- minimal historical-data prototype implemented: + - `WALHistory` + - retained-prefix / recycled-range semantics + - executable recoverability proof + - base snapshot for historical state after tail advance +- explicit safe-boundary handling implemented: + - divergent tail requires truncation before `InSync` + - truncation recorded via sender-owned execution API +- WAL-backed prototype tests added: + - catch-up recovery with data verification + - rebuild fallback with proof of unrecoverability + - truncate-then-`InSync` with committed-boundary verification - current `enginev2` test state at latest review: - - 46 tests passing - -Next focus for `sw`: - -- continue Phase 04 beyond execution gating: - - recovery outcome branching - - sender-group orchestration from assignment intent - - prototype-level end-to-end recovery flow +- - 95 tests passing +- prototype scenario closure completed: + - acceptance criteria mapped to prototype evidence + - V2-boundary scenarios expressed against `enginev2` + - small end-to-end prototype harness added + +Next phase: + +- `Phase 4.5` + - bounded `CatchUp` + - first-class `Rebuild` + - crash-consistency / recoverability / liveness proof hardening - do not integrate into V1 production tree yet ### P1 @@ -141,6 +169,39 @@ Next focus for `sw`: - completion / invalidation - rebuild escalation +### P3 + +10. add minimal historical-data prototype +- retained prefix/window +- minimal recoverability state +- explicit "why catch-up is allowed" proof + +11. 
make safe-boundary data handling explicit +- divergent tail cleanup / truncate rule +- or equivalent explicit boundary handling before `InSync` + +12. strengthen recoverability/rebuild tests +- executable proof of: +- recoverable gap +- unrecoverable gap +- rebuild fallback boundary + +### P4 + +13. close prototype scenario coverage +- map key acceptance criteria onto `enginev2` scenarios/tests +- make prototype evidence reviewable scenario-by-scenario + +14. express the 4 V2-boundary cases against the prototype +- changed-address identity-preserving recovery +- `NeedsRebuild` persistence +- catch-up without overwriting safe data +- repeated disconnect/reconnect cycles + +15. add one small prototype harness if needed +- enough to show assignment -> recovery -> outcome flow end-to-end +- no product/backend integration yet + ## Exit Criteria Phase 04 is done when: @@ -151,3 +212,5 @@ Phase 04 is done when: 4. endpoint update and epoch invalidation are tested 5. sender-owned execution flow is validated 6. recovery outcome branching exists at prototype level +7. minimal historical-data / recoverability model exists at prototype level +8. prototype scenario closure is achieved for key V2 acceptance cases diff --git a/sw-block/.private/phase/phase-05-decisions.md b/sw-block/.private/phase/phase-05-decisions.md new file mode 100644 index 000000000..fab98d6ce --- /dev/null +++ b/sw-block/.private/phase/phase-05-decisions.md @@ -0,0 +1,94 @@ +# Phase 05 Decisions + +## Decision 1: Real V2 engine work lives under `sw-block/engine/replication/` + +The first real engine slice is established under: + +- `sw-block/engine/replication/` + +This keeps V2 separate from: + +- `sw-block/prototype/` +- `weed/storage/blockvol/` + +## Decision 2: Slice 1 is accepted + +Accepted scope: + +1. stable per-replica sender identity +2. stable recovery-session identity +3. stale authority fencing +4. endpoint / epoch invalidation +5. 
ownership registry + +## Decision 3: Stable identity must not be address-shaped + +The engine registry is now keyed by stable `ReplicaID`, not mutable endpoint address. + +This is a required structural break from the V1/V1.5 identity-loss pattern. + +## Decision 4: Slice 2 is accepted + +Accepted scope: + +1. connect / handshake / catch-up flow +2. zero-gap / catch-up / needs-rebuild branching +3. stale execution rejection during active recovery +4. bounded catch-up semantics in engine path +5. rebuild execution shell + +## Decision 5: Slice 3 owns real recoverability inputs + +Slice 3 should be the point where: + +1. recoverable vs unrecoverable gap uses real engine inputs +2. trusted-base / rebuild-source decision uses real engine data inputs +3. truncation / safe-boundary handling is tied to real engine state +4. historical correctness at recovery target is validated from engine inputs + +## Decision 6: Slice 3 is accepted + +Accepted scope: + +1. real engine recoverability input path +2. trusted-base / rebuild-source decision from engine data inputs +3. truncation / safe-boundary handling tied to engine state +4. recoverability gating without overclaiming full historical reconstruction in engine + +## Decision 7: Slice 3 should replace carried-forward heuristics where appropriate + +In particular: + +1. simple rebuild-source heuristics carried from prototype should not become permanent engine policy +2. Slice 3 should tighten these decisions against real engine recoverability inputs + +## Decision 8: Slice 4 is the engine integration closure slice + +Next focus: + +1. real assignment/control intent entry path +2. engine observability / debug surface +3. focused integration tests for V2-boundary cases +4. validation against selected real failure classes from `learn/projects/sw-block/` and `weed/storage/block*` + +## Decision 9: Slice 4 is accepted + +Accepted scope: + +1. real orchestrator entry path +2. assignment/update-driven recovery through that path +3. 
engine observability / causal recovery logging +4. diagnosable V2-boundary integration tests + +## Decision 10: Phase 05 is complete + +Reason: + +1. ownership core is accepted +2. recovery execution core is accepted +3. data / recoverability core is accepted +4. integration closure is accepted + +Next: + +- `Phase 06` broader engine implementation stage diff --git a/sw-block/.private/phase/phase-05-log.md b/sw-block/.private/phase/phase-05-log.md new file mode 100644 index 000000000..6ff441e2c --- /dev/null +++ b/sw-block/.private/phase/phase-05-log.md @@ -0,0 +1,78 @@ +# Phase 05 Log + +## 2026-03-29 + +### Opened + +`Phase 05` opened as: + +- V2 engine planning + Slice 1 ownership core + +### Accepted + +1. engine module location + - `sw-block/engine/replication/` + +2. Slice 1 ownership core + - stable per-replica sender identity + - stable recovery-session identity + - sender/session fencing + - endpoint / epoch invalidation + - ownership registry + +3. Slice 1 identity correction + - registry now keyed by stable `ReplicaID` + - mutable `Endpoint` separated from identity + - real changed-`DataAddr` preservation covered by test + +4. Slice 1 encapsulation + - mutable sender/session authority state no longer exposed directly + - snapshot/read-only inspection path in place + +5. Slice 2 recovery execution core + - connect / handshake / catch-up flow + - explicit zero-gap / catch-up / needs-rebuild branching + - stale execution rejection during active recovery + - bounded catch-up semantics + - rebuild execution shell + +6. Slice 2 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests = `30` total + - Slice 2 accepted for progression to Slice 3 planning + +7. 
Slice 3 data / recoverability core + - `RetainedHistory` introduced as engine-level recoverability input + - history-driven sender APIs added for handshake and rebuild-source selection + - trusted-base decision now requires both checkpoint trust and replayable tail + - truncation remains a completion gate / protocol boundary + +8. Slice 3 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests + `18` recoverability tests = `48` total + - accepted boundary: + - engine proves historical-correctness prerequisites + - simulator retains stronger historical reconstruction proof + - Slice 3 accepted for progression to Slice 4 planning + +9. Slice 4 integration closure + - `RecoveryOrchestrator` added as integrated engine entry path + - assignment/update-driven recovery is exercised through orchestrator + - observability surface added: + - `RegistryStatus` + - `SenderStatus` + - `SessionSnapshot` + - `RecoveryLog` + - causal recovery logging now covers invalidation, escalation, truncation, completion, rebuild transitions + +10. Slice 4 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests + `18` recoverability tests + `11` integration tests = `59` total + - Slice 4 accepted + - `Phase 05` accepted as complete + +### Next + +1. `Phase 06` planning +2. broader engine implementation stage +3. real-engine integration against selected `weed/storage/block*` constraints and failure classes diff --git a/sw-block/.private/phase/phase-05.md b/sw-block/.private/phase/phase-05.md new file mode 100644 index 000000000..ec21837f8 --- /dev/null +++ b/sw-block/.private/phase/phase-05.md @@ -0,0 +1,356 @@ +# Phase 05 + +Date: 2026-03-29 +Status: complete +Purpose: begin the real V2 engine track under `sw-block/` by moving from prototype proof to the first engine slice + +## Why This Phase Exists + +The project has now completed: + +1. V2 design/FSM closure +2. V2 protocol/simulator validation +3. 
Phase 04 prototype closure +4. Phase 4.5 evidence hardening + +So the next step is no longer: + +- extend prototype breadth + +The next step is: + +- start disciplined real V2 engine work + +## Phase Goal + +Start the real V2 engine line under `sw-block/` with: + +1. explicit engine module location +2. Slice 1 ownership-core boundaries +3. first engine ownership-core implementation +4. engine-side validation tied back to accepted prototype invariants + +## Relationship To Previous Phases + +`Phase 05` is built on: + +- `sw-block/design/v2-engine-readiness-review.md` +- `sw-block/design/v2-engine-slicing-plan.md` +- `sw-block/.private/phase/phase-04.md` +- `sw-block/.private/phase/phase-4.5.md` + +This is a new implementation phase. + +It is not: + +1. more prototype expansion +2. V1 integration +3. backend redesign + +## Scope + +### In scope + +1. choose real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write short engine ownership-core spec +4. start Slice 1 implementation: + - stable per-replica sender object + - stable recovery-session object + - session identity fencing + - endpoint / epoch invalidation + - ownership registry / sender-group equivalent +5. add focused engine-side ownership/fencing tests + +### Out of scope + +1. Smart WAL expansion +2. full storage/backend redesign +3. full rebuild-source decision logic +4. V1 production integration +5. performance work +6. full product integration + +## Planned Slices + +### P0: Engine Planning Setup + +1. choose real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write ownership-core spec +4. map 3-5 acceptance scenarios to Slice 1 expectations + +Status: + +- accepted +- engine module location chosen: `sw-block/engine/replication/` +- Slice 1 boundaries are explicit enough to start implementation + +### P1: Slice 1 Ownership Core + +1. implement stable per-replica sender object +2. 
implement stable recovery-session object +3. implement sender/session identity fencing +4. implement endpoint / epoch invalidation +5. implement ownership registry + +Status: + +- accepted +- stable `ReplicaID` is now explicit and separate from mutable `Endpoint` +- engine registry is keyed by stable identity, not address-shaped strings +- real changed-`DataAddr` preservation is covered by test + +### P2: Slice 1 Validation + +1. engine-side tests for ownership/fencing +2. changed-address case +3. stale-session rejection case +4. epoch-bump invalidation case +5. traceability back to accepted prototype behavior + +Status: + +- accepted +- Slice 1 ownership/fencing tests are in place and passing +- acceptance/gate mapping is strong enough to move to Slice 2 + +### P3: Slice 2 Planning Setup + +1. define Slice 2 boundaries explicitly +2. distinguish Slice 2 core from carried-forward prototype support +3. map Slice 2 engine expectations from accepted prototype evidence +4. prepare Slice 2 validation targets + +Status: + +- accepted +- Slice 2 recovery execution core is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `30` total + +### P4: Slice 3 Planning Setup + +1. define Slice 3 boundaries explicitly +2. connect recovery decisions to real engine recoverability inputs +3. make trusted-base / rebuild-source decision use real engine data inputs +4. 
prepare Slice 3 validation targets + +Status: + +- accepted +- Slice 3 data / recoverability core is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `18` recoverability tests + - `48` total +- important boundary preserved: + - engine proves historical-correctness prerequisites + - full historical reconstruction proof remains simulator-side + +## Slice 3 Guardrails + +Slice 3 is the point where V2 must move from: + +- recovery automaton is coherent + +to: + +- recovery basis is provable + +So Slice 3 must stay tight. + +### Guardrail 1: No optimistic watermark in place of recoverability proof + +Do not accept: + +- loose head/tail watermarks +- "looks retained enough" +- heuristic recoverability + +Slice 3 should prove: + +1. why a gap is recoverable +2. why a gap is unrecoverable + +### Guardrail 2: No current extent state pretending to be historical correctness + +Do not accept: + +- current extent image as substitute for target-LSN truth +- checkpoint/base state that leaks newer state into older historical queries + +Slice 3 should prove historical correctness at the actual recovery target. + +### Guardrail 3: No `snapshot + tail` without trusted-base proof + +Do not accept: + +- "snapshot exists" as sufficient + +Require: + +1. trusted base exists +2. trusted base covers the required base state +3. retained tail can be replayed continuously from that base to the target + +If not, recovery must use: + +- `FullBase` + +### Guardrail 4: Truncation is protocol boundary, not cleanup policy + +Do not treat truncation as: + +- optional cleanup +- post-recovery tidying + +Treat truncation as: + +1. divergent tail removal +2. explicit safe-boundary restoration +3. prerequisite for safe `InSync` / recovery completion where applicable + +### P5: Slice 4 Planning Setup + +1. define Slice 4 boundaries explicitly +2. connect engine control/recovery core to real assignment/control intent entry path +3. 
add engine observability / debug surface for ownership and recovery failures +4. prepare integration validation against V2-boundary failure classes + +Status: + +- accepted +- Slice 4 integration closure is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `18` recoverability tests + - `11` integration tests + - `59` total + +## Slice 4 Guardrails + +Slice 4 should close integration, not just add an entry point and some logs. + +### Guardrail 1: Entry path must actually drive recovery + +Do not accept: + +- tests that manually push sender/session state while only pretending to use integration entry points + +Require: + +1. real assignment/control intent entry path +2. session creation / invalidation / restart triggered through that path +3. recovery flow driven from that path, not only from unit-level helper calls + +### Guardrail 2: Changed-address must survive the real entry path + +Do not accept: + +- changed-address correctness proven only at local object level + +Require: + +1. stable `ReplicaID` survives real assignment/update entry path +2. endpoint update invalidates old session correctly +3. new recovery session is created correctly on updated endpoint + +### Guardrail 3: Observability must show protocol causality + +Do not accept: + +- only state snapshots +- only phase dumps + +Require observability that can explain: + +1. why recovery entered `NeedsRebuild` +2. why a session was superseded +3. why a completion or progress update was rejected +4. why endpoint / epoch change caused invalidation + +### Guardrail 4: Failure replay must be explainable + +Do not accept: + +- a replay that reproduces failure but cannot explain the cause from engine observability + +Require: + +1. selected failure-class replays through the real entry path +2. observability sufficient to explain the control/recovery decision +3. 
reviewability against key V2-boundary failures + +## Exit Criteria + +Phase 05 Slice 1 is done when: + +1. the real V2 engine module location is chosen +2. Slice 1 boundaries are explicit +3. engine ownership core exists under `sw-block/` +4. engine-side ownership/fencing tests pass +5. Slice 1 evidence is reviewable against prototype expectations + +This bar is now met. + +Phase 05 Slice 2 is done when: + +1. engine-side recovery execution flow exists +2. zero-gap / catch-up / needs-rebuild branching is explicit +3. stale execution is rejected during active recovery +4. bounded catch-up semantics are enforced in engine path +5. rebuild execution shell is validated + +This bar is now met. + +Phase 05 Slice 3 is done when: + +1. recoverable vs unrecoverable gap uses real engine recoverability inputs +2. trusted-base / rebuild-source decision uses real engine data inputs +3. truncation / safe-boundary handling is tied to real engine state +4. history-driven engine APIs exist for recovery decisions +5. Slice 3 validation is reviewable without overclaiming full historical reconstruction + +This bar is now met. + +Phase 05 Slice 4 is done when: + +1. real assignment/control intent entry path exists +2. changed-address recovery works through the real entry path +3. observability explains protocol causality, not only state snapshots +4. selected V2-boundary failures are replayable and diagnosable through engine integration tests + +This bar is now met. + +## Assignment For `sw` + +Phase 05 is now complete. + +Next phase: + +- `Phase 06` broader engine implementation stage + +## Assignment For `tester` + +Phase 05 validation is complete. + +Next phase: + +- `Phase 06` engine implementation validation against real-engine constraints and failure classes + +## Management Rule + +`Phase 05` should stay narrow. + +It should start the engine line with: + +1. ownership +2. fencing +3. validation + +It should not try to absorb later slices early. 
diff --git a/sw-block/.private/phase/phase-06-decisions.md b/sw-block/.private/phase/phase-06-decisions.md new file mode 100644 index 000000000..31669178a --- /dev/null +++ b/sw-block/.private/phase/phase-06-decisions.md @@ -0,0 +1,68 @@ +# Phase 06 Decisions + +## Decision 1: Phase 06 is broader engine implementation, not new design + +The protocol shape and engine core contracts were already accepted. + +Phase 06 implemented around them. + +## Decision 2: Phase 06 must connect to real constraints + +This phase explicitly used: + +1. `learn/projects/sw-block/` for failure gates and test lineage +2. `weed/storage/block*` for real implementation constraints + +without importing V1 structure as the V2 design template. + +## Decision 3: Phase 06 should replace key synchronous conveniences + +The accepted Slice 4 convenience flows were sufficient for closure work, but broader engine work required real step boundaries. + +This is now satisfied via planner/executor separation. + +## Decision 4: Phase 06 ends with a runnable engine stage decision + +Result: + +- yes, the project now has a broader runnable engine stage that is ready to proceed to real-system integration / product-path work + +## Decision 5: Phase 06 P0 is accepted + +Accepted scope: + +1. adapter/module boundaries +2. convenience-flow classification +3. initial real-engine stage framing + +## Decision 6: Phase 06 P1 is accepted + +Accepted scope: + +1. storage/control adapter interfaces +2. `RecoveryDriver` planner/resource-acquisition layer +3. full-base and WAL retention resource contracts +4. fail-closed preconditions on planning paths + +## Decision 7: Phase 06 P2 is accepted + +Accepted scope: + +1. explicit planner/executor split on top of `RecoveryPlan` +2. executor-owned cleanup symmetry on success/failure/cancellation +3. plan-bound rebuild execution with no policy re-derivation at execute time +4. 
synchronous orchestrator completion helpers remain test-only convenience + +## Decision 8: Phase 06 P3 is accepted + +Accepted scope: + +1. selected real failure classes validated through the engine path +2. cross-layer engine/storage proof validation +3. diagnosable failure when proof or resource acquisition cannot be established + +## Decision 9: Phase 06 is complete + +Next step: + +- `Phase 07` real-system integration / product-path decision diff --git a/sw-block/.private/phase/phase-06-log.md b/sw-block/.private/phase/phase-06-log.md new file mode 100644 index 000000000..f483d4051 --- /dev/null +++ b/sw-block/.private/phase/phase-06-log.md @@ -0,0 +1,51 @@ +# Phase 06 Log + +## 2026-03-30 + +### Opened + +`Phase 06` opened as: + +- broader engine implementation stage + +### Starting basis + +1. `Phase 05`: complete +2. engine core and integration closure accepted +3. next work moves from slice proof to broader runnable engine stage + +### Accepted + +1. Phase 06 P0 + - adapter/module boundaries defined + - convenience flows explicitly classified + +2. Phase 06 P1 + - storage/control adapter surfaces defined + - `RecoveryDriver` added as planner/resource-acquisition layer + - full-base rebuild now has explicit resource contract + - WAL pin contract tied to actual recovery need + - driver preconditions fail closed + +3. Phase 06 P2 + - explicit planner/executor split accepted + - executor owns release symmetry on success, failure, and cancellation + - rebuild execution now consumes plan-bound source/target values + - tester final validation accepted with reduced-but-sufficient rebuild failure-path coverage + +4. 
Phase 06 P3 + - selected real failure classes validated through the engine path + - changed-address restart now uses plan cancellation and re-plan flow + - stale execution is caught through the executor-managed loop + - cross-layer trusted-base / replayable-tail proof path validated end-to-end + - rebuild planning failures now clean up sessions and remain diagnosable + +### Closed + +`Phase 06` closed as complete. + +### Next + +1. Phase 07 real-system integration / product-path decision +2. service-slice integration against real control/storage surroundings +3. first product-path gating decision diff --git a/sw-block/.private/phase/phase-06.md b/sw-block/.private/phase/phase-06.md new file mode 100644 index 000000000..d54988066 --- /dev/null +++ b/sw-block/.private/phase/phase-06.md @@ -0,0 +1,193 @@ +# Phase 06 + +Date: 2026-03-30 +Status: complete +Purpose: move from validated engine slices to the first broader runnable V2 engine stage + +## Why This Phase Exists + +`Phase 05` established and validated: + +1. ownership core +2. recovery execution core +3. recoverability/data gating core +4. integration closure + +What still does not exist is a broader engine stage that can run with: + +1. real control-plane inputs +2. real persistence/backing inputs +3. non-trivial execution loops instead of only synchronous convenience paths + +So `Phase 06` exists to turn the accepted engine shape into the first broader runnable engine stage. + +Phase 06 must connect the accepted engine core to real control and real storage truth, not just wrap current abstractions with adapters. + +## Phase Goal + +Build the first broader V2 engine stage without reopening protocol shape. + +This phase should focus on: + +1. real engine adapters around the accepted core +2. asynchronous or stepwise execution paths where Slice 4 used synchronous helpers +3. real retained-history / checkpoint input plumbing +4. 
validation against selected real failure classes and real implementation constraints + +## Overall Roadmap + +Completed: + +1. Phase 01-03: design + simulator +2. Phase 04: prototype closure +3. Phase 4.5: evidence hardening +4. Phase 05: engine slice closure +5. Phase 06: broader engine implementation stage + +Next: + +1. Phase 07: real-system integration / product-path decision + +This roadmap should stay strict: + +- no return to broad prototype expansion +- no uncontrolled engine sprawl + +## Scope + +### In scope + +1. control-plane adapter into `sw-block/engine/replication/` +2. retained-history / checkpoint adapter into engine recoverability APIs +3. replacement of synchronous convenience flows with explicit engine steps where needed +4. engine error taxonomy and observability tightening +5. validation against selected real failure classes from: + - `learn/projects/sw-block/` + - `weed/storage/block*` + +### Out of scope + +1. Smart WAL expansion +2. full backend redesign +3. performance optimization as primary goal +4. V1 replacement rollout +5. 
full product integration + +## Phase 06 Items + +### P0: Engine Stage Plan + +Status: + +- accepted +- module boundaries now explicit: + - `adapter.go` + - `driver.go` + - `orchestrator.go` classification +- convenience flows are now classified as: + - test-only convenience wrapper + - stepwise engine task + - planner/executor split + +### P1: Control / History Adapters + +Status: + +- accepted +- `StorageAdapter` boundary exists and is exercised by tests +- full-base rebuild now has a real pin/release contract +- WAL pinning is tied to actual recovery contract, not loose watermark use +- planner fails closed on missing sender / missing session / wrong session kind + +### P2: Execution Driver + +Status: + +- accepted +- executor now owns resource lifecycle on success / failure / cancellation +- catch-up execution is stepwise and budget-checked per progress step +- rebuild execution consumes plan-bound source/target values and does not re-derive policy at execute time +- `CompleteCatchUp` / `CompleteRebuild` remain test-only convenience wrappers +- tester validation accepted with reduced-but-sufficient rebuild failure-path coverage + +### P3: Validation Against Real Failure Classes + +Status: + +- accepted +- changed-address restart now validated through planner/executor path with plan cancellation +- stale epoch/session during active execution now validated through the executor-managed loop +- cross-layer trusted-base / replayable-tail proof path validated end-to-end +- rebuild fallback and pin-failure cleanup now fail closed and are diagnosable + +## Guardrails + +### Guardrail 1: Do not reopen protocol shape + +Phase 06 implemented around accepted engine slices and did not reopen: + +1. sender/session authority model +2. bounded catch-up contract +3. recoverability/truncation boundary + +### Guardrail 2: Do not let adapters smuggle V1 structure back in + +V1 code and docs remain: + +1. constraints +2. failure gates +3. 
integration references + +not the V2 architecture template. + +### Guardrail 3: Prefer explicit engine steps over synchronous convenience + +Key convenience helpers remain test-only. Real engine work now has explicit planner/executor boundaries. + +### Guardrail 4: Keep evidence quality high + +Phase 06 improved: + +1. cross-layer traceability +2. diagnosability +3. real-failure validation + +without growing protocol surface. + +### Guardrail 5: Do not fake storage truth with metadata-only adapters + +Phase 06 now requires: + +1. trusted base to come from storage-side truth +2. replayable tail to be grounded in retention state +3. observable rejection when those proofs cannot be established + +## Exit Criteria + +Phase 06 is done when: + +1. engine has real control/history adapters into the accepted core +2. engine has real storage/base adapters into the accepted core +3. key synchronous convenience paths are explicitly classified or replaced by real engine steps where necessary +4. selected real failure classes are validated against the engine stage +5. at least one cross-layer storage/engine proof path is validated end-to-end +6. engine observability remains good enough to explain recovery causality + +Status: + +- met + +## Closeout + +`Phase 06` is complete. + +It established: + +1. a broader runnable engine stage around the accepted Phase 05 core +2. real planner/executor/resource contracts +3. validated failure-class behavior through the engine path +4. 
diagnosable proof rejection and cleanup behavior + +Next step: + +- `Phase 07` real-system integration / product-path decision diff --git a/sw-block/.private/phase/phase-07-decisions.md b/sw-block/.private/phase/phase-07-decisions.md new file mode 100644 index 000000000..a0440ca9c --- /dev/null +++ b/sw-block/.private/phase/phase-07-decisions.md @@ -0,0 +1,119 @@ +# Phase 07 Decisions + +## Decision 1: Phase 07 is real-system integration, not protocol redesign + +The V2 protocol shape, engine core, and broader runnable engine stage are already accepted. + +Phase 07 should integrate them into a real-system service slice. + +## Decision 2: Phase 07 should make the first product-path decision + +This phase should not only integrate a service slice. + +It should also decide: + +1. what the first product path is +2. what remains before pre-production hardening + +## Decision 3: Phase 07 must preserve accepted V2 boundaries + +Phase 07 should preserve: + +1. narrow catch-up semantics +2. rebuild as the formal recovery path +3. trusted-base / replayable-tail proof boundaries +4. stable identity / fenced execution / diagnosable failure handling + +## Decision 4: Phase 07 P0 service-slice direction is set + +Current direction: + +1. first service slice = `RF=2` block volume primary + one replica +2. engine remains in `sw-block/engine/replication/` +3. current bridge work starts in `sw-block/bridge/blockvol/` +4. deferred real blockvol-side bridge target = `weed/storage/blockvol/v2bridge/` +5. stable identity mapping is explicit: + - `ReplicaID = <ServerID>/<VolumeID>` +6. `blockvol` executes I/O but does not own recovery policy + +## Decision 5: Phase 07 P1 is accepted with explicit scope limits + +Accepted `P1` coverage is: + +1. real reader mapping from `BlockVol` state +2. real retention hold / release wiring into the flusher retention floor +3. one real WAL catch-up scan path through `v2bridge` +4. 
direct real-adapter tests under `weed/storage/blockvol/v2bridge/` + +This acceptance means: + +1. the real bridge path is now integrated and evidenced +2. `P1` is not yet acceptance proof of general post-checkpoint catch-up viability + +Not accepted as part of `P1`: + +1. snapshot transfer execution +2. full-base transfer execution +3. WAL truncation execution +4. master-side confirmed failover / control-intent integration + +## Decision 6: Interim committed-truth limitation remains active + +`Phase 07 P1` is accepted with an explicit carry-forward limitation: + +1. interim `CommittedLSN = CheckpointLSN` is a service-slice mapping, not final V2 protocol truth +2. post-checkpoint catch-up semantics are therefore narrower than final V2 intent +3. later `Phase 07` work must not overclaim this limitation as solved until commit truth is separated from checkpoint truth + +## Decision 7: Phase 07 P2 is accepted with scoped replay claims + +Accepted `P2` coverage is: + +1. real service-path replay for changed-address restart +2. stale epoch / stale session invalidation through the integrated path +3. unrecoverable-gap / needs-rebuild replay with diagnosable proof +4. explicit replay of the post-checkpoint boundary under the interim model + +Not accepted as part of `P2`: + +1. general integrated engine-driven post-checkpoint catch-up semantics +2. real control-plane delivery from master heartbeat into the bridge +3. rebuild execution beyond the already-deferred executor stubs + +## Decision 8: Phase 07 now moves to product-path choice, not more bridge-shape proof + +With `P0`, `P1`, and `P2` accepted, the next step is: + +1. choose the first product path from accepted service-slice evidence +2. define what remains before pre-production hardening +3. 
keep unresolved limits explicit rather than hiding them behind broader claims + +## Decision 9: Phase 07 P2 must replay the interim limitation explicitly + +`Phase 07 P2` should not only replay happy-path or ordinary failure-path integration. + +It should also include one explicit replay where: + +1. the live bridge path is exercised after checkpoint truth has advanced +2. the observed catch-up limitation is diagnosed as a consequence of the interim mapping +3. the result is not overclaimed as proof of final V2 post-checkpoint catch-up semantics + +## Decision 10: Phase 07 P3 is accepted and Phase 07 is complete + +The first V2 product path is now explicitly chosen as: + +1. `RF=2` +2. `sync_all` +3. existing master / volume-server heartbeat path +4. V2 engine owns recovery policy +5. `v2bridge` provides real storage truth + +This decision is accepted with explicit non-claims: + +1. not production-ready +2. no real master-side control delivery proof yet +3. no full rebuild execution proof yet +4. no general post-checkpoint catch-up proof yet +5. no full integrated engine -> executor -> `v2bridge` catch-up proof yet + +Phase 07 is therefore complete, and the next phase is pre-production hardening. diff --git a/sw-block/.private/phase/phase-07-log.md b/sw-block/.private/phase/phase-07-log.md new file mode 100644 index 000000000..a59830579 --- /dev/null +++ b/sw-block/.private/phase/phase-07-log.md @@ -0,0 +1,63 @@ +# Phase 07 Log + +## 2026-03-30 + +### Opened + +`Phase 07` opened as: + +- real-system integration / product-path decision + +### Starting basis + +1. `Phase 06`: complete +2. broader runnable engine stage accepted +3. next work moves from engine-stage validation to real-system service-slice integration + +### Delivered + +1. 
Phase 07 P0 + - service-slice plan defined + - implementation slice proposal delivered + - bridge layer introduced as: + - `sw-block/bridge/blockvol/` for current bridge work + - `weed/storage/blockvol/v2bridge/` as the deferred real integration target + - stable identity mapping made explicit: + - `ReplicaID = /` + - engine / blockvol policy boundary made explicit + - initial bridge tests delivered (`8`) +2. Phase 07 P1 + - real blockvol reader integrated via `weed/storage/blockvol/v2bridge/reader.go` + - real pinner integrated via `weed/storage/blockvol/v2bridge/pinner.go` + - one real catch-up executor path integrated via `weed/storage/blockvol/v2bridge/executor.go` + - direct real-adapter tests delivered in: + - `weed/storage/blockvol/v2bridge/bridge_test.go` + - accepted with explicit carry-forward: + - interim `CommittedLSN = CheckpointLSN` limits post-checkpoint catch-up semantics and is not final V2 commit truth + - acceptance is for the real integrated bridge path, not for general post-checkpoint catch-up viability +3. Phase 07 P2 + - real service-path failure replay accepted + - accepted replay set includes: + - changed-address restart + - stale epoch / stale session invalidation + - unrecoverable-gap / needs-rebuild replay + - explicit post-checkpoint boundary replay + - evidence kept explicitly scoped: + - real `v2bridge` WAL-scan execution proven + - general integrated post-checkpoint catch-up semantics not overclaimed under the interim model +4. Phase 07 P3 + - product-path decision accepted + - first product path chosen as: + - `RF=2` + - `sync_all` + - existing master / volume-server heartbeat path + - V2 engine recovery ownership with `v2bridge` real storage truth + - pre-hardening prerequisites made explicit + - intentional deferrals and non-claims recorded + - `Phase 07` completed + +### Next + +1. Phase 08 pre-production hardening +2. real master/control delivery integration +3. 
integrated catch-up / rebuild execution closure diff --git a/sw-block/.private/phase/phase-07.md b/sw-block/.private/phase/phase-07.md new file mode 100644 index 000000000..e14cd3fd5 --- /dev/null +++ b/sw-block/.private/phase/phase-07.md @@ -0,0 +1,220 @@ +# Phase 07 + +Date: 2026-03-30 +Status: complete +Purpose: connect the broader runnable V2 engine stage to a real-system service slice and decide the first product path + +## Why This Phase Exists + +`Phase 06` completed the broader runnable engine stage: + +1. planner/executor/resource contracts are real +2. selected real failure classes are validated through the engine path +3. cross-layer trusted-base / replayable-tail proof path is validated + +What still does not exist is a real-system slice where the engine runs inside actual service boundaries with real control/storage surroundings. + +So `Phase 07` exists to answer: + +1. how the engine runs as a real subsystem +2. what the first product path should be +3. what integration risks remain before pre-production hardening + +## Phase Goal + +Establish a real-system integration slice for the V2 engine and make the first product-path decision without reopening protocol shape. + +## Scope + +### In scope + +1. service-slice integration around `sw-block/engine/replication/` +2. real control-plane / lifecycle entry path into the engine +3. real storage-side adapter hookup into existing system boundaries +4. selected real-system failure replay and diagnosis +5. explicit product-path decision framing + +### Out of scope + +1. broad performance optimization +2. Smart WAL expansion +3. full V1 replacement rollout +4. broad backend redesign +5. production rollout itself + +## Phase 07 Items + +### P0: Service-Slice Plan + +1. define the first real-system service slice that will host the engine +2. define adapter/module boundaries at the service boundary +3. choose the concrete integration path to exercise first +4. 
identify which current adapters are still mock/test-only and must be replaced first +5. make the first-slice identity/epoch mapping explicit +6. treat `blockvol` as execution backend only, not recovery-policy owner + +Status: + +- delivered +- planning artifact: + - `sw-block/design/phase-07-service-slice-plan.md` +- implementation slice proposal: + - engine core: `sw-block/engine/replication/` + - bridge adapters: `sw-block/bridge/blockvol/` + - real blockvol integration target: `weed/storage/blockvol/v2bridge/` (`P1`) +- adapter replacement order: + - `control_adapter.go` (`P0`) done + - `storage_adapter.go` (`P0`) done + - `executor_bridge.go` (`P1`) deferred + - `observe_adapter.go` (`P1`) deferred +- first-slice identity mapping is explicit: + - `ReplicaID = /` + - not derived from any address field +- engine / blockvol boundary is explicit: + - bridge maps intent and state + - `blockvol` executes I/O + - `blockvol` does not own recovery policy +- service-slice validation gaps called out for `P1`: + - real blockvol field mapping + - real pin/release lifecycle against reclaim/GC + - assignment timing vs engine session lifecycle + - executor bridge into real WAL/snapshot work + +### P1: Real Entry-Path Integration + +1. connect real control/lifecycle events into the engine entry path +2. connect real storage/base/recoverability signals into the engine adapters +3. 
preserve accepted engine authority/execution/recoverability contracts + +Status: + +- accepted +- real integration now established for: + - reader via `weed/storage/blockvol/v2bridge/reader.go` + - pinner via `weed/storage/blockvol/v2bridge/pinner.go` + - catch-up executor path via `weed/storage/blockvol/v2bridge/executor.go` +- direct real-adapter tests now exist in: + - `weed/storage/blockvol/v2bridge/bridge_test.go` +- accepted scope is explicit: + - real reader + - real retention hold / release + - real WAL catch-up scan path + - direct real bridge evidence for the integrated path +- still deferred: + - `TransferSnapshot` + - `TransferFullBase` + - `TruncateWAL` + - control intent from confirmed failover / master-side integration +- carry-forward limitation: + - under interim `CommittedLSN = CheckpointLSN`, this slice proves a real bridge path, not general post-checkpoint catch-up viability + - post-checkpoint catch-up semantics therefore remain narrower than final V2 intent and do not represent final V2 commit semantics + +### P2: Real-System Failure Replay + +1. replay selected real failure classes against the integrated service slice +2. confirm diagnosability from logs/status +3. identify any remaining mismatch between engine-stage assumptions and real system behavior + +Status: + +- accepted +- real service-path replay now accepted for: + - changed-address restart + - stale epoch / stale session invalidation + - unrecoverable-gap / needs-rebuild replay + - explicit post-checkpoint boundary replay under the interim model +- accepted with scoped limitation: + - real `v2bridge` WAL-scan execution is proven + - full integrated engine-driven catch-up semantics are not overclaimed under interim `CommittedLSN = CheckpointLSN` +- control-plane delivery remains simulated via direct `AssignmentIntent` construction +- carry-forward remains explicit: + - post-checkpoint catch-up semantics are still narrower than final V2 intent + +### P3: Product-Path Decision + +1. 
choose the first product path for V2 +2. define what remains before pre-production hardening +3. record what is still intentionally deferred + +Status: + +- accepted +- first product path chosen: + - `RF=2` + - `sync_all` + - existing master / volume-server heartbeat path + - V2 engine owns recovery policy + - `v2bridge` provides real storage truth +- proposal is evidence-grounded and explicitly bounded by accepted `P0/P1/P2` evidence +- pre-hardening prerequisites are explicit: + - real master control delivery + - full integrated engine -> executor -> `v2bridge` catch-up chain + - separation of committed truth from checkpoint truth + - rebuild execution (`snapshot` / `full-base` / `truncation`) + - pinner / flusher behavior under concurrent load +- intentionally deferred: + - `RF>2` + - Smart WAL optimizations + - `best_effort` background recovery + - performance tuning + - full V1 replacement +- non-claims remain explicit: + - not production-ready + - no end-to-end rebuild proof yet + - no general post-checkpoint catch-up proof + - no real master heartbeat/control delivery proof yet + - no full integrated engine -> executor -> `v2bridge` catch-up proof yet + +## Guardrails + +### Guardrail 1: Do not re-import V1 structure as the design owner + +Use `weed/storage/block*` and `learn/projects/sw-block/` as constraints and validation sources, not as the architecture template. + +### Guardrail 2: Keep catch-up narrow and rebuild explicit + +Do not use integration work as an excuse to widen catch-up semantics or blur rebuild as the formal recovery path. + +### Guardrail 3: Prefer real entry paths over test-only wrappers + +The integrated slice should exercise real service boundaries, not only internal engine helpers. + +### Guardrail 4: Observability must explain causality + +Integrated logs/status must explain: + +1. why rebuild was required +2. why proof was rejected +3. why execution was cancelled or invalidated +4. 
why a product-path integration failed + +### Guardrail 5: Stable identity must not collapse back to address shape + +For the first slice, `ReplicaID` must be derived from master/block-registry identity, not current endpoint addresses. + +### Guardrail 6: `blockvol` executes I/O but does not own recovery policy + +The service bridge may translate engine decisions into concrete blockvol actions, but it must not re-decide: + +1. zero-gap / catch-up / rebuild +2. trusted-base validity +3. replayable-tail sufficiency +4. rebuild fallback requirement + +## Exit Criteria + +Phase 07 is done when: + +1. one real-system service slice is integrated with the engine +2. selected real-system failure classes are replayed through that slice +3. diagnosability is sufficient for service-slice debugging +4. the first product path is explicitly chosen +5. the remaining work to pre-production hardening is clear + +## Assignment For `sw` + +Next tasks move to `Phase 08`. + +## Assignment For `tester` + +Next tasks move to `Phase 08`. diff --git a/sw-block/.private/phase/phase-08-decisions.md b/sw-block/.private/phase/phase-08-decisions.md new file mode 100644 index 000000000..7e8dbed2b --- /dev/null +++ b/sw-block/.private/phase/phase-08-decisions.md @@ -0,0 +1,78 @@ +# Phase 08 Decisions + +## Decision 1: Phase 08 is pre-production hardening, not protocol rediscovery + +The accepted V2 product path from `Phase 07` is the basis. + +`Phase 08` should harden that path rather than reopen accepted protocol shape. + +## Decision 2: The first hardening priorities are control delivery and execution closure + +The most important remaining gaps are: + +1. real master/control delivery into the bridge/engine path +2. integrated engine -> executor -> `v2bridge` catch-up execution closure +3. first rebuild execution path for the chosen product path + +## Decision 3: Carry-forward limitations remain explicit until closed + +Phase 08 must keep explicit: + +1. 
committed truth is still not separated from checkpoint truth +2. rebuild execution is still incomplete +3. current control delivery is still simulated + +## Decision 4: Phase 08 P0 is accepted + +The hardening plan is sufficiently specified to begin implementation work. + +In particular, `P0` now fixes: + +1. the committed-truth gate decision requirement +2. the unified replay requirement after control and execution closure +3. the need for at least one real failover / reassignment validation target +## Decision 5: The committed-truth limitation must become a hardening gate + +Phase 08 must explicitly decide one of: + +1. `CommittedLSN != CheckpointLSN` separation is mandatory before a production-candidate phase +2. the first candidate path is intentionally bounded to the currently proven pre-checkpoint replay behavior + +It must not remain only a documented carry-forward. + +## Decision 6: Unified-path replay is required after control and execution closure + +Once real control delivery and integrated execution closure land, `Phase 08` must replay the accepted failure-class set again on the unified live path. + +This prevents independent closure of: + +1. control delivery +2. execution closure + +without proving that they behave correctly together. + +## Decision 7: Real failover / reassignment validation is mandatory for the chosen path + +Because the chosen product path depends on the existing master / volume-server heartbeat path, at least one real failover / promotion / reassignment cycle must be a named hardening target in `Phase 08`. + +## Decision 8: Phase 08 should reuse the existing Seaweed control/runtime path, not invent a new one + +For the first hardening path, implementation should preferentially reuse: + +1. existing master / heartbeat / assignment delivery +2. existing volume-server assignment receive/apply path +3. existing `blockvol` runtime and `v2bridge` storage/runtime hooks + +This reuse is about: + +1. control-plane reality +2. 
storage/runtime reality +3. execution-path reality + +It is not permission to inherit old policy semantics as V2 truth. + +The hard rule remains: + +1. engine owns recovery policy +2. bridge translates confirmed control/storage truth +3. `blockvol` executes I/O diff --git a/sw-block/.private/phase/phase-08-log.md b/sw-block/.private/phase/phase-08-log.md new file mode 100644 index 000000000..7621b482a --- /dev/null +++ b/sw-block/.private/phase/phase-08-log.md @@ -0,0 +1,21 @@ +# Phase 08 Log + +## 2026-03-31 + +### Opened + +`Phase 08` opened as: + +- pre-production hardening + +### Starting basis + +1. `Phase 07`: complete +2. first V2 product path chosen +3. remaining gaps are integration and hardening gaps, not protocol-discovery gaps + +### Next + +1. Phase 08 P0 accepted +2. Phase 08 P1 real master/control delivery integration +3. Phase 08 P2 integrated execution closure diff --git a/sw-block/.private/phase/phase-08.md b/sw-block/.private/phase/phase-08.md new file mode 100644 index 000000000..7e1412496 --- /dev/null +++ b/sw-block/.private/phase/phase-08.md @@ -0,0 +1,254 @@ +# Phase 08 + +Date: 2026-03-31 +Status: active +Purpose: convert the accepted Phase 07 product path into a pre-production-hardening program without reopening accepted V2 protocol shape + +## Why This Phase Exists + +`Phase 07` completed: + +1. a real service-slice integration around the V2 engine +2. real storage-truth bridge evidence through `v2bridge` +3. selected real-system failure replay +4. the first explicit product-path decision + +What still does not exist is a pre-production-ready system path. The remaining work is no longer protocol discovery. It is closing the operational and integration gaps between the accepted product path and a hardened deployment candidate. + +## Phase Goal + +Harden the first accepted V2 product path until the remaining gap to a production candidate is explicit, bounded, and implementation-driven. + +## Scope + +### In scope + +1. 
real master/control delivery into the engine service path +2. integrated engine -> executor -> `v2bridge` execution closure +3. rebuild execution closure for the accepted product path +4. operational/debuggability hardening +5. concurrency/load validation around retention and recovery + +### Out of scope + +1. new protocol redesign +2. `RF>2` coordination +3. Smart WAL optimization work +4. broad performance tuning beyond validation needed for hardening +5. full V1 replacement rollout + +## Phase 08 Items + +### P0: Hardening Plan + +1. convert the accepted `Phase 07` product path into a hardening plan +2. define the minimum pre-production gates +3. order the remaining integration closures by risk +4. make an explicit gate decision on committed truth vs checkpoint truth: + - either separate `CommittedLSN` from `CheckpointLSN` before a production-candidate phase + - or explicitly bound the first candidate path to the currently proven pre-checkpoint replay behavior + +Status: + +- planning package accepted in this phase doc +- first hardening priorities are fixed as: + - real master/control delivery + - integrated engine -> executor -> `v2bridge` catch-up execution chain + - first rebuild execution path +- the committed-truth carry-forward is now a required hardening gate, not just a note: + - either separate `CommittedLSN` from `CheckpointLSN` before a production-candidate phase + - or explicitly bound the first candidate path to the currently proven pre-checkpoint replay behavior +- at least one real failover / promotion / reassignment cycle is a required hardening target +- once `P1` and `P2` land, the accepted failure-class set must be replayed again on the newly unified live path +- the validation oracle for `Phase 08` is expected to reject overclaiming around: + - catch-up semantics + - rebuild execution + - master/control delivery + - candidate-path readiness vs production readiness +- accepted + +### P1: Real Control Delivery + +1. 
connect real master/heartbeat assignment delivery into the bridge +2. replace direct `AssignmentIntent` construction for the first live path +3. preserve stable identity and fenced authority through the real control path +4. include at least one real failover / promotion / reassignment validation target on the chosen `sync_all` path + +Technical focus: + +- keep the control-path split explicit: + - master confirms assignment / epoch / role + - bridge translates confirmed control truth into engine intent + - engine owns sender/session/recovery policy + - `blockvol` does not re-decide recovery policy +- preserve the identity rule through the live path: + - `ReplicaID = /` + - endpoint change updates location but must not recreate logical identity +- preserve the fencing rule through the live path: + - stale epoch must invalidate old authority + - stale session must not mutate current lineage + - address change must invalidate the old live session before the new path proceeds +- treat failover / promotion / reassignment as control-truth events first, not storage-side heuristics + +Implementation route (`reuse map`): + +- reuse directly as the first hardening carrier: + - `weed/server/master_grpc_server.go` + - `weed/server/volume_grpc_client_to_master.go` + - `weed/server/volume_server_block.go` + - `weed/server/master_block_registry.go` + - `weed/server/master_block_failover.go` +- reuse as storage/runtime execution reality: + - `weed/storage/blockvol/blockvol.go` + - `weed/storage/blockvol/replica_apply.go` + - `weed/storage/blockvol/replica_barrier.go` + - `weed/storage/blockvol/v2bridge/` +- preserve the V2 boundary while reusing these files: + - reuse transport/control/runtime reality + - do not inherit old policy semantics as V2 truth + - keep engine as the recovery-policy owner + - keep `blockvol` as the I/O executor + +Expectation note: + +- the `P1` tester expectation is already embedded in this phase doc under: + - `P1 / Validation focus` + - `P1 / Reject 
if` +- do not grow a separate long template unless `P1` scope expands materially + +Validation focus: + +- prove live assignment delivery into the bridge/engine path +- prove stable `ReplicaID` across address refresh on the live path +- prove stale epoch / stale session invalidation through the live path +- prove at least one real failover / promotion / reassignment cycle on the chosen `sync_all` path +- prove the resulting logs explain: + - why reassignment happened + - why a session was invalidated + - which epoch / identity / endpoint drove the transition + +Reject if: + +- address-shaped identity reappears anywhere in the control path +- bridge starts re-deriving catch-up vs rebuild policy from convenience inputs +- old epoch or old session can still mutate after the new control truth arrives +- failover / reassignment is claimed without a real replay target +- delivery claims general production readiness rather than control-path closure + +### P2: Execution Closure + +1. close the live engine -> executor -> `v2bridge` execution chain +2. make catch-up execution evidence integrated rather than split across layers +3. close the first rebuild execution path required by the product path + +### P3: Hardening Validation + +1. validate diagnosability under the live integrated path +2. validate retention/pinner behavior under concurrent load +3. replay the accepted failure-class set again on the newly unified live path after `P1` and `P2` land +4. 
confirm the remaining gap to a production candidate + +Validation focus: + +- prove the chosen path through a real control-delivery path +- prove the live engine -> executor -> `v2bridge` execution chain as one path, not split evidence +- prove the first rebuild execution path required by the chosen product path +- prove at least one real failover / promotion / reassignment cycle +- prove concurrent retention/pinner behavior does not break recovery guarantees + +Reject if: + +- catch-up semantics are overclaimed beyond the currently proven boundary +- rebuild is claimed as supported without real execution closure +- master/control delivery is claimed as real without the live path in place +- `CommittedLSN` vs `CheckpointLSN` remains an unclassified note instead of a gate decision +- `P1` and `P2` land independently but the accepted failure-class set is not replayed again on the unified live path + +## Guardrails + +### Guardrail 1: Do not reopen accepted V2 protocol truths casually + +`Phase 08` is a hardening phase. New work should preserve the accepted protocol truth set unless a real contradiction is demonstrated. + +### Guardrail 2: Keep product-path claims evidence-bound + +Do not claim more than the hardened path actually proves. Distinguish: + +1. live integrated path +2. hardened product path +3. production candidate + +### Guardrail 3: Identity and policy boundaries remain hard rules + +1. `ReplicaID` must remain stable and never collapse to address shape +2. engine decides recovery policy +3. bridge translates intent/state +4. `blockvol` executes I/O only + +### Guardrail 4: Carry-forward limitations must remain explicit until closed + +Especially: + +1. committed truth vs checkpoint truth +2. rebuild execution coverage +3. real master/control delivery coverage + +### Guardrail 5: The committed-truth carry-forward must become a gate, not a note + +Before the next phase, `Phase 08` must decide one of: + +1. 
committed-truth separation is mandatory before a production-candidate phase +2. the first candidate path is intentionally bounded to the currently proven pre-checkpoint replay behavior + +It must not remain an unclassified carry-forward. + +## Exit Criteria + +Phase 08 is done when: + +1. the first product path runs through a real control delivery path +2. the critical execution chain is integrated and validated +3. rebuild execution for the chosen path is no longer just detected but executed +4. at least one real failover / reassignment cycle is replayed through the live control path +5. the accepted failure-class set is replayed again on the unified live path +6. operational/debug evidence is sufficient for pre-production use +7. the remaining gap to a production candidate is small and explicit + +## Assignment For `sw` + +Next tasks: + +1. drive `Phase 08 P1` as real master/control delivery integration +2. replace direct `AssignmentIntent` construction for the first live path +3. preserve through the real control path: + - stable `ReplicaID` + - epoch fencing + - address-change invalidation +4. include at least one real failover / promotion / reassignment validation target +5. keep acceptance claims scoped: + - real control delivery path + - not yet general production readiness +6. keep explicit carry-forwards: + - `CommittedLSN != CheckpointLSN` still unresolved + - integrated catch-up execution chain still incomplete + - rebuild execution still incomplete + +## Assignment For `tester` + +Next tasks: + +1. use the accepted `Phase 08` plan framing as the `P1` validation oracle +2. validate real control delivery for: + - live assignment delivery + - stable identity through the control path + - stale epoch/session invalidation + - at least one real failover / reassignment cycle +3. keep the no-overclaim rule active around: + - catch-up semantics + - rebuild execution + - master/control delivery +4. 
keep the committed-truth gate explicit: + - still unresolved in `P1` +5. prepare `P2` follow-up expectations for: + - integrated engine -> executor -> `v2bridge` execution closure + - unified replay after `P1` and `P2` diff --git a/sw-block/.private/phase/phase-4.5-decisions.md b/sw-block/.private/phase/phase-4.5-decisions.md new file mode 100644 index 000000000..3d950764e --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-decisions.md @@ -0,0 +1,59 @@ +# Phase 4.5 Decisions + +## Decision 1: Phase 4.5 remains a bounded hardening phase + +It is not a new architecture line and must not expand into broad feature work. + +Purpose: + +1. tighten recovery boundaries +2. strengthen crash-consistency / recoverability proof +3. clear the path for engine planning + +## Decision 2: `sw` Phase 4.5 P0 is accepted + +Accepted basis: + +1. bounded `CatchUp` now changes prototype behavior +2. `FrozenTargetLSN` is intrinsic to the session contract +3. `Rebuild` is a first-class sender-owned execution path +4. rebuild and catch-up are execution-path exclusive + +## Decision 3: `tester` crash-consistency simulator strengthening is accepted + +Accepted basis: + +1. checkpoint semantics are explicit +2. recoverability after restart is no longer collapsed into a single loose watermark +3. crash-consistency invariants are executable and passing + +## Decision 4: Remaining Phase 4.5 work is evidence hardening, not primitive-building + +Completed focus: + +1. `A5-A8` prototype + simulator double evidence +2. predicate exploration for dangerous states +3. adversarial search over crash-consistency / liveness states + +Remaining optional work: + +4. any low-priority cleanup that improves clarity without reopening design + +## Decision 5: After Phase 4.5, the project should move to engine-planning readiness review + +Unless new blocking flaws appear, the next major decision after `4.5` should be: + +1. real V2 engine planning +2. 
engine slicing plan + +not another broad prototype phase + +## Decision 6: Phase 4.5 is complete + +Reason: + +1. bounded `CatchUp` is semantic in the prototype +2. `Rebuild` is first-class in the prototype +3. crash-consistency / restart-recoverability are materially stronger in the simulator +4. `A5-A8` evidence is materially stronger on both prototype and simulator sides +5. adversarial search found and helped fix a real correctness bug, validating the proof style diff --git a/sw-block/.private/phase/phase-4.5-log.md b/sw-block/.private/phase/phase-4.5-log.md new file mode 100644 index 000000000..2f923be74 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-log.md @@ -0,0 +1,33 @@ +# Phase 4.5 Log + +## 2026-03-29 + +### Accepted + +1. `sw` `Phase 4.5 P0` + - bounded `CatchUp` budget is semantic in `enginev2` + - `FrozenTargetLSN` is a real session invariant + - `Rebuild` is wired into sender execution and is exclusive from catch-up + - rebuild completion goes through `CompleteRebuild`, not generic session completion + +2. `tester` crash-consistency simulator strengthening + - storage-state split introduced and accepted + - checkpoint/restart boundary made explicit + - recoverability upgraded from watermark-style logic to checkpoint + contiguous WAL replayability proof + - core invariant tests for crash consistency now pass + +3. `tester` evidence hardening and adversarial exploration + - grouped simulator evidence for `A5-A8` + - danger predicates added + - adversarial search added and passing + - adversarial search found a real `StateAt(lsn)` historical-state bug + - `StateAt(lsn)` corrected so newer checkpoint/base state does not leak into older historical queries + +4. `Phase 4.5` closeout judgment + - prototype and simulator evidence are now strong enough to stop expanding `4.5` + - next major step should move to engine-readiness review and engine slicing + +### Remaining open work + +1. 
low-priority cleanup + - remove or consolidate redundant frozen-target bookkeeping if no longer needed diff --git a/sw-block/.private/phase/phase-4.5-reason.md b/sw-block/.private/phase/phase-4.5-reason.md new file mode 100644 index 000000000..b183c2e98 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-reason.md @@ -0,0 +1,397 @@ +# Phase 4.5 Reason + +Date: 2026-03-27 +Status: proposal for dev manager decision +Purpose: explain why a narrow V2 fine-tuning step should follow the main Phase 04 slice, without reopening the core ownership/fencing direction + +## 1. Why This Note Exists + +`Phase 04` has already produced strong progress on the first standalone V2 slice: + +- per-replica sender identity +- one active recovery session per replica per epoch +- endpoint / epoch invalidation +- sender-owned execution APIs +- explicit recovery outcome branching +- minimal historical-data prototype + +This is good progress and should continue. + +However, recent review and discussion show that the next risk is no longer: + +- ownership ambiguity +- stale completion acceptance +- scattered local recovery authority + +The next risk is different: + +- `CatchUp` may become too broad, too long-lived, and too resource-heavy +- simulator proof is still weaker than desired on crash-consistency and recoverability boundaries +- the project may accidentally carry V1.5-style "keep trying to catch up" assumptions into V2 engine work + +So this note proposes: + +- **do not interrupt the main Phase 04 work** +- **do not reopen core V2 ownership/fencing architecture** +- **add a narrow fine-tuning step immediately after Phase 04 main closure** + +This note is for the dev manager to decide implementation sequencing. + +## 2. 
Current Basis + +This proposal is grounded in the following current documents: + +- `sw-block/.private/phase/phase-04.md` +- `sw-block/design/v2-prototype-roadmap-and-gates.md` +- `sw-block/design/v2-acceptance-criteria.md` +- `sw-block/design/v2-detailed-algorithm.zh.md` + +In particular: + +- `phase-04.md` shows that Phase 04 is correctly centered on sender/session ownership and recovery execution authority +- `v2-prototype-roadmap-and-gates.md` shows that design proof is high, but data/recovery proof and prototype end-to-end proof are still low +- `v2-acceptance-criteria.md` already requires stronger proof for: + - `A5` non-convergent catch-up escalation + - `A6` explicit recoverability boundary + - `A7` historical correctness + - `A8` durability-mode correctness +- `v2-detailed-algorithm.zh.md` Section 17 now argues for a direction tightening: + - keep the V2 core + - narrow `CatchUp` + - elevate `Rebuild` + - defer higher-complexity expansion + +## 3. Main Judgment + +### 3.1 What should NOT change + +The following V2 core should remain stable: + +- `CommittedLSN` as the external safe boundary +- durable progress as sync truth +- one sender per replica +- one active recovery session per replica per epoch +- stale epoch / stale endpoint / stale session fencing +- explicit `ZeroGap / CatchUp / NeedsRebuild` + +This is the architecture that most clearly separates V2 from V1.5. + +### 3.2 What SHOULD be fine-tuned + +The following should be tightened before engine planning: + +1. `CatchUp` should be narrowed to a short-gap, bounded, budgeted path +2. `Rebuild` should be treated as a formal primary recovery path, not only a fallback embarrassment +3. `recover -> keepup` handoff should be made more explicit +4. simulator should prove recoverability and crash-consistency more directly + +## 4. 
Algorithm Thinking Behind The Fine-Tune + +This section summarizes the reasoning already captured in: + +- `sw-block/design/v2-detailed-algorithm.zh.md` + +Especially Section 17: + +- `V2` is still the right direction +- but V2 should be tightened from: + - "make WAL recovery increasingly smart" + - to: + - "make block truth boundaries hard, keep `CatchUp` cheap and bounded, and use formal `Rebuild` when recovery becomes too complex" + +### 4.1 First-principles view + +From block first principles, the hardest truths are: + +1. when `write` becomes real +2. what `flush/fsync ACK` truly promises +3. whether acknowledged boundaries survive failover +4. how replicas rejoin without corrupting lineage + +These are more fundamental than: + +- volume product shape +- control-plane surface +- recovery cleverness for its own sake + +So the project should optimize for: + +- clearer truth boundaries +- not for maximal catch-up cleverness + +### 4.2 Mayastor-style product insight + +The useful first-principles lesson from Mayastor-like product thinking is: + +- not every lagging replica is worth indefinite low-cost chase +- `Rebuild` can be a formal product path, not a shameful fallback +- block products benefit from explicit lifecycle objects and formal rebuild flow + +This does NOT replace the V2 core concerns: + +- `flush ACK` truth +- committed-prefix failover safety +- stale authority fencing + +But it does suggest a correction: + +- do not let `CatchUp` become an over-smart general answer to all recovery + +### 4.3 Proposed V2 fine-tuned interpretation + +The fine-tuned interpretation of V2 should be: + +- `CatchUp` is for short-gap, clearly recoverable, bounded recovery +- `Rebuild` is for long-gap, high-cost, unstable, or non-convergent recovery +- recovery session is a bounded contract, not a long-running rescue thread +- `> H0` live WAL must not silently turn one recovery session into an endless chase + +## 5. 
Specific Fine-Tune Adjustments + +### 5.1 Narrow `CatchUp` + +`CatchUp` should explicitly require: + +- short outage +- bounded target `H0` +- clear recoverability +- bounded reservation +- bounded time +- bounded resource cost +- bounded convergence expectation + +`CatchUp` should explicitly stop when: + +- target drifts too long without convergence +- replay progress stalls +- recoverability proof is lost +- retention cost becomes unreasonable +- session budget expires + +### 5.2 Elevate `Rebuild` + +`Rebuild` should be treated as a first-class path when: + +- lag is too large +- catch-up does not converge +- recoverability is no longer stable +- complexity of continued catch-up exceeds its product value + +The intended model becomes: + +- short gap -> `CatchUp` +- long gap / unstable / non-convergent -> `Rebuild` + +This should be interpreted more strictly than a simple routing rule: + +- `CatchUp` is not a general recovery framework +- `CatchUp` is a relaxed form of `KeepUp` +- it should stay limited to short-gap, bounded, clearly recoverable WAL replay +- it only makes sense while the replica's current base is still trustworthy enough to continue from + +By contrast: + +- `Rebuild` is the more general recovery framework +- it restores the replica from a trusted base toward a frozen target boundary +- `full rebuild` and `partial rebuild` are not different protocols; they are different base/transfer choices under the same rebuild contract + +So the intended product shape is: + +- use `CatchUp` when replay debt is small and clearly cheaper than rebuild +- use `Rebuild` when correctness, boundedness, or product simplicity would otherwise be compromised + +And the correctness anchor for both `full` and `partial` rebuild should remain explicit: + +- freeze `TargetLSN` +- pin the snapshot/base used for recovery +- only then optimize transfer volume using `snapshot + tail`, `bitmap`, or similar mechanisms + +### 5.3 Clarify `recover -> keepup` handoff + +Phase 04 
already aims to prove a clean handoff between normal sender and recovery session. + +The fine-tune should make the next step more explicit: + +- one recovery session only owns `(R, H0]` +- session completion releases recovery debt +- replica should not silently stay in "quasi-recovery" +- re-entry to `KeepUp` / `InSync` should remain explicit, ideally with `PromotionHold` or equivalent stabilization logic + +### 5.4 Keep Smart WAL deferred + +No fine-tune should broaden Smart WAL scope at this point. + +Reason: + +- Smart WAL multiplies recoverability, GC, payload-availability, and reservation complexity +- the current priority is to harden the simpler V2 replication contract first + +So the rule remains: + +- no Smart WAL expansion beyond what minimal proof work might later require + +## 6. Simulation Strengthening Requirements + +This is the highest-value part of the fine-tune. + +Current simulator strength is already good on: + +- epoch fencing +- stale traffic rejection +- promotion candidate rules +- ownership / session invalidation +- basic `CatchUp / NeedsRebuild` classification + +Current simulator weakness is still significant on: + +- crash-consistency around extent / checkpoint / replay boundaries +- `ACK` boundary versus recoverable boundary +- `CatchUp` liveness / convergence + +### 6.1 Required new modeling direction + +The simulator should stop collapsing these states together: + +- received but not durable +- WAL durable but not yet fully materialized +- extent-visible but not yet checkpoint-safe +- checkpoint-safe base image +- restart-recoverable read state + +Suggested explicit storage-state split: + +- `ReceivedLSN` +- `WALDurableLSN` +- `ExtentAppliedLSN` +- `CheckpointLSN` +- `RecoverableLSNAfterRestart` + +### 6.2 Required new invariants + +The simulator should explicitly check at least: + +1. `AckedFlushLSN <= RecoverableLSNAfterRestart` +2. visible state must have recoverable backing +3. 
`CatchUp` cannot remain non-convergent indefinitely +4. promotion candidate must still possess recoverable committed prefix + +### 6.3 Required new scenario classes + +Priority scenarios to add: + +1. `ExtentAheadOfCheckpoint_CrashRestart_ReadBoundary` +2. `AckedFlush_MustBeRecoverableAfterCrash` +3. `UnackedVisibleExtent_MustNotSurviveAsCommittedTruth` +4. `CatchUpChasingMovingHead_EscalatesOrConverges` +5. `CheckpointGCBreaksRecoveryProof` + +### 6.4 Required simulator style upgrade + +The simulator should move beyond only hand-authored examples and also support: + +- dangerous-state predicates +- adversarial random exploration guided by those predicates + +Examples: + +- `acked_flush_lost` +- `extent_exposes_unrecoverable_state` +- `catchup_livelock` +- `rebuild_required_but_not_escalated` + +## 7. Relationship To Acceptance Criteria + +This fine-tune is not a separate architecture line. + +It is mainly intended to make the project satisfy the existing acceptance set more convincingly: + +- `A5` explicit escalation from non-convergent catch-up +- `A6` recoverability boundary as a real rule, not hopeful policy +- `A7` historical correctness against snapshot + tail rebuild +- `A8` strict durability mode semantics + +So this fine-tune is a strengthening of the current V2 proof path, not a new branch. + +## 8. Recommended Sequencing + +### Option A: pause Phase 04 and reopen design now + +Not recommended. + +Why: + +- Phase 04 has strong momentum +- its core ownership/fencing work is correct +- pausing it now would blur scope and waste recent closure + +### Option B: finish Phase 04, then add a narrow `4.5` + +Recommended. + +Why: + +- Phase 04 can finish its intended ownership / orchestration / minimal-history closure +- `4.5` can then tighten recovery strategy without destabilizing the slice +- the project avoids carrying "too-smart catch-up" assumptions into later engine planning + +Recommended sequence: + +1. finish Phase 04 main closure +2. 
immediately start `Phase 4.5` +3. use `4.5` to tighten: + - bounded `CatchUp` + - formal `Rebuild` + - crash-consistency and recoverability simulator proof +4. then re-evaluate Gate 4 / Gate 5 + +## 9. Scope Of A Possible Phase 4.5 + +If the dev manager chooses to implement a `4.5` step, its scope should be: + +### In scope + +- tighten algorithm wording and boundaries from `v2-detailed-algorithm.zh.md` +- formalize bounded `CatchUp` +- formalize `Rebuild` as first-class path +- strengthen simulator state model and invariants +- add targeted crash-consistency and liveness scenarios +- improve prototype traceability against `A5-A8` + +### Out of scope + +- Smart WAL expansion +- real storage engine redesign +- V1 production integration +- frontend/wire protocol +- performance optimization as primary goal + +## 10. Decision Requested From Dev Manager + +Please decide: + +1. whether `Phase 04` should continue to normal closure without interruption +2. whether a narrow `Phase 4.5` should immediately follow +3. whether the simulator strengthening work should be treated as mandatory for Gate 4 / Gate 5 credibility + +Recommended decision: + +- **Yes**: finish `Phase 04` +- **Yes**: add `Phase 4.5` as a bounded fine-tuning step +- **Yes**: treat crash-consistency / recoverability / liveness simulator strengthening as required, not optional + +## 11. Bottom Line + +The project does not need a new direction. 
+ +It needs: + +- a slightly tighter interpretation of V2 +- a stronger recoverability/crash-consistency simulator +- a clearer willingness to use formal `Rebuild` instead of over-extending `CatchUp` + +So the practical recommendation is: + +- **keep the V2 core** +- **finish Phase 04** +- **add a narrow Phase 4.5** +- **strengthen simulator proof before engine planning** diff --git a/sw-block/.private/phase/phase-4.5.md b/sw-block/.private/phase/phase-4.5.md new file mode 100644 index 000000000..50e2469a4 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5.md @@ -0,0 +1,356 @@ +# Phase 4.5 + +Date: 2026-03-29 +Status: complete +Purpose: harden Gate 4 / Gate 5 credibility after Phase 04 by tightening bounded `CatchUp`, elevating `Rebuild` as a first-class path, and strengthening crash-consistency / recoverability proof + +## Related Plan + +Strategic phase: + +- `sw-block/.private/phase/phase-4.5.md` + +Simulator implementation plan: + +- `learn/projects/sw-block/design/phase-05-crash-consistency-simulation.md` + +Use them together: + +- `Phase 4.5` defines the gate-hardening purpose and priorities +- `phase-05-crash-consistency-simulation.md` is the detailed simulator implementation plan + +## Why This Phase Exists + +Phase 04 has already established: + +1. per-replica sender identity +2. one active recovery session per replica per epoch +3. stale authority fencing +4. sender-owned execution APIs +5. assignment-intent orchestration +6. minimal historical-data prototype +7. prototype scenario closure + +The next risk is no longer ownership structure. + +The next risk is: + +1. `CatchUp` becoming too broad, too long-lived, or too optimistic +2. `Rebuild` remaining underspecified even though it will likely become a common path +3. simulator proof still being weaker than desired on crash-consistency and restart-recoverability + +So `Phase 4.5` exists to harden the decision gate before real engine planning. 
+ +## Relationship To Phase 04 + +`Phase 4.5` is not a new architecture line. + +It is a narrow hardening step after normal Phase 04 closure. + +It should: + +- keep the V2 core +- not reopen sender/session ownership architecture +- strengthen recovery boundaries and proof quality + +## Main Questions + +1. how narrow should `CatchUp` be? +2. when must recovery escalate to `Rebuild`? +3. what exactly is the `Rebuild` source of truth? +4. what does restart-recoverable / crash-consistent state mean in the simulator? + +## Core Decisions To Drive + +### 1. Bounded CatchUp + +`CatchUp` should be explicitly bounded by: + +1. target range +2. retention proof +3. time budget +4. progress budget +5. resource budget + +It should stop and escalate when: + +1. target drifts too long +2. progress stalls +3. recoverability proof is lost +4. retention cost becomes unreasonable +5. session budget expires + +### 2. Rebuild Is First-Class + +`Rebuild` is not an embarrassment path. + +It is the formal path for: + +1. long gap +2. unstable recoverability +3. non-convergent catch-up +4. excessive replay cost +5. restart-recoverability uncertainty + +### 3. Rebuild Source Model + +To address the concern that tightening `CatchUp` makes `Rebuild` too dominant: + +`Rebuild` should be split conceptually into two modes: + +1. **Snapshot + Tail** + - preferred path + - use a dated but internally consistent base snapshot/checkpoint + - then apply retained WAL tail up to the committed recovery boundary + +2. 
**Full Base Rebuild** + - fallback path + - used when no acceptable snapshot/base image exists + - more expensive and slower + +Decision boundary: + +- use `Snapshot + Tail` when a trusted snapshot/checkpoint/base exists that covers the required base state +- use `Full Base Rebuild` when no such trusted base exists + +So "rebuild" should not mean only: + +- copy everything from scratch + +It should usually mean: + +- re-establish a trustworthy base image +- then catch up from that base to the committed boundary + +This keeps `Rebuild` practical even if `CatchUp` becomes narrower. + +### 4. Safe Recovery Truth + +The simulator should explicitly separate: + +1. `ReceivedLSN` +2. `WALDurableLSN` +3. `ExtentAppliedLSN` +4. `CheckpointLSN` +5. `RecoverableLSNAfterRestart` + +This is needed so that: + +- `ACK` truth +- visible-state truth +- crash-restart truth + +do not collapse into one number. + +## Priority + +### P0 + +1. document bounded `CatchUp` rule +2. document `Rebuild` modes: + - snapshot + tail + - full base rebuild +3. define escalation conditions from `CatchUp` to `Rebuild` + +Status: + +- accepted on both prototype and simulator sides +- prototype: bounded `CatchUp` is semantic, target-frozen, budget-enforced, and rebuild is a sender-owned exclusive path +- simulator: crash-consistency state split, checkpoint-safe restart boundary, and core invariants are in place + +### P1 + +4. strengthen simulator state model with crash-consistency split: + - `ReceivedLSN` + - `WALDurableLSN` + - `ExtentAppliedLSN` + - `CheckpointLSN` + - `RecoverableLSNAfterRestart` + +5. add explicit invariants: + - `AckedFlushLSN <= RecoverableLSNAfterRestart` + - visible state must have recoverable backing + - promotion candidate must possess recoverable committed prefix + +Status: + +- accepted on the simulator side +- remaining work is no longer basic state split; it is stronger traceability and adversarial exploration + +### P2 + +6. 
add targeted scenarios: + - `ExtentAheadOfCheckpoint_CrashRestart_ReadBoundary` + - `AckedFlush_MustBeRecoverableAfterCrash` + - `UnackedVisibleExtent_MustNotSurviveAsCommittedTruth` + - `CatchUpChasingMovingHead_EscalatesOrConverges` + - `CheckpointGCBreaksRecoveryProof` + +Status: + +- baseline targeted scenarios accepted +- predicate-guided/adversarial exploration remains open + +### P3 + +7. make prototype traceability stronger for: + - `A5` + - `A6` + - `A7` + - `A8` + +8. decide whether Gate 4 / Gate 5 are now credible enough for engine planning + +Status: + +- partially complete +- Gate 4 / Gate 5 are materially stronger +- remaining work is to make `A5-A8` double evidence more explicit and reviewable + +## Scope + +### In scope + +1. bounded `CatchUp` +2. first-class `Rebuild` +3. snapshot + tail rebuild model +4. crash-consistency simulator state split +5. targeted liveness / recoverability scenarios + +### Out of scope + +1. Smart WAL expansion +2. V1 production integration +3. backend/storage engine redesign +4. performance optimization as primary goal +5. frontend/wire protocol work + +## Exit Criteria + +`Phase 4.5` is done when: + +1. `CatchUp` budget / escalation rule is explicit in docs and simulator +2. `Rebuild` is explicitly modeled as: + - snapshot + tail preferred + - full base rebuild fallback +3. simulator has explicit crash-consistency state split +4. simulator has targeted crash / liveness scenarios for the listed risks +5. acceptance items `A5-A8` have stronger executable proof, ideally with explicit prototype + simulator evidence pairs +6. we can make a more credible decision on: + - real V2 engine planning + - or `V2.5` correction + +## Review Gates + +These are explicit review gates for `Phase 4.5`. + +### Gate 1: Bounded CatchUp Must Be Semantic + +It is not enough to add budget fields in docs or structs. + +To count as complete: + +1. timeout / budget exceed must force exit +2. moving-head chase must not continue indefinitely +3. 
escalation to `NeedsRebuild` must be explicit +4. tests must prove those behaviors + +### Gate 2: State Split Must Change Decisions + +It is not enough to add more state names. + +To count as complete, the new crash-consistency state split must materially change: + +1. `ACK` legality +2. restart recoverability judgment +3. visible-state legality +4. promotion-candidate legality + +### Gate 3: A5-A8 Need Double Evidence + +It is not enough for only prototype or only simulator to cover them. + +To count as complete, each of: + +- `A5` +- `A6` +- `A7` +- `A8` + +should have: + +1. one prototype-side evidence path +2. one simulator-side evidence path + +## Scope Discipline + +`Phase 4.5` must remain a bounded gate-hardening phase. + +It should stay focused on: + +1. tightening boundaries +2. strengthening proof +3. clearing the path for engine planning + +It should not turn into a broad new feature-expansion phase. + +## Current Status Summary + +Accepted now: + +1. `sw` `Phase 4.5 P0` + - bounded `CatchUp` is semantic, not documentary + - `FrozenTargetLSN` is a real session invariant + - `Rebuild` is an exclusive sender-owned execution path +2. `tester` crash-consistency simulator strengthening + - checkpoint/restart boundary is explicit + - recoverability is no longer a single collapsed watermark + - core crash-consistency invariants are executable + +Open now: + +1. low-priority cleanup such as redundant frozen-target bookkeeping fields + +Completed since initial approval: + +1. `A5-A8` explicit double-evidence traceability materially strengthened +2. predicate exploration / adversarial search added on simulator side +3. crash-consistency random/adversarial search found and helped fix a real `StateAt(lsn)` historical-state bug + +## Assignment For `sw` + +Focus: prototype/control-path formalization + +Completed work: + +1. updated prototype traceability for: + - `A5` + - `A6` + - `A7` + - `A8` +2. 
made rebuild-source decision evidence explicit in prototype tests: + - snapshot + tail chosen only when trusted base exists + - full base chosen when it does not +3. added focused prototype evidence grouping for engine-planning review + +Remaining optional cleanup: + +4. optionally clean low-priority redundancy: + - `TargetLSNAtStart` if superseded by `FrozenTargetLSN` + +## Assignment For `tester` + +Focus: simulator/crash-consistency proof + +Completed work: + +1. wired simulator-side evidence explicitly into acceptance traceability for: + - `A5` + - `A6` + - `A7` + - `A8` +2. added predicate exploration / adversarial search around the new crash-consistency model +3. added danger predicates for major failure classes: + - acked flush lost + - visible unrecoverable state + - catch-up livelock / rebuild-required-but-not-escalated diff --git a/sw-block/design/README.md b/sw-block/design/README.md index a1ee51100..b17972064 100644 --- a/sw-block/design/README.md +++ b/sw-block/design/README.md @@ -1,6 +1,9 @@ # V2 Design Current WAL V2 design set: +- `v2-algorithm-overview.md` +- `v2-algorithm-overview.zh.md` +- `v2-detailed-algorithm.zh.md` - `wal-replication-v2.md` - `wal-replication-v2-state-machine.md` - `wal-replication-v2-orchestrator.md` @@ -15,12 +18,25 @@ Current WAL V2 design set: - `v2-open-questions.md` - `v2-first-slice-session-ownership.md` - `v2-prototype-roadmap-and-gates.md` +- `v2-engine-readiness-review.md` +- `v2-engine-slicing-plan.md` +- `v2-protocol-truths.md` +- `v2-production-roadmap.md` +- `phase-07-service-slice-plan.md` +- `agent_dev_process.md` These documents are the working design home for the V2 line. The original project-level copies under `learn/projects/sw-block/design/` remain as shared references for now. 
Execution note: -- active development tracking for the current simulator phase lives under: +- active development tracking lives under `../.private/phase/` +- key completed/current phase docs include: - `../.private/phase/phase-01.md` - `../.private/phase/phase-02.md` + - `../.private/phase/phase-03.md` + - `../.private/phase/phase-04.md` + - `../.private/phase/phase-4.5.md` + - `../.private/phase/phase-05.md` + - `../.private/phase/phase-06.md` + - `../.private/phase/phase-07.md` diff --git a/sw-block/design/a5-a8-traceability.md b/sw-block/design/a5-a8-traceability.md new file mode 100644 index 000000000..aa3b1626d --- /dev/null +++ b/sw-block/design/a5-a8-traceability.md @@ -0,0 +1,117 @@ +# A5-A8 Acceptance Traceability + +Date: 2026-03-29 +Status: Phase 4.5 evidence-hardening + +## Purpose + +Map each acceptance criterion to specific executable evidence. +Two evidence layers: +- **Simulator** (distsim): protocol-level proof +- **Prototype** (enginev2): ownership/session-level proof + +--- + +## A5: Non-Convergent Catch-Up Escalates Explicitly + +**Must prove**: tail-chasing or failed catch-up does not pretend success. + +**Pass condition**: explicit `CatchingUp → NeedsRebuild` transition. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Tail-chasing converges or aborts | `TestS6_TailChasing_ConvergesOrAborts` | `cluster_test.go` | distsim | PASS | +| Tail-chasing non-convergent → NeedsRebuild | `TestS6_TailChasing_NonConvergent_EscalatesToNeedsRebuild` | `phase02_advanced_test.go` | distsim | PASS | +| Catch-up timeout → NeedsRebuild | `TestP03_CatchupTimeout_EscalatesToNeedsRebuild` | `phase03_timeout_test.go` | distsim | PASS | +| Reservation expiry aborts catch-up | `TestReservationExpiryAbortsCatchup` | `cluster_test.go` | distsim | PASS | +| Flapping budget exceeded → NeedsRebuild | `TestP02_S5_FlappingExceedsBudget_EscalatesToNeedsRebuild` | `phase02_advanced_test.go` | distsim | PASS | +| Catch-up converges or escalates (I3) | `TestI3_CatchUpConvergesOrEscalates` | `phase045_crash_test.go` | distsim | PASS | +| Catch-up timeout in enginev2 | `TestE2E_NeedsRebuild_Escalation` | `p2_test.go` | enginev2 | PASS | + +**Verdict**: A5 is well-covered. Both simulator and prototype prove explicit escalation. No pretend-success path exists. + +--- + +## A6: Recoverability Boundary Is Explicit + +**Must prove**: recoverable vs unrecoverable gap is decided explicitly. + +**Pass condition**: recovery aborts when reservation/payload availability is lost; rebuild is explicit fallback. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Reservation expiry aborts catch-up | `TestReservationExpiryAbortsCatchup` | `cluster_test.go` | distsim | PASS | +| WAL GC beyond replica → NeedsRebuild | `TestI5_CheckpointGC_PreservesAckedBoundary` | `phase045_crash_test.go` | distsim | PASS | +| Rebuild from snapshot + tail | `TestReplicaRebuildFromSnapshotAndTail` | `cluster_test.go` | distsim | PASS | +| Smart WAL: resolvable → unresolvable | `TestP02_SmartWAL_RecoverableThenUnrecoverable` | `phase02_advanced_test.go` | distsim | PASS | +| Time-varying payload availability | `TestP02_SmartWAL_TimeVaryingAvailability` | `phase02_advanced_test.go` | distsim | PASS | +| RecoverableLSN is replayability proof | `RecoverableLSN()` in `storage.go` | `storage.go` | distsim | Implemented | +| Handshake outcome: NeedsRebuild | `TestExec_HandshakeOutcome_NeedsRebuild_InvalidatesSession` | `execution_test.go` | enginev2 | PASS | + +**Verdict**: A6 is covered. Recovery boundary is decided by explicit reservation + recoverability check, not by optimistic assumption. `RecoverableLSN()` verifies contiguous WAL coverage. + +--- + +## A7: Historical Data Correctness Holds + +**Must prove**: recovered data for target LSN is historically correct; current extent cannot fake old history. + +**Pass condition**: snapshot + tail rebuild matches reference; current-extent reconstruction of old LSN fails correctness. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Snapshot + tail matches reference | `TestReplicaRebuildFromSnapshotAndTail` | `cluster_test.go` | distsim | PASS | +| Historical state not reconstructable after GC | `TestA7_HistoricalState_NotReconstructableAfterGC` | `phase045_crash_test.go` | distsim | PASS | +| `CanReconstructAt()` rejects faked history | `CanReconstructAt()` in `storage.go` | `storage.go` | distsim | Implemented | +| Checkpoint does not leak applied state | `TestI2_CheckpointDoesNotLeakAppliedState` | `phase045_crash_test.go` | distsim | PASS | +| Extent-referenced resolvable records | `TestExtentReferencedResolvableRecordsAreRecoverable` | `cluster_test.go` | distsim | PASS | +| Extent-referenced unresolvable → rebuild | `TestExtentReferencedUnresolvableForcesRebuild` | `cluster_test.go` | distsim | PASS | +| ACK'd flush recoverable after crash (I1) | `TestI1_AckedFlush_RecoverableAfterPrimaryCrash` | `phase045_crash_test.go` | distsim | PASS | + +**Verdict**: A7 is now covered with the Phase 4.5 crash-consistency additions. The critical gap ("current extent cannot fake old history") is proven by `CanReconstructAt()` + `TestA7_HistoricalState_NotReconstructableAfterGC`. + +--- + +## A8: Durability Mode Semantics Are Correct + +**Must prove**: best_effort, sync_all, sync_quorum behave as intended under mixed replica states. + +**Pass condition**: sync_all strict, sync_quorum commits only with true durable quorum, invalid topology rejected. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| sync_quorum continues with one lagging | `TestSyncQuorumContinuesWithOneLaggingReplica` | `cluster_test.go` | distsim | PASS | +| sync_all blocks with one lagging | `TestSyncAllBlocksWithOneLaggingReplica` | `cluster_test.go` | distsim | PASS | +| sync_quorum mixed states | `TestSyncQuorumWithMixedReplicaStates` | `cluster_test.go` | distsim | PASS | +| sync_all mixed states | `TestSyncAllBlocksWithMixedReplicaStates` | `cluster_test.go` | distsim | PASS | +| Barrier timeout: sync_all blocked | `TestP03_BarrierTimeout_SyncAll_Blocked` | `phase03_timeout_test.go` | distsim | PASS | +| Barrier timeout: sync_quorum commits | `TestP03_BarrierTimeout_SyncQuorum_StillCommits` | `phase03_timeout_test.go` | distsim | PASS | +| Promotion uses RecoverableLSN | `EvaluateCandidateEligibility()` | `cluster.go` | distsim | Implemented | +| Promoted replica has committed prefix (I4) | `TestI4_PromotedReplica_HasCommittedPrefix` | `phase045_crash_test.go` | distsim | PASS | + +**Verdict**: A8 is well-covered. sync_all is strict (blocks on lagging), sync_quorum uses true durable quorum (not connection count). Promotion now uses `RecoverableLSN()` for committed-prefix check. + +--- + +## Summary + +| Criterion | Simulator Evidence | Prototype Evidence | Status | +|-----------|-------------------|-------------------|--------| +| A5 (catch-up escalation) | 6 tests | 1 test | **Strong** | +| A6 (recoverability boundary) | 6 tests + RecoverableLSN() | 1 test | **Strong** | +| A7 (historical correctness) | 7 tests + CanReconstructAt() | — | **Strong** (new in Phase 4.5) | +| A8 (durability modes) | 7 tests + RecoverableLSN() | — | **Strong** | + +**Total executable evidence**: 26 simulator tests + 2 prototype tests + 2 new storage methods. + +All A5-A8 acceptance criteria have direct test evidence. No criterion depends solely on design-doc claims. 
+ +--- + +## Still Open (Not Blocking) + +| Item | Priority | Why not blocking | +|------|----------|-----------------| +| Predicate exploration / adversarial search | P2 | Manual scenarios already cover known failure classes | +| Catch-up convergence under sustained load | P2 | I3 proves escalation; load-rate modeling is optimization | +| A5-A8 in a single grouped runner view | P3 | Traceability doc serves as grouped evidence for now | diff --git a/sw-block/design/agent_dev_process.md b/sw-block/design/agent_dev_process.md new file mode 100644 index 000000000..659506d6e --- /dev/null +++ b/sw-block/design/agent_dev_process.md @@ -0,0 +1,304 @@ +# Agent Development Process + +Date: 2026-03-30 +Status: active +Purpose: define the working split between `sw`, `tester`, and review/management roles so each phase and slice has a clear delivery path + +## Why This Exists + +The project is now beyond pure exploration. + +The expensive part is no longer only writing code. +The expensive part is: + +1. delivery +2. review +3. fixes +4. re-review + +So the process must reduce repeated full-stack review and make each role responsible for a distinct layer. + +## Roles + +### `manager` + +Primary role: + +- phase/plan owner + +Responsibilities: + +1. define the phase/slice direction and scope +2. accept the planning package before coding starts +3. decide whether a carry-forward is acceptable or must become a gate +4. perform the final round review for overall logic, omissions, and product-path fit + +### `architect` + +Primary role: + +- plan and technical reviewer + +Responsibilities: + +1. review the plan before implementation starts +2. tighten algorithm wording, scope edges, and expectation framing +3. review technical correctness during implementation +4. review API/state/resource/fail-closed behavior +5. catch semantic drift, scope drift, and V1/V1.5 leakage + +### `sw` + +Primary role: + +- implementation owner + +Responsibilities: + +1. implement the accepted slice +2. 
state changed contracts +3. state fail-closed handling +4. state resources acquired/released +5. state carry-forward items +6. add or update tests + +### `tester` + +Primary role: + +- evidence owner + +Responsibilities: + +1. define what the slice must prove before implementation starts +2. maintain the failure-class checklist +3. define reject conditions and required test level +4. confirm that implementation claims are actually covered by evidence + +## Default Routine + +Each slice should follow this order: + +1. `manager` defines the plan direction +2. `architect` reviews and tightens the plan / algorithm / expectation framing +3. `tester` writes the expectation template +4. `manager` accepts the package and records it in the phase docs +5. `sw` implements and submits with the delivery template +6. `architect` reviews the technical layer until clean enough +7. `tester` performs validation and evidence closure +8. `manager` performs round-two review for overall logic and omissions + +Urgent exception: + +- if early work already shows major scope drift, protocol contradiction, or V1/V1.5 leakage, architecture review may short-circuit before implementation grows further + +## Delivery Template For `sw` + +Each delivery should include: + +1. changed contracts +2. fail-closed handling added +3. resources acquired/released +4. test inventory +5. known carry-forward notes + +This template is required between: + +1. implementation +2. implementation/fail-closed review + +It should accompany the delivery before reviewers start detailed review. + +Suggested format: + +```md +Changed contracts: +- ... + +Fail-closed handling: +- ... + +Resources acquired/released: +- ... + +Test inventory: +- ... + +Carry-forward notes: +- ... +``` + +## Phase Doc Usage + +Use the three phase documents differently: + +### `phase-xx.md` + +Use for: + +1. current execution direction +2. current scope +3. current guardrails +4. current accepted status +5. 
current assignments + +Keep it short and execution-oriented. + +### `phase-xx-log.md` + +Use for: + +1. detailed planning evolution +2. review feedback +3. carry-forward discussion +4. open observations +5. why wording or scope changed + +This document may be longer and more detailed. + +### `phase-xx-decisions.md` + +Use for: + +1. durable phase-level decisions +2. accepted boundaries that later rounds should inherit +3. gate decisions +4. decisions that should not be re-argued without new evidence + +This document should stay compact and hold only the more important global decisions. + +## Expectation Template For `tester` + +Before or at slice start, `tester` should define: + +1. must-pass expectations +2. failure-class checklist +3. required test level for each behavior +4. reject conditions + +`tester` should re-engage after technical review is mostly clean, to confirm final evidence closure before the manager's second-round review. + +Suggested format: + +```md +Expectation: +- ... + +Required level: +- entry path / engine / unit + +Reject if: +- ... + +Failure classes covered: +- ... +``` + +## Review Checklist For `architect` + +Review these first: + +1. nil handling +2. missing-resource handling +3. wrong-state / wrong-kind rejection +4. stale ID / stale authority rejection +5. resource pin / release symmetry +6. plan/execute/complete argument correctness +7. fail-closed cleanup on partial failure + +## Failure-Class Checklist + +This checklist should be kept active across phases. + +Minimum recurring classes: + +1. changed-address restart +2. stale epoch / stale session +3. missing resource pin +4. cleanup after failed plan +5. replay range mis-derived +6. false trusted-base selection +7. truncation missing but completion attempted +8. bounded catch-up not escalating + +## Process Rules + +### Rule 1: Do not wait until the end to define proof + +Each slice should begin with a statement of: + +1. what must be proven +2. 
which failure classes must stay closed + +### Rule 2: Do not let convenience wrappers silently become model truth + +Any convenience flow must be explicitly classified as: + +1. test-only convenience +2. stepwise engine task +3. planner/executor split + +### Rule 3: Prefer evidence quality over object growth + +New work should preferentially improve: + +1. traceability +2. diagnosability +3. failure-class closure +4. adapter contracts + +not just add: + +1. more structs +2. more states +3. more helper APIs + +### Rule 4: Use V1 as validation source, not architecture template + +Use: + +1. `learn/projects/sw-block/` +2. `weed/storage/block*` + +for: + +1. constraints +2. failure gates +3. implementation reality + +Do not use them as the default V2 architecture template. + +### Rule 5: Reuse reality, not inherited semantics + +When later implementation reuses existing `Seaweed` / `V1` paths: + +1. reuse control-plane reality +2. reuse storage/runtime reality +3. reuse execution mechanisms + +but do not silently inherit: + +1. address-shaped identity +2. old recovery classification semantics +3. old committed-truth assumptions +4. old failover authority assumptions + +Any such reuse should be reviewed explicitly as: + +1. safe reuse +2. reuse with explicit boundary +3. temporary carry-forward +4. hard gate before later phases + +## Current Direction + +The project has moved from exploration-heavy work to evidence-first engine work. + +From `Phase 06` onward, the default is: + +1. plan first +2. review plan before coding +3. implement +4. review technical layer +5. close evidence +6. 
do final manager review diff --git a/sw-block/design/phase-07-service-slice-plan.md b/sw-block/design/phase-07-service-slice-plan.md new file mode 100644 index 000000000..7e9a0a45a --- /dev/null +++ b/sw-block/design/phase-07-service-slice-plan.md @@ -0,0 +1,403 @@ +# Phase 07 Service-Slice Plan + +Date: 2026-03-30 +Status: draft +Scope: `Phase 07 P0` + +## Purpose + +Define the first real-system service slice that will host the V2 engine, choose the first concrete integration path in the existing codebase, and map engine adapters onto real modules. + +This is a planning document. It does not claim the integration already works. + +## Decision + +The first service slice should be: + +- a single `blockvol` primary on a real volume server +- with one replica target (`RF=2` path) +- driven by the existing master heartbeat / assignment loop +- using the V2 engine only for replication recovery ownership / planning / execution + +This is the narrowest real-system slice that still exercises: + +1. real assignment delivery +2. real epoch and failover signals +3. real volume-server lifecycle +4. real WAL/checkpoint/base-image truth +5. real changed-address / reconnect behavior + +It is narrow enough to avoid reopening the whole system, but real enough to stop hiding behind engine-local mocks. + +## Why This Slice + +This slice is the right first integration target because: + +1. `weed/server/master_grpc_server.go` already delivers block-volume assignments over heartbeat +2. `weed/server/master_block_failover.go` already owns failover / promotion / pending rebuild decisions +3. `weed/storage/blockvol/blockvol.go` already owns the current replication runtime (`shipperGroup`, receiver, WAL retention, checkpoint state) +4. the existing V1/V1.5 failure history is concentrated in exactly this master <-> volume-server <-> blockvol path + +So this slice gives maximum validation value with minimum new surface. 
+ +## First Concrete Integration Path + +The first integration path should be: + +1. master receives volume-server heartbeat +2. master updates block registry and emits `BlockVolumeAssignment` +3. volume server receives assignment +4. block volume adapter converts assignment + local storage state into V2 engine inputs +5. V2 engine drives sender/session/recovery state +6. existing block-volume runtime executes the actual data-path work under engine decisions + +In code, that path starts here: + +- master side: + - `weed/server/master_grpc_server.go` + - `weed/server/master_block_failover.go` + - `weed/server/master_block_registry.go` +- volume / storage side: + - `weed/storage/blockvol/blockvol.go` + - `weed/storage/blockvol/recovery.go` + - `weed/storage/blockvol/wal_shipper.go` + - assignment-handling code under `weed/storage/blockvol/` +- V2 engine side: + - `sw-block/engine/replication/` + +## Service-Slice Boundaries + +### In-process placement + +The V2 engine should initially live: + +- in-process with the volume server / `blockvol` runtime +- not in master +- not as a separate service yet + +Reason: + +- the engine needs local access to storage truth and local recovery execution +- master should remain control-plane authority, not recovery executor + +### Control-plane boundary + +Master remains authoritative for: + +1. epoch +2. role / assignment +3. promotion / failover decision +4. replica membership + +The engine consumes these as control inputs. It does not replace master failover policy in `Phase 07`. + +### Control-Over-Heartbeat Upgrade Path + +For the first V2 product path, the recommended direction is: + +- reuse the existing master <-> volume-server heartbeat path as the control carrier +- upgrade the block-specific control semantics carried on that path +- do not immediately invent a separate control service or assignment channel + +Why: + +1. this is the real Seaweed path already carrying block assignments and confirmations today +2. 
this gives the fastest route to a real integrated control path +3. it preserves compatibility with existing Seaweed master/volume-server semantics while V2 hardens its own control truth + +Concretely, the current V1 path already provides: + +1. block assignments delivered in heartbeat responses from `weed/server/master_grpc_server.go` +2. assignment application on the volume server in `weed/server/volume_grpc_client_to_master.go` and `weed/server/volume_server_block.go` +3. assignment confirmation and address-change refresh driven by later heartbeats in `weed/server/master_grpc_server.go` and `weed/server/master_block_registry.go` +4. immediate block heartbeat on selected shipper state changes in `weed/server/volume_grpc_client_to_master.go` + +What should be upgraded for V2 is not mainly the transport, but the control contract carried on it: + +1. stable `ReplicaID` +2. explicit `Epoch` +3. explicit role / assignment authority +4. explicit apply/confirm semantics +5. explicit stale assignment rejection +6. explicit address-change refresh as endpoint change, not identity change + +Current cadence note: + +- the block volume heartbeat is periodic (`5 * sleepInterval`) with some immediate state-change heartbeats +- this is acceptable as the first hardening carrier +- it should not be assumed to be the final control responsiveness model + +Deferred design decision: + +- whether block control should eventually move beyond heartbeat-only carriage into a more explicit control/assignment channel should be decided only after the `Phase 08 P1` real control-delivery path exists and can be measured + +That later decision should be based on: + +1. failover / reassignment responsiveness +2. assignment confirmation precision +3. operational complexity +4. 
whether heartbeat carriage remains too coarse for the block-control path + +Until then, the preferred direction is: + +- strengthen block control semantics over the existing heartbeat path +- do not prematurely create a second control plane + +### Storage boundary + +`blockvol` remains authoritative for: + +1. WAL head / retention reality +2. checkpoint/base-image reality +3. actual catch-up streaming +4. actual rebuild transfer / restore operations + +The engine consumes these as storage truth and recovery execution capabilities. It does not replace the storage backend in `Phase 07`. + +## First-Slice Identity Mapping + +This must be explicit in the first integration slice. + +For `RF=2` on the existing master / block registry path: + +- stable engine `ReplicaID` should be derived from: + - `<server-id>/<volume-id>` +- not from: + - `DataAddr` + - `CtrlAddr` + - heartbeat transport endpoint + +For this slice, the adapter should map: + +1. `ReplicaID` +- from master/block-registry identity for the replica host entry + +2. `Endpoint` +- from the current replica receiver/data/control addresses reported by the real runtime + +3. `Epoch` +- from the confirmed master assignment for the volume + +4. `SessionKind` +- from master-driven recovery intent / role transition outcome + +This is a hard first-slice requirement because address refresh must not collapse identity back into endpoint-shaped keys. + +## Adapter Mapping + +### 1. 
ControlPlaneAdapter + +Engine interface today: + +- `HandleHeartbeat(serverID, volumes)` +- `HandleFailover(deadServerID)` + +Real mapping should be: + +- master-side source: + - `weed/server/master_grpc_server.go` + - `weed/server/master_block_failover.go` + - `weed/server/master_block_registry.go` +- volume-server side sink: + - assignment receive/apply path in `weed/storage/blockvol/` + +Recommended real shape: + +- do not literally push raw heartbeat messages into the engine +- instead introduce a thin adapter that converts confirmed master assignment state into: + - stable `ReplicaID` + - endpoint set + - epoch + - recovery target kind + +That keeps master as control owner and the engine as execution owner. + +Important note: + +- the adapter should treat heartbeat as the transport carrier, not as the final protocol shape +- block-control semantics should be made explicit over that carrier +- if a later phase concludes that heartbeat-only carriage is too coarse, that should be a separate design decision after the real hardening path is measured + +### 2. StorageAdapter + +Engine interface today: + +- `GetRetainedHistory()` +- `PinSnapshot(lsn)` / `ReleaseSnapshot(pin)` +- `PinWALRetention(startLSN)` / `ReleaseWALRetention(pin)` +- `PinFullBase(committedLSN)` / `ReleaseFullBase(pin)` + +Real mapping should be: + +- retained history source: + - current WAL head/tail/checkpoint state from `weed/storage/blockvol/blockvol.go` + - recovery helpers in `weed/storage/blockvol/recovery.go` +- WAL retention pin: + - existing retention-floor / replica-aware WAL retention machinery around `shipperGroup` +- snapshot pin: + - existing snapshot/checkpoint artifacts in `blockvol` +- full-base pin: + - explicit pinned full-extent export or equivalent consistent base handle from `blockvol` + +Important constraint: + +- `Phase 07` must not fake this by reconstructing `RetainedHistory` from tests or metadata alone + +### 3. 
Execution Driver / Executor hookup + +Engine side already has: + +- planner/executor split in `sw-block/engine/replication/driver.go` +- stepwise executors in `sw-block/engine/replication/executor.go` + +Real mapping should be: + +- engine planner decides: + - zero-gap / catch-up / rebuild + - trusted-base requirement + - replayable-tail requirement +- blockvol runtime performs: + - actual WAL catch-up transport + - actual snapshot/base transfer + - actual truncation / apply operations + +Recommended split: + +- engine owns contract and state transitions +- blockvol adapter owns concrete I/O work + +## First-Slice Acceptance Rule + +For the first integration slice, this is a hard rule: + +- `blockvol` may execute recovery I/O +- `blockvol` must not own recovery policy + +Concretely, `blockvol` must not decide: + +1. zero-gap vs catch-up vs rebuild +2. trusted-base validity +3. replayable-tail sufficiency +4. whether rebuild fallback is required + +Those decisions must remain in the V2 engine. + +The bridge may translate engine decisions into concrete blockvol actions, but it must not re-decide recovery policy underneath the engine. + +## First Product Path + +The first product path should be: + +- `RF=2` block volume replication on the existing heartbeat/assignment loop +- primary + one replica +- failover / reconnect / changed-address handling +- rebuild as the formal non-catch-up recovery path + +This is the right first path because it exercises the core correctness boundary without introducing N-replica coordination complexity too early. + +## What Must Be Replaced First + +Current engine-stage pieces that are still mock/test-only or too abstract: + +### Replace first + +1. `mockStorage` in engine tests +- replace with a real `blockvol`-backed `StorageAdapter` + +2. synthetic control events in engine tests +- replace with assignment-driven events from the real master/volume-server path + +3. 
convenience recovery completion wrappers +- keep them test-only +- real integration should use planner + executor + storage work loop + +### Can remain temporarily abstract in Phase 07 P0/P1 + +1. `ControlPlaneAdapter` exact public shape +- can remain thin while the integration path is being chosen + +2. async production scheduler details +- executor can still be driven by a service loop before full background-task architecture is finalized + +## Recommended Concrete Modules + +### Engine stays here + +- `sw-block/engine/replication/` + +### First real adapter package should be added near blockvol + +Recommended initial location: + +- `weed/storage/blockvol/v2bridge/` + +Reason: + +- keeps V2 engine independent under `sw-block/` +- keeps real-system glue close to blockvol storage truth +- avoids copying engine logic into `weed/` + +Suggested contents: + +1. `control_adapter.go` +- convert master assignment / local apply path into engine intents + +2. `storage_adapter.go` +- expose retained history, pin/release, trusted-base export handles from real blockvol state + +3. `executor_bridge.go` +- translate engine executor steps into actual blockvol recovery actions + +4. `observe_adapter.go` +- map engine status/logs into service-visible diagnostics + +## First Failure Replay Set For Phase 07 + +The first real-system replay set should be: + +1. changed-address restart +- current risk: old identity/address coupling reappears in service glue + +2. stale epoch / stale result after failover +- current risk: master and engine disagree on authority timing + +3. unreplayable-tail rebuild fallback +- current risk: service glue over-trusts checkpoint/base availability + +4. plan/execution cleanup after resource failure +- current risk: blockvol-side resource failures leave engine or service state dangling + +5. 
primary failover to replica with rebuild pending on old primary reconnect +- current risk: old V1/V1.5 semantics leak back into reconnect handling + +## Non-Goals For This Slice + +Do not use `Phase 07` to: + +1. widen catch-up semantics +2. add smart rebuild optimizations +3. redesign all blockvol internals +4. replace the full V1 runtime in one move +5. claim production readiness + +## Deliverables For Phase 07 P0 + +A good `P0` delivery should include: + +1. chosen service slice +2. chosen integration path in the current repo +3. adapter-to-module mapping +4. list of test-only adapters to replace first +5. first failure replay set +6. explicit note of what remains outside this first slice + +## Short Form + +`Phase 07 P0` should start with: + +- engine in `sw-block/engine/replication/` +- bridge in `weed/storage/blockvol/v2bridge/` +- first real slice = blockvol primary + one replica on the existing master heartbeat / assignment path +- `ReplicaID = <server-id>/<volume-id>` for the first slice +- `blockvol` executes I/O but does not own recovery policy +- first product path = `RF=2` failover/reconnect/rebuild correctness diff --git a/sw-block/design/v2-algorithm-overview.md b/sw-block/design/v2-algorithm-overview.md new file mode 100644 index 000000000..8e88f6b65 --- /dev/null +++ b/sw-block/design/v2-algorithm-overview.md @@ -0,0 +1,686 @@ +# V2 Algorithm Overview + +Date: 2026-03-27 +Status: strategic design overview +Audience: CEO / owner / technical leadership + +## Purpose + +This document explains the current V2 direction for `sw-block`: + +- what V2 is trying to solve +- why V1 and V1.5 are not enough as the long-term architecture +- why a WAL-based design is still worth pursuing +- how V2 compares with major market and paper directions +- how simulation and the real test runner systematically build confidence + +This is not a phase report and not a production-commitment document. + +It is the high-level technical rationale for the V2 line. 
+ +## Relationship To Other Documents + +| Document | Role | +|----------|------| +| `v1-v15-v2-comparison.md` | Detailed comparison of the three lines | +| `v2-acceptance-criteria.md` | Protocol validation bar | +| `v2_scenarios.md` | Scenario backlog and simulator mapping | +| `v2-open-questions.md` | Remaining algorithmic questions | +| `protocol-development-process.md` | Method for protocol work | +| `learn/projects/sw-block/algorithm_overview.md` | Current V1/V1.5 system review | +| `learn/projects/sw-block/design/algorithm_survey.md` | Paper and vendor survey | +| `learn/projects/sw-block/test/README.md` | Real test runner overview | +| `learn/projects/sw-block/test/test-platform-review.md` | Test platform maturity and standalone direction | + +## 1. Executive Summary + +The current judgment is: + +- `V1` proved that the basic WAL-based replicated block model can work. +- `V1.5` materially improved real recovery behavior and now has stronger operational evidence on real hardware. +- `V2` exists because the next correctness problems should not be solved by incremental local fixes. They should be made explicit in the protocol itself. + +The central V2 idea is simple: + +- short-gap recovery should be explicit +- stale authority should be explicitly fenced +- catch-up vs rebuild should be an explicit decision +- recovery ownership should be a protocol object, not an implementation accident + +`V2` is not yet a production engine. But it is already the stronger architectural direction. + +The correct strategic posture today is: + +- continue `V1.5` as the production line +- continue `V2` as the long-term architecture line +- continue WAL investigation because we now have a serious validation framework +- if prototype evidence later shows a structural flaw, evolve to `V2.5` before heavy implementation + +## 2. 
The Real Problem V2 Tries To Solve + +At the frontend, a block service looks simple: + +- `write` +- `flush` / `sync` +- failover +- recovery + +But the real difficulty is not the frontend verb set. The real difficulty is the asynchronous distributed boundary between: + +- local WAL append on the primary +- durable progress on replicas +- client-visible commit / sync truth +- failover and promotion safety +- recovery after lag, restart, endpoint change, or timeout + +This is the root reason V2 exists. + +The project has already learned that correctness problems in block storage do not usually come from the happy path. They come from: + +- a replica going briefly down and coming back +- a replica coming back on a new address +- a delayed stale barrier or stale reconnect result +- a lagging node that is almost, but not quite, recoverable +- a failover decision made on insufficient lineage information + +V2 is the attempt to make those cases first-class protocol behavior instead of post-hoc patching. + +## 3. Why V1 And V1.5 Are Not Enough + +This overview does not need a long retelling of `V1` and `V1.5`. + +What matters is their architectural limit. 
+ +### What `V1` got right + +`V1` proved the basic shape: + +- ordered WAL +- primary-replica replication +- extent-backed storage +- epoch and lease as the first fencing model + +### Why `V1` is not enough + +Its main shortcomings were: + +- short-gap recovery was too weak and too implicit +- lagging replicas too easily fell into rebuild or long degraded states +- changed-address restart was fragile +- stale authority and stale results were not modeled as first-class protocol objects +- the system did not cleanly separate: + - current WAL head + - committed prefix + - recoverable retained range + - stale or divergent replica tail + +### Why `V1.5` is still not enough + +`V1.5` fixed several real operational problems: + +- retained-WAL catch-up +- same-address reconnect +- `sync_all` correctness on real tests +- rebuild fallback after unrecoverable gap +- control-plane refresh after changed-address restart + +Those fixes matter, and they are why `V1.5` is the stronger production line today. + +But `V1.5` is still not the long-term architecture because its recovery model remains too incremental: + +- reconnect logic is still layered onto an older shipper model +- recovery ownership was discovered as a bug class before it became a protocol object +- catch-up vs rebuild became clearer, but still not clean enough as a top-level protocol contract +- the system still looks too much like "repair V1" rather than "define the next replication model" + +### What `V2` changes + +`V2` is not trying to invent a completely different storage model. + +It is trying to make the critical parts explicit: + +- recovery ownership +- lineage-safe recovery boundary +- catch-up vs rebuild classification +- per-replica sender authority +- stale-result rejection +- explicit recovery orchestration + +So the correct comparison is still: + +- `V1.5` is stronger operationally today +- `V2` is stronger architecturally today + +That is not a contradiction. 
It is the right split between a current production line and the next architecture line. + +```mermaid +flowchart TD + V1[V1] + V15[V1_5] + V2[V2] + realFailures[RealFailures] + realTests[RealHardwareEvidence] + simAndProto[SimulationAndPrototype] + + V1 --> V15 + V15 --> V2 + realFailures --> V15 + realFailures --> V2 + V15 --> realTests + V2 --> simAndProto +``` + +## 4. How V2 Solves WAL And Extent Synchronization + +The core V2 question is not simply "do we keep WAL?" + +The real question is: + +**how do WAL and extent stay synchronized across primary and replica while preserving both stability and performance?** + +This is the center of the V2 design. + +### 4.1 The basic separation of roles + +V2 treats the storage path as two different but coordinated layers: + +- **WAL** is the ordered truth for recent history +- **extent** is the stable materialized image + +WAL is used for: + +- strict write ordering +- local crash recovery +- short-gap replica catch-up +- durable progress accounting through `LSN` + +Extent is used for: + +- stable read image +- long-lived storage +- checkpoint and base-image creation +- long-gap recovery only through a real checkpoint/snapshot base, not through guessing from the current live extent + +This separation is the first stability rule: + +- do not ask current extent to behave like historical state +- do not ask WAL to be the only long-range recovery mechanism forever + +### 4.2 Primary-replica synchronization model + +The intended V2 steady-state model is: + +1. primary allocates monotonic `LSN` +2. primary appends ordered WAL locally +3. primary enqueues the record to per-replica sender loops +4. replicas receive in order and advance explicit progress +5. barrier/sync uses **durable replica progress**, not optimistic send progress +6. 
flusher later materializes WAL-backed dirty state into extent + +The local WAL-to-extent lifecycle can be understood as: + +```mermaid +stateDiagram-v2 + [*] --> WalAppended + WalAppended --> SenderQueued + SenderQueued --> ReplicaReceived + ReplicaReceived --> ReplicaDurable + ReplicaDurable --> SyncEligible + SyncEligible --> ExtentMaterialized + ExtentMaterialized --> CheckpointAdvanced + + note right of WalAppended + Ordered local WAL exists + and defines the write LSN + end note + + note right of ReplicaDurable + Replica durable progress + is now explicit + end note + + note right of ExtentMaterialized + Flusher moves stable data + from WAL-backed dirty state + into extent + end note +``` + +The critical synchronization rule is: + +- **client-visible sync truth must follow durable replica progress** +- not local send progress +- not local WAL head +- not "replica probably received it" + +This is why V2 uses a lineage-safe recovery target such as `CommittedLSN` instead of a looser notion like "current primary head." + +### 4.2.1 Sync mode and result model + +V2 also makes the sync-result logic more explicit. + +- `best_effort` should succeed after the primary has reached its local durability point, even if replicas are degraded. +- `sync_all` should succeed only when all required replicas are durable through the target boundary. +- `sync_quorum` should succeed only when a true durable quorum exists through the target boundary. 
+ +This decision path can be presented as: + +```mermaid +flowchart TD + writeReq[WriteAndSyncRequest] + localDurable[PrimaryLocalDurable] + barrierEval[EvaluateReplicaDurableProgress] + bestEffortAck[best_effortAck] + syncAllAck[sync_allAck] + syncQuorumAck[sync_quorumAck] + rejectOrBlock[RejectOrBlock] + + writeReq --> localDurable + localDurable --> bestEffortAck + localDurable --> barrierEval + + barrierEval -->|"allRequiredReplicasDurable"| syncAllAck + barrierEval -->|"durableQuorumExists"| syncQuorumAck + barrierEval -->|"notEnoughDurableReplicas"| rejectOrBlock +``` + +The key point is that sync success is no longer inferred from send progress or socket health. +It is derived from explicit durable progress at the right safety boundary. + +### 4.3 Why this should be stable + +This model is designed to be stable because the dangerous ambiguities are separated: + +- **write ordering** is carried by WAL and `LSN` +- **durability truth** is carried by barrier / flushed progress +- **recovery ownership** is carried by sender + recovery attempt identity +- **catch-up vs rebuild** is an explicit classification, not an accidental timeout side effect +- **promotion safety** depends on committed prefix and lineage, not on whichever node looks newest + +In other words, V2 stability comes from reducing hidden coupling. + +The design tries to remove cases where one piece of state silently stands in for another. + +### 4.4 Why this can still be high-performance + +The performance argument is not that V2 is magically faster in all cases. 
+ +The argument is narrower and more realistic: + +- keep the primary write path simple: + - ordered local WAL append + - enqueue to per-replica sender loops + - no heavy inline recovery logic in foreground writes +- keep most complexity off the healthy hot path: + - sender ownership + - reconnect classification + - catch-up / rebuild decisions + - timeout and stale-result fencing + live mostly in recovery/control paths +- use WAL for what it is good at: + - recent ordered delta + - short-gap replay +- stop using WAL as the answer to every lag problem: + - long-gap recovery should move toward checkpoint/snapshot base plus tail replay + +So the V2 performance thesis is: + +- **healthy steady-state should remain close to V1.5** +- **degraded/recovery behavior should become much cleaner** +- **short-gap recovery should be cheaper than rebuild** +- **long-gap recovery should stop forcing an unbounded WAL-retention tax** + +That is a much stronger and more believable claim than saying "V2 will just be faster." + +### 4.5 Why WAL is still worth choosing + +The reason to keep the WAL-based direction is that it gives the best foundation for this exact synchronization problem: + +- explicit order +- explicit history +- explicit committed prefix +- explicit short-gap replay +- explicit failover reasoning + +WAL is risky only if the design blurs: + +- local write acceptance +- replica durable progress +- committed boundary +- recoverable retained history + +V2 exists precisely to stop blurring those things. + +So the current project position is: + +- WAL is not automatically safe +- but WAL is still the most promising base for this block service +- because the project now has enough real evidence, simulator coverage, and prototype work to investigate it rigorously + +## 5. Comparison With Market And Papers + +The current V2 direction is not chosen because other vendors are wrong. It is chosen because other directions solve different problems and carry different costs. 
+ +### Ceph / RBD style systems + +Ceph-style block systems avoid this exact per-volume replicated WAL shape. They gain: + +- deep integration with object-backed distributed storage +- mature placement and recovery machinery +- strong cluster-scale distribution logic + +But they pay elsewhere: + +- more system layers +- more object-store and peering complexity +- a heavier operational and conceptual model + +This is not a free simplification. It is a different complexity trade. + +For `sw-block`, the design choice is to keep a narrower software block service with more explicit per-volume replication semantics instead of inheriting the full distributed object-backed block complexity stack. + +### PolarFS / ParallelRaft style work + +These systems explore more aggressive ordering and apply strategies: + +- out-of-order or conflict-aware work +- deeper parallelism +- more sophisticated log handling + +They are valuable references, especially for: + +- LBA conflict reasoning +- recovery and replay cost thinking +- future flusher parallelization ideas + +But they also introduce a much heavier correctness surface. + +The project does not currently want to buy that complexity before fully proving the simpler strict-order path. + +### AWS chain replication / EBS-style lessons + +Chain replication and related work are attractive because they address real bandwidth and recovery concerns: + +- Primary NIC pressure +- forwarding topology +- cleaner scaling for RF=3 + +This is one of the more plausible borrowable directions later. + +But it changes: + +- latency profile +- failure handling +- barrier semantics +- operational topology + +So it belongs to a later architecture stage, not to the current V2 core proof. 
+ +### The actual strategic choice + +The project is deliberately choosing: + +- a narrower software-first block design +- explicit per-volume correctness +- strict reasoning before performance heroics +- validation before feature expansion + +That is not conservatism for its own sake. It is how to build a block product that can later be trusted. + +## 6. Why This Direction Fits SeaweedFS And Future Standalone sw-block + +`sw-block` started inside SeaweedFS, but V2 is already being shaped as the next standalone block service line. + +That means the architecture should preserve two things at once: + +### What should remain compatible + +- placement and topology concepts where they remain useful +- explainable control-plane contracts +- operational continuity with the SeaweedFS ecosystem + +### What should become more block-specific + +- replication correctness +- recovery ownership +- recoverability classification +- block-specific test and evidence story + +So the current direction is: + +- use SeaweedFS as the practical ecosystem and experience base +- but shape V2 as a true block-service architecture, not as a minor sub-feature of `weed/` + +This is why the V2 line belongs under `sw-block/` rather than as a direct patch path inside the existing production tree. + +## 7. The Systematic Validation Method + +The second major reason the current direction is rational is the validation method. 
+ +The project is no longer relying on: + +- implement first +- discover behavior later +- patch after failure + +Instead, the intended ladder is: + +- contract and invariants +- scenario backlog +- simulator +- timer/race simulator +- standalone prototype +- real engine test runner + +```mermaid +flowchart TD + contract[ContractAndInvariants] + scenarios[ScenarioBacklog] + distsim[distsim] + eventsim[eventsim] + prototype[enginev2Prototype] + runner[RealTestRunner] + confidence[SystemAndProductConfidence] + + contract --> scenarios + scenarios --> distsim + scenarios --> eventsim + distsim --> prototype + eventsim --> prototype + prototype --> runner + runner --> confidence +``` + +This is the right shape for a risky block-storage algorithm: + +- simulation for protocol truth +- prototype for executable truth +- real runner for product/system truth + +## 8. What The Simulation System Proves + +The simulation system exists to answer: + +- what should happen +- what must never happen +- which V1/V1.5 shapes fail +- why the V2 shape is better + +### `distsim` + +`distsim` is the main protocol simulator. + +It is used for: + +- protocol correctness +- state transitions +- stale authority fencing +- promotion and lineage safety +- catch-up vs rebuild +- changed-address restart +- candidate safety +- reference-state checking + +### `eventsim` + +`eventsim` is the timing/race layer. 
+ +It is used for: + +- barrier timeout behavior +- catch-up timeout behavior +- reservation timeout behavior +- same-tick and delayed event ordering +- stale timeout effects + +### What the simulator is good at + +It is especially strong for proving: + +- stale traffic rejection +- explicit recovery boundaries +- timeout/race semantics +- failover correctness at committed prefix +- why old authority must not mutate current lineage + +### What the simulator does not prove + +It does not prove: + +- real TCP behavior +- real OS scheduling behavior +- disk timing +- real `WALShipper` integration +- real frontend behavior under iSCSI or NVMe + +So the simulator is not the whole truth. + +It is the algorithm/protocol truth layer. + +## 9. What The Real Test Runner Proves + +The real test runner under `learn/projects/sw-block/test/` is the system and product validation layer. + +It is not merely QA support. It is a core part of whether the design can be trusted. + +### What it covers + +The runner and surrounding test system already span: + +- unit tests +- component tests +- integration tests +- distributed scenarios +- real hardware workflows + +The environment already includes: + +- real nodes +- real block targets +- real fault injection +- benchmark and result capture +- run bundles and scenario traceability + +### Why it matters + +The runner is what tells us whether: + +- the implemented engine behaves like the design says +- the product works under real restart/failover/rejoin conditions +- the operator workflows are credible +- benchmark claims are real rather than accidental + +This is why the runner is best thought of as: + +- implementation truth +- system truth +- product truth + +not just test automation. + +## 10. How Simulation And Test Runner Progress Systematically + +The intended feedback loop is: + +1. V1/V1.5 real failures happen +2. those failures are turned into design requirements +3. scenarios are distilled for simulator use +4. 
the simulator closes protocol ambiguity +5. the standalone prototype closes execution ambiguity +6. the real test runner validates system behavior on real environments +7. new failures or mismatches feed back into design again + +This gives the project two different but complementary truths: + +- `simulation -> algorithm / protocol correctness` +- `test runner -> implementation / system / product correctness` + +That separation is healthy. + +It prevents two common mistakes: + +- trusting design without real behavior +- trusting green system tests without understanding the protocol deeply enough + +## 11. Current Status And Honest Limits + +### What is already strong + +- `V1.5` has materially better recovery behavior than `V1` and stronger operational evidence +- `V2` has stronger architectural structure than `V1.5` +- the simulator has serious acceptance coverage +- the prototype line has already started closing ownership and orchestration risk +- the real test runner is large enough to support serious system validation + +### What is not yet done + +- `V2` is not a production engine +- prototype work is still in early-to-mid stages +- historical-data / recovery-boundary prototype work is not complete +- steady-state performance of `V2` is not yet proven +- real hardware validation of `V2` does not yet exist + +So the correct statement is not: + +- "V2 is already better in production" + +The correct statement is: + +- "V2 is the better long-term architecture, but not yet the stronger deployed engine" + +## 12. 
Why The Current Direction Is Rational + +The current direction is rational because it keeps the right split: + +- `V1.5` continues as the production line today +- `V2` continues as the next architecture line + +This lets the project: + +- keep shipping and hardening what already works +- explore the better architecture without destabilizing the current engine +- use simulation, prototype work, and the real runner to decide whether V2 should become the next real engine + +The final strategic rule should remain: + +- continue WAL investigation because the project now has a credible validation framework +- continue V2 because the architectural evidence is strong +- if prototype evidence later reveals a structural flaw, redesign to `V2.5` before heavy implementation + +That is the disciplined path for a block-storage algorithm. + +## Bottom Line + +If choosing based on current production proof: + +- use `V1.5` + +If choosing based on long-term protocol quality: + +- choose `V2` + +If choosing based on whether WAL should still be investigated: + +- yes, because the project now has the right validation stack to investigate it responsibly + +That is the current strategic answer. 
diff --git a/sw-block/design/v2-algorithm-overview.zh.md b/sw-block/design/v2-algorithm-overview.zh.md new file mode 100644 index 000000000..32e9d2bc4 --- /dev/null +++ b/sw-block/design/v2-algorithm-overview.zh.md @@ -0,0 +1,660 @@ +# V2 算法综述 + +日期:2026-03-27 +状态:战略级设计综述 +读者:CEO / owner / 技术管理层 + +## 文档目的 + +本文用于说明 `sw-block` 当前 `V2` 方向背后的核心判断: + +- `V2` 到底想解决什么问题 +- 为什么 `V1` / `V1.5` 不足以作为长期架构 +- 为什么我们仍然认为基于 `WAL` 的方向值得继续走 +- `V2` 与主要市场方案 / 论文路线相比的取舍是什么 +- `simulation` 与真实 `test runner` 如何形成系统化验证闭环 + +这不是 phase 汇报,也不是对生产可用性的承诺文档。 + +它是对 `V2` 这条架构线的高层技术解释。 + +## 与其他文档的关系 + +| 文档 | 作用 | +|------|------| +| `v1-v15-v2-comparison.md` | 三条技术线的详细比较 | +| `v2-acceptance-criteria.md` | V2 协议验证下限 | +| `v2_scenarios.md` | 场景清单与 simulator 覆盖 | +| `v2-open-questions.md` | 仍未关闭的算法问题 | +| `protocol-development-process.md` | 协议开发方法论 | +| `learn/projects/sw-block/algorithm_overview.md` | 当前 V1/V1.5 系统级算法综述 | +| `learn/projects/sw-block/design/algorithm_survey.md` | 论文 / vendor 调研与借鉴项 | +| `learn/projects/sw-block/test/README.md` | 真实测试系统入口 | +| `learn/projects/sw-block/test/test-platform-review.md` | test runner 的平台化方向 | + +## 1. 执行摘要 + +当前最准确的结论是: + +- `V1` 证明了基于 `WAL` 的复制块存储基本路径是可行的。 +- `V1.5` 在真实恢复场景上已经比 `V1` 明显更强,并且有真实硬件上的运行证据。 +- `V2` 的意义,不是在已有逻辑上继续打补丁,而是把最关键的恢复与一致性问题直接上升为协议对象。 + +`V2` 的核心想法可以概括为: + +- 短间隙恢复要显式 +- 过期 authority 要显式 fencing +- `catch-up` 与 `rebuild` 的边界要显式 +- 恢复 ownership 要成为协议的一部分,而不是实现细节里的偶然行为 + +所以今天正确的策略是: + +- 继续用 `V1.5` 作为当前生产线 +- 继续用 `V2` 作为长期架构线 +- 继续认真研究 `WAL` 路线,因为现在我们已经具备了可信的验证框架 +- 如果后续 prototype 证明 `V2` 有结构性缺陷,就应当先演进到 `V2.5`,而不是硬着头皮直接实现 + +## 2. 
V2 真正要解决的问题 + +从前端看,块存储似乎只有几个简单动作: + +- `write` +- `flush` / `sync` +- failover +- recovery + +但真正难的,不是这些前端动作本身,而是异步分布式边界: + +- primary 本地 WAL 追加 +- replica 端 durable progress +- client 可见的 sync / commit 真值 +- failover / promote 时的数据边界 +- lag、restart、address change、timeout 后的恢复正确性 + +这才是 `V2` 存在的根因。 + +项目已经反复验证过:块存储真正的 bug 通常不出在 happy path,而出在: + +- replica 短暂掉线又回来 +- replica 重启后地址变化 +- 延迟到达的 stale barrier / stale reconnect 结果 +- 一个 lagging replica 看起来“差一点点就能恢复” +- failover 时基于错误 lineage 做了 promote + +`V2` 就是要把这些情况变成协议的第一公民,而不是上线后再继续被动修补。 + +## 3. 为什么 V1 / V1.5 不够 + +这份综述不需要长篇回顾 `V1` 和 `V1.5` 的所有细节。 + +只需要讲清它们为什么不足以作为长期架构。 + +### `V1` 做对了什么 + +`V1` 建立了最重要的基础: + +- 严格有序的 `WAL` +- primary-replica 复制 +- 基于 `epoch + lease` 的初步 fencing +- 以 `extent` 作为稳定数据面,而不是一开始就做全日志结构 + +### `V1` 的不足 + +它的关键短板主要在恢复与退化场景: + +- 短 outage 很容易演化成 rebuild 或长期 degraded +- 恢复结构过于隐式 +- changed-address restart 脆弱 +- stale authority / stale result 还不是协议层的显式对象 +- 系统没有足够清晰地区分: + - 当前 head + - committed prefix + - recoverable retained range + - stale / divergent tail + +### `V1.5` 的不足 + +`V1.5` 已经解决了不少真实问题: + +- retained-WAL catch-up +- same-address reconnect +- `sync_all` 的真实行为 +- catch-up 失败后的 rebuild fallback +- changed-address restart 之后的 control-plane 刷新 + +所以它今天是更强的生产线。 + +但它仍然不是长期架构,因为它本质上仍然是增量修复: + +- reconnect 逻辑仍然附着在旧 shipper 模型上 +- 恢复 ownership 是先作为 bug 暴露出来,再逐步被抽象 +- `catch-up` vs `rebuild` 更清楚了,但还不够成为协议顶层契约 +- 整体感觉仍然更像“继续修 V1”,而不是“定义下一代复制协议” + +### `V2` 的变化 + +`V2` 不是重新发明一个完全不同的存储模型。 + +它的目标是把最关键的东西显式化: + +- recovery ownership +- lineage-safe recovery boundary +- `catch-up` / `rebuild` 分类 +- per-replica sender authority +- stale-result rejection +- 明确的 recovery orchestration + +因此最诚实的比较是: + +- `V1.5` 今天在运行证据上更强 +- `V2` 今天在架构质量上更强 + +这不是矛盾,而是“当前生产线”和“下一代架构线”应有的分工。 + +```mermaid +flowchart TD + V1[V1] + V15[V1_5] + V2[V2] + realFailures[真实故障] + realTests[真实硬件验证] + simAndProto[仿真与原型] + + V1 --> V15 + V15 --> V2 + realFailures --> V15 + realFailures --> V2 + V15 --> realTests + V2 --> 
simAndProto +``` + +## 4. V2 如何解决 WAL 与 Extent 的同步问题 + +`V2` 的核心问题不是“还要不要 WAL”。 + +真正的问题是: + +**primary 与 replica 之间,WAL 和 extent 如何保持同步,同时还能兼顾稳定性与性能。** + +这才是 `V2` 的中心。 + +### 4.1 基本分工 + +`V2` 把数据路径拆成两个既分离又协作的层: + +- **WAL**:近期历史的有序真相 +- **extent**:稳定的物化数据镜像 + +WAL 负责: + +- 严格写入顺序 +- 本地崩溃恢复 +- 短间隙 replica catch-up +- 基于 `LSN` 的 durable progress 计量 + +Extent 负责: + +- 稳定读镜像 +- 长期存储 +- checkpoint / base image 生成 +- 长间隙恢复时作为真正 base image 的来源 + +第一条稳定性原则就是: + +- 不要让当前 extent 冒充历史状态 +- 不要让 WAL 永远承担所有长距离恢复责任 + +### 4.2 Primary-replica 同步模型 + +`V2` 理想中的 steady-state 同步模型是: + +1. primary 分配单调递增的 `LSN` +2. primary 本地顺序追加 `WAL` +3. primary 把记录放入 per-replica sender loop +4. replica 按顺序接收并推进显式 progress +5. `barrier/sync` 依赖 replica 的 durable progress,而不是 optimistic send progress +6. flusher 再把 WAL-backed dirty state 物化到 extent + +本地 `WAL -> extent` 生命周期可以理解为: + +```mermaid +stateDiagram-v2 + [*] --> WalAppended + WalAppended --> SenderQueued + SenderQueued --> ReplicaReceived + ReplicaReceived --> ReplicaDurable + ReplicaDurable --> SyncEligible + SyncEligible --> ExtentMaterialized + ExtentMaterialized --> CheckpointAdvanced +``` + +这里最关键的规则是: + +- **client 可见的 sync 真值必须跟随 durable replica progress** +- 不能跟随 send progress +- 不能跟随 local WAL head +- 不能跟随“看起来 replica 应该已经收到了” + +这也是为什么 `V2` 使用像 `CommittedLSN` 这样的 lineage-safe 边界,而不是松散的“当前 primary head”。 + +### 4.2.1 不同 sync mode 如何判断结果 + +`V2` 让不同 sync mode 的成功条件变得更明确: + +- `best_effort`:primary 达到本地 durability point 后即可成功,replica 可以后台恢复 +- `sync_all`:所有 required replica 都要在目标边界上 durable +- `sync_quorum`:必须存在真实 durable quorum + +其判断路径可以表示为: + +```mermaid +flowchart TD + writeReq[WriteAndSyncRequest] + localDurable[PrimaryLocalDurable] + barrierEval[EvaluateReplicaDurableProgress] + bestEffortAck[best_effort成功] + syncAllAck[sync_all成功] + syncQuorumAck[sync_quorum成功] + rejectOrBlock[阻塞或失败] + + writeReq --> localDurable + localDurable --> bestEffortAck + localDurable --> barrierEval + + barrierEval -->|"allRequiredReplicasDurable"| 
syncAllAck + barrierEval -->|"durableQuorumExists"| syncQuorumAck + barrierEval -->|"notEnoughDurableReplicas"| rejectOrBlock +``` + +这意味着 sync 结果不再依赖: + +- socket 看起来还活着 +- sender 好像还在发 +- replica 似乎“差不多收到了” + +而是依赖显式 durable progress。 + +### 4.3 为什么这个设计应该更稳定 + +它试图把最危险的模糊边界拆开: + +- **写入顺序** 由 `WAL + LSN` 表达 +- **durability truth** 由 barrier / flushed progress 表达 +- **recovery ownership** 由 sender + recovery attempt identity 表达 +- **catch-up vs rebuild** 由显式分类表达 +- **promotion safety** 由 committed prefix 与 lineage 表达 + +也就是说,`V2` 的稳定性来自于减少隐式耦合。 + +### 4.4 为什么它仍然可以有高性能 + +这里不能夸大说 `V2` 一定在所有情况下都更快。 + +更准确的性能论点是: + +- 保持 primary 前台写路径简单: + - 本地顺序 `WAL append` + - 投递到 per-replica sender loop + - 不把复杂恢复逻辑塞进前台写路径 +- 把复杂度主要放在健康热路径之外: + - sender ownership + - reconnect classification + - catch-up / rebuild decision + - timeout 和 stale-result fencing + 主要都在 recovery / control path +- 让 WAL 只承担它擅长的工作: + - 近期 ordered delta + - 短间隙 replay +- 不再让 WAL 承担所有长距离恢复: + - 长间隙恢复转向 checkpoint/snapshot base + tail replay + +所以 `V2` 的性能论点应该是: + +- **健康 steady-state 应该尽量接近 `V1.5`** +- **退化与恢复路径会更干净** +- **短间隙恢复会比 rebuild 更便宜** +- **长间隙恢复不再逼迫系统支付无上限的 WAL retention 税** + +这比“V2 天然更快”要可信得多。 + +### 4.5 为什么仍然选择 WAL + +之所以还继续走 WAL,是因为它仍然是解决这个同步问题最有力的基础: + +- 显式顺序 +- 显式历史 +- 显式 committed prefix +- 显式短间隙 replay +- 显式 failover reasoning + +只有当设计把以下概念混淆时,WAL 才会变得危险: + +- 本地写入接受 +- replica durable progress +- committed boundary +- recoverable retained history + +而 `V2` 的存在,正是为了不再混淆这些东西。 + +## 5. 
与市场和论文路线的比较 + +选择 `V2` 这条路线,并不是因为别的 vendor 都错了,而是因为他们解决的是不同问题,也承担了不同复杂度。 + +### Ceph / RBD 路线 + +Ceph/RBD 避开了这种 per-volume replicated WAL 形态。 + +它获得的是: + +- 对象存储深度一体化 +- 成熟的 placement 与 recovery 体系 +- 更强的集群级分布能力 + +但代价是: + +- 系统层次更多 +- object-store / peering 复杂度更重 +- 运维与概念模型更重 + +所以这不是“更简单”,而是把复杂度迁移到了别处。 + +对 `sw-block` 而言,当前选择是: + +- 保持更窄的软件块服务模型 +- 用更显式的 per-volume correctness 来换取更可控的复杂度 + +### PolarFS / ParallelRaft 路线 + +这类系统探索更激进的顺序与并行策略: + +- conflict-aware 或乱序并行 +- 更深的日志并行 +- 更复杂的 apply / replay 机制 + +它们在未来仍然值得借鉴: + +- LBA conflict reasoning +- replay 成本与恢复成本 +- flusher 并行优化 + +但它们也明显扩大了正确性边界。 + +在当前阶段,项目不应该在还没彻底证明严格顺序模型之前,就过早买入这类复杂度。 + +### AWS 链式复制 / EBS 类经验 + +链式复制之类的路线吸引人,是因为它们能解决真实问题: + +- Primary NIC 压力 +- forward 拓扑 +- RF=3 时更好的扩展性 + +这是后续较有希望借鉴的方向。 + +但它会改变: + +- 延迟画像 +- 失败处理方式 +- barrier 语义 +- 运维拓扑 + +所以它属于更后面的架构阶段,而不是当前 V2 核心证明。 + +### 当前的真实选择 + +项目当前选择的是: + +- 更窄的软件优先 block 设计 +- 明确的 per-volume correctness +- 在性能英雄主义之前先把逻辑讲清 +- 在功能扩张之前先建立验证闭环 + +这不是保守,而是为了让这个 block 产品未来真的值得信任。 + +## 6. 为什么这条方向适合 SeaweedFS 与未来独立 sw-block + +`sw-block` 起步于 SeaweedFS,但 `V2` 已经在按下一代独立 block service 的方向成形。 + +这意味着架构上要同时保留两类东西: + +### 需要保持兼容的部分 + +- placement / topology 这些概念 +- 可解释的 control-plane contract +- 与 SeaweedFS 生态的运维连续性 + +### 应该更 block-specific 的部分 + +- replication correctness +- recovery ownership +- recoverability classification +- block 特有的 test / evidence 体系 + +因此当前方向不是“继续把 V2 当成 weed 里的一个 patch”,而是: + +- 以 SeaweedFS 作为经验与生态基础 +- 同时把 `V2` 逐步塑造成真正独立的块服务架构 + +## 7. 
系统化验证方法 + +当前方向之所以合理,另一个重要原因是验证方法本身已经系统化。 + +项目不再依赖: + +- 先实现 +- 再观察 +- 出 bug 再修 + +而是依赖如下层次: + +- contract / invariants +- scenario backlog +- simulator +- timer/race simulator +- standalone prototype +- real engine test runner + +```mermaid +flowchart TD + contract[ContractAndInvariants] + scenarios[ScenarioBacklog] + distsim[distsim] + eventsim[eventsim] + prototype[enginev2Prototype] + runner[RealTestRunner] + confidence[SystemAndProductConfidence] + + contract --> scenarios + scenarios --> distsim + scenarios --> eventsim + distsim --> prototype + eventsim --> prototype + prototype --> runner + runner --> confidence +``` + +这对于一个高风险块存储算法是非常正确的结构: + +- simulation 用来证明协议逻辑 +- prototype 用来证明执行语义 +- 真实 runner 用来证明系统与产品行为 + +## 8. Simulation 系统证明什么 + +simulation 系统的目标是回答: + +- 应该发生什么 +- 绝不能发生什么 +- 为什么旧设计会失败 +- 为什么 V2 更好 + +### `distsim` + +`distsim` 是主协议仿真器,主要用于: + +- 协议正确性 +- 状态迁移 +- stale authority fencing +- promotion / lineage safety +- catch-up vs rebuild +- changed-address restart +- candidate safety +- reference-state checking + +### `eventsim` + +`eventsim` 是时间 / race 层,主要用于: + +- barrier timeout +- catch-up timeout +- reservation timeout +- 同 tick / 延迟事件顺序 +- stale timeout 的影响 + +### simulation 擅长证明什么 + +它特别擅长证明: + +- stale traffic rejection +- recovery boundary 的显式性 +- timeout/race 语义 +- committed prefix 下的 failover 正确性 +- 旧 authority 不能修改新 lineage + +### simulation 不证明什么 + +它不证明: + +- 真实 TCP 行为 +- 真实 OS 调度 +- 磁盘时序 +- 真正的 `WALShipper` 集成 +- iSCSI / NVMe 前端的真实行为 + +因此 simulation 不是全部真相。 + +它是 **算法 / 协议真相层**。 + +## 9. 
真实 test runner 证明什么 + +`learn/projects/sw-block/test/` 下的真实 test runner 是系统与产品验证层。 + +它不只是 QA 工具,而是设计是否可信的重要组成部分。 + +### 它覆盖什么 + +当前 runner 与周边测试体系已经覆盖: + +- unit +- component +- integration +- distributed scenario +- 真实硬件 workflow + +而且环境已经包含: + +- 真实节点 +- 真实 block target +- 真实 fault injection +- benchmark 与结果采集 +- run bundle 与 scenario traceability + +### 为什么它重要 + +它帮助我们判断: + +- 实际引擎是否按设计运行 +- 产品在真实 restart / failover / rejoin 场景下是否可靠 +- operator workflow 是否可信 +- benchmark 结果是不是有效而非偶然 + +所以 test runner 最好被理解为: + +- implementation truth +- system truth +- product truth + +而不只是“测试脚本框架”。 + +## 10. Simulation 与 test runner 如何系统性推进 + +理想的反馈闭环是: + +1. `V1` / `V1.5` 出现真实故障 +2. 这些故障被转化为设计要求 +3. 再被提炼为 simulator 场景 +4. simulator 关闭协议歧义 +5. standalone prototype 关闭执行歧义 +6. 真实 test runner 在硬件与分布式环境中验证系统行为 +7. 新故障或新偏差再反哺设计 + +这就形成了两类互补真相: + +- `simulation -> algorithm / protocol correctness` +- `test runner -> implementation / system / product correctness` + +这种分层是健康的,因为它避免了两种常见错误: + +- 只相信设计推导,却没有真实行为 +- 只相信系统测试全绿,却没有真正理解协议本身 + +## 11. 当前状态与诚实边界 + +### 现在已经比较强的部分 + +- `V1.5` 相比 `V1` 的恢复能力已经明显增强,并且有真实运行证据 +- `V2` 的架构清晰度已经明显强于 `V1.5` +- simulator 已经有较强的 acceptance 覆盖 +- prototype 已经开始关闭 ownership 与 orchestration 风险 +- 真实 test runner 已经足够大,可以支撑严肃的系统验证 + +### 现在还没有完成的部分 + +- `V2` 还不是生产引擎 +- prototype 仍处于早中期 +- historical-data / recovery-boundary prototype 还没有闭合 +- `V2` steady-state 性能还没有真实证明 +- `V2` 还没有真实硬件上的运行验证 + +所以最准确的话不是: + +- “V2 现在已经在生产上更强” + +而是: + +- “V2 是长期更好的架构,但今天还不是更强的已部署引擎” + +## 12. 
为什么当前方向是理性的 + +当前方向之所以理性,是因为它保持了正确的分工: + +- `V1.5` 继续作为今天的生产线 +- `V2` 继续作为下一代架构线 + +这样项目就可以: + +- 在已有可运行系统上继续交付和加固 +- 在不扰动生产线的前提下认真验证更强的架构 +- 用 simulation、prototype 和真实 runner 来决定 `V2` 是否真能成为下一代引擎 + +最终的战略规则应当保持不变: + +- 继续研究 WAL,因为现在我们已经有可信的验证框架 +- 继续推进 V2,因为架构证据已经很强 +- 如果 prototype 证明 V2 有结构性缺陷,就先演进到 `V2.5`,不要急于重实现 + +## 结论 + +如果按当前生产证据选择: + +- 选择 `V1.5` + +如果按长期协议质量选择: + +- 选择 `V2` + +如果问 WAL 是否还值得继续研究: + +- 值得,因为现在项目已经拥有了足够严肃的验证体系,可以负责任地继续推进 + +这就是当前最合理的技术与战略判断。 diff --git a/sw-block/design/v2-detailed-algorithm.zh.md b/sw-block/design/v2-detailed-algorithm.zh.md new file mode 100644 index 000000000..81d0ce445 --- /dev/null +++ b/sw-block/design/v2-detailed-algorithm.zh.md @@ -0,0 +1,1068 @@ +# V2 详细算法设计 + +日期:2026-03-27 +状态:详细算法草案 +读者:架构设计、simulator、prototype、实现负责人 + +## 1. 文档目的 + +这份文档不是 CEO 综述,也不是 phase 汇报。 + +它的目标是把 `sw-block V2` 的核心算法写成一份更接近“协议规格”的设计文档,回答下面几个问题: + +- 系统里的正式状态对象是什么 +- 写路径如何推进 +- 不同 `sync mode` 如何决定是否可以返回成功 +- replica 掉队后如何决定 `catch-up` 还是 `rebuild` +- primary crash / failover / epoch bump 后哪些状态仍然有效 +- 什么叫做“允许的 WAL-first 可见性”,什么叫做不允许的幽灵状态 + +本文默认接受一个核心前提: + +- **已 durable 的 WAL 是系统正式状态的一部分** + +因此: + +- `visible state` 可以领先于 `checkpoint` +- 只要该状态仍然有合法的恢复依据,它就不是 bug + +真正的错误是: + +- `acked state > recoverable state` +- 或 `visible state > recoverable state` + +## 2. 设计目标 + +V2 的目标不是把所有事情都交给 `WAL`。 + +V2 的目标是: + +1. 用 `WAL` 提供严格顺序、短间隙恢复和明确的 durable history +2. 用 `extent + checkpoint/snapshot` 提供稳定读镜像和长距离恢复基线 +3. 用显式的 `epoch + sender + RecoverySession` 管住恢复 authority +4. 用显式的 `CommittedLSN` 管住对外承诺边界 +5. 用显式的 `catch-up` / `rebuild` 分类避免长期模糊状态 + +## 3. 
核心对象 + +### 3.1 LSN 边界 + +- `HeadLSN` + primary 当前已分配并写入本地 WAL 的最高 LSN + +- `CommittedLSN` + 当前对外可承诺、可用于 failover / recovery 目标的 lineage-safe 边界 + +- `ReplicaReceivedLSN` + replica 已收到并追加的最高 LSN,不代表 durable + +- `ReplicaFlushedLSN` + replica 已 durable 的最高 LSN,是 sync/barrier 判断的正式依据 + +- `CheckpointLSN` + 当前 checkpoint / base snapshot 所代表的稳定物化边界 + +- `RecoverableLSN` + 某节点 crash 之后,仍可由 `checkpoint + retained WAL` 或等价机制恢复出的最高边界 + +### 3.2 存储层对象 + +- `Active WAL` + 当前保留的 WAL 历史,用于: + - 顺序写入 + - crash recovery + - short-gap catch-up + +- `Extent` + 当前运行中的块视图,可以比 checkpoint 更新 + +- `Checkpoint / Snapshot` + 一个真实历史点的稳定镜像,用于: + - rebuild base + - 长距离恢复 + - GC / retention 的正确边界 + +### 3.3 协议层对象 + +- `Epoch` + primary lineage / fencing 边界。旧 epoch 的消息和恢复结果不能修改当前系统。 + +- `Sender` + primary 上对每个 replica 的唯一发送 authority。 + +- `RecoverySession` + 对一个 replica 的一次有界恢复尝试。它必须被: + - 一个 sender 拥有 + - 一个 epoch 约束 + - 一个 session ID 唯一标识 + +- `AssignmentIntent` + orchestrator 对 sender group 的意图输入。它决定: + - 哪些 replica 被保留 + - 哪些 replica 要恢复 + - 恢复目标和 epoch 是什么 + +## 4. 全局结构图 + +```mermaid +flowchart TD + client[Client] + primary[Primary] + wal[Active WAL] + extent[Live Extent] + flusher[Flusher] + cp[Checkpoint Snapshot] + senderGroup[SenderGroup] + r1[Replica Sender R1] + r2[Replica Sender R2] + repl1[Replica 1] + repl2[Replica 2] + orchestrator[Volume Orchestrator] + + client --> primary + primary --> wal + primary --> senderGroup + primary --> extent + senderGroup --> r1 + senderGroup --> r2 + r1 --> repl1 + r2 --> repl2 + wal --> flusher + flusher --> extent + flusher --> cp + orchestrator --> primary + orchestrator --> senderGroup +``` + +这个结构的关键点是: + +- 前台写路径只负责产生顺序和推进正式边界 +- 每个 replica 的恢复执行由自己的 sender/session 管理 +- flusher / checkpoint 负责物化和长期恢复基线 +- orchestrator 负责 volume 级 admission、epoch 和 failover + +## 5. 数据真相与可见性规则 + +V2 必须明确区分 5 种状态: + +1. `visible state` +2. `WAL durable state` +3. `replica durable state` +4. `checkpointed state` +5. 
`committed / acked state` + +它们不是同一个概念。 + +### 5.1 允许的情况 + +下列情况是允许的: + +- `visible state > CheckpointLSN` +- `WAL durable state > CheckpointLSN` +- replica 在后台追赶,extent 已经更“新” + +只要: + +- crash 后这些状态仍有恢复依据 +- client 所收到的 ACK 语义没有被夸大 + +### 5.2 禁止的情况 + +下列情况是 V2 必须阻止的: + +- `AckedLSN > RecoverableLSN` +- `VisibleLSN > RecoverableLSN` +- 根据 socket/send progress 而不是 durable progress 给出 sync 成功 +- replica 实际已不可能 catch-up,却长期停留在 `CatchingUp` + +### 5.3 可执行 invariant + +V2 的 simulator / prototype 至少应围绕下面三条 invariant 展开: + +1. `RecoverabilityInvariant` + 所有已 ACK 的边界在 crash / restart / failover 后仍必须可恢复 + +2. `VisibilityInvariant` + 所有向用户暴露的状态都必须有合法恢复来源 + +3. `CatchUpLivenessInvariant` + replica 要么收敛,要么显式升级为 `NeedsRebuild` + +## 6. 写路径算法 + +### 6.1 写路径目标 + +写路径要满足两件事: + +- 维持 primary 本地严格顺序 +- 不把复杂恢复逻辑塞进前台热路径 + +### 6.2 写入步骤 + +对一次逻辑写入 `Write(block, value)`,V2 的基本步骤是: + +1. primary 检查自己是否拥有当前 `epoch` 的 serving authority +2. 分配下一个单调递增的 `LSN` +3. 生成 `WALRecord{LSN, Epoch, Block, Value, RecoveryClass}` +4. 本地 durable append 到 `Active WAL` +5. 更新 primary 的运行期可见状态 +6. 把该记录放入每个 replica 的 sender queue +7. 根据 volume 的 `sync mode` 决定是否需要 barrier / durable quorum / all replicas +8. 
在满足对应 mode 条件后返回成功,否则阻塞或失败 + +### 6.3 写路径图 + +```mermaid +sequenceDiagram + participant C as Client + participant P as Primary + participant W as Local WAL + participant SG as SenderGroup + participant R as Replica Sender + participant X as Replica + + C->>P: Write(block, value) + P->>P: allocate LSN + P->>W: durable append WALRecord + P->>P: update live visible state + P->>SG: enqueue record for each replica + SG->>R: ordered per-replica send + R->>X: replicate WALRecord + X->>X: append and later flush + P->>P: evaluate sync mode + P-->>C: ACK or block/fail +``` + +### 6.4 写路径伪算法 + +```text +OnWrite(req): + require PrimaryState == Serving + require LocalEpoch == VolumeEpoch + + lsn = AllocateNextLSN() + rec = BuildWALRecord(lsn, req, epoch) + + DurableAppendLocalWAL(rec) + ApplyToLiveVisibleState(rec) + EnqueueToReplicaSenders(rec) + + if Mode == best_effort: + return success after local durable WAL + + if Mode == sync_all: + wait until every required replica reports durable progress >= lsn + else timeout/fail + + if Mode == sync_quorum: + wait until true durable quorum reports progress >= lsn + else timeout/fail + + AdvanceCommittedLSN(lsn) only at the correct lineage-safe boundary + return success +``` + +## 7. 
三种 sync mode + +### 7.1 `best_effort` + +语义: + +- 只要求 primary 本地达到 durability point +- replica 可以异步恢复 +- 不应对 client 承诺多副本 durable + +适合: + +- 后台恢复优先 +- 临时 degraded 仍继续服务 + +### 7.2 `sync_all` + +语义: + +- 所有 required replica 都必须在目标 `LSN` durable +- 不能因为“看起来网络还活着”而提前 ACK +- 一旦达不到条件,应阻塞或失败,不能偷偷降级 + +### 7.3 `sync_quorum` + +语义: + +- 必须形成真实 durable quorum +- 只统计满足当前 epoch 和 state 资格的 replica +- 不能只数 healthy socket 或 sender 已发送 + +### 7.4 sync 决策图 + +```mermaid +flowchart TD + start[Write at LSN L] + local[Primary local WAL durable] + mode{Sync Mode} + best[Return success] + allCheck{All required replicas durable >= L} + quorumCheck{Durable quorum >= L} + success[Return success] + block[Block or fail] + + start --> local + local --> mode + mode -->|best_effort| best + mode -->|sync_all| allCheck + mode -->|sync_quorum| quorumCheck + allCheck -->|yes| success + allCheck -->|no| block + quorumCheck -->|yes| success + quorumCheck -->|no| block +``` + +### 7.5 sync mode 的正式原则 + +所有 sync mode 都必须遵守: + +- durable truth 只来自 `ReplicaFlushedLSN` +- 不来自 `ReplicaReceivedLSN` +- 不来自 send queue +- 不来自 transport 连接存活 + +## 8. 本地 WAL / extent / checkpoint 算法 + +V2 必须把本地状态推进拆成三个动作: + +1. `WAL append` +2. `extent materialization` +3. 
`checkpoint advancement` + +### 8.1 本地生命周期 + +```mermaid +stateDiagram-v2 + [*] --> WALDurable + WALDurable --> VisibleApplied + VisibleApplied --> ReplicaDurableEligible + ReplicaDurableEligible --> CheckpointMaterialized + CheckpointMaterialized --> CheckpointPublished + + note right of WALDurable + Local ordered truth exists + and can participate in recovery + end note + + note right of VisibleApplied + New data may be visible + before checkpoint catches up + end note + + note right of CheckpointPublished + Stable base image advances + and old WAL may become recyclable + end note +``` + +### 8.2 关键规则 + +- `extent` 可以比 `checkpoint` 更新 +- 但 crash 后真正可恢复的是: + - `checkpoint` + - 加上仍被保留、可 replay 的 WAL + +所以: + +- `VisibleLSN > CheckpointLSN` 可以合法 +- 但 `VisibleLSN > RecoverableLSN` 绝不合法 + +### 8.3 flusher / checkpoint 职责 + +flusher 不负责决定 ACK。 + +flusher 负责: + +- 将 WAL-backed dirty state 物化到 extent +- 产生新的 checkpoint / snapshot +- 在有了新的稳定基线后,帮助推进 WAL retention / GC 边界 +- 保证被对外承诺的数据仍然可恢复 + +## 9. Replica 正常复制算法 + +### 9.1 steady-state + +每个 replica 有一个稳定 sender。 + +sender 负责: + +- 顺序发 WAL record +- 发 barrier +- 处理 reconnect / handshake +- 执行 catch-up / rebuild 尾部 replay +- 拒绝旧 session 的结果 + +### 9.2 正常复制步骤 + +1. sender 从 queue 取出下一个 record +2. 按顺序发给 replica +3. replica 验证 epoch 和顺序 +4. replica 先 append 到本地 WAL 或等价 durable log +5. replica 更新 `receivedLSN` +6. 若收到 barrier,则等待本地 durable progress 达到目标 +7. replica 更新 `flushedLSN` +8. 返回 `BarrierResp` + +### 9.3 steady-state 图 + +```mermaid +sequenceDiagram + participant P as Primary Sender + participant R as Replica + + P->>R: WALRecord(LSN=n) + R->>R: validate epoch and order + R->>R: append local log + R->>R: receivedLSN = n + + P->>R: BarrierReq(LSN=n) + R->>R: wait until durable >= n + R->>R: flushedLSN = n + R-->>P: BarrierResp(flushedLSN=n) +``` + +## 10. 恢复总算法 + +### 10.1 恢复的正式入口 + +当 replica 不再能作为正常 `InSync` 复制对象时,系统不能直接“猜测”怎么修。 + +必须走明确的恢复入口: + +1. orchestrator 识别该 replica 已失去 sync eligibility +2. 
对该 replica 发出新的 `AssignmentIntent` +3. sender 建立或 supersede 一个新的 `RecoverySession` +4. 通过 handshake 获得该 replica 的正式 durable 点 +5. 对恢复路径做显式分类 + +### 10.2 handshake 输入 + +一次恢复决策至少需要: + +- `ReplicaFlushedLSN` +- `CommittedLSN` +- `RetentionStartLSN` +- 当前 `epoch` +- endpoint/version 视图 + +### 10.3 恢复分类 + +V2 把恢复明确分成三类: + +1. `ZeroGap` + `ReplicaFlushedLSN == CommittedLSN` + +2. `CatchUp` + gap 在 recoverable window 内,或 replica 需要先 truncate divergent tail + +3. `NeedsRebuild` + gap 超过 retention / payload / snapshot 可恢复边界 + +### 10.4 恢复决策图 + +```mermaid +flowchart TD + hs[HandshakeResult] + zero{ReplicaFlushedLSN == CommittedLSN} + ahead{ReplicaFlushedLSN > CommittedLSN} + recoverable{Gap provably recoverable under retention/reservation} + zeroGap[ZeroGap] + truncate[CatchUp with truncation] + catchup[CatchUp] + rebuild[NeedsRebuild] + + hs --> zero + zero -->|yes| zeroGap + zero -->|no| ahead + ahead -->|yes| truncate + ahead -->|no| recoverable + recoverable -->|yes| catchup + recoverable -->|no| rebuild +``` + +### 10.5 为什么用 `CommittedLSN` + +恢复目标必须是 `CommittedLSN`,而不是 `HeadLSN`。 + +原因是: + +- `HeadLSN` 可能包含还未形成正式外部承诺的尾部 +- failover / promotion 的安全边界必须围绕 committed prefix +- 否则会把“primary 看起来更新”误当成“lineage-safe truth” + +## 11. Catch-up 算法 + +### 11.1 进入条件 + +只有当下面条件同时满足时,才允许进入 `CatchUp`: + +1. session authority 有效 +2. 当前 epoch 未失效 +3. endpoint/version 未变化 +4. gap `(ReplicaFlushedLSN, CommittedLSN]` 可恢复 +5. 对应恢复窗口已被 reservation pin 住 + +### 11.2 执行步骤 + +1. session 进入 `Connecting` +2. handshake 后进入 `Handshake` +3. classifier 返回 `CatchUp` +4. session 设置: + - `StartLSN` + - `TargetLSN` + - 如有需要,`TruncateRequired` +5. sender 开始按顺序回放 WAL records +6. replica durably 应用并持续汇报进展 +7. sender 更新 `RecoveredTo` +8. 若达到 `TargetLSN` 且 barrier 条件满足,则 session 完成 +9. 
replica 进入 `InSync` 或进入短暂 `PromotionHold` + +### 11.3 catch-up 状态图 + +```mermaid +stateDiagram-v2 + [*] --> Connecting + Connecting --> Handshake + Handshake --> ZeroGap + Handshake --> CatchingUp + Handshake --> NeedsRebuild + Handshake --> Truncating + Truncating --> CatchingUp + CatchingUp --> PromotionHold + PromotionHold --> InSync + CatchingUp --> NeedsRebuild +``` + +### 11.4 catch-up 失败规则 + +以下任何情况都必须终止当前 catch-up: + +- reservation 失效 +- payload / WAL 保留条件失效 +- epoch bump +- endpoint change +- session 被 supersede +- 长时间无净进展 + +一旦终止,必须: + +- 拒绝旧 session 的后续结果 +- 根据原因进入 `NeedsRebuild` 或等待新的 assignment + +## 12. Rebuild 算法 + +### 12.1 何时进入 rebuild + +下列情况进入 `NeedsRebuild`: + +- `ReplicaFlushedLSN + 1 < RetentionStartLSN` +- 历史 payload 不再可解析 +- 对应 snapshot / base image 不存在或无法 pin 住 +- catch-up 期间 recoverability 条件丢失 + +### 12.2 rebuild 步骤 + +1. orchestrator 为该 replica 发出 rebuild assignment +2. sender 建立新的 rebuild session +3. primary 选择一个真实 `snapshotCpLSN` +4. pin 住: + - snapshot/base image + - `snapshotCpLSN` 之后的 tail replay window +5. replica 安装 base image +6. sender 从 `snapshotCpLSN + 1` 开始 replay 到目标 `TargetLSN` +7. barrier 确认 durable reach +8. replica 进入 `PromotionHold` / `InSync` + +### 12.3 rebuild 图 + +```mermaid +sequenceDiagram + participant O as Orchestrator + participant P as Primary + participant S as Sender + participant R as Replica + + O->>S: AssignmentIntent(rebuild) + S->>P: choose snapshotCpLSN + P->>P: pin snapshot and tail replay window + S->>R: install base snapshot + R->>R: load snapshot(cpLSN) + S->>R: replay WAL tail (cpLSN, target] + R->>R: durable apply + R-->>S: barrier/progress reached + S->>O: rebuild completed +``` + +## 13. RecoverySession 与 authority 算法 + +### 13.1 为什么要有 RecoverySession + +块设备前端看起来没有“session”概念,但恢复执行内部必须有一个 bounded object。 + +否则无法明确回答: + +- 谁拥有这次恢复尝试 +- 哪个结果是新的,哪个是晚到的旧结果 +- endpoint 变了之后旧连接还能不能继续生效 +- epoch bump 后旧 catch-up 结果还能不能落地 + +### 13.2 authority 规则 + +一个恢复 API 调用只有同时满足下面条件才有效: + +1. sender 当前仍存在 +2. 
sender 未 stopped +3. sender 当前 session 不为空 +4. `sessionID` 与当前 active session 一致 +5. session 仍处于 active phase +6. sender 的 epoch 与 volume epoch 一致 +7. endpoint/version 未变化 + +### 13.3 authority 图 + +```mermaid +flowchart TD + op[Recovery Operation] + stopped{Sender stopped?} + hasSession{Active session exists?} + idMatch{Session ID matches?} + phaseOk{Phase valid?} + epochOk{Epoch still current?} + endpointOk{Endpoint still current?} + allow[Apply mutation] + reject[Reject as stale/invalid] + + op --> stopped + stopped -->|yes| reject + stopped -->|no| hasSession + hasSession -->|no| reject + hasSession -->|yes| idMatch + idMatch -->|no| reject + idMatch -->|yes| phaseOk + phaseOk -->|no| reject + phaseOk -->|yes| epochOk + epochOk -->|no| reject + epochOk -->|yes| endpointOk + endpointOk -->|no| reject + endpointOk -->|yes| allow +``` + +## 14. Failover / promotion 算法 + +### 14.1 触发条件 + +当 primary lease 丢失、节点 crash 或被明确 demote 时,需要 volume 级 failover。 + +这不是单个 replica 的本地状态迁移,而是: + +- 整个 volume lineage 重新定根 + +### 14.2 failover 步骤 + +1. 旧 primary 丧失 authority +2. volume `Epoch++` +3. 选择 promotion candidate +4. candidate 必须满足: + - running + - epoch 可对齐 + - state 允许提升 + - `FlushedLSN >= CommittedLSN` +5. 新 primary 开始 serving +6. 旧 primary 相关的 recovery sessions 全部失效 +7. 其余 replicas 相对新 primary 重新做 handshake / classify + +### 14.3 failover 图 + +```mermaid +flowchart TD + old[Old Primary loses lease] + bump[Epoch++] + choose[Choose promotion candidate] + eligible{Candidate has committed prefix?} + promote[Promote to new primary] + invalidate[Invalidate old sessions/messages] + reclassify[Reclassify all replicas] + fail[No safe candidate] + + old --> bump + bump --> choose + choose --> eligible + eligible -->|yes| promote + eligible -->|no| fail + promote --> invalidate + invalidate --> reclassify +``` + +### 14.4 promotion 的原则 + +默认规则应当是保守的: + +- 宁可没有 candidate,也不要提升一个不具备 committed prefix 的节点 + +否则最危险的错误就是: + +- 用户以为已 durable / 已 ACK 的数据,在 failover 后找不到 + +## 15. 
Crash recovery 语义 + +### 15.1 primary 本地 crash + +primary restart 后必须能够根据: + +- 最近 checkpoint +- retained WAL + +重建出新的运行状态。 + +### 15.2 重要边界 + +必须允许: + +- `visible state > checkpoint` + +但必须保证: + +- 所有已 visible 的状态都有合法恢复来源,或者 crash 后不会再被当作正式状态 + +### 15.3 crash 语义图 + +```mermaid +flowchart TD + run[Running state] + cp[CheckpointLSN = C] + wal["Retained WAL covers (C, R]"] + crash[Crash] + restart[Restart] + replay[Replay retained WAL] + recover[Recoverable state up to R] + illegal["Illegal: visible/acked beyond recoverable"] + + run --> cp + run --> wal + cp --> crash + wal --> crash + crash --> restart + restart --> replay + replay --> recover + run --> illegal +``` + +## 16. Simulator 应重点验证的算法义务 + +V2 如果要进入更真实实现,simulator 至少要系统证明以下几类事情: + +### 16.1 ACK 可恢复性 + +- `flush/sync ACK` 返回成功后 +- crash / restart / failover 后仍可恢复到该边界 + +### 16.2 可见性合法性 + +- 运行期看到的新数据 +- 必须来自 WAL durable 或 checkpoint lineage +- 不能出现 visible-but-unrecoverable state + +### 16.3 Catch-up 收敛性 + +- replica 不能无限期 `CatchingUp` +- 要么收敛,要么显式 `NeedsRebuild` + +### 16.4 历史正确性 + +- 对目标 `LSN` 的恢复结果必须匹配 reference state +- 不能拿 current extent 伪造旧历史 + +### 16.5 stale authority fencing + +- epoch 变化 +- endpoint 变化 +- session supersede +- late barrier / late catch-up result + +都不能修改当前 truth + +## 17. 方向微调:第一性思考与 Mayastor 启发 + +这一节不是推翻 `V2`,而是回答一个更关键的问题: + +- 在确认 `V2` 大方向正确之后,是否还需要收紧目标、减少复杂恢复逻辑? + +当前判断是: + +- **需要微调,但不需要换方向** + +### 17.1 思维过程:先看 block 的第一性问题 + +判断 `V2` 是否该微调,不能先从“现有代码已经写了什么”出发,而应先问 block 产品最不可回避的本质是什么。 + +从第一性原理看,block 的核心不是: + +- volume 编排 +- 控制面外形 +- 接口包装 + +而是下面四件事: + +1. `write` 在什么时候算成立 +2. `flush/fsync ACK` 到底承诺了什么 +3. failover 后用户已收到 ACK 的边界是否仍然成立 +4. 
replica 永远不完全同步时,系统如何定义真实可承诺边界 + +这四件事如果没有被做硬,那么无论产品外形多完整,都还不能算真正可信的 block 产品。 + +因此,`V2` 最值得坚持的主轴仍然是: + +- `CommittedLSN` +- durable progress +- `RecoverySession` +- stale fencing +- `CatchUp / NeedsRebuild / Rebuild` + +### 17.2 为什么还要微调 + +虽然主轴正确,但 `V2` 仍然存在一种风险: + +- 为了尽量避免 `rebuild`,把 `catch-up` 做得越来越聪明 + +这会带来新的债: + +- recovery session 生命周期过长 +- target 跟着 live head 漂移 +- 一个 lagging replica 长期消耗 primary 的 WAL retention +- recover 与 live WAL 并存时形成双流复杂度 +- 系统长期停留在 `CatchingUp`,却没有真正恢复 + +也就是说,`V2` 的风险不在于方向错,而在于: + +- **可能在正确方向上走得过深,重新长出不必要的复杂 transmission** + +### 17.3 Mayastor 的第一性启发 + +`Mayastor` 给 `sw-block` 的最大启发,不是某个具体的 WAL 算法,而是另一种产品化思维: + +- block 产品不必把所有恢复复杂度都压在增量追赶上 +- `rebuild` 不是羞耻路径,而是正式主路径 +- volume / replica / target / control plane 应该是明确对象 +- 系统要接受“某些副本不值得继续低成本追赶”的现实 + +从这个角度看,`Mayastor` 更接近: + +- block 产品的工程外形 +- volume 服务的组织方式 +- 明确的 replica lifecycle + +但 `Mayastor` 并没有替代 `V2` 的核心语义问题: + +- `flush ACK` 到底何时成立 +- failover 后 committed truth 如何保住 +- stale authority 如何 fencing + +所以正确的吸收方式不是“改走 Mayastor 路线”,而是: + +- **保留 `V2` 的语义内核** +- **吸收 `Mayastor` 对正式 rebuild 路径和产品组织的启发** + +### 17.4 微调结论:用正式 rebuild 替换过度复杂的 catch-up + +因此,当前最合理的方向微调是: + +- 不把 `CatchUp` 当作“尽量避免 rebuild 的万能恢复手段” +- 而把它收紧为: + - 短 gap + - 有界 target + - 有时间预算 + - 有进展预算 + - 有 recoverability/reservation 预算 + +一旦超出这些边界,就应该: + +- 明确终止当前 `CatchUp` +- 进入 `NeedsRebuild` +- 再走正式 `Rebuild` + +这不是保守,而是更接近成熟 block 产品的现实: + +- `CatchUp` 是便宜路径 +- `Rebuild` 是正式路径 +- 不能为了少做 rebuild,而把系统拖进长期复杂恢复状态 + +### 17.4A catch-up 与 rebuild 的职责划分 + +这里需要进一步把 `CatchUp` 与 `Rebuild` 的职责说清楚,否则实现很容易再次滑回“尽量避免 rebuild,所以不断扩大 catch-up 能力”的旧习惯。 + +`CatchUp` 不应被理解为一个与 `Rebuild` 对等、且可以无限扩展的恢复体系。更准确地说: + +- `CatchUp` 是 `KeepUp` 的放松态 +- 它只负责短 gap、短期、有界、可证明可恢复的 WAL replay +- 它依赖 replica 当前 base 仍然可信 +- 它依赖 primary 仍保留 `(ReplicaFlushedLSN, TargetLSN]` 所需历史 +- 它的价值在于成本明显低于 `Rebuild` + +一旦这些前提不再成立,系统不应继续把复杂度堆入 `CatchUp`,而应显式进入 `NeedsRebuild`,再走正式 `Rebuild`。 + +`Rebuild` 则应被视为更 general 的恢复框架。它不假设 target replica 
当前状态仍可直接追赶,而是通过一个可信 `base` 把 replica 带回某个明确目标点: + +1. 冻结 `TargetLSN` +2. 选择并 pin 一个可信 `base` +3. 将 replica 恢复到该 `base` +4. 如有需要,补齐 `(BaseLSN, TargetLSN]` 的 tail +5. 通过 durable barrier 确认 replica 已达到 `TargetLSN` +6. 再接回 `KeepUp / InSync` + +因此,`full rebuild` 与 `partial rebuild` 不应被理解为两套不同协议,而应被理解为同一 `Rebuild` 合同下对 `base` 和传输量的不同选择: + +- `full rebuild` + - 下载完整 pinned snapshot / base image + - 必要时再补 tail +- `partial rebuild` + - replica 已有较老但可信的 base + - 通过 `bitmap` / `diff` / `snapshot + tail` 只补足达到 target 所需的数据 + +两者共同的正确性前提都是: + +- 恢复目标必须是冻结的 `TargetLSN` +- 恢复依赖的 snapshot / base 必须被 pin 住 +- 不允许直接用持续变化的 live extent 作为历史目标点数据来源 + +这一定义意味着: + +- `CatchUp` 应继续收紧为短 gap、低成本、强约束路径 +- `Rebuild` 应被当作正式主恢复路径,而不是失败后的羞耻 fallback +- 后续优化(例如 `bitmap` / range rebuild)应优先被建模为 `Rebuild` 的优化分支,而不是继续把复杂度堆入 `CatchUp` + +### 17.5 建议收紧的具体点 + +#### 1. 收紧 `CatchUp` + +`CatchUp` 应只覆盖: + +- 短 outage +- 短 gap +- recoverability 清楚 +- 成本明显低于 rebuild + +不应覆盖: + +- 长时间追 moving head +- 长时间阻塞 WAL GC +- 长时间无净进展 + +#### 2. 恢复 contract 只追 bounded target + +一个 recovery session 只对 `(R, H0]` 负责: + +- `R = ReplicaFlushedLSN` +- `H0 = 本次 primary 分配的目标边界` + +`> H0` 的 live WAL 不应让当前 session 的完成条件漂移。 + +#### 3. `recover -> keepup` 必须有明确 handoff + +session 完成后: + +- 释放 reservation 和历史恢复债 +- 经过 `PromotionHold` 或等价稳定条件 +- 再回 `KeepUp / InSync` + +而不是让 recovery session 无限延长为长期 keepup。 + +#### 4. `Rebuild` 升格为一级路径 + +`Rebuild` 不应只被视为: + +- catch-up 失败后的被动补丁 + +而应被视为: + +- 长 gap +- 高成本恢复 +- recoverability 不稳定 +- 持续 tail-chasing + +时的正式恢复选择。 + +### 17.6 微调后的核心判断 + +微调后的 `V2` 不应再被理解成: + +- “把 WAL 恢复做得越来越聪明” + +而应理解成: + +- **把 block 的真实同步边界做硬** +- **把 `CatchUp` 收紧成短 gap、低成本、有限时间的 contract** +- **把 `Rebuild` 升格成正式主路径** +- **把 Smart WAL 等更高复杂度扩展延后到基础复制契约稳定之后** + +一句话总结就是: + +- **`V2` 不换方向,但要从“雄心更大”微调为“边界更硬、目标更窄、恢复更有预算”。** + +## 18. 
推荐的实现切片 + +为了让实现顺序和算法风险一致,推荐切片如下: + +### Slice 1: Sender / RecoverySession authority + +先解决: + +- 每 replica 一个 sender +- 一次只允许一个 active recovery session +- stale session result rejection + +### Slice 2: Outcome classification + assignment orchestration + +再解决: + +- `ZeroGap / CatchUp / NeedsRebuild` +- `AssignmentIntent` +- sender group reconcile + +### Slice 3: Historical recoverability model + +再把: + +- `CommittedLSN` +- WAL retention +- checkpoint/snapshot base +- recoverability proof + +做成可执行模型 + +### Slice 4: Crash-consistency simulator + +最后重点加强: + +- `visible state` +- `recoverable state` +- `acked state` +- flusher / checkpoint / replay 之间的边界 + +## 19. 总结 + +V2 的真正算法核心,不是“有一个 WAL”这么简单。 + +它真正要建立的是一整套明确边界: + +- 用 `WAL` 表示顺序与近期历史 +- 用 `CommittedLSN` 表示外部承诺边界 +- 用 `RecoverySession` 表示恢复 authority +- 用 `catch-up` / `rebuild` 表示恢复分类 +- 用 `checkpoint + replay` 表示 crash 后正式可恢复状态 + +因此 V2 可以允许: + +- `WAL-first visibility` + +但绝不能允许: + +- `ACK-first illusion` +- `visible-but-unrecoverable state` +- `stale authority mutates current lineage` + +如果这几个边界都被 simulator、prototype 和真实 runner 分层证明,那么 `V2` 才有资格从“架构方向”进入“真实引擎实现”。 diff --git a/sw-block/design/v2-engine-readiness-review.md b/sw-block/design/v2-engine-readiness-review.md new file mode 100644 index 000000000..b99afdc27 --- /dev/null +++ b/sw-block/design/v2-engine-readiness-review.md @@ -0,0 +1,170 @@ +# V2 Engine Readiness Review + +Date: 2026-03-29 +Status: active +Purpose: record the decision on whether the current V2 design + prototype + simulator stack is strong enough to begin real V2 engine slicing + +## Decision + +Current judgment: + +- proceed to real V2 engine planning +- do not open a `V2.5` redesign track at this time + +This is a planning-readiness decision, not a production-readiness claim. + +## Why This Review Exists + +The project has now completed: + +1. design/FSM closure for the V2 line +2. 
protocol simulation closure for: + - V1 / V1.5 / V2 comparison + - timeout/race behavior + - ownership/session semantics +3. standalone prototype closure for: + - sender/session ownership + - execution authority + - recovery branching + - minimal historical-data proof + - prototype scenario closure +4. `Phase 4.5` hardening for: + - bounded `CatchUp` + - first-class `Rebuild` + - crash-consistency / restart-recoverability + - `A5-A8` stronger evidence + +So the question is no longer: + +- "can the prototype be made richer?" + +The question is: + +- "is the evidence now strong enough to begin real engine slicing?" + +## Evidence Summary + +### 1. Design / Protocol + +Primary docs: + +- `sw-block/design/v2-acceptance-criteria.md` +- `sw-block/design/v2-open-questions.md` +- `sw-block/design/v2_scenarios.md` +- `sw-block/design/v1-v15-v2-comparison.md` +- `sw-block/design/v2-prototype-roadmap-and-gates.md` + +Judgment: + +- protocol story is coherent +- acceptance set exists +- major V1 / V1.5 failures are mapped into V2 scenarios + +### 2. Simulator + +Primary code/tests: + +- `sw-block/prototype/distsim/` +- `sw-block/prototype/distsim/eventsim.go` +- `learn/projects/sw-block/test/results/v2-simulation-review.md` + +Judgment: + +- strong enough for protocol/design validation +- strong enough to challenge crash-consistency and liveness assumptions +- not a substitute for real engine / hardware proof + +### 3. Prototype + +Primary code/tests: + +- `sw-block/prototype/enginev2/` +- `sw-block/prototype/enginev2/acceptance_test.go` + +Judgment: + +- ownership is explicit and fenced +- execution authority is explicit and fenced +- bounded `CatchUp` is semantic, not documentary +- `Rebuild` is a first-class sender-owned path +- historical-data and recoverability reasoning are executable + +### 4. 
`A5-A8` Double Evidence + +Prototype-side grouped evidence: + +- `sw-block/prototype/enginev2/acceptance_test.go` + +Simulator-side grouped evidence: + +- `sw-block/design/a5-a8-traceability.md` +- `sw-block/prototype/distsim/` + +Judgment: + +- the critical acceptance items that most affect engine risk now have materially stronger proof on both sides + +## What Is Good Enough Now + +The following are good enough to begin engine slicing: + +1. sender/session ownership model +2. stale authority fencing +3. recovery orchestration shape +4. bounded `CatchUp` contract +5. `Rebuild` as formal path +6. committed/recoverable boundary thinking +7. crash-consistency / restart-recoverability proof style + +## What Is Still Not Proven + +The following still require real engine work and later real-system validation: + +1. actual engine lifecycle integration +2. real storage/backend implementation +3. real control-plane integration +4. real durability / fsync behavior under the actual engine +5. real hardware timing / performance +6. final production observability and failure handling + +These are expected gaps. They do not block engine planning. + +## Open Risks To Carry Forward + +These are not blockers, but they should remain explicit: + +1. prototype and simulator are still reduced models +2. rebuild-source quality in the real engine will depend on actual checkpoint/base-image mechanics +3. durability truth in the real engine must still be re-proven against actual persistence behavior +4. predicate exploration can still grow, but should not block engine slicing + +## Engine-Planning Decision + +Decision: + +- start real V2 engine planning + +Reason: + +1. no current evidence points to a structural flaw requiring `V2.5` +2. the remaining gaps are implementation/system gaps, not prototype ambiguity +3. continuing to extend prototype/simulator breadth would have diminishing returns + +## Required Outputs After This Review + +1. `sw-block/design/v2-engine-slicing-plan.md` +2. 
first real engine slice definition +3. explicit non-goals for first engine stage +4. explicit validation plan for engine slices + +## Non-Goals Of This Review + +This review does not claim: + +1. V2 is production-ready +2. V2 should replace V1 immediately +3. all design questions are forever closed + +It only claims: + +- the project now has enough evidence to begin disciplined real engine slicing diff --git a/sw-block/design/v2-engine-slicing-plan.md b/sw-block/design/v2-engine-slicing-plan.md new file mode 100644 index 000000000..aeb919725 --- /dev/null +++ b/sw-block/design/v2-engine-slicing-plan.md @@ -0,0 +1,191 @@ +# V2 Engine Slicing Plan + +Date: 2026-03-29 +Status: active +Purpose: define the first real V2 engine slices after prototype and `Phase 4.5` closure + +## Goal + +Move from: + +- standalone design/prototype truth under `sw-block/prototype/` + +to: + +- a real V2 engine core under `sw-block/` + +without dragging V1.5 lifecycle assumptions into the implementation. + +## Planning Rules + +1. reuse V1 ideas and tests selectively, not structurally +2. prefer narrow vertical slices over broad skeletons +3. each slice must preserve the accepted V2 ownership/fencing model +4. keep simulator/prototype as validation support, not as the implementation itself +5. do not mix V2 engine work into `weed/storage/blockvol/` + +## First Engine Stage + +The first engine stage should build the control/recovery core, not the full storage engine. + +That means: + +1. per-replica sender identity +2. one active recovery session per replica per epoch +3. sender-owned execution authority +4. explicit recovery outcomes: + - zero gap + - bounded catch-up + - rebuild +5. 
rebuild execution shell only + - do not hard-code final snapshot + tail vs full base decision logic yet + - keep real rebuild-source choice tied to Slice 3 recoverability inputs + +## Recommended Slice Order + +### Slice 1: Engine Ownership Core + +Purpose: + +- carry the accepted `enginev2` ownership/fencing model into the real engine core + +Scope: + +1. stable per-replica sender object +2. stable recovery-session object +3. session identity fencing +4. endpoint / epoch invalidation +5. sender-group or equivalent ownership registry + +Acceptance: + +1. stale session results cannot mutate current authority +2. changed-address and epoch-bump invalidation work in engine code +3. the 4 V2-boundary ownership themes remain provable + +### Slice 2: Engine Recovery Execution Core + +Purpose: + +- move the prototype execution APIs into real engine behavior + +Scope: + +1. connect / handshake / catch-up flow +2. bounded `CatchUp` +3. explicit `NeedsRebuild` +4. sender-owned rebuild execution path +5. rebuild execution shell without final trusted-base selection policy + +Acceptance: + +1. bounded catch-up does not chase indefinitely +2. rebuild is exclusive from catch-up +3. session completion rules are explicit and fenced + +### Slice 3: Engine Data / Recoverability Core + +Purpose: + +- connect recovery behavior to real retained-history / checkpoint mechanics + +Scope: + +1. real recoverability decision inputs +2. trusted-base decision for rebuild source +3. minimal real checkpoint/base-image integration +4. real truncation / safe-boundary handling + +This is the first slice that should decide, from real engine inputs, between: + +1. `snapshot + tail` +2. `full base` + +Acceptance: + +1. engine can explain why recovery is allowed +2. rebuild-source choice is explicit and testable +3. 
historical correctness and truncation rules remain intact + +### Slice 4: Engine Integration Closure + +Purpose: + +- bind engine control/recovery core to real orchestration and validation surfaces + +Scope: + +1. real assignment/control intent entry path +2. engine-facing observability +3. focused real-engine tests for V2-boundary cases +4. first integration review against real failure classes + +Acceptance: + +1. key V2-boundary failures are reproduced and closed in engine tests +2. engine observability is good enough to debug ownership/recovery failures +3. remaining gaps are system/performance gaps, not control-model ambiguity + +## What To Reuse + +Good reuse candidates: + +1. tests and failure cases from V1 / V1.5 +2. narrow utility/data helpers where not coupled to V1 lifecycle +3. selected WAL/history concepts if they fit V2 ownership boundaries + +Do not structurally reuse: + +1. V1/V1.5 shipper lifecycle +2. address-based identity assumptions +3. `SetReplicaAddrs`-style behavior +4. old recovery control structure + +## Where The Work Should Live + +Real V2 engine work should continue under: + +- `sw-block/` + +Recommended next area: + +- `sw-block/core/` +or +- `sw-block/engine/` + +Exact path can be chosen later, but it should remain separate from: + +- `sw-block/prototype/` +- `weed/storage/blockvol/` + +## Validation Plan For Engine Slices + +Each engine slice should be validated at three levels: + +1. prototype alignment +- does engine behavior preserve the accepted prototype invariant? + +2. focused engine tests +- does the real engine slice enforce the same contract? + +3. scenario mapping +- does at least one important V1/V1.5 failure class remain closed? + +## Non-Goals For First Engine Stage + +Do not try to do these immediately: + +1. full Smart WAL expansion +2. performance optimization +3. V1 replacement/migration plan +4. full product integration +5. 
all storage/backend redesign at once + +## Immediate Next Assignment + +The first concrete engine-planning task should be: + +1. choose the real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write a short engine ownership-core spec +4. map 3-5 acceptance scenarios directly onto Slice 1 expectations diff --git a/sw-block/design/v2-production-roadmap.md b/sw-block/design/v2-production-roadmap.md new file mode 100644 index 000000000..65c88fca5 --- /dev/null +++ b/sw-block/design/v2-production-roadmap.md @@ -0,0 +1,199 @@ +# V2 Production Roadmap + +Date: 2026-03-30 +Status: active +Purpose: define the path from the accepted V2 engine core to a production candidate + +## Current Position + +Completed: + +1. design / FSM closure +2. simulator / protocol validation +3. prototype closure +4. evidence hardening +5. engine core slices: + - Slice 1 ownership core + - Slice 2 recovery execution core + - Slice 3 data / recoverability core + - Slice 4 integration closure + +Current stage: + +- entering broader engine implementation + +This means the main risk is no longer: + +- whether the V2 idea stands up + +The main risk is: + +- whether the accepted engine core can be turned into a real system without reintroducing V1/V1.5 structure and semantics + +## Roadmap Summary + +1. Phase 06: broader engine implementation stage +2. Phase 07: real-system integration / product-path decision +3. Phase 08: pre-production hardening +4. Phase 09: performance / scale / soak validation +5. Phase 10: production candidate and rollout gate + +## Phase 06 + +### Goal + +Connect the accepted engine core to: + +1. real control truth +2. real storage truth +3. explicit engine execution steps + +### Outputs + +1. control-plane adapter into the engine core +2. storage/base/recoverability adapters +3. explicit execution-driver model where synchronous helpers are no longer sufficient +4. 
validation against selected real failure classes + +### Gate + +At the end of Phase 06, the project should be able to say: + +- the engine core can live inside a real system shape + +## Phase 07 + +### Goal + +Move from engine-local correctness to a real runnable subsystem. + +### Outputs + +1. service-style runnable engine slice +2. integration with real control and storage surfaces +3. crash/failover/restart integration tests +4. decision on the first viable product path + +### Gate + +At the end of Phase 07, the project should be able to say: + +- the engine can run as a real subsystem, not only as an isolated core + +## Phase 08 + +### Goal + +Turn correctness into operational safety. + +### Outputs + +1. observability hardening +2. operator/debug flows +3. recovery/runbook procedures +4. config surface cleanup +5. realistic durability/restart validation + +### Gate + +At the end of Phase 08, the project should be able to say: + +- operators can run, debug, and recover the system safely + +## Phase 09 + +### Goal + +Prove viability under load and over time. + +### Outputs + +1. throughput / latency baselines +2. rebuild / catch-up cost characterization +3. steady-state overhead measurement +4. soak testing +5. scale and failure-under-load validation + +### Gate + +At the end of Phase 09, the project should be able to say: + +- the design is not only correct, but viable at useful scale and duration + +## Phase 10 + +### Goal + +Produce a controlled production candidate. + +### Outputs + +1. feature-gated production candidate +2. rollback strategy +3. migration/coexistence plan with V1 +4. staged rollout plan +5. production acceptance checklist + +### Gate + +At the end of Phase 10, the project should be able to say: + +- the system is ready for a controlled production rollout + +## Cross-Phase Rules + +### Rule 1: Do not reopen protocol shape casually + +The accepted core should remain stable unless new implementation evidence forces a change. 
+ +### Rule 2: Use V1 as validation source, not design template + +Use: + +1. `learn/projects/sw-block/` +2. `weed/storage/block*` + +for: + +1. failure gates +2. constraints +3. integration references + +Do not use them as the default V2 architecture template. + +### Rule 3: Keep `CatchUp` narrow + +Do not let later implementation phases re-expand `CatchUp` into a broad, optimistic, long-lived recovery mode. + +### Rule 4: Keep evidence quality ahead of object growth + +New work should preferentially improve: + +1. traceability +2. diagnosability +3. real-failure validation +4. operational confidence + +not simply add new objects, states, or mechanisms. + +## Production Readiness Ladder + +The project should move through this ladder explicitly: + +1. proof-of-design +2. proof-of-engine-shape +3. proof-of-runnable-engine-stage +4. proof-of-operable-system +5. proof-of-viable-production-candidate + +Current ladder position: + +- between `2` and `3` +- engine core accepted; broader runnable engine stage underway + +## Next Documents To Maintain + +1. `sw-block/.private/phase/phase-06.md` +2. `sw-block/design/v2-engine-readiness-review.md` +3. `sw-block/design/v2-engine-slicing-plan.md` +4. this roadmap diff --git a/sw-block/design/v2-protocol-truths.md b/sw-block/design/v2-protocol-truths.md new file mode 100644 index 000000000..6f4eab667 --- /dev/null +++ b/sw-block/design/v2-protocol-truths.md @@ -0,0 +1,561 @@ +# V2 Protocol Truths + +Date: 2026-03-30 +Status: active +Purpose: record the compact, stable truths that later phases must preserve, and provide a conformance reference for implementation reviews + +## Why This Document Exists + +`FSM`, `simulator`, `prototype`, and `engine` are not a code-production pipeline. +They are an evidence ladder. + +So the most important output to carry forward is not only code, but: + +1. accepted semantics +2. must-hold boundaries +3. failure classes that must stay closed +4. 
explicit places where later phases may improve or drift + +This document is the compact truth table for the V2 line. + +## How To Use It + +For each later phase or slice, ask: + +1. does the new implementation remain aligned with these truths? +2. if not, is the deviation constructive or risky? +3. which truth is newly strengthened by this phase? + +Deviation labels: + +- `Aligned`: implementation preserves the truth +- `Constructive deviation`: implementation changes shape but strengthens the truth +- `Risky deviation`: implementation weakens or blurs the truth + +## Core Truths + +### T1. `CommittedLSN` is the external truth boundary + +Short form: + +- external promises are anchored at `CommittedLSN`, not `HeadLSN` + +Meaning: + +- recovery targets +- promotion safety +- flush/visibility reasoning + +must all be phrased against `CommittedLSN`. + +Prevents: + +- using optimistic WAL head as committed truth +- acknowledging lineage that failover cannot preserve + +Evidence anchor: + +- strong in design +- strong in simulator +- strong in prototype +- strong in engine + +### T2. `ZeroGap <=> ReplicaFlushedLSN == CommittedLSN` + +Short form: + +- zero-gap requires exact equality with committed truth + +Meaning: + +- replica ahead is not zero-gap +- replica behind is not zero-gap + +Prevents: + +- unsafe fast-path completion +- replica-ahead being mistaken for in-sync + +Evidence anchor: + +- strong in prototype +- strong in engine + +### T3. `CatchUp` is bounded replay on a still-trusted base + +Short form: + +- `CatchUp = KeepUp with bounded debt` + +Meaning: + +- catch-up is a short-gap, low-cost, bounded replay path +- it only makes sense while the replica base is still trustworthy enough to continue from + +Prevents: + +- turning catch-up into indefinite moving-head chase +- hiding broad recovery complexity in replay logic + +Evidence anchor: + +- strong in design +- strong in simulator +- strong in prototype +- strong in engine + +### T4. 
`NeedsRebuild` is explicit when replay is not the right answer + +Short form: + +- `NeedsRebuild <=> replay is unrecoverable, unstable, or no longer worth bounded replay` + +Meaning: + +- long-gap +- lost recoverability +- no trusted base +- budget violation + +must escalate explicitly. + +Prevents: + +- pretending catch-up will eventually succeed +- carrying V1/V1.5-style unbounded degraded chase forward + +Evidence anchor: + +- strong in simulator +- strong in prototype +- strong in engine + +### T5. `Rebuild` is the formal primary recovery path + +Short form: + +- `Rebuild = frozen TargetLSN + trusted base + optional tail + barrier` + +Meaning: + +- rebuild is not a shameful fallback +- it is the general recovery framework + +Prevents: + +- overloading catch-up with broad recovery semantics +- treating full/partial rebuild as unrelated protocols + +Evidence anchor: + +- strong in design +- strong in prototype +- strong in engine + +### T6. Full and partial rebuild share one correctness contract + +Short form: + +- `full rebuild` and `partial rebuild` differ in transfer choice, not in truth model + +Meaning: + +- both require frozen `TargetLSN` +- both require trusted pinned base +- both require explicit durable completion + +Prevents: + +- optimization layers redefining protocol truth +- bitmap/range paths bypassing trusted-base rules + +Evidence anchor: + +- strong in design +- partial in engine +- stronger real-system proof still deferred + +### T7. No recovery result may outlive its authority + +Short form: + +- `ValidMutation <=> sender exists && sessionID matches && epoch current && endpoint current` + +Meaning: + +- stale session +- stale epoch +- stale endpoint +- stale sender + +must all fail closed. + +Prevents: + +- late results mutating current lineage +- changed-address stale completion bugs + +Evidence anchor: + +- strong in simulator +- strong in prototype +- strong in engine + +### T8. 
`ReplicaID` is stable identity; `Endpoint` is mutable location + +Short form: + +- `ReplicaID != address` + +Meaning: + +- address changes may invalidate sessions +- address changes must not destroy sender identity + +Prevents: + +- reintroducing address-shaped identity +- changed-address restarting as logical removal + add + +Evidence anchor: + +- strong in prototype +- strong in engine +- strong in bridge P0 + +### T9. Truncation is a protocol boundary, not cleanup + +Short form: + +- replica-ahead cannot complete until divergent tail is explicitly truncated + +Meaning: + +- truncation is part of recovery contract +- not a side-effect or best-effort cleanup + +Prevents: + +- completing recovery while replica still contains newer divergent writes + +Evidence anchor: + +- strong in design +- strong in engine + +### T10. Recoverability must be proven from real retained history + +Short form: + +- `CatchUp allowed <=> required replay range is recoverable from retained history` + +Meaning: + +- the engine should consume storage truth +- not test-reconstructed optimism + +Prevents: + +- replay on missing WAL +- fake recoverability based only on watermarks + +Evidence anchor: + +- strong in simulator +- strong in engine +- strengthened in driver/adapter phases + +### T11. Trusted-base choice must be explicit and causal + +Short form: + +- `snapshot_tail` requires both trusted checkpoint and replayable tail + +Meaning: + +- snapshot existence alone is insufficient +- fallback to full-base must be explainable + +Prevents: + +- over-trusting old checkpoints +- silently choosing an invalid rebuild source + +Evidence anchor: + +- strong in simulator +- strong in engine +- strengthened by Phase 06 + +### T12. 
Current extent cannot fake old history + +Short form: + +- historical correctness requires reconstructable history, not current-state approximation + +Meaning: + +- live extent state is not sufficient proof of an old target point +- historical reconstruction must be justified by checkpoint + retained history + +Prevents: + +- using current extent as fake proof of older state + +Evidence anchor: + +- strongest in simulator +- engine currently proves prerequisites, not full reconstruction proof + +### T13. Promotion requires recoverable committed prefix + +Short form: + +- promoted replica must be able to recover committed truth, not merely advertise a high watermark + +Meaning: + +- candidate selection is about recoverable lineage, not optimistic flush visibility + +Prevents: + +- promoting a replica that cannot reconstruct committed prefix after crash/restart + +Evidence anchor: + +- strong in simulator +- partially carried into engine semantics +- real-system validation still needed + +### T14. `blockvol` executes I/O; engine owns recovery policy + +Short form: + +- adapters may translate engine decisions into concrete work +- they must not silently re-decide recovery classification or source choice + +Meaning: + +- master remains control authority +- engine remains recovery authority +- storage remains truth source + +Prevents: + +- V1/V1.5 policy leakage back into service glue + +Evidence anchor: + +- strong in Phase 07 service-slice planning +- initial bridge P0 aligns +- real-system proof still pending + +### T15. 
Reuse reality, not inherited semantics + +Short form: + +- V2 may reuse existing Seaweed control/runtime/storage paths +- it must not inherit old semantics as protocol truth + +Meaning: + +- reuse existing heartbeat, assignment, `blockvol`, receiver, shipper, retention, and runtime machinery when useful +- keep `ReplicaID`, epoch authority, recovery classification, committed truth, and rebuild boundaries anchored in accepted V2 semantics + +Prevents: + +- V1/V1.5 structure silently redefining V2 behavior +- convenience reuse turning old runtime assumptions into new protocol truth + +Evidence anchor: + +- strong in Phase 07/08 direction +- should remain active in later implementation phases + +## Current Strongest Evidence By Layer + +| Layer | Main value | +|------|------------| +| `FSM` / design | define truth and non-goals | +| simulator | prove protocol truth and failure-class closure cheaply | +| prototype | prove implementation-shape and authority semantics cheaply | +| engine | prove the accepted contracts survive real implementation structure | +| service slice / runner | prove truth survives real control/storage/system reality | + +## Phase Conformance Notes + +### Phase 04 + +- `Aligned`: T7, T8 +- strengthened sender/session ownership and stale rejection + +### Phase 4.5 + +- `Aligned`: T3, T4, T5, T10, T12 +- major tightening: + - bounded catch-up + - first-class rebuild + - crash-consistency and recoverability proof style + +### Phase 05 + +- `Aligned`: T1, T2, T3, T4, T5, T7, T8, T9, T10, T11 +- engine core slices closed: + - ownership + - execution + - recoverability gating + - orchestrated entry path + +### Phase 06 + +- `Aligned`: T10, T11, T14 +- `Constructive deviation`: planner/executor split replaced convenience wrappers without changing protocol truth +- strengthened: + - real storage/resource contracts + - explicit release symmetry + - failure-class validation against engine path + +### Phase 07 P0 + +- `Aligned`: T8, T10, T14 +- bridge now 
makes stable `ReplicaID` explicit at service boundary +- bridge states the hard rule that engine decides policy and `blockvol` executes I/O +- real `weed/storage/blockvol/` integration still pending + +## Current Carry-Forward Truths For Later Phases + +Later phases must not regress these: + +1. `CommittedLSN` remains the external truth boundary +2. `CatchUp` stays narrow and bounded +3. `Rebuild` remains the formal primary recovery path +4. stale authority must fail closed +5. stable identity must remain separate from mutable endpoint +6. trusted-base choice must remain explicit and causal +7. service glue must not silently re-decide recovery policy +8. reuse reality, but do not inherit old semantics as V2 truth + +## Review Rule + +Every later phase or slice should explicitly answer: + +1. which truths are exercised? +2. which truths are strengthened? +3. does this phase introduce any constructive or risky deviation? +4. which evidence layer now carries the truth most strongly? + +## Phase Alignment Rule + +From `Phase 05` onward, every phase or slice should align explicitly against this document. + +Minimum phase-alignment questions: + +1. which truths are in scope? +2. which truths are strengthened? +3. which truths are merely carried forward? +4. does the phase introduce any constructive deviation? +5. does the phase introduce any risky deviation? +6. which evidence layer currently carries each in-scope truth most strongly? 
+ +Expected output shape for each later phase: + +- `In-scope truths` +- `Strengthened truths` +- `Carry-forward truths` +- `Constructive deviations` +- `Risky deviations` +- `Evidence shift` + +## Phase 05-07 Alignment + +### Phase 05 + +Primary alignment focus: + +- T1 `CommittedLSN` as external truth boundary +- T2 zero-gap exactness +- T3 bounded `CatchUp` +- T4 explicit `NeedsRebuild` +- T5/T6 rebuild correctness contract +- T7 stale authority must fail closed +- T8 stable `ReplicaID` +- T9 truncation as protocol boundary +- T10/T11 recoverability and trusted-base gating + +Main strengthening: + +- engine core adopted accepted protocol truths as real implementation structure + +Main review risk: + +- engine structure accidentally collapsing back to address identity or unfenced execution + +### Phase 06 + +Primary alignment focus: + +- T10 recoverability from real retained history +- T11 trusted-base choice remains explicit and causal +- T14 engine owns policy, adapters carry truth and execution contracts + +Main strengthening: + +- planner/executor/resource contracts +- fail-closed cleanup symmetry +- cross-layer proof path through engine execution + +Main review risk: + +- executor or adapters recomputing policy from convenience inputs +- storage/resource contracts becoming approximate instead of real + +### Phase 07+ + +Primary alignment focus: + +- T8 stable identity at the real service boundary +- T10 real storage truth into engine decisions +- T11 trusted-base proof remains explicit through service glue +- T14 `blockvol` executes I/O but does not own recovery policy + +Main strengthening: + +- real-system service-slice conformance +- real control-plane and storage-plane integration +- diagnosable failure replay through the integrated path + +Main review risk: + +- V1/V1.5 semantics leaking back in through service glue +- address-shaped identity reappearing at the boundary +- blockvol-side code silently re-deciding recovery policy + +## Future Feature Rule 
+ +When a later feature expands the protocol surface (for example `SmartWAL` or a new rebuild optimization), the order should be: + +1. `FSM / design` +- define the new semantics and non-goals + +2. `Truth update` +- either attach the feature to an existing truth +- or add a new protocol truth if the feature creates a new long-lived invariant + +3. `Phase alignment` +- define which later phases strengthen or validate that truth + +4. `Evidence ladder` +- simulator, prototype, engine, service slice as needed + +Do not start feature implementation by editing engine or service glue first and only later trying to explain what truth changed. + +## Feature Review Rule + +For any future feature, later reviews should ask: + +1. did the feature create a new truth or just strengthen an existing one? +2. which phase first validates it? +3. which evidence layer proves it most strongly today? +4. does the feature weaken any existing truth? + +This keeps feature growth aligned with protocol truth instead of letting implementation convenience define semantics. diff --git a/sw-block/prototype/distsim/cluster.go b/sw-block/prototype/distsim/cluster.go index 6e65a5e55..15bc6f3f3 100644 --- a/sw-block/prototype/distsim/cluster.go +++ b/sw-block/prototype/distsim/cluster.go @@ -1066,9 +1066,10 @@ type CandidateEligibility struct { } // EvaluateCandidateEligibility checks all promotion prerequisites for a node. -// A candidate must have the full committed prefix (FlushedLSN >= CommittedLSN) -// to be eligible. Promoting a replica that is missing committed data would -// lose acknowledged writes. +// Phase 4.5: uses RecoverableLSN (not just FlushedLSN) to verify that the +// candidate can actually recover the committed prefix after a crash+restart, +// not just that it received durable WAL entries. RecoverableLSN accounts for +// checkpoint + WAL replay availability. 
func (c *Cluster) EvaluateCandidateEligibility(candidateID string) CandidateEligibility { n := c.Nodes[candidateID] if n == nil { @@ -1084,7 +1085,11 @@ func (c *Cluster) EvaluateCandidateEligibility(candidateID string) CandidateElig if n.ReplicaState == NodeStateNeedsRebuild || n.ReplicaState == NodeStateRebuilding { reasons = append(reasons, "state_ineligible") } - if n.Storage.FlushedLSN < c.Coordinator.CommittedLSN { + // Phase 4.5: check recoverable committed prefix, not just durable watermark. + // RecoverableLSN = the highest LSN that would survive crash + restart. + // This is stronger than FlushedLSN when checkpoint + WAL GC may have + // created gaps in the replay path. + if n.Storage.RecoverableLSN() < c.Coordinator.CommittedLSN { reasons = append(reasons, "insufficient_committed_prefix") } return CandidateEligibility{ diff --git a/sw-block/prototype/distsim/cluster_test.go b/sw-block/prototype/distsim/cluster_test.go index 8fd38f82d..09237ea23 100644 --- a/sw-block/prototype/distsim/cluster_test.go +++ b/sw-block/prototype/distsim/cluster_test.go @@ -166,7 +166,7 @@ func TestZombieOldPrimaryWritesAreFenced(t *testing.T) { if c.Coordinator.CommittedLSN != 1 { t.Fatalf("stale message changed committed lsn: got=%d", c.Coordinator.CommittedLSN) } - if got := c.Nodes["r1"].Storage.Extent[42]; got != 0 { + if got := c.Nodes["r1"].Storage.LiveExtent[42]; got != 0 { t.Fatalf("stale message mutated new primary extent: block42=%d", got) } } diff --git a/sw-block/prototype/distsim/phase02_candidate_test.go b/sw-block/prototype/distsim/phase02_candidate_test.go index c24568043..d9afdc4e3 100644 --- a/sw-block/prototype/distsim/phase02_candidate_test.go +++ b/sw-block/prototype/distsim/phase02_candidate_test.go @@ -353,6 +353,7 @@ func TestP02_CandidateEligibility_InsufficientCommittedPrefix(t *testing.T) { // Manually set r1 behind committed prefix. 
c.Nodes["r1"].Storage.FlushedLSN = 0 + c.Nodes["r1"].Storage.WALDurableLSN = 0 e = c.EvaluateCandidateEligibility("r1") if e.Eligible { t.Fatal("FlushedLSN=0 with CommittedLSN=1 should not be eligible") @@ -379,14 +380,19 @@ func TestP02_CandidateEligibility_InSyncButLagging_Rejected(t *testing.T) { // r1: InSync, correct epoch, but FlushedLSN=1. Ineligible. c.Nodes["r1"].ReplicaState = NodeStateInSync c.Nodes["r1"].Storage.FlushedLSN = 1 + c.Nodes["r1"].Storage.WALDurableLSN = 1 // r2: CatchingUp, correct epoch, FlushedLSN=100. Eligible. c.Nodes["r2"].ReplicaState = NodeStateCatchingUp c.Nodes["r2"].Storage.FlushedLSN = 100 + c.Nodes["r2"].Storage.WALDurableLSN = 100 + c.Nodes["r2"].Storage.CheckpointLSN = 100 // r3: InSync, correct epoch, FlushedLSN=100. Eligible. c.Nodes["r3"].ReplicaState = NodeStateInSync c.Nodes["r3"].Storage.FlushedLSN = 100 + c.Nodes["r3"].Storage.WALDurableLSN = 100 + c.Nodes["r3"].Storage.CheckpointLSN = 100 // r1 is ineligible despite being InSync. e1 := c.EvaluateCandidateEligibility("r1") diff --git a/sw-block/prototype/distsim/phase045_adversarial_test.go b/sw-block/prototype/distsim/phase045_adversarial_test.go new file mode 100644 index 000000000..eb51d23db --- /dev/null +++ b/sw-block/prototype/distsim/phase045_adversarial_test.go @@ -0,0 +1,219 @@ +package distsim + +import ( + "math/rand" + "testing" +) + +// Phase 4.5: Adversarial predicate search. +// These tests run randomized/semi-randomized scenarios and check danger +// predicates after each step. The goal is to find protocol violations +// that handwritten scenarios might miss. + +// TestAdversarial_RandomWritesAndCrashes runs random write + crash + restart +// sequences and checks all danger predicates after each step. +func TestAdversarial_RandomWritesAndCrashes(t *testing.T) { + rng := rand.New(rand.NewSource(42)) + + for trial := 0; trial < 50; trial++ { + c := NewCluster(CommitSyncAll, "p", "r1", "r2") + + // Random sequence of operations. 
+ for step := 0; step < 30; step++ { + op := rng.Intn(10) + switch { + case op < 5: + // Write. + block := uint64(rng.Intn(10) + 1) + c.CommitWrite(block) + case op < 7: + // Tick (advance time, deliver messages). + c.Tick() + case op < 8: + // Crash a random node. + nodes := []string{"p", "r1", "r2"} + target := nodes[rng.Intn(3)] + node := c.Nodes[target] + if node.Running { + node.Storage.Crash() + node.Running = false + node.ReplicaState = NodeStateLagging + } + case op < 9: + // Restart a crashed node. + nodes := []string{"p", "r1", "r2"} + target := nodes[rng.Intn(3)] + node := c.Nodes[target] + if !node.Running { + node.Storage.Restart() + node.Running = true + node.ReplicaState = NodeStateLagging // needs catch-up + } + default: + // Flusher tick on all running nodes. + for _, node := range c.Nodes { + if node.Running { + node.Storage.ApplyToExtent(node.Storage.WALDurableLSN) + node.Storage.AdvanceCheckpoint(node.Storage.WALDurableLSN) + } + } + } + + // Check predicates after every step. + violations := CheckAllPredicates(c) + if len(violations) > 0 { + for name, detail := range violations { + t.Errorf("trial %d step %d: PREDICATE VIOLATED [%s]: %s", trial, step, name, detail) + } + t.FailNow() + } + } + } +} + +// TestAdversarial_FailoverChainWithPredicates runs a sequence of +// failovers (promote, crash, promote) and checks predicates. +func TestAdversarial_FailoverChainWithPredicates(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r1", "r2") + + // Write some data and commit. + for i := 0; i < 5; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.TickN(5) + + check := func(label string) { + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Fatalf("%s: PREDICATE VIOLATED [%s]: %s", label, name, detail) + } + } + + check("after initial writes") + + // Kill primary. + c.Nodes["p"].Running = false + c.Nodes["p"].Storage.Crash() + + // Promote r1. 
+ c.Promote("r1") + c.TickN(3) + check("after first promotion") + + // Write more under new primary. + for i := 0; i < 3; i++ { + c.CommitWrite(uint64(i + 10)) + } + c.TickN(5) + check("after writes on new primary") + + // Kill new primary. + c.Nodes["r1"].Running = false + c.Nodes["r1"].Storage.Crash() + + // Promote r2. + c.Promote("r2") + c.TickN(3) + check("after second promotion") + + // Write more under third primary. + c.CommitWrite(99) + c.TickN(5) + check("after writes on third primary") +} + +// TestAdversarial_CatchUpUnderLoad runs catch-up while the primary keeps +// writing, then checks predicates for livelock. +func TestAdversarial_CatchUpUnderLoad(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r1") + + // Write initial data. + for i := 0; i < 10; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.TickN(5) + + // Disconnect r1. + c.Nodes["r1"].Running = false + + // Write more while r1 is down. + for i := 0; i < 20; i++ { + c.CommitWrite(uint64(i + 100)) + c.Tick() + } + + // Reconnect r1 — needs catch-up. + c.Nodes["r1"].Running = true + c.Nodes["r1"].ReplicaState = NodeStateLagging + + // Attempt catch-up while primary keeps writing. + for step := 0; step < 20; step++ { + // Primary writes more. + c.CommitWrite(uint64(step + 200)) + c.Tick() + + // Attempt catch-up progress. + c.CatchUpWithEscalation("r1", 5) + + // Check predicates. + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Fatalf("step %d: PREDICATE VIOLATED [%s]: %s", step, name, detail) + } + } + + // After the loop, r1 should be either InSync or NeedsRebuild. + state := c.Nodes["r1"].ReplicaState + if state != NodeStateInSync && state != NodeStateNeedsRebuild { + t.Fatalf("r1 should be InSync or NeedsRebuild after catch-up under load, got %s", state) + } +} + +// TestAdversarial_CheckpointGCThenCrash runs checkpoint + WAL GC + crash +// sequences and verifies acked data is never lost. 
+func TestAdversarial_CheckpointGCThenCrash(t *testing.T) { + rng := rand.New(rand.NewSource(99)) + + for trial := 0; trial < 30; trial++ { + c := NewCluster(CommitSyncAll, "p", "r1") + + // Write and commit data. + for i := 0; i < 15; i++ { + c.CommitWrite(uint64(rng.Intn(20) + 1)) + } + c.TickN(10) + + // Flusher + checkpoint at various points. + for _, node := range c.Nodes { + if node.Running { + flushTo := node.Storage.WALDurableLSN + node.Storage.ApplyToExtent(flushTo) + // Checkpoint at a random point up to flush. + cpLSN := uint64(rng.Int63n(int64(flushTo+1))) + node.Storage.AdvanceCheckpoint(cpLSN) + + // GC WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range node.Storage.WAL { + if w.LSN > node.Storage.CheckpointLSN { + retained = append(retained, w) + } + } + node.Storage.WAL = retained + } + } + + // Crash primary. + primary := c.Primary() + if primary != nil { + primary.Storage.Crash() + primary.Storage.Restart() + } + + // Check predicates — committed data must still be recoverable. + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Errorf("trial %d: PREDICATE VIOLATED [%s]: %s", trial, name, detail) + } + } +} diff --git a/sw-block/prototype/distsim/phase045_crash_test.go b/sw-block/prototype/distsim/phase045_crash_test.go new file mode 100644 index 000000000..ec1fb4964 --- /dev/null +++ b/sw-block/prototype/distsim/phase045_crash_test.go @@ -0,0 +1,334 @@ +package distsim + +import ( + "testing" +) + +// Phase 4.5: Crash-consistency and recoverability tests. +// These validate invariants I1-I5 from the crash-consistency simulation plan. + +// --- Invariant I1: ACK'd flush is recoverable after any crash --- + +func TestI1_AckedFlush_RecoverableAfterPrimaryCrash(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + // Write 3 entries and commit (sync_all = durable on both nodes). 
+ for i := 0; i < 3; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.Tick() + c.Tick() + c.Tick() + + if c.Coordinator.CommittedLSN < 3 { + t.Fatalf("expected CommittedLSN>=3, got %d", c.Coordinator.CommittedLSN) + } + + committedLSN := c.Coordinator.CommittedLSN + + // Crash the primary. + primary := c.Nodes["p"] + primary.Storage.Crash() + + // Restart: recover from checkpoint + durable WAL. + recoveredLSN := primary.Storage.Restart() + + // I1: all committed data must be recoverable. + if recoveredLSN < committedLSN { + t.Fatalf("I1 VIOLATED: recoveredLSN=%d < committedLSN=%d — acked data lost", + recoveredLSN, committedLSN) + } + + // Verify data correctness against reference. + refState := c.Reference.StateAt(committedLSN) + recState := primary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := recState[block]; got != expected { + t.Fatalf("I1 VIOLATED: block %d: reference=%d recovered=%d", block, expected, got) + } + } +} + +// --- Invariant I2: No ghost visible state after crash --- + +func TestI2_ExtentAheadOfCheckpoint_CrashRestart(t *testing.T) { + s := NewStorage() + + // Write 5 entries to WAL. + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: 10 + i, Value: i * 100, LSN: i}) + } + + // Make all 5 durable. + s.AdvanceFlush(5) + + // Flusher materializes entries 1-3 to live extent. + s.ApplyToExtent(3) + + // Checkpoint at LSN 1 only. + s.AdvanceCheckpoint(1) + + // Crash. + s.Crash() + if s.LiveExtent != nil { + t.Fatal("after crash, LiveExtent should be nil") + } + + // Restart. + recoveredLSN := s.Restart() + if recoveredLSN != 5 { + t.Fatalf("recoveredLSN should be 5, got %d", recoveredLSN) + } + + // I2: all durable data recovered from checkpoint + WAL replay. 
+ for i := uint64(1); i <= 5; i++ { + block := 10 + i + expected := i * 100 + if got := s.LiveExtent[block]; got != expected { + t.Fatalf("I2: block %d: expected %d, got %d", block, expected, got) + } + } +} + +func TestI2_UnackedData_LostAfterCrash(t *testing.T) { + s := NewStorage() + + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + + // Only fsync 1-3. Entries 4-5 are NOT durable. + s.AdvanceFlush(3) + s.ApplyToExtent(5) // should clamp to 3 + s.AdvanceCheckpoint(3) + + if s.ExtentAppliedLSN != 3 { + t.Fatalf("ApplyToExtent should clamp to WALDurableLSN=3, got %d", s.ExtentAppliedLSN) + } + + s.Crash() + s.Restart() + + // Blocks 4,5 must NOT be in recovered extent. + if val, ok := s.LiveExtent[4]; ok && val != 0 { + t.Fatalf("I2 VIOLATED: block 4=%d survived crash — unfsynced data", val) + } + if val, ok := s.LiveExtent[5]; ok && val != 0 { + t.Fatalf("I2 VIOLATED: block 5=%d survived crash — unfsynced data", val) + } + + // Blocks 1-3 must be there. + for i := uint64(1); i <= 3; i++ { + if got := s.LiveExtent[i]; got != i*10 { + t.Fatalf("block %d: expected %d, got %d", i, i*10, got) + } + } +} + +// --- Invariant I3: CatchUp converges or escalates --- + +func TestI3_CatchUpConvergesOrEscalates(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + // Commit initial entry. + c.CommitWrite(1) + c.Tick() + c.Tick() + + // Disconnect replica and write more. + c.Nodes["r"].Running = false + for i := uint64(2); i <= 10; i++ { + c.CommitWrite(i) + c.Tick() + } + + // Reconnect. + c.Nodes["r"].Running = true + c.Nodes["r"].ReplicaState = NodeStateLagging + + // Catch-up with escalation. + converged := c.CatchUpWithEscalation("r", 3) + + // I3: must resolve — either converged or escalated to NeedsRebuild. 
+ state := c.Nodes["r"].ReplicaState + if !converged && state != NodeStateNeedsRebuild { + t.Fatalf("I3 VIOLATED: catchup did not converge and state=%s (not NeedsRebuild)", state) + } +} + +// --- Invariant I4: Promoted replica has committed prefix --- + +func TestI4_PromotedReplica_HasCommittedPrefix(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + for i := uint64(1); i <= 5; i++ { + c.CommitWrite(i) + } + c.Tick() + c.Tick() + c.Tick() + + committedLSN := c.Coordinator.CommittedLSN + if committedLSN < 5 { + t.Fatalf("expected CommittedLSN>=5, got %d", committedLSN) + } + + // Promote replica. + if err := c.Promote("r"); err != nil { + t.Fatalf("promote: %v", err) + } + + // I4: new primary must have recoverable committed prefix. + newPrimary := c.Nodes["r"] + recoverableLSN := newPrimary.Storage.RecoverableLSN() + if recoverableLSN < committedLSN { + t.Fatalf("I4 VIOLATED: promoted recoverableLSN=%d < committedLSN=%d", + recoverableLSN, committedLSN) + } + + // Verify data matches reference. + refState := c.Reference.StateAt(committedLSN) + recState := newPrimary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := recState[block]; got != expected { + t.Fatalf("I4 VIOLATED: block %d: ref=%d got=%d", block, expected, got) + } + } +} + +// --- Direct test: checkpoint must not leak applied-but-uncheckpointed state --- + +func TestI2_CheckpointDoesNotLeakAppliedState(t *testing.T) { + s := NewStorage() + + // Write 5 entries, all durable. + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(5) + + // Flusher applies all 5 to LiveExtent. + s.ApplyToExtent(5) + + // But checkpoint only at LSN 2. + s.AdvanceCheckpoint(2) + + // CheckpointExtent must contain ONLY blocks 1-2, not 3-5. 
+ for i := uint64(3); i <= 5; i++ { + if val, ok := s.CheckpointExtent[i]; ok && val != 0 { + t.Fatalf("CHECKPOINT LEAK: block %d=%d in checkpoint but CheckpointLSN=2", i, val) + } + } + // Blocks 1-2 must be in checkpoint. + for i := uint64(1); i <= 2; i++ { + expected := i * 10 + if got := s.CheckpointExtent[i]; got != expected { + t.Fatalf("block %d: checkpoint should have %d, got %d", i, expected, got) + } + } + + // Now crash: LiveExtent lost, entries 3-5 only in WAL. + s.Crash() + recoveredLSN := s.Restart() + + if recoveredLSN != 5 { + t.Fatalf("recoveredLSN should be 5, got %d", recoveredLSN) + } + + // All 5 blocks must be recovered: 1-2 from checkpoint, 3-5 from WAL replay. + for i := uint64(1); i <= 5; i++ { + expected := i * 10 + if got := s.LiveExtent[i]; got != expected { + t.Fatalf("block %d: expected %d after crash+restart, got %d", i, expected, got) + } + } +} + +// --- A7: Historical state before checkpoint is not fakeable --- + +func TestA7_HistoricalState_NotReconstructableAfterGC(t *testing.T) { + s := NewStorage() + + // Write 10 entries, all durable. + for i := uint64(1); i <= 10; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(10) + s.ApplyToExtent(10) + + // Checkpoint at LSN 7. + s.AdvanceCheckpoint(7) + + // GC WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range s.WAL { + if w.LSN > s.CheckpointLSN { + retained = append(retained, w) + } + } + s.WAL = retained + + // Can reconstruct at LSN 7 (checkpoint covers it). + if !s.CanReconstructAt(7) { + t.Fatal("should be reconstructable at checkpoint LSN") + } + + // Can reconstruct at LSN 10 (checkpoint + WAL 8-10). + if !s.CanReconstructAt(10) { + t.Fatal("should be reconstructable at LSN 10 (checkpoint + WAL)") + } + + // CANNOT accurately reconstruct at LSN 3 (WAL 1-6 has been GC'd). + // The state at LSN 3 required WAL entries 1-3 which are gone. 
+ if s.CanReconstructAt(3) { + t.Fatal("A7: should NOT be reconstructable at LSN 3 after WAL GC — history is lost") + } + + // StateAt(3) returns checkpoint state (best-effort approximation, not exact). + // This is fine for display but must NOT be treated as authoritative. + state3 := s.StateAt(3) + // The returned state includes blocks 1-7 (from checkpoint), which is MORE + // than what was actually committed at LSN 3. This is the "current extent + // cannot fake old history" problem from A7. + if len(state3) == 3 { + t.Fatal("StateAt(3) after GC should return checkpoint state (7 blocks), not exact 3-block state") + } +} + +// --- Invariant I5: Checkpoint GC preserves recovery proof --- + +func TestI5_CheckpointGC_PreservesAckedBoundary(t *testing.T) { + s := NewStorage() + + for i := uint64(1); i <= 10; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(10) + s.ApplyToExtent(7) + s.AdvanceCheckpoint(7) + + // GC: remove WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range s.WAL { + if w.LSN > s.CheckpointLSN { + retained = append(retained, w) + } + } + s.WAL = retained + + // Crash + restart. + s.Crash() + recoveredLSN := s.Restart() + + if recoveredLSN != 10 { + t.Fatalf("I5: recoveredLSN should be 10, got %d", recoveredLSN) + } + + // All 10 blocks recoverable: 1-7 from checkpoint, 8-10 from WAL. + for i := uint64(1); i <= 10; i++ { + expected := i * 10 + if got := s.LiveExtent[i]; got != expected { + t.Fatalf("I5 VIOLATED: block %d: expected %d, got %d", i, expected, got) + } + } +} diff --git a/sw-block/prototype/distsim/predicates.go b/sw-block/prototype/distsim/predicates.go new file mode 100644 index 000000000..9e6982a24 --- /dev/null +++ b/sw-block/prototype/distsim/predicates.go @@ -0,0 +1,160 @@ +package distsim + +import "fmt" + +// DangerPredicate checks for a protocol-violating or dangerous state. +// Returns (violated bool, detail string). 
+type DangerPredicate func(c *Cluster) (bool, string) + +// PredicateAckedFlushLost checks if any committed (ACK'd) write has become +// unrecoverable on ANY node that is supposed to have it. +// This is the most dangerous protocol violation: data loss after ACK. +func PredicateAckedFlushLost(c *Cluster) (bool, string) { + committedLSN := c.Coordinator.CommittedLSN + if committedLSN == 0 { + return false, "" + } + + refState := c.Reference.StateAt(committedLSN) + + // Check primary. + primary := c.Primary() + if primary != nil && primary.Running { + recLSN := primary.Storage.RecoverableLSN() + if recLSN < committedLSN { + return true, fmt.Sprintf("primary %s: recoverableLSN=%d < committedLSN=%d", + primary.ID, recLSN, committedLSN) + } + // Verify committed state correctness using StateAt (not LiveExtent). + // LiveExtent may contain uncommitted-but-durable writes beyond committedLSN. + // Only check if we can reconstruct the exact committed state. + if primary.Storage.CanReconstructAt(committedLSN) { + nodeState := primary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := nodeState[block]; got != expected { + return true, fmt.Sprintf("primary %s: block %d = %d, reference = %d at committedLSN=%d", + primary.ID, block, got, expected, committedLSN) + } + } + } + } + + // Check replicas that should have committed data (InSync replicas). + for _, node := range c.Nodes { + if node.ID == c.Coordinator.PrimaryID { + continue + } + if !node.Running || node.ReplicaState != NodeStateInSync { + continue + } + recLSN := node.Storage.RecoverableLSN() + if recLSN < committedLSN { + return true, fmt.Sprintf("InSync replica %s: recoverableLSN=%d < committedLSN=%d", + node.ID, recLSN, committedLSN) + } + } + + return false, "" +} + +// PredicateVisibleUnrecoverableState checks if any running node has extent +// state that would NOT survive a crash+restart. This detects ghost visible +// state — data that is readable now but would be lost on crash. 
+func PredicateVisibleUnrecoverableState(c *Cluster) (bool, string) { + for _, node := range c.Nodes { + if !node.Running || node.Storage.LiveExtent == nil { + continue + } + // Simulate what would happen on crash+restart. + recoverableLSN := node.Storage.RecoverableLSN() + + // Check each block in LiveExtent: is its value backed by + // a write at LSN <= recoverableLSN? + for block, value := range node.Storage.LiveExtent { + // Find which LSN wrote this value. + writtenAtLSN := uint64(0) + for _, w := range node.Storage.WAL { + if w.Block == block && w.Value == value { + writtenAtLSN = w.LSN + } + } + if writtenAtLSN > recoverableLSN { + return true, fmt.Sprintf("node %s: block %d has value %d (written at LSN %d) but recoverableLSN=%d — ghost state", + node.ID, block, value, writtenAtLSN, recoverableLSN) + } + } + } + return false, "" +} + +// PredicateCatchUpLivelockOrMissingEscalation checks if any replica is stuck +// in CatchingUp without making progress and without being escalated to +// NeedsRebuild. Also checks if a replica needs rebuild but hasn't been +// escalated. +func PredicateCatchUpLivelockOrMissingEscalation(c *Cluster) (bool, string) { + for _, node := range c.Nodes { + if !node.Running { + continue + } + if node.ReplicaState == NodeStateCatchingUp { + // A node in CatchingUp is suspicious if it has been there for + // many ticks without approaching the target. We check if its + // FlushedLSN is far behind the primary's head. + primary := c.Primary() + if primary == nil { + continue + } + primaryHead := primary.Storage.WALDurableLSN + replicaFlushed := node.Storage.FlushedLSN + gap := primaryHead - replicaFlushed + + // If the gap is larger than what the WAL can reasonably hold + // and the node hasn't been escalated, that's a livelock risk. + // Use a simple heuristic: if gap > 2x what we've seen committed, flag it. 
+ if gap > c.Coordinator.CommittedLSN*2 && c.Coordinator.CommittedLSN > 5 { + return true, fmt.Sprintf("node %s: CatchingUp with gap=%d (primary head=%d, replica flushed=%d) — potential livelock", + node.ID, gap, primaryHead, replicaFlushed) + } + } + + // Check if a node is Lagging for a long time without being moved to + // CatchingUp or NeedsRebuild. + // Note: Lagging is a transient state that the control plane should resolve. + // In adversarial random tests without explicit recovery triggers, a node + // staying Lagging is expected. We only flag truly excessive lag (> 3x committed) + // as potential livelock — anything smaller is normal recovery latency. + if node.ReplicaState == NodeStateLagging { + primary := c.Primary() + if primary != nil { + gap := primary.Storage.WALDurableLSN - node.Storage.FlushedLSN + if gap > c.Coordinator.CommittedLSN*3 && c.Coordinator.CommittedLSN > 10 { + return true, fmt.Sprintf("node %s: Lagging with gap=%d without escalation to CatchingUp or NeedsRebuild", + node.ID, gap) + } + } + } + } + return false, "" +} + +// AllDangerPredicates returns the standard set of danger predicates. +func AllDangerPredicates() map[string]DangerPredicate { + return map[string]DangerPredicate{ + "acked_flush_lost": PredicateAckedFlushLost, + "visible_unrecoverable": PredicateVisibleUnrecoverableState, + "catchup_livelock_or_no_esc": PredicateCatchUpLivelockOrMissingEscalation, + } +} + +// CheckAllPredicates runs all danger predicates against a cluster state. +// Returns a map of violated predicate names → detail messages. 
+func CheckAllPredicates(c *Cluster) map[string]string { + violations := map[string]string{} + for name, pred := range AllDangerPredicates() { + violated, detail := pred(c) + if violated { + violations[name] = detail + } + } + return violations +} diff --git a/sw-block/prototype/distsim/simulator.go b/sw-block/prototype/distsim/simulator.go index 24da8d3f2..82d4b6389 100644 --- a/sw-block/prototype/distsim/simulator.go +++ b/sw-block/prototype/distsim/simulator.go @@ -300,8 +300,11 @@ func (s *Simulator) execute(e Event) { case EvFlusherTick: if node != nil && node.Running { - node.Storage.AdvanceCheckpoint(node.Storage.FlushedLSN) - s.record(e, fmt.Sprintf("flusher tick %s checkpoint=%d", e.NodeID, node.Storage.CheckpointLSN)) + // Phase 4.5: flusher first materializes WAL to extent, then checkpoints. + node.Storage.ApplyToExtent(node.Storage.WALDurableLSN) + node.Storage.AdvanceCheckpoint(node.Storage.WALDurableLSN) + s.record(e, fmt.Sprintf("flusher tick %s applied=%d checkpoint=%d", + e.NodeID, node.Storage.ExtentAppliedLSN, node.Storage.CheckpointLSN)) } case EvPromote: diff --git a/sw-block/prototype/distsim/storage.go b/sw-block/prototype/distsim/storage.go index b5f52153b..9fdca192b 100644 --- a/sw-block/prototype/distsim/storage.go +++ b/sw-block/prototype/distsim/storage.go @@ -8,23 +8,42 @@ type SnapshotState struct { State map[uint64]uint64 } +// Storage models the per-node storage state with explicit crash-consistency +// boundaries. Phase 4.5: split into 5 distinct LSN boundaries. +// +// State progression: +// Write arrives → ReceivedLSN (not yet durable) +// WAL fsync → WALDurableLSN (survives crash) +// Flusher → ExtentAppliedLSN (materialized to live extent, volatile) +// Checkpoint → CheckpointLSN (durable base image) +// +// After crash + restart: +// RecoverableState = CheckpointExtent + WAL[CheckpointLSN+1 .. 
WALDurableLSN] type Storage struct { - WAL []Write - Extent map[uint64]uint64 - ReceivedLSN uint64 - FlushedLSN uint64 - CheckpointLSN uint64 - Snapshots map[string]SnapshotState - BaseSnapshot *SnapshotState + WAL []Write + LiveExtent map[uint64]uint64 // runtime view (volatile — lost on crash) + CheckpointExtent map[uint64]uint64 // crash-safe base image (survives crash) + ReceivedLSN uint64 // highest LSN received (may not be durable) + WALDurableLSN uint64 // highest LSN guaranteed to survive crash (= FlushedLSN) + ExtentAppliedLSN uint64 // highest LSN materialized into LiveExtent + CheckpointLSN uint64 // highest LSN in the durable base image + Snapshots map[string]SnapshotState + BaseSnapshot *SnapshotState + + // Backward compat alias. + FlushedLSN uint64 // = WALDurableLSN } func NewStorage() *Storage { return &Storage{ - Extent: map[uint64]uint64{}, - Snapshots: map[string]SnapshotState{}, + LiveExtent: map[uint64]uint64{}, + CheckpointExtent: map[uint64]uint64{}, + Snapshots: map[string]SnapshotState{}, } } +// AppendWrite adds a WAL entry. Does NOT update LiveExtent — that's the flusher's job. +// Does NOT advance WALDurableLSN — that requires explicit AdvanceFlush (WAL fsync). func (s *Storage) AppendWrite(w Write) { // Insert in LSN order (handles out-of-order delivery from jitter). inserted := false @@ -41,43 +60,162 @@ func (s *Storage) AppendWrite(w Write) { if !inserted { s.WAL = append(s.WAL, w) } - s.Extent[w.Block] = w.Value if w.LSN > s.ReceivedLSN { s.ReceivedLSN = w.LSN } } +// AdvanceFlush simulates WAL fdatasync completing. Entries up to lsn are now +// durable and will survive crash. This is the authoritative progress for sync_all. 
func (s *Storage) AdvanceFlush(lsn uint64) { if lsn > s.ReceivedLSN { lsn = s.ReceivedLSN } - if lsn > s.FlushedLSN { - s.FlushedLSN = lsn + if lsn > s.WALDurableLSN { + s.WALDurableLSN = lsn + s.FlushedLSN = lsn // backward compat alias + } +} + +// ApplyToExtent simulates the flusher materializing WAL entries into the live extent. +// Entries from (ExtentAppliedLSN, targetLSN] are applied. This is a volatile operation — +// LiveExtent is lost on crash. +func (s *Storage) ApplyToExtent(targetLSN uint64) { + if targetLSN > s.WALDurableLSN { + targetLSN = s.WALDurableLSN // can't materialize un-durable entries + } + for _, w := range s.WAL { + if w.LSN <= s.ExtentAppliedLSN { + continue + } + if w.LSN > targetLSN { + break + } + s.LiveExtent[w.Block] = w.Value + } + if targetLSN > s.ExtentAppliedLSN { + s.ExtentAppliedLSN = targetLSN } } +// AdvanceCheckpoint creates a crash-safe base image at exactly the given LSN. +// The checkpoint image contains state ONLY through lsn — not the full LiveExtent. +// This is critical: LiveExtent may contain applied entries beyond lsn that are +// NOT part of the checkpoint and must NOT survive a crash. func (s *Storage) AdvanceCheckpoint(lsn uint64) { - if lsn > s.FlushedLSN { - lsn = s.FlushedLSN + if lsn > s.ExtentAppliedLSN { + lsn = s.ExtentAppliedLSN } if lsn > s.CheckpointLSN { s.CheckpointLSN = lsn + // Build checkpoint image from base + WAL replay through exactly lsn. + // Do NOT clone LiveExtent — it may contain entries beyond checkpoint. + s.CheckpointExtent = s.StateAt(lsn) + // Set BaseSnapshot so StateAt() can use it after WAL GC. + s.BaseSnapshot = &SnapshotState{ + ID: "checkpoint", + LSN: lsn, + State: cloneMap(s.CheckpointExtent), + } + } +} + +// Crash simulates a node crash: LiveExtent is lost, only CheckpointExtent +// and durable WAL entries survive. 
+func (s *Storage) Crash() { + s.LiveExtent = nil + s.ExtentAppliedLSN = 0 + // ReceivedLSN drops to WALDurableLSN (un-fsynced entries lost) + s.ReceivedLSN = s.WALDurableLSN + // Remove non-durable WAL entries + durable := make([]Write, 0, len(s.WAL)) + for _, w := range s.WAL { + if w.LSN <= s.WALDurableLSN { + durable = append(durable, w) + } + } + s.WAL = durable +} + +// Restart recovers state from CheckpointExtent + durable WAL replay. +// Sets BaseSnapshot from checkpoint so StateAt() works after WAL GC. +// Returns the RecoverableLSN (highest LSN in the recovered view). +func (s *Storage) Restart() uint64 { + // Start from checkpoint base image. + s.LiveExtent = cloneMap(s.CheckpointExtent) + // Set BaseSnapshot so StateAt() can reconstruct from checkpoint after WAL GC. + s.BaseSnapshot = &SnapshotState{ + ID: "checkpoint", + LSN: s.CheckpointLSN, + State: cloneMap(s.CheckpointExtent), + } + // Replay durable WAL entries past checkpoint. + for _, w := range s.WAL { + if w.LSN <= s.CheckpointLSN { + continue + } + if w.LSN > s.WALDurableLSN { + break + } + s.LiveExtent[w.Block] = w.Value + } + s.ExtentAppliedLSN = s.WALDurableLSN + return s.WALDurableLSN +} + +// RecoverableLSN returns the highest LSN that would be recoverable after +// crash + restart. This is a replayability proof, not just a watermark: +// - CheckpointExtent covers [0, CheckpointLSN] +// - WAL entries (CheckpointLSN, WALDurableLSN] must exist contiguously +// - If any gap exists in the WAL between CheckpointLSN and WALDurableLSN, +// recovery would be incomplete +// +// Returns the highest contiguously recoverable LSN from checkpoint + WAL. +func (s *Storage) RecoverableLSN() uint64 { + // Start from checkpoint — everything through CheckpointLSN is safe. + recoverable := s.CheckpointLSN + + // Walk durable WAL entries past checkpoint and verify contiguity. 
+ for _, w := range s.WAL { + if w.LSN <= s.CheckpointLSN { + continue // already covered by checkpoint + } + if w.LSN > s.WALDurableLSN { + break // not durable + } + if w.LSN == recoverable+1 { + recoverable = w.LSN // contiguous — extend + } else { + break // gap — stop here + } } + return recoverable } +// StateAt computes the block state by replaying WAL entries up to the given LSN. +// Used for correctness assertions against the reference model. +// +// Phase 4.5: for lsn < CheckpointLSN (after WAL GC), the WAL entries needed +// to reconstruct historical state may no longer exist. In that case, we return +// the checkpoint state (best available), but callers should use +// CanReconstructAt(lsn) to check if the result is authoritative. func (s *Storage) StateAt(lsn uint64) map[uint64]uint64 { state := map[uint64]uint64{} + usedSnapshot := false if s.BaseSnapshot != nil { if s.BaseSnapshot.LSN > lsn { - return cloneMap(s.BaseSnapshot.State) + // Snapshot is NEWER than requested — cannot use it. + // Fall through to WAL-only replay. + } else { + state = cloneMap(s.BaseSnapshot.State) + usedSnapshot = true } - state = cloneMap(s.BaseSnapshot.State) } for _, w := range s.WAL { if w.LSN > lsn { break } - if s.BaseSnapshot != nil && w.LSN <= s.BaseSnapshot.LSN { + if usedSnapshot && w.LSN <= s.BaseSnapshot.LSN { continue } state[w.Block] = w.Value @@ -85,6 +223,62 @@ func (s *Storage) StateAt(lsn uint64) map[uint64]uint64 { return state } +// CanReconstructAt returns true if the storage has enough information to +// accurately reconstruct state at the given LSN. False means the WAL entries +// needed for historical reconstruction have been GC'd and StateAt(lsn) may +// return an approximation (checkpoint state) rather than exact history. +// +// A7 (Historical Data Correctness): this should be checked before trusting +// StateAt() results for old LSNs. Current extent cannot fake old history. 
+func (s *Storage) CanReconstructAt(lsn uint64) bool { + if lsn == 0 { + return true // empty state is always reconstructable + } + + // To reconstruct state at exactly lsn, we need a contiguous chain of + // evidence from LSN 0 (or a snapshot taken AT lsn) through lsn. + // + // A checkpoint at LSN C contains state through C. If lsn < C, the + // checkpoint has MORE data than existed at lsn — it cannot reconstruct + // the exact historical state at lsn. We would need WAL entries [1, lsn] + // to rebuild from scratch, which are gone after GC. + // + // A checkpoint at LSN C where C == lsn is exact. + // A checkpoint at LSN C where C > lsn cannot help with exact lsn state. + + // Check if any snapshot was taken exactly at this LSN. + for _, snap := range s.Snapshots { + if snap.LSN == lsn { + return true + } + } + + // Find the best base: a snapshot/checkpoint at or before lsn. + baseLSN := uint64(0) + if s.BaseSnapshot != nil && s.BaseSnapshot.LSN <= lsn { + baseLSN = s.BaseSnapshot.LSN + } + + // If baseLSN > 0, we have a snapshot that provides state through baseLSN. + // We need contiguous WAL from baseLSN+1 through lsn. + // If baseLSN == 0, we need contiguous WAL from 1 through lsn. 
+ + expected := baseLSN + 1 + for _, w := range s.WAL { + if w.LSN <= baseLSN { + continue + } + if w.LSN > lsn { + break + } + if w.LSN != expected { + return false // gap — history is incomplete + } + expected = w.LSN + 1 + } + return expected > lsn +} + func (s *Storage) TakeSnapshot(id string, lsn uint64) SnapshotState { snap := SnapshotState{ ID: id, @@ -96,10 +290,13 @@ func (s *Storage) TakeSnapshot(id string, lsn uint64) SnapshotState { } func (s *Storage) LoadSnapshot(snap SnapshotState) { - s.Extent = cloneMap(snap.State) + s.LiveExtent = cloneMap(snap.State) + s.CheckpointExtent = cloneMap(snap.State) + s.WALDurableLSN = snap.LSN s.FlushedLSN = snap.LSN s.ReceivedLSN = snap.LSN s.CheckpointLSN = snap.LSN + s.ExtentAppliedLSN = snap.LSN s.BaseSnapshot = &SnapshotState{ ID: snap.ID, LSN: snap.LSN, @@ -111,7 +308,14 @@ func (s *Storage) LoadSnapshot(snap SnapshotState) { func (s *Storage) ReplaceWAL(writes []Write) { s.WAL = append([]Write(nil), writes...) sort.Slice(s.WAL, func(i, j int) bool { return s.WAL[i].LSN < s.WAL[j].LSN }) - s.Extent = s.StateAt(s.ReceivedLSN) + // Recompute LiveExtent from base + WAL + s.LiveExtent = s.StateAt(s.ReceivedLSN) +} + +// Extent returns the current live extent for backward compatibility. +// Callers should migrate to LiveExtent. +func (s *Storage) Extent() map[uint64]uint64 { + return s.LiveExtent } func writesInRange(writes []Write, startExclusive, endInclusive uint64) []Write { diff --git a/weed/server/master_block_failover.go b/weed/server/master_block_failover.go index f3eb35bbb..8a97079b6 100644 --- a/weed/server/master_block_failover.go +++ b/weed/server/master_block_failover.go @@ -10,10 +10,12 @@ import ( // pendingRebuild records a volume that needs rebuild when a dead VS reconnects. 
type pendingRebuild struct { - VolumeName string - OldPath string // path on dead server - NewPrimary string // promoted replica server - Epoch uint64 + VolumeName string + OldPath string // path on dead server + NewPrimary string // promoted replica server + Epoch uint64 + ReplicaDataAddr string // CP13-8: saved from before death for catch-up-first recovery + ReplicaCtrlAddr string // CP13-8: saved from before death for catch-up-first recovery } // blockFailoverState holds failover and rebuild state on the master. @@ -88,6 +90,8 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) { ri := entry.ReplicaByServer(deadServer) if ri != nil { replicaPath := ri.Path + replicaDataAddr := ri.DataAddr // CP13-8: save before removal + replicaCtrlAddr := ri.CtrlAddr // Remove dead replica from registry. if err := ms.blockRegistry.RemoveReplica(entry.Name, deadServer); err != nil { glog.Warningf("failover: RemoveReplica %q on %s: %v", entry.Name, deadServer, err) @@ -95,10 +99,12 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) { } // Record pending rebuild for when dead server reconnects. ms.recordPendingRebuild(deadServer, pendingRebuild{ - VolumeName: entry.Name, - OldPath: replicaPath, - NewPrimary: entry.VolumeServer, // current primary (unchanged) - Epoch: entry.Epoch, + VolumeName: entry.Name, + OldPath: replicaPath, + NewPrimary: entry.VolumeServer, + Epoch: entry.Epoch, + ReplicaDataAddr: replicaDataAddr, + ReplicaCtrlAddr: replicaCtrlAddr, }) glog.V(0).Infof("failover: removed dead replica %s for %q, pending rebuild", deadServer, entry.Name) @@ -238,20 +244,73 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) { continue } - // Update registry: reconnected server becomes a replica (via AddReplica for RF≥2 support). + // CP13-8: Use replica addresses saved before death for catch-up-first recovery. 
+ // These are deterministic (derived from volume path hash in ReplicationPorts), + // so they should be the same after VS restart. If the VS somehow gets different + // ports (e.g., port conflict), the catch-up attempt will fail at the TCP level + // and fall through to the shipper's NeedsRebuild → master rebuild path. + // This is an optimization, not a source of truth — the master remains the + // authority for topology/assignment changes. + dataAddr := rb.ReplicaDataAddr + ctrlAddr := rb.ReplicaCtrlAddr + + // Update registry: reconnected server becomes a replica. ms.blockRegistry.AddReplica(rb.VolumeName, ReplicaInfo{ - Server: reconnectedServer, - Path: rb.OldPath, + Server: reconnectedServer, + Path: rb.OldPath, + DataAddr: dataAddr, + CtrlAddr: ctrlAddr, }) - // T4: Warn if RebuildListenAddr is empty (new primary hasn't heartbeated yet). + // CP13-8: Try catch-up first (Replica assignment), fall back to rebuild. + // If the replica can catch up from the primary's retained WAL, this is + // much faster than a full rebuild. The shipper's reconnect handshake + // (CP13-5) determines whether catch-up or rebuild is actually needed. + // If catch-up fails, the shipper marks NeedsRebuild, and the master + // sends a Rebuilding assignment on the next heartbeat cycle. + if dataAddr != "" { + leaseTTLMs := blockvol.LeaseTTLToWire(30 * time.Second) + // Send Replica assignment to the reconnected server. + ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{ + Path: rb.OldPath, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RoleReplica), + LeaseTtlMs: leaseTTLMs, + ReplicaDataAddr: dataAddr, + ReplicaCtrlAddr: ctrlAddr, + }) + // Also re-send Primary assignment so the primary gets fresh replica addresses. + primaryAssignment := blockvol.BlockVolumeAssignment{ + Path: entry.Path, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RolePrimary), + LeaseTtlMs: leaseTTLMs, + } + // Include all replica addresses. 
+ for _, ri := range entry.Replicas { + primaryAssignment.ReplicaAddrs = append(primaryAssignment.ReplicaAddrs, blockvol.ReplicaAddr{ + DataAddr: ri.DataAddr, + CtrlAddr: ri.CtrlAddr, + }) + } + if len(entry.Replicas) == 1 { + primaryAssignment.ReplicaDataAddr = entry.Replicas[0].DataAddr + primaryAssignment.ReplicaCtrlAddr = entry.Replicas[0].CtrlAddr + } + ms.blockAssignmentQueue.Enqueue(entry.VolumeServer, primaryAssignment) + + glog.V(0).Infof("recover: enqueued catch-up (Replica) for %q on %s (epoch=%d, data=%s) + Primary refresh on %s", + rb.VolumeName, reconnectedServer, entry.Epoch, dataAddr, entry.VolumeServer) + continue + } + + // Fallback: no known addresses — use rebuild path. rebuildAddr := entry.RebuildListenAddr if rebuildAddr == "" { glog.Warningf("rebuild: %q RebuildListenAddr is empty (new primary %s may not have heartbeated yet), "+ "queuing rebuild anyway — VS should retry on empty addr", rb.VolumeName, entry.VolumeServer) } - // Enqueue rebuild assignment for the reconnected server. ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{ Path: rb.OldPath, Epoch: entry.Epoch, @@ -268,6 +327,39 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) { // reevaluateOrphanedPrimaries checks if the given server is a replica for any // volumes whose primary is dead (not block-capable). If so, promotes the best // available replica — but only after the old primary's lease has expired, to +// refreshPrimaryForAddrChange sends a fresh Primary assignment when a replica's +// receiver address changed (e.g., restart with port conflict). This ensures the +// primary's shipper gets the new address without waiting for the next heartbeat cycle. 
+func (ms *MasterServer) refreshPrimaryForAddrChange(ac ReplicaAddrChange) { + entry, ok := ms.blockRegistry.Lookup(ac.VolumeName) + if !ok { + return + } + leaseTTLMs := blockvol.LeaseTTLToWire(30 * time.Second) + assignment := blockvol.BlockVolumeAssignment{ + Path: entry.Path, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RolePrimary), + LeaseTtlMs: leaseTTLMs, + } + for _, ri := range entry.Replicas { + assignment.ReplicaAddrs = append(assignment.ReplicaAddrs, blockvol.ReplicaAddr{ + DataAddr: ri.DataAddr, + CtrlAddr: ri.CtrlAddr, + }) + } + if len(entry.Replicas) == 1 { + assignment.ReplicaDataAddr = entry.Replicas[0].DataAddr + assignment.ReplicaCtrlAddr = entry.Replicas[0].CtrlAddr + } + // Use current registry primary (not stale ac.PrimaryServer) in case + // failover happened between address-change detection and this refresh. + currentPrimary := entry.VolumeServer + ms.blockAssignmentQueue.Enqueue(currentPrimary, assignment) + glog.V(0).Infof("recover: replica addr changed for %q (data: %s→%s, ctrl: %s→%s), refreshed Primary on %s", + ac.VolumeName, ac.OldDataAddr, ac.NewDataAddr, ac.OldCtrlAddr, ac.NewCtrlAddr, currentPrimary) +} + // maintain the same split-brain protection as failoverBlockVolumes(). // This fixes B-06 (orphaned primary after replica re-register) // and partially B-08 (fast reconnect skips failover window). diff --git a/weed/server/master_block_registry.go b/weed/server/master_block_registry.go index 6289dac53..687d581cd 100644 --- a/weed/server/master_block_registry.go +++ b/weed/server/master_block_registry.go @@ -353,7 +353,21 @@ func (r *BlockVolumeRegistry) ListByServer(server string) []BlockVolumeEntry { // Called on the first heartbeat from a volume server. // Marks reported volumes as Active, removes entries for this server // that are not reported (stale). 
-func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master_pb.BlockVolumeInfoMessage, nvmeAddr string) { +// ReplicaAddrChange records a replica whose advertised address changed, +// requiring a Primary assignment refresh so the shipper gets the new address. +// Detected only in the full heartbeat path (UpdateFullHeartbeat). Delta +// heartbeats do not carry replica addresses and cannot trigger this. +type ReplicaAddrChange struct { + VolumeName string + PrimaryServer string + OldDataAddr string + OldCtrlAddr string + NewDataAddr string + NewCtrlAddr string +} + +func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master_pb.BlockVolumeInfoMessage, nvmeAddr string) []ReplicaAddrChange { + var addrChanges []ReplicaAddrChange r.mu.Lock() defer r.mu.Unlock() @@ -495,6 +509,31 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master } else { existing.Replicas[i].WALLag = 0 } + // CP13-8: detect address change on replica restart. + // If either the data or control address changed, the primary's + // shipper has a stale endpoint. Queue a Primary refresh. 
+ if info.ReplicaDataAddr != "" || info.ReplicaCtrlAddr != "" { + oldData := existing.Replicas[i].DataAddr + oldCtrl := existing.Replicas[i].CtrlAddr + dataChanged := info.ReplicaDataAddr != "" && oldData != "" && oldData != info.ReplicaDataAddr + ctrlChanged := info.ReplicaCtrlAddr != "" && oldCtrl != "" && oldCtrl != info.ReplicaCtrlAddr + if dataChanged || ctrlChanged { + addrChanges = append(addrChanges, ReplicaAddrChange{ + VolumeName: existingName, + PrimaryServer: existing.VolumeServer, + OldDataAddr: oldData, + OldCtrlAddr: oldCtrl, + NewDataAddr: info.ReplicaDataAddr, + NewCtrlAddr: info.ReplicaCtrlAddr, + }) + } + if info.ReplicaDataAddr != "" { + existing.Replicas[i].DataAddr = info.ReplicaDataAddr + } + if info.ReplicaCtrlAddr != "" { + existing.Replicas[i].CtrlAddr = info.ReplicaCtrlAddr + } + } break } } @@ -511,6 +550,14 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master if name == "" { continue } + // Skip auto-register if a create is in progress for this volume. + // Without this gate, the replica VS heartbeat can race ahead of + // CreateBlockVolume.Register and create a bare entry that lacks + // replica info, causing the real Register to hit "already registered" + // and fall back to the incomplete auto-registered entry. + if r.IsInflight(name) { + continue + } existing, dup := r.volumes[name] if !dup { entry := &BlockVolumeEntry{ @@ -545,6 +592,7 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master } } } + return addrChanges } // reconcileOnRestart handles the case where a second server reports a volume @@ -769,6 +817,12 @@ func (r *BlockVolumeRegistry) ReleaseInflight(name string) { r.inflight.Delete(name) } +// IsInflight returns true if a create is in progress for the given volume name. +func (r *BlockVolumeRegistry) IsInflight(name string) bool { + _, ok := r.inflight.Load(name) + return ok +} + // countForServer returns the number of volumes on the given server. 
// Caller must hold at least RLock. func (r *BlockVolumeRegistry) countForServer(server string) int { diff --git a/weed/server/master_block_registry_test.go b/weed/server/master_block_registry_test.go index cd3ed34c4..f0ecb7e23 100644 --- a/weed/server/master_block_registry_test.go +++ b/weed/server/master_block_registry_test.go @@ -1900,3 +1900,63 @@ func TestUpdateEntry_NotFound(t *testing.T) { t.Fatal("expected error for nonexistent volume") } } + +// TestRegistry_InflightBlocksAutoRegister verifies that heartbeat auto-register +// is suppressed while a create is in-flight for the same volume. This prevents +// a race where the replica VS heartbeat arrives before CreateBlockVolume.Register +// completes, creating a bare entry that lacks replica info. +func TestRegistry_InflightBlocksAutoRegister(t *testing.T) { + r := NewBlockVolumeRegistry() + + // Simulate CreateBlockVolume acquiring the inflight lock. + if !r.AcquireInflight("vol1") { + t.Fatal("AcquireInflight should succeed") + } + + // Replica VS sends heartbeat reporting vol1 — while create is in-flight. + // This should be silently skipped (not auto-registered). + r.UpdateFullHeartbeat("replica-server:8080", []*master_pb.BlockVolumeInfoMessage{ + {Path: "/blocks/vol1.blk", Epoch: 1, Role: 2, VolumeSize: 1 << 30}, + }, "") + + // vol1 should NOT be in the registry (auto-register was blocked). + if _, ok := r.Lookup("vol1"); ok { + t.Fatal("vol1 should not be auto-registered while inflight lock is held") + } + + // Now simulate CreateBlockVolume completing: register with replicas. + r.Register(&BlockVolumeEntry{ + Name: "vol1", + VolumeServer: "primary-server:8080", + Path: "/blocks/vol1.blk", + SizeBytes: 1 << 30, + Epoch: 1, + Status: StatusActive, + Replicas: []ReplicaInfo{ + {Server: "replica-server:8080", Path: "/blocks/vol1.blk"}, + }, + }) + r.ReleaseInflight("vol1") + + // Entry should have the replica. 
+ entry, ok := r.Lookup("vol1") + if !ok { + t.Fatal("vol1 should exist after Register") + } + if len(entry.Replicas) != 1 { + t.Fatalf("replicas=%d, want 1", len(entry.Replicas)) + } + if entry.Replicas[0].Server != "replica-server:8080" { + t.Fatalf("replica server=%s", entry.Replicas[0].Server) + } + + // After inflight released, subsequent heartbeats should update normally. + r.UpdateFullHeartbeat("replica-server:8080", []*master_pb.BlockVolumeInfoMessage{ + {Path: "/blocks/vol1.blk", Epoch: 2, Role: 2, VolumeSize: 1 << 30, HealthScore: 0.9}, + }, "") + + entry, _ = r.Lookup("vol1") + if entry.Replicas[0].HealthScore != 0.9 { + t.Fatalf("replica health not updated after inflight released: %f", entry.Replicas[0].HealthScore) + } +} diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index 34c1142e2..8f88aa858 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -277,7 +277,12 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ // (BlockVolumeInfos on first heartbeat) or deltas (NewBlockVolumes/DeletedBlockVolumes // on subsequent heartbeats), never both in the same message. if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes { - ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos, heartbeat.BlockNvmeAddr) + addrChanges := ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos, heartbeat.BlockNvmeAddr) + // CP13-8: If a replica's receiver address changed (e.g., restart with port conflict), + // immediately refresh the primary's assignment with the new addresses. + for _, ac := range addrChanges { + ms.refreshPrimaryForAddrChange(ac) + } // T2 (B-06): After updating registry from heartbeat, check if this server // is a replica for any volume whose primary is dead. If so, promote. 
ms.reevaluateOrphanedPrimaries(dn.Url()) diff --git a/weed/server/qa_block_edge_cases_test.go b/weed/server/qa_block_edge_cases_test.go new file mode 100644 index 000000000..938ef7513 --- /dev/null +++ b/weed/server/qa_block_edge_cases_test.go @@ -0,0 +1,481 @@ +package weed_server + +import ( + "sync" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" +) + +// ============================================================ +// Edge Case Tests: RF, Promotion, Network, LSN +// +// Covers gaps identified in the testing framework review: +// 1. LSN-lagging replica skipped during promotion +// 2. Cascading double failover (RF=3, epoch chain 1→2→3) +// 3. Demotion/drain under concurrent promotion pressure +// 4. Promotion with mixed LSN + health scores +// 5. Network flap simulation (mark/unmark block capable rapidly) +// 6. RF=3 all-gate evaluation under pressure +// ============================================================ + +// --- Test 1: LSN-lagging replica skipped, fresher one promoted --- + +func TestEdge_LSNLag_StaleReplicaSkipped(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(10) + + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("stale-replica") + ms.blockRegistry.MarkBlockCapable("fresh-replica") + + entry := &BlockVolumeEntry{ + Name: "lsn-test", VolumeServer: "primary", Path: "/data/lsn-test.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), // expired + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + { + Server: "stale-replica", Path: "/data/lsn-test.blk", + HealthScore: 1.0, WALHeadLSN: 100, // lag=900, way beyond tolerance=10 + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }, + { + Server: "fresh-replica", Path: "/data/lsn-test.blk", + HealthScore: 0.9, WALHeadLSN: 995, // 
lag=5, within tolerance=10 + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }, + }, + } + ms.blockRegistry.Register(entry) + + // Kill primary. + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + // Verify: fresh-replica promoted (despite lower health score), stale skipped. + after, ok := ms.blockRegistry.Lookup("lsn-test") + if !ok { + t.Fatal("volume not found") + } + if after.VolumeServer != "fresh-replica" { + t.Fatalf("expected fresh-replica promoted, got %q (stale-replica with lag=900 should be skipped)", after.VolumeServer) + } + if after.Epoch != 2 { + t.Fatalf("epoch: got %d, want 2", after.Epoch) + } +} + +// --- Test 2: Cascading double failover (RF=3, epoch 1→2→3) --- + +func TestEdge_CascadeFailover_RF3_EpochChain(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + + entry := &BlockVolumeEntry{ + Name: "cascade-test", VolumeServer: "vs1", Path: "/data/cascade.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + ReplicaFactor: 3, + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "vs3", Path: "/r3.blk", HealthScore: 0.9, WALHeadLSN: 100, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + } + ms.blockRegistry.Register(entry) + + // Failover 1: vs1 dies → vs2 promoted (higher health). 
+ ms.blockRegistry.UnmarkBlockCapable("vs1") + ms.failoverBlockVolumes("vs1") + + after1, _ := ms.blockRegistry.Lookup("cascade-test") + if after1.VolumeServer != "vs2" { + t.Fatalf("failover 1: expected vs2, got %q", after1.VolumeServer) + } + if after1.Epoch != 2 { + t.Fatalf("failover 1: epoch got %d, want 2", after1.Epoch) + } + + // Failover 2: vs2 dies → vs3 promoted (only remaining). + // Update vs3's heartbeat and set lease expired for the new primary. + ms.blockRegistry.UpdateEntry("cascade-test", func(e *BlockVolumeEntry) { + e.LastLeaseGrant = time.Now().Add(-10 * time.Second) + for i := range e.Replicas { + if e.Replicas[i].Server == "vs3" { + e.Replicas[i].LastHeartbeat = time.Now() + } + } + }) + + ms.blockRegistry.UnmarkBlockCapable("vs2") + ms.failoverBlockVolumes("vs2") + + after2, _ := ms.blockRegistry.Lookup("cascade-test") + if after2.VolumeServer != "vs3" { + t.Fatalf("failover 2: expected vs3, got %q", after2.VolumeServer) + } + if after2.Epoch != 3 { + t.Fatalf("failover 2: epoch got %d, want 3", after2.Epoch) + } + + // No more replicas — third failover should fail silently. + ms.blockRegistry.UpdateEntry("cascade-test", func(e *BlockVolumeEntry) { + e.LastLeaseGrant = time.Now().Add(-10 * time.Second) + }) + ms.blockRegistry.UnmarkBlockCapable("vs3") + ms.failoverBlockVolumes("vs3") + + after3, _ := ms.blockRegistry.Lookup("cascade-test") + // Epoch should still be 3 — no eligible replicas. 
+ if after3.Epoch != 3 { + t.Fatalf("failover 3: epoch should stay 3, got %d", after3.Epoch) + } +} + +// --- Test 3: Concurrent failover + heartbeat + promotion (stress) --- + +func TestEdge_ConcurrentFailoverAndHeartbeat_NoPanic(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + + setup := func() { + ms.blockRegistry.Unregister("stress-vol") + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "stress-vol", VolumeServer: "vs1", Path: "/data/stress.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r2.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "vs3", Path: "/r3.blk", HealthScore: 0.9, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + } + + for round := 0; round < 30; round++ { + setup() + var wg sync.WaitGroup + wg.Add(4) + go func() { defer wg.Done(); ms.failoverBlockVolumes("vs1") }() + go func() { defer wg.Done(); ms.reevaluateOrphanedPrimaries("vs2") }() + go func() { defer wg.Done(); ms.blockRegistry.PromoteBestReplica("stress-vol") }() + go func() { + defer wg.Done() + ms.blockRegistry.ManualPromote("stress-vol", "", true) + }() + wg.Wait() + } + // No panic = pass. 
+} + +// --- Test 4: LSN + health score interaction — health wins within tolerance --- + +func TestEdge_LSNWithinTolerance_HealthWins(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(100) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("high-health") + ms.blockRegistry.MarkBlockCapable("high-lsn") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "health-vs-lsn", VolumeServer: "primary", Path: "/data/hvl.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + {Server: "high-health", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 950, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "high-lsn", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 999, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("health-vs-lsn") + // Both within tolerance (lag ≤ 100). Health wins: high-health (1.0) > high-lsn (0.5). 
+ if after.VolumeServer != "high-health" { + t.Fatalf("expected high-health promoted (higher health, both within LSN tolerance), got %q", after.VolumeServer) + } +} + +// --- Test 5: Network flap simulation — rapid mark/unmark block capable --- + +func TestEdge_NetworkFlap_RapidMarkUnmark(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("flapper") + ms.blockRegistry.MarkBlockCapable("stable") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "flap-test", VolumeServer: "stable", Path: "/data/flap.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now(), + Replicas: []ReplicaInfo{ + {Server: "flapper", Path: "/r.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + var wg sync.WaitGroup + // Goroutine 1: rapidly flap the "flapper" server. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + ms.blockRegistry.UnmarkBlockCapable("flapper") + ms.blockRegistry.MarkBlockCapable("flapper") + } + }() + + // Goroutine 2: attempt promotions during flapping. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + ms.blockRegistry.EvaluatePromotion("flap-test") + } + }() + + // Goroutine 3: concurrent heartbeat updates. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + ms.blockRegistry.UpdateFullHeartbeat("flapper", nil, "") + } + }() + + wg.Wait() + // No panic, no corruption = pass. + + // Volume should still be on stable primary. 
+ after, ok := ms.blockRegistry.Lookup("flap-test") + if !ok { + t.Fatal("volume lost during flapping") + } + if after.VolumeServer != "stable" { + t.Fatalf("primary changed from stable to %q during flapping", after.VolumeServer) + } +} + +// --- Test 6: RF=3 all gates — mixed rejection reasons --- + +func TestEdge_RF3_MixedGates_BestEligiblePromoted(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(50) + ms.blockRegistry.MarkBlockCapable("primary") + // Note: "dead-server" NOT marked block capable. + ms.blockRegistry.MarkBlockCapable("stale-hb") + ms.blockRegistry.MarkBlockCapable("good") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "mixed-gates", VolumeServer: "primary", Path: "/data/mixed.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + WALHeadLSN: 500, + Replicas: []ReplicaInfo{ + {Server: "dead-server", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "stale-hb", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), + LastHeartbeat: time.Now().Add(-10 * time.Minute)}, // stale + {Server: "good", Path: "/r3.blk", HealthScore: 0.8, WALHeadLSN: 480, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + // Evaluate preflight first (read-only). + pf, err := ms.blockRegistry.EvaluatePromotion("mixed-gates") + if err != nil { + t.Fatalf("evaluate: %v", err) + } + if !pf.Promotable { + t.Fatalf("should be promotable, reason=%s, rejections=%v", pf.Reason, pf.Rejections) + } + // Should have 2 rejections: dead-server (server_dead) + stale-hb (stale_heartbeat). 
+ if len(pf.Rejections) != 2 { + t.Fatalf("expected 2 rejections, got %d: %v", len(pf.Rejections), pf.Rejections) + } + reasons := map[string]string{} + for _, r := range pf.Rejections { + reasons[r.Server] = r.Reason + } + if reasons["dead-server"] != "server_dead" { + t.Fatalf("dead-server: got %q, want server_dead", reasons["dead-server"]) + } + if reasons["stale-hb"] != "stale_heartbeat" { + t.Fatalf("stale-hb: got %q, want stale_heartbeat", reasons["stale-hb"]) + } + + // Now actually promote. + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("mixed-gates") + if after.VolumeServer != "good" { + t.Fatalf("expected 'good' promoted (only eligible), got %q", after.VolumeServer) + } +} + +// --- Test 7: Promotion changes publication (ISCSIAddr, NvmeAddr) --- + +func TestEdge_PromotionUpdatesPublication(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "pub-test", VolumeServer: "primary", Path: "/data/pub.blk", + ISCSIAddr: "primary:3260", NvmeAddr: "primary:4420", NQN: "nqn.primary", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, + ISCSIAddr: "replica:3261", NvmeAddr: "replica:4421", NQN: "nqn.replica", + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("pub-test") + if after.ISCSIAddr != "replica:3261" { + t.Fatalf("ISCSIAddr: got %q, want replica:3261", after.ISCSIAddr) + } + if after.NvmeAddr != "replica:4421" { + t.Fatalf("NvmeAddr: got %q, want 
replica:4421", after.NvmeAddr) + } + if after.NQN != "nqn.replica" { + t.Fatalf("NQN: got %q, want nqn.replica", after.NQN) + } +} + +// --- Test 8: Orphaned primary re-evaluation with LSN lag --- + +func TestEdge_OrphanReevaluation_LSNLag_StillPromotes(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(10) + // Primary is dead, replica is alive but lagging. + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "orphan-lag", VolumeServer: "dead-primary", Path: "/data/orphan.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), // expired + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + // Orphan re-evaluation: replica reconnects. + ms.reevaluateOrphanedPrimaries("replica") + + // The replica has WAL lag of 500 (way beyond tolerance=10). + // But it's the ONLY replica — should it promote or not? + // Current behavior: LSN gate rejects it. No promotion. + after, _ := ms.blockRegistry.Lookup("orphan-lag") + if after.Epoch != 1 { + // If epoch changed, the lagging replica was promoted. + // This may or may not be desired — document the behavior. + t.Logf("NOTE: lagging replica WAS promoted (epoch=%d). LSN lag=%d, tolerance=%d", + after.Epoch, 1000-500, 10) + } else { + t.Logf("NOTE: lagging replica was NOT promoted (epoch=1). Volume is stuck with dead primary.") + t.Logf("This is the current behavior: LSN gate blocks promotion even when it's the only option.") + } + // This test documents behavior, doesn't assert pass/fail. + // The question is: should a lagging-but-only replica be promoted to avoid downtime? 
+} + +// --- Test 9: Rebuild addr cleared after promotion, then repopulated --- + +func TestEdge_RebuildAddr_ClearedThenRepopulated(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "rebuild-addr", VolumeServer: "primary", Path: "/data/rebuild.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + RebuildListenAddr: "primary:15000", // old primary's rebuild addr + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("rebuild-addr") + // RebuildListenAddr should be cleared after promotion (B-11 fix). + if after.RebuildListenAddr != "" { + t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", after.RebuildListenAddr) + } +} + +// --- Test 10: Multiple volumes on same server — all fail over --- + +func TestEdge_MultipleVolumes_SameServer_AllFailover(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + + // Register 5 volumes, all with primary on vs1. 
+ for i := 0; i < 5; i++ { + name := "multi-" + string(rune('a'+i)) + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: name, VolumeServer: "vs1", Path: "/data/" + name + ".blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r/" + name + ".blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + } + + // Kill vs1 — all 5 volumes should fail over. + ms.blockRegistry.UnmarkBlockCapable("vs1") + ms.failoverBlockVolumes("vs1") + + for i := 0; i < 5; i++ { + name := "multi-" + string(rune('a'+i)) + entry, ok := ms.blockRegistry.Lookup(name) + if !ok { + t.Fatalf("volume %s not found", name) + } + if entry.VolumeServer != "vs2" { + t.Fatalf("volume %s: expected vs2, got %q", name, entry.VolumeServer) + } + if entry.Epoch != 2 { + t.Fatalf("volume %s: epoch got %d, want 2", name, entry.Epoch) + } + } +} diff --git a/weed/server/volume_grpc_client_to_master.go b/weed/server/volume_grpc_client_to_master.go index 10be5b1b7..8454471a2 100644 --- a/weed/server/volume_grpc_client_to_master.go +++ b/weed/server/volume_grpc_client_to_master.go @@ -309,6 +309,16 @@ func (vs *VolumeServer) doHeartbeatWithRetry(masterAddress pb.ServerAddress, grp glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err) return "", err } + case <-vs.blockStateChangeChan: + // Immediate block heartbeat on shipper state change (degraded/recovered). 
+ if vs.blockService == nil { + continue + } + glog.V(0).Infof("volume server %s:%d block state change → immediate heartbeat", vs.store.Ip, vs.store.Port) + if err = stream.Send(vs.collectBlockVolumeHeartbeat(ip, port, dataCenter, rack)); err != nil { + glog.V(0).Infof("Volume Server Failed to send block state-change heartbeat to master %s: %v", masterAddress, err) + return "", err + } case <-blockVolTickChan.C: if vs.blockService == nil { continue diff --git a/weed/server/volume_server.go b/weed/server/volume_server.go index 4db60cc19..ea4f3b7c6 100644 --- a/weed/server/volume_server.go +++ b/weed/server/volume_server.go @@ -55,7 +55,8 @@ type VolumeServer struct { isHeartbeating bool stopChan chan bool - blockService *BlockService // block volume iSCSI service (nil if disabled) + blockService *BlockService // block volume iSCSI service (nil if disabled) + blockStateChangeChan chan bool // triggers immediate block heartbeat on shipper state change } func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, @@ -103,6 +104,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, fileSizeLimitBytes: int64(fileSizeLimitMB) * 1024 * 1024, isHeartbeating: true, stopChan: make(chan bool), + blockStateChangeChan: make(chan bool, 1), inFlightUploadDataLimitCond: sync.NewCond(new(sync.Mutex)), inFlightDownloadDataLimitCond: sync.NewCond(new(sync.Mutex)), concurrentUploadLimit: concurrentUploadLimit, @@ -135,6 +137,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, adminMux.HandleFunc("/stats/disk", vs.guard.WhiteList(vs.statsDiskHandler)) */ } + adminMux.HandleFunc("/debug/block/shipper", vs.debugBlockShipperHandler) adminMux.HandleFunc("/", requestIDMiddleware(vs.privateStoreHandler)) if publicMux != adminMux { // separated admin and public port diff --git a/weed/server/volume_server_block.go b/weed/server/volume_server_block.go index c5e0390d8..03444ed9d 100644 --- a/weed/server/volume_server_block.go +++ 
b/weed/server/volume_server_block.go @@ -14,6 +14,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/iscsi" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/nvme" + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/v2bridge" ) // volReplState tracks active replication addresses per volume. @@ -45,6 +46,23 @@ type BlockService struct { // Replication state (CP6-3). replMu sync.RWMutex replStates map[string]*volReplState // keyed by volume path + + // V2 engine bridge (Phase 08 P1). + v2Bridge *v2bridge.ControlBridge +} + +// WireStateChangeNotify sets up shipper state change callbacks on all +// registered volumes so that degradation/recovery triggers an immediate +// heartbeat via the provided channel. Non-blocking send (buffered chan 1). +func (bs *BlockService) WireStateChangeNotify(ch chan bool) { + bs.blockStore.IterateBlockVolumes(func(path string, vol *blockvol.BlockVol) { + vol.SetOnShipperStateChange(func(from, to blockvol.ReplicaState) { + select { + case ch <- true: + default: // already pending + } + }) + }) } // StartBlockService scans blockDir for .blk files, opens them as block volumes, @@ -70,6 +88,7 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string, nvmeC blockDir: blockDir, listenAddr: listenAddr, nvmeListenAddr: nvmeCfg.ListenAddr, + v2Bridge: v2bridge.NewControlBridge(), } // iSCSI target setup. @@ -312,7 +331,18 @@ func (bs *BlockService) DeleteBlockVol(name string) error { } // ProcessAssignments applies assignments from master, including replication setup. +// V2 bridge: also converts each assignment for the V2 engine (recovery ownership); delivery to the engine orchestrator lands in P2. func (bs *BlockService) ProcessAssignments(assignments []blockvol.BlockVolumeAssignment) { + // V2 bridge: convert for the engine (Phase 08 P1; delivery deferred to P2).
+ if bs.v2Bridge != nil { + for _, a := range assignments { + intent := bs.v2Bridge.ConvertAssignment(a, bs.listenAddr) + _ = intent // TODO(P2): deliver to engine orchestrator + glog.V(1).Infof("v2bridge: converted assignment %s epoch=%d → %d replicas", + a.Path, a.Epoch, len(intent.Replicas)) + } + } + for _, a := range assignments { role := blockvol.RoleFromWire(a.Role) ttl := blockvol.LeaseTTLFromWire(a.LeaseTtlMs) @@ -645,6 +675,8 @@ func (bs *BlockService) Shutdown() { // SetBlockService wires a BlockService into the VolumeServer so that // heartbeats include block volume info and the server is marked block-capable. +// Also wires shipper state change callbacks for immediate heartbeat on degradation. func (vs *VolumeServer) SetBlockService(bs *BlockService) { vs.blockService = bs + bs.WireStateChangeNotify(vs.blockStateChangeChan) } diff --git a/weed/server/volume_server_block_debug.go b/weed/server/volume_server_block_debug.go new file mode 100644 index 000000000..747bc5f25 --- /dev/null +++ b/weed/server/volume_server_block_debug.go @@ -0,0 +1,77 @@ +package weed_server + +import ( + "encoding/json" + "net/http" + "time" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" +) + +// ShipperDebugInfo is the real-time shipper state for one replica. +type ShipperDebugInfo struct { + DataAddr string `json:"data_addr"` + State string `json:"state"` + FlushedLSN uint64 `json:"flushed_lsn"` +} + +// BlockVolumeDebugInfo is the real-time block volume state. +type BlockVolumeDebugInfo struct { + Path string `json:"path"` + Role string `json:"role"` + Epoch uint64 `json:"epoch"` + HeadLSN uint64 `json:"head_lsn"` + Degraded bool `json:"degraded"` + Shippers []ShipperDebugInfo `json:"shippers,omitempty"` + Timestamp string `json:"timestamp"` +} + +// debugBlockShipperHandler returns real-time shipper state for all block volumes. 
+// Unlike the master's replica_degraded (heartbeat-lagged), this reads directly +// from the shipper's atomic state field — no heartbeat delay. +// +// GET /debug/block/shipper +func (vs *VolumeServer) debugBlockShipperHandler(w http.ResponseWriter, r *http.Request) { + if vs.blockService == nil { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode([]BlockVolumeDebugInfo{}) + return + } + + store := vs.blockService.Store() + if store == nil { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode([]BlockVolumeDebugInfo{}) + return + } + + var infos []BlockVolumeDebugInfo + store.IterateBlockVolumes(func(path string, vol *blockvol.BlockVol) { + status := vol.Status() + info := BlockVolumeDebugInfo{ + Path: path, + Role: status.Role.String(), + Epoch: status.Epoch, + HeadLSN: status.WALHeadLSN, + Degraded: status.ReplicaDegraded, + Timestamp: time.Now().UTC().Format(time.RFC3339Nano), + } + + // Get per-shipper state from ShipperGroup if available. 
+ sg := vol.GetShipperGroup() + if sg != nil { + for _, ss := range sg.ShipperStates() { + info.Shippers = append(info.Shippers, ShipperDebugInfo{ + DataAddr: ss.DataAddr, + State: ss.State, + FlushedLSN: ss.FlushedLSN, + }) + } + } + + infos = append(infos, info) + }) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(infos) +} diff --git a/weed/storage/blockvol/block_heartbeat.go b/weed/storage/blockvol/block_heartbeat.go index 4e788b689..2e1862897 100644 --- a/weed/storage/blockvol/block_heartbeat.go +++ b/weed/storage/blockvol/block_heartbeat.go @@ -47,6 +47,7 @@ type BlockVolumeAssignment struct { LeaseTtlMs uint32 // lease TTL in milliseconds (0 = no lease) ReplicaDataAddr string // where primary ships WAL data (scalar, RF=2 compat) ReplicaCtrlAddr string // where primary sends barriers (scalar, RF=2 compat) + ReplicaServerID string // V2: stable server identity for scalar replica (from registry) RebuildAddr string // where rebuild server listens ReplicaAddrs []ReplicaAddr // CP8-2: multi-replica addrs (precedence over scalar) } diff --git a/weed/storage/blockvol/blockvol.go b/weed/storage/blockvol/blockvol.go index c4fade920..60b11bf2e 100644 --- a/weed/storage/blockvol/blockvol.go +++ b/weed/storage/blockvol/blockvol.go @@ -83,6 +83,9 @@ type BlockVol struct { // Observability (CP8-4). Metrics *EngineMetrics + // Shipper state change callback — triggers immediate heartbeat. + onShipperStateChange func(from, to ReplicaState) + // Snapshot fields (Phase 5 CP5-2). 
snapMu sync.RWMutex snapshots map[uint32]*activeSnapshot @@ -782,6 +785,7 @@ func (v *BlockVol) SyncCache() error { type ReplicaAddr struct { DataAddr string CtrlAddr string + ServerID string // V2: stable server identity from registry (not address-derived) } // WALAccess provides the shipper with the minimal WAL interface needed @@ -824,6 +828,18 @@ func (a *walAccess) StreamEntries(fromLSN uint64, fn func(*WALEntry) error) erro return a.vol.wal.ScanFrom(a.vol.fd, a.vol.super.WALOffset, checkpointLSN, fromLSN, fn) } +// SetOnShipperStateChange registers a callback for shipper state transitions. +// Called by the volume server to trigger immediate heartbeat on degradation/recovery. +func (v *BlockVol) SetOnShipperStateChange(fn func(from, to ReplicaState)) { + v.onShipperStateChange = fn +} + +// GetShipperGroup returns the shipper group for debug/observability. +// Returns nil if no replication is configured. +func (v *BlockVol) GetShipperGroup() *ShipperGroup { + return v.shipperGroup +} + // SetReplicaAddr configures a single replica endpoint. Backward-compatible wrapper // around SetReplicaAddrs for RF=2 callers. func (v *BlockVol) SetReplicaAddr(dataAddr, ctrlAddr string) { @@ -842,6 +858,11 @@ func (v *BlockVol) SetReplicaAddrs(addrs []ReplicaAddr) { } v.shipperGroup = NewShipperGroup(shippers) + // Wire state change callback so shipper degradation triggers immediate heartbeat. + if v.onShipperStateChange != nil { + v.shipperGroup.SetOnStateChange(v.onShipperStateChange) + } + // Replace the group committer's sync function with a distributed version. 
v.groupCommit.Stop() v.groupCommit = NewGroupCommitter(GroupCommitterConfig{ diff --git a/weed/storage/blockvol/shipper_group.go b/weed/storage/blockvol/shipper_group.go index 96db846d0..aede8f6f9 100644 --- a/weed/storage/blockvol/shipper_group.go +++ b/weed/storage/blockvol/shipper_group.go @@ -188,6 +188,17 @@ func (sg *ShipperGroup) EvaluateRetentionBudgets(timeout time.Duration) { } } +// SetOnStateChange registers a callback on all current shippers for state transitions. +// Used by the volume server to trigger an immediate block heartbeat when a shipper +// transitions to/from degraded. +func (sg *ShipperGroup) SetOnStateChange(fn func(from, to ReplicaState)) { + sg.mu.RLock() + defer sg.mu.RUnlock() + for _, s := range sg.shippers { + s.SetOnStateChange(fn) + } +} + // ShipperStates returns per-replica status for heartbeat reporting. // Master uses this to identify which replicas need rebuild. func (sg *ShipperGroup) ShipperStates() []ReplicaShipperStatus { diff --git a/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml b/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml index c04e3a8fb..a8d092563 100644 --- a/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml +++ b/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml @@ -104,28 +104,46 @@ phases: iqn: "{{ vol_iqn }}" save_as: device - - name: inject-partition + - name: inject-delay actions: - action: print - msg: "=== Blocking replication ports (3295) from primary to replica ===" + msg: "=== Blocking replication ports (4000-6000) from primary to replica ===" - # Block only replication port — SSH and master heartbeat still work. - - action: inject_partition + # Block the replication port range. Replication data/ctrl ports are + # basePort(3295) + 1000 + hash*3, landing in ~4295-5794 range. 
+ # Blocking 4000-6000 covers all possible replication ports while + # leaving SSH (22) and master heartbeat (9433/18480) open. + - action: exec node: m02 - target_ip: "192.168.1.181" - ports: "3295" + cmd: "iptables -A OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset" + root: "true" - # Trigger a write so barrier fires and times out. - - action: exec + - action: print + msg: "=== Writing to trigger Ship failure + degradation ===" + + # Write in background via fio (best_effort: writes succeed locally). + - action: fio_json node: m01 - cmd: "timeout 10 dd if=/dev/urandom of={{ device }} bs=4k count=1 oflag=direct 2>/dev/null; true" - root: "true" + device: "{{ device }}" + rw: randwrite + bs: 4k + iodepth: "1" + runtime: "10" + time_based: "true" + name: write-during-fault + save_as: fio_fault ignore_error: true - # Wait for barrier timeout (5s) + degradation detection. - - action: sleep - duration: 10s + - action: fio_parse + json_var: fio_fault + metric: iops + save_as: iops_fault + ignore_error: true + + - action: print + msg: "Write IOPS during fault: {{ iops_fault }}" + # Check degraded state after writes. - action: assert_block_field name: "{{ volume_name }}" field: replica_degraded @@ -134,16 +152,17 @@ phases: ignore_error: true - action: print - msg: "During partition: degraded={{ degraded_during }}" + msg: "During fault: degraded={{ degraded_during }}" - name: clear-and-measure actions: - action: print - msg: "=== Clearing partition, measuring shipper recovery ===" + msg: "=== Clearing fault, measuring shipper recovery ===" - - action: clear_fault + - action: exec node: m02 - type: partition + cmd: "iptables -D OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset 2>/dev/null; true" + root: "true" # Check at 5s — V1.5 background reconnect interval is 5s. 
- action: sleep
@@ -221,9 +240,10 @@ phases:
- name: cleanup
always: true
actions:
- - action: clear_fault
+ - action: exec
node: m02
- type: netem
+ cmd: "iptables -D OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset 2>/dev/null; true"
+ root: "true"
ignore_error: true
- action: stop_weed
node: m01
diff --git a/weed/storage/blockvol/v2bridge/control.go b/weed/storage/blockvol/v2bridge/control.go
index b2636e336..2104b494b 100644
--- a/weed/storage/blockvol/v2bridge/control.go
+++ b/weed/storage/blockvol/v2bridge/control.go
@@ -1,15 +1,15 @@
// control.go implements the real control-plane delivery bridge.
-// It converts BlockVolumeAssignment (from master heartbeat) into
-// V2 engine AssignmentIntent, using real master/registry identity.
+// Converts BlockVolumeAssignment (from master heartbeat) into V2 engine
+// AssignmentIntent using stable server identity from the master registry.
//
-// Identity rule: ReplicaID = <volume-path>/<replica-server>
-// The replica-server is the VS identity from the master registry,
-// not a transport address. This survives address changes.
+// Identity rule: ReplicaID = <volume-path>/<ServerID>
+// ServerID comes from BlockVolumeAssignment.ReplicaServerID or
+// ReplicaAddr.ServerID — NOT derived from transport addresses.
package v2bridge

import (
"fmt"
- "strings"
+ "log"

bridge "github.com/seaweedfs/seaweedfs/sw-block/bridge/blockvol"
engine "github.com/seaweedfs/seaweedfs/sw-block/engine/replication"
@@ -17,27 +17,16 @@ import (
)

// ControlBridge converts real BlockVolumeAssignment into V2 engine intents.
-// It is the live replacement for direct AssignmentIntent construction.
type ControlBridge struct {
adapter *bridge.ControlAdapter
}

-// NewControlBridge creates a control bridge.
func NewControlBridge() *ControlBridge {
- return &ControlBridge{
- adapter: bridge.NewControlAdapter(),
- }
+ return &ControlBridge{adapter: bridge.NewControlAdapter()}
}

-// ConvertAssignment converts a real BlockVolumeAssignment from the master
-// heartbeat response into a V2 engine AssignmentIntent.
-//
-// Identity mapping:
-// - VolumeName = assignment.Path
-// - For primary: ReplicaID per replica = <volume-path>/<replica-server-id>
-// - replica-server-id = extracted from ReplicaAddrs or scalar fields
-// - Epoch from assignment
-// - SessionKind from Role
+// ConvertAssignment converts a real BlockVolumeAssignment into an engine intent.
+// localServerID is the identity of the local volume server (for replica/rebuild roles).
func (cb *ControlBridge) ConvertAssignment(a blockvol.BlockVolumeAssignment, localServerID string) engine.AssignmentIntent {
role := blockvol.RoleFromWire(a.Role)
volumeName := a.Path
@@ -54,48 +43,48 @@ func (cb *ControlBridge) ConvertAssignment(a blockvol.BlockVolumeAssignment, loc
}
}

-// convertPrimaryAssignment: primary receives assignment with replica targets.
func (cb *ControlBridge) convertPrimaryAssignment(a blockvol.BlockVolumeAssignment, volumeName string) engine.AssignmentIntent { primary := bridge.MasterAssignment{ - VolumeName: volumeName, - Epoch: a.Epoch, - Role: "primary", - PrimaryServerID: "", // primary doesn't need its own server ID in the assignment + VolumeName: volumeName, + Epoch: a.Epoch, + Role: "primary", } var replicas []bridge.MasterAssignment if len(a.ReplicaAddrs) > 0 { for _, ra := range a.ReplicaAddrs { - serverID := extractServerID(ra.DataAddr) + if ra.ServerID == "" { + log.Printf("v2bridge: skipping replica with empty ServerID (data=%s)", ra.DataAddr) + continue // fail closed: skip replicas without stable identity + } replicas = append(replicas, bridge.MasterAssignment{ VolumeName: volumeName, Epoch: a.Epoch, Role: "replica", - ReplicaServerID: serverID, + ReplicaServerID: ra.ServerID, DataAddr: ra.DataAddr, CtrlAddr: ra.CtrlAddr, - AddrVersion: 0, // will be bumped on address change detection }) } - } else if a.ReplicaDataAddr != "" { - // Scalar RF=2 compat. - serverID := extractServerID(a.ReplicaDataAddr) + } else if a.ReplicaServerID != "" && a.ReplicaDataAddr != "" { + // Scalar RF=2 path with explicit ServerID. replicas = append(replicas, bridge.MasterAssignment{ VolumeName: volumeName, Epoch: a.Epoch, Role: "replica", - ReplicaServerID: serverID, + ReplicaServerID: a.ReplicaServerID, DataAddr: a.ReplicaDataAddr, CtrlAddr: a.ReplicaCtrlAddr, }) + } else if a.ReplicaDataAddr != "" { + log.Printf("v2bridge: scalar replica assignment without ServerID (data=%s) — skipping", a.ReplicaDataAddr) + // Fail closed: do not create address-derived identity. } return cb.adapter.ToAssignmentIntent(primary, replicas) } -// convertReplicaAssignment: replica receives its own role assignment. 
func (cb *ControlBridge) convertReplicaAssignment(a blockvol.BlockVolumeAssignment, volumeName, localServerID string) engine.AssignmentIntent {
- // Replica doesn't manage other replicas — just acknowledges its role.
return engine.AssignmentIntent{
Epoch: a.Epoch,
Replicas: []engine.ReplicaAssignment{
@@ -110,7 +99,6 @@ func (cb *ControlBridge) convertReplicaAssignme
}
}

-// convertRebuildAssignment: rebuilding replica.
func (cb *ControlBridge) convertRebuildAssignment(a blockvol.BlockVolumeAssignment, volumeName, localServerID string) engine.AssignmentIntent {
replicaID := fmt.Sprintf("%s/%s", volumeName, localServerID)
return engine.AssignmentIntent{
@@ -129,22 +117,3 @@ func (cb *ControlBridge) convertRebuildAssignme
},
}
}
-
-// extractServerID derives a stable server identity from an address.
-// Uses the host:port as the server ID (this is how the master registry
-// keys servers). In production, this would come from the registry's
-// ReplicaInfo.Server field directly.
-//
-// For now: strip to host:grpc-port format to match master registry keys.
-func extractServerID(addr string) string {
- // addr is typically "ip:port" — use as-is for server ID.
- // The master registry uses the same format for ReplicaInfo.Server.
- if addr == "" {
- return "unknown"
- }
- // Strip any path suffix, keep host:port.
- if idx := strings.Index(addr, "/"); idx >= 0 {
- return addr[:idx]
- }
- return addr
-}
diff --git a/weed/storage/blockvol/v2bridge/control_test.go b/weed/storage/blockvol/v2bridge/control_test.go
index b91e40394..b7112fdc3 100644
--- a/weed/storage/blockvol/v2bridge/control_test.go
+++ b/weed/storage/blockvol/v2bridge/control_test.go
@@ -9,234 +9,184 @@ import (
// ============================================================
// Phase 08 P1: Real control delivery tests
-// Validates real BlockVolumeAssignment → engine AssignmentIntent.
+// Identity: ReplicaID = <volume-path>/<ServerID> — NOT address-derived.
// ============================================================ -// --- E1: Live assignment delivery → engine intent --- - -func TestControl_PrimaryAssignment_StableIdentity(t *testing.T) { +func TestControl_PrimaryAssignment_StableServerID(t *testing.T) { cb := NewControlBridge() - // Real assignment from master heartbeat. a := blockvol.BlockVolumeAssignment{ Path: "pvc-data-1", Epoch: 3, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent := cb.ConvertAssignment(a, "vs1:9333") + intent := cb.ConvertAssignment(a, "vs1") - if intent.Epoch != 3 { - t.Fatalf("epoch=%d", intent.Epoch) - } if len(intent.Replicas) != 1 { t.Fatalf("replicas=%d", len(intent.Replicas)) } - // ReplicaID = volume-path / replica-server (NOT address-derived transport endpoint). r := intent.Replicas[0] - expected := "pvc-data-1/10.0.0.2:9333" - if r.ReplicaID != expected { - t.Fatalf("ReplicaID=%s, want %s", r.ReplicaID, expected) + // ReplicaID uses ServerID, not address. + if r.ReplicaID != "pvc-data-1/vs2" { + t.Fatalf("ReplicaID=%s, want pvc-data-1/vs2", r.ReplicaID) } - - // Endpoint is the transport address. if r.Endpoint.DataAddr != "10.0.0.2:9333" { t.Fatalf("DataAddr=%s", r.Endpoint.DataAddr) } + if intent.RecoveryTargets["pvc-data-1/vs2"] != engine.SessionCatchUp { + t.Fatalf("recovery=%s", intent.RecoveryTargets["pvc-data-1/vs2"]) + } +} + +func TestControl_AddressChange_IdentityPreserved(t *testing.T) { + cb := NewControlBridge() + + // Same ServerID, different address. 
+ a1 := blockvol.BlockVolumeAssignment{ + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + } + a2 := blockvol.BlockVolumeAssignment{ + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", + ReplicaDataAddr: "10.0.0.5:9333", ReplicaCtrlAddr: "10.0.0.5:9334", + } + + intent1 := cb.ConvertAssignment(a1, "vs1") + intent2 := cb.ConvertAssignment(a2, "vs1") - // Recovery target for replica. - if intent.RecoveryTargets[expected] != engine.SessionCatchUp { - t.Fatalf("recovery=%s", intent.RecoveryTargets[expected]) + if intent1.Replicas[0].ReplicaID != intent2.Replicas[0].ReplicaID { + t.Fatalf("identity changed: %s → %s", + intent1.Replicas[0].ReplicaID, intent2.Replicas[0].ReplicaID) + } + if intent2.Replicas[0].Endpoint.DataAddr != "10.0.0.5:9333" { + t.Fatal("endpoint should be updated") } } -func TestControl_PrimaryAssignment_MultiReplica(t *testing.T) { +func TestControl_MultiReplica_StableServerIDs(t *testing.T) { cb := NewControlBridge() a := blockvol.BlockVolumeAssignment{ - Path: "pvc-data-1", - Epoch: 2, - Role: uint32(blockvol.RolePrimary), + Path: "vol1", Epoch: 2, Role: uint32(blockvol.RolePrimary), ReplicaAddrs: []blockvol.ReplicaAddr{ - {DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334"}, - {DataAddr: "10.0.0.3:9333", CtrlAddr: "10.0.0.3:9334"}, + {DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334", ServerID: "vs2"}, + {DataAddr: "10.0.0.3:9333", CtrlAddr: "10.0.0.3:9334", ServerID: "vs3"}, }, } - intent := cb.ConvertAssignment(a, "vs1:9333") - + intent := cb.ConvertAssignment(a, "vs1") if len(intent.Replicas) != 2 { t.Fatalf("replicas=%d", len(intent.Replicas)) } - // Both replicas have stable identity. 
ids := map[string]bool{} for _, r := range intent.Replicas { ids[r.ReplicaID] = true } - if !ids["pvc-data-1/10.0.0.2:9333"] || !ids["pvc-data-1/10.0.0.3:9333"] { - t.Fatalf("IDs: %v", ids) + if !ids["vol1/vs2"] || !ids["vol1/vs3"] { + t.Fatalf("IDs: %v (should use ServerID, not address)", ids) } } -// --- E2: Address change preserves identity --- - -func TestControl_AddressChange_SameServerID(t *testing.T) { +func TestControl_MissingServerID_FailsClosed(t *testing.T) { cb := NewControlBridge() - // First assignment. + // Scalar: no ServerID → no replica created. a1 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + // ReplicaServerID intentionally empty. + } + intent1 := cb.ConvertAssignment(a1, "vs1") + if len(intent1.Replicas) != 0 { + t.Fatalf("scalar without ServerID should produce 0 replicas, got %d", len(intent1.Replicas)) } - intent1 := cb.ConvertAssignment(a1, "vs1:9333") - // Address changes (replica restarted on different IP). + // Multi: one with ServerID, one without → only one replica. a2 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.5:9333", - ReplicaCtrlAddr: "10.0.0.5:9334", - } - intent2 := cb.ConvertAssignment(a2, "vs1:9333") - - // NOTE: with current extractServerID, different IPs = different server IDs. - // This is a known limitation: address-based server identity. - // In production, the master registry would supply a stable server ID. - // For now, document the boundary. - id1 := intent1.Replicas[0].ReplicaID - id2 := intent2.Replicas[0].ReplicaID - t.Logf("address change: id1=%s id2=%s (different if IP changes)", id1, id2) - - // The critical test: same IP, different port (same server, port change). 
- a3 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9444", // same IP, different port - ReplicaCtrlAddr: "10.0.0.2:9445", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaAddrs: []blockvol.ReplicaAddr{ + {DataAddr: "10.0.0.2:9333", ServerID: "vs2"}, + {DataAddr: "10.0.0.3:9333", ServerID: ""}, // empty → skipped + }, + } + intent2 := cb.ConvertAssignment(a2, "vs1") + if len(intent2.Replicas) != 1 { + t.Fatalf("multi with 1 missing ServerID: replicas=%d, want 1", len(intent2.Replicas)) } - intent3 := cb.ConvertAssignment(a3, "vs1:9333") - id3 := intent3.Replicas[0].ReplicaID - - // Same IP different port = different server ID in current model. - // This is the V1 identity limitation that a future registry-backed - // server ID would resolve. - t.Logf("port change: id1=%s id3=%s", id1, id3) } -// --- E3: Epoch fencing through real assignment --- - func TestControl_EpochFencing_IntegratedPath(t *testing.T) { cb := NewControlBridge() - driver := engine.NewRecoveryDriver(nil) // no storage needed for control-path test + driver := engine.NewRecoveryDriver(nil) - // Epoch 1 assignment. 
a1 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent1 := cb.ConvertAssignment(a1, "vs1:9333") - driver.Orchestrator.ProcessAssignment(intent1) + driver.Orchestrator.ProcessAssignment(cb.ConvertAssignment(a1, "vs1")) - s := driver.Orchestrator.Registry.Sender("vol1/10.0.0.2:9333") - if s == nil { - t.Fatal("sender should exist after epoch 1 assignment") - } - if !s.HasActiveSession() { - t.Fatal("should have session after epoch 1") + s := driver.Orchestrator.Registry.Sender("vol1/vs2") + if s == nil || !s.HasActiveSession() { + t.Fatal("should have session at epoch 1") } - // Epoch bump (failover). driver.Orchestrator.InvalidateEpoch(2) - driver.Orchestrator.UpdateSenderEpoch("vol1/10.0.0.2:9333", 2) + driver.Orchestrator.UpdateSenderEpoch("vol1/vs2", 2) if s.HasActiveSession() { - t.Fatal("old session should be invalidated after epoch bump") + t.Fatal("old session should be invalidated") } - // Epoch 2 assignment. a2 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 2, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 2, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent2 := cb.ConvertAssignment(a2, "vs1:9333") - driver.Orchestrator.ProcessAssignment(intent2) + driver.Orchestrator.ProcessAssignment(cb.ConvertAssignment(a2, "vs1")) if !s.HasActiveSession() { t.Fatal("should have new session at epoch 2") } - // Log shows invalidation. 
hasInvalidation := false - for _, e := range driver.Orchestrator.Log.EventsFor("vol1/10.0.0.2:9333") { + for _, e := range driver.Orchestrator.Log.EventsFor("vol1/vs2") { if e.Event == "session_invalidated" { hasInvalidation = true } } if !hasInvalidation { - t.Fatal("log must show session invalidation on epoch bump") + t.Fatal("log must show invalidation") } } -// --- E4: Rebuild role mapping --- - func TestControl_RebuildAssignment(t *testing.T) { cb := NewControlBridge() - a := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 3, - Role: uint32(blockvol.RoleRebuilding), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", - RebuildAddr: "10.0.0.1:15000", - } - - intent := cb.ConvertAssignment(a, "10.0.0.2:9333") - - if len(intent.RecoveryTargets) != 1 { - t.Fatalf("recovery targets=%d", len(intent.RecoveryTargets)) + Path: "vol1", Epoch: 3, Role: uint32(blockvol.RoleRebuilding), + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + RebuildAddr: "10.0.0.1:15000", } - - replicaID := "vol1/10.0.0.2:9333" - if intent.RecoveryTargets[replicaID] != engine.SessionRebuild { - t.Fatalf("recovery=%s", intent.RecoveryTargets[replicaID]) + intent := cb.ConvertAssignment(a, "vs2") + if intent.RecoveryTargets["vol1/vs2"] != engine.SessionRebuild { + t.Fatalf("recovery=%s", intent.RecoveryTargets["vol1/vs2"]) } } -// --- E5: Replica assignment --- - func TestControl_ReplicaAssignment(t *testing.T) { cb := NewControlBridge() - a := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RoleReplica), - ReplicaDataAddr: "10.0.0.1:14260", - ReplicaCtrlAddr: "10.0.0.1:14261", - } - - intent := cb.ConvertAssignment(a, "vs2:9333") - - if len(intent.Replicas) != 1 { - t.Fatalf("replicas=%d", len(intent.Replicas)) + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RoleReplica), + ReplicaDataAddr: "10.0.0.1:14260", ReplicaCtrlAddr: "10.0.0.1:14261", } - if intent.Replicas[0].ReplicaID != "vol1/vs2:9333" { + intent := 
cb.ConvertAssignment(a, "vs2") + if intent.Replicas[0].ReplicaID != "vol1/vs2" { t.Fatalf("ReplicaID=%s", intent.Replicas[0].ReplicaID) } } diff --git a/weed/storage/blockvol/wal_shipper.go b/weed/storage/blockvol/wal_shipper.go index d3785ce2f..cf9d6b6c3 100644 --- a/weed/storage/blockvol/wal_shipper.go +++ b/weed/storage/blockvol/wal_shipper.go @@ -71,6 +71,17 @@ type WALShipper struct { catchupFailures int // consecutive catch-up failures; reset on success lastContactTime atomic.Value // time.Time: last successful barrier/handshake/catch-up stopped atomic.Bool + + // onStateChange is called when the shipper transitions between states. + // Used to trigger immediate heartbeat on degradation/recovery. + // Set via SetOnStateChange. Nil = no callback. + onStateChange func(from, to ReplicaState) +} + +// SetOnStateChange registers a callback for shipper state transitions. +// The callback is invoked synchronously from markDegraded/markInSync. +func (s *WALShipper) SetOnStateChange(fn func(from, to ReplicaState)) { + s.onStateChange = fn } const maxCatchupRetries = 3 @@ -345,8 +356,11 @@ func (s *WALShipper) ensureCtrlConn() error { } func (s *WALShipper) markDegraded() { - s.state.Store(uint32(ReplicaDegraded)) - log.Printf("wal_shipper: replica degraded (data=%s, ctrl=%s, state=%s)", s.dataAddr, s.controlAddr, s.State()) + prev := ReplicaState(s.state.Swap(uint32(ReplicaDegraded))) + log.Printf("wal_shipper: replica degraded (data=%s, ctrl=%s, prev=%s)", s.dataAddr, s.controlAddr, prev) + if prev != ReplicaDegraded && s.onStateChange != nil { + s.onStateChange(prev, ReplicaDegraded) + } } // resetConnections closes both data and control connections for a clean retry. 
@@ -404,10 +418,13 @@ func (s *WALShipper) doReconnectAndCatchUp() error { } func (s *WALShipper) markInSync() { - s.state.Store(uint32(ReplicaInSync)) + prev := ReplicaState(s.state.Swap(uint32(ReplicaInSync))) s.catchupFailures = 0 s.touchContactTime() - log.Printf("wal_shipper: replica in-sync (data=%s, ctrl=%s)", s.dataAddr, s.controlAddr) + log.Printf("wal_shipper: replica in-sync (data=%s, ctrl=%s, prev=%s)", s.dataAddr, s.controlAddr, prev) + if prev != ReplicaInSync && s.onStateChange != nil { + s.onStateChange(prev, ReplicaInSync) + } } const catchupTimeout = 30 * time.Second diff --git a/weed/storage/store_blockvol.go b/weed/storage/store_blockvol.go index 6f6bb8229..f2d18fc5a 100644 --- a/weed/storage/store_blockvol.go +++ b/weed/storage/store_blockvol.go @@ -84,6 +84,15 @@ func (bs *BlockVolumeStore) ListBlockVolumes() []string { return paths } +// IterateBlockVolumes calls fn for each registered block volume. +func (bs *BlockVolumeStore) IterateBlockVolumes(fn func(path string, vol *blockvol.BlockVol)) { + bs.mu.RLock() + defer bs.mu.RUnlock() + for path, vol := range bs.volumes { + fn(path, vol) + } +} + // CollectBlockVolumeHeartbeat returns status for all registered // block volumes, suitable for inclusion in a heartbeat message. func (bs *BlockVolumeStore) CollectBlockVolumeHeartbeat() []blockvol.BlockVolumeInfoMessage {