From 46ef79ce354e459a469509511849603088fc89bd Mon Sep 17 00:00:00 2001 From: pingqiu Date: Tue, 31 Mar 2026 10:46:17 -0700 Subject: [PATCH] fix: stable ServerID in assignments, fail-closed on missing identity, wire into ProcessAssignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finding 1: Identity no longer address-derived - ReplicaAddr.ServerID field added (stable server identity from registry) - BlockVolumeAssignment.ReplicaServerID field added (scalar RF=2 path) - ControlBridge uses ServerID, NOT address, for ReplicaID - Missing ServerID → replica skipped (fail closed), logged Finding 2: Wired into real ProcessAssignments - BlockService.v2Bridge field initialized in StartBlockService - ProcessAssignments converts each assignment via v2Bridge.ConvertAssignment BEFORE existing V1 processing (parallel, not replacing yet) - Logged at glog V(1) Finding 3: Fail-closed on missing identity - Empty ServerID in ReplicaAddrs → replica skipped with log - Empty ReplicaServerID in scalar path → no replica created - Test: MissingServerID_FailsClosed verifies both paths 7 tests: StableServerID, AddressChange_IdentityPreserved, MultiReplica_StableServerIDs, MissingServerID_FailsClosed, EpochFencing_IntegratedPath, RebuildAssignment, ReplicaAssignment Co-Authored-By: Claude Opus 4.6 (1M context) --- sw-block/.private/phase/phase-04-decisions.md | 105 +- sw-block/.private/phase/phase-04-log.md | 40 +- sw-block/.private/phase/phase-04.md | 81 +- sw-block/.private/phase/phase-05-decisions.md | 94 ++ sw-block/.private/phase/phase-05-log.md | 78 ++ sw-block/.private/phase/phase-05.md | 356 ++++++ sw-block/.private/phase/phase-06-decisions.md | 68 ++ sw-block/.private/phase/phase-06-log.md | 51 + sw-block/.private/phase/phase-06.md | 193 +++ sw-block/.private/phase/phase-07-decisions.md | 119 ++ sw-block/.private/phase/phase-07-log.md | 63 + sw-block/.private/phase/phase-07.md | 220 ++++ sw-block/.private/phase/phase-08-decisions.md | 
78 ++ sw-block/.private/phase/phase-08-log.md | 21 + sw-block/.private/phase/phase-08.md | 254 ++++ .../.private/phase/phase-4.5-decisions.md | 59 + sw-block/.private/phase/phase-4.5-log.md | 33 + sw-block/.private/phase/phase-4.5-reason.md | 397 ++++++ sw-block/.private/phase/phase-4.5.md | 356 ++++++ sw-block/design/README.md | 18 +- sw-block/design/a5-a8-traceability.md | 117 ++ sw-block/design/agent_dev_process.md | 304 +++++ .../design/phase-07-service-slice-plan.md | 403 +++++++ sw-block/design/v2-algorithm-overview.md | 686 +++++++++++ sw-block/design/v2-algorithm-overview.zh.md | 660 ++++++++++ sw-block/design/v2-detailed-algorithm.zh.md | 1068 +++++++++++++++++ sw-block/design/v2-engine-readiness-review.md | 170 +++ sw-block/design/v2-engine-slicing-plan.md | 191 +++ sw-block/design/v2-production-roadmap.md | 199 +++ sw-block/design/v2-protocol-truths.md | 561 +++++++++ sw-block/prototype/distsim/cluster.go | 13 +- sw-block/prototype/distsim/cluster_test.go | 2 +- .../distsim/phase02_candidate_test.go | 6 + .../distsim/phase045_adversarial_test.go | 219 ++++ .../prototype/distsim/phase045_crash_test.go | 334 ++++++ sw-block/prototype/distsim/predicates.go | 160 +++ sw-block/prototype/distsim/simulator.go | 7 +- sw-block/prototype/distsim/storage.go | 242 +++- weed/server/master_block_failover.go | 118 +- weed/server/master_block_registry.go | 56 +- weed/server/master_block_registry_test.go | 60 + weed/server/master_grpc_server.go | 7 +- weed/server/qa_block_edge_cases_test.go | 481 ++++++++ weed/server/volume_grpc_client_to_master.go | 10 + weed/server/volume_server.go | 5 +- weed/server/volume_server_block.go | 32 + weed/server/volume_server_block_debug.go | 77 ++ weed/storage/blockvol/block_heartbeat.go | 1 + weed/storage/blockvol/blockvol.go | 21 + weed/storage/blockvol/shipper_group.go | 11 + .../internal/robust-slow-replica.yaml | 58 +- weed/storage/blockvol/v2bridge/control.go | 77 +- .../storage/blockvol/v2bridge/control_test.go | 220 ++-- 
weed/storage/blockvol/wal_shipper.go | 25 +- weed/storage/store_blockvol.go | 9 + 55 files changed, 9024 insertions(+), 270 deletions(-) create mode 100644 sw-block/.private/phase/phase-05-decisions.md create mode 100644 sw-block/.private/phase/phase-05-log.md create mode 100644 sw-block/.private/phase/phase-05.md create mode 100644 sw-block/.private/phase/phase-06-decisions.md create mode 100644 sw-block/.private/phase/phase-06-log.md create mode 100644 sw-block/.private/phase/phase-06.md create mode 100644 sw-block/.private/phase/phase-07-decisions.md create mode 100644 sw-block/.private/phase/phase-07-log.md create mode 100644 sw-block/.private/phase/phase-07.md create mode 100644 sw-block/.private/phase/phase-08-decisions.md create mode 100644 sw-block/.private/phase/phase-08-log.md create mode 100644 sw-block/.private/phase/phase-08.md create mode 100644 sw-block/.private/phase/phase-4.5-decisions.md create mode 100644 sw-block/.private/phase/phase-4.5-log.md create mode 100644 sw-block/.private/phase/phase-4.5-reason.md create mode 100644 sw-block/.private/phase/phase-4.5.md create mode 100644 sw-block/design/a5-a8-traceability.md create mode 100644 sw-block/design/agent_dev_process.md create mode 100644 sw-block/design/phase-07-service-slice-plan.md create mode 100644 sw-block/design/v2-algorithm-overview.md create mode 100644 sw-block/design/v2-algorithm-overview.zh.md create mode 100644 sw-block/design/v2-detailed-algorithm.zh.md create mode 100644 sw-block/design/v2-engine-readiness-review.md create mode 100644 sw-block/design/v2-engine-slicing-plan.md create mode 100644 sw-block/design/v2-production-roadmap.md create mode 100644 sw-block/design/v2-protocol-truths.md create mode 100644 sw-block/prototype/distsim/phase045_adversarial_test.go create mode 100644 sw-block/prototype/distsim/phase045_crash_test.go create mode 100644 sw-block/prototype/distsim/predicates.go create mode 100644 weed/server/qa_block_edge_cases_test.go create mode 100644 
weed/server/volume_server_block_debug.go diff --git a/sw-block/.private/phase/phase-04-decisions.md b/sw-block/.private/phase/phase-04-decisions.md index 500cbca74..5938aab64 100644 --- a/sw-block/.private/phase/phase-04-decisions.md +++ b/sw-block/.private/phase/phase-04-decisions.md @@ -1,7 +1,7 @@ # Phase 04 Decisions Date: 2026-03-27 -Status: initial +Status: complete ## First Slice Decision @@ -95,3 +95,106 @@ It is: - recovery outcome branching - assignment-intent orchestration - prototype-level end-to-end recovery flow + +## Accepted P2 Refinements + +### Recovery boundary + +Recovery classification must use a lineage-safe boundary, not a raw primary WAL head. + +So: + +- handshake outcome classification uses committed/safe recovery boundary +- stale or divergent extra tail must not be treated as zero-gap by default + +### Stale assignment fencing + +Assignment intent must not create current live sessions from stale epoch input. + +So: + +- stale assignment epoch is rejected +- assignment result distinguishes: + - created + - superseded + - failed + +### Phase discipline on outcome classification + +The outcome API must respect execution entry rules. + +So: + +- handshake-with-outcome requires valid connecting phase before acting + +## P3 Direction + +The next prototype step is: + +- minimal historical-data model +- recoverability proof +- explicit safe-boundary / divergent-tail handling + +## Accepted P3 Refinements + +### Recoverability proof + +The historical-data prototype must prove why catch-up is allowed. + +So: + +- recoverability now checks retained start, end within head, and contiguous coverage +- rebuild fallback is backed by executable unrecoverability + +### Historical state after recycling + +Retained-prefix modeling needs a base state, not only remaining WAL entries. 
+ +So: + +- tail advance captures a base snapshot +- historical state reconstruction uses snapshot + retained WAL + +### Divergent tail handling + +Replica-ahead state must not collapse directly to `InSync`. + +So: + +- divergent tail requires explicit truncation +- completion is gated on recorded truncation when required + +## P4 Direction + +The next prototype step is: + +- prototype scenario closure +- acceptance-criteria to prototype traceability +- explicit expression of the 4 V2-boundary cases against `enginev2` + +## Accepted P4 Refinements + +### Prototype scenario closure + +The prototype must stop being only a set of local mechanisms. + +So: + +- acceptance criteria are mapped to prototype evidence +- key V2-boundary scenarios are expressed directly against `enginev2` +- prototype behavior is reviewable scenario-by-scenario + +### Phase 04 completion decision + +Phase 04 has now met its intended prototype scope: + +- ownership +- execution gating +- outcome branching +- minimal historical-data model +- prototype scenario closure + +So: + +- no broad new Phase 04 work should be added +- next work should move to `Phase 4.5` gate-hardening diff --git a/sw-block/.private/phase/phase-04-log.md b/sw-block/.private/phase/phase-04-log.md index 33d013a23..31b025309 100644 --- a/sw-block/.private/phase/phase-04-log.md +++ b/sw-block/.private/phase/phase-04-log.md @@ -1,7 +1,7 @@ # Phase 04 Log Date: 2026-03-27 -Status: active +Status: complete ## 2026-03-27 @@ -40,7 +40,37 @@ Status: active - attach/supersede now establish ownership only - handshake range validation added - enginev2 tests increased to 46 passing -- Next phase focus narrowed to P2: - - recovery outcome branching - - assignment-intent orchestration - - prototype end-to-end recovery flow +- Phase 04 P2 delivered and accepted: + - outcome branching added: + - `OutcomeZeroGap` + - `OutcomeCatchUp` + - `OutcomeNeedsRebuild` + - assignment-intent orchestration added + - stale assignment epoch now rejected 
+ - assignment result now distinguishes created / superseded / failed + - end-to-end prototype recovery tests added + - zero-gap classification tightened: + - exact equality to committed boundary only + - replica-ahead is not zero-gap + - enginev2 tests increased to 63 passing +- Phase 04 P3 delivered and accepted: + - `WALHistory` added as minimal historical-data model + - recoverability proof strengthened: + - retained start + - end within head + - contiguous coverage + - base snapshot added for correct `StateAt()` after tail advance + - divergent-tail truncation made explicit in sender/session execution + - WAL-backed prototype recovery tests added + - enginev2 tests increased to 83 passing +- Phase 04 P4 delivered and accepted: + - acceptance criteria mapped to prototype evidence + - V2-boundary scenarios expressed against `enginev2` + - prototype scenario closure achieved + - enginev2 tests increased to 95 passing +- Phase 04 is now complete for its intended prototype scope. +- Next recommended phase: + - `Phase 4.5` + - tighten bounded `CatchUp` + - formalize `Rebuild` + - strengthen crash-consistency / recoverability / liveness proof diff --git a/sw-block/.private/phase/phase-04.md b/sw-block/.private/phase/phase-04.md index 407d1f79a..a73e90b76 100644 --- a/sw-block/.private/phase/phase-04.md +++ b/sw-block/.private/phase/phase-04.md @@ -1,7 +1,7 @@ # Phase 04 Date: 2026-03-27 -Status: active +Status: complete Purpose: start the first standalone V2 implementation slice under `sw-block/`, centered on per-replica sender ownership and explicit recovery-session ownership ## Goal @@ -93,6 +93,7 @@ Delivered in this phase so far: - execution APIs implemented: - `BeginConnect` - `RecordHandshake` + - `RecordHandshakeWithOutcome` - `BeginCatchUp` - `RecordCatchUpProgress` - `CompleteSessionByID` @@ -101,15 +102,42 @@ Delivered in this phase so far: - zero-gap handshake fast path allowed - attach/supersede now establish ownership only - sender-group orchestration 
tests added +- recovery outcome branching implemented: + - `OutcomeZeroGap` + - `OutcomeCatchUp` + - `OutcomeNeedsRebuild` +- assignment-intent orchestration implemented: + - reconcile + recovery target session creation + - stale assignment epoch rejected + - created/superseded/failed outcomes distinguished +- P2 data-boundary correction accepted: + - zero-gap now requires exact equality to committed boundary + - replica-ahead is not zero-gap +- minimal historical-data prototype implemented: + - `WALHistory` + - retained-prefix / recycled-range semantics + - executable recoverability proof + - base snapshot for historical state after tail advance +- explicit safe-boundary handling implemented: + - divergent tail requires truncation before `InSync` + - truncation recorded via sender-owned execution API +- WAL-backed prototype tests added: + - catch-up recovery with data verification + - rebuild fallback with proof of unrecoverability + - truncate-then-`InSync` with committed-boundary verification - current `enginev2` test state at latest review: - - 46 tests passing - -Next focus for `sw`: - -- continue Phase 04 beyond execution gating: - - recovery outcome branching - - sender-group orchestration from assignment intent - - prototype-level end-to-end recovery flow +- - 95 tests passing +- prototype scenario closure completed: + - acceptance criteria mapped to prototype evidence + - V2-boundary scenarios expressed against `enginev2` + - small end-to-end prototype harness added + +Next phase: + +- `Phase 4.5` + - bounded `CatchUp` + - first-class `Rebuild` + - crash-consistency / recoverability / liveness proof hardening - do not integrate into V1 production tree yet ### P1 @@ -141,6 +169,39 @@ Next focus for `sw`: - completion / invalidation - rebuild escalation +### P3 + +10. add minimal historical-data prototype +- retained prefix/window +- minimal recoverability state +- explicit "why catch-up is allowed" proof + +11. 
make safe-boundary data handling explicit +- divergent tail cleanup / truncate rule +- or equivalent explicit boundary handling before `InSync` + +12. strengthen recoverability/rebuild tests +- executable proof of: +- recoverable gap +- unrecoverable gap +- rebuild fallback boundary + +### P4 + +13. close prototype scenario coverage +- map key acceptance criteria onto `enginev2` scenarios/tests +- make prototype evidence reviewable scenario-by-scenario + +14. express the 4 V2-boundary cases against the prototype +- changed-address identity-preserving recovery +- `NeedsRebuild` persistence +- catch-up without overwriting safe data +- repeated disconnect/reconnect cycles + +15. add one small prototype harness if needed +- enough to show assignment -> recovery -> outcome flow end-to-end +- no product/backend integration yet + ## Exit Criteria Phase 04 is done when: @@ -151,3 +212,5 @@ Phase 04 is done when: 4. endpoint update and epoch invalidation are tested 5. sender-owned execution flow is validated 6. recovery outcome branching exists at prototype level +7. minimal historical-data / recoverability model exists at prototype level +8. prototype scenario closure is achieved for key V2 acceptance cases diff --git a/sw-block/.private/phase/phase-05-decisions.md b/sw-block/.private/phase/phase-05-decisions.md new file mode 100644 index 000000000..fab98d6ce --- /dev/null +++ b/sw-block/.private/phase/phase-05-decisions.md @@ -0,0 +1,94 @@ +# Phase 05 Decisions + +## Decision 1: Real V2 engine work lives under `sw-block/engine/replication/` + +The first real engine slice is established under: + +- `sw-block/engine/replication/` + +This keeps V2 separate from: + +- `sw-block/prototype/` +- `weed/storage/blockvol/` + +## Decision 2: Slice 1 is accepted + +Accepted scope: + +1. stable per-replica sender identity +2. stable recovery-session identity +3. stale authority fencing +4. endpoint / epoch invalidation +5. 
ownership registry + +## Decision 3: Stable identity must not be address-shaped + +The engine registry is now keyed by stable `ReplicaID`, not mutable endpoint address. + +This is a required structural break from the V1/V1.5 identity-loss pattern. + +## Decision 4: Slice 2 is accepted + +Accepted scope: + +1. connect / handshake / catch-up flow +2. zero-gap / catch-up / needs-rebuild branching +3. stale execution rejection during active recovery +4. bounded catch-up semantics in engine path +5. rebuild execution shell + +## Decision 5: Slice 3 owns real recoverability inputs + +Slice 3 should be the point where: + +1. recoverable vs unrecoverable gap uses real engine inputs +2. trusted-base / rebuild-source decision uses real engine data inputs +3. truncation / safe-boundary handling is tied to real engine state +4. historical correctness at recovery target is validated from engine inputs + +## Decision 6: Slice 3 is accepted + +Accepted scope: + +1. real engine recoverability input path +2. trusted-base / rebuild-source decision from engine data inputs +3. truncation / safe-boundary handling tied to engine state +4. recoverability gating without overclaiming full historical reconstruction in engine + +## Decision 7: Slice 3 should replace carried-forward heuristics where appropriate + +In particular: + +1. simple rebuild-source heuristics carried from prototype should not become permanent engine policy +2. Slice 3 should tighten these decisions against real engine recoverability inputs + +## Decision 8: Slice 4 is the engine integration closure slice + +Next focus: + +1. real assignment/control intent entry path +2. engine observability / debug surface +3. focused integration tests for V2-boundary cases +4. validation against selected real failure classes from `learn/projects/sw-block/` and `weed/storage/block*` + +## Decision 9: Slice 4 is accepted + +Accepted scope: + +1. real orchestrator entry path +2. assignment/update-driven recovery through that path +3. 
engine observability / causal recovery logging +4. diagnosable V2-boundary integration tests + +## Decision 10: Phase 05 is complete + +Reason: + +1. ownership core is accepted +2. recovery execution core is accepted +3. data / recoverability core is accepted +4. integration closure is accepted + +Next: + +- `Phase 06` broader engine implementation stage diff --git a/sw-block/.private/phase/phase-05-log.md b/sw-block/.private/phase/phase-05-log.md new file mode 100644 index 000000000..6ff441e2c --- /dev/null +++ b/sw-block/.private/phase/phase-05-log.md @@ -0,0 +1,78 @@ +# Phase 05 Log + +## 2026-03-29 + +### Opened + +`Phase 05` opened as: + +- V2 engine planning + Slice 1 ownership core + +### Accepted + +1. engine module location + - `sw-block/engine/replication/` + +2. Slice 1 ownership core + - stable per-replica sender identity + - stable recovery-session identity + - sender/session fencing + - endpoint / epoch invalidation + - ownership registry + +3. Slice 1 identity correction + - registry now keyed by stable `ReplicaID` + - mutable `Endpoint` separated from identity + - real changed-`DataAddr` preservation covered by test + +4. Slice 1 encapsulation + - mutable sender/session authority state no longer exposed directly + - snapshot/read-only inspection path in place + +5. Slice 2 recovery execution core + - connect / handshake / catch-up flow + - explicit zero-gap / catch-up / needs-rebuild branching + - stale execution rejection during active recovery + - bounded catch-up semantics + - rebuild execution shell + +6. Slice 2 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests = `30` total + - Slice 2 accepted for progression to Slice 3 planning + +7. 
Slice 3 data / recoverability core + - `RetainedHistory` introduced as engine-level recoverability input + - history-driven sender APIs added for handshake and rebuild-source selection + - trusted-base decision now requires both checkpoint trust and replayable tail + - truncation remains a completion gate / protocol boundary + +8. Slice 3 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests + `18` recoverability tests = `48` total + - accepted boundary: + - engine proves historical-correctness prerequisites + - simulator retains stronger historical reconstruction proof + - Slice 3 accepted for progression to Slice 4 planning + +9. Slice 4 integration closure + - `RecoveryOrchestrator` added as integrated engine entry path + - assignment/update-driven recovery is exercised through orchestrator + - observability surface added: + - `RegistryStatus` + - `SenderStatus` + - `SessionSnapshot` + - `RecoveryLog` + - causal recovery logging now covers invalidation, escalation, truncation, completion, rebuild transitions + +10. Slice 4 validation + - corrected tester summary accepted + - `12` ownership tests + `18` recovery tests + `18` recoverability tests + `11` integration tests = `59` total + - Slice 4 accepted + - `Phase 05` accepted as complete + +### Next + +1. `Phase 06` planning +2. broader engine implementation stage +3. real-engine integration against selected `weed/storage/block*` constraints and failure classes diff --git a/sw-block/.private/phase/phase-05.md b/sw-block/.private/phase/phase-05.md new file mode 100644 index 000000000..ec21837f8 --- /dev/null +++ b/sw-block/.private/phase/phase-05.md @@ -0,0 +1,356 @@ +# Phase 05 + +Date: 2026-03-29 +Status: complete +Purpose: begin the real V2 engine track under `sw-block/` by moving from prototype proof to the first engine slice + +## Why This Phase Exists + +The project has now completed: + +1. V2 design/FSM closure +2. V2 protocol/simulator validation +3. 
Phase 04 prototype closure +4. Phase 4.5 evidence hardening + +So the next step is no longer: + +- extend prototype breadth + +The next step is: + +- start disciplined real V2 engine work + +## Phase Goal + +Start the real V2 engine line under `sw-block/` with: + +1. explicit engine module location +2. Slice 1 ownership-core boundaries +3. first engine ownership-core implementation +4. engine-side validation tied back to accepted prototype invariants + +## Relationship To Previous Phases + +`Phase 05` is built on: + +- `sw-block/design/v2-engine-readiness-review.md` +- `sw-block/design/v2-engine-slicing-plan.md` +- `sw-block/.private/phase/phase-04.md` +- `sw-block/.private/phase/phase-4.5.md` + +This is a new implementation phase. + +It is not: + +1. more prototype expansion +2. V1 integration +3. backend redesign + +## Scope + +### In scope + +1. choose real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write short engine ownership-core spec +4. start Slice 1 implementation: + - stable per-replica sender object + - stable recovery-session object + - session identity fencing + - endpoint / epoch invalidation + - ownership registry / sender-group equivalent +5. add focused engine-side ownership/fencing tests + +### Out of scope + +1. Smart WAL expansion +2. full storage/backend redesign +3. full rebuild-source decision logic +4. V1 production integration +5. performance work +6. full product integration + +## Planned Slices + +### P0: Engine Planning Setup + +1. choose real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write ownership-core spec +4. map 3-5 acceptance scenarios to Slice 1 expectations + +Status: + +- accepted +- engine module location chosen: `sw-block/engine/replication/` +- Slice 1 boundaries are explicit enough to start implementation + +### P1: Slice 1 Ownership Core + +1. implement stable per-replica sender object +2. 
implement stable recovery-session object +3. implement sender/session identity fencing +4. implement endpoint / epoch invalidation +5. implement ownership registry + +Status: + +- accepted +- stable `ReplicaID` is now explicit and separate from mutable `Endpoint` +- engine registry is keyed by stable identity, not address-shaped strings +- real changed-`DataAddr` preservation is covered by test + +### P2: Slice 1 Validation + +1. engine-side tests for ownership/fencing +2. changed-address case +3. stale-session rejection case +4. epoch-bump invalidation case +5. traceability back to accepted prototype behavior + +Status: + +- accepted +- Slice 1 ownership/fencing tests are in place and passing +- acceptance/gate mapping is strong enough to move to Slice 2 + +### P3: Slice 2 Planning Setup + +1. define Slice 2 boundaries explicitly +2. distinguish Slice 2 core from carried-forward prototype support +3. map Slice 2 engine expectations from accepted prototype evidence +4. prepare Slice 2 validation targets + +Status: + +- accepted +- Slice 2 recovery execution core is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `30` total + +### P4: Slice 3 Planning Setup + +1. define Slice 3 boundaries explicitly +2. connect recovery decisions to real engine recoverability inputs +3. make trusted-base / rebuild-source decision use real engine data inputs +4. 
prepare Slice 3 validation targets + +Status: + +- accepted +- Slice 3 data / recoverability core is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `18` recoverability tests + - `48` total +- important boundary preserved: + - engine proves historical-correctness prerequisites + - full historical reconstruction proof remains simulator-side + +## Slice 3 Guardrails + +Slice 3 is the point where V2 must move from: + +- recovery automaton is coherent + +to: + +- recovery basis is provable + +So Slice 3 must stay tight. + +### Guardrail 1: No optimistic watermark in place of recoverability proof + +Do not accept: + +- loose head/tail watermarks +- "looks retained enough" +- heuristic recoverability + +Slice 3 should prove: + +1. why a gap is recoverable +2. why a gap is unrecoverable + +### Guardrail 2: No current extent state pretending to be historical correctness + +Do not accept: + +- current extent image as substitute for target-LSN truth +- checkpoint/base state that leaks newer state into older historical queries + +Slice 3 should prove historical correctness at the actual recovery target. + +### Guardrail 3: No `snapshot + tail` without trusted-base proof + +Do not accept: + +- "snapshot exists" as sufficient + +Require: + +1. trusted base exists +2. trusted base covers the required base state +3. retained tail can be replayed continuously from that base to the target + +If not, recovery must use: + +- `FullBase` + +### Guardrail 4: Truncation is protocol boundary, not cleanup policy + +Do not treat truncation as: + +- optional cleanup +- post-recovery tidying + +Treat truncation as: + +1. divergent tail removal +2. explicit safe-boundary restoration +3. prerequisite for safe `InSync` / recovery completion where applicable + +### P5: Slice 4 Planning Setup + +1. define Slice 4 boundaries explicitly +2. connect engine control/recovery core to real assignment/control intent entry path +3. 
add engine observability / debug surface for ownership and recovery failures +4. prepare integration validation against V2-boundary failure classes + +Status: + +- accepted +- Slice 4 integration closure is implemented and validated +- corrected tester summary accepted: + - `12` ownership tests + - `18` recovery tests + - `18` recoverability tests + - `11` integration tests + - `59` total + +## Slice 4 Guardrails + +Slice 4 should close integration, not just add an entry point and some logs. + +### Guardrail 1: Entry path must actually drive recovery + +Do not accept: + +- tests that manually push sender/session state while only pretending to use integration entry points + +Require: + +1. real assignment/control intent entry path +2. session creation / invalidation / restart triggered through that path +3. recovery flow driven from that path, not only from unit-level helper calls + +### Guardrail 2: Changed-address must survive the real entry path + +Do not accept: + +- changed-address correctness proven only at local object level + +Require: + +1. stable `ReplicaID` survives real assignment/update entry path +2. endpoint update invalidates old session correctly +3. new recovery session is created correctly on updated endpoint + +### Guardrail 3: Observability must show protocol causality + +Do not accept: + +- only state snapshots +- only phase dumps + +Require observability that can explain: + +1. why recovery entered `NeedsRebuild` +2. why a session was superseded +3. why a completion or progress update was rejected +4. why endpoint / epoch change caused invalidation + +### Guardrail 4: Failure replay must be explainable + +Do not accept: + +- a replay that reproduces failure but cannot explain the cause from engine observability + +Require: + +1. selected failure-class replays through the real entry path +2. observability sufficient to explain the control/recovery decision +3. 
reviewability against key V2-boundary failures + +## Exit Criteria + +Phase 05 Slice 1 is done when: + +1. the real V2 engine module location is chosen +2. Slice 1 boundaries are explicit +3. engine ownership core exists under `sw-block/` +4. engine-side ownership/fencing tests pass +5. Slice 1 evidence is reviewable against prototype expectations + +This bar is now met. + +Phase 05 Slice 2 is done when: + +1. engine-side recovery execution flow exists +2. zero-gap / catch-up / needs-rebuild branching is explicit +3. stale execution is rejected during active recovery +4. bounded catch-up semantics are enforced in engine path +5. rebuild execution shell is validated + +This bar is now met. + +Phase 05 Slice 3 is done when: + +1. recoverable vs unrecoverable gap uses real engine recoverability inputs +2. trusted-base / rebuild-source decision uses real engine data inputs +3. truncation / safe-boundary handling is tied to real engine state +4. history-driven engine APIs exist for recovery decisions +5. Slice 3 validation is reviewable without overclaiming full historical reconstruction + +This bar is now met. + +Phase 05 Slice 4 is done when: + +1. real assignment/control intent entry path exists +2. changed-address recovery works through the real entry path +3. observability explains protocol causality, not only state snapshots +4. selected V2-boundary failures are replayable and diagnosable through engine integration tests + +This bar is now met. + +## Assignment For `sw` + +Phase 05 is now complete. + +Next phase: + +- `Phase 06` broader engine implementation stage + +## Assignment For `tester` + +Phase 05 validation is complete. + +Next phase: + +- `Phase 06` engine implementation validation against real-engine constraints and failure classes + +## Management Rule + +`Phase 05` should stay narrow. + +It should start the engine line with: + +1. ownership +2. fencing +3. validation + +It should not try to absorb later slices early. 
diff --git a/sw-block/.private/phase/phase-06-decisions.md b/sw-block/.private/phase/phase-06-decisions.md new file mode 100644 index 000000000..31669178a --- /dev/null +++ b/sw-block/.private/phase/phase-06-decisions.md @@ -0,0 +1,68 @@ +# Phase 06 Decisions + +## Decision 1: Phase 06 is broader engine implementation, not new design + +The protocol shape and engine core contracts were already accepted. + +Phase 06 implemented around them. + +## Decision 2: Phase 06 must connect to real constraints + +This phase explicitly used: + +1. `learn/projects/sw-block/` for failure gates and test lineage +2. `weed/storage/block*` for real implementation constraints + +without importing V1 structure as the V2 design template. + +## Decision 3: Phase 06 should replace key synchronous conveniences + +The accepted Slice 4 convenience flows were sufficient for closure work, but broader engine work required real step boundaries. + +This is now satisfied via planner/executor separation. + +## Decision 4: Phase 06 ends with a runnable engine stage decision + +Result: + +- yes, the project now has a broader runnable engine stage that is ready to proceed to real-system integration / product-path work + +## Decision 5: Phase 06 P0 is accepted + +Accepted scope: + +1. adapter/module boundaries +2. convenience-flow classification +3. initial real-engine stage framing + +## Decision 6: Phase 06 P1 is accepted + +Accepted scope: + +1. storage/control adapter interfaces +2. `RecoveryDriver` planner/resource-acquisition layer +3. full-base and WAL retention resource contracts +4. fail-closed preconditions on planning paths + +## Decision 7: Phase 06 P2 is accepted + +Accepted scope: + +1. explicit planner/executor split on top of `RecoveryPlan` +2. executor-owned cleanup symmetry on success/failure/cancellation +3. plan-bound rebuild execution with no policy re-derivation at execute time +4. 
synchronous orchestrator completion helpers remain test-only convenience + +## Decision 8: Phase 06 P3 is accepted + +Accepted scope: + +1. selected real failure classes validated through the engine path +2. cross-layer engine/storage proof validation +3. diagnosable failure when proof or resource acquisition cannot be established + +## Decision 9: Phase 06 is complete + +Next step: + +- `Phase 07` real-system integration / product-path decision diff --git a/sw-block/.private/phase/phase-06-log.md b/sw-block/.private/phase/phase-06-log.md new file mode 100644 index 000000000..f483d4051 --- /dev/null +++ b/sw-block/.private/phase/phase-06-log.md @@ -0,0 +1,51 @@ +# Phase 06 Log + +## 2026-03-30 + +### Opened + +`Phase 06` opened as: + +- broader engine implementation stage + +### Starting basis + +1. `Phase 05`: complete +2. engine core and integration closure accepted +3. next work moves from slice proof to broader runnable engine stage + +### Accepted + +1. Phase 06 P0 + - adapter/module boundaries defined + - convenience flows explicitly classified + +2. Phase 06 P1 + - storage/control adapter surfaces defined + - `RecoveryDriver` added as planner/resource-acquisition layer + - full-base rebuild now has explicit resource contract + - WAL pin contract tied to actual recovery need + - driver preconditions fail closed + +3. Phase 06 P2 + - explicit planner/executor split accepted + - executor owns release symmetry on success, failure, and cancellation + - rebuild execution now consumes plan-bound source/target values + - tester final validation accepted with reduced-but-sufficient rebuild failure-path coverage + +4. 
Phase 06 P3 + - selected real failure classes validated through the engine path + - changed-address restart now uses plan cancellation and re-plan flow + - stale execution is caught through the executor-managed loop + - cross-layer trusted-base / replayable-tail proof path validated end-to-end + - rebuild planning failures now clean up sessions and remain diagnosable + +### Closed + +`Phase 06` closed as complete. + +### Next + +1. Phase 07 real-system integration / product-path decision +2. service-slice integration against real control/storage surroundings +3. first product-path gating decision diff --git a/sw-block/.private/phase/phase-06.md b/sw-block/.private/phase/phase-06.md new file mode 100644 index 000000000..d54988066 --- /dev/null +++ b/sw-block/.private/phase/phase-06.md @@ -0,0 +1,193 @@ +# Phase 06 + +Date: 2026-03-30 +Status: complete +Purpose: move from validated engine slices to the first broader runnable V2 engine stage + +## Why This Phase Exists + +`Phase 05` established and validated: + +1. ownership core +2. recovery execution core +3. recoverability/data gating core +4. integration closure + +What still does not exist is a broader engine stage that can run with: + +1. real control-plane inputs +2. real persistence/backing inputs +3. non-trivial execution loops instead of only synchronous convenience paths + +So `Phase 06` exists to turn the accepted engine shape into the first broader runnable engine stage. + +Phase 06 must connect the accepted engine core to real control and real storage truth, not just wrap current abstractions with adapters. + +## Phase Goal + +Build the first broader V2 engine stage without reopening protocol shape. + +This phase should focus on: + +1. real engine adapters around the accepted core +2. asynchronous or stepwise execution paths where Slice 4 used synchronous helpers +3. real retained-history / checkpoint input plumbing +4. 
validation against selected real failure classes and real implementation constraints + +## Overall Roadmap + +Completed: + +1. Phase 01-03: design + simulator +2. Phase 04: prototype closure +3. Phase 4.5: evidence hardening +4. Phase 05: engine slice closure +5. Phase 06: broader engine implementation stage + +Next: + +1. Phase 07: real-system integration / product-path decision + +This roadmap should stay strict: + +- no return to broad prototype expansion +- no uncontrolled engine sprawl + +## Scope + +### In scope + +1. control-plane adapter into `sw-block/engine/replication/` +2. retained-history / checkpoint adapter into engine recoverability APIs +3. replacement of synchronous convenience flows with explicit engine steps where needed +4. engine error taxonomy and observability tightening +5. validation against selected real failure classes from: + - `learn/projects/sw-block/` + - `weed/storage/block*` + +### Out of scope + +1. Smart WAL expansion +2. full backend redesign +3. performance optimization as primary goal +4. V1 replacement rollout +5. 
full product integration + +## Phase 06 Items + +### P0: Engine Stage Plan + +Status: + +- accepted +- module boundaries now explicit: + - `adapter.go` + - `driver.go` + - `orchestrator.go` classification +- convenience flows are now classified as: + - test-only convenience wrapper + - stepwise engine task + - planner/executor split + +### P1: Control / History Adapters + +Status: + +- accepted +- `StorageAdapter` boundary exists and is exercised by tests +- full-base rebuild now has a real pin/release contract +- WAL pinning is tied to actual recovery contract, not loose watermark use +- planner fails closed on missing sender / missing session / wrong session kind + +### P2: Execution Driver + +Status: + +- accepted +- executor now owns resource lifecycle on success / failure / cancellation +- catch-up execution is stepwise and budget-checked per progress step +- rebuild execution consumes plan-bound source/target values and does not re-derive policy at execute time +- `CompleteCatchUp` / `CompleteRebuild` remain test-only convenience wrappers +- tester validation accepted with reduced-but-sufficient rebuild failure-path coverage + +### P3: Validation Against Real Failure Classes + +Status: + +- accepted +- changed-address restart now validated through planner/executor path with plan cancellation +- stale epoch/session during active execution now validated through the executor-managed loop +- cross-layer trusted-base / replayable-tail proof path validated end-to-end +- rebuild fallback and pin-failure cleanup now fail closed and are diagnosable + +## Guardrails + +### Guardrail 1: Do not reopen protocol shape + +Phase 06 implemented around accepted engine slices and did not reopen: + +1. sender/session authority model +2. bounded catch-up contract +3. recoverability/truncation boundary + +### Guardrail 2: Do not let adapters smuggle V1 structure back in + +V1 code and docs remain: + +1. constraints +2. failure gates +3. 
integration references + +not the V2 architecture template. + +### Guardrail 3: Prefer explicit engine steps over synchronous convenience + +Key convenience helpers remain test-only. Real engine work now has explicit planner/executor boundaries. + +### Guardrail 4: Keep evidence quality high + +Phase 06 improved: + +1. cross-layer traceability +2. diagnosability +3. real-failure validation + +without growing protocol surface. + +### Guardrail 5: Do not fake storage truth with metadata-only adapters + +Phase 06 now requires: + +1. trusted base to come from storage-side truth +2. replayable tail to be grounded in retention state +3. observable rejection when those proofs cannot be established + +## Exit Criteria + +Phase 06 is done when: + +1. engine has real control/history adapters into the accepted core +2. engine has real storage/base adapters into the accepted core +3. key synchronous convenience paths are explicitly classified or replaced by real engine steps where necessary +4. selected real failure classes are validated against the engine stage +5. at least one cross-layer storage/engine proof path is validated end-to-end +6. engine observability remains good enough to explain recovery causality + +Status: + +- met + +## Closeout + +`Phase 06` is complete. + +It established: + +1. a broader runnable engine stage around the accepted Phase 05 core +2. real planner/executor/resource contracts +3. validated failure-class behavior through the engine path +4. 
diagnosable proof rejection and cleanup behavior + +Next step: + +- `Phase 07` real-system integration / product-path decision diff --git a/sw-block/.private/phase/phase-07-decisions.md b/sw-block/.private/phase/phase-07-decisions.md new file mode 100644 index 000000000..a0440ca9c --- /dev/null +++ b/sw-block/.private/phase/phase-07-decisions.md @@ -0,0 +1,119 @@ +# Phase 07 Decisions + +## Decision 1: Phase 07 is real-system integration, not protocol redesign + +The V2 protocol shape, engine core, and broader runnable engine stage are already accepted. + +Phase 07 should integrate them into a real-system service slice. + +## Decision 2: Phase 07 should make the first product-path decision + +This phase should not only integrate a service slice. + +It should also decide: + +1. what the first product path is +2. what remains before pre-production hardening + +## Decision 3: Phase 07 must preserve accepted V2 boundaries + +Phase 07 should preserve: + +1. narrow catch-up semantics +2. rebuild as the formal recovery path +3. trusted-base / replayable-tail proof boundaries +4. stable identity / fenced execution / diagnosable failure handling + +## Decision 4: Phase 07 P0 service-slice direction is set + +Current direction: + +1. first service slice = `RF=2` block volume primary + one replica +2. engine remains in `sw-block/engine/replication/` +3. current bridge work starts in `sw-block/bridge/blockvol/` +4. deferred real blockvol-side bridge target = `weed/storage/blockvol/v2bridge/` +5. stable identity mapping is explicit: + - `ReplicaID = <ServerID>/<VolumeID>` +6. `blockvol` executes I/O but does not own recovery policy + +## Decision 5: Phase 07 P1 is accepted with explicit scope limits + +Accepted `P1` coverage is: + +1. real reader mapping from `BlockVol` state +2. real retention hold / release wiring into the flusher retention floor +3. one real WAL catch-up scan path through `v2bridge` +4. 
direct real-adapter tests under `weed/storage/blockvol/v2bridge/` + +This acceptance means: + +1. the real bridge path is now integrated and evidenced +2. `P1` is not yet acceptance proof of general post-checkpoint catch-up viability + +Not accepted as part of `P1`: + +1. snapshot transfer execution +2. full-base transfer execution +3. WAL truncation execution +4. master-side confirmed failover / control-intent integration + +## Decision 6: Interim committed-truth limitation remains active + +`Phase 07 P1` is accepted with an explicit carry-forward limitation: + +1. interim `CommittedLSN = CheckpointLSN` is a service-slice mapping, not final V2 protocol truth +2. post-checkpoint catch-up semantics are therefore narrower than final V2 intent +3. later `Phase 07` work must not overclaim this limitation as solved until commit truth is separated from checkpoint truth + +## Decision 7: Phase 07 P2 is accepted with scoped replay claims + +Accepted `P2` coverage is: + +1. real service-path replay for changed-address restart +2. stale epoch / stale session invalidation through the integrated path +3. unrecoverable-gap / needs-rebuild replay with diagnosable proof +4. explicit replay of the post-checkpoint boundary under the interim model + +Not accepted as part of `P2`: + +1. general integrated engine-driven post-checkpoint catch-up semantics +2. real control-plane delivery from master heartbeat into the bridge +3. rebuild execution beyond the already-deferred executor stubs + +## Decision 8: Phase 07 now moves to product-path choice, not more bridge-shape proof + +With `P0`, `P1`, and `P2` accepted, the next step is: + +1. choose the first product path from accepted service-slice evidence +2. define what remains before pre-production hardening +3. 
keep unresolved limits explicit rather than hiding them behind broader claims + +## Decision 9: Phase 07 P2 must replay the interim limitation explicitly + +`Phase 07 P2` should not only replay happy-path or ordinary failure-path integration. + +It should also include one explicit replay where: + +1. the live bridge path is exercised after checkpoint truth has advanced +2. the observed catch-up limitation is diagnosed as a consequence of the interim mapping +3. the result is not overclaimed as proof of final V2 post-checkpoint catch-up semantics + +## Decision 10: Phase 07 P3 is accepted and Phase 07 is complete + +The first V2 product path is now explicitly chosen as: + +1. `RF=2` +2. `sync_all` +3. existing master / volume-server heartbeat path +4. V2 engine owns recovery policy +5. `v2bridge` provides real storage truth + +This decision is accepted with explicit non-claims: + +1. not production-ready +2. no real master-side control delivery proof yet +3. no full rebuild execution proof yet +4. no general post-checkpoint catch-up proof yet +5. no full integrated engine -> executor -> `v2bridge` catch-up proof yet + +Phase 07 is therefore complete, and the next phase is pre-production hardening. diff --git a/sw-block/.private/phase/phase-07-log.md b/sw-block/.private/phase/phase-07-log.md new file mode 100644 index 000000000..a59830579 --- /dev/null +++ b/sw-block/.private/phase/phase-07-log.md @@ -0,0 +1,63 @@ +# Phase 07 Log + +## 2026-03-30 + +### Opened + +`Phase 07` opened as: + +- real-system integration / product-path decision + +### Starting basis + +1. `Phase 06`: complete +2. broader runnable engine stage accepted +3. next work moves from engine-stage validation to real-system service-slice integration + +### Delivered + +1. 
Phase 07 P0 + - service-slice plan defined + - implementation slice proposal delivered + - bridge layer introduced as: + - `sw-block/bridge/blockvol/` for current bridge work + - `weed/storage/blockvol/v2bridge/` as the deferred real integration target + - stable identity mapping made explicit: + - `ReplicaID = /` + - engine / blockvol policy boundary made explicit + - initial bridge tests delivered (`8`) +2. Phase 07 P1 + - real blockvol reader integrated via `weed/storage/blockvol/v2bridge/reader.go` + - real pinner integrated via `weed/storage/blockvol/v2bridge/pinner.go` + - one real catch-up executor path integrated via `weed/storage/blockvol/v2bridge/executor.go` + - direct real-adapter tests delivered in: + - `weed/storage/blockvol/v2bridge/bridge_test.go` + - accepted with explicit carry-forward: + - interim `CommittedLSN = CheckpointLSN` limits post-checkpoint catch-up semantics and is not final V2 commit truth + - acceptance is for the real integrated bridge path, not for general post-checkpoint catch-up viability +3. Phase 07 P2 + - real service-path failure replay accepted + - accepted replay set includes: + - changed-address restart + - stale epoch / stale session invalidation + - unrecoverable-gap / needs-rebuild replay + - explicit post-checkpoint boundary replay + - evidence kept explicitly scoped: + - real `v2bridge` WAL-scan execution proven + - general integrated post-checkpoint catch-up semantics not overclaimed under the interim model +4. Phase 07 P3 + - product-path decision accepted + - first product path chosen as: + - `RF=2` + - `sync_all` + - existing master / volume-server heartbeat path + - V2 engine recovery ownership with `v2bridge` real storage truth + - pre-hardening prerequisites made explicit + - intentional deferrals and non-claims recorded + - `Phase 07` completed + +### Next + +1. Phase 08 pre-production hardening +2. real master/control delivery integration +3. 
integrated catch-up / rebuild execution closure diff --git a/sw-block/.private/phase/phase-07.md b/sw-block/.private/phase/phase-07.md new file mode 100644 index 000000000..e14cd3fd5 --- /dev/null +++ b/sw-block/.private/phase/phase-07.md @@ -0,0 +1,220 @@ +# Phase 07 + +Date: 2026-03-30 +Status: complete +Purpose: connect the broader runnable V2 engine stage to a real-system service slice and decide the first product path + +## Why This Phase Exists + +`Phase 06` completed the broader runnable engine stage: + +1. planner/executor/resource contracts are real +2. selected real failure classes are validated through the engine path +3. cross-layer trusted-base / replayable-tail proof path is validated + +What still does not exist is a real-system slice where the engine runs inside actual service boundaries with real control/storage surroundings. + +So `Phase 07` exists to answer: + +1. how the engine runs as a real subsystem +2. what the first product path should be +3. what integration risks remain before pre-production hardening + +## Phase Goal + +Establish a real-system integration slice for the V2 engine and make the first product-path decision without reopening protocol shape. + +## Scope + +### In scope + +1. service-slice integration around `sw-block/engine/replication/` +2. real control-plane / lifecycle entry path into the engine +3. real storage-side adapter hookup into existing system boundaries +4. selected real-system failure replay and diagnosis +5. explicit product-path decision framing + +### Out of scope + +1. broad performance optimization +2. Smart WAL expansion +3. full V1 replacement rollout +4. broad backend redesign +5. production rollout itself + +## Phase 07 Items + +### P0: Service-Slice Plan + +1. define the first real-system service slice that will host the engine +2. define adapter/module boundaries at the service boundary +3. choose the concrete integration path to exercise first +4. 
identify which current adapters are still mock/test-only and must be replaced first +5. make the first-slice identity/epoch mapping explicit +6. treat `blockvol` as execution backend only, not recovery-policy owner + +Status: + +- delivered +- planning artifact: + - `sw-block/design/phase-07-service-slice-plan.md` +- implementation slice proposal: + - engine core: `sw-block/engine/replication/` + - bridge adapters: `sw-block/bridge/blockvol/` + - real blockvol integration target: `weed/storage/blockvol/v2bridge/` (`P1`) +- adapter replacement order: + - `control_adapter.go` (`P0`) done + - `storage_adapter.go` (`P0`) done + - `executor_bridge.go` (`P1`) deferred + - `observe_adapter.go` (`P1`) deferred +- first-slice identity mapping is explicit: + - `ReplicaID = /` + - not derived from any address field +- engine / blockvol boundary is explicit: + - bridge maps intent and state + - `blockvol` executes I/O + - `blockvol` does not own recovery policy +- service-slice validation gaps called out for `P1`: + - real blockvol field mapping + - real pin/release lifecycle against reclaim/GC + - assignment timing vs engine session lifecycle + - executor bridge into real WAL/snapshot work + +### P1: Real Entry-Path Integration + +1. connect real control/lifecycle events into the engine entry path +2. connect real storage/base/recoverability signals into the engine adapters +3. 
preserve accepted engine authority/execution/recoverability contracts + +Status: + +- accepted +- real integration now established for: + - reader via `weed/storage/blockvol/v2bridge/reader.go` + - pinner via `weed/storage/blockvol/v2bridge/pinner.go` + - catch-up executor path via `weed/storage/blockvol/v2bridge/executor.go` +- direct real-adapter tests now exist in: + - `weed/storage/blockvol/v2bridge/bridge_test.go` +- accepted scope is explicit: + - real reader + - real retention hold / release + - real WAL catch-up scan path + - direct real bridge evidence for the integrated path +- still deferred: + - `TransferSnapshot` + - `TransferFullBase` + - `TruncateWAL` + - control intent from confirmed failover / master-side integration +- carry-forward limitation: + - under interim `CommittedLSN = CheckpointLSN`, this slice proves a real bridge path, not general post-checkpoint catch-up viability + - post-checkpoint catch-up semantics therefore remain narrower than final V2 intent and do not represent final V2 commit semantics + +### P2: Real-System Failure Replay + +1. replay selected real failure classes against the integrated service slice +2. confirm diagnosability from logs/status +3. identify any remaining mismatch between engine-stage assumptions and real system behavior + +Status: + +- accepted +- real service-path replay now accepted for: + - changed-address restart + - stale epoch / stale session invalidation + - unrecoverable-gap / needs-rebuild replay + - explicit post-checkpoint boundary replay under the interim model +- accepted with scoped limitation: + - real `v2bridge` WAL-scan execution is proven + - full integrated engine-driven catch-up semantics are not overclaimed under interim `CommittedLSN = CheckpointLSN` +- control-plane delivery remains simulated via direct `AssignmentIntent` construction +- carry-forward remains explicit: + - post-checkpoint catch-up semantics are still narrower than final V2 intent + +### P3: Product-Path Decision + +1. 
choose the first product path for V2 +2. define what remains before pre-production hardening +3. record what is still intentionally deferred + +Status: + +- accepted +- first product path chosen: + - `RF=2` + - `sync_all` + - existing master / volume-server heartbeat path + - V2 engine owns recovery policy + - `v2bridge` provides real storage truth +- proposal is evidence-grounded and explicitly bounded by accepted `P0/P1/P2` evidence +- pre-hardening prerequisites are explicit: + - real master control delivery + - full integrated engine -> executor -> `v2bridge` catch-up chain + - separation of committed truth from checkpoint truth + - rebuild execution (`snapshot` / `full-base` / `truncation`) + - pinner / flusher behavior under concurrent load +- intentionally deferred: + - `RF>2` + - Smart WAL optimizations + - `best_effort` background recovery + - performance tuning + - full V1 replacement +- non-claims remain explicit: + - not production-ready + - no end-to-end rebuild proof yet + - no general post-checkpoint catch-up proof + - no real master heartbeat/control delivery proof yet + - no full integrated engine -> executor -> `v2bridge` catch-up proof yet + +## Guardrails + +### Guardrail 1: Do not re-import V1 structure as the design owner + +Use `weed/storage/block*` and `learn/projects/sw-block/` as constraints and validation sources, not as the architecture template. + +### Guardrail 2: Keep catch-up narrow and rebuild explicit + +Do not use integration work as an excuse to widen catch-up semantics or blur rebuild as the formal recovery path. + +### Guardrail 3: Prefer real entry paths over test-only wrappers + +The integrated slice should exercise real service boundaries, not only internal engine helpers. + +### Guardrail 4: Observability must explain causality + +Integrated logs/status must explain: + +1. why rebuild was required +2. why proof was rejected +3. why execution was cancelled or invalidated +4. 
why a product-path integration failed + +### Guardrail 5: Stable identity must not collapse back to address shape + +For the first slice, `ReplicaID` must be derived from master/block-registry identity, not current endpoint addresses. + +### Guardrail 6: `blockvol` executes I/O but does not own recovery policy + +The service bridge may translate engine decisions into concrete blockvol actions, but it must not re-decide: + +1. zero-gap / catch-up / rebuild +2. trusted-base validity +3. replayable-tail sufficiency +4. rebuild fallback requirement + +## Exit Criteria + +Phase 07 is done when: + +1. one real-system service slice is integrated with the engine +2. selected real-system failure classes are replayed through that slice +3. diagnosability is sufficient for service-slice debugging +4. the first product path is explicitly chosen +5. the remaining work to pre-production hardening is clear + +## Assignment For `sw` + +Next tasks move to `Phase 08`. + +## Assignment For `tester` + +Next tasks move to `Phase 08`. diff --git a/sw-block/.private/phase/phase-08-decisions.md b/sw-block/.private/phase/phase-08-decisions.md new file mode 100644 index 000000000..7e8dbed2b --- /dev/null +++ b/sw-block/.private/phase/phase-08-decisions.md @@ -0,0 +1,78 @@ +# Phase 08 Decisions + +## Decision 1: Phase 08 is pre-production hardening, not protocol rediscovery + +The accepted V2 product path from `Phase 07` is the basis. + +`Phase 08` should harden that path rather than reopen accepted protocol shape. + +## Decision 2: The first hardening priorities are control delivery and execution closure + +The most important remaining gaps are: + +1. real master/control delivery into the bridge/engine path +2. integrated engine -> executor -> `v2bridge` catch-up execution closure +3. first rebuild execution path for the chosen product path + +## Decision 3: Carry-forward limitations remain explicit until closed + +Phase 08 must keep explicit: + +1. 
committed truth is still not separated from checkpoint truth +2. rebuild execution is still incomplete +3. current control delivery is still simulated + +## Decision 4: Phase 08 P0 is accepted + +The hardening plan is sufficiently specified to begin implementation work. + +In particular, `P0` now fixes: + +1. the committed-truth gate decision requirement +2. the unified replay requirement after control and execution closure +3. the need for at least one real failover / reassignment validation target +## Decision 5: The committed-truth limitation must become a hardening gate + +Phase 08 must explicitly decide one of: + +1. `CommittedLSN != CheckpointLSN` separation is mandatory before a production-candidate phase +2. the first candidate path is intentionally bounded to the currently proven pre-checkpoint replay behavior + +It must not remain only a documented carry-forward. + +## Decision 6: Unified-path replay is required after control and execution closure + +Once real control delivery and integrated execution closure land, `Phase 08` must replay the accepted failure-class set again on the unified live path. + +This prevents independent closure of: + +1. control delivery +2. execution closure + +without proving that they behave correctly together. + +## Decision 7: Real failover / reassignment validation is mandatory for the chosen path + +Because the chosen product path depends on the existing master / volume-server heartbeat path, at least one real failover / promotion / reassignment cycle must be a named hardening target in `Phase 08`. + +## Decision 8: Phase 08 should reuse the existing Seaweed control/runtime path, not invent a new one + +For the first hardening path, implementation should preferentially reuse: + +1. existing master / heartbeat / assignment delivery +2. existing volume-server assignment receive/apply path +3. existing `blockvol` runtime and `v2bridge` storage/runtime hooks + +This reuse is about: + +1. control-plane reality +2. 
storage/runtime reality +3. execution-path reality + +It is not permission to inherit old policy semantics as V2 truth. + +The hard rule remains: + +1. engine owns recovery policy +2. bridge translates confirmed control/storage truth +3. `blockvol` executes I/O diff --git a/sw-block/.private/phase/phase-08-log.md b/sw-block/.private/phase/phase-08-log.md new file mode 100644 index 000000000..7621b482a --- /dev/null +++ b/sw-block/.private/phase/phase-08-log.md @@ -0,0 +1,21 @@ +# Phase 08 Log + +## 2026-03-31 + +### Opened + +`Phase 08` opened as: + +- pre-production hardening + +### Starting basis + +1. `Phase 07`: complete +2. first V2 product path chosen +3. remaining gaps are integration and hardening gaps, not protocol-discovery gaps + +### Next + +1. Phase 08 P0 accepted +2. Phase 08 P1 real master/control delivery integration +3. Phase 08 P2 integrated execution closure diff --git a/sw-block/.private/phase/phase-08.md b/sw-block/.private/phase/phase-08.md new file mode 100644 index 000000000..7e1412496 --- /dev/null +++ b/sw-block/.private/phase/phase-08.md @@ -0,0 +1,254 @@ +# Phase 08 + +Date: 2026-03-31 +Status: active +Purpose: convert the accepted Phase 07 product path into a pre-production-hardening program without reopening accepted V2 protocol shape + +## Why This Phase Exists + +`Phase 07` completed: + +1. a real service-slice integration around the V2 engine +2. real storage-truth bridge evidence through `v2bridge` +3. selected real-system failure replay +4. the first explicit product-path decision + +What still does not exist is a pre-production-ready system path. The remaining work is no longer protocol discovery. It is closing the operational and integration gaps between the accepted product path and a hardened deployment candidate. + +## Phase Goal + +Harden the first accepted V2 product path until the remaining gap to a production candidate is explicit, bounded, and implementation-driven. + +## Scope + +### In scope + +1. 
real master/control delivery into the engine service path +2. integrated engine -> executor -> `v2bridge` execution closure +3. rebuild execution closure for the accepted product path +4. operational/debuggability hardening +5. concurrency/load validation around retention and recovery + +### Out of scope + +1. new protocol redesign +2. `RF>2` coordination +3. Smart WAL optimization work +4. broad performance tuning beyond validation needed for hardening +5. full V1 replacement rollout + +## Phase 08 Items + +### P0: Hardening Plan + +1. convert the accepted `Phase 07` product path into a hardening plan +2. define the minimum pre-production gates +3. order the remaining integration closures by risk +4. make an explicit gate decision on committed truth vs checkpoint truth: + - either separate `CommittedLSN` from `CheckpointLSN` before a production-candidate phase + - or explicitly bound the first candidate path to the currently proven pre-checkpoint replay behavior + +Status: + +- planning package accepted in this phase doc +- first hardening priorities are fixed as: + - real master/control delivery + - integrated engine -> executor -> `v2bridge` catch-up execution chain + - first rebuild execution path +- the committed-truth carry-forward is now a required hardening gate, not just a note: + - either separate `CommittedLSN` from `CheckpointLSN` before a production-candidate phase + - or explicitly bound the first candidate path to the currently proven pre-checkpoint replay behavior +- at least one real failover / promotion / reassignment cycle is a required hardening target +- once `P1` and `P2` land, the accepted failure-class set must be replayed again on the newly unified live path +- the validation oracle for `Phase 08` is expected to reject overclaiming around: + - catch-up semantics + - rebuild execution + - master/control delivery + - candidate-path readiness vs production readiness +- accepted + +### P1: Real Control Delivery + +1. 
connect real master/heartbeat assignment delivery into the bridge +2. replace direct `AssignmentIntent` construction for the first live path +3. preserve stable identity and fenced authority through the real control path +4. include at least one real failover / promotion / reassignment validation target on the chosen `sync_all` path + +Technical focus: + +- keep the control-path split explicit: + - master confirms assignment / epoch / role + - bridge translates confirmed control truth into engine intent + - engine owns sender/session/recovery policy + - `blockvol` does not re-decide recovery policy +- preserve the identity rule through the live path: + - `ReplicaID = /` + - endpoint change updates location but must not recreate logical identity +- preserve the fencing rule through the live path: + - stale epoch must invalidate old authority + - stale session must not mutate current lineage + - address change must invalidate the old live session before the new path proceeds +- treat failover / promotion / reassignment as control-truth events first, not storage-side heuristics + +Implementation route (`reuse map`): + +- reuse directly as the first hardening carrier: + - `weed/server/master_grpc_server.go` + - `weed/server/volume_grpc_client_to_master.go` + - `weed/server/volume_server_block.go` + - `weed/server/master_block_registry.go` + - `weed/server/master_block_failover.go` +- reuse as storage/runtime execution reality: + - `weed/storage/blockvol/blockvol.go` + - `weed/storage/blockvol/replica_apply.go` + - `weed/storage/blockvol/replica_barrier.go` + - `weed/storage/blockvol/v2bridge/` +- preserve the V2 boundary while reusing these files: + - reuse transport/control/runtime reality + - do not inherit old policy semantics as V2 truth + - keep engine as the recovery-policy owner + - keep `blockvol` as the I/O executor + +Expectation note: + +- the `P1` tester expectation is already embedded in this phase doc under: + - `P1 / Validation focus` + - `P1 / Reject 
if` +- do not grow a separate long template unless `P1` scope expands materially + +Validation focus: + +- prove live assignment delivery into the bridge/engine path +- prove stable `ReplicaID` across address refresh on the live path +- prove stale epoch / stale session invalidation through the live path +- prove at least one real failover / promotion / reassignment cycle on the chosen `sync_all` path +- prove the resulting logs explain: + - why reassignment happened + - why a session was invalidated + - which epoch / identity / endpoint drove the transition + +Reject if: + +- address-shaped identity reappears anywhere in the control path +- bridge starts re-deriving catch-up vs rebuild policy from convenience inputs +- old epoch or old session can still mutate after the new control truth arrives +- failover / reassignment is claimed without a real replay target +- delivery claims general production readiness rather than control-path closure + +### P2: Execution Closure + +1. close the live engine -> executor -> `v2bridge` execution chain +2. make catch-up execution evidence integrated rather than split across layers +3. close the first rebuild execution path required by the product path + +### P3: Hardening Validation + +1. validate diagnosability under the live integrated path +2. validate retention/pinner behavior under concurrent load +3. replay the accepted failure-class set again on the newly unified live path after `P1` and `P2` land +4. 
confirm the remaining gap to a production candidate + +Validation focus: + +- prove the chosen path through a real control-delivery path +- prove the live engine -> executor -> `v2bridge` execution chain as one path, not split evidence +- prove the first rebuild execution path required by the chosen product path +- prove at least one real failover / promotion / reassignment cycle +- prove concurrent retention/pinner behavior does not break recovery guarantees + +Reject if: + +- catch-up semantics are overclaimed beyond the currently proven boundary +- rebuild is claimed as supported without real execution closure +- master/control delivery is claimed as real without the live path in place +- `CommittedLSN` vs `CheckpointLSN` remains an unclassified note instead of a gate decision +- `P1` and `P2` land independently but the accepted failure-class set is not replayed again on the unified live path + +## Guardrails + +### Guardrail 1: Do not reopen accepted V2 protocol truths casually + +`Phase 08` is a hardening phase. New work should preserve the accepted protocol truth set unless a real contradiction is demonstrated. + +### Guardrail 2: Keep product-path claims evidence-bound + +Do not claim more than the hardened path actually proves. Distinguish: + +1. live integrated path +2. hardened product path +3. production candidate + +### Guardrail 3: Identity and policy boundaries remain hard rules + +1. `ReplicaID` must remain stable and never collapse to address shape +2. engine decides recovery policy +3. bridge translates intent/state +4. `blockvol` executes I/O only + +### Guardrail 4: Carry-forward limitations must remain explicit until closed + +Especially: + +1. committed truth vs checkpoint truth +2. rebuild execution coverage +3. real master/control delivery coverage + +### Guardrail 5: The committed-truth carry-forward must become a gate, not a note + +Before the next phase, `Phase 08` must decide one of: + +1. 
committed-truth separation is mandatory before a production-candidate phase +2. the first candidate path is intentionally bounded to the currently proven pre-checkpoint replay behavior + +It must not remain an unclassified carry-forward. + +## Exit Criteria + +Phase 08 is done when: + +1. the first product path runs through a real control delivery path +2. the critical execution chain is integrated and validated +3. rebuild execution for the chosen path is no longer just detected but executed +4. at least one real failover / reassignment cycle is replayed through the live control path +5. the accepted failure-class set is replayed again on the unified live path +6. operational/debug evidence is sufficient for pre-production use +7. the remaining gap to a production candidate is small and explicit + +## Assignment For `sw` + +Next tasks: + +1. drive `Phase 08 P1` as real master/control delivery integration +2. replace direct `AssignmentIntent` construction for the first live path +3. preserve through the real control path: + - stable `ReplicaID` + - epoch fencing + - address-change invalidation +4. include at least one real failover / promotion / reassignment validation target +5. keep acceptance claims scoped: + - real control delivery path + - not yet general production readiness +6. keep explicit carry-forwards: + - `CommittedLSN != CheckpointLSN` still unresolved + - integrated catch-up execution chain still incomplete + - rebuild execution still incomplete + +## Assignment For `tester` + +Next tasks: + +1. use the accepted `Phase 08` plan framing as the `P1` validation oracle +2. validate real control delivery for: + - live assignment delivery + - stable identity through the control path + - stale epoch/session invalidation + - at least one real failover / reassignment cycle +3. keep the no-overclaim rule active around: + - catch-up semantics + - rebuild execution + - master/control delivery +4. 
keep the committed-truth gate explicit: + - still unresolved in `P1` +5. prepare `P2` follow-up expectations for: + - integrated engine -> executor -> `v2bridge` execution closure + - unified replay after `P1` and `P2` diff --git a/sw-block/.private/phase/phase-4.5-decisions.md b/sw-block/.private/phase/phase-4.5-decisions.md new file mode 100644 index 000000000..3d950764e --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-decisions.md @@ -0,0 +1,59 @@ +# Phase 4.5 Decisions + +## Decision 1: Phase 4.5 remains a bounded hardening phase + +It is not a new architecture line and must not expand into broad feature work. + +Purpose: + +1. tighten recovery boundaries +2. strengthen crash-consistency / recoverability proof +3. clear the path for engine planning + +## Decision 2: `sw` Phase 4.5 P0 is accepted + +Accepted basis: + +1. bounded `CatchUp` now changes prototype behavior +2. `FrozenTargetLSN` is intrinsic to the session contract +3. `Rebuild` is a first-class sender-owned execution path +4. rebuild and catch-up are execution-path exclusive + +## Decision 3: `tester` crash-consistency simulator strengthening is accepted + +Accepted basis: + +1. checkpoint semantics are explicit +2. recoverability after restart is no longer collapsed into a single loose watermark +3. crash-consistency invariants are executable and passing + +## Decision 4: Remaining Phase 4.5 work is evidence hardening, not primitive-building + +Completed focus: + +1. `A5-A8` prototype + simulator double evidence +2. predicate exploration for dangerous states +3. adversarial search over crash-consistency / liveness states + +Remaining optional work: + +4. any low-priority cleanup that improves clarity without reopening design + +## Decision 5: After Phase 4.5, the project should move to engine-planning readiness review + +Unless new blocking flaws appear, the next major decision after `4.5` should be: + +1. real V2 engine planning +2. 
engine slicing plan + +not another broad prototype phase + +## Decision 6: Phase 4.5 is complete + +Reason: + +1. bounded `CatchUp` is semantic in the prototype +2. `Rebuild` is first-class in the prototype +3. crash-consistency / restart-recoverability are materially stronger in the simulator +4. `A5-A8` evidence is materially stronger on both prototype and simulator sides +5. adversarial search found and helped fix a real correctness bug, validating the proof style diff --git a/sw-block/.private/phase/phase-4.5-log.md b/sw-block/.private/phase/phase-4.5-log.md new file mode 100644 index 000000000..2f923be74 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-log.md @@ -0,0 +1,33 @@ +# Phase 4.5 Log + +## 2026-03-29 + +### Accepted + +1. `sw` `Phase 4.5 P0` + - bounded `CatchUp` budget is semantic in `enginev2` + - `FrozenTargetLSN` is a real session invariant + - `Rebuild` is wired into sender execution and is exclusive from catch-up + - rebuild completion goes through `CompleteRebuild`, not generic session completion + +2. `tester` crash-consistency simulator strengthening + - storage-state split introduced and accepted + - checkpoint/restart boundary made explicit + - recoverability upgraded from watermark-style logic to checkpoint + contiguous WAL replayability proof + - core invariant tests for crash consistency now pass + +3. `tester` evidence hardening and adversarial exploration + - grouped simulator evidence for `A5-A8` + - danger predicates added + - adversarial search added and passing + - adversarial search found a real `StateAt(lsn)` historical-state bug + - `StateAt(lsn)` corrected so newer checkpoint/base state does not leak into older historical queries + +4. `Phase 4.5` closeout judgment + - prototype and simulator evidence are now strong enough to stop expanding `4.5` + - next major step should move to engine-readiness review and engine slicing + +### Remaining open work + +1. 
low-priority cleanup + - remove or consolidate redundant frozen-target bookkeeping if no longer needed diff --git a/sw-block/.private/phase/phase-4.5-reason.md b/sw-block/.private/phase/phase-4.5-reason.md new file mode 100644 index 000000000..b183c2e98 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5-reason.md @@ -0,0 +1,397 @@ +# Phase 4.5 Reason + +Date: 2026-03-27 +Status: proposal for dev manager decision +Purpose: explain why a narrow V2 fine-tuning step should follow the main Phase 04 slice, without reopening the core ownership/fencing direction + +## 1. Why This Note Exists + +`Phase 04` has already produced strong progress on the first standalone V2 slice: + +- per-replica sender identity +- one active recovery session per replica per epoch +- endpoint / epoch invalidation +- sender-owned execution APIs +- explicit recovery outcome branching +- minimal historical-data prototype + +This is good progress and should continue. + +However, recent review and discussion show that the next risk is no longer: + +- ownership ambiguity +- stale completion acceptance +- scattered local recovery authority + +The next risk is different: + +- `CatchUp` may become too broad, too long-lived, and too resource-heavy +- simulator proof is still weaker than desired on crash-consistency and recoverability boundaries +- the project may accidentally carry V1.5-style "keep trying to catch up" assumptions into V2 engine work + +So this note proposes: + +- **do not interrupt the main Phase 04 work** +- **do not reopen core V2 ownership/fencing architecture** +- **add a narrow fine-tuning step immediately after Phase 04 main closure** + +This note is for the dev manager to decide implementation sequencing. + +## 2. 
Current Basis + +This proposal is grounded in the following current documents: + +- `sw-block/.private/phase/phase-04.md` +- `sw-block/design/v2-prototype-roadmap-and-gates.md` +- `sw-block/design/v2-acceptance-criteria.md` +- `sw-block/design/v2-detailed-algorithm.zh.md` + +In particular: + +- `phase-04.md` shows that Phase 04 is correctly centered on sender/session ownership and recovery execution authority +- `v2-prototype-roadmap-and-gates.md` shows that design proof is high, but data/recovery proof and prototype end-to-end proof are still low +- `v2-acceptance-criteria.md` already requires stronger proof for: + - `A5` non-convergent catch-up escalation + - `A6` explicit recoverability boundary + - `A7` historical correctness + - `A8` durability-mode correctness +- `v2-detailed-algorithm.zh.md` Section 17 now argues for a direction tightening: + - keep the V2 core + - narrow `CatchUp` + - elevate `Rebuild` + - defer higher-complexity expansion + +## 3. Main Judgment + +### 3.1 What should NOT change + +The following V2 core should remain stable: + +- `CommittedLSN` as the external safe boundary +- durable progress as sync truth +- one sender per replica +- one active recovery session per replica per epoch +- stale epoch / stale endpoint / stale session fencing +- explicit `ZeroGap / CatchUp / NeedsRebuild` + +This is the architecture that most clearly separates V2 from V1.5. + +### 3.2 What SHOULD be fine-tuned + +The following should be tightened before engine planning: + +1. `CatchUp` should be narrowed to a short-gap, bounded, budgeted path +2. `Rebuild` should be treated as a formal primary recovery path, not only a fallback embarrassment +3. `recover -> keepup` handoff should be made more explicit +4. simulator should prove recoverability and crash-consistency more directly + +## 4. 
Algorithm Thinking Behind The Fine-Tune + +This section summarizes the reasoning already captured in: + +- `sw-block/design/v2-detailed-algorithm.zh.md` + +Especially Section 17: + +- `V2` is still the right direction +- but V2 should be tightened from: + - "make WAL recovery increasingly smart" + - to: + - "make block truth boundaries hard, keep `CatchUp` cheap and bounded, and use formal `Rebuild` when recovery becomes too complex" + +### 4.1 First-principles view + +From block first principles, the hardest truths are: + +1. when `write` becomes real +2. what `flush/fsync ACK` truly promises +3. whether acknowledged boundaries survive failover +4. how replicas rejoin without corrupting lineage + +These are more fundamental than: + +- volume product shape +- control-plane surface +- recovery cleverness for its own sake + +So the project should optimize for: + +- clearer truth boundaries +- not for maximal catch-up cleverness + +### 4.2 Mayastor-style product insight + +The useful first-principles lesson from Mayastor-like product thinking is: + +- not every lagging replica is worth indefinite low-cost chase +- `Rebuild` can be a formal product path, not a shameful fallback +- block products benefit from explicit lifecycle objects and formal rebuild flow + +This does NOT replace the V2 core concerns: + +- `flush ACK` truth +- committed-prefix failover safety +- stale authority fencing + +But it does suggest a correction: + +- do not let `CatchUp` become an over-smart general answer to all recovery + +### 4.3 Proposed V2 fine-tuned interpretation + +The fine-tuned interpretation of V2 should be: + +- `CatchUp` is for short-gap, clearly recoverable, bounded recovery +- `Rebuild` is for long-gap, high-cost, unstable, or non-convergent recovery +- recovery session is a bounded contract, not a long-running rescue thread +- `> H0` live WAL must not silently turn one recovery session into an endless chase + +## 5. 
Specific Fine-Tune Adjustments + +### 5.1 Narrow `CatchUp` + +`CatchUp` should explicitly require: + +- short outage +- bounded target `H0` +- clear recoverability +- bounded reservation +- bounded time +- bounded resource cost +- bounded convergence expectation + +`CatchUp` should explicitly stop when: + +- target drifts too long without convergence +- replay progress stalls +- recoverability proof is lost +- retention cost becomes unreasonable +- session budget expires + +### 5.2 Elevate `Rebuild` + +`Rebuild` should be treated as a first-class path when: + +- lag is too large +- catch-up does not converge +- recoverability is no longer stable +- complexity of continued catch-up exceeds its product value + +The intended model becomes: + +- short gap -> `CatchUp` +- long gap / unstable / non-convergent -> `Rebuild` + +This should be interpreted more strictly than a simple routing rule: + +- `CatchUp` is not a general recovery framework +- `CatchUp` is a relaxed form of `KeepUp` +- it should stay limited to short-gap, bounded, clearly recoverable WAL replay +- it only makes sense while the replica's current base is still trustworthy enough to continue from + +By contrast: + +- `Rebuild` is the more general recovery framework +- it restores the replica from a trusted base toward a frozen target boundary +- `full rebuild` and `partial rebuild` are not different protocols; they are different base/transfer choices under the same rebuild contract + +So the intended product shape is: + +- use `CatchUp` when replay debt is small and clearly cheaper than rebuild +- use `Rebuild` when correctness, boundedness, or product simplicity would otherwise be compromised + +And the correctness anchor for both `full` and `partial` rebuild should remain explicit: + +- freeze `TargetLSN` +- pin the snapshot/base used for recovery +- only then optimize transfer volume using `snapshot + tail`, `bitmap`, or similar mechanisms + +### 5.3 Clarify `recover -> keepup` handoff + +Phase 04 
already aims to prove a clean handoff between normal sender and recovery session. + +The fine-tune should make the next step more explicit: + +- one recovery session only owns `(R, H0]` +- session completion releases recovery debt +- replica should not silently stay in "quasi-recovery" +- re-entry to `KeepUp` / `InSync` should remain explicit, ideally with `PromotionHold` or equivalent stabilization logic + +### 5.4 Keep Smart WAL deferred + +No fine-tune should broaden Smart WAL scope at this point. + +Reason: + +- Smart WAL multiplies recoverability, GC, payload-availability, and reservation complexity +- the current priority is to harden the simpler V2 replication contract first + +So the rule remains: + +- no Smart WAL expansion beyond what minimal proof work might later require + +## 6. Simulation Strengthening Requirements + +This is the highest-value part of the fine-tune. + +Current simulator strength is already good on: + +- epoch fencing +- stale traffic rejection +- promotion candidate rules +- ownership / session invalidation +- basic `CatchUp / NeedsRebuild` classification + +Current simulator weakness is still significant on: + +- crash-consistency around extent / checkpoint / replay boundaries +- `ACK` boundary versus recoverable boundary +- `CatchUp` liveness / convergence + +### 6.1 Required new modeling direction + +The simulator should stop collapsing these states together: + +- received but not durable +- WAL durable but not yet fully materialized +- extent-visible but not yet checkpoint-safe +- checkpoint-safe base image +- restart-recoverable read state + +Suggested explicit storage-state split: + +- `ReceivedLSN` +- `WALDurableLSN` +- `ExtentAppliedLSN` +- `CheckpointLSN` +- `RecoverableLSNAfterRestart` + +### 6.2 Required new invariants + +The simulator should explicitly check at least: + +1. `AckedFlushLSN <= RecoverableLSNAfterRestart` +2. visible state must have recoverable backing +3. 
`CatchUp` cannot remain non-convergent indefinitely +4. promotion candidate must still possess recoverable committed prefix + +### 6.3 Required new scenario classes + +Priority scenarios to add: + +1. `ExtentAheadOfCheckpoint_CrashRestart_ReadBoundary` +2. `AckedFlush_MustBeRecoverableAfterCrash` +3. `UnackedVisibleExtent_MustNotSurviveAsCommittedTruth` +4. `CatchUpChasingMovingHead_EscalatesOrConverges` +5. `CheckpointGCBreaksRecoveryProof` + +### 6.4 Required simulator style upgrade + +The simulator should move beyond only hand-authored examples and also support: + +- dangerous-state predicates +- adversarial random exploration guided by those predicates + +Examples: + +- `acked_flush_lost` +- `extent_exposes_unrecoverable_state` +- `catchup_livelock` +- `rebuild_required_but_not_escalated` + +## 7. Relationship To Acceptance Criteria + +This fine-tune is not a separate architecture line. + +It is mainly intended to make the project satisfy the existing acceptance set more convincingly: + +- `A5` explicit escalation from non-convergent catch-up +- `A6` recoverability boundary as a real rule, not hopeful policy +- `A7` historical correctness against snapshot + tail rebuild +- `A8` strict durability mode semantics + +So this fine-tune is a strengthening of the current V2 proof path, not a new branch. + +## 8. Recommended Sequencing + +### Option A: pause Phase 04 and reopen design now + +Not recommended. + +Why: + +- Phase 04 has strong momentum +- its core ownership/fencing work is correct +- pausing it now would blur scope and waste recent closure + +### Option B: finish Phase 04, then add a narrow `4.5` + +Recommended. + +Why: + +- Phase 04 can finish its intended ownership / orchestration / minimal-history closure +- `4.5` can then tighten recovery strategy without destabilizing the slice +- the project avoids carrying "too-smart catch-up" assumptions into later engine planning + +Recommended sequence: + +1. finish Phase 04 main closure +2. 
immediately start `Phase 4.5` +3. use `4.5` to tighten: + - bounded `CatchUp` + - formal `Rebuild` + - crash-consistency and recoverability simulator proof +4. then re-evaluate Gate 4 / Gate 5 + +## 9. Scope Of A Possible Phase 4.5 + +If the dev manager chooses to implement a `4.5` step, its scope should be: + +### In scope + +- tighten algorithm wording and boundaries from `v2-detailed-algorithm.zh.md` +- formalize bounded `CatchUp` +- formalize `Rebuild` as first-class path +- strengthen simulator state model and invariants +- add targeted crash-consistency and liveness scenarios +- improve prototype traceability against `A5-A8` + +### Out of scope + +- Smart WAL expansion +- real storage engine redesign +- V1 production integration +- frontend/wire protocol +- performance optimization as primary goal + +## 10. Decision Requested From Dev Manager + +Please decide: + +1. whether `Phase 04` should continue to normal closure without interruption +2. whether a narrow `Phase 4.5` should immediately follow +3. whether the simulator strengthening work should be treated as mandatory for Gate 4 / Gate 5 credibility + +Recommended decision: + +- **Yes**: finish `Phase 04` +- **Yes**: add `Phase 4.5` as a bounded fine-tuning step +- **Yes**: treat crash-consistency / recoverability / liveness simulator strengthening as required, not optional + +## 11. Bottom Line + +The project does not need a new direction. 
+ +It needs: + +- a slightly tighter interpretation of V2 +- a stronger recoverability/crash-consistency simulator +- a clearer willingness to use formal `Rebuild` instead of over-extending `CatchUp` + +So the practical recommendation is: + +- **keep the V2 core** +- **finish Phase 04** +- **add a narrow Phase 4.5** +- **strengthen simulator proof before engine planning** diff --git a/sw-block/.private/phase/phase-4.5.md b/sw-block/.private/phase/phase-4.5.md new file mode 100644 index 000000000..50e2469a4 --- /dev/null +++ b/sw-block/.private/phase/phase-4.5.md @@ -0,0 +1,356 @@ +# Phase 4.5 + +Date: 2026-03-29 +Status: complete +Purpose: harden Gate 4 / Gate 5 credibility after Phase 04 by tightening bounded `CatchUp`, elevating `Rebuild` as a first-class path, and strengthening crash-consistency / recoverability proof + +## Related Plan + +Strategic phase: + +- `sw-block/.private/phase/phase-4.5.md` + +Simulator implementation plan: + +- `learn/projects/sw-block/design/phase-05-crash-consistency-simulation.md` + +Use them together: + +- `Phase 4.5` defines the gate-hardening purpose and priorities +- `phase-05-crash-consistency-simulation.md` is the detailed simulator implementation plan + +## Why This Phase Exists + +Phase 04 has already established: + +1. per-replica sender identity +2. one active recovery session per replica per epoch +3. stale authority fencing +4. sender-owned execution APIs +5. assignment-intent orchestration +6. minimal historical-data prototype +7. prototype scenario closure + +The next risk is no longer ownership structure. + +The next risk is: + +1. `CatchUp` becoming too broad, too long-lived, or too optimistic +2. `Rebuild` remaining underspecified even though it will likely become a common path +3. simulator proof still being weaker than desired on crash-consistency and restart-recoverability + +So `Phase 4.5` exists to harden the decision gate before real engine planning. 
+ +## Relationship To Phase 04 + +`Phase 4.5` is not a new architecture line. + +It is a narrow hardening step after normal Phase 04 closure. + +It should: + +- keep the V2 core +- not reopen sender/session ownership architecture +- strengthen recovery boundaries and proof quality + +## Main Questions + +1. how narrow should `CatchUp` be? +2. when must recovery escalate to `Rebuild`? +3. what exactly is the `Rebuild` source of truth? +4. what does restart-recoverable / crash-consistent state mean in the simulator? + +## Core Decisions To Drive + +### 1. Bounded CatchUp + +`CatchUp` should be explicitly bounded by: + +1. target range +2. retention proof +3. time budget +4. progress budget +5. resource budget + +It should stop and escalate when: + +1. target drifts too long +2. progress stalls +3. recoverability proof is lost +4. retention cost becomes unreasonable +5. session budget expires + +### 2. Rebuild Is First-Class + +`Rebuild` is not an embarrassment path. + +It is the formal path for: + +1. long gap +2. unstable recoverability +3. non-convergent catch-up +4. excessive replay cost +5. restart-recoverability uncertainty + +### 3. Rebuild Source Model + +To address the concern that tightening `CatchUp` makes `Rebuild` too dominant: + +`Rebuild` should be split conceptually into two modes: + +1. **Snapshot + Tail** + - preferred path + - use a dated but internally consistent base snapshot/checkpoint + - then apply retained WAL tail up to the committed recovery boundary + +2. 
**Full Base Rebuild** + - fallback path + - used when no acceptable snapshot/base image exists + - more expensive and slower + +Decision boundary: + +- use `Snapshot + Tail` when a trusted snapshot/checkpoint/base exists that covers the required base state +- use `Full Base Rebuild` when no such trusted base exists + +So "rebuild" should not mean only: + +- copy everything from scratch + +It should usually mean: + +- re-establish a trustworthy base image +- then catch up from that base to the committed boundary + +This keeps `Rebuild` practical even if `CatchUp` becomes narrower. + +### 4. Safe Recovery Truth + +The simulator should explicitly separate: + +1. `ReceivedLSN` +2. `WALDurableLSN` +3. `ExtentAppliedLSN` +4. `CheckpointLSN` +5. `RecoverableLSNAfterRestart` + +This is needed so that: + +- `ACK` truth +- visible-state truth +- crash-restart truth + +do not collapse into one number. + +## Priority + +### P0 + +1. document bounded `CatchUp` rule +2. document `Rebuild` modes: + - snapshot + tail + - full base rebuild +3. define escalation conditions from `CatchUp` to `Rebuild` + +Status: + +- accepted on both prototype and simulator sides +- prototype: bounded `CatchUp` is semantic, target-frozen, budget-enforced, and rebuild is a sender-owned exclusive path +- simulator: crash-consistency state split, checkpoint-safe restart boundary, and core invariants are in place + +### P1 + +4. strengthen simulator state model with crash-consistency split: + - `ReceivedLSN` + - `WALDurableLSN` + - `ExtentAppliedLSN` + - `CheckpointLSN` + - `RecoverableLSNAfterRestart` + +5. add explicit invariants: + - `AckedFlushLSN <= RecoverableLSNAfterRestart` + - visible state must have recoverable backing + - promotion candidate must possess recoverable committed prefix + +Status: + +- accepted on the simulator side +- remaining work is no longer basic state split; it is stronger traceability and adversarial exploration + +### P2 + +6. 
add targeted scenarios: + - `ExtentAheadOfCheckpoint_CrashRestart_ReadBoundary` + - `AckedFlush_MustBeRecoverableAfterCrash` + - `UnackedVisibleExtent_MustNotSurviveAsCommittedTruth` + - `CatchUpChasingMovingHead_EscalatesOrConverges` + - `CheckpointGCBreaksRecoveryProof` + +Status: + +- baseline targeted scenarios accepted +- predicate-guided/adversarial exploration remains open + +### P3 + +7. make prototype traceability stronger for: + - `A5` + - `A6` + - `A7` + - `A8` + +8. decide whether Gate 4 / Gate 5 are now credible enough for engine planning + +Status: + +- partially complete +- Gate 4 / Gate 5 are materially stronger +- remaining work is to make `A5-A8` double evidence more explicit and reviewable + +## Scope + +### In scope + +1. bounded `CatchUp` +2. first-class `Rebuild` +3. snapshot + tail rebuild model +4. crash-consistency simulator state split +5. targeted liveness / recoverability scenarios + +### Out of scope + +1. Smart WAL expansion +2. V1 production integration +3. backend/storage engine redesign +4. performance optimization as primary goal +5. frontend/wire protocol work + +## Exit Criteria + +`Phase 4.5` is done when: + +1. `CatchUp` budget / escalation rule is explicit in docs and simulator +2. `Rebuild` is explicitly modeled as: + - snapshot + tail preferred + - full base rebuild fallback +3. simulator has explicit crash-consistency state split +4. simulator has targeted crash / liveness scenarios for the listed risks +5. acceptance items `A5-A8` have stronger executable proof, ideally with explicit prototype + simulator evidence pairs +6. we can make a more credible decision on: + - real V2 engine planning + - or `V2.5` correction + +## Review Gates + +These are explicit review gates for `Phase 4.5`. + +### Gate 1: Bounded CatchUp Must Be Semantic + +It is not enough to add budget fields in docs or structs. + +To count as complete: + +1. timeout / budget exceed must force exit +2. moving-head chase must not continue indefinitely +3. 
escalation to `NeedsRebuild` must be explicit +4. tests must prove those behaviors + +### Gate 2: State Split Must Change Decisions + +It is not enough to add more state names. + +To count as complete, the new crash-consistency state split must materially change: + +1. `ACK` legality +2. restart recoverability judgment +3. visible-state legality +4. promotion-candidate legality + +### Gate 3: A5-A8 Need Double Evidence + +It is not enough for only prototype or only simulator to cover them. + +To count as complete, each of: + +- `A5` +- `A6` +- `A7` +- `A8` + +should have: + +1. one prototype-side evidence path +2. one simulator-side evidence path + +## Scope Discipline + +`Phase 4.5` must remain a bounded gate-hardening phase. + +It should stay focused on: + +1. tightening boundaries +2. strengthening proof +3. clearing the path for engine planning + +It should not turn into a broad new feature-expansion phase. + +## Current Status Summary + +Accepted now: + +1. `sw` `Phase 4.5 P0` + - bounded `CatchUp` is semantic, not documentary + - `FrozenTargetLSN` is a real session invariant + - `Rebuild` is an exclusive sender-owned execution path +2. `tester` crash-consistency simulator strengthening + - checkpoint/restart boundary is explicit + - recoverability is no longer a single collapsed watermark + - core crash-consistency invariants are executable + +Open now: + +1. low-priority cleanup such as redundant frozen-target bookkeeping fields + +Completed since initial approval: + +1. `A5-A8` explicit double-evidence traceability materially strengthened +2. predicate exploration / adversarial search added on simulator side +3. crash-consistency random/adversarial search found and helped fix a real `StateAt(lsn)` historical-state bug + +## Assignment For `sw` + +Focus: prototype/control-path formalization + +Completed work: + +1. updated prototype traceability for: + - `A5` + - `A6` + - `A7` + - `A8` +2. 
made rebuild-source decision evidence explicit in prototype tests: + - snapshot + tail chosen only when trusted base exists + - full base chosen when it does not +3. added focused prototype evidence grouping for engine-planning review + +Remaining optional cleanup: + +4. optionally clean low-priority redundancy: + - `TargetLSNAtStart` if superseded by `FrozenTargetLSN` + +## Assignment For `tester` + +Focus: simulator/crash-consistency proof + +Completed work: + +1. wired simulator-side evidence explicitly into acceptance traceability for: + - `A5` + - `A6` + - `A7` + - `A8` +2. added predicate exploration / adversarial search around the new crash-consistency model +3. added danger predicates for major failure classes: + - acked flush lost + - visible unrecoverable state + - catch-up livelock / rebuild-required-but-not-escalated diff --git a/sw-block/design/README.md b/sw-block/design/README.md index a1ee51100..b17972064 100644 --- a/sw-block/design/README.md +++ b/sw-block/design/README.md @@ -1,6 +1,9 @@ # V2 Design Current WAL V2 design set: +- `v2-algorithm-overview.md` +- `v2-algorithm-overview.zh.md` +- `v2-detailed-algorithm.zh.md` - `wal-replication-v2.md` - `wal-replication-v2-state-machine.md` - `wal-replication-v2-orchestrator.md` @@ -15,12 +18,25 @@ Current WAL V2 design set: - `v2-open-questions.md` - `v2-first-slice-session-ownership.md` - `v2-prototype-roadmap-and-gates.md` +- `v2-engine-readiness-review.md` +- `v2-engine-slicing-plan.md` +- `v2-protocol-truths.md` +- `v2-production-roadmap.md` +- `phase-07-service-slice-plan.md` +- `agent_dev_process.md` These documents are the working design home for the V2 line. The original project-level copies under `learn/projects/sw-block/design/` remain as shared references for now. 
Execution note: -- active development tracking for the current simulator phase lives under: +- active development tracking lives under `../.private/phase/` +- key completed/current phase docs include: - `../.private/phase/phase-01.md` - `../.private/phase/phase-02.md` + - `../.private/phase/phase-03.md` + - `../.private/phase/phase-04.md` + - `../.private/phase/phase-4.5.md` + - `../.private/phase/phase-05.md` + - `../.private/phase/phase-06.md` + - `../.private/phase/phase-07.md` diff --git a/sw-block/design/a5-a8-traceability.md b/sw-block/design/a5-a8-traceability.md new file mode 100644 index 000000000..aa3b1626d --- /dev/null +++ b/sw-block/design/a5-a8-traceability.md @@ -0,0 +1,117 @@ +# A5-A8 Acceptance Traceability + +Date: 2026-03-29 +Status: Phase 4.5 evidence-hardening + +## Purpose + +Map each acceptance criterion to specific executable evidence. +Two evidence layers: +- **Simulator** (distsim): protocol-level proof +- **Prototype** (enginev2): ownership/session-level proof + +--- + +## A5: Non-Convergent Catch-Up Escalates Explicitly + +**Must prove**: tail-chasing or failed catch-up does not pretend success. + +**Pass condition**: explicit `CatchingUp → NeedsRebuild` transition. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Tail-chasing converges or aborts | `TestS6_TailChasing_ConvergesOrAborts` | `cluster_test.go` | distsim | PASS | +| Tail-chasing non-convergent → NeedsRebuild | `TestS6_TailChasing_NonConvergent_EscalatesToNeedsRebuild` | `phase02_advanced_test.go` | distsim | PASS | +| Catch-up timeout → NeedsRebuild | `TestP03_CatchupTimeout_EscalatesToNeedsRebuild` | `phase03_timeout_test.go` | distsim | PASS | +| Reservation expiry aborts catch-up | `TestReservationExpiryAbortsCatchup` | `cluster_test.go` | distsim | PASS | +| Flapping budget exceeded → NeedsRebuild | `TestP02_S5_FlappingExceedsBudget_EscalatesToNeedsRebuild` | `phase02_advanced_test.go` | distsim | PASS | +| Catch-up converges or escalates (I3) | `TestI3_CatchUpConvergesOrEscalates` | `phase045_crash_test.go` | distsim | PASS | +| Catch-up timeout in enginev2 | `TestE2E_NeedsRebuild_Escalation` | `p2_test.go` | enginev2 | PASS | + +**Verdict**: A5 is well-covered. Both simulator and prototype prove explicit escalation. No pretend-success path exists. + +--- + +## A6: Recoverability Boundary Is Explicit + +**Must prove**: recoverable vs unrecoverable gap is decided explicitly. + +**Pass condition**: recovery aborts when reservation/payload availability is lost; rebuild is explicit fallback. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Reservation expiry aborts catch-up | `TestReservationExpiryAbortsCatchup` | `cluster_test.go` | distsim | PASS | +| WAL GC beyond replica → NeedsRebuild | `TestI5_CheckpointGC_PreservesAckedBoundary` | `phase045_crash_test.go` | distsim | PASS | +| Rebuild from snapshot + tail | `TestReplicaRebuildFromSnapshotAndTail` | `cluster_test.go` | distsim | PASS | +| Smart WAL: resolvable → unresolvable | `TestP02_SmartWAL_RecoverableThenUnrecoverable` | `phase02_advanced_test.go` | distsim | PASS | +| Time-varying payload availability | `TestP02_SmartWAL_TimeVaryingAvailability` | `phase02_advanced_test.go` | distsim | PASS | +| RecoverableLSN is replayability proof | `RecoverableLSN()` in `storage.go` | `storage.go` | distsim | Implemented | +| Handshake outcome: NeedsRebuild | `TestExec_HandshakeOutcome_NeedsRebuild_InvalidatesSession` | `execution_test.go` | enginev2 | PASS | + +**Verdict**: A6 is covered. Recovery boundary is decided by explicit reservation + recoverability check, not by optimistic assumption. `RecoverableLSN()` verifies contiguous WAL coverage. + +--- + +## A7: Historical Data Correctness Holds + +**Must prove**: recovered data for target LSN is historically correct; current extent cannot fake old history. + +**Pass condition**: snapshot + tail rebuild matches reference; current-extent reconstruction of old LSN fails correctness. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| Snapshot + tail matches reference | `TestReplicaRebuildFromSnapshotAndTail` | `cluster_test.go` | distsim | PASS | +| Historical state not reconstructable after GC | `TestA7_HistoricalState_NotReconstructableAfterGC` | `phase045_crash_test.go` | distsim | PASS | +| `CanReconstructAt()` rejects faked history | `CanReconstructAt()` in `storage.go` | `storage.go` | distsim | Implemented | +| Checkpoint does not leak applied state | `TestI2_CheckpointDoesNotLeakAppliedState` | `phase045_crash_test.go` | distsim | PASS | +| Extent-referenced resolvable records | `TestExtentReferencedResolvableRecordsAreRecoverable` | `cluster_test.go` | distsim | PASS | +| Extent-referenced unresolvable → rebuild | `TestExtentReferencedUnresolvableForcesRebuild` | `cluster_test.go` | distsim | PASS | +| ACK'd flush recoverable after crash (I1) | `TestI1_AckedFlush_RecoverableAfterPrimaryCrash` | `phase045_crash_test.go` | distsim | PASS | + +**Verdict**: A7 is now covered with the Phase 4.5 crash-consistency additions. The critical gap ("current extent cannot fake old history") is proven by `CanReconstructAt()` + `TestA7_HistoricalState_NotReconstructableAfterGC`. + +--- + +## A8: Durability Mode Semantics Are Correct + +**Must prove**: best_effort, sync_all, sync_quorum behave as intended under mixed replica states. + +**Pass condition**: sync_all strict, sync_quorum commits only with true durable quorum, invalid topology rejected. 
+ +| Evidence | Test | File | Layer | Status | +|----------|------|------|-------|--------| +| sync_quorum continues with one lagging | `TestSyncQuorumContinuesWithOneLaggingReplica` | `cluster_test.go` | distsim | PASS | +| sync_all blocks with one lagging | `TestSyncAllBlocksWithOneLaggingReplica` | `cluster_test.go` | distsim | PASS | +| sync_quorum mixed states | `TestSyncQuorumWithMixedReplicaStates` | `cluster_test.go` | distsim | PASS | +| sync_all mixed states | `TestSyncAllBlocksWithMixedReplicaStates` | `cluster_test.go` | distsim | PASS | +| Barrier timeout: sync_all blocked | `TestP03_BarrierTimeout_SyncAll_Blocked` | `phase03_timeout_test.go` | distsim | PASS | +| Barrier timeout: sync_quorum commits | `TestP03_BarrierTimeout_SyncQuorum_StillCommits` | `phase03_timeout_test.go` | distsim | PASS | +| Promotion uses RecoverableLSN | `EvaluateCandidateEligibility()` | `cluster.go` | distsim | Implemented | +| Promoted replica has committed prefix (I4) | `TestI4_PromotedReplica_HasCommittedPrefix` | `phase045_crash_test.go` | distsim | PASS | + +**Verdict**: A8 is well-covered. sync_all is strict (blocks on lagging), sync_quorum uses true durable quorum (not connection count). Promotion now uses `RecoverableLSN()` for committed-prefix check. + +--- + +## Summary + +| Criterion | Simulator Evidence | Prototype Evidence | Status | +|-----------|-------------------|-------------------|--------| +| A5 (catch-up escalation) | 6 tests | 1 test | **Strong** | +| A6 (recoverability boundary) | 6 tests + RecoverableLSN() | 1 test | **Strong** | +| A7 (historical correctness) | 7 tests + CanReconstructAt() | — | **Strong** (new in Phase 4.5) | +| A8 (durability modes) | 7 tests + RecoverableLSN() | — | **Strong** | + +**Total executable evidence**: 26 simulator tests + 2 prototype tests + 2 new storage methods. + +All A5-A8 acceptance criteria have direct test evidence. No criterion depends solely on design-doc claims. 
+ +--- + +## Still Open (Not Blocking) + +| Item | Priority | Why not blocking | +|------|----------|-----------------| +| Predicate exploration / adversarial search | P2 | Manual scenarios already cover known failure classes | +| Catch-up convergence under sustained load | P2 | I3 proves escalation; load-rate modeling is optimization | +| A5-A8 in a single grouped runner view | P3 | Traceability doc serves as grouped evidence for now | diff --git a/sw-block/design/agent_dev_process.md b/sw-block/design/agent_dev_process.md new file mode 100644 index 000000000..659506d6e --- /dev/null +++ b/sw-block/design/agent_dev_process.md @@ -0,0 +1,304 @@ +# Agent Development Process + +Date: 2026-03-30 +Status: active +Purpose: define the working split between `sw`, `tester`, and review/management roles so each phase and slice has a clear delivery path + +## Why This Exists + +The project is now beyond pure exploration. + +The expensive part is no longer only writing code. +The expensive part is: + +1. delivery +2. review +3. fixes +4. re-review + +So the process must reduce repeated full-stack review and make each role responsible for a distinct layer. + +## Roles + +### `manager` + +Primary role: + +- phase/plan owner + +Responsibilities: + +1. define the phase/slice direction and scope +2. accept the planning package before coding starts +3. decide whether a carry-forward is acceptable or must become a gate +4. perform the final round review for overall logic, omissions, and product-path fit + +### `architect` + +Primary role: + +- plan and technical reviewer + +Responsibilities: + +1. review the plan before implementation starts +2. tighten algorithm wording, scope edges, and expectation framing +3. review technical correctness during implementation +4. review API/state/resource/fail-closed behavior +5. catch semantic drift, scope drift, and V1/V1.5 leakage + +### `sw` + +Primary role: + +- implementation owner + +Responsibilities: + +1. implement the accepted slice +2. 
state changed contracts +3. state fail-closed handling +4. state resources acquired/released +5. state carry-forward items +6. add or update tests + +### `tester` + +Primary role: + +- evidence owner + +Responsibilities: + +1. define what the slice must prove before implementation starts +2. maintain the failure-class checklist +3. define reject conditions and required test level +4. confirm that implementation claims are actually covered by evidence + +## Default Routine + +Each slice should follow this order: + +1. `manager` defines the plan direction +2. `architect` reviews and tightens the plan / algorithm / expectation framing +3. `tester` writes the expectation template +4. `manager` accepts the package and records it in the phase docs +5. `sw` implements and submits with the delivery template +6. `architect` reviews the technical layer until clean enough +7. `tester` performs validation and evidence closure +8. `manager` performs round-two review for overall logic and omissions + +Urgent exception: + +- if early work already shows major scope drift, protocol contradiction, or V1/V1.5 leakage, architecture review may short-circuit before implementation grows further + +## Delivery Template For `sw` + +Each delivery should include: + +1. changed contracts +2. fail-closed handling added +3. resources acquired/released +4. test inventory +5. known carry-forward notes + +This template is required between: + +1. implementation +2. implementation/fail-closed review + +It should accompany the delivery before reviewers start detailed review. + +Suggested format: + +```md +Changed contracts: +- ... + +Fail-closed handling: +- ... + +Resources acquired/released: +- ... + +Test inventory: +- ... + +Carry-forward notes: +- ... +``` + +## Phase Doc Usage + +Use the three phase documents differently: + +### `phase-xx.md` + +Use for: + +1. current execution direction +2. current scope +3. current guardrails +4. current accepted status +5. 
current assignments + +Keep it short and execution-oriented. + +### `phase-xx-log.md` + +Use for: + +1. detailed planning evolution +2. review feedback +3. carry-forward discussion +4. open observations +5. why wording or scope changed + +This document may be longer and more detailed. + +### `phase-xx-decisions.md` + +Use for: + +1. durable phase-level decisions +2. accepted boundaries that later rounds should inherit +3. gate decisions +4. decisions that should not be re-argued without new evidence + +This document should stay compact and hold only the more important global decisions. + +## Expectation Template For `tester` + +Before or at slice start, `tester` should define: + +1. must-pass expectations +2. failure-class checklist +3. required test level for each behavior +4. reject conditions + +`tester` should re-engage after technical review is mostly clean, to confirm final evidence closure before the manager's second-round review. + +Suggested format: + +```md +Expectation: +- ... + +Required level: +- entry path / engine / unit + +Reject if: +- ... + +Failure classes covered: +- ... +``` + +## Review Checklist For `architect` + +Review these first: + +1. nil handling +2. missing-resource handling +3. wrong-state / wrong-kind rejection +4. stale ID / stale authority rejection +5. resource pin / release symmetry +6. plan/execute/complete argument correctness +7. fail-closed cleanup on partial failure + +## Failure-Class Checklist + +This checklist should be kept active across phases. + +Minimum recurring classes: + +1. changed-address restart +2. stale epoch / stale session +3. missing resource pin +4. cleanup after failed plan +5. replay range mis-derived +6. false trusted-base selection +7. truncation missing but completion attempted +8. bounded catch-up not escalating + +## Process Rules + +### Rule 1: Do not wait until the end to define proof + +Each slice should begin with a statement of: + +1. what must be proven +2. 
which failure classes must stay closed + +### Rule 2: Do not let convenience wrappers silently become model truth + +Any convenience flow must be explicitly classified as: + +1. test-only convenience +2. stepwise engine task +3. planner/executor split + +### Rule 3: Prefer evidence quality over object growth + +New work should preferentially improve: + +1. traceability +2. diagnosability +3. failure-class closure +4. adapter contracts + +not just add: + +1. more structs +2. more states +3. more helper APIs + +### Rule 4: Use V1 as validation source, not architecture template + +Use: + +1. `learn/projects/sw-block/` +2. `weed/storage/block*` + +for: + +1. constraints +2. failure gates +3. implementation reality + +Do not use them as the default V2 architecture template. + +### Rule 5: Reuse reality, not inherited semantics + +When later implementation reuses existing `Seaweed` / `V1` paths: + +1. reuse control-plane reality +2. reuse storage/runtime reality +3. reuse execution mechanisms + +but do not silently inherit: + +1. address-shaped identity +2. old recovery classification semantics +3. old committed-truth assumptions +4. old failover authority assumptions + +Any such reuse should be reviewed explicitly as: + +1. safe reuse +2. reuse with explicit boundary +3. temporary carry-forward +4. hard gate before later phases + +## Current Direction + +The project has moved from exploration-heavy work to evidence-first engine work. + +From `Phase 06` onward, the default is: + +1. plan first +2. review plan before coding +3. implement +4. review technical layer +5. close evidence +6. 
do final manager review diff --git a/sw-block/design/phase-07-service-slice-plan.md b/sw-block/design/phase-07-service-slice-plan.md new file mode 100644 index 000000000..7e9a0a45a --- /dev/null +++ b/sw-block/design/phase-07-service-slice-plan.md @@ -0,0 +1,403 @@ +# Phase 07 Service-Slice Plan + +Date: 2026-03-30 +Status: draft +Scope: `Phase 07 P0` + +## Purpose + +Define the first real-system service slice that will host the V2 engine, choose the first concrete integration path in the existing codebase, and map engine adapters onto real modules. + +This is a planning document. It does not claim the integration already works. + +## Decision + +The first service slice should be: + +- a single `blockvol` primary on a real volume server +- with one replica target (`RF=2` path) +- driven by the existing master heartbeat / assignment loop +- using the V2 engine only for replication recovery ownership / planning / execution + +This is the narrowest real-system slice that still exercises: + +1. real assignment delivery +2. real epoch and failover signals +3. real volume-server lifecycle +4. real WAL/checkpoint/base-image truth +5. real changed-address / reconnect behavior + +It is narrow enough to avoid reopening the whole system, but real enough to stop hiding behind engine-local mocks. + +## Why This Slice + +This slice is the right first integration target because: + +1. `weed/server/master_grpc_server.go` already delivers block-volume assignments over heartbeat +2. `weed/server/master_block_failover.go` already owns failover / promotion / pending rebuild decisions +3. `weed/storage/blockvol/blockvol.go` already owns the current replication runtime (`shipperGroup`, receiver, WAL retention, checkpoint state) +4. the existing V1/V1.5 failure history is concentrated in exactly this master <-> volume-server <-> blockvol path + +So this slice gives maximum validation value with minimum new surface. 
+ +## First Concrete Integration Path + +The first integration path should be: + +1. master receives volume-server heartbeat +2. master updates block registry and emits `BlockVolumeAssignment` +3. volume server receives assignment +4. block volume adapter converts assignment + local storage state into V2 engine inputs +5. V2 engine drives sender/session/recovery state +6. existing block-volume runtime executes the actual data-path work under engine decisions + +In code, that path starts here: + +- master side: + - `weed/server/master_grpc_server.go` + - `weed/server/master_block_failover.go` + - `weed/server/master_block_registry.go` +- volume / storage side: + - `weed/storage/blockvol/blockvol.go` + - `weed/storage/blockvol/recovery.go` + - `weed/storage/blockvol/wal_shipper.go` + - assignment-handling code under `weed/storage/blockvol/` +- V2 engine side: + - `sw-block/engine/replication/` + +## Service-Slice Boundaries + +### In-process placement + +The V2 engine should initially live: + +- in-process with the volume server / `blockvol` runtime +- not in master +- not as a separate service yet + +Reason: + +- the engine needs local access to storage truth and local recovery execution +- master should remain control-plane authority, not recovery executor + +### Control-plane boundary + +Master remains authoritative for: + +1. epoch +2. role / assignment +3. promotion / failover decision +4. replica membership + +The engine consumes these as control inputs. It does not replace master failover policy in `Phase 07`. + +### Control-Over-Heartbeat Upgrade Path + +For the first V2 product path, the recommended direction is: + +- reuse the existing master <-> volume-server heartbeat path as the control carrier +- upgrade the block-specific control semantics carried on that path +- do not immediately invent a separate control service or assignment channel + +Why: + +1. this is the real Seaweed path already carrying block assignments and confirmations today +2. 
this gives the fastest route to a real integrated control path +3. it preserves compatibility with existing Seaweed master/volume-server semantics while V2 hardens its own control truth + +Concretely, the current V1 path already provides: + +1. block assignments delivered in heartbeat responses from `weed/server/master_grpc_server.go` +2. assignment application on the volume server in `weed/server/volume_grpc_client_to_master.go` and `weed/server/volume_server_block.go` +3. assignment confirmation and address-change refresh driven by later heartbeats in `weed/server/master_grpc_server.go` and `weed/server/master_block_registry.go` +4. immediate block heartbeat on selected shipper state changes in `weed/server/volume_grpc_client_to_master.go` + +What should be upgraded for V2 is not mainly the transport, but the control contract carried on it: + +1. stable `ReplicaID` +2. explicit `Epoch` +3. explicit role / assignment authority +4. explicit apply/confirm semantics +5. explicit stale assignment rejection +6. explicit address-change refresh as endpoint change, not identity change + +Current cadence note: + +- the block volume heartbeat is periodic (`5 * sleepInterval`) with some immediate state-change heartbeats +- this is acceptable as the first hardening carrier +- it should not be assumed to be the final control responsiveness model + +Deferred design decision: + +- whether block control should eventually move beyond heartbeat-only carriage into a more explicit control/assignment channel should be decided only after the `Phase 08 P1` real control-delivery path exists and can be measured + +That later decision should be based on: + +1. failover / reassignment responsiveness +2. assignment confirmation precision +3. operational complexity +4. 
whether heartbeat carriage remains too coarse for the block-control path + +Until then, the preferred direction is: + +- strengthen block control semantics over the existing heartbeat path +- do not prematurely create a second control plane + +### Storage boundary + +`blockvol` remains authoritative for: + +1. WAL head / retention reality +2. checkpoint/base-image reality +3. actual catch-up streaming +4. actual rebuild transfer / restore operations + +The engine consumes these as storage truth and recovery execution capabilities. It does not replace the storage backend in `Phase 07`. + +## First-Slice Identity Mapping + +This must be explicit in the first integration slice. + +For `RF=2` on the existing master / block registry path: + +- stable engine `ReplicaID` should be derived from: + - `<server-id>/<volume-id>` +- not from: + - `DataAddr` + - `CtrlAddr` + - heartbeat transport endpoint + +For this slice, the adapter should map: + +1. `ReplicaID` +- from master/block-registry identity for the replica host entry + +2. `Endpoint` +- from the current replica receiver/data/control addresses reported by the real runtime + +3. `Epoch` +- from the confirmed master assignment for the volume + +4. `SessionKind` +- from master-driven recovery intent / role transition outcome + +This is a hard first-slice requirement because address refresh must not collapse identity back into endpoint-shaped keys. + +## Adapter Mapping + +### 1. 
ControlPlaneAdapter + +Engine interface today: + +- `HandleHeartbeat(serverID, volumes)` +- `HandleFailover(deadServerID)` + +Real mapping should be: + +- master-side source: + - `weed/server/master_grpc_server.go` + - `weed/server/master_block_failover.go` + - `weed/server/master_block_registry.go` +- volume-server side sink: + - assignment receive/apply path in `weed/storage/blockvol/` + +Recommended real shape: + +- do not literally push raw heartbeat messages into the engine +- instead introduce a thin adapter that converts confirmed master assignment state into: + - stable `ReplicaID` + - endpoint set + - epoch + - recovery target kind + +That keeps master as control owner and the engine as execution owner. + +Important note: + +- the adapter should treat heartbeat as the transport carrier, not as the final protocol shape +- block-control semantics should be made explicit over that carrier +- if a later phase concludes that heartbeat-only carriage is too coarse, that should be a separate design decision after the real hardening path is measured + +### 2. StorageAdapter + +Engine interface today: + +- `GetRetainedHistory()` +- `PinSnapshot(lsn)` / `ReleaseSnapshot(pin)` +- `PinWALRetention(startLSN)` / `ReleaseWALRetention(pin)` +- `PinFullBase(committedLSN)` / `ReleaseFullBase(pin)` + +Real mapping should be: + +- retained history source: + - current WAL head/tail/checkpoint state from `weed/storage/blockvol/blockvol.go` + - recovery helpers in `weed/storage/blockvol/recovery.go` +- WAL retention pin: + - existing retention-floor / replica-aware WAL retention machinery around `shipperGroup` +- snapshot pin: + - existing snapshot/checkpoint artifacts in `blockvol` +- full-base pin: + - explicit pinned full-extent export or equivalent consistent base handle from `blockvol` + +Important constraint: + +- `Phase 07` must not fake this by reconstructing `RetainedHistory` from tests or metadata alone + +### 3. 
Execution Driver / Executor hookup + +Engine side already has: + +- planner/executor split in `sw-block/engine/replication/driver.go` +- stepwise executors in `sw-block/engine/replication/executor.go` + +Real mapping should be: + +- engine planner decides: + - zero-gap / catch-up / rebuild + - trusted-base requirement + - replayable-tail requirement +- blockvol runtime performs: + - actual WAL catch-up transport + - actual snapshot/base transfer + - actual truncation / apply operations + +Recommended split: + +- engine owns contract and state transitions +- blockvol adapter owns concrete I/O work + +## First-Slice Acceptance Rule + +For the first integration slice, this is a hard rule: + +- `blockvol` may execute recovery I/O +- `blockvol` must not own recovery policy + +Concretely, `blockvol` must not decide: + +1. zero-gap vs catch-up vs rebuild +2. trusted-base validity +3. replayable-tail sufficiency +4. whether rebuild fallback is required + +Those decisions must remain in the V2 engine. + +The bridge may translate engine decisions into concrete blockvol actions, but it must not re-decide recovery policy underneath the engine. + +## First Product Path + +The first product path should be: + +- `RF=2` block volume replication on the existing heartbeat/assignment loop +- primary + one replica +- failover / reconnect / changed-address handling +- rebuild as the formal non-catch-up recovery path + +This is the right first path because it exercises the core correctness boundary without introducing N-replica coordination complexity too early. + +## What Must Be Replaced First + +Current engine-stage pieces that are still mock/test-only or too abstract: + +### Replace first + +1. `mockStorage` in engine tests +- replace with a real `blockvol`-backed `StorageAdapter` + +2. synthetic control events in engine tests +- replace with assignment-driven events from the real master/volume-server path + +3. 
convenience recovery completion wrappers +- keep them test-only +- real integration should use planner + executor + storage work loop + +### Can remain temporarily abstract in Phase 07 P0/P1 + +1. `ControlPlaneAdapter` exact public shape +- can remain thin while the integration path is being chosen + +2. async production scheduler details +- executor can still be driven by a service loop before full background-task architecture is finalized + +## Recommended Concrete Modules + +### Engine stays here + +- `sw-block/engine/replication/` + +### First real adapter package should be added near blockvol + +Recommended initial location: + +- `weed/storage/blockvol/v2bridge/` + +Reason: + +- keeps V2 engine independent under `sw-block/` +- keeps real-system glue close to blockvol storage truth +- avoids copying engine logic into `weed/` + +Suggested contents: + +1. `control_adapter.go` +- convert master assignment / local apply path into engine intents + +2. `storage_adapter.go` +- expose retained history, pin/release, trusted-base export handles from real blockvol state + +3. `executor_bridge.go` +- translate engine executor steps into actual blockvol recovery actions + +4. `observe_adapter.go` +- map engine status/logs into service-visible diagnostics + +## First Failure Replay Set For Phase 07 + +The first real-system replay set should be: + +1. changed-address restart +- current risk: old identity/address coupling reappears in service glue + +2. stale epoch / stale result after failover +- current risk: master and engine disagree on authority timing + +3. unreplayable-tail rebuild fallback +- current risk: service glue over-trusts checkpoint/base availability + +4. plan/execution cleanup after resource failure +- current risk: blockvol-side resource failures leave engine or service state dangling + +5. 
primary failover to replica with rebuild pending on old primary reconnect +- current risk: old V1/V1.5 semantics leak back into reconnect handling + +## Non-Goals For This Slice + +Do not use `Phase 07` to: + +1. widen catch-up semantics +2. add smart rebuild optimizations +3. redesign all blockvol internals +4. replace the full V1 runtime in one move +5. claim production readiness + +## Deliverables For Phase 07 P0 + +A good `P0` delivery should include: + +1. chosen service slice +2. chosen integration path in the current repo +3. adapter-to-module mapping +4. list of test-only adapters to replace first +5. first failure replay set +6. explicit note of what remains outside this first slice + +## Short Form + +`Phase 07 P0` should start with: + +- engine in `sw-block/engine/replication/` +- bridge in `weed/storage/blockvol/v2bridge/` +- first real slice = blockvol primary + one replica on the existing master heartbeat / assignment path +- `ReplicaID = <server-id>/<volume-id>` for the first slice +- `blockvol` executes I/O but does not own recovery policy +- first product path = `RF=2` failover/reconnect/rebuild correctness diff --git a/sw-block/design/v2-algorithm-overview.md b/sw-block/design/v2-algorithm-overview.md new file mode 100644 index 000000000..8e88f6b65 --- /dev/null +++ b/sw-block/design/v2-algorithm-overview.md @@ -0,0 +1,686 @@ +# V2 Algorithm Overview + +Date: 2026-03-27 +Status: strategic design overview +Audience: CEO / owner / technical leadership + +## Purpose + +This document explains the current V2 direction for `sw-block`: + +- what V2 is trying to solve +- why V1 and V1.5 are not enough as the long-term architecture +- why a WAL-based design is still worth pursuing +- how V2 compares with major market and paper directions +- how simulation and the real test runner systematically build confidence + +This is not a phase report and not a production-commitment document. + +It is the high-level technical rationale for the V2 line. 
+ +## Relationship To Other Documents + +| Document | Role | +|----------|------| +| `v1-v15-v2-comparison.md` | Detailed comparison of the three lines | +| `v2-acceptance-criteria.md` | Protocol validation bar | +| `v2_scenarios.md` | Scenario backlog and simulator mapping | +| `v2-open-questions.md` | Remaining algorithmic questions | +| `protocol-development-process.md` | Method for protocol work | +| `learn/projects/sw-block/algorithm_overview.md` | Current V1/V1.5 system review | +| `learn/projects/sw-block/design/algorithm_survey.md` | Paper and vendor survey | +| `learn/projects/sw-block/test/README.md` | Real test runner overview | +| `learn/projects/sw-block/test/test-platform-review.md` | Test platform maturity and standalone direction | + +## 1. Executive Summary + +The current judgment is: + +- `V1` proved that the basic WAL-based replicated block model can work. +- `V1.5` materially improved real recovery behavior and now has stronger operational evidence on real hardware. +- `V2` exists because the next correctness problems should not be solved by incremental local fixes. They should be made explicit in the protocol itself. + +The central V2 idea is simple: + +- short-gap recovery should be explicit +- stale authority should be explicitly fenced +- catch-up vs rebuild should be an explicit decision +- recovery ownership should be a protocol object, not an implementation accident + +`V2` is not yet a production engine. But it is already the stronger architectural direction. + +The correct strategic posture today is: + +- continue `V1.5` as the production line +- continue `V2` as the long-term architecture line +- continue WAL investigation because we now have a serious validation framework +- if prototype evidence later shows a structural flaw, evolve to `V2.5` before heavy implementation + +## 2. 
The Real Problem V2 Tries To Solve + +At the frontend, a block service looks simple: + +- `write` +- `flush` / `sync` +- failover +- recovery + +But the real difficulty is not the frontend verb set. The real difficulty is the asynchronous distributed boundary between: + +- local WAL append on the primary +- durable progress on replicas +- client-visible commit / sync truth +- failover and promotion safety +- recovery after lag, restart, endpoint change, or timeout + +This is the root reason V2 exists. + +The project has already learned that correctness problems in block storage do not usually come from the happy path. They come from: + +- a replica going briefly down and coming back +- a replica coming back on a new address +- a delayed stale barrier or stale reconnect result +- a lagging node that is almost, but not quite, recoverable +- a failover decision made on insufficient lineage information + +V2 is the attempt to make those cases first-class protocol behavior instead of post-hoc patching. + +## 3. Why V1 And V1.5 Are Not Enough + +This overview does not need a long retelling of `V1` and `V1.5`. + +What matters is their architectural limit. 
+ +### What `V1` got right + +`V1` proved the basic shape: + +- ordered WAL +- primary-replica replication +- extent-backed storage +- epoch and lease as the first fencing model + +### Why `V1` is not enough + +Its main shortcomings were: + +- short-gap recovery was too weak and too implicit +- lagging replicas too easily fell into rebuild or long degraded states +- changed-address restart was fragile +- stale authority and stale results were not modeled as first-class protocol objects +- the system did not cleanly separate: + - current WAL head + - committed prefix + - recoverable retained range + - stale or divergent replica tail + +### Why `V1.5` is still not enough + +`V1.5` fixed several real operational problems: + +- retained-WAL catch-up +- same-address reconnect +- `sync_all` correctness on real tests +- rebuild fallback after unrecoverable gap +- control-plane refresh after changed-address restart + +Those fixes matter, and they are why `V1.5` is the stronger production line today. + +But `V1.5` is still not the long-term architecture because its recovery model remains too incremental: + +- reconnect logic is still layered onto an older shipper model +- recovery ownership was discovered as a bug class before it became a protocol object +- catch-up vs rebuild became clearer, but still not clean enough as a top-level protocol contract +- the system still looks too much like "repair V1" rather than "define the next replication model" + +### What `V2` changes + +`V2` is not trying to invent a completely different storage model. + +It is trying to make the critical parts explicit: + +- recovery ownership +- lineage-safe recovery boundary +- catch-up vs rebuild classification +- per-replica sender authority +- stale-result rejection +- explicit recovery orchestration + +So the correct comparison is still: + +- `V1.5` is stronger operationally today +- `V2` is stronger architecturally today + +That is not a contradiction. 
It is the right split between a current production line and the next architecture line. + +```mermaid +flowchart TD + V1[V1] + V15[V1_5] + V2[V2] + realFailures[RealFailures] + realTests[RealHardwareEvidence] + simAndProto[SimulationAndPrototype] + + V1 --> V15 + V15 --> V2 + realFailures --> V15 + realFailures --> V2 + V15 --> realTests + V2 --> simAndProto +``` + +## 4. How V2 Solves WAL And Extent Synchronization + +The core V2 question is not simply "do we keep WAL?" + +The real question is: + +**how do WAL and extent stay synchronized across primary and replica while preserving both stability and performance?** + +This is the center of the V2 design. + +### 4.1 The basic separation of roles + +V2 treats the storage path as two different but coordinated layers: + +- **WAL** is the ordered truth for recent history +- **extent** is the stable materialized image + +WAL is used for: + +- strict write ordering +- local crash recovery +- short-gap replica catch-up +- durable progress accounting through `LSN` + +Extent is used for: + +- stable read image +- long-lived storage +- checkpoint and base-image creation +- long-gap recovery only through a real checkpoint/snapshot base, not through guessing from the current live extent + +This separation is the first stability rule: + +- do not ask current extent to behave like historical state +- do not ask WAL to be the only long-range recovery mechanism forever + +### 4.2 Primary-replica synchronization model + +The intended V2 steady-state model is: + +1. primary allocates monotonic `LSN` +2. primary appends ordered WAL locally +3. primary enqueues the record to per-replica sender loops +4. replicas receive in order and advance explicit progress +5. barrier/sync uses **durable replica progress**, not optimistic send progress +6. 
flusher later materializes WAL-backed dirty state into extent + +The local WAL-to-extent lifecycle can be understood as: + +```mermaid +stateDiagram-v2 + [*] --> WalAppended + WalAppended --> SenderQueued + SenderQueued --> ReplicaReceived + ReplicaReceived --> ReplicaDurable + ReplicaDurable --> SyncEligible + SyncEligible --> ExtentMaterialized + ExtentMaterialized --> CheckpointAdvanced + + note right of WalAppended + Ordered local WAL exists + and defines the write LSN + end note + + note right of ReplicaDurable + Replica durable progress + is now explicit + end note + + note right of ExtentMaterialized + Flusher moves stable data + from WAL-backed dirty state + into extent + end note +``` + +The critical synchronization rule is: + +- **client-visible sync truth must follow durable replica progress** +- not local send progress +- not local WAL head +- not "replica probably received it" + +This is why V2 uses a lineage-safe recovery target such as `CommittedLSN` instead of a looser notion like "current primary head." + +### 4.2.1 Sync mode and result model + +V2 also makes the sync-result logic more explicit. + +- `best_effort` should succeed after the primary has reached its local durability point, even if replicas are degraded. +- `sync_all` should succeed only when all required replicas are durable through the target boundary. +- `sync_quorum` should succeed only when a true durable quorum exists through the target boundary. 
+ +This decision path can be presented as: + +```mermaid +flowchart TD + writeReq[WriteAndSyncRequest] + localDurable[PrimaryLocalDurable] + barrierEval[EvaluateReplicaDurableProgress] + bestEffortAck[best_effortAck] + syncAllAck[sync_allAck] + syncQuorumAck[sync_quorumAck] + rejectOrBlock[RejectOrBlock] + + writeReq --> localDurable + localDurable --> bestEffortAck + localDurable --> barrierEval + + barrierEval -->|"allRequiredReplicasDurable"| syncAllAck + barrierEval -->|"durableQuorumExists"| syncQuorumAck + barrierEval -->|"notEnoughDurableReplicas"| rejectOrBlock +``` + +The key point is that sync success is no longer inferred from send progress or socket health. +It is derived from explicit durable progress at the right safety boundary. + +### 4.3 Why this should be stable + +This model is designed to be stable because the dangerous ambiguities are separated: + +- **write ordering** is carried by WAL and `LSN` +- **durability truth** is carried by barrier / flushed progress +- **recovery ownership** is carried by sender + recovery attempt identity +- **catch-up vs rebuild** is an explicit classification, not an accidental timeout side effect +- **promotion safety** depends on committed prefix and lineage, not on whichever node looks newest + +In other words, V2 stability comes from reducing hidden coupling. + +The design tries to remove cases where one piece of state silently stands in for another. + +### 4.4 Why this can still be high-performance + +The performance argument is not that V2 is magically faster in all cases. 
+ +The argument is narrower and more realistic: + +- keep the primary write path simple: + - ordered local WAL append + - enqueue to per-replica sender loops + - no heavy inline recovery logic in foreground writes +- keep most complexity off the healthy hot path: + - sender ownership + - reconnect classification + - catch-up / rebuild decisions + - timeout and stale-result fencing + live mostly in recovery/control paths +- use WAL for what it is good at: + - recent ordered delta + - short-gap replay +- stop using WAL as the answer to every lag problem: + - long-gap recovery should move toward checkpoint/snapshot base plus tail replay + +So the V2 performance thesis is: + +- **healthy steady-state should remain close to V1.5** +- **degraded/recovery behavior should become much cleaner** +- **short-gap recovery should be cheaper than rebuild** +- **long-gap recovery should stop forcing an unbounded WAL-retention tax** + +That is a much stronger and more believable claim than saying "V2 will just be faster." + +### 4.5 Why WAL is still worth choosing + +The reason to keep the WAL-based direction is that it gives the best foundation for this exact synchronization problem: + +- explicit order +- explicit history +- explicit committed prefix +- explicit short-gap replay +- explicit failover reasoning + +WAL is risky only if the design blurs: + +- local write acceptance +- replica durable progress +- committed boundary +- recoverable retained history + +V2 exists precisely to stop blurring those things. + +So the current project position is: + +- WAL is not automatically safe +- but WAL is still the most promising base for this block service +- because the project now has enough real evidence, simulator coverage, and prototype work to investigate it rigorously + +## 5. Comparison With Market And Papers + +The current V2 direction is not chosen because other vendors are wrong. It is chosen because other directions solve different problems and carry different costs. 
+ +### Ceph / RBD style systems + +Ceph-style block systems avoid this exact per-volume replicated WAL shape. They gain: + +- deep integration with object-backed distributed storage +- mature placement and recovery machinery +- strong cluster-scale distribution logic + +But they pay elsewhere: + +- more system layers +- more object-store and peering complexity +- a heavier operational and conceptual model + +This is not a free simplification. It is a different complexity trade. + +For `sw-block`, the design choice is to keep a narrower software block service with more explicit per-volume replication semantics instead of inheriting the full distributed object-backed block complexity stack. + +### PolarFS / ParallelRaft style work + +These systems explore more aggressive ordering and apply strategies: + +- out-of-order or conflict-aware work +- deeper parallelism +- more sophisticated log handling + +They are valuable references, especially for: + +- LBA conflict reasoning +- recovery and replay cost thinking +- future flusher parallelization ideas + +But they also introduce a much heavier correctness surface. + +The project does not currently want to buy that complexity before fully proving the simpler strict-order path. + +### AWS chain replication / EBS-style lessons + +Chain replication and related work are attractive because they address real bandwidth and recovery concerns: + +- Primary NIC pressure +- forwarding topology +- cleaner scaling for RF=3 + +This is one of the more plausible borrowable directions later. + +But it changes: + +- latency profile +- failure handling +- barrier semantics +- operational topology + +So it belongs to a later architecture stage, not to the current V2 core proof. 
+ +### The actual strategic choice + +The project is deliberately choosing: + +- a narrower software-first block design +- explicit per-volume correctness +- strict reasoning before performance heroics +- validation before feature expansion + +That is not conservatism for its own sake. It is how to build a block product that can later be trusted. + +## 6. Why This Direction Fits SeaweedFS And Future Standalone sw-block + +`sw-block` started inside SeaweedFS, but V2 is already being shaped as the next standalone block service line. + +That means the architecture should preserve two things at once: + +### What should remain compatible + +- placement and topology concepts where they remain useful +- explainable control-plane contracts +- operational continuity with the SeaweedFS ecosystem + +### What should become more block-specific + +- replication correctness +- recovery ownership +- recoverability classification +- block-specific test and evidence story + +So the current direction is: + +- use SeaweedFS as the practical ecosystem and experience base +- but shape V2 as a true block-service architecture, not as a minor sub-feature of `weed/` + +This is why the V2 line belongs under `sw-block/` rather than as a direct patch path inside the existing production tree. + +## 7. The Systematic Validation Method + +The second major reason the current direction is rational is the validation method. 
+ +The project is no longer relying on: + +- implement first +- discover behavior later +- patch after failure + +Instead, the intended ladder is: + +- contract and invariants +- scenario backlog +- simulator +- timer/race simulator +- standalone prototype +- real engine test runner + +```mermaid +flowchart TD + contract[ContractAndInvariants] + scenarios[ScenarioBacklog] + distsim[distsim] + eventsim[eventsim] + prototype[enginev2Prototype] + runner[RealTestRunner] + confidence[SystemAndProductConfidence] + + contract --> scenarios + scenarios --> distsim + scenarios --> eventsim + distsim --> prototype + eventsim --> prototype + prototype --> runner + runner --> confidence +``` + +This is the right shape for a risky block-storage algorithm: + +- simulation for protocol truth +- prototype for executable truth +- real runner for product/system truth + +## 8. What The Simulation System Proves + +The simulation system exists to answer: + +- what should happen +- what must never happen +- which V1/V1.5 shapes fail +- why the V2 shape is better + +### `distsim` + +`distsim` is the main protocol simulator. + +It is used for: + +- protocol correctness +- state transitions +- stale authority fencing +- promotion and lineage safety +- catch-up vs rebuild +- changed-address restart +- candidate safety +- reference-state checking + +### `eventsim` + +`eventsim` is the timing/race layer. 
+ +It is used for: + +- barrier timeout behavior +- catch-up timeout behavior +- reservation timeout behavior +- same-tick and delayed event ordering +- stale timeout effects + +### What the simulator is good at + +It is especially strong for proving: + +- stale traffic rejection +- explicit recovery boundaries +- timeout/race semantics +- failover correctness at committed prefix +- why old authority must not mutate current lineage + +### What the simulator does not prove + +It does not prove: + +- real TCP behavior +- real OS scheduling behavior +- disk timing +- real `WALShipper` integration +- real frontend behavior under iSCSI or NVMe + +So the simulator is not the whole truth. + +It is the algorithm/protocol truth layer. + +## 9. What The Real Test Runner Proves + +The real test runner under `learn/projects/sw-block/test/` is the system and product validation layer. + +It is not merely QA support. It is a core part of whether the design can be trusted. + +### What it covers + +The runner and surrounding test system already span: + +- unit tests +- component tests +- integration tests +- distributed scenarios +- real hardware workflows + +The environment already includes: + +- real nodes +- real block targets +- real fault injection +- benchmark and result capture +- run bundles and scenario traceability + +### Why it matters + +The runner is what tells us whether: + +- the implemented engine behaves like the design says +- the product works under real restart/failover/rejoin conditions +- the operator workflows are credible +- benchmark claims are real rather than accidental + +This is why the runner is best thought of as: + +- implementation truth +- system truth +- product truth + +not just test automation. + +## 10. How Simulation And Test Runner Progress Systematically + +The intended feedback loop is: + +1. V1/V1.5 real failures happen +2. those failures are turned into design requirements +3. scenarios are distilled for simulator use +4. 
the simulator closes protocol ambiguity +5. the standalone prototype closes execution ambiguity +6. the real test runner validates system behavior on real environments +7. new failures or mismatches feed back into design again + +This gives the project two different but complementary truths: + +- `simulation -> algorithm / protocol correctness` +- `test runner -> implementation / system / product correctness` + +That separation is healthy. + +It prevents two common mistakes: + +- trusting design without real behavior +- trusting green system tests without understanding the protocol deeply enough + +## 11. Current Status And Honest Limits + +### What is already strong + +- `V1.5` has materially better recovery behavior than `V1` and stronger operational evidence +- `V2` has stronger architectural structure than `V1.5` +- the simulator has serious acceptance coverage +- the prototype line has already started closing ownership and orchestration risk +- the real test runner is large enough to support serious system validation + +### What is not yet done + +- `V2` is not a production engine +- prototype work is still in early-to-mid stages +- historical-data / recovery-boundary prototype work is not complete +- steady-state performance of `V2` is not yet proven +- real hardware validation of `V2` does not yet exist + +So the correct statement is not: + +- "V2 is already better in production" + +The correct statement is: + +- "V2 is the better long-term architecture, but not yet the stronger deployed engine" + +## 12. 
Why The Current Direction Is Rational + +The current direction is rational because it keeps the right split: + +- `V1.5` continues as the production line today +- `V2` continues as the next architecture line + +This lets the project: + +- keep shipping and hardening what already works +- explore the better architecture without destabilizing the current engine +- use simulation, prototype work, and the real runner to decide whether V2 should become the next real engine + +The final strategic rule should remain: + +- continue WAL investigation because the project now has a credible validation framework +- continue V2 because the architectural evidence is strong +- if prototype evidence later reveals a structural flaw, redesign to `V2.5` before heavy implementation + +That is the disciplined path for a block-storage algorithm. + +## Bottom Line + +If choosing based on current production proof: + +- use `V1.5` + +If choosing based on long-term protocol quality: + +- choose `V2` + +If choosing based on whether WAL should still be investigated: + +- yes, because the project now has the right validation stack to investigate it responsibly + +That is the current strategic answer. 
diff --git a/sw-block/design/v2-algorithm-overview.zh.md b/sw-block/design/v2-algorithm-overview.zh.md new file mode 100644 index 000000000..32e9d2bc4 --- /dev/null +++ b/sw-block/design/v2-algorithm-overview.zh.md @@ -0,0 +1,660 @@ +# V2 算法综述 + +日期:2026-03-27 +状态:战略级设计综述 +读者:CEO / owner / 技术管理层 + +## 文档目的 + +本文用于说明 `sw-block` 当前 `V2` 方向背后的核心判断: + +- `V2` 到底想解决什么问题 +- 为什么 `V1` / `V1.5` 不足以作为长期架构 +- 为什么我们仍然认为基于 `WAL` 的方向值得继续走 +- `V2` 与主要市场方案 / 论文路线相比的取舍是什么 +- `simulation` 与真实 `test runner` 如何形成系统化验证闭环 + +这不是 phase 汇报,也不是对生产可用性的承诺文档。 + +它是对 `V2` 这条架构线的高层技术解释。 + +## 与其他文档的关系 + +| 文档 | 作用 | +|------|------| +| `v1-v15-v2-comparison.md` | 三条技术线的详细比较 | +| `v2-acceptance-criteria.md` | V2 协议验证下限 | +| `v2_scenarios.md` | 场景清单与 simulator 覆盖 | +| `v2-open-questions.md` | 仍未关闭的算法问题 | +| `protocol-development-process.md` | 协议开发方法论 | +| `learn/projects/sw-block/algorithm_overview.md` | 当前 V1/V1.5 系统级算法综述 | +| `learn/projects/sw-block/design/algorithm_survey.md` | 论文 / vendor 调研与借鉴项 | +| `learn/projects/sw-block/test/README.md` | 真实测试系统入口 | +| `learn/projects/sw-block/test/test-platform-review.md` | test runner 的平台化方向 | + +## 1. 执行摘要 + +当前最准确的结论是: + +- `V1` 证明了基于 `WAL` 的复制块存储基本路径是可行的。 +- `V1.5` 在真实恢复场景上已经比 `V1` 明显更强,并且有真实硬件上的运行证据。 +- `V2` 的意义,不是在已有逻辑上继续打补丁,而是把最关键的恢复与一致性问题直接上升为协议对象。 + +`V2` 的核心想法可以概括为: + +- 短间隙恢复要显式 +- 过期 authority 要显式 fencing +- `catch-up` 与 `rebuild` 的边界要显式 +- 恢复 ownership 要成为协议的一部分,而不是实现细节里的偶然行为 + +所以今天正确的策略是: + +- 继续用 `V1.5` 作为当前生产线 +- 继续用 `V2` 作为长期架构线 +- 继续认真研究 `WAL` 路线,因为现在我们已经具备了可信的验证框架 +- 如果后续 prototype 证明 `V2` 有结构性缺陷,就应当先演进到 `V2.5`,而不是硬着头皮直接实现 + +## 2. 
V2 真正要解决的问题 + +从前端看,块存储似乎只有几个简单动作: + +- `write` +- `flush` / `sync` +- failover +- recovery + +但真正难的,不是这些前端动作本身,而是异步分布式边界: + +- primary 本地 WAL 追加 +- replica 端 durable progress +- client 可见的 sync / commit 真值 +- failover / promote 时的数据边界 +- lag、restart、address change、timeout 后的恢复正确性 + +这才是 `V2` 存在的根因。 + +项目已经反复验证过:块存储真正的 bug 通常不出在 happy path,而出在: + +- replica 短暂掉线又回来 +- replica 重启后地址变化 +- 延迟到达的 stale barrier / stale reconnect 结果 +- 一个 lagging replica 看起来“差一点点就能恢复” +- failover 时基于错误 lineage 做了 promote + +`V2` 就是要把这些情况变成协议的第一公民,而不是上线后再继续被动修补。 + +## 3. 为什么 V1 / V1.5 不够 + +这份综述不需要长篇回顾 `V1` 和 `V1.5` 的所有细节。 + +只需要讲清它们为什么不足以作为长期架构。 + +### `V1` 做对了什么 + +`V1` 建立了最重要的基础: + +- 严格有序的 `WAL` +- primary-replica 复制 +- 基于 `epoch + lease` 的初步 fencing +- 以 `extent` 作为稳定数据面,而不是一开始就做全日志结构 + +### `V1` 的不足 + +它的关键短板主要在恢复与退化场景: + +- 短 outage 很容易演化成 rebuild 或长期 degraded +- 恢复结构过于隐式 +- changed-address restart 脆弱 +- stale authority / stale result 还不是协议层的显式对象 +- 系统没有足够清晰地区分: + - 当前 head + - committed prefix + - recoverable retained range + - stale / divergent tail + +### `V1.5` 的不足 + +`V1.5` 已经解决了不少真实问题: + +- retained-WAL catch-up +- same-address reconnect +- `sync_all` 的真实行为 +- catch-up 失败后的 rebuild fallback +- changed-address restart 之后的 control-plane 刷新 + +所以它今天是更强的生产线。 + +但它仍然不是长期架构,因为它本质上仍然是增量修复: + +- reconnect 逻辑仍然附着在旧 shipper 模型上 +- 恢复 ownership 是先作为 bug 暴露出来,再逐步被抽象 +- `catch-up` vs `rebuild` 更清楚了,但还不够成为协议顶层契约 +- 整体感觉仍然更像“继续修 V1”,而不是“定义下一代复制协议” + +### `V2` 的变化 + +`V2` 不是重新发明一个完全不同的存储模型。 + +它的目标是把最关键的东西显式化: + +- recovery ownership +- lineage-safe recovery boundary +- `catch-up` / `rebuild` 分类 +- per-replica sender authority +- stale-result rejection +- 明确的 recovery orchestration + +因此最诚实的比较是: + +- `V1.5` 今天在运行证据上更强 +- `V2` 今天在架构质量上更强 + +这不是矛盾,而是“当前生产线”和“下一代架构线”应有的分工。 + +```mermaid +flowchart TD + V1[V1] + V15[V1_5] + V2[V2] + realFailures[真实故障] + realTests[真实硬件验证] + simAndProto[仿真与原型] + + V1 --> V15 + V15 --> V2 + realFailures --> V15 + realFailures --> V2 + V15 --> realTests + V2 --> 
simAndProto +``` + +## 4. V2 如何解决 WAL 与 Extent 的同步问题 + +`V2` 的核心问题不是“还要不要 WAL”。 + +真正的问题是: + +**primary 与 replica 之间,WAL 和 extent 如何保持同步,同时还能兼顾稳定性与性能。** + +这才是 `V2` 的中心。 + +### 4.1 基本分工 + +`V2` 把数据路径拆成两个既分离又协作的层: + +- **WAL**:近期历史的有序真相 +- **extent**:稳定的物化数据镜像 + +WAL 负责: + +- 严格写入顺序 +- 本地崩溃恢复 +- 短间隙 replica catch-up +- 基于 `LSN` 的 durable progress 计量 + +Extent 负责: + +- 稳定读镜像 +- 长期存储 +- checkpoint / base image 生成 +- 长间隙恢复时作为真正 base image 的来源 + +第一条稳定性原则就是: + +- 不要让当前 extent 冒充历史状态 +- 不要让 WAL 永远承担所有长距离恢复责任 + +### 4.2 Primary-replica 同步模型 + +`V2` 理想中的 steady-state 同步模型是: + +1. primary 分配单调递增的 `LSN` +2. primary 本地顺序追加 `WAL` +3. primary 把记录放入 per-replica sender loop +4. replica 按顺序接收并推进显式 progress +5. `barrier/sync` 依赖 replica 的 durable progress,而不是 optimistic send progress +6. flusher 再把 WAL-backed dirty state 物化到 extent + +本地 `WAL -> extent` 生命周期可以理解为: + +```mermaid +stateDiagram-v2 + [*] --> WalAppended + WalAppended --> SenderQueued + SenderQueued --> ReplicaReceived + ReplicaReceived --> ReplicaDurable + ReplicaDurable --> SyncEligible + SyncEligible --> ExtentMaterialized + ExtentMaterialized --> CheckpointAdvanced +``` + +这里最关键的规则是: + +- **client 可见的 sync 真值必须跟随 durable replica progress** +- 不能跟随 send progress +- 不能跟随 local WAL head +- 不能跟随“看起来 replica 应该已经收到了” + +这也是为什么 `V2` 使用像 `CommittedLSN` 这样的 lineage-safe 边界,而不是松散的“当前 primary head”。 + +### 4.2.1 不同 sync mode 如何判断结果 + +`V2` 让不同 sync mode 的成功条件变得更明确: + +- `best_effort`:primary 达到本地 durability point 后即可成功,replica 可以后台恢复 +- `sync_all`:所有 required replica 都要在目标边界上 durable +- `sync_quorum`:必须存在真实 durable quorum + +其判断路径可以表示为: + +```mermaid +flowchart TD + writeReq[WriteAndSyncRequest] + localDurable[PrimaryLocalDurable] + barrierEval[EvaluateReplicaDurableProgress] + bestEffortAck[best_effort成功] + syncAllAck[sync_all成功] + syncQuorumAck[sync_quorum成功] + rejectOrBlock[阻塞或失败] + + writeReq --> localDurable + localDurable --> bestEffortAck + localDurable --> barrierEval + + barrierEval -->|"allRequiredReplicasDurable"| 
syncAllAck + barrierEval -->|"durableQuorumExists"| syncQuorumAck + barrierEval -->|"notEnoughDurableReplicas"| rejectOrBlock +``` + +这意味着 sync 结果不再依赖: + +- socket 看起来还活着 +- sender 好像还在发 +- replica 似乎“差不多收到了” + +而是依赖显式 durable progress。 + +### 4.3 为什么这个设计应该更稳定 + +它试图把最危险的模糊边界拆开: + +- **写入顺序** 由 `WAL + LSN` 表达 +- **durability truth** 由 barrier / flushed progress 表达 +- **recovery ownership** 由 sender + recovery attempt identity 表达 +- **catch-up vs rebuild** 由显式分类表达 +- **promotion safety** 由 committed prefix 与 lineage 表达 + +也就是说,`V2` 的稳定性来自于减少隐式耦合。 + +### 4.4 为什么它仍然可以有高性能 + +这里不能夸大说 `V2` 一定在所有情况下都更快。 + +更准确的性能论点是: + +- 保持 primary 前台写路径简单: + - 本地顺序 `WAL append` + - 投递到 per-replica sender loop + - 不把复杂恢复逻辑塞进前台写路径 +- 把复杂度主要放在健康热路径之外: + - sender ownership + - reconnect classification + - catch-up / rebuild decision + - timeout 和 stale-result fencing + 主要都在 recovery / control path +- 让 WAL 只承担它擅长的工作: + - 近期 ordered delta + - 短间隙 replay +- 不再让 WAL 承担所有长距离恢复: + - 长间隙恢复转向 checkpoint/snapshot base + tail replay + +所以 `V2` 的性能论点应该是: + +- **健康 steady-state 应该尽量接近 `V1.5`** +- **退化与恢复路径会更干净** +- **短间隙恢复会比 rebuild 更便宜** +- **长间隙恢复不再逼迫系统支付无上限的 WAL retention 税** + +这比“V2 天然更快”要可信得多。 + +### 4.5 为什么仍然选择 WAL + +之所以还继续走 WAL,是因为它仍然是解决这个同步问题最有力的基础: + +- 显式顺序 +- 显式历史 +- 显式 committed prefix +- 显式短间隙 replay +- 显式 failover reasoning + +只有当设计把以下概念混淆时,WAL 才会变得危险: + +- 本地写入接受 +- replica durable progress +- committed boundary +- recoverable retained history + +而 `V2` 的存在,正是为了不再混淆这些东西。 + +## 5. 
与市场和论文路线的比较 + +选择 `V2` 这条路线,并不是因为别的 vendor 都错了,而是因为他们解决的是不同问题,也承担了不同复杂度。 + +### Ceph / RBD 路线 + +Ceph/RBD 避开了这种 per-volume replicated WAL 形态。 + +它获得的是: + +- 对象存储深度一体化 +- 成熟的 placement 与 recovery 体系 +- 更强的集群级分布能力 + +但代价是: + +- 系统层次更多 +- object-store / peering 复杂度更重 +- 运维与概念模型更重 + +所以这不是“更简单”,而是把复杂度迁移到了别处。 + +对 `sw-block` 而言,当前选择是: + +- 保持更窄的软件块服务模型 +- 用更显式的 per-volume correctness 来换取更可控的复杂度 + +### PolarFS / ParallelRaft 路线 + +这类系统探索更激进的顺序与并行策略: + +- conflict-aware 或乱序并行 +- 更深的日志并行 +- 更复杂的 apply / replay 机制 + +它们在未来仍然值得借鉴: + +- LBA conflict reasoning +- replay 成本与恢复成本 +- flusher 并行优化 + +但它们也明显扩大了正确性边界。 + +在当前阶段,项目不应该在还没彻底证明严格顺序模型之前,就过早买入这类复杂度。 + +### AWS 链式复制 / EBS 类经验 + +链式复制之类的路线吸引人,是因为它们能解决真实问题: + +- Primary NIC 压力 +- forward 拓扑 +- RF=3 时更好的扩展性 + +这是后续较有希望借鉴的方向。 + +但它会改变: + +- 延迟画像 +- 失败处理方式 +- barrier 语义 +- 运维拓扑 + +所以它属于更后面的架构阶段,而不是当前 V2 核心证明。 + +### 当前的真实选择 + +项目当前选择的是: + +- 更窄的软件优先 block 设计 +- 明确的 per-volume correctness +- 在性能英雄主义之前先把逻辑讲清 +- 在功能扩张之前先建立验证闭环 + +这不是保守,而是为了让这个 block 产品未来真的值得信任。 + +## 6. 为什么这条方向适合 SeaweedFS 与未来独立 sw-block + +`sw-block` 起步于 SeaweedFS,但 `V2` 已经在按下一代独立 block service 的方向成形。 + +这意味着架构上要同时保留两类东西: + +### 需要保持兼容的部分 + +- placement / topology 这些概念 +- 可解释的 control-plane contract +- 与 SeaweedFS 生态的运维连续性 + +### 应该更 block-specific 的部分 + +- replication correctness +- recovery ownership +- recoverability classification +- block 特有的 test / evidence 体系 + +因此当前方向不是“继续把 V2 当成 weed 里的一个 patch”,而是: + +- 以 SeaweedFS 作为经验与生态基础 +- 同时把 `V2` 逐步塑造成真正独立的块服务架构 + +## 7. 
系统化验证方法 + +当前方向之所以合理,另一个重要原因是验证方法本身已经系统化。 + +项目不再依赖: + +- 先实现 +- 再观察 +- 出 bug 再修 + +而是依赖如下层次: + +- contract / invariants +- scenario backlog +- simulator +- timer/race simulator +- standalone prototype +- real engine test runner + +```mermaid +flowchart TD + contract[ContractAndInvariants] + scenarios[ScenarioBacklog] + distsim[distsim] + eventsim[eventsim] + prototype[enginev2Prototype] + runner[RealTestRunner] + confidence[SystemAndProductConfidence] + + contract --> scenarios + scenarios --> distsim + scenarios --> eventsim + distsim --> prototype + eventsim --> prototype + prototype --> runner + runner --> confidence +``` + +这对于一个高风险块存储算法是非常正确的结构: + +- simulation 用来证明协议逻辑 +- prototype 用来证明执行语义 +- 真实 runner 用来证明系统与产品行为 + +## 8. Simulation 系统证明什么 + +simulation 系统的目标是回答: + +- 应该发生什么 +- 绝不能发生什么 +- 为什么旧设计会失败 +- 为什么 V2 更好 + +### `distsim` + +`distsim` 是主协议仿真器,主要用于: + +- 协议正确性 +- 状态迁移 +- stale authority fencing +- promotion / lineage safety +- catch-up vs rebuild +- changed-address restart +- candidate safety +- reference-state checking + +### `eventsim` + +`eventsim` 是时间 / race 层,主要用于: + +- barrier timeout +- catch-up timeout +- reservation timeout +- 同 tick / 延迟事件顺序 +- stale timeout 的影响 + +### simulation 擅长证明什么 + +它特别擅长证明: + +- stale traffic rejection +- recovery boundary 的显式性 +- timeout/race 语义 +- committed prefix 下的 failover 正确性 +- 旧 authority 不能修改新 lineage + +### simulation 不证明什么 + +它不证明: + +- 真实 TCP 行为 +- 真实 OS 调度 +- 磁盘时序 +- 真正的 `WALShipper` 集成 +- iSCSI / NVMe 前端的真实行为 + +因此 simulation 不是全部真相。 + +它是 **算法 / 协议真相层**。 + +## 9. 
真实 test runner 证明什么 + +`learn/projects/sw-block/test/` 下的真实 test runner 是系统与产品验证层。 + +它不只是 QA 工具,而是设计是否可信的重要组成部分。 + +### 它覆盖什么 + +当前 runner 与周边测试体系已经覆盖: + +- unit +- component +- integration +- distributed scenario +- 真实硬件 workflow + +而且环境已经包含: + +- 真实节点 +- 真实 block target +- 真实 fault injection +- benchmark 与结果采集 +- run bundle 与 scenario traceability + +### 为什么它重要 + +它帮助我们判断: + +- 实际引擎是否按设计运行 +- 产品在真实 restart / failover / rejoin 场景下是否可靠 +- operator workflow 是否可信 +- benchmark 结果是不是有效而非偶然 + +所以 test runner 最好被理解为: + +- implementation truth +- system truth +- product truth + +而不只是“测试脚本框架”。 + +## 10. Simulation 与 test runner 如何系统性推进 + +理想的反馈闭环是: + +1. `V1` / `V1.5` 出现真实故障 +2. 这些故障被转化为设计要求 +3. 再被提炼为 simulator 场景 +4. simulator 关闭协议歧义 +5. standalone prototype 关闭执行歧义 +6. 真实 test runner 在硬件与分布式环境中验证系统行为 +7. 新故障或新偏差再反哺设计 + +这就形成了两类互补真相: + +- `simulation -> algorithm / protocol correctness` +- `test runner -> implementation / system / product correctness` + +这种分层是健康的,因为它避免了两种常见错误: + +- 只相信设计推导,却没有真实行为 +- 只相信系统测试全绿,却没有真正理解协议本身 + +## 11. 当前状态与诚实边界 + +### 现在已经比较强的部分 + +- `V1.5` 相比 `V1` 的恢复能力已经明显增强,并且有真实运行证据 +- `V2` 的架构清晰度已经明显强于 `V1.5` +- simulator 已经有较强的 acceptance 覆盖 +- prototype 已经开始关闭 ownership 与 orchestration 风险 +- 真实 test runner 已经足够大,可以支撑严肃的系统验证 + +### 现在还没有完成的部分 + +- `V2` 还不是生产引擎 +- prototype 仍处于早中期 +- historical-data / recovery-boundary prototype 还没有闭合 +- `V2` steady-state 性能还没有真实证明 +- `V2` 还没有真实硬件上的运行验证 + +所以最准确的话不是: + +- “V2 现在已经在生产上更强” + +而是: + +- “V2 是长期更好的架构,但今天还不是更强的已部署引擎” + +## 12. 
为什么当前方向是理性的 + +当前方向之所以理性,是因为它保持了正确的分工: + +- `V1.5` 继续作为今天的生产线 +- `V2` 继续作为下一代架构线 + +这样项目就可以: + +- 在已有可运行系统上继续交付和加固 +- 在不扰动生产线的前提下认真验证更强的架构 +- 用 simulation、prototype 和真实 runner 来决定 `V2` 是否真能成为下一代引擎 + +最终的战略规则应当保持不变: + +- 继续研究 WAL,因为现在我们已经有可信的验证框架 +- 继续推进 V2,因为架构证据已经很强 +- 如果 prototype 证明 V2 有结构性缺陷,就先演进到 `V2.5`,不要急于重实现 + +## 结论 + +如果按当前生产证据选择: + +- 选择 `V1.5` + +如果按长期协议质量选择: + +- 选择 `V2` + +如果问 WAL 是否还值得继续研究: + +- 值得,因为现在项目已经拥有了足够严肃的验证体系,可以负责任地继续推进 + +这就是当前最合理的技术与战略判断。 diff --git a/sw-block/design/v2-detailed-algorithm.zh.md b/sw-block/design/v2-detailed-algorithm.zh.md new file mode 100644 index 000000000..81d0ce445 --- /dev/null +++ b/sw-block/design/v2-detailed-algorithm.zh.md @@ -0,0 +1,1068 @@ +# V2 详细算法设计 + +日期:2026-03-27 +状态:详细算法草案 +读者:架构设计、simulator、prototype、实现负责人 + +## 1. 文档目的 + +这份文档不是 CEO 综述,也不是 phase 汇报。 + +它的目标是把 `sw-block V2` 的核心算法写成一份更接近“协议规格”的设计文档,回答下面几个问题: + +- 系统里的正式状态对象是什么 +- 写路径如何推进 +- 不同 `sync mode` 如何决定是否可以返回成功 +- replica 掉队后如何决定 `catch-up` 还是 `rebuild` +- primary crash / failover / epoch bump 后哪些状态仍然有效 +- 什么叫做“允许的 WAL-first 可见性”,什么叫做不允许的幽灵状态 + +本文默认接受一个核心前提: + +- **已 durable 的 WAL 是系统正式状态的一部分** + +因此: + +- `visible state` 可以领先于 `checkpoint` +- 只要该状态仍然有合法的恢复依据,它就不是 bug + +真正的错误是: + +- `acked state > recoverable state` +- 或 `visible state > recoverable state` + +## 2. 设计目标 + +V2 的目标不是把所有事情都交给 `WAL`。 + +V2 的目标是: + +1. 用 `WAL` 提供严格顺序、短间隙恢复和明确的 durable history +2. 用 `extent + checkpoint/snapshot` 提供稳定读镜像和长距离恢复基线 +3. 用显式的 `epoch + sender + RecoverySession` 管住恢复 authority +4. 用显式的 `CommittedLSN` 管住对外承诺边界 +5. 用显式的 `catch-up` / `rebuild` 分类避免长期模糊状态 + +## 3. 
核心对象 + +### 3.1 LSN 边界 + +- `HeadLSN` + primary 当前已分配并写入本地 WAL 的最高 LSN + +- `CommittedLSN` + 当前对外可承诺、可用于 failover / recovery 目标的 lineage-safe 边界 + +- `ReplicaReceivedLSN` + replica 已收到并追加的最高 LSN,不代表 durable + +- `ReplicaFlushedLSN` + replica 已 durable 的最高 LSN,是 sync/barrier 判断的正式依据 + +- `CheckpointLSN` + 当前 checkpoint / base snapshot 所代表的稳定物化边界 + +- `RecoverableLSN` + 某节点 crash 之后,仍可由 `checkpoint + retained WAL` 或等价机制恢复出的最高边界 + +### 3.2 存储层对象 + +- `Active WAL` + 当前保留的 WAL 历史,用于: + - 顺序写入 + - crash recovery + - short-gap catch-up + +- `Extent` + 当前运行中的块视图,可以比 checkpoint 更新 + +- `Checkpoint / Snapshot` + 一个真实历史点的稳定镜像,用于: + - rebuild base + - 长距离恢复 + - GC / retention 的正确边界 + +### 3.3 协议层对象 + +- `Epoch` + primary lineage / fencing 边界。旧 epoch 的消息和恢复结果不能修改当前系统。 + +- `Sender` + primary 上对每个 replica 的唯一发送 authority。 + +- `RecoverySession` + 对一个 replica 的一次有界恢复尝试。它必须被: + - 一个 sender 拥有 + - 一个 epoch 约束 + - 一个 session ID 唯一标识 + +- `AssignmentIntent` + orchestrator 对 sender group 的意图输入。它决定: + - 哪些 replica 被保留 + - 哪些 replica 要恢复 + - 恢复目标和 epoch 是什么 + +## 4. 全局结构图 + +```mermaid +flowchart TD + client[Client] + primary[Primary] + wal[Active WAL] + extent[Live Extent] + flusher[Flusher] + cp[Checkpoint Snapshot] + senderGroup[SenderGroup] + r1[Replica Sender R1] + r2[Replica Sender R2] + repl1[Replica 1] + repl2[Replica 2] + orchestrator[Volume Orchestrator] + + client --> primary + primary --> wal + primary --> senderGroup + primary --> extent + senderGroup --> r1 + senderGroup --> r2 + r1 --> repl1 + r2 --> repl2 + wal --> flusher + flusher --> extent + flusher --> cp + orchestrator --> primary + orchestrator --> senderGroup +``` + +这个结构的关键点是: + +- 前台写路径只负责产生顺序和推进正式边界 +- 每个 replica 的恢复执行由自己的 sender/session 管理 +- flusher / checkpoint 负责物化和长期恢复基线 +- orchestrator 负责 volume 级 admission、epoch 和 failover + +## 5. 数据真相与可见性规则 + +V2 必须明确区分 5 种状态: + +1. `visible state` +2. `WAL durable state` +3. `replica durable state` +4. `checkpointed state` +5. 
`committed / acked state` + +它们不是同一个概念。 + +### 5.1 允许的情况 + +下列情况是允许的: + +- `visible state > CheckpointLSN` +- `WAL durable state > CheckpointLSN` +- replica 在后台追赶,extent 已经更“新” + +只要: + +- crash 后这些状态仍有恢复依据 +- client 所收到的 ACK 语义没有被夸大 + +### 5.2 禁止的情况 + +下列情况是 V2 必须阻止的: + +- `AckedLSN > RecoverableLSN` +- `VisibleLSN > RecoverableLSN` +- 根据 socket/send progress 而不是 durable progress 给出 sync 成功 +- replica 实际已不可能 catch-up,却长期停留在 `CatchingUp` + +### 5.3 可执行 invariant + +V2 的 simulator / prototype 至少应围绕下面三条 invariant 展开: + +1. `RecoverabilityInvariant` + 所有已 ACK 的边界在 crash / restart / failover 后仍必须可恢复 + +2. `VisibilityInvariant` + 所有向用户暴露的状态都必须有合法恢复来源 + +3. `CatchUpLivenessInvariant` + replica 要么收敛,要么显式升级为 `NeedsRebuild` + +## 6. 写路径算法 + +### 6.1 写路径目标 + +写路径要满足两件事: + +- 维持 primary 本地严格顺序 +- 不把复杂恢复逻辑塞进前台热路径 + +### 6.2 写入步骤 + +对一次逻辑写入 `Write(block, value)`,V2 的基本步骤是: + +1. primary 检查自己是否拥有当前 `epoch` 的 serving authority +2. 分配下一个单调递增的 `LSN` +3. 生成 `WALRecord{LSN, Epoch, Block, Value, RecoveryClass}` +4. 本地 durable append 到 `Active WAL` +5. 更新 primary 的运行期可见状态 +6. 把该记录放入每个 replica 的 sender queue +7. 根据 volume 的 `sync mode` 决定是否需要 barrier / durable quorum / all replicas +8. 
在满足对应 mode 条件后返回成功,否则阻塞或失败 + +### 6.3 写路径图 + +```mermaid +sequenceDiagram + participant C as Client + participant P as Primary + participant W as Local WAL + participant SG as SenderGroup + participant R as Replica Sender + participant X as Replica + + C->>P: Write(block, value) + P->>P: allocate LSN + P->>W: durable append WALRecord + P->>P: update live visible state + P->>SG: enqueue record for each replica + SG->>R: ordered per-replica send + R->>X: replicate WALRecord + X->>X: append and later flush + P->>P: evaluate sync mode + P-->>C: ACK or block/fail +``` + +### 6.4 写路径伪算法 + +```text +OnWrite(req): + require PrimaryState == Serving + require LocalEpoch == VolumeEpoch + + lsn = AllocateNextLSN() + rec = BuildWALRecord(lsn, req, epoch) + + DurableAppendLocalWAL(rec) + ApplyToLiveVisibleState(rec) + EnqueueToReplicaSenders(rec) + + if Mode == best_effort: + return success after local durable WAL + + if Mode == sync_all: + wait until every required replica reports durable progress >= lsn + else timeout/fail + + if Mode == sync_quorum: + wait until true durable quorum reports progress >= lsn + else timeout/fail + + AdvanceCommittedLSN(lsn) only at the correct lineage-safe boundary + return success +``` + +## 7. 
三种 sync mode + +### 7.1 `best_effort` + +语义: + +- 只要求 primary 本地达到 durability point +- replica 可以异步恢复 +- 不应对 client 承诺多副本 durable + +适合: + +- 后台恢复优先 +- 临时 degraded 仍继续服务 + +### 7.2 `sync_all` + +语义: + +- 所有 required replica 都必须在目标 `LSN` durable +- 不能因为“看起来网络还活着”而提前 ACK +- 一旦达不到条件,应阻塞或失败,不能偷偷降级 + +### 7.3 `sync_quorum` + +语义: + +- 必须形成真实 durable quorum +- 只统计满足当前 epoch 和 state 资格的 replica +- 不能只数 healthy socket 或 sender 已发送 + +### 7.4 sync 决策图 + +```mermaid +flowchart TD + start[Write at LSN L] + local[Primary local WAL durable] + mode{Sync Mode} + best[Return success] + allCheck{All required replicas durable >= L} + quorumCheck{Durable quorum >= L} + success[Return success] + block[Block or fail] + + start --> local + local --> mode + mode -->|best_effort| best + mode -->|sync_all| allCheck + mode -->|sync_quorum| quorumCheck + allCheck -->|yes| success + allCheck -->|no| block + quorumCheck -->|yes| success + quorumCheck -->|no| block +``` + +### 7.5 sync mode 的正式原则 + +所有 sync mode 都必须遵守: + +- durable truth 只来自 `ReplicaFlushedLSN` +- 不来自 `ReplicaReceivedLSN` +- 不来自 send queue +- 不来自 transport 连接存活 + +## 8. 本地 WAL / extent / checkpoint 算法 + +V2 必须把本地状态推进拆成三个动作: + +1. `WAL append` +2. `extent materialization` +3. 
`checkpoint advancement` + +### 8.1 本地生命周期 + +```mermaid +stateDiagram-v2 + [*] --> WALDurable + WALDurable --> VisibleApplied + VisibleApplied --> ReplicaDurableEligible + ReplicaDurableEligible --> CheckpointMaterialized + CheckpointMaterialized --> CheckpointPublished + + note right of WALDurable + Local ordered truth exists + and can participate in recovery + end note + + note right of VisibleApplied + New data may be visible + before checkpoint catches up + end note + + note right of CheckpointPublished + Stable base image advances + and old WAL may become recyclable + end note +``` + +### 8.2 关键规则 + +- `extent` 可以比 `checkpoint` 更新 +- 但 crash 后真正可恢复的是: + - `checkpoint` + - 加上仍被保留、可 replay 的 WAL + +所以: + +- `VisibleLSN > CheckpointLSN` 可以合法 +- 但 `VisibleLSN > RecoverableLSN` 绝不合法 + +### 8.3 flusher / checkpoint 职责 + +flusher 不负责决定 ACK。 + +flusher 负责: + +- 将 WAL-backed dirty state 物化到 extent +- 产生新的 checkpoint / snapshot +- 在有了新的稳定基线后,帮助推进 WAL retention / GC 边界 +- 保证被对外承诺的数据仍然可恢复 + +## 9. Replica 正常复制算法 + +### 9.1 steady-state + +每个 replica 有一个稳定 sender。 + +sender 负责: + +- 顺序发 WAL record +- 发 barrier +- 处理 reconnect / handshake +- 执行 catch-up / rebuild 尾部 replay +- 拒绝旧 session 的结果 + +### 9.2 正常复制步骤 + +1. sender 从 queue 取出下一个 record +2. 按顺序发给 replica +3. replica 验证 epoch 和顺序 +4. replica 先 append 到本地 WAL 或等价 durable log +5. replica 更新 `receivedLSN` +6. 若收到 barrier,则等待本地 durable progress 达到目标 +7. replica 更新 `flushedLSN` +8. 返回 `BarrierResp` + +### 9.3 steady-state 图 + +```mermaid +sequenceDiagram + participant P as Primary Sender + participant R as Replica + + P->>R: WALRecord(LSN=n) + R->>R: validate epoch and order + R->>R: append local log + R->>R: receivedLSN = n + + P->>R: BarrierReq(LSN=n) + R->>R: wait until durable >= n + R->>R: flushedLSN = n + R-->>P: BarrierResp(flushedLSN=n) +``` + +## 10. 恢复总算法 + +### 10.1 恢复的正式入口 + +当 replica 不再能作为正常 `InSync` 复制对象时,系统不能直接“猜测”怎么修。 + +必须走明确的恢复入口: + +1. orchestrator 识别该 replica 已失去 sync eligibility +2. 
对该 replica 发出新的 `AssignmentIntent` +3. sender 建立或 supersede 一个新的 `RecoverySession` +4. 通过 handshake 获得该 replica 的正式 durable 点 +5. 对恢复路径做显式分类 + +### 10.2 handshake 输入 + +一次恢复决策至少需要: + +- `ReplicaFlushedLSN` +- `CommittedLSN` +- `RetentionStartLSN` +- 当前 `epoch` +- endpoint/version 视图 + +### 10.3 恢复分类 + +V2 把恢复明确分成三类: + +1. `ZeroGap` + `ReplicaFlushedLSN == CommittedLSN` + +2. `CatchUp` + gap 在 recoverable window 内,或 replica 需要先 truncate divergent tail + +3. `NeedsRebuild` + gap 超过 retention / payload / snapshot 可恢复边界 + +### 10.4 恢复决策图 + +```mermaid +flowchart TD + hs[HandshakeResult] + zero{ReplicaFlushedLSN == CommittedLSN} + ahead{ReplicaFlushedLSN > CommittedLSN} + recoverable{Gap provably recoverable under retention/reservation} + zeroGap[ZeroGap] + truncate[CatchUp with truncation] + catchup[CatchUp] + rebuild[NeedsRebuild] + + hs --> zero + zero -->|yes| zeroGap + zero -->|no| ahead + ahead -->|yes| truncate + ahead -->|no| recoverable + recoverable -->|yes| catchup + recoverable -->|no| rebuild +``` + +### 10.5 为什么用 `CommittedLSN` + +恢复目标必须是 `CommittedLSN`,而不是 `HeadLSN`。 + +原因是: + +- `HeadLSN` 可能包含还未形成正式外部承诺的尾部 +- failover / promotion 的安全边界必须围绕 committed prefix +- 否则会把“primary 看起来更新”误当成“lineage-safe truth” + +## 11. Catch-up 算法 + +### 11.1 进入条件 + +只有当下面条件同时满足时,才允许进入 `CatchUp`: + +1. session authority 有效 +2. 当前 epoch 未失效 +3. endpoint/version 未变化 +4. gap `(ReplicaFlushedLSN, CommittedLSN]` 可恢复 +5. 对应恢复窗口已被 reservation pin 住 + +### 11.2 执行步骤 + +1. session 进入 `Connecting` +2. handshake 后进入 `Handshake` +3. classifier 返回 `CatchUp` +4. session 设置: + - `StartLSN` + - `TargetLSN` + - 如有需要,`TruncateRequired` +5. sender 开始按顺序回放 WAL records +6. replica durably 应用并持续汇报进展 +7. sender 更新 `RecoveredTo` +8. 若达到 `TargetLSN` 且 barrier 条件满足,则 session 完成 +9. 
replica 进入 `InSync` 或进入短暂 `PromotionHold` + +### 11.3 catch-up 状态图 + +```mermaid +stateDiagram-v2 + [*] --> Connecting + Connecting --> Handshake + Handshake --> ZeroGap + Handshake --> CatchingUp + Handshake --> NeedsRebuild + Handshake --> Truncating + Truncating --> CatchingUp + CatchingUp --> PromotionHold + PromotionHold --> InSync + CatchingUp --> NeedsRebuild +``` + +### 11.4 catch-up 失败规则 + +以下任何情况都必须终止当前 catch-up: + +- reservation 失效 +- payload / WAL 保留条件失效 +- epoch bump +- endpoint change +- session 被 supersede +- 长时间无净进展 + +一旦终止,必须: + +- 拒绝旧 session 的后续结果 +- 根据原因进入 `NeedsRebuild` 或等待新的 assignment + +## 12. Rebuild 算法 + +### 12.1 何时进入 rebuild + +下列情况进入 `NeedsRebuild`: + +- `ReplicaFlushedLSN + 1 < RetentionStartLSN` +- 历史 payload 不再可解析 +- 对应 snapshot / base image 不存在或无法 pin 住 +- catch-up 期间 recoverability 条件丢失 + +### 12.2 rebuild 步骤 + +1. orchestrator 为该 replica 发出 rebuild assignment +2. sender 建立新的 rebuild session +3. primary 选择一个真实 `snapshotCpLSN` +4. pin 住: + - snapshot/base image + - `snapshotCpLSN` 之后的 tail replay window +5. replica 安装 base image +6. sender 从 `snapshotCpLSN + 1` 开始 replay 到目标 `TargetLSN` +7. barrier 确认 durable reach +8. replica 进入 `PromotionHold` / `InSync` + +### 12.3 rebuild 图 + +```mermaid +sequenceDiagram + participant O as Orchestrator + participant P as Primary + participant S as Sender + participant R as Replica + + O->>S: AssignmentIntent(rebuild) + S->>P: choose snapshotCpLSN + P->>P: pin snapshot and tail replay window + S->>R: install base snapshot + R->>R: load snapshot(cpLSN) + S->>R: replay WAL tail (cpLSN, target] + R->>R: durable apply + R-->>S: barrier/progress reached + S->>O: rebuild completed +``` + +## 13. RecoverySession 与 authority 算法 + +### 13.1 为什么要有 RecoverySession + +块设备前端看起来没有“session”概念,但恢复执行内部必须有一个 bounded object。 + +否则无法明确回答: + +- 谁拥有这次恢复尝试 +- 哪个结果是新的,哪个是晚到的旧结果 +- endpoint 变了之后旧连接还能不能继续生效 +- epoch bump 后旧 catch-up 结果还能不能落地 + +### 13.2 authority 规则 + +一个恢复 API 调用只有同时满足下面条件才有效: + +1. sender 当前仍存在 +2. 
sender 未 stopped +3. sender 当前 session 不为空 +4. `sessionID` 与当前 active session 一致 +5. session 仍处于 active phase +6. sender 的 epoch 与 volume epoch 一致 +7. endpoint/version 未变化 + +### 13.3 authority 图 + +```mermaid +flowchart TD + op[Recovery Operation] + stopped{Sender stopped?} + hasSession{Active session exists?} + idMatch{Session ID matches?} + phaseOk{Phase valid?} + epochOk{Epoch still current?} + endpointOk{Endpoint still current?} + allow[Apply mutation] + reject[Reject as stale/invalid] + + op --> stopped + stopped -->|yes| reject + stopped -->|no| hasSession + hasSession -->|no| reject + hasSession -->|yes| idMatch + idMatch -->|no| reject + idMatch -->|yes| phaseOk + phaseOk -->|no| reject + phaseOk -->|yes| epochOk + epochOk -->|no| reject + epochOk -->|yes| endpointOk + endpointOk -->|no| reject + endpointOk -->|yes| allow +``` + +## 14. Failover / promotion 算法 + +### 14.1 触发条件 + +当 primary lease 丢失、节点 crash 或被明确 demote 时,需要 volume 级 failover。 + +这不是单个 replica 的本地状态迁移,而是: + +- 整个 volume lineage 重新定根 + +### 14.2 failover 步骤 + +1. 旧 primary 丧失 authority +2. volume `Epoch++` +3. 选择 promotion candidate +4. candidate 必须满足: + - running + - epoch 可对齐 + - state 允许提升 + - `FlushedLSN >= CommittedLSN` +5. 新 primary 开始 serving +6. 旧 primary 相关的 recovery sessions 全部失效 +7. 其余 replicas 相对新 primary 重新做 handshake / classify + +### 14.3 failover 图 + +```mermaid +flowchart TD + old[Old Primary loses lease] + bump[Epoch++] + choose[Choose promotion candidate] + eligible{Candidate has committed prefix?} + promote[Promote to new primary] + invalidate[Invalidate old sessions/messages] + reclassify[Reclassify all replicas] + fail[No safe candidate] + + old --> bump + bump --> choose + choose --> eligible + eligible -->|yes| promote + eligible -->|no| fail + promote --> invalidate + invalidate --> reclassify +``` + +### 14.4 promotion 的原则 + +默认规则应当是保守的: + +- 宁可没有 candidate,也不要提升一个不具备 committed prefix 的节点 + +否则最危险的错误就是: + +- 用户以为已 durable / 已 ACK 的数据,在 failover 后找不到 + +## 15. 
Crash recovery 语义 + +### 15.1 primary 本地 crash + +primary restart 后必须能够根据: + +- 最近 checkpoint +- retained WAL + +重建出新的运行状态。 + +### 15.2 重要边界 + +必须允许: + +- `visible state > checkpoint` + +但必须保证: + +- 所有已 visible 的状态都有合法恢复来源,或者 crash 后不会再被当作正式状态 + +### 15.3 crash 语义图 + +```mermaid +flowchart TD + run[Running state] + cp[CheckpointLSN = C] + wal["Retained WAL covers (C, R]"] + crash[Crash] + restart[Restart] + replay[Replay retained WAL] + recover[Recoverable state up to R] + illegal["Illegal: visible/acked beyond recoverable"] + + run --> cp + run --> wal + cp --> crash + wal --> crash + crash --> restart + restart --> replay + replay --> recover + run --> illegal +``` + +## 16. Simulator 应重点验证的算法义务 + +V2 如果要进入更真实实现,simulator 至少要系统证明以下几类事情: + +### 16.1 ACK 可恢复性 + +- `flush/sync ACK` 返回成功后 +- crash / restart / failover 后仍可恢复到该边界 + +### 16.2 可见性合法性 + +- 运行期看到的新数据 +- 必须来自 WAL durable 或 checkpoint lineage +- 不能出现 visible-but-unrecoverable state + +### 16.3 Catch-up 收敛性 + +- replica 不能无限期 `CatchingUp` +- 要么收敛,要么显式 `NeedsRebuild` + +### 16.4 历史正确性 + +- 对目标 `LSN` 的恢复结果必须匹配 reference state +- 不能拿 current extent 伪造旧历史 + +### 16.5 stale authority fencing + +- epoch 变化 +- endpoint 变化 +- session supersede +- late barrier / late catch-up result + +都不能修改当前 truth + +## 17. 方向微调:第一性思考与 Mayastor 启发 + +这一节不是推翻 `V2`,而是回答一个更关键的问题: + +- 在确认 `V2` 大方向正确之后,是否还需要收紧目标、减少复杂恢复逻辑? + +当前判断是: + +- **需要微调,但不需要换方向** + +### 17.1 思维过程:先看 block 的第一性问题 + +判断 `V2` 是否该微调,不能先从“现有代码已经写了什么”出发,而应先问 block 产品最不可回避的本质是什么。 + +从第一性原理看,block 的核心不是: + +- volume 编排 +- 控制面外形 +- 接口包装 + +而是下面四件事: + +1. `write` 在什么时候算成立 +2. `flush/fsync ACK` 到底承诺了什么 +3. failover 后用户已收到 ACK 的边界是否仍然成立 +4. 
replica 永远不完全同步时,系统如何定义真实可承诺边界 + +这四件事如果没有被做硬,那么无论产品外形多完整,都还不能算真正可信的 block 产品。 + +因此,`V2` 最值得坚持的主轴仍然是: + +- `CommittedLSN` +- durable progress +- `RecoverySession` +- stale fencing +- `CatchUp / NeedsRebuild / Rebuild` + +### 17.2 为什么还要微调 + +虽然主轴正确,但 `V2` 仍然存在一种风险: + +- 为了尽量避免 `rebuild`,把 `catch-up` 做得越来越聪明 + +这会带来新的债: + +- recovery session 生命周期过长 +- target 跟着 live head 漂移 +- 一个 lagging replica 长期消耗 primary 的 WAL retention +- recover 与 live WAL 并存时形成双流复杂度 +- 系统长期停留在 `CatchingUp`,却没有真正恢复 + +也就是说,`V2` 的风险不在于方向错,而在于: + +- **可能在正确方向上走得过深,重新长出不必要的复杂 transmission** + +### 17.3 Mayastor 的第一性启发 + +`Mayastor` 给 `sw-block` 的最大启发,不是某个具体的 WAL 算法,而是另一种产品化思维: + +- block 产品不必把所有恢复复杂度都压在增量追赶上 +- `rebuild` 不是羞耻路径,而是正式主路径 +- volume / replica / target / control plane 应该是明确对象 +- 系统要接受“某些副本不值得继续低成本追赶”的现实 + +从这个角度看,`Mayastor` 更接近: + +- block 产品的工程外形 +- volume 服务的组织方式 +- 明确的 replica lifecycle + +但 `Mayastor` 并没有替代 `V2` 的核心语义问题: + +- `flush ACK` 到底何时成立 +- failover 后 committed truth 如何保住 +- stale authority 如何 fencing + +所以正确的吸收方式不是“改走 Mayastor 路线”,而是: + +- **保留 `V2` 的语义内核** +- **吸收 `Mayastor` 对正式 rebuild 路径和产品组织的启发** + +### 17.4 微调结论:用正式 rebuild 替换过度复杂的 catch-up + +因此,当前最合理的方向微调是: + +- 不把 `CatchUp` 当作“尽量避免 rebuild 的万能恢复手段” +- 而把它收紧为: + - 短 gap + - 有界 target + - 有时间预算 + - 有进展预算 + - 有 recoverability/reservation 预算 + +一旦超出这些边界,就应该: + +- 明确终止当前 `CatchUp` +- 进入 `NeedsRebuild` +- 再走正式 `Rebuild` + +这不是保守,而是更接近成熟 block 产品的现实: + +- `CatchUp` 是便宜路径 +- `Rebuild` 是正式路径 +- 不能为了少做 rebuild,而把系统拖进长期复杂恢复状态 + +### 17.4A catch-up 与 rebuild 的职责划分 + +这里需要进一步把 `CatchUp` 与 `Rebuild` 的职责说清楚,否则实现很容易再次滑回“尽量避免 rebuild,所以不断扩大 catch-up 能力”的旧习惯。 + +`CatchUp` 不应被理解为一个与 `Rebuild` 对等、且可以无限扩展的恢复体系。更准确地说: + +- `CatchUp` 是 `KeepUp` 的放松态 +- 它只负责短 gap、短期、有界、可证明可恢复的 WAL replay +- 它依赖 replica 当前 base 仍然可信 +- 它依赖 primary 仍保留 `(ReplicaFlushedLSN, TargetLSN]` 所需历史 +- 它的价值在于成本明显低于 `Rebuild` + +一旦这些前提不再成立,系统不应继续把复杂度堆入 `CatchUp`,而应显式进入 `NeedsRebuild`,再走正式 `Rebuild`。 + +`Rebuild` 则应被视为更 general 的恢复框架。它不假设 target replica 
当前状态仍可直接追赶,而是通过一个可信 `base` 把 replica 带回某个明确目标点: + +1. 冻结 `TargetLSN` +2. 选择并 pin 一个可信 `base` +3. 将 replica 恢复到该 `base` +4. 如有需要,补齐 `(BaseLSN, TargetLSN]` 的 tail +5. 通过 durable barrier 确认 replica 已达到 `TargetLSN` +6. 再接回 `KeepUp / InSync` + +因此,`full rebuild` 与 `partial rebuild` 不应被理解为两套不同协议,而应被理解为同一 `Rebuild` 合同下对 `base` 和传输量的不同选择: + +- `full rebuild` + - 下载完整 pinned snapshot / base image + - 必要时再补 tail +- `partial rebuild` + - replica 已有较老但可信的 base + - 通过 `bitmap` / `diff` / `snapshot + tail` 只补足达到 target 所需的数据 + +两者共同的正确性前提都是: + +- 恢复目标必须是冻结的 `TargetLSN` +- 恢复依赖的 snapshot / base 必须被 pin 住 +- 不允许直接用持续变化的 live extent 作为历史目标点数据来源 + +这一定义意味着: + +- `CatchUp` 应继续收紧为短 gap、低成本、强约束路径 +- `Rebuild` 应被当作正式主恢复路径,而不是失败后的羞耻 fallback +- 后续优化(例如 `bitmap` / range rebuild)应优先被建模为 `Rebuild` 的优化分支,而不是继续把复杂度堆入 `CatchUp` + +### 17.5 建议收紧的具体点 + +#### 1. 收紧 `CatchUp` + +`CatchUp` 应只覆盖: + +- 短 outage +- 短 gap +- recoverability 清楚 +- 成本明显低于 rebuild + +不应覆盖: + +- 长时间追 moving head +- 长时间阻塞 WAL GC +- 长时间无净进展 + +#### 2. 恢复 contract 只追 bounded target + +一个 recovery session 只对 `(R, H0]` 负责: + +- `R = ReplicaFlushedLSN` +- `H0 = 本次 primary 分配的目标边界` + +`> H0` 的 live WAL 不应让当前 session 的完成条件漂移。 + +#### 3. `recover -> keepup` 必须有明确 handoff + +session 完成后: + +- 释放 reservation 和历史恢复债 +- 经过 `PromotionHold` 或等价稳定条件 +- 再回 `KeepUp / InSync` + +而不是让 recovery session 无限延长为长期 keepup。 + +#### 4. `Rebuild` 升格为一级路径 + +`Rebuild` 不应只被视为: + +- catch-up 失败后的被动补丁 + +而应被视为: + +- 长 gap +- 高成本恢复 +- recoverability 不稳定 +- 持续 tail-chasing + +时的正式恢复选择。 + +### 17.6 微调后的核心判断 + +微调后的 `V2` 不应再被理解成: + +- “把 WAL 恢复做得越来越聪明” + +而应理解成: + +- **把 block 的真实同步边界做硬** +- **把 `CatchUp` 收紧成短 gap、低成本、有限时间的 contract** +- **把 `Rebuild` 升格成正式主路径** +- **把 Smart WAL 等更高复杂度扩展延后到基础复制契约稳定之后** + +一句话总结就是: + +- **`V2` 不换方向,但要从“雄心更大”微调为“边界更硬、目标更窄、恢复更有预算”。** + +## 18. 
推荐的实现切片 + +为了让实现顺序和算法风险一致,推荐切片如下: + +### Slice 1: Sender / RecoverySession authority + +先解决: + +- 每 replica 一个 sender +- 一次只允许一个 active recovery session +- stale session result rejection + +### Slice 2: Outcome classification + assignment orchestration + +再解决: + +- `ZeroGap / CatchUp / NeedsRebuild` +- `AssignmentIntent` +- sender group reconcile + +### Slice 3: Historical recoverability model + +再把: + +- `CommittedLSN` +- WAL retention +- checkpoint/snapshot base +- recoverability proof + +做成可执行模型 + +### Slice 4: Crash-consistency simulator + +最后重点加强: + +- `visible state` +- `recoverable state` +- `acked state` +- flusher / checkpoint / replay 之间的边界 + +## 19. 总结 + +V2 的真正算法核心,不是“有一个 WAL”这么简单。 + +它真正要建立的是一整套明确边界: + +- 用 `WAL` 表示顺序与近期历史 +- 用 `CommittedLSN` 表示外部承诺边界 +- 用 `RecoverySession` 表示恢复 authority +- 用 `catch-up` / `rebuild` 表示恢复分类 +- 用 `checkpoint + replay` 表示 crash 后正式可恢复状态 + +因此 V2 可以允许: + +- `WAL-first visibility` + +但绝不能允许: + +- `ACK-first illusion` +- `visible-but-unrecoverable state` +- `stale authority mutates current lineage` + +如果这几个边界都被 simulator、prototype 和真实 runner 分层证明,那么 `V2` 才有资格从“架构方向”进入“真实引擎实现”。 diff --git a/sw-block/design/v2-engine-readiness-review.md b/sw-block/design/v2-engine-readiness-review.md new file mode 100644 index 000000000..b99afdc27 --- /dev/null +++ b/sw-block/design/v2-engine-readiness-review.md @@ -0,0 +1,170 @@ +# V2 Engine Readiness Review + +Date: 2026-03-29 +Status: active +Purpose: record the decision on whether the current V2 design + prototype + simulator stack is strong enough to begin real V2 engine slicing + +## Decision + +Current judgment: + +- proceed to real V2 engine planning +- do not open a `V2.5` redesign track at this time + +This is a planning-readiness decision, not a production-readiness claim. + +## Why This Review Exists + +The project has now completed: + +1. design/FSM closure for the V2 line +2. 
protocol simulation closure for: + - V1 / V1.5 / V2 comparison + - timeout/race behavior + - ownership/session semantics +3. standalone prototype closure for: + - sender/session ownership + - execution authority + - recovery branching + - minimal historical-data proof + - prototype scenario closure +4. `Phase 4.5` hardening for: + - bounded `CatchUp` + - first-class `Rebuild` + - crash-consistency / restart-recoverability + - `A5-A8` stronger evidence + +So the question is no longer: + +- "can the prototype be made richer?" + +The question is: + +- "is the evidence now strong enough to begin real engine slicing?" + +## Evidence Summary + +### 1. Design / Protocol + +Primary docs: + +- `sw-block/design/v2-acceptance-criteria.md` +- `sw-block/design/v2-open-questions.md` +- `sw-block/design/v2_scenarios.md` +- `sw-block/design/v1-v15-v2-comparison.md` +- `sw-block/design/v2-prototype-roadmap-and-gates.md` + +Judgment: + +- protocol story is coherent +- acceptance set exists +- major V1 / V1.5 failures are mapped into V2 scenarios + +### 2. Simulator + +Primary code/tests: + +- `sw-block/prototype/distsim/` +- `sw-block/prototype/distsim/eventsim.go` +- `learn/projects/sw-block/test/results/v2-simulation-review.md` + +Judgment: + +- strong enough for protocol/design validation +- strong enough to challenge crash-consistency and liveness assumptions +- not a substitute for real engine / hardware proof + +### 3. Prototype + +Primary code/tests: + +- `sw-block/prototype/enginev2/` +- `sw-block/prototype/enginev2/acceptance_test.go` + +Judgment: + +- ownership is explicit and fenced +- execution authority is explicit and fenced +- bounded `CatchUp` is semantic, not documentary +- `Rebuild` is a first-class sender-owned path +- historical-data and recoverability reasoning are executable + +### 4. 
`A5-A8` Double Evidence + +Prototype-side grouped evidence: + +- `sw-block/prototype/enginev2/acceptance_test.go` + +Simulator-side grouped evidence: + +- `sw-block/design/a5-a8-traceability.md` +- `sw-block/prototype/distsim/` + +Judgment: + +- the critical acceptance items that most affect engine risk now have materially stronger proof on both sides + +## What Is Good Enough Now + +The following are good enough to begin engine slicing: + +1. sender/session ownership model +2. stale authority fencing +3. recovery orchestration shape +4. bounded `CatchUp` contract +5. `Rebuild` as formal path +6. committed/recoverable boundary thinking +7. crash-consistency / restart-recoverability proof style + +## What Is Still Not Proven + +The following still require real engine work and later real-system validation: + +1. actual engine lifecycle integration +2. real storage/backend implementation +3. real control-plane integration +4. real durability / fsync behavior under the actual engine +5. real hardware timing / performance +6. final production observability and failure handling + +These are expected gaps. They do not block engine planning. + +## Open Risks To Carry Forward + +These are not blockers, but they should remain explicit: + +1. prototype and simulator are still reduced models +2. rebuild-source quality in the real engine will depend on actual checkpoint/base-image mechanics +3. durability truth in the real engine must still be re-proven against actual persistence behavior +4. predicate exploration can still grow, but should not block engine slicing + +## Engine-Planning Decision + +Decision: + +- start real V2 engine planning + +Reason: + +1. no current evidence points to a structural flaw requiring `V2.5` +2. the remaining gaps are implementation/system gaps, not prototype ambiguity +3. continuing to extend prototype/simulator breadth would have diminishing returns + +## Required Outputs After This Review + +1. `sw-block/design/v2-engine-slicing-plan.md` +2. 
first real engine slice definition +3. explicit non-goals for first engine stage +4. explicit validation plan for engine slices + +## Non-Goals Of This Review + +This review does not claim: + +1. V2 is production-ready +2. V2 should replace V1 immediately +3. all design questions are forever closed + +It only claims: + +- the project now has enough evidence to begin disciplined real engine slicing diff --git a/sw-block/design/v2-engine-slicing-plan.md b/sw-block/design/v2-engine-slicing-plan.md new file mode 100644 index 000000000..aeb919725 --- /dev/null +++ b/sw-block/design/v2-engine-slicing-plan.md @@ -0,0 +1,191 @@ +# V2 Engine Slicing Plan + +Date: 2026-03-29 +Status: active +Purpose: define the first real V2 engine slices after prototype and `Phase 4.5` closure + +## Goal + +Move from: + +- standalone design/prototype truth under `sw-block/prototype/` + +to: + +- a real V2 engine core under `sw-block/` + +without dragging V1.5 lifecycle assumptions into the implementation. + +## Planning Rules + +1. reuse V1 ideas and tests selectively, not structurally +2. prefer narrow vertical slices over broad skeletons +3. each slice must preserve the accepted V2 ownership/fencing model +4. keep simulator/prototype as validation support, not as the implementation itself +5. do not mix V2 engine work into `weed/storage/blockvol/` + +## First Engine Stage + +The first engine stage should build the control/recovery core, not the full storage engine. + +That means: + +1. per-replica sender identity +2. one active recovery session per replica per epoch +3. sender-owned execution authority +4. explicit recovery outcomes: + - zero gap + - bounded catch-up + - rebuild +5. 
rebuild execution shell only + - do not hard-code final snapshot + tail vs full base decision logic yet + - keep real rebuild-source choice tied to Slice 3 recoverability inputs + +## Recommended Slice Order + +### Slice 1: Engine Ownership Core + +Purpose: + +- carry the accepted `enginev2` ownership/fencing model into the real engine core + +Scope: + +1. stable per-replica sender object +2. stable recovery-session object +3. session identity fencing +4. endpoint / epoch invalidation +5. sender-group or equivalent ownership registry + +Acceptance: + +1. stale session results cannot mutate current authority +2. changed-address and epoch-bump invalidation work in engine code +3. the 4 V2-boundary ownership themes remain provable + +### Slice 2: Engine Recovery Execution Core + +Purpose: + +- move the prototype execution APIs into real engine behavior + +Scope: + +1. connect / handshake / catch-up flow +2. bounded `CatchUp` +3. explicit `NeedsRebuild` +4. sender-owned rebuild execution path +5. rebuild execution shell without final trusted-base selection policy + +Acceptance: + +1. bounded catch-up does not chase indefinitely +2. rebuild is exclusive from catch-up +3. session completion rules are explicit and fenced + +### Slice 3: Engine Data / Recoverability Core + +Purpose: + +- connect recovery behavior to real retained-history / checkpoint mechanics + +Scope: + +1. real recoverability decision inputs +2. trusted-base decision for rebuild source +3. minimal real checkpoint/base-image integration +4. real truncation / safe-boundary handling + +This is the first slice that should decide, from real engine inputs, between: + +1. `snapshot + tail` +2. `full base` + +Acceptance: + +1. engine can explain why recovery is allowed +2. rebuild-source choice is explicit and testable +3. 
historical correctness and truncation rules remain intact + +### Slice 4: Engine Integration Closure + +Purpose: + +- bind engine control/recovery core to real orchestration and validation surfaces + +Scope: + +1. real assignment/control intent entry path +2. engine-facing observability +3. focused real-engine tests for V2-boundary cases +4. first integration review against real failure classes + +Acceptance: + +1. key V2-boundary failures are reproduced and closed in engine tests +2. engine observability is good enough to debug ownership/recovery failures +3. remaining gaps are system/performance gaps, not control-model ambiguity + +## What To Reuse + +Good reuse candidates: + +1. tests and failure cases from V1 / V1.5 +2. narrow utility/data helpers where not coupled to V1 lifecycle +3. selected WAL/history concepts if they fit V2 ownership boundaries + +Do not structurally reuse: + +1. V1/V1.5 shipper lifecycle +2. address-based identity assumptions +3. `SetReplicaAddrs`-style behavior +4. old recovery control structure + +## Where The Work Should Live + +Real V2 engine work should continue under: + +- `sw-block/` + +Recommended next area: + +- `sw-block/core/` +or +- `sw-block/engine/` + +Exact path can be chosen later, but it should remain separate from: + +- `sw-block/prototype/` +- `weed/storage/blockvol/` + +## Validation Plan For Engine Slices + +Each engine slice should be validated at three levels: + +1. prototype alignment +- does engine behavior preserve the accepted prototype invariant? + +2. focused engine tests +- does the real engine slice enforce the same contract? + +3. scenario mapping +- does at least one important V1/V1.5 failure class remain closed? + +## Non-Goals For First Engine Stage + +Do not try to do these immediately: + +1. full Smart WAL expansion +2. performance optimization +3. V1 replacement/migration plan +4. full product integration +5. 
all storage/backend redesign at once + +## Immediate Next Assignment + +The first concrete engine-planning task should be: + +1. choose the real V2 engine module location under `sw-block/` +2. define Slice 1 file/module boundaries +3. write a short engine ownership-core spec +4. map 3-5 acceptance scenarios directly onto Slice 1 expectations diff --git a/sw-block/design/v2-production-roadmap.md b/sw-block/design/v2-production-roadmap.md new file mode 100644 index 000000000..65c88fca5 --- /dev/null +++ b/sw-block/design/v2-production-roadmap.md @@ -0,0 +1,199 @@ +# V2 Production Roadmap + +Date: 2026-03-30 +Status: active +Purpose: define the path from the accepted V2 engine core to a production candidate + +## Current Position + +Completed: + +1. design / FSM closure +2. simulator / protocol validation +3. prototype closure +4. evidence hardening +5. engine core slices: + - Slice 1 ownership core + - Slice 2 recovery execution core + - Slice 3 data / recoverability core + - Slice 4 integration closure + +Current stage: + +- entering broader engine implementation + +This means the main risk is no longer: + +- whether the V2 idea stands up + +The main risk is: + +- whether the accepted engine core can be turned into a real system without reintroducing V1/V1.5 structure and semantics + +## Roadmap Summary + +1. Phase 06: broader engine implementation stage +2. Phase 07: real-system integration / product-path decision +3. Phase 08: pre-production hardening +4. Phase 09: performance / scale / soak validation +5. Phase 10: production candidate and rollout gate + +## Phase 06 + +### Goal + +Connect the accepted engine core to: + +1. real control truth +2. real storage truth +3. explicit engine execution steps + +### Outputs + +1. control-plane adapter into the engine core +2. storage/base/recoverability adapters +3. explicit execution-driver model where synchronous helpers are no longer sufficient +4. 
validation against selected real failure classes + +### Gate + +At the end of Phase 06, the project should be able to say: + +- the engine core can live inside a real system shape + +## Phase 07 + +### Goal + +Move from engine-local correctness to a real runnable subsystem. + +### Outputs + +1. service-style runnable engine slice +2. integration with real control and storage surfaces +3. crash/failover/restart integration tests +4. decision on the first viable product path + +### Gate + +At the end of Phase 07, the project should be able to say: + +- the engine can run as a real subsystem, not only as an isolated core + +## Phase 08 + +### Goal + +Turn correctness into operational safety. + +### Outputs + +1. observability hardening +2. operator/debug flows +3. recovery/runbook procedures +4. config surface cleanup +5. realistic durability/restart validation + +### Gate + +At the end of Phase 08, the project should be able to say: + +- operators can run, debug, and recover the system safely + +## Phase 09 + +### Goal + +Prove viability under load and over time. + +### Outputs + +1. throughput / latency baselines +2. rebuild / catch-up cost characterization +3. steady-state overhead measurement +4. soak testing +5. scale and failure-under-load validation + +### Gate + +At the end of Phase 09, the project should be able to say: + +- the design is not only correct, but viable at useful scale and duration + +## Phase 10 + +### Goal + +Produce a controlled production candidate. + +### Outputs + +1. feature-gated production candidate +2. rollback strategy +3. migration/coexistence plan with V1 +4. staged rollout plan +5. production acceptance checklist + +### Gate + +At the end of Phase 10, the project should be able to say: + +- the system is ready for a controlled production rollout + +## Cross-Phase Rules + +### Rule 1: Do not reopen protocol shape casually + +The accepted core should remain stable unless new implementation evidence forces a change. 
+ +### Rule 2: Use V1 as validation source, not design template + +Use: + +1. `learn/projects/sw-block/` +2. `weed/storage/block*` + +for: + +1. failure gates +2. constraints +3. integration references + +Do not use them as the default V2 architecture template. + +### Rule 3: Keep `CatchUp` narrow + +Do not let later implementation phases re-expand `CatchUp` into a broad, optimistic, long-lived recovery mode. + +### Rule 4: Keep evidence quality ahead of object growth + +New work should preferentially improve: + +1. traceability +2. diagnosability +3. real-failure validation +4. operational confidence + +not simply add new objects, states, or mechanisms. + +## Production Readiness Ladder + +The project should move through this ladder explicitly: + +1. proof-of-design +2. proof-of-engine-shape +3. proof-of-runnable-engine-stage +4. proof-of-operable-system +5. proof-of-viable-production-candidate + +Current ladder position: + +- between `2` and `3` +- engine core accepted; broader runnable engine stage underway + +## Next Documents To Maintain + +1. `sw-block/.private/phase/phase-06.md` +2. `sw-block/design/v2-engine-readiness-review.md` +3. `sw-block/design/v2-engine-slicing-plan.md` +4. this roadmap diff --git a/sw-block/design/v2-protocol-truths.md b/sw-block/design/v2-protocol-truths.md new file mode 100644 index 000000000..6f4eab667 --- /dev/null +++ b/sw-block/design/v2-protocol-truths.md @@ -0,0 +1,561 @@ +# V2 Protocol Truths + +Date: 2026-03-30 +Status: active +Purpose: record the compact, stable truths that later phases must preserve, and provide a conformance reference for implementation reviews + +## Why This Document Exists + +`FSM`, `simulator`, `prototype`, and `engine` are not a code-production pipeline. +They are an evidence ladder. + +So the most important output to carry forward is not only code, but: + +1. accepted semantics +2. must-hold boundaries +3. failure classes that must stay closed +4. 
explicit places where later phases may improve or drift + +This document is the compact truth table for the V2 line. + +## How To Use It + +For each later phase or slice, ask: + +1. does the new implementation remain aligned with these truths? +2. if not, is the deviation constructive or risky? +3. which truth is newly strengthened by this phase? + +Deviation labels: + +- `Aligned`: implementation preserves the truth +- `Constructive deviation`: implementation changes shape but strengthens the truth +- `Risky deviation`: implementation weakens or blurs the truth + +## Core Truths + +### T1. `CommittedLSN` is the external truth boundary + +Short form: + +- external promises are anchored at `CommittedLSN`, not `HeadLSN` + +Meaning: + +- recovery targets +- promotion safety +- flush/visibility reasoning + +must all be phrased against `CommittedLSN`. + +Prevents: + +- using optimistic WAL head as committed truth +- acknowledging lineage that failover cannot preserve + +Evidence anchor: + +- strong in design +- strong in simulator +- strong in prototype +- strong in engine + +### T2. `ZeroGap <=> ReplicaFlushedLSN == CommittedLSN` + +Short form: + +- zero-gap requires exact equality with committed truth + +Meaning: + +- replica ahead is not zero-gap +- replica behind is not zero-gap + +Prevents: + +- unsafe fast-path completion +- replica-ahead being mistaken for in-sync + +Evidence anchor: + +- strong in prototype +- strong in engine + +### T3. `CatchUp` is bounded replay on a still-trusted base + +Short form: + +- `CatchUp = KeepUp with bounded debt` + +Meaning: + +- catch-up is a short-gap, low-cost, bounded replay path +- it only makes sense while the replica base is still trustworthy enough to continue from + +Prevents: + +- turning catch-up into indefinite moving-head chase +- hiding broad recovery complexity in replay logic + +Evidence anchor: + +- strong in design +- strong in simulator +- strong in prototype +- strong in engine + +### T4. 
`NeedsRebuild` is explicit when replay is not the right answer + +Short form: + +- `NeedsRebuild <=> replay is unrecoverable, unstable, or no longer worth bounded replay` + +Meaning: + +- long-gap +- lost recoverability +- no trusted base +- budget violation + +must escalate explicitly. + +Prevents: + +- pretending catch-up will eventually succeed +- carrying V1/V1.5-style unbounded degraded chase forward + +Evidence anchor: + +- strong in simulator +- strong in prototype +- strong in engine + +### T5. `Rebuild` is the formal primary recovery path + +Short form: + +- `Rebuild = frozen TargetLSN + trusted base + optional tail + barrier` + +Meaning: + +- rebuild is not a shameful fallback +- it is the general recovery framework + +Prevents: + +- overloading catch-up with broad recovery semantics +- treating full/partial rebuild as unrelated protocols + +Evidence anchor: + +- strong in design +- strong in prototype +- strong in engine + +### T6. Full and partial rebuild share one correctness contract + +Short form: + +- `full rebuild` and `partial rebuild` differ in transfer choice, not in truth model + +Meaning: + +- both require frozen `TargetLSN` +- both require trusted pinned base +- both require explicit durable completion + +Prevents: + +- optimization layers redefining protocol truth +- bitmap/range paths bypassing trusted-base rules + +Evidence anchor: + +- strong in design +- partial in engine +- stronger real-system proof still deferred + +### T7. No recovery result may outlive its authority + +Short form: + +- `ValidMutation <=> sender exists && sessionID matches && epoch current && endpoint current` + +Meaning: + +- stale session +- stale epoch +- stale endpoint +- stale sender + +must all fail closed. + +Prevents: + +- late results mutating current lineage +- changed-address stale completion bugs + +Evidence anchor: + +- strong in simulator +- strong in prototype +- strong in engine + +### T8. 
`ReplicaID` is stable identity; `Endpoint` is mutable location + +Short form: + +- `ReplicaID != address` + +Meaning: + +- address changes may invalidate sessions +- address changes must not destroy sender identity + +Prevents: + +- reintroducing address-shaped identity +- changed-address restarting as logical removal + add + +Evidence anchor: + +- strong in prototype +- strong in engine +- strong in bridge P0 + +### T9. Truncation is a protocol boundary, not cleanup + +Short form: + +- replica-ahead cannot complete until divergent tail is explicitly truncated + +Meaning: + +- truncation is part of recovery contract +- not a side-effect or best-effort cleanup + +Prevents: + +- completing recovery while replica still contains newer divergent writes + +Evidence anchor: + +- strong in design +- strong in engine + +### T10. Recoverability must be proven from real retained history + +Short form: + +- `CatchUp allowed <=> required replay range is recoverable from retained history` + +Meaning: + +- the engine should consume storage truth +- not test-reconstructed optimism + +Prevents: + +- replay on missing WAL +- fake recoverability based only on watermarks + +Evidence anchor: + +- strong in simulator +- strong in engine +- strengthened in driver/adapter phases + +### T11. Trusted-base choice must be explicit and causal + +Short form: + +- `snapshot_tail` requires both trusted checkpoint and replayable tail + +Meaning: + +- snapshot existence alone is insufficient +- fallback to full-base must be explainable + +Prevents: + +- over-trusting old checkpoints +- silently choosing an invalid rebuild source + +Evidence anchor: + +- strong in simulator +- strong in engine +- strengthened by Phase 06 + +### T12. 
Current extent cannot fake old history + +Short form: + +- historical correctness requires reconstructable history, not current-state approximation + +Meaning: + +- live extent state is not sufficient proof of an old target point +- historical reconstruction must be justified by checkpoint + retained history + +Prevents: + +- using current extent as fake proof of older state + +Evidence anchor: + +- strongest in simulator +- engine currently proves prerequisites, not full reconstruction proof + +### T13. Promotion requires recoverable committed prefix + +Short form: + +- promoted replica must be able to recover committed truth, not merely advertise a high watermark + +Meaning: + +- candidate selection is about recoverable lineage, not optimistic flush visibility + +Prevents: + +- promoting a replica that cannot reconstruct committed prefix after crash/restart + +Evidence anchor: + +- strong in simulator +- partially carried into engine semantics +- real-system validation still needed + +### T14. `blockvol` executes I/O; engine owns recovery policy + +Short form: + +- adapters may translate engine decisions into concrete work +- they must not silently re-decide recovery classification or source choice + +Meaning: + +- master remains control authority +- engine remains recovery authority +- storage remains truth source + +Prevents: + +- V1/V1.5 policy leakage back into service glue + +Evidence anchor: + +- strong in Phase 07 service-slice planning +- initial bridge P0 aligns +- real-system proof still pending + +### T15. 
Reuse reality, not inherited semantics + +Short form: + +- V2 may reuse existing Seaweed control/runtime/storage paths +- it must not inherit old semantics as protocol truth + +Meaning: + +- reuse existing heartbeat, assignment, `blockvol`, receiver, shipper, retention, and runtime machinery when useful +- keep `ReplicaID`, epoch authority, recovery classification, committed truth, and rebuild boundaries anchored in accepted V2 semantics + +Prevents: + +- V1/V1.5 structure silently redefining V2 behavior +- convenience reuse turning old runtime assumptions into new protocol truth + +Evidence anchor: + +- strong in Phase 07/08 direction +- should remain active in later implementation phases + +## Current Strongest Evidence By Layer + +| Layer | Main value | +|------|------------| +| `FSM` / design | define truth and non-goals | +| simulator | prove protocol truth and failure-class closure cheaply | +| prototype | prove implementation-shape and authority semantics cheaply | +| engine | prove the accepted contracts survive real implementation structure | +| service slice / runner | prove truth survives real control/storage/system reality | + +## Phase Conformance Notes + +### Phase 04 + +- `Aligned`: T7, T8 +- strengthened sender/session ownership and stale rejection + +### Phase 4.5 + +- `Aligned`: T3, T4, T5, T10, T12 +- major tightening: + - bounded catch-up + - first-class rebuild + - crash-consistency and recoverability proof style + +### Phase 05 + +- `Aligned`: T1, T2, T3, T4, T5, T7, T8, T9, T10, T11 +- engine core slices closed: + - ownership + - execution + - recoverability gating + - orchestrated entry path + +### Phase 06 + +- `Aligned`: T10, T11, T14 +- `Constructive deviation`: planner/executor split replaced convenience wrappers without changing protocol truth +- strengthened: + - real storage/resource contracts + - explicit release symmetry + - failure-class validation against engine path + +### Phase 07 P0 + +- `Aligned`: T8, T10, T14 +- bridge now 
makes stable `ReplicaID` explicit at service boundary +- bridge states the hard rule that engine decides policy and `blockvol` executes I/O +- real `weed/storage/blockvol/` integration still pending + +## Current Carry-Forward Truths For Later Phases + +Later phases must not regress these: + +1. `CommittedLSN` remains the external truth boundary +2. `CatchUp` stays narrow and bounded +3. `Rebuild` remains the formal primary recovery path +4. stale authority must fail closed +5. stable identity must remain separate from mutable endpoint +6. trusted-base choice must remain explicit and causal +7. service glue must not silently re-decide recovery policy +8. reuse reality, but do not inherit old semantics as V2 truth + +## Review Rule + +Every later phase or slice should explicitly answer: + +1. which truths are exercised? +2. which truths are strengthened? +3. does this phase introduce any constructive or risky deviation? +4. which evidence layer now carries the truth most strongly? + +## Phase Alignment Rule + +From `Phase 05` onward, every phase or slice should align explicitly against this document. + +Minimum phase-alignment questions: + +1. which truths are in scope? +2. which truths are strengthened? +3. which truths are merely carried forward? +4. does the phase introduce any constructive deviation? +5. does the phase introduce any risky deviation? +6. which evidence layer currently carries each in-scope truth most strongly? 
+ +Expected output shape for each later phase: + +- `In-scope truths` +- `Strengthened truths` +- `Carry-forward truths` +- `Constructive deviations` +- `Risky deviations` +- `Evidence shift` + +## Phase 05-07 Alignment + +### Phase 05 + +Primary alignment focus: + +- T1 `CommittedLSN` as external truth boundary +- T2 zero-gap exactness +- T3 bounded `CatchUp` +- T4 explicit `NeedsRebuild` +- T5/T6 rebuild correctness contract +- T7 stale authority must fail closed +- T8 stable `ReplicaID` +- T9 truncation as protocol boundary +- T10/T11 recoverability and trusted-base gating + +Main strengthening: + +- engine core adopted accepted protocol truths as real implementation structure + +Main review risk: + +- engine structure accidentally collapsing back to address identity or unfenced execution + +### Phase 06 + +Primary alignment focus: + +- T10 recoverability from real retained history +- T11 trusted-base choice remains explicit and causal +- T14 engine owns policy, adapters carry truth and execution contracts + +Main strengthening: + +- planner/executor/resource contracts +- fail-closed cleanup symmetry +- cross-layer proof path through engine execution + +Main review risk: + +- executor or adapters recomputing policy from convenience inputs +- storage/resource contracts becoming approximate instead of real + +### Phase 07+ + +Primary alignment focus: + +- T8 stable identity at the real service boundary +- T10 real storage truth into engine decisions +- T11 trusted-base proof remains explicit through service glue +- T14 `blockvol` executes I/O but does not own recovery policy + +Main strengthening: + +- real-system service-slice conformance +- real control-plane and storage-plane integration +- diagnosable failure replay through the integrated path + +Main review risk: + +- V1/V1.5 semantics leaking back in through service glue +- address-shaped identity reappearing at the boundary +- blockvol-side code silently re-deciding recovery policy + +## Future Feature Rule 
+ +When a later feature expands the protocol surface (for example `SmartWAL` or a new rebuild optimization), the order should be: + +1. `FSM / design` +- define the new semantics and non-goals + +2. `Truth update` +- either attach the feature to an existing truth +- or add a new protocol truth if the feature creates a new long-lived invariant + +3. `Phase alignment` +- define which later phases strengthen or validate that truth + +4. `Evidence ladder` +- simulator, prototype, engine, service slice as needed + +Do not start feature implementation by editing engine or service glue first and only later trying to explain what truth changed. + +## Feature Review Rule + +For any future feature, later reviews should ask: + +1. did the feature create a new truth or just strengthen an existing one? +2. which phase first validates it? +3. which evidence layer proves it most strongly today? +4. does the feature weaken any existing truth? + +This keeps feature growth aligned with protocol truth instead of letting implementation convenience define semantics. diff --git a/sw-block/prototype/distsim/cluster.go b/sw-block/prototype/distsim/cluster.go index 6e65a5e55..15bc6f3f3 100644 --- a/sw-block/prototype/distsim/cluster.go +++ b/sw-block/prototype/distsim/cluster.go @@ -1066,9 +1066,10 @@ type CandidateEligibility struct { } // EvaluateCandidateEligibility checks all promotion prerequisites for a node. -// A candidate must have the full committed prefix (FlushedLSN >= CommittedLSN) -// to be eligible. Promoting a replica that is missing committed data would -// lose acknowledged writes. +// Phase 4.5: uses RecoverableLSN (not just FlushedLSN) to verify that the +// candidate can actually recover the committed prefix after a crash+restart, +// not just that it received durable WAL entries. RecoverableLSN accounts for +// checkpoint + WAL replay availability. 
func (c *Cluster) EvaluateCandidateEligibility(candidateID string) CandidateEligibility { n := c.Nodes[candidateID] if n == nil { @@ -1084,7 +1085,11 @@ func (c *Cluster) EvaluateCandidateEligibility(candidateID string) CandidateElig if n.ReplicaState == NodeStateNeedsRebuild || n.ReplicaState == NodeStateRebuilding { reasons = append(reasons, "state_ineligible") } - if n.Storage.FlushedLSN < c.Coordinator.CommittedLSN { + // Phase 4.5: check recoverable committed prefix, not just durable watermark. + // RecoverableLSN = the highest LSN that would survive crash + restart. + // This is stronger than FlushedLSN when checkpoint + WAL GC may have + // created gaps in the replay path. + if n.Storage.RecoverableLSN() < c.Coordinator.CommittedLSN { reasons = append(reasons, "insufficient_committed_prefix") } return CandidateEligibility{ diff --git a/sw-block/prototype/distsim/cluster_test.go b/sw-block/prototype/distsim/cluster_test.go index 8fd38f82d..09237ea23 100644 --- a/sw-block/prototype/distsim/cluster_test.go +++ b/sw-block/prototype/distsim/cluster_test.go @@ -166,7 +166,7 @@ func TestZombieOldPrimaryWritesAreFenced(t *testing.T) { if c.Coordinator.CommittedLSN != 1 { t.Fatalf("stale message changed committed lsn: got=%d", c.Coordinator.CommittedLSN) } - if got := c.Nodes["r1"].Storage.Extent[42]; got != 0 { + if got := c.Nodes["r1"].Storage.LiveExtent[42]; got != 0 { t.Fatalf("stale message mutated new primary extent: block42=%d", got) } } diff --git a/sw-block/prototype/distsim/phase02_candidate_test.go b/sw-block/prototype/distsim/phase02_candidate_test.go index c24568043..d9afdc4e3 100644 --- a/sw-block/prototype/distsim/phase02_candidate_test.go +++ b/sw-block/prototype/distsim/phase02_candidate_test.go @@ -353,6 +353,7 @@ func TestP02_CandidateEligibility_InsufficientCommittedPrefix(t *testing.T) { // Manually set r1 behind committed prefix. 
c.Nodes["r1"].Storage.FlushedLSN = 0 + c.Nodes["r1"].Storage.WALDurableLSN = 0 e = c.EvaluateCandidateEligibility("r1") if e.Eligible { t.Fatal("FlushedLSN=0 with CommittedLSN=1 should not be eligible") @@ -379,14 +380,19 @@ func TestP02_CandidateEligibility_InSyncButLagging_Rejected(t *testing.T) { // r1: InSync, correct epoch, but FlushedLSN=1. Ineligible. c.Nodes["r1"].ReplicaState = NodeStateInSync c.Nodes["r1"].Storage.FlushedLSN = 1 + c.Nodes["r1"].Storage.WALDurableLSN = 1 // r2: CatchingUp, correct epoch, FlushedLSN=100. Eligible. c.Nodes["r2"].ReplicaState = NodeStateCatchingUp c.Nodes["r2"].Storage.FlushedLSN = 100 + c.Nodes["r2"].Storage.WALDurableLSN = 100 + c.Nodes["r2"].Storage.CheckpointLSN = 100 // r3: InSync, correct epoch, FlushedLSN=100. Eligible. c.Nodes["r3"].ReplicaState = NodeStateInSync c.Nodes["r3"].Storage.FlushedLSN = 100 + c.Nodes["r3"].Storage.WALDurableLSN = 100 + c.Nodes["r3"].Storage.CheckpointLSN = 100 // r1 is ineligible despite being InSync. e1 := c.EvaluateCandidateEligibility("r1") diff --git a/sw-block/prototype/distsim/phase045_adversarial_test.go b/sw-block/prototype/distsim/phase045_adversarial_test.go new file mode 100644 index 000000000..eb51d23db --- /dev/null +++ b/sw-block/prototype/distsim/phase045_adversarial_test.go @@ -0,0 +1,219 @@ +package distsim + +import ( + "math/rand" + "testing" +) + +// Phase 4.5: Adversarial predicate search. +// These tests run randomized/semi-randomized scenarios and check danger +// predicates after each step. The goal is to find protocol violations +// that handwritten scenarios might miss. + +// TestAdversarial_RandomWritesAndCrashes runs random write + crash + restart +// sequences and checks all danger predicates after each step. +func TestAdversarial_RandomWritesAndCrashes(t *testing.T) { + rng := rand.New(rand.NewSource(42)) + + for trial := 0; trial < 50; trial++ { + c := NewCluster(CommitSyncAll, "p", "r1", "r2") + + // Random sequence of operations. 
+ for step := 0; step < 30; step++ { + op := rng.Intn(10) + switch { + case op < 5: + // Write. + block := uint64(rng.Intn(10) + 1) + c.CommitWrite(block) + case op < 7: + // Tick (advance time, deliver messages). + c.Tick() + case op < 8: + // Crash a random node. + nodes := []string{"p", "r1", "r2"} + target := nodes[rng.Intn(3)] + node := c.Nodes[target] + if node.Running { + node.Storage.Crash() + node.Running = false + node.ReplicaState = NodeStateLagging + } + case op < 9: + // Restart a crashed node. + nodes := []string{"p", "r1", "r2"} + target := nodes[rng.Intn(3)] + node := c.Nodes[target] + if !node.Running { + node.Storage.Restart() + node.Running = true + node.ReplicaState = NodeStateLagging // needs catch-up + } + default: + // Flusher tick on all running nodes. + for _, node := range c.Nodes { + if node.Running { + node.Storage.ApplyToExtent(node.Storage.WALDurableLSN) + node.Storage.AdvanceCheckpoint(node.Storage.WALDurableLSN) + } + } + } + + // Check predicates after every step. + violations := CheckAllPredicates(c) + if len(violations) > 0 { + for name, detail := range violations { + t.Errorf("trial %d step %d: PREDICATE VIOLATED [%s]: %s", trial, step, name, detail) + } + t.FailNow() + } + } + } +} + +// TestAdversarial_FailoverChainWithPredicates runs a sequence of +// failovers (promote, crash, promote) and checks predicates. +func TestAdversarial_FailoverChainWithPredicates(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r1", "r2") + + // Write some data and commit. + for i := 0; i < 5; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.TickN(5) + + check := func(label string) { + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Fatalf("%s: PREDICATE VIOLATED [%s]: %s", label, name, detail) + } + } + + check("after initial writes") + + // Kill primary. + c.Nodes["p"].Running = false + c.Nodes["p"].Storage.Crash() + + // Promote r1. 
+ c.Promote("r1") + c.TickN(3) + check("after first promotion") + + // Write more under new primary. + for i := 0; i < 3; i++ { + c.CommitWrite(uint64(i + 10)) + } + c.TickN(5) + check("after writes on new primary") + + // Kill new primary. + c.Nodes["r1"].Running = false + c.Nodes["r1"].Storage.Crash() + + // Promote r2. + c.Promote("r2") + c.TickN(3) + check("after second promotion") + + // Write more under third primary. + c.CommitWrite(99) + c.TickN(5) + check("after writes on third primary") +} + +// TestAdversarial_CatchUpUnderLoad runs catch-up while the primary keeps +// writing, then checks predicates for livelock. +func TestAdversarial_CatchUpUnderLoad(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r1") + + // Write initial data. + for i := 0; i < 10; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.TickN(5) + + // Disconnect r1. + c.Nodes["r1"].Running = false + + // Write more while r1 is down. + for i := 0; i < 20; i++ { + c.CommitWrite(uint64(i + 100)) + c.Tick() + } + + // Reconnect r1 — needs catch-up. + c.Nodes["r1"].Running = true + c.Nodes["r1"].ReplicaState = NodeStateLagging + + // Attempt catch-up while primary keeps writing. + for step := 0; step < 20; step++ { + // Primary writes more. + c.CommitWrite(uint64(step + 200)) + c.Tick() + + // Attempt catch-up progress. + c.CatchUpWithEscalation("r1", 5) + + // Check predicates. + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Fatalf("step %d: PREDICATE VIOLATED [%s]: %s", step, name, detail) + } + } + + // After the loop, r1 should be either InSync or NeedsRebuild. + state := c.Nodes["r1"].ReplicaState + if state != NodeStateInSync && state != NodeStateNeedsRebuild { + t.Fatalf("r1 should be InSync or NeedsRebuild after catch-up under load, got %s", state) + } +} + +// TestAdversarial_CheckpointGCThenCrash runs checkpoint + WAL GC + crash +// sequences and verifies acked data is never lost. 
+func TestAdversarial_CheckpointGCThenCrash(t *testing.T) { + rng := rand.New(rand.NewSource(99)) + + for trial := 0; trial < 30; trial++ { + c := NewCluster(CommitSyncAll, "p", "r1") + + // Write and commit data. + for i := 0; i < 15; i++ { + c.CommitWrite(uint64(rng.Intn(20) + 1)) + } + c.TickN(10) + + // Flusher + checkpoint at various points. + for _, node := range c.Nodes { + if node.Running { + flushTo := node.Storage.WALDurableLSN + node.Storage.ApplyToExtent(flushTo) + // Checkpoint at a random point up to flush. + cpLSN := uint64(rng.Int63n(int64(flushTo+1))) + node.Storage.AdvanceCheckpoint(cpLSN) + + // GC WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range node.Storage.WAL { + if w.LSN > node.Storage.CheckpointLSN { + retained = append(retained, w) + } + } + node.Storage.WAL = retained + } + } + + // Crash primary. + primary := c.Primary() + if primary != nil { + primary.Storage.Crash() + primary.Storage.Restart() + } + + // Check predicates — committed data must still be recoverable. + violations := CheckAllPredicates(c) + for name, detail := range violations { + t.Errorf("trial %d: PREDICATE VIOLATED [%s]: %s", trial, name, detail) + } + } +} diff --git a/sw-block/prototype/distsim/phase045_crash_test.go b/sw-block/prototype/distsim/phase045_crash_test.go new file mode 100644 index 000000000..ec1fb4964 --- /dev/null +++ b/sw-block/prototype/distsim/phase045_crash_test.go @@ -0,0 +1,334 @@ +package distsim + +import ( + "testing" +) + +// Phase 4.5: Crash-consistency and recoverability tests. +// These validate invariants I1-I5 from the crash-consistency simulation plan. + +// --- Invariant I1: ACK'd flush is recoverable after any crash --- + +func TestI1_AckedFlush_RecoverableAfterPrimaryCrash(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + // Write 3 entries and commit (sync_all = durable on both nodes). 
+ for i := 0; i < 3; i++ { + c.CommitWrite(uint64(i + 1)) + } + c.Tick() + c.Tick() + c.Tick() + + if c.Coordinator.CommittedLSN < 3 { + t.Fatalf("expected CommittedLSN>=3, got %d", c.Coordinator.CommittedLSN) + } + + committedLSN := c.Coordinator.CommittedLSN + + // Crash the primary. + primary := c.Nodes["p"] + primary.Storage.Crash() + + // Restart: recover from checkpoint + durable WAL. + recoveredLSN := primary.Storage.Restart() + + // I1: all committed data must be recoverable. + if recoveredLSN < committedLSN { + t.Fatalf("I1 VIOLATED: recoveredLSN=%d < committedLSN=%d — acked data lost", + recoveredLSN, committedLSN) + } + + // Verify data correctness against reference. + refState := c.Reference.StateAt(committedLSN) + recState := primary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := recState[block]; got != expected { + t.Fatalf("I1 VIOLATED: block %d: reference=%d recovered=%d", block, expected, got) + } + } +} + +// --- Invariant I2: No ghost visible state after crash --- + +func TestI2_ExtentAheadOfCheckpoint_CrashRestart(t *testing.T) { + s := NewStorage() + + // Write 5 entries to WAL. + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: 10 + i, Value: i * 100, LSN: i}) + } + + // Make all 5 durable. + s.AdvanceFlush(5) + + // Flusher materializes entries 1-3 to live extent. + s.ApplyToExtent(3) + + // Checkpoint at LSN 1 only. + s.AdvanceCheckpoint(1) + + // Crash. + s.Crash() + if s.LiveExtent != nil { + t.Fatal("after crash, LiveExtent should be nil") + } + + // Restart. + recoveredLSN := s.Restart() + if recoveredLSN != 5 { + t.Fatalf("recoveredLSN should be 5, got %d", recoveredLSN) + } + + // I2: all durable data recovered from checkpoint + WAL replay. 
+ for i := uint64(1); i <= 5; i++ { + block := 10 + i + expected := i * 100 + if got := s.LiveExtent[block]; got != expected { + t.Fatalf("I2: block %d: expected %d, got %d", block, expected, got) + } + } +} + +func TestI2_UnackedData_LostAfterCrash(t *testing.T) { + s := NewStorage() + + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + + // Only fsync 1-3. Entries 4-5 are NOT durable. + s.AdvanceFlush(3) + s.ApplyToExtent(5) // should clamp to 3 + s.AdvanceCheckpoint(3) + + if s.ExtentAppliedLSN != 3 { + t.Fatalf("ApplyToExtent should clamp to WALDurableLSN=3, got %d", s.ExtentAppliedLSN) + } + + s.Crash() + s.Restart() + + // Blocks 4,5 must NOT be in recovered extent. + if val, ok := s.LiveExtent[4]; ok && val != 0 { + t.Fatalf("I2 VIOLATED: block 4=%d survived crash — unfsynced data", val) + } + if val, ok := s.LiveExtent[5]; ok && val != 0 { + t.Fatalf("I2 VIOLATED: block 5=%d survived crash — unfsynced data", val) + } + + // Blocks 1-3 must be there. + for i := uint64(1); i <= 3; i++ { + if got := s.LiveExtent[i]; got != i*10 { + t.Fatalf("block %d: expected %d, got %d", i, i*10, got) + } + } +} + +// --- Invariant I3: CatchUp converges or escalates --- + +func TestI3_CatchUpConvergesOrEscalates(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + // Commit initial entry. + c.CommitWrite(1) + c.Tick() + c.Tick() + + // Disconnect replica and write more. + c.Nodes["r"].Running = false + for i := uint64(2); i <= 10; i++ { + c.CommitWrite(i) + c.Tick() + } + + // Reconnect. + c.Nodes["r"].Running = true + c.Nodes["r"].ReplicaState = NodeStateLagging + + // Catch-up with escalation. + converged := c.CatchUpWithEscalation("r", 3) + + // I3: must resolve — either converged or escalated to NeedsRebuild. 
+ state := c.Nodes["r"].ReplicaState + if !converged && state != NodeStateNeedsRebuild { + t.Fatalf("I3 VIOLATED: catchup did not converge and state=%s (not NeedsRebuild)", state) + } +} + +// --- Invariant I4: Promoted replica has committed prefix --- + +func TestI4_PromotedReplica_HasCommittedPrefix(t *testing.T) { + c := NewCluster(CommitSyncAll, "p", "r") + + for i := uint64(1); i <= 5; i++ { + c.CommitWrite(i) + } + c.Tick() + c.Tick() + c.Tick() + + committedLSN := c.Coordinator.CommittedLSN + if committedLSN < 5 { + t.Fatalf("expected CommittedLSN>=5, got %d", committedLSN) + } + + // Promote replica. + if err := c.Promote("r"); err != nil { + t.Fatalf("promote: %v", err) + } + + // I4: new primary must have recoverable committed prefix. + newPrimary := c.Nodes["r"] + recoverableLSN := newPrimary.Storage.RecoverableLSN() + if recoverableLSN < committedLSN { + t.Fatalf("I4 VIOLATED: promoted recoverableLSN=%d < committedLSN=%d", + recoverableLSN, committedLSN) + } + + // Verify data matches reference. + refState := c.Reference.StateAt(committedLSN) + recState := newPrimary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := recState[block]; got != expected { + t.Fatalf("I4 VIOLATED: block %d: ref=%d got=%d", block, expected, got) + } + } +} + +// --- Direct test: checkpoint must not leak applied-but-uncheckpointed state --- + +func TestI2_CheckpointDoesNotLeakAppliedState(t *testing.T) { + s := NewStorage() + + // Write 5 entries, all durable. + for i := uint64(1); i <= 5; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(5) + + // Flusher applies all 5 to LiveExtent. + s.ApplyToExtent(5) + + // But checkpoint only at LSN 2. + s.AdvanceCheckpoint(2) + + // CheckpointExtent must contain ONLY blocks 1-2, not 3-5. 
+ for i := uint64(3); i <= 5; i++ { + if val, ok := s.CheckpointExtent[i]; ok && val != 0 { + t.Fatalf("CHECKPOINT LEAK: block %d=%d in checkpoint but CheckpointLSN=2", i, val) + } + } + // Blocks 1-2 must be in checkpoint. + for i := uint64(1); i <= 2; i++ { + expected := i * 10 + if got := s.CheckpointExtent[i]; got != expected { + t.Fatalf("block %d: checkpoint should have %d, got %d", i, expected, got) + } + } + + // Now crash: LiveExtent lost, entries 3-5 only in WAL. + s.Crash() + recoveredLSN := s.Restart() + + if recoveredLSN != 5 { + t.Fatalf("recoveredLSN should be 5, got %d", recoveredLSN) + } + + // All 5 blocks must be recovered: 1-2 from checkpoint, 3-5 from WAL replay. + for i := uint64(1); i <= 5; i++ { + expected := i * 10 + if got := s.LiveExtent[i]; got != expected { + t.Fatalf("block %d: expected %d after crash+restart, got %d", i, expected, got) + } + } +} + +// --- A7: Historical state before checkpoint is not fakeable --- + +func TestA7_HistoricalState_NotReconstructableAfterGC(t *testing.T) { + s := NewStorage() + + // Write 10 entries, all durable. + for i := uint64(1); i <= 10; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(10) + s.ApplyToExtent(10) + + // Checkpoint at LSN 7. + s.AdvanceCheckpoint(7) + + // GC WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range s.WAL { + if w.LSN > s.CheckpointLSN { + retained = append(retained, w) + } + } + s.WAL = retained + + // Can reconstruct at LSN 7 (checkpoint covers it). + if !s.CanReconstructAt(7) { + t.Fatal("should be reconstructable at checkpoint LSN") + } + + // Can reconstruct at LSN 10 (checkpoint + WAL 8-10). + if !s.CanReconstructAt(10) { + t.Fatal("should be reconstructable at LSN 10 (checkpoint + WAL)") + } + + // CANNOT accurately reconstruct at LSN 3 (WAL 1-6 has been GC'd). + // The state at LSN 3 required WAL entries 1-3 which are gone. 
+ if s.CanReconstructAt(3) { + t.Fatal("A7: should NOT be reconstructable at LSN 3 after WAL GC — history is lost") + } + + // StateAt(3) returns checkpoint state (best-effort approximation, not exact). + // This is fine for display but must NOT be treated as authoritative. + state3 := s.StateAt(3) + // The returned state includes blocks 1-7 (from checkpoint), which is MORE + // than what was actually committed at LSN 3. This is the "current extent + // cannot fake old history" problem from A7. + if len(state3) == 3 { + t.Fatal("StateAt(3) after GC should return checkpoint state (7 blocks), not exact 3-block state") + } +} + +// --- Invariant I5: Checkpoint GC preserves recovery proof --- + +func TestI5_CheckpointGC_PreservesAckedBoundary(t *testing.T) { + s := NewStorage() + + for i := uint64(1); i <= 10; i++ { + s.AppendWrite(Write{Block: i, Value: i * 10, LSN: i}) + } + s.AdvanceFlush(10) + s.ApplyToExtent(7) + s.AdvanceCheckpoint(7) + + // GC: remove WAL entries before checkpoint. + retained := make([]Write, 0) + for _, w := range s.WAL { + if w.LSN > s.CheckpointLSN { + retained = append(retained, w) + } + } + s.WAL = retained + + // Crash + restart. + s.Crash() + recoveredLSN := s.Restart() + + if recoveredLSN != 10 { + t.Fatalf("I5: recoveredLSN should be 10, got %d", recoveredLSN) + } + + // All 10 blocks recoverable: 1-7 from checkpoint, 8-10 from WAL. + for i := uint64(1); i <= 10; i++ { + expected := i * 10 + if got := s.LiveExtent[i]; got != expected { + t.Fatalf("I5 VIOLATED: block %d: expected %d, got %d", i, expected, got) + } + } +} diff --git a/sw-block/prototype/distsim/predicates.go b/sw-block/prototype/distsim/predicates.go new file mode 100644 index 000000000..9e6982a24 --- /dev/null +++ b/sw-block/prototype/distsim/predicates.go @@ -0,0 +1,160 @@ +package distsim + +import "fmt" + +// DangerPredicate checks for a protocol-violating or dangerous state. +// Returns (violated bool, detail string). 
+type DangerPredicate func(c *Cluster) (bool, string) + +// PredicateAckedFlushLost checks if any committed (ACK'd) write has become +// unrecoverable on ANY node that is supposed to have it. +// This is the most dangerous protocol violation: data loss after ACK. +func PredicateAckedFlushLost(c *Cluster) (bool, string) { + committedLSN := c.Coordinator.CommittedLSN + if committedLSN == 0 { + return false, "" + } + + refState := c.Reference.StateAt(committedLSN) + + // Check primary. + primary := c.Primary() + if primary != nil && primary.Running { + recLSN := primary.Storage.RecoverableLSN() + if recLSN < committedLSN { + return true, fmt.Sprintf("primary %s: recoverableLSN=%d < committedLSN=%d", + primary.ID, recLSN, committedLSN) + } + // Verify committed state correctness using StateAt (not LiveExtent). + // LiveExtent may contain uncommitted-but-durable writes beyond committedLSN. + // Only check if we can reconstruct the exact committed state. + if primary.Storage.CanReconstructAt(committedLSN) { + nodeState := primary.Storage.StateAt(committedLSN) + for block, expected := range refState { + if got := nodeState[block]; got != expected { + return true, fmt.Sprintf("primary %s: block %d = %d, reference = %d at committedLSN=%d", + primary.ID, block, got, expected, committedLSN) + } + } + } + } + + // Check replicas that should have committed data (InSync replicas). + for _, node := range c.Nodes { + if node.ID == c.Coordinator.PrimaryID { + continue + } + if !node.Running || node.ReplicaState != NodeStateInSync { + continue + } + recLSN := node.Storage.RecoverableLSN() + if recLSN < committedLSN { + return true, fmt.Sprintf("InSync replica %s: recoverableLSN=%d < committedLSN=%d", + node.ID, recLSN, committedLSN) + } + } + + return false, "" +} + +// PredicateVisibleUnrecoverableState checks if any running node has extent +// state that would NOT survive a crash+restart. This detects ghost visible +// state — data that is readable now but would be lost on crash. 
+func PredicateVisibleUnrecoverableState(c *Cluster) (bool, string) { + for _, node := range c.Nodes { + if !node.Running || node.Storage.LiveExtent == nil { + continue + } + // Simulate what would happen on crash+restart. + recoverableLSN := node.Storage.RecoverableLSN() + + // Check each block in LiveExtent: is its value backed by + // a write at LSN <= recoverableLSN? + for block, value := range node.Storage.LiveExtent { + // Find which LSN wrote this value. + writtenAtLSN := uint64(0) + for _, w := range node.Storage.WAL { + if w.Block == block && w.Value == value { + writtenAtLSN = w.LSN + } + } + if writtenAtLSN > recoverableLSN { + return true, fmt.Sprintf("node %s: block %d has value %d (written at LSN %d) but recoverableLSN=%d — ghost state", + node.ID, block, value, writtenAtLSN, recoverableLSN) + } + } + } + return false, "" +} + +// PredicateCatchUpLivelockOrMissingEscalation checks if any replica is stuck +// in CatchingUp without making progress and without being escalated to +// NeedsRebuild. Also checks if a replica needs rebuild but hasn't been +// escalated. +func PredicateCatchUpLivelockOrMissingEscalation(c *Cluster) (bool, string) { + for _, node := range c.Nodes { + if !node.Running { + continue + } + if node.ReplicaState == NodeStateCatchingUp { + // A node in CatchingUp is suspicious if it has been there for + // many ticks without approaching the target. We check if its + // FlushedLSN is far behind the primary's head. + primary := c.Primary() + if primary == nil { + continue + } + primaryHead := primary.Storage.WALDurableLSN + replicaFlushed := node.Storage.FlushedLSN + gap := primaryHead - replicaFlushed + + // If the gap is larger than what the WAL can reasonably hold + // and the node hasn't been escalated, that's a livelock risk. + // Use a simple heuristic: if gap > 2x what we've seen committed, flag it. 
+ if gap > c.Coordinator.CommittedLSN*2 && c.Coordinator.CommittedLSN > 5 { + return true, fmt.Sprintf("node %s: CatchingUp with gap=%d (primary head=%d, replica flushed=%d) — potential livelock", + node.ID, gap, primaryHead, replicaFlushed) + } + } + + // Check if a node is Lagging for a long time without being moved to + // CatchingUp or NeedsRebuild. + // Note: Lagging is a transient state that the control plane should resolve. + // In adversarial random tests without explicit recovery triggers, a node + // staying Lagging is expected. We only flag truly excessive lag (> 3x committed) + // as potential livelock — anything smaller is normal recovery latency. + if node.ReplicaState == NodeStateLagging { + primary := c.Primary() + if primary != nil { + gap := primary.Storage.WALDurableLSN - node.Storage.FlushedLSN + if gap > c.Coordinator.CommittedLSN*3 && c.Coordinator.CommittedLSN > 10 { + return true, fmt.Sprintf("node %s: Lagging with gap=%d without escalation to CatchingUp or NeedsRebuild", + node.ID, gap) + } + } + } + } + return false, "" +} + +// AllDangerPredicates returns the standard set of danger predicates. +func AllDangerPredicates() map[string]DangerPredicate { + return map[string]DangerPredicate{ + "acked_flush_lost": PredicateAckedFlushLost, + "visible_unrecoverable": PredicateVisibleUnrecoverableState, + "catchup_livelock_or_no_esc": PredicateCatchUpLivelockOrMissingEscalation, + } +} + +// CheckAllPredicates runs all danger predicates against a cluster state. +// Returns a map of violated predicate names → detail messages. 
+func CheckAllPredicates(c *Cluster) map[string]string { + violations := map[string]string{} + for name, pred := range AllDangerPredicates() { + violated, detail := pred(c) + if violated { + violations[name] = detail + } + } + return violations +} diff --git a/sw-block/prototype/distsim/simulator.go b/sw-block/prototype/distsim/simulator.go index 24da8d3f2..82d4b6389 100644 --- a/sw-block/prototype/distsim/simulator.go +++ b/sw-block/prototype/distsim/simulator.go @@ -300,8 +300,11 @@ func (s *Simulator) execute(e Event) { case EvFlusherTick: if node != nil && node.Running { - node.Storage.AdvanceCheckpoint(node.Storage.FlushedLSN) - s.record(e, fmt.Sprintf("flusher tick %s checkpoint=%d", e.NodeID, node.Storage.CheckpointLSN)) + // Phase 4.5: flusher first materializes WAL to extent, then checkpoints. + node.Storage.ApplyToExtent(node.Storage.WALDurableLSN) + node.Storage.AdvanceCheckpoint(node.Storage.WALDurableLSN) + s.record(e, fmt.Sprintf("flusher tick %s applied=%d checkpoint=%d", + e.NodeID, node.Storage.ExtentAppliedLSN, node.Storage.CheckpointLSN)) } case EvPromote: diff --git a/sw-block/prototype/distsim/storage.go b/sw-block/prototype/distsim/storage.go index b5f52153b..9fdca192b 100644 --- a/sw-block/prototype/distsim/storage.go +++ b/sw-block/prototype/distsim/storage.go @@ -8,23 +8,42 @@ type SnapshotState struct { State map[uint64]uint64 } +// Storage models the per-node storage state with explicit crash-consistency +// boundaries. Phase 4.5: split into 5 distinct LSN boundaries. +// +// State progression: +// Write arrives → ReceivedLSN (not yet durable) +// WAL fsync → WALDurableLSN (survives crash) +// Flusher → ExtentAppliedLSN (materialized to live extent, volatile) +// Checkpoint → CheckpointLSN (durable base image) +// +// After crash + restart: +// RecoverableState = CheckpointExtent + WAL[CheckpointLSN+1 .. 
WALDurableLSN] type Storage struct { - WAL []Write - Extent map[uint64]uint64 - ReceivedLSN uint64 - FlushedLSN uint64 - CheckpointLSN uint64 - Snapshots map[string]SnapshotState - BaseSnapshot *SnapshotState + WAL []Write + LiveExtent map[uint64]uint64 // runtime view (volatile — lost on crash) + CheckpointExtent map[uint64]uint64 // crash-safe base image (survives crash) + ReceivedLSN uint64 // highest LSN received (may not be durable) + WALDurableLSN uint64 // highest LSN guaranteed to survive crash (= FlushedLSN) + ExtentAppliedLSN uint64 // highest LSN materialized into LiveExtent + CheckpointLSN uint64 // highest LSN in the durable base image + Snapshots map[string]SnapshotState + BaseSnapshot *SnapshotState + + // Backward compat alias. + FlushedLSN uint64 // = WALDurableLSN } func NewStorage() *Storage { return &Storage{ - Extent: map[uint64]uint64{}, - Snapshots: map[string]SnapshotState{}, + LiveExtent: map[uint64]uint64{}, + CheckpointExtent: map[uint64]uint64{}, + Snapshots: map[string]SnapshotState{}, } } +// AppendWrite adds a WAL entry. Does NOT update LiveExtent — that's the flusher's job. +// Does NOT advance WALDurableLSN — that requires explicit AdvanceFlush (WAL fsync). func (s *Storage) AppendWrite(w Write) { // Insert in LSN order (handles out-of-order delivery from jitter). inserted := false @@ -41,43 +60,162 @@ func (s *Storage) AppendWrite(w Write) { if !inserted { s.WAL = append(s.WAL, w) } - s.Extent[w.Block] = w.Value if w.LSN > s.ReceivedLSN { s.ReceivedLSN = w.LSN } } +// AdvanceFlush simulates WAL fdatasync completing. Entries up to lsn are now +// durable and will survive crash. This is the authoritative progress for sync_all. 
func (s *Storage) AdvanceFlush(lsn uint64) { if lsn > s.ReceivedLSN { lsn = s.ReceivedLSN } - if lsn > s.FlushedLSN { - s.FlushedLSN = lsn + if lsn > s.WALDurableLSN { + s.WALDurableLSN = lsn + s.FlushedLSN = lsn // backward compat alias + } +} + +// ApplyToExtent simulates the flusher materializing WAL entries into the live extent. +// Entries from (ExtentAppliedLSN, targetLSN] are applied. This is a volatile operation — +// LiveExtent is lost on crash. +func (s *Storage) ApplyToExtent(targetLSN uint64) { + if targetLSN > s.WALDurableLSN { + targetLSN = s.WALDurableLSN // can't materialize un-durable entries + } + for _, w := range s.WAL { + if w.LSN <= s.ExtentAppliedLSN { + continue + } + if w.LSN > targetLSN { + break + } + s.LiveExtent[w.Block] = w.Value + } + if targetLSN > s.ExtentAppliedLSN { + s.ExtentAppliedLSN = targetLSN } } +// AdvanceCheckpoint creates a crash-safe base image at exactly the given LSN. +// The checkpoint image contains state ONLY through lsn — not the full LiveExtent. +// This is critical: LiveExtent may contain applied entries beyond lsn that are +// NOT part of the checkpoint and must NOT survive a crash. func (s *Storage) AdvanceCheckpoint(lsn uint64) { - if lsn > s.FlushedLSN { - lsn = s.FlushedLSN + if lsn > s.ExtentAppliedLSN { + lsn = s.ExtentAppliedLSN } if lsn > s.CheckpointLSN { s.CheckpointLSN = lsn + // Build checkpoint image from base + WAL replay through exactly lsn. + // Do NOT clone LiveExtent — it may contain entries beyond checkpoint. + s.CheckpointExtent = s.StateAt(lsn) + // Set BaseSnapshot so StateAt() can use it after WAL GC. + s.BaseSnapshot = &SnapshotState{ + ID: "checkpoint", + LSN: lsn, + State: cloneMap(s.CheckpointExtent), + } + } +} + +// Crash simulates a node crash: LiveExtent is lost, only CheckpointExtent +// and durable WAL entries survive. 
+func (s *Storage) Crash() { + s.LiveExtent = nil + s.ExtentAppliedLSN = 0 + // ReceivedLSN drops to WALDurableLSN (un-fsynced entries lost) + s.ReceivedLSN = s.WALDurableLSN + // Remove non-durable WAL entries + durable := make([]Write, 0, len(s.WAL)) + for _, w := range s.WAL { + if w.LSN <= s.WALDurableLSN { + durable = append(durable, w) + } + } + s.WAL = durable +} + +// Restart recovers state from CheckpointExtent + durable WAL replay. +// Sets BaseSnapshot from checkpoint so StateAt() works after WAL GC. +// Returns the RecoverableLSN (highest LSN in the recovered view). +func (s *Storage) Restart() uint64 { + // Start from checkpoint base image. + s.LiveExtent = cloneMap(s.CheckpointExtent) + // Set BaseSnapshot so StateAt() can reconstruct from checkpoint after WAL GC. + s.BaseSnapshot = &SnapshotState{ + ID: "checkpoint", + LSN: s.CheckpointLSN, + State: cloneMap(s.CheckpointExtent), + } + // Replay durable WAL entries past checkpoint. + for _, w := range s.WAL { + if w.LSN <= s.CheckpointLSN { + continue + } + if w.LSN > s.WALDurableLSN { + break + } + s.LiveExtent[w.Block] = w.Value + } + s.ExtentAppliedLSN = s.WALDurableLSN + return s.WALDurableLSN +} + +// RecoverableLSN returns the highest LSN that would be recoverable after +// crash + restart. This is a replayability proof, not just a watermark: +// - CheckpointExtent covers [0, CheckpointLSN] +// - WAL entries (CheckpointLSN, WALDurableLSN] must exist contiguously +// - If any gap exists in the WAL between CheckpointLSN and WALDurableLSN, +// recovery would be incomplete +// +// Returns the highest contiguously recoverable LSN from checkpoint + WAL. +func (s *Storage) RecoverableLSN() uint64 { + // Start from checkpoint — everything through CheckpointLSN is safe. + recoverable := s.CheckpointLSN + + // Walk durable WAL entries past checkpoint and verify contiguity. 
+ for _, w := range s.WAL { + if w.LSN <= s.CheckpointLSN { + continue // already covered by checkpoint + } + if w.LSN > s.WALDurableLSN { + break // not durable + } + if w.LSN == recoverable+1 { + recoverable = w.LSN // contiguous — extend + } else { + break // gap — stop here + } } + return recoverable } +// StateAt computes the block state by replaying WAL entries up to the given LSN. +// Used for correctness assertions against the reference model. +// +// Phase 4.5: for lsn < CheckpointLSN (after WAL GC), the WAL entries needed +// to reconstruct historical state may no longer exist. In that case, we return +// the checkpoint state (best available), but callers should use +// CanReconstructAt(lsn) to check if the result is authoritative. func (s *Storage) StateAt(lsn uint64) map[uint64]uint64 { state := map[uint64]uint64{} + usedSnapshot := false if s.BaseSnapshot != nil { if s.BaseSnapshot.LSN > lsn { - return cloneMap(s.BaseSnapshot.State) + // Snapshot is NEWER than requested — cannot use it. + // Fall through to WAL-only replay. + } else { + state = cloneMap(s.BaseSnapshot.State) + usedSnapshot = true } - state = cloneMap(s.BaseSnapshot.State) } for _, w := range s.WAL { if w.LSN > lsn { break } - if s.BaseSnapshot != nil && w.LSN <= s.BaseSnapshot.LSN { + if usedSnapshot && w.LSN <= s.BaseSnapshot.LSN { continue } state[w.Block] = w.Value @@ -85,6 +223,62 @@ func (s *Storage) StateAt(lsn uint64) map[uint64]uint64 { return state } +// CanReconstructAt returns true if the storage has enough information to +// accurately reconstruct state at the given LSN. False means the WAL entries +// needed for historical reconstruction have been GC'd and StateAt(lsn) may +// return an approximation (checkpoint state) rather than exact history. +// +// A7 (Historical Data Correctness): this should be checked before trusting +// StateAt() results for old LSNs. Current extent cannot fake old history. 
+func (s *Storage) CanReconstructAt(lsn uint64) bool { + if lsn == 0 { + return true // empty state is always reconstructable + } + + // To reconstruct state at exactly lsn, we need a contiguous chain of + // evidence from LSN 0 (or a snapshot taken AT lsn) through lsn. + // + // A checkpoint at LSN C contains state through C. If lsn < C, the + // checkpoint has MORE data than existed at lsn — it cannot reconstruct + // the exact historical state at lsn. We would need WAL entries [1, lsn] + // to rebuild from scratch, which are gone after GC. + // + // A checkpoint at LSN C where C == lsn is exact. + // A checkpoint at LSN C where C > lsn cannot help with exact lsn state. + + // Check if any snapshot was taken exactly at this LSN. + for _, snap := range s.Snapshots { + if snap.LSN == lsn { + return true + } + } + + // Find the best base: a snapshot/checkpoint at or before lsn. + baseLSN := uint64(0) + if s.BaseSnapshot != nil && s.BaseSnapshot.LSN <= lsn { + baseLSN = s.BaseSnapshot.LSN + } + + // If baseLSN > 0, we have a snapshot that provides state through baseLSN. + // We need contiguous WAL from baseLSN+1 through lsn. + // If baseLSN == 0, we need contiguous WAL from 1 through lsn. 
+ + expected := baseLSN + 1 + for _, w := range s.WAL { + if w.LSN <= baseLSN { + continue + } + if w.LSN > lsn { + break + } + if w.LSN != expected { + return false // gap — history is incomplete + } + expected = w.LSN + 1 + } + return expected > lsn +} + func (s *Storage) TakeSnapshot(id string, lsn uint64) SnapshotState { snap := SnapshotState{ ID: id, @@ -96,10 +290,13 @@ func (s *Storage) TakeSnapshot(id string, lsn uint64) SnapshotState { } func (s *Storage) LoadSnapshot(snap SnapshotState) { - s.Extent = cloneMap(snap.State) + s.LiveExtent = cloneMap(snap.State) + s.CheckpointExtent = cloneMap(snap.State) + s.WALDurableLSN = snap.LSN s.FlushedLSN = snap.LSN s.ReceivedLSN = snap.LSN s.CheckpointLSN = snap.LSN + s.ExtentAppliedLSN = snap.LSN s.BaseSnapshot = &SnapshotState{ ID: snap.ID, LSN: snap.LSN, @@ -111,7 +308,14 @@ func (s *Storage) LoadSnapshot(snap SnapshotState) { func (s *Storage) ReplaceWAL(writes []Write) { s.WAL = append([]Write(nil), writes...) sort.Slice(s.WAL, func(i, j int) bool { return s.WAL[i].LSN < s.WAL[j].LSN }) - s.Extent = s.StateAt(s.ReceivedLSN) + // Recompute LiveExtent from base + WAL + s.LiveExtent = s.StateAt(s.ReceivedLSN) +} + +// Extent returns the current live extent for backward compatibility. +// Callers should migrate to LiveExtent. +func (s *Storage) Extent() map[uint64]uint64 { + return s.LiveExtent } func writesInRange(writes []Write, startExclusive, endInclusive uint64) []Write { diff --git a/weed/server/master_block_failover.go b/weed/server/master_block_failover.go index f3eb35bbb..8a97079b6 100644 --- a/weed/server/master_block_failover.go +++ b/weed/server/master_block_failover.go @@ -10,10 +10,12 @@ import ( // pendingRebuild records a volume that needs rebuild when a dead VS reconnects. 
type pendingRebuild struct { - VolumeName string - OldPath string // path on dead server - NewPrimary string // promoted replica server - Epoch uint64 + VolumeName string + OldPath string // path on dead server + NewPrimary string // promoted replica server + Epoch uint64 + ReplicaDataAddr string // CP13-8: saved from before death for catch-up-first recovery + ReplicaCtrlAddr string // CP13-8: saved from before death for catch-up-first recovery } // blockFailoverState holds failover and rebuild state on the master. @@ -88,6 +90,8 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) { ri := entry.ReplicaByServer(deadServer) if ri != nil { replicaPath := ri.Path + replicaDataAddr := ri.DataAddr // CP13-8: save before removal + replicaCtrlAddr := ri.CtrlAddr // Remove dead replica from registry. if err := ms.blockRegistry.RemoveReplica(entry.Name, deadServer); err != nil { glog.Warningf("failover: RemoveReplica %q on %s: %v", entry.Name, deadServer, err) @@ -95,10 +99,12 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) { } // Record pending rebuild for when dead server reconnects. ms.recordPendingRebuild(deadServer, pendingRebuild{ - VolumeName: entry.Name, - OldPath: replicaPath, - NewPrimary: entry.VolumeServer, // current primary (unchanged) - Epoch: entry.Epoch, + VolumeName: entry.Name, + OldPath: replicaPath, + NewPrimary: entry.VolumeServer, + Epoch: entry.Epoch, + ReplicaDataAddr: replicaDataAddr, + ReplicaCtrlAddr: replicaCtrlAddr, }) glog.V(0).Infof("failover: removed dead replica %s for %q, pending rebuild", deadServer, entry.Name) @@ -238,20 +244,73 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) { continue } - // Update registry: reconnected server becomes a replica (via AddReplica for RF≥2 support). + // CP13-8: Use replica addresses saved before death for catch-up-first recovery. 
+ // These are deterministic (derived from volume path hash in ReplicationPorts), + // so they should be the same after VS restart. If the VS somehow gets different + // ports (e.g., port conflict), the catch-up attempt will fail at the TCP level + // and fall through to the shipper's NeedsRebuild → master rebuild path. + // This is an optimization, not a source of truth — the master remains the + // authority for topology/assignment changes. + dataAddr := rb.ReplicaDataAddr + ctrlAddr := rb.ReplicaCtrlAddr + + // Update registry: reconnected server becomes a replica. ms.blockRegistry.AddReplica(rb.VolumeName, ReplicaInfo{ - Server: reconnectedServer, - Path: rb.OldPath, + Server: reconnectedServer, + Path: rb.OldPath, + DataAddr: dataAddr, + CtrlAddr: ctrlAddr, }) - // T4: Warn if RebuildListenAddr is empty (new primary hasn't heartbeated yet). + // CP13-8: Try catch-up first (Replica assignment), fall back to rebuild. + // If the replica can catch up from the primary's retained WAL, this is + // much faster than a full rebuild. The shipper's reconnect handshake + // (CP13-5) determines whether catch-up or rebuild is actually needed. + // If catch-up fails, the shipper marks NeedsRebuild, and the master + // sends a Rebuilding assignment on the next heartbeat cycle. + if dataAddr != "" { + leaseTTLMs := blockvol.LeaseTTLToWire(30 * time.Second) + // Send Replica assignment to the reconnected server. + ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{ + Path: rb.OldPath, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RoleReplica), + LeaseTtlMs: leaseTTLMs, + ReplicaDataAddr: dataAddr, + ReplicaCtrlAddr: ctrlAddr, + }) + // Also re-send Primary assignment so the primary gets fresh replica addresses. + primaryAssignment := blockvol.BlockVolumeAssignment{ + Path: entry.Path, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RolePrimary), + LeaseTtlMs: leaseTTLMs, + } + // Include all replica addresses. 
+ for _, ri := range entry.Replicas { + primaryAssignment.ReplicaAddrs = append(primaryAssignment.ReplicaAddrs, blockvol.ReplicaAddr{ + DataAddr: ri.DataAddr, + CtrlAddr: ri.CtrlAddr, + }) + } + if len(entry.Replicas) == 1 { + primaryAssignment.ReplicaDataAddr = entry.Replicas[0].DataAddr + primaryAssignment.ReplicaCtrlAddr = entry.Replicas[0].CtrlAddr + } + ms.blockAssignmentQueue.Enqueue(entry.VolumeServer, primaryAssignment) + + glog.V(0).Infof("recover: enqueued catch-up (Replica) for %q on %s (epoch=%d, data=%s) + Primary refresh on %s", + rb.VolumeName, reconnectedServer, entry.Epoch, dataAddr, entry.VolumeServer) + continue + } + + // Fallback: no known addresses — use rebuild path. rebuildAddr := entry.RebuildListenAddr if rebuildAddr == "" { glog.Warningf("rebuild: %q RebuildListenAddr is empty (new primary %s may not have heartbeated yet), "+ "queuing rebuild anyway — VS should retry on empty addr", rb.VolumeName, entry.VolumeServer) } - // Enqueue rebuild assignment for the reconnected server. ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{ Path: rb.OldPath, Epoch: entry.Epoch, @@ -268,6 +327,39 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) { // reevaluateOrphanedPrimaries checks if the given server is a replica for any // volumes whose primary is dead (not block-capable). If so, promotes the best // available replica — but only after the old primary's lease has expired, to +// refreshPrimaryForAddrChange sends a fresh Primary assignment when a replica's +// receiver address changed (e.g., restart with port conflict). This ensures the +// primary's shipper gets the new address without waiting for the next heartbeat cycle. 
+func (ms *MasterServer) refreshPrimaryForAddrChange(ac ReplicaAddrChange) { + entry, ok := ms.blockRegistry.Lookup(ac.VolumeName) + if !ok { + return + } + leaseTTLMs := blockvol.LeaseTTLToWire(30 * time.Second) + assignment := blockvol.BlockVolumeAssignment{ + Path: entry.Path, + Epoch: entry.Epoch, + Role: blockvol.RoleToWire(blockvol.RolePrimary), + LeaseTtlMs: leaseTTLMs, + } + for _, ri := range entry.Replicas { + assignment.ReplicaAddrs = append(assignment.ReplicaAddrs, blockvol.ReplicaAddr{ + DataAddr: ri.DataAddr, + CtrlAddr: ri.CtrlAddr, + }) + } + if len(entry.Replicas) == 1 { + assignment.ReplicaDataAddr = entry.Replicas[0].DataAddr + assignment.ReplicaCtrlAddr = entry.Replicas[0].CtrlAddr + } + // Use current registry primary (not stale ac.PrimaryServer) in case + // failover happened between address-change detection and this refresh. + currentPrimary := entry.VolumeServer + ms.blockAssignmentQueue.Enqueue(currentPrimary, assignment) + glog.V(0).Infof("recover: replica addr changed for %q (data: %s→%s, ctrl: %s→%s), refreshed Primary on %s", + ac.VolumeName, ac.OldDataAddr, ac.NewDataAddr, ac.OldCtrlAddr, ac.NewCtrlAddr, currentPrimary) +} + // maintain the same split-brain protection as failoverBlockVolumes(). // This fixes B-06 (orphaned primary after replica re-register) // and partially B-08 (fast reconnect skips failover window). diff --git a/weed/server/master_block_registry.go b/weed/server/master_block_registry.go index 6289dac53..687d581cd 100644 --- a/weed/server/master_block_registry.go +++ b/weed/server/master_block_registry.go @@ -353,7 +353,21 @@ func (r *BlockVolumeRegistry) ListByServer(server string) []BlockVolumeEntry { // Called on the first heartbeat from a volume server. // Marks reported volumes as Active, removes entries for this server // that are not reported (stale). 
-func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master_pb.BlockVolumeInfoMessage, nvmeAddr string) { +// ReplicaAddrChange records a replica whose advertised address changed, +// requiring a Primary assignment refresh so the shipper gets the new address. +// Detected only in the full heartbeat path (UpdateFullHeartbeat). Delta +// heartbeats do not carry replica addresses and cannot trigger this. +type ReplicaAddrChange struct { + VolumeName string + PrimaryServer string + OldDataAddr string + OldCtrlAddr string + NewDataAddr string + NewCtrlAddr string +} + +func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master_pb.BlockVolumeInfoMessage, nvmeAddr string) []ReplicaAddrChange { + var addrChanges []ReplicaAddrChange r.mu.Lock() defer r.mu.Unlock() @@ -495,6 +509,31 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master } else { existing.Replicas[i].WALLag = 0 } + // CP13-8: detect address change on replica restart. + // If either the data or control address changed, the primary's + // shipper has a stale endpoint. Queue a Primary refresh. 
+ if info.ReplicaDataAddr != "" || info.ReplicaCtrlAddr != "" { + oldData := existing.Replicas[i].DataAddr + oldCtrl := existing.Replicas[i].CtrlAddr + dataChanged := info.ReplicaDataAddr != "" && oldData != "" && oldData != info.ReplicaDataAddr + ctrlChanged := info.ReplicaCtrlAddr != "" && oldCtrl != "" && oldCtrl != info.ReplicaCtrlAddr + if dataChanged || ctrlChanged { + addrChanges = append(addrChanges, ReplicaAddrChange{ + VolumeName: existingName, + PrimaryServer: existing.VolumeServer, + OldDataAddr: oldData, + OldCtrlAddr: oldCtrl, + NewDataAddr: info.ReplicaDataAddr, + NewCtrlAddr: info.ReplicaCtrlAddr, + }) + } + if info.ReplicaDataAddr != "" { + existing.Replicas[i].DataAddr = info.ReplicaDataAddr + } + if info.ReplicaCtrlAddr != "" { + existing.Replicas[i].CtrlAddr = info.ReplicaCtrlAddr + } + } break } } @@ -511,6 +550,14 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master if name == "" { continue } + // Skip auto-register if a create is in progress for this volume. + // Without this gate, the replica VS heartbeat can race ahead of + // CreateBlockVolume.Register and create a bare entry that lacks + // replica info, causing the real Register to hit "already registered" + // and fall back to the incomplete auto-registered entry. + if r.IsInflight(name) { + continue + } existing, dup := r.volumes[name] if !dup { entry := &BlockVolumeEntry{ @@ -545,6 +592,7 @@ func (r *BlockVolumeRegistry) UpdateFullHeartbeat(server string, infos []*master } } } + return addrChanges } // reconcileOnRestart handles the case where a second server reports a volume @@ -769,6 +817,12 @@ func (r *BlockVolumeRegistry) ReleaseInflight(name string) { r.inflight.Delete(name) } +// IsInflight returns true if a create is in progress for the given volume name. +func (r *BlockVolumeRegistry) IsInflight(name string) bool { + _, ok := r.inflight.Load(name) + return ok +} + // countForServer returns the number of volumes on the given server. 
// Caller must hold at least RLock. func (r *BlockVolumeRegistry) countForServer(server string) int { diff --git a/weed/server/master_block_registry_test.go b/weed/server/master_block_registry_test.go index cd3ed34c4..f0ecb7e23 100644 --- a/weed/server/master_block_registry_test.go +++ b/weed/server/master_block_registry_test.go @@ -1900,3 +1900,63 @@ func TestUpdateEntry_NotFound(t *testing.T) { t.Fatal("expected error for nonexistent volume") } } + +// TestRegistry_InflightBlocksAutoRegister verifies that heartbeat auto-register +// is suppressed while a create is in-flight for the same volume. This prevents +// a race where the replica VS heartbeat arrives before CreateBlockVolume.Register +// completes, creating a bare entry that lacks replica info. +func TestRegistry_InflightBlocksAutoRegister(t *testing.T) { + r := NewBlockVolumeRegistry() + + // Simulate CreateBlockVolume acquiring the inflight lock. + if !r.AcquireInflight("vol1") { + t.Fatal("AcquireInflight should succeed") + } + + // Replica VS sends heartbeat reporting vol1 — while create is in-flight. + // This should be silently skipped (not auto-registered). + r.UpdateFullHeartbeat("replica-server:8080", []*master_pb.BlockVolumeInfoMessage{ + {Path: "/blocks/vol1.blk", Epoch: 1, Role: 2, VolumeSize: 1 << 30}, + }, "") + + // vol1 should NOT be in the registry (auto-register was blocked). + if _, ok := r.Lookup("vol1"); ok { + t.Fatal("vol1 should not be auto-registered while inflight lock is held") + } + + // Now simulate CreateBlockVolume completing: register with replicas. + r.Register(&BlockVolumeEntry{ + Name: "vol1", + VolumeServer: "primary-server:8080", + Path: "/blocks/vol1.blk", + SizeBytes: 1 << 30, + Epoch: 1, + Status: StatusActive, + Replicas: []ReplicaInfo{ + {Server: "replica-server:8080", Path: "/blocks/vol1.blk"}, + }, + }) + r.ReleaseInflight("vol1") + + // Entry should have the replica. 
+ entry, ok := r.Lookup("vol1") + if !ok { + t.Fatal("vol1 should exist after Register") + } + if len(entry.Replicas) != 1 { + t.Fatalf("replicas=%d, want 1", len(entry.Replicas)) + } + if entry.Replicas[0].Server != "replica-server:8080" { + t.Fatalf("replica server=%s", entry.Replicas[0].Server) + } + + // After inflight released, subsequent heartbeats should update normally. + r.UpdateFullHeartbeat("replica-server:8080", []*master_pb.BlockVolumeInfoMessage{ + {Path: "/blocks/vol1.blk", Epoch: 2, Role: 2, VolumeSize: 1 << 30, HealthScore: 0.9}, + }, "") + + entry, _ = r.Lookup("vol1") + if entry.Replicas[0].HealthScore != 0.9 { + t.Fatalf("replica health not updated after inflight released: %f", entry.Replicas[0].HealthScore) + } +} diff --git a/weed/server/master_grpc_server.go b/weed/server/master_grpc_server.go index 34c1142e2..8f88aa858 100644 --- a/weed/server/master_grpc_server.go +++ b/weed/server/master_grpc_server.go @@ -277,7 +277,12 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ // (BlockVolumeInfos on first heartbeat) or deltas (NewBlockVolumes/DeletedBlockVolumes // on subsequent heartbeats), never both in the same message. if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes { - ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos, heartbeat.BlockNvmeAddr) + addrChanges := ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos, heartbeat.BlockNvmeAddr) + // CP13-8: If a replica's receiver address changed (e.g., restart with port conflict), + // immediately refresh the primary's assignment with the new addresses. + for _, ac := range addrChanges { + ms.refreshPrimaryForAddrChange(ac) + } // T2 (B-06): After updating registry from heartbeat, check if this server // is a replica for any volume whose primary is dead. If so, promote. 
ms.reevaluateOrphanedPrimaries(dn.Url()) diff --git a/weed/server/qa_block_edge_cases_test.go b/weed/server/qa_block_edge_cases_test.go new file mode 100644 index 000000000..938ef7513 --- /dev/null +++ b/weed/server/qa_block_edge_cases_test.go @@ -0,0 +1,481 @@ +package weed_server + +import ( + "sync" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" +) + +// ============================================================ +// Edge Case Tests: RF, Promotion, Network, LSN +// +// Covers gaps identified in the testing framework review: +// 1. LSN-lagging replica skipped during promotion +// 2. Cascading double failover (RF=3, epoch chain 1→2→3) +// 3. Demotion/drain under concurrent promotion pressure +// 4. Promotion with mixed LSN + health scores +// 5. Network flap simulation (mark/unmark block capable rapidly) +// 6. RF=3 all-gate evaluation under pressure +// ============================================================ + +// --- Test 1: LSN-lagging replica skipped, fresher one promoted --- + +func TestEdge_LSNLag_StaleReplicaSkipped(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(10) + + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("stale-replica") + ms.blockRegistry.MarkBlockCapable("fresh-replica") + + entry := &BlockVolumeEntry{ + Name: "lsn-test", VolumeServer: "primary", Path: "/data/lsn-test.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), // expired + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + { + Server: "stale-replica", Path: "/data/lsn-test.blk", + HealthScore: 1.0, WALHeadLSN: 100, // lag=900, way beyond tolerance=10 + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }, + { + Server: "fresh-replica", Path: "/data/lsn-test.blk", + HealthScore: 0.9, WALHeadLSN: 995, // 
lag=5, within tolerance=10 + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }, + }, + } + ms.blockRegistry.Register(entry) + + // Kill primary. + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + // Verify: fresh-replica promoted (despite lower health score), stale skipped. + after, ok := ms.blockRegistry.Lookup("lsn-test") + if !ok { + t.Fatal("volume not found") + } + if after.VolumeServer != "fresh-replica" { + t.Fatalf("expected fresh-replica promoted, got %q (stale-replica with lag=900 should be skipped)", after.VolumeServer) + } + if after.Epoch != 2 { + t.Fatalf("epoch: got %d, want 2", after.Epoch) + } +} + +// --- Test 2: Cascading double failover (RF=3, epoch 1→2→3) --- + +func TestEdge_CascadeFailover_RF3_EpochChain(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + + entry := &BlockVolumeEntry{ + Name: "cascade-test", VolumeServer: "vs1", Path: "/data/cascade.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + ReplicaFactor: 3, + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "vs3", Path: "/r3.blk", HealthScore: 0.9, WALHeadLSN: 100, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + } + ms.blockRegistry.Register(entry) + + // Failover 1: vs1 dies → vs2 promoted (higher health). 
+ ms.blockRegistry.UnmarkBlockCapable("vs1") + ms.failoverBlockVolumes("vs1") + + after1, _ := ms.blockRegistry.Lookup("cascade-test") + if after1.VolumeServer != "vs2" { + t.Fatalf("failover 1: expected vs2, got %q", after1.VolumeServer) + } + if after1.Epoch != 2 { + t.Fatalf("failover 1: epoch got %d, want 2", after1.Epoch) + } + + // Failover 2: vs2 dies → vs3 promoted (only remaining). + // Update vs3's heartbeat and set lease expired for the new primary. + ms.blockRegistry.UpdateEntry("cascade-test", func(e *BlockVolumeEntry) { + e.LastLeaseGrant = time.Now().Add(-10 * time.Second) + for i := range e.Replicas { + if e.Replicas[i].Server == "vs3" { + e.Replicas[i].LastHeartbeat = time.Now() + } + } + }) + + ms.blockRegistry.UnmarkBlockCapable("vs2") + ms.failoverBlockVolumes("vs2") + + after2, _ := ms.blockRegistry.Lookup("cascade-test") + if after2.VolumeServer != "vs3" { + t.Fatalf("failover 2: expected vs3, got %q", after2.VolumeServer) + } + if after2.Epoch != 3 { + t.Fatalf("failover 2: epoch got %d, want 3", after2.Epoch) + } + + // No more replicas — third failover should fail silently. + ms.blockRegistry.UpdateEntry("cascade-test", func(e *BlockVolumeEntry) { + e.LastLeaseGrant = time.Now().Add(-10 * time.Second) + }) + ms.blockRegistry.UnmarkBlockCapable("vs3") + ms.failoverBlockVolumes("vs3") + + after3, _ := ms.blockRegistry.Lookup("cascade-test") + // Epoch should still be 3 — no eligible replicas. 
+ if after3.Epoch != 3 { + t.Fatalf("failover 3: epoch should stay 3, got %d", after3.Epoch) + } +} + +// --- Test 3: Concurrent failover + heartbeat + promotion (stress) --- + +func TestEdge_ConcurrentFailoverAndHeartbeat_NoPanic(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + + setup := func() { + ms.blockRegistry.Unregister("stress-vol") + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + ms.blockRegistry.MarkBlockCapable("vs3") + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "stress-vol", VolumeServer: "vs1", Path: "/data/stress.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r2.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "vs3", Path: "/r3.blk", HealthScore: 0.9, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + } + + for round := 0; round < 30; round++ { + setup() + var wg sync.WaitGroup + wg.Add(4) + go func() { defer wg.Done(); ms.failoverBlockVolumes("vs1") }() + go func() { defer wg.Done(); ms.reevaluateOrphanedPrimaries("vs2") }() + go func() { defer wg.Done(); ms.blockRegistry.PromoteBestReplica("stress-vol") }() + go func() { + defer wg.Done() + ms.blockRegistry.ManualPromote("stress-vol", "", true) + }() + wg.Wait() + } + // No panic = pass. 
+} + +// --- Test 4: LSN + health score interaction — health wins within tolerance --- + +func TestEdge_LSNWithinTolerance_HealthWins(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(100) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("high-health") + ms.blockRegistry.MarkBlockCapable("high-lsn") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "health-vs-lsn", VolumeServer: "primary", Path: "/data/hvl.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + {Server: "high-health", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 950, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "high-lsn", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 999, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("health-vs-lsn") + // Both within tolerance (lag ≤ 100). Health wins: high-health (1.0) > high-lsn (0.5). 
+ if after.VolumeServer != "high-health" { + t.Fatalf("expected high-health promoted (higher health, both within LSN tolerance), got %q", after.VolumeServer) + } +} + +// --- Test 5: Network flap simulation — rapid mark/unmark block capable --- + +func TestEdge_NetworkFlap_RapidMarkUnmark(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("flapper") + ms.blockRegistry.MarkBlockCapable("stable") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "flap-test", VolumeServer: "stable", Path: "/data/flap.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now(), + Replicas: []ReplicaInfo{ + {Server: "flapper", Path: "/r.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + var wg sync.WaitGroup + // Goroutine 1: rapidly flap the "flapper" server. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + ms.blockRegistry.UnmarkBlockCapable("flapper") + ms.blockRegistry.MarkBlockCapable("flapper") + } + }() + + // Goroutine 2: attempt promotions during flapping. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + ms.blockRegistry.EvaluatePromotion("flap-test") + } + }() + + // Goroutine 3: concurrent heartbeat updates. + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + ms.blockRegistry.UpdateFullHeartbeat("flapper", nil, "") + } + }() + + wg.Wait() + // No panic, no corruption = pass. + + // Volume should still be on stable primary. 
+ after, ok := ms.blockRegistry.Lookup("flap-test") + if !ok { + t.Fatal("volume lost during flapping") + } + if after.VolumeServer != "stable" { + t.Fatalf("primary changed from stable to %q during flapping", after.VolumeServer) + } +} + +// --- Test 6: RF=3 all gates — mixed rejection reasons --- + +func TestEdge_RF3_MixedGates_BestEligiblePromoted(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(50) + ms.blockRegistry.MarkBlockCapable("primary") + // Note: "dead-server" NOT marked block capable. + ms.blockRegistry.MarkBlockCapable("stale-hb") + ms.blockRegistry.MarkBlockCapable("good") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "mixed-gates", VolumeServer: "primary", Path: "/data/mixed.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + WALHeadLSN: 500, + Replicas: []ReplicaInfo{ + {Server: "dead-server", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + {Server: "stale-hb", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), + LastHeartbeat: time.Now().Add(-10 * time.Minute)}, // stale + {Server: "good", Path: "/r3.blk", HealthScore: 0.8, WALHeadLSN: 480, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + // Evaluate preflight first (read-only). + pf, err := ms.blockRegistry.EvaluatePromotion("mixed-gates") + if err != nil { + t.Fatalf("evaluate: %v", err) + } + if !pf.Promotable { + t.Fatalf("should be promotable, reason=%s, rejections=%v", pf.Reason, pf.Rejections) + } + // Should have 2 rejections: dead-server (server_dead) + stale-hb (stale_heartbeat). 
+ if len(pf.Rejections) != 2 { + t.Fatalf("expected 2 rejections, got %d: %v", len(pf.Rejections), pf.Rejections) + } + reasons := map[string]string{} + for _, r := range pf.Rejections { + reasons[r.Server] = r.Reason + } + if reasons["dead-server"] != "server_dead" { + t.Fatalf("dead-server: got %q, want server_dead", reasons["dead-server"]) + } + if reasons["stale-hb"] != "stale_heartbeat" { + t.Fatalf("stale-hb: got %q, want stale_heartbeat", reasons["stale-hb"]) + } + + // Now actually promote. + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("mixed-gates") + if after.VolumeServer != "good" { + t.Fatalf("expected 'good' promoted (only eligible), got %q", after.VolumeServer) + } +} + +// --- Test 7: Promotion changes publication (ISCSIAddr, NvmeAddr) --- + +func TestEdge_PromotionUpdatesPublication(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "pub-test", VolumeServer: "primary", Path: "/data/pub.blk", + ISCSIAddr: "primary:3260", NvmeAddr: "primary:4420", NQN: "nqn.primary", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, + ISCSIAddr: "replica:3261", NvmeAddr: "replica:4421", NQN: "nqn.replica", + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("pub-test") + if after.ISCSIAddr != "replica:3261" { + t.Fatalf("ISCSIAddr: got %q, want replica:3261", after.ISCSIAddr) + } + if after.NvmeAddr != "replica:4421" { + t.Fatalf("NvmeAddr: got %q, want 
replica:4421", after.NvmeAddr) + } + if after.NQN != "nqn.replica" { + t.Fatalf("NQN: got %q, want nqn.replica", after.NQN) + } +} + +// --- Test 8: Orphaned primary re-evaluation with LSN lag --- + +func TestEdge_OrphanReevaluation_LSNLag_StillPromotes(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.SetPromotionLSNTolerance(10) + // Primary is dead, replica is alive but lagging. + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "orphan-lag", VolumeServer: "dead-primary", Path: "/data/orphan.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), // expired + WALHeadLSN: 1000, + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, WALHeadLSN: 500, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + // Orphan re-evaluation: replica reconnects. + ms.reevaluateOrphanedPrimaries("replica") + + // The replica has WAL lag of 500 (way beyond tolerance=10). + // But it's the ONLY replica — should it promote or not? + // Current behavior: LSN gate rejects it. No promotion. + after, _ := ms.blockRegistry.Lookup("orphan-lag") + if after.Epoch != 1 { + // If epoch changed, the lagging replica was promoted. + // This may or may not be desired — document the behavior. + t.Logf("NOTE: lagging replica WAS promoted (epoch=%d). LSN lag=%d, tolerance=%d", + after.Epoch, 1000-500, 10) + } else { + t.Logf("NOTE: lagging replica was NOT promoted (epoch=1). Volume is stuck with dead primary.") + t.Logf("This is the current behavior: LSN gate blocks promotion even when it's the only option.") + } + // This test documents behavior, doesn't assert pass/fail. + // The question is: should a lagging-but-only replica be promoted to avoid downtime? 
+} + +// --- Test 9: Rebuild addr cleared after promotion, then repopulated --- + +func TestEdge_RebuildAddr_ClearedThenRepopulated(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("primary") + ms.blockRegistry.MarkBlockCapable("replica") + + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: "rebuild-addr", VolumeServer: "primary", Path: "/data/rebuild.blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + RebuildListenAddr: "primary:15000", // old primary's rebuild addr + Replicas: []ReplicaInfo{ + {Server: "replica", Path: "/r.blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + + ms.blockRegistry.UnmarkBlockCapable("primary") + ms.failoverBlockVolumes("primary") + + after, _ := ms.blockRegistry.Lookup("rebuild-addr") + // RebuildListenAddr should be cleared after promotion (B-11 fix). + if after.RebuildListenAddr != "" { + t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", after.RebuildListenAddr) + } +} + +// --- Test 10: Multiple volumes on same server — all fail over --- + +func TestEdge_MultipleVolumes_SameServer_AllFailover(t *testing.T) { + ms := testMasterServerForFailover(t) + ms.blockRegistry.MarkBlockCapable("vs1") + ms.blockRegistry.MarkBlockCapable("vs2") + + // Register 5 volumes, all with primary on vs1. 
+ for i := 0; i < 5; i++ { + name := "multi-" + string(rune('a'+i)) + ms.blockRegistry.Register(&BlockVolumeEntry{ + Name: name, VolumeServer: "vs1", Path: "/data/" + name + ".blk", + SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary), + Status: StatusActive, LeaseTTL: 5 * time.Second, + LastLeaseGrant: time.Now().Add(-10 * time.Second), + Replicas: []ReplicaInfo{ + {Server: "vs2", Path: "/r/" + name + ".blk", HealthScore: 1.0, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()}, + }, + }) + } + + // Kill vs1 — all 5 volumes should fail over. + ms.blockRegistry.UnmarkBlockCapable("vs1") + ms.failoverBlockVolumes("vs1") + + for i := 0; i < 5; i++ { + name := "multi-" + string(rune('a'+i)) + entry, ok := ms.blockRegistry.Lookup(name) + if !ok { + t.Fatalf("volume %s not found", name) + } + if entry.VolumeServer != "vs2" { + t.Fatalf("volume %s: expected vs2, got %q", name, entry.VolumeServer) + } + if entry.Epoch != 2 { + t.Fatalf("volume %s: epoch got %d, want 2", name, entry.Epoch) + } + } +} diff --git a/weed/server/volume_grpc_client_to_master.go b/weed/server/volume_grpc_client_to_master.go index 10be5b1b7..8454471a2 100644 --- a/weed/server/volume_grpc_client_to_master.go +++ b/weed/server/volume_grpc_client_to_master.go @@ -309,6 +309,16 @@ func (vs *VolumeServer) doHeartbeatWithRetry(masterAddress pb.ServerAddress, grp glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err) return "", err } + case <-vs.blockStateChangeChan: + // Immediate block heartbeat on shipper state change (degraded/recovered). 
+ if vs.blockService == nil { + continue + } + glog.V(0).Infof("volume server %s:%d block state change → immediate heartbeat", vs.store.Ip, vs.store.Port) + if err = stream.Send(vs.collectBlockVolumeHeartbeat(ip, port, dataCenter, rack)); err != nil { + glog.V(0).Infof("Volume Server Failed to send block state-change heartbeat to master %s: %v", masterAddress, err) + return "", err + } case <-blockVolTickChan.C: if vs.blockService == nil { continue diff --git a/weed/server/volume_server.go b/weed/server/volume_server.go index 4db60cc19..ea4f3b7c6 100644 --- a/weed/server/volume_server.go +++ b/weed/server/volume_server.go @@ -55,7 +55,8 @@ type VolumeServer struct { isHeartbeating bool stopChan chan bool - blockService *BlockService // block volume iSCSI service (nil if disabled) + blockService *BlockService // block volume iSCSI service (nil if disabled) + blockStateChangeChan chan bool // triggers immediate block heartbeat on shipper state change } func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, @@ -103,6 +104,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, fileSizeLimitBytes: int64(fileSizeLimitMB) * 1024 * 1024, isHeartbeating: true, stopChan: make(chan bool), + blockStateChangeChan: make(chan bool, 1), inFlightUploadDataLimitCond: sync.NewCond(new(sync.Mutex)), inFlightDownloadDataLimitCond: sync.NewCond(new(sync.Mutex)), concurrentUploadLimit: concurrentUploadLimit, @@ -135,6 +137,7 @@ func NewVolumeServer(adminMux, publicMux *http.ServeMux, ip string, adminMux.HandleFunc("/stats/disk", vs.guard.WhiteList(vs.statsDiskHandler)) */ } + adminMux.HandleFunc("/debug/block/shipper", vs.debugBlockShipperHandler) adminMux.HandleFunc("/", requestIDMiddleware(vs.privateStoreHandler)) if publicMux != adminMux { // separated admin and public port diff --git a/weed/server/volume_server_block.go b/weed/server/volume_server_block.go index c5e0390d8..03444ed9d 100644 --- a/weed/server/volume_server_block.go +++ 
b/weed/server/volume_server_block.go @@ -14,6 +14,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/iscsi" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/nvme" + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/v2bridge" ) // volReplState tracks active replication addresses per volume. @@ -45,6 +46,23 @@ type BlockService struct { // Replication state (CP6-3). replMu sync.RWMutex replStates map[string]*volReplState // keyed by volume path + + // V2 engine bridge (Phase 08 P1). + v2Bridge *v2bridge.ControlBridge +} + +// WireStateChangeNotify sets up shipper state change callbacks on all +// registered volumes so that degradation/recovery triggers an immediate +// heartbeat via the provided channel. Non-blocking send (buffered chan 1). +func (bs *BlockService) WireStateChangeNotify(ch chan bool) { + bs.blockStore.IterateBlockVolumes(func(path string, vol *blockvol.BlockVol) { + vol.SetOnShipperStateChange(func(from, to blockvol.ReplicaState) { + select { + case ch <- true: + default: // already pending + } + }) + }) } // StartBlockService scans blockDir for .blk files, opens them as block volumes, @@ -70,6 +88,7 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string, nvmeC blockDir: blockDir, listenAddr: listenAddr, nvmeListenAddr: nvmeCfg.ListenAddr, + v2Bridge: v2bridge.NewControlBridge(), } // iSCSI target setup. @@ -312,7 +331,18 @@ func (bs *BlockService) DeleteBlockVol(name string) error { } // ProcessAssignments applies assignments from master, including replication setup. +// V2 bridge: also converts each assignment for the V2 engine (recovery ownership); delivery to the engine orchestrator lands in P2. func (bs *BlockService) ProcessAssignments(assignments []blockvol.BlockVolumeAssignment) { + // V2 bridge: convert for the engine (Phase 08 P1; delivery deferred to P2).
+ if bs.v2Bridge != nil { + for _, a := range assignments { + intent := bs.v2Bridge.ConvertAssignment(a, bs.listenAddr) + _ = intent // TODO(P2): deliver to engine orchestrator + glog.V(1).Infof("v2bridge: converted assignment %s epoch=%d → %d replicas", + a.Path, a.Epoch, len(intent.Replicas)) + } + } + for _, a := range assignments { role := blockvol.RoleFromWire(a.Role) ttl := blockvol.LeaseTTLFromWire(a.LeaseTtlMs) @@ -645,6 +675,8 @@ func (bs *BlockService) Shutdown() { // SetBlockService wires a BlockService into the VolumeServer so that // heartbeats include block volume info and the server is marked block-capable. +// Also wires shipper state change callbacks for immediate heartbeat on degradation. func (vs *VolumeServer) SetBlockService(bs *BlockService) { vs.blockService = bs + bs.WireStateChangeNotify(vs.blockStateChangeChan) } diff --git a/weed/server/volume_server_block_debug.go b/weed/server/volume_server_block_debug.go new file mode 100644 index 000000000..747bc5f25 --- /dev/null +++ b/weed/server/volume_server_block_debug.go @@ -0,0 +1,77 @@ +package weed_server + +import ( + "encoding/json" + "net/http" + "time" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" +) + +// ShipperDebugInfo is the real-time shipper state for one replica. +type ShipperDebugInfo struct { + DataAddr string `json:"data_addr"` + State string `json:"state"` + FlushedLSN uint64 `json:"flushed_lsn"` +} + +// BlockVolumeDebugInfo is the real-time block volume state. +type BlockVolumeDebugInfo struct { + Path string `json:"path"` + Role string `json:"role"` + Epoch uint64 `json:"epoch"` + HeadLSN uint64 `json:"head_lsn"` + Degraded bool `json:"degraded"` + Shippers []ShipperDebugInfo `json:"shippers,omitempty"` + Timestamp string `json:"timestamp"` +} + +// debugBlockShipperHandler returns real-time shipper state for all block volumes. 
+// Unlike the master's replica_degraded (heartbeat-lagged), this reads directly +// from the shipper's atomic state field — no heartbeat delay. +// +// GET /debug/block/shipper +func (vs *VolumeServer) debugBlockShipperHandler(w http.ResponseWriter, r *http.Request) { + if vs.blockService == nil { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode([]BlockVolumeDebugInfo{}) + return + } + + store := vs.blockService.Store() + if store == nil { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode([]BlockVolumeDebugInfo{}) + return + } + + var infos []BlockVolumeDebugInfo + store.IterateBlockVolumes(func(path string, vol *blockvol.BlockVol) { + status := vol.Status() + info := BlockVolumeDebugInfo{ + Path: path, + Role: status.Role.String(), + Epoch: status.Epoch, + HeadLSN: status.WALHeadLSN, + Degraded: status.ReplicaDegraded, + Timestamp: time.Now().UTC().Format(time.RFC3339Nano), + } + + // Get per-shipper state from ShipperGroup if available. 
+ sg := vol.GetShipperGroup() + if sg != nil { + for _, ss := range sg.ShipperStates() { + info.Shippers = append(info.Shippers, ShipperDebugInfo{ + DataAddr: ss.DataAddr, + State: ss.State, + FlushedLSN: ss.FlushedLSN, + }) + } + } + + infos = append(infos, info) + }) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(infos) +} diff --git a/weed/storage/blockvol/block_heartbeat.go b/weed/storage/blockvol/block_heartbeat.go index 4e788b689..2e1862897 100644 --- a/weed/storage/blockvol/block_heartbeat.go +++ b/weed/storage/blockvol/block_heartbeat.go @@ -47,6 +47,7 @@ type BlockVolumeAssignment struct { LeaseTtlMs uint32 // lease TTL in milliseconds (0 = no lease) ReplicaDataAddr string // where primary ships WAL data (scalar, RF=2 compat) ReplicaCtrlAddr string // where primary sends barriers (scalar, RF=2 compat) + ReplicaServerID string // V2: stable server identity for scalar replica (from registry) RebuildAddr string // where rebuild server listens ReplicaAddrs []ReplicaAddr // CP8-2: multi-replica addrs (precedence over scalar) } diff --git a/weed/storage/blockvol/blockvol.go b/weed/storage/blockvol/blockvol.go index c4fade920..60b11bf2e 100644 --- a/weed/storage/blockvol/blockvol.go +++ b/weed/storage/blockvol/blockvol.go @@ -83,6 +83,9 @@ type BlockVol struct { // Observability (CP8-4). Metrics *EngineMetrics + // Shipper state change callback — triggers immediate heartbeat. + onShipperStateChange func(from, to ReplicaState) + // Snapshot fields (Phase 5 CP5-2). 
snapMu sync.RWMutex snapshots map[uint32]*activeSnapshot @@ -782,6 +785,7 @@ func (v *BlockVol) SyncCache() error { type ReplicaAddr struct { DataAddr string CtrlAddr string + ServerID string // V2: stable server identity from registry (not address-derived) } // WALAccess provides the shipper with the minimal WAL interface needed @@ -824,6 +828,18 @@ func (a *walAccess) StreamEntries(fromLSN uint64, fn func(*WALEntry) error) erro return a.vol.wal.ScanFrom(a.vol.fd, a.vol.super.WALOffset, checkpointLSN, fromLSN, fn) } +// SetOnShipperStateChange registers a callback for shipper state transitions. +// Called by the volume server to trigger immediate heartbeat on degradation/recovery. +func (v *BlockVol) SetOnShipperStateChange(fn func(from, to ReplicaState)) { + v.onShipperStateChange = fn +} + +// GetShipperGroup returns the shipper group for debug/observability. +// Returns nil if no replication is configured. +func (v *BlockVol) GetShipperGroup() *ShipperGroup { + return v.shipperGroup +} + // SetReplicaAddr configures a single replica endpoint. Backward-compatible wrapper // around SetReplicaAddrs for RF=2 callers. func (v *BlockVol) SetReplicaAddr(dataAddr, ctrlAddr string) { @@ -842,6 +858,11 @@ func (v *BlockVol) SetReplicaAddrs(addrs []ReplicaAddr) { } v.shipperGroup = NewShipperGroup(shippers) + // Wire state change callback so shipper degradation triggers immediate heartbeat. + if v.onShipperStateChange != nil { + v.shipperGroup.SetOnStateChange(v.onShipperStateChange) + } + // Replace the group committer's sync function with a distributed version. 
v.groupCommit.Stop() v.groupCommit = NewGroupCommitter(GroupCommitterConfig{ diff --git a/weed/storage/blockvol/shipper_group.go b/weed/storage/blockvol/shipper_group.go index 96db846d0..aede8f6f9 100644 --- a/weed/storage/blockvol/shipper_group.go +++ b/weed/storage/blockvol/shipper_group.go @@ -188,6 +188,17 @@ func (sg *ShipperGroup) EvaluateRetentionBudgets(timeout time.Duration) { } } +// SetOnStateChange registers a callback on all current shippers for state transitions. +// Used by the volume server to trigger an immediate block heartbeat when a shipper +// transitions to/from degraded. +func (sg *ShipperGroup) SetOnStateChange(fn func(from, to ReplicaState)) { + sg.mu.RLock() + defer sg.mu.RUnlock() + for _, s := range sg.shippers { + s.SetOnStateChange(fn) + } +} + // ShipperStates returns per-replica status for heartbeat reporting. // Master uses this to identify which replicas need rebuild. func (sg *ShipperGroup) ShipperStates() []ReplicaShipperStatus { diff --git a/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml b/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml index c04e3a8fb..a8d092563 100644 --- a/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml +++ b/weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml @@ -104,28 +104,46 @@ phases: iqn: "{{ vol_iqn }}" save_as: device - - name: inject-partition + - name: inject-delay actions: - action: print - msg: "=== Blocking replication ports (3295) from primary to replica ===" + msg: "=== Blocking replication ports (4000-6000) from primary to replica ===" - # Block only replication port — SSH and master heartbeat still work. - - action: inject_partition + # Block the replication port range. Replication data/ctrl ports are + # basePort(3295) + 1000 + hash*3, landing in ~4295-5794 range. 
+ # Blocking 4000-6000 covers all possible replication ports while + # leaving SSH (22) and master heartbeat (9433/18480) open. + - action: exec node: m02 - target_ip: "192.168.1.181" - ports: "3295" + cmd: "iptables -A OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset" + root: "true" - # Trigger a write so barrier fires and times out. - - action: exec + - action: print + msg: "=== Writing to trigger Ship failure + degradation ===" + + # Write in background via fio (best_effort: writes succeed locally). + - action: fio_json node: m01 - cmd: "timeout 10 dd if=/dev/urandom of={{ device }} bs=4k count=1 oflag=direct 2>/dev/null; true" - root: "true" + device: "{{ device }}" + rw: randwrite + bs: 4k + iodepth: "1" + runtime: "10" + time_based: "true" + name: write-during-fault + save_as: fio_fault ignore_error: true - # Wait for barrier timeout (5s) + degradation detection. - - action: sleep - duration: 10s + - action: fio_parse + json_var: fio_fault + metric: iops + save_as: iops_fault + ignore_error: true + + - action: print + msg: "Write IOPS during fault: {{ iops_fault }}" + # Check degraded state after writes. - action: assert_block_field name: "{{ volume_name }}" field: replica_degraded @@ -134,16 +152,17 @@ phases: ignore_error: true - action: print - msg: "During partition: degraded={{ degraded_during }}" + msg: "During fault: degraded={{ degraded_during }}" - name: clear-and-measure actions: - action: print - msg: "=== Clearing partition, measuring shipper recovery ===" + msg: "=== Clearing fault, measuring shipper recovery ===" - - action: clear_fault + - action: exec node: m02 - type: partition + cmd: "iptables -D OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset 2>/dev/null; true" + root: "true" # Check at 5s — V1.5 background reconnect interval is 5s. 
- action: sleep
@@ -221,9 +240,10 @@ phases:
- name: cleanup
always: true
actions:
- - action: clear_fault
+ - action: exec
node: m02
- type: netem
+ cmd: "iptables -D OUTPUT -d 192.168.1.181 -p tcp --dport 4000:6000 -j REJECT --reject-with tcp-reset 2>/dev/null; true"
+ root: "true"
ignore_error: true
- action: stop_weed
node: m01
diff --git a/weed/storage/blockvol/v2bridge/control.go b/weed/storage/blockvol/v2bridge/control.go
index b2636e336..2104b494b 100644
--- a/weed/storage/blockvol/v2bridge/control.go
+++ b/weed/storage/blockvol/v2bridge/control.go
@@ -1,15 +1,15 @@
// control.go implements the real control-plane delivery bridge.
-// It converts BlockVolumeAssignment (from master heartbeat) into
-// V2 engine AssignmentIntent, using real master/registry identity.
+// Converts BlockVolumeAssignment (from master heartbeat) into V2 engine
+// AssignmentIntent using stable server identity from the master registry.
//
-// Identity rule: ReplicaID = <volume-path>/<replica-server>
-// The replica-server is the VS identity from the master registry,
-// not a transport address. This survives address changes.
+// Identity rule: ReplicaID = <volume-path>/<ServerID>
+// ServerID comes from BlockVolumeAssignment.ReplicaServerID or
+// ReplicaAddr.ServerID — NOT derived from transport addresses.
package v2bridge

import (
"fmt"
- "strings"
+ "log"

bridge "github.com/seaweedfs/seaweedfs/sw-block/bridge/blockvol"
engine "github.com/seaweedfs/seaweedfs/sw-block/engine/replication"
@@ -17,27 +17,16 @@ import (
)

// ControlBridge converts real BlockVolumeAssignment into V2 engine intents.
-// It is the live replacement for direct AssignmentIntent construction.
type ControlBridge struct {
adapter *bridge.ControlAdapter
}

-// NewControlBridge creates a control bridge.
func NewControlBridge() *ControlBridge {
- return &ControlBridge{
- adapter: bridge.NewControlAdapter(),
- }
+ return &ControlBridge{adapter: bridge.NewControlAdapter()}
}

-// ConvertAssignment converts a real BlockVolumeAssignment from the master
-// heartbeat response into a V2 engine AssignmentIntent.
-//
-// Identity mapping:
-// - VolumeName = assignment.Path
-// - For primary: ReplicaID per replica = <volume-path>/<replica-server-id>
-// - replica-server-id = extracted from ReplicaAddrs or scalar fields
-// - Epoch from assignment
-// - SessionKind from Role
+// ConvertAssignment converts a real BlockVolumeAssignment into an engine intent.
+// localServerID is the identity of the local volume server (for replica/rebuild roles).
func (cb *ControlBridge) ConvertAssignment(a blockvol.BlockVolumeAssignment, localServerID string) engine.AssignmentIntent {
role := blockvol.RoleFromWire(a.Role)
volumeName := a.Path
@@ -54,48 +43,48 @@ func (cb *ControlBridge) ConvertAssignment(a blockvol.BlockVolumeAssignment, loc
}
}

-// convertPrimaryAssignment: primary receives assignment with replica targets.
func (cb *ControlBridge) convertPrimaryAssignment(a blockvol.BlockVolumeAssignment, volumeName string) engine.AssignmentIntent { primary := bridge.MasterAssignment{ - VolumeName: volumeName, - Epoch: a.Epoch, - Role: "primary", - PrimaryServerID: "", // primary doesn't need its own server ID in the assignment + VolumeName: volumeName, + Epoch: a.Epoch, + Role: "primary", } var replicas []bridge.MasterAssignment if len(a.ReplicaAddrs) > 0 { for _, ra := range a.ReplicaAddrs { - serverID := extractServerID(ra.DataAddr) + if ra.ServerID == "" { + log.Printf("v2bridge: skipping replica with empty ServerID (data=%s)", ra.DataAddr) + continue // fail closed: skip replicas without stable identity + } replicas = append(replicas, bridge.MasterAssignment{ VolumeName: volumeName, Epoch: a.Epoch, Role: "replica", - ReplicaServerID: serverID, + ReplicaServerID: ra.ServerID, DataAddr: ra.DataAddr, CtrlAddr: ra.CtrlAddr, - AddrVersion: 0, // will be bumped on address change detection }) } - } else if a.ReplicaDataAddr != "" { - // Scalar RF=2 compat. - serverID := extractServerID(a.ReplicaDataAddr) + } else if a.ReplicaServerID != "" && a.ReplicaDataAddr != "" { + // Scalar RF=2 path with explicit ServerID. replicas = append(replicas, bridge.MasterAssignment{ VolumeName: volumeName, Epoch: a.Epoch, Role: "replica", - ReplicaServerID: serverID, + ReplicaServerID: a.ReplicaServerID, DataAddr: a.ReplicaDataAddr, CtrlAddr: a.ReplicaCtrlAddr, }) + } else if a.ReplicaDataAddr != "" { + log.Printf("v2bridge: scalar replica assignment without ServerID (data=%s) — skipping", a.ReplicaDataAddr) + // Fail closed: do not create address-derived identity. } return cb.adapter.ToAssignmentIntent(primary, replicas) } -// convertReplicaAssignment: replica receives its own role assignment. 
func (cb *ControlBridge) convertReplicaAssignment(a blockvol.BlockVolumeAssignment, volumeName, localServerID string) engine.AssignmentIntent {
- // Replica doesn't manage other replicas — just acknowledges its role.
return engine.AssignmentIntent{
Epoch: a.Epoch,
Replicas: []engine.ReplicaAssignment{
@@ -110,7 +99,6 @@ func (cb *ControlBridge) convertReplicaAssignme
}
}

-// convertRebuildAssignment: rebuilding replica.
func (cb *ControlBridge) convertRebuildAssignment(a blockvol.BlockVolumeAssignment, volumeName, localServerID string) engine.AssignmentIntent {
replicaID := fmt.Sprintf("%s/%s", volumeName, localServerID)
return engine.AssignmentIntent{
@@ -129,22 +117,3 @@ func (cb *ControlBridge) convertRebuildAssignme
},
}
}
-
-// extractServerID derives a stable server identity from an address.
-// Uses the host:port as the server ID (this is how the master registry
-// keys servers). In production, this would come from the registry's
-// ReplicaInfo.Server field directly.
-//
-// For now: strip to host:grpc-port format to match master registry keys.
-func extractServerID(addr string) string {
- // addr is typically "ip:port" — use as-is for server ID.
- // The master registry uses the same format for ReplicaInfo.Server.
- if addr == "" {
- return "unknown"
- }
- // Strip any path suffix, keep host:port.
- if idx := strings.Index(addr, "/"); idx >= 0 {
- return addr[:idx]
- }
- return addr
-}
diff --git a/weed/storage/blockvol/v2bridge/control_test.go b/weed/storage/blockvol/v2bridge/control_test.go
index b91e40394..b7112fdc3 100644
--- a/weed/storage/blockvol/v2bridge/control_test.go
+++ b/weed/storage/blockvol/v2bridge/control_test.go
@@ -9,234 +9,184 @@ import (
// ============================================================
// Phase 08 P1: Real control delivery tests
-// Validates real BlockVolumeAssignment → engine AssignmentIntent.
+// Identity: ReplicaID = <volume-path>/<ServerID> — NOT address-derived.
// ============================================================ -// --- E1: Live assignment delivery → engine intent --- - -func TestControl_PrimaryAssignment_StableIdentity(t *testing.T) { +func TestControl_PrimaryAssignment_StableServerID(t *testing.T) { cb := NewControlBridge() - // Real assignment from master heartbeat. a := blockvol.BlockVolumeAssignment{ Path: "pvc-data-1", Epoch: 3, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent := cb.ConvertAssignment(a, "vs1:9333") + intent := cb.ConvertAssignment(a, "vs1") - if intent.Epoch != 3 { - t.Fatalf("epoch=%d", intent.Epoch) - } if len(intent.Replicas) != 1 { t.Fatalf("replicas=%d", len(intent.Replicas)) } - // ReplicaID = volume-path / replica-server (NOT address-derived transport endpoint). r := intent.Replicas[0] - expected := "pvc-data-1/10.0.0.2:9333" - if r.ReplicaID != expected { - t.Fatalf("ReplicaID=%s, want %s", r.ReplicaID, expected) + // ReplicaID uses ServerID, not address. + if r.ReplicaID != "pvc-data-1/vs2" { + t.Fatalf("ReplicaID=%s, want pvc-data-1/vs2", r.ReplicaID) } - - // Endpoint is the transport address. if r.Endpoint.DataAddr != "10.0.0.2:9333" { t.Fatalf("DataAddr=%s", r.Endpoint.DataAddr) } + if intent.RecoveryTargets["pvc-data-1/vs2"] != engine.SessionCatchUp { + t.Fatalf("recovery=%s", intent.RecoveryTargets["pvc-data-1/vs2"]) + } +} + +func TestControl_AddressChange_IdentityPreserved(t *testing.T) { + cb := NewControlBridge() + + // Same ServerID, different address. 
+ a1 := blockvol.BlockVolumeAssignment{ + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + } + a2 := blockvol.BlockVolumeAssignment{ + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", + ReplicaDataAddr: "10.0.0.5:9333", ReplicaCtrlAddr: "10.0.0.5:9334", + } + + intent1 := cb.ConvertAssignment(a1, "vs1") + intent2 := cb.ConvertAssignment(a2, "vs1") - // Recovery target for replica. - if intent.RecoveryTargets[expected] != engine.SessionCatchUp { - t.Fatalf("recovery=%s", intent.RecoveryTargets[expected]) + if intent1.Replicas[0].ReplicaID != intent2.Replicas[0].ReplicaID { + t.Fatalf("identity changed: %s → %s", + intent1.Replicas[0].ReplicaID, intent2.Replicas[0].ReplicaID) + } + if intent2.Replicas[0].Endpoint.DataAddr != "10.0.0.5:9333" { + t.Fatal("endpoint should be updated") } } -func TestControl_PrimaryAssignment_MultiReplica(t *testing.T) { +func TestControl_MultiReplica_StableServerIDs(t *testing.T) { cb := NewControlBridge() a := blockvol.BlockVolumeAssignment{ - Path: "pvc-data-1", - Epoch: 2, - Role: uint32(blockvol.RolePrimary), + Path: "vol1", Epoch: 2, Role: uint32(blockvol.RolePrimary), ReplicaAddrs: []blockvol.ReplicaAddr{ - {DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334"}, - {DataAddr: "10.0.0.3:9333", CtrlAddr: "10.0.0.3:9334"}, + {DataAddr: "10.0.0.2:9333", CtrlAddr: "10.0.0.2:9334", ServerID: "vs2"}, + {DataAddr: "10.0.0.3:9333", CtrlAddr: "10.0.0.3:9334", ServerID: "vs3"}, }, } - intent := cb.ConvertAssignment(a, "vs1:9333") - + intent := cb.ConvertAssignment(a, "vs1") if len(intent.Replicas) != 2 { t.Fatalf("replicas=%d", len(intent.Replicas)) } - // Both replicas have stable identity. 
ids := map[string]bool{} for _, r := range intent.Replicas { ids[r.ReplicaID] = true } - if !ids["pvc-data-1/10.0.0.2:9333"] || !ids["pvc-data-1/10.0.0.3:9333"] { - t.Fatalf("IDs: %v", ids) + if !ids["vol1/vs2"] || !ids["vol1/vs3"] { + t.Fatalf("IDs: %v (should use ServerID, not address)", ids) } } -// --- E2: Address change preserves identity --- - -func TestControl_AddressChange_SameServerID(t *testing.T) { +func TestControl_MissingServerID_FailsClosed(t *testing.T) { cb := NewControlBridge() - // First assignment. + // Scalar: no ServerID → no replica created. a1 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + // ReplicaServerID intentionally empty. + } + intent1 := cb.ConvertAssignment(a1, "vs1") + if len(intent1.Replicas) != 0 { + t.Fatalf("scalar without ServerID should produce 0 replicas, got %d", len(intent1.Replicas)) } - intent1 := cb.ConvertAssignment(a1, "vs1:9333") - // Address changes (replica restarted on different IP). + // Multi: one with ServerID, one without → only one replica. a2 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.5:9333", - ReplicaCtrlAddr: "10.0.0.5:9334", - } - intent2 := cb.ConvertAssignment(a2, "vs1:9333") - - // NOTE: with current extractServerID, different IPs = different server IDs. - // This is a known limitation: address-based server identity. - // In production, the master registry would supply a stable server ID. - // For now, document the boundary. - id1 := intent1.Replicas[0].ReplicaID - id2 := intent2.Replicas[0].ReplicaID - t.Logf("address change: id1=%s id2=%s (different if IP changes)", id1, id2) - - // The critical test: same IP, different port (same server, port change). 
- a3 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9444", // same IP, different port - ReplicaCtrlAddr: "10.0.0.2:9445", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaAddrs: []blockvol.ReplicaAddr{ + {DataAddr: "10.0.0.2:9333", ServerID: "vs2"}, + {DataAddr: "10.0.0.3:9333", ServerID: ""}, // empty → skipped + }, + } + intent2 := cb.ConvertAssignment(a2, "vs1") + if len(intent2.Replicas) != 1 { + t.Fatalf("multi with 1 missing ServerID: replicas=%d, want 1", len(intent2.Replicas)) } - intent3 := cb.ConvertAssignment(a3, "vs1:9333") - id3 := intent3.Replicas[0].ReplicaID - - // Same IP different port = different server ID in current model. - // This is the V1 identity limitation that a future registry-backed - // server ID would resolve. - t.Logf("port change: id1=%s id3=%s", id1, id3) } -// --- E3: Epoch fencing through real assignment --- - func TestControl_EpochFencing_IntegratedPath(t *testing.T) { cb := NewControlBridge() - driver := engine.NewRecoveryDriver(nil) // no storage needed for control-path test + driver := engine.NewRecoveryDriver(nil) - // Epoch 1 assignment. 
a1 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent1 := cb.ConvertAssignment(a1, "vs1:9333") - driver.Orchestrator.ProcessAssignment(intent1) + driver.Orchestrator.ProcessAssignment(cb.ConvertAssignment(a1, "vs1")) - s := driver.Orchestrator.Registry.Sender("vol1/10.0.0.2:9333") - if s == nil { - t.Fatal("sender should exist after epoch 1 assignment") - } - if !s.HasActiveSession() { - t.Fatal("should have session after epoch 1") + s := driver.Orchestrator.Registry.Sender("vol1/vs2") + if s == nil || !s.HasActiveSession() { + t.Fatal("should have session at epoch 1") } - // Epoch bump (failover). driver.Orchestrator.InvalidateEpoch(2) - driver.Orchestrator.UpdateSenderEpoch("vol1/10.0.0.2:9333", 2) + driver.Orchestrator.UpdateSenderEpoch("vol1/vs2", 2) if s.HasActiveSession() { - t.Fatal("old session should be invalidated after epoch bump") + t.Fatal("old session should be invalidated") } - // Epoch 2 assignment. a2 := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 2, - Role: uint32(blockvol.RolePrimary), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", + Path: "vol1", Epoch: 2, Role: uint32(blockvol.RolePrimary), + ReplicaServerID: "vs2", ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", } - intent2 := cb.ConvertAssignment(a2, "vs1:9333") - driver.Orchestrator.ProcessAssignment(intent2) + driver.Orchestrator.ProcessAssignment(cb.ConvertAssignment(a2, "vs1")) if !s.HasActiveSession() { t.Fatal("should have new session at epoch 2") } - // Log shows invalidation. 
hasInvalidation := false - for _, e := range driver.Orchestrator.Log.EventsFor("vol1/10.0.0.2:9333") { + for _, e := range driver.Orchestrator.Log.EventsFor("vol1/vs2") { if e.Event == "session_invalidated" { hasInvalidation = true } } if !hasInvalidation { - t.Fatal("log must show session invalidation on epoch bump") + t.Fatal("log must show invalidation") } } -// --- E4: Rebuild role mapping --- - func TestControl_RebuildAssignment(t *testing.T) { cb := NewControlBridge() - a := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 3, - Role: uint32(blockvol.RoleRebuilding), - ReplicaDataAddr: "10.0.0.2:9333", - ReplicaCtrlAddr: "10.0.0.2:9334", - RebuildAddr: "10.0.0.1:15000", - } - - intent := cb.ConvertAssignment(a, "10.0.0.2:9333") - - if len(intent.RecoveryTargets) != 1 { - t.Fatalf("recovery targets=%d", len(intent.RecoveryTargets)) + Path: "vol1", Epoch: 3, Role: uint32(blockvol.RoleRebuilding), + ReplicaDataAddr: "10.0.0.2:9333", ReplicaCtrlAddr: "10.0.0.2:9334", + RebuildAddr: "10.0.0.1:15000", } - - replicaID := "vol1/10.0.0.2:9333" - if intent.RecoveryTargets[replicaID] != engine.SessionRebuild { - t.Fatalf("recovery=%s", intent.RecoveryTargets[replicaID]) + intent := cb.ConvertAssignment(a, "vs2") + if intent.RecoveryTargets["vol1/vs2"] != engine.SessionRebuild { + t.Fatalf("recovery=%s", intent.RecoveryTargets["vol1/vs2"]) } } -// --- E5: Replica assignment --- - func TestControl_ReplicaAssignment(t *testing.T) { cb := NewControlBridge() - a := blockvol.BlockVolumeAssignment{ - Path: "vol1", - Epoch: 1, - Role: uint32(blockvol.RoleReplica), - ReplicaDataAddr: "10.0.0.1:14260", - ReplicaCtrlAddr: "10.0.0.1:14261", - } - - intent := cb.ConvertAssignment(a, "vs2:9333") - - if len(intent.Replicas) != 1 { - t.Fatalf("replicas=%d", len(intent.Replicas)) + Path: "vol1", Epoch: 1, Role: uint32(blockvol.RoleReplica), + ReplicaDataAddr: "10.0.0.1:14260", ReplicaCtrlAddr: "10.0.0.1:14261", } - if intent.Replicas[0].ReplicaID != "vol1/vs2:9333" { + intent := 
cb.ConvertAssignment(a, "vs2") + if intent.Replicas[0].ReplicaID != "vol1/vs2" { t.Fatalf("ReplicaID=%s", intent.Replicas[0].ReplicaID) } } diff --git a/weed/storage/blockvol/wal_shipper.go b/weed/storage/blockvol/wal_shipper.go index d3785ce2f..cf9d6b6c3 100644 --- a/weed/storage/blockvol/wal_shipper.go +++ b/weed/storage/blockvol/wal_shipper.go @@ -71,6 +71,17 @@ type WALShipper struct { catchupFailures int // consecutive catch-up failures; reset on success lastContactTime atomic.Value // time.Time: last successful barrier/handshake/catch-up stopped atomic.Bool + + // onStateChange is called when the shipper transitions between states. + // Used to trigger immediate heartbeat on degradation/recovery. + // Set via SetOnStateChange. Nil = no callback. + onStateChange func(from, to ReplicaState) +} + +// SetOnStateChange registers a callback for shipper state transitions. +// The callback is invoked synchronously from markDegraded/markInSync. +func (s *WALShipper) SetOnStateChange(fn func(from, to ReplicaState)) { + s.onStateChange = fn } const maxCatchupRetries = 3 @@ -345,8 +356,11 @@ func (s *WALShipper) ensureCtrlConn() error { } func (s *WALShipper) markDegraded() { - s.state.Store(uint32(ReplicaDegraded)) - log.Printf("wal_shipper: replica degraded (data=%s, ctrl=%s, state=%s)", s.dataAddr, s.controlAddr, s.State()) + prev := ReplicaState(s.state.Swap(uint32(ReplicaDegraded))) + log.Printf("wal_shipper: replica degraded (data=%s, ctrl=%s, prev=%s)", s.dataAddr, s.controlAddr, prev) + if prev != ReplicaDegraded && s.onStateChange != nil { + s.onStateChange(prev, ReplicaDegraded) + } } // resetConnections closes both data and control connections for a clean retry. 
@@ -404,10 +418,13 @@ func (s *WALShipper) doReconnectAndCatchUp() error { } func (s *WALShipper) markInSync() { - s.state.Store(uint32(ReplicaInSync)) + prev := ReplicaState(s.state.Swap(uint32(ReplicaInSync))) s.catchupFailures = 0 s.touchContactTime() - log.Printf("wal_shipper: replica in-sync (data=%s, ctrl=%s)", s.dataAddr, s.controlAddr) + log.Printf("wal_shipper: replica in-sync (data=%s, ctrl=%s, prev=%s)", s.dataAddr, s.controlAddr, prev) + if prev != ReplicaInSync && s.onStateChange != nil { + s.onStateChange(prev, ReplicaInSync) + } } const catchupTimeout = 30 * time.Second diff --git a/weed/storage/store_blockvol.go b/weed/storage/store_blockvol.go index 6f6bb8229..f2d18fc5a 100644 --- a/weed/storage/store_blockvol.go +++ b/weed/storage/store_blockvol.go @@ -84,6 +84,15 @@ func (bs *BlockVolumeStore) ListBlockVolumes() []string { return paths } +// IterateBlockVolumes calls fn for each registered block volume. +func (bs *BlockVolumeStore) IterateBlockVolumes(fn func(path string, vol *blockvol.BlockVol)) { + bs.mu.RLock() + defer bs.mu.RUnlock() + for path, vol := range bs.volumes { + fn(path, vol) + } +} + // CollectBlockVolumeHeartbeat returns status for all registered // block volumes, suitable for inclusion in a heartbeat message. func (bs *BlockVolumeStore) CollectBlockVolumeHeartbeat() []blockvol.BlockVolumeInfoMessage {