Browse Source
fix: force failure conditions in P2 tests, add BlockVol.ForceFlush
P2 tests now force conditions instead of observing them: FC3: Real WAL scan verified directly — StreamWALEntries transfers real entries from disk (head=5, transferred=5). Engine planning also verified (ZeroGap in V1 interim documented). FC4: ForceFlush advances checkpoint/tail to 20. Replica at 0 is below tail → NeedsRebuild with proof: "gap_beyond_retention: need LSN 1 but tail=20". No early return. FC5: ForceFlush advances checkpoint to 10. Assertive: - replica at checkpoint=10 → ZeroGap (V1 interim) - replica at 0 → NeedsRebuild (below tail, not CatchUp) FC1/FC2: Labeled as integrated engine/storage (control simulated). New: BlockVol.ForceFlush() — triggers synchronous flusher cycle for test use. Advances checkpoint + WAL tail deterministically. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>feature/sw-block
7 changed files with 1380 additions and 191 deletions
-
9weed/storage/blockvol/blockvol.go
-
291weed/storage/blockvol/testrunner/scenarios/internal/robust-gap-failover.yaml
-
271weed/storage/blockvol/testrunner/scenarios/internal/robust-partition-catchup.yaml
-
250weed/storage/blockvol/testrunner/scenarios/internal/robust-reconnect-catchup.yaml
-
197weed/storage/blockvol/testrunner/scenarios/internal/robust-shipper-reconnect.yaml
-
239weed/storage/blockvol/testrunner/scenarios/internal/robust-slow-replica.yaml
-
314weed/storage/blockvol/v2bridge/failure_replay_test.go
---
name: robust-gap-failover
timeout: 10m

# Robust dimension: Does data written during replica outage survive failover?
#
# V1 (pre-Phase 13): shipper stays degraded after replica restart.
# Gap data lives only on primary. Failover loses it.
#
# V1.5 (post-Phase 13): background reconnect catches up the gap.
# Gap data is replicated. Failover preserves it.
#
# Flow:
# 1. Write data A (replicated)
# 2. Kill replica
# 3. Write data B (gap — only on primary)
# 4. Restart replica, wait for master "healthy"
# 5. Wait extra time for catch-up to complete (if it will)
# 6. Kill primary (force failover)
# 7. Promote replica to new primary
# 8. Read data A (should always be there)
# 9. Read data B (V1.5: present, V1: possibly missing)

env:
  master_url: "http://192.168.1.184:9433"
  volume_name: robust-gap
  vol_size: "1073741824"

topology:
  nodes:
    m01:
      host: 192.168.1.181
      user: testdev
      key: "/opt/work/testdev_key"
    m02:
      host: 192.168.1.184
      user: testdev
      key: "/opt/work/testdev_key"

phases:
  - name: cluster-start
    actions:
      # Kill any leftover processes and wipe state dirs before starting.
      - action: exec
        node: m02
        cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-gap-master /tmp/sw-gap-vs1 && mkdir -p /tmp/sw-gap-master /tmp/sw-gap-vs1/blocks"
        root: "true"
        ignore_error: true
      - action: exec
        node: m01
        cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-gap-vs2 && mkdir -p /tmp/sw-gap-vs2/blocks"
        root: "true"
        ignore_error: true

      - action: start_weed_master
        node: m02
        port: "9433"
        dir: /tmp/sw-gap-master
        save_as: master_pid

      - action: sleep
        duration: 3s

      # Primary on m02, replica on m01.
      - action: start_weed_volume
        node: m02
        port: "18480"
        master: "localhost:9433"
        dir: /tmp/sw-gap-vs1
        extra_args: "-block.dir=/tmp/sw-gap-vs1/blocks -block.listen=:3295 -ip=192.168.1.184"
        save_as: vs1_pid

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-gap-vs2
        extra_args: "-block.dir=/tmp/sw-gap-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid

      - action: sleep
        duration: 3s

      - action: wait_cluster_ready
        node: m02
        master_url: "{{ master_url }}"

      - action: wait_block_servers
        count: "2"

  - name: create-volume
    actions:
      - action: create_block_volume
        name: "{{ volume_name }}"
        size_bytes: "{{ vol_size }}"
        replica_factor: "2"
        durability_mode: "best_effort"

      - action: wait_volume_healthy
        name: "{{ volume_name }}"
        timeout: 60s

      - action: validate_replication
        volume_name: "{{ volume_name }}"
        expected_rf: "2"
        expected_durability: "best_effort"
        require_not_degraded: "true"
        require_cross_machine: "true"

  - name: write-data-A
    actions:
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol

      # Connect to primary on m02 via iSCSI from m01.
      - action: iscsi_login_direct
        node: m01
        host: "{{ vol_iscsi_host }}"
        port: "{{ vol_iscsi_port }}"
        iqn: "{{ vol_iqn }}"
        save_as: device

      # Write data A at offset 0 (5MB). Both primary and replica get this.
      - action: exec
        node: m01
        cmd: "dd if=/dev/urandom of={{ device }} bs=1M count=5 oflag=direct 2>/dev/null && sync && md5sum {{ device }} | head -c 32"
        root: "true"
        save_as: md5_full

      - action: print
        msg: "Data A written (full device md5): {{ md5_full }}"

      # Brief pause for replication to complete.
      - action: sleep
        duration: 3s

  - name: kill-replica
    actions:
      - action: print
        msg: "=== Killing replica (m01 VS) ==="

      - action: exec
        node: m01
        cmd: "kill -9 {{ vs2_pid }}"
        root: "true"
        ignore_error: true

      - action: sleep
        duration: 2s

  - name: write-data-B-during-outage
    actions:
      - action: print
        msg: "=== Writing data B during replica outage (gap data) ==="

      # Write data B at offset 5MB (3MB). Only primary gets this.
      # Use timeout to avoid hanging if barrier blocks.
      - action: exec
        node: m01
        cmd: "timeout 15 dd if=/dev/urandom of={{ device }} bs=1M count=3 seek=5 oflag=direct 2>/dev/null; timeout 5 sync; dd if={{ device }} bs=1M count=8 iflag=direct 2>/dev/null | md5sum | head -c 32"
        root: "true"
        save_as: md5_after_gap
        timeout: 30s
        ignore_error: true

      - action: print
        msg: "Data A+B md5 (including gap): {{ md5_after_gap }}"

  - name: restart-replica-and-wait
    actions:
      - action: print
        msg: "=== Restarting replica, waiting for catch-up ==="

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-gap-vs2
        extra_args: "-block.dir=/tmp/sw-gap-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid_new

      # Wait for master to report healthy.
      - action: wait_volume_healthy
        name: "{{ volume_name }}"
        timeout: 60s

      # Extra wait: give V1.5 background reconnect time to catch up.
      # V1's shipper won't reconnect regardless of wait time.
      - action: print
        msg: "=== Waiting 15s for shipper catch-up ==="

      - action: sleep
        duration: 15s

  - name: failover-to-replica
    actions:
      - action: print
        msg: "=== Killing primary (m02 VS) to force failover ==="

      # Disconnect iSCSI before killing primary.
      - action: iscsi_cleanup
        node: m01
        ignore_error: true

      - action: exec
        node: m02
        cmd: "kill -9 {{ vs1_pid }}"
        root: "true"
        ignore_error: true

      - action: sleep
        duration: 5s

      # Promote replica (m01) to primary.
      - action: block_promote
        name: "{{ volume_name }}"
        reason: "gap-failover-test"
        ignore_error: true

      # Wait for new primary.
      - action: wait_block_primary
        name: "{{ volume_name }}"
        not: "192.168.1.184:18480"
        timeout: 30s
        ignore_error: true

      - action: sleep
        duration: 3s

  - name: verify-on-new-primary
    actions:
      - action: print
        msg: "=== Verifying data on new primary (former replica) ==="

      # Re-lookup volume to get new primary's iSCSI addr.
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol2

      - action: iscsi_login_direct
        node: m01
        host: "{{ vol2_iscsi_host }}"
        port: "{{ vol2_iscsi_port }}"
        iqn: "{{ vol2_iqn }}"
        save_as: device2

      # Read 8MB (data A + data B) and compute md5.
      - action: exec
        node: m01
        cmd: "dd if={{ device2 }} bs=1M count=8 iflag=direct 2>/dev/null | md5sum | head -c 32"
        root: "true"
        save_as: md5_after_failover

      - action: print
        msg: "After failover (8MB md5): {{ md5_after_failover }}"
      - action: print
        msg: "Expected (before failover): {{ md5_after_gap }}"

      # THE KEY CHECK: does gap data survive failover?
      - action: assert_equal
        actual: "{{ md5_after_failover }}"
        expected: "{{ md5_after_gap }}"

  - name: results
    actions:
      - action: print
        msg: "=== Gap Failover Test Results ==="
      - action: print
        msg: "Data A+B md5 (primary): {{ md5_after_gap }}"
      - action: print
        msg: "Data A+B md5 (after failover): {{ md5_after_failover }}"
      - action: print
        msg: "Gap data survived failover: YES"

  - name: cleanup
    always: true
    actions:
      - action: iscsi_cleanup
        node: m01
        ignore_error: true
      - action: stop_weed
        node: m01
        pid: "{{ vs2_pid_new }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ vs1_pid }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ master_pid }}"
        ignore_error: true
---
name: robust-partition-catchup
timeout: 5m

# Robust dimension: Does data written during network partition
# reach the replica after partition heals?
#
# V1.5 (post-Phase 13): background reconnect catches up WAL gap.
# V1 (pre-Phase 13): shipper stays degraded, gap data never shipped.
#
# Uses best_effort so writes succeed during partition.
# After partition heals, waits for catch-up, then verifies via
# failover that the replica has the gap data.

env:
  master_url: "http://192.168.1.184:9433"
  volume_name: robust-part
  vol_size: "1073741824"

topology:
  nodes:
    m01:
      host: 192.168.1.181
      user: testdev
      key: "/opt/work/testdev_key"
    m02:
      host: 192.168.1.184
      user: testdev
      key: "/opt/work/testdev_key"

phases:
  - name: cluster-start
    actions:
      # Kill any leftover processes and wipe state dirs before starting.
      - action: exec
        node: m02
        cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-part-master /tmp/sw-part-vs1 && mkdir -p /tmp/sw-part-master /tmp/sw-part-vs1/blocks"
        root: "true"
        ignore_error: true
      - action: exec
        node: m01
        cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-part-vs2 && mkdir -p /tmp/sw-part-vs2/blocks"
        root: "true"
        ignore_error: true

      - action: start_weed_master
        node: m02
        port: "9433"
        dir: /tmp/sw-part-master
        save_as: master_pid

      - action: sleep
        duration: 3s

      # Primary on m02, replica on m01.
      - action: start_weed_volume
        node: m02
        port: "18480"
        master: "localhost:9433"
        dir: /tmp/sw-part-vs1
        extra_args: "-block.dir=/tmp/sw-part-vs1/blocks -block.listen=:3295 -ip=192.168.1.184"
        save_as: vs1_pid

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-part-vs2
        extra_args: "-block.dir=/tmp/sw-part-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid

      - action: sleep
        duration: 3s

      - action: wait_cluster_ready
        node: m02
        master_url: "{{ master_url }}"

      - action: wait_block_servers
        count: "2"

  - name: create-volume
    actions:
      - action: create_block_volume
        name: "{{ volume_name }}"
        size_bytes: "{{ vol_size }}"
        replica_factor: "2"
        durability_mode: "best_effort"

      - action: wait_volume_healthy
        name: "{{ volume_name }}"
        timeout: 60s

  - name: write-and-connect
    actions:
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol

      - action: iscsi_login_direct
        node: m01
        host: "{{ vol_iscsi_host }}"
        port: "{{ vol_iscsi_port }}"
        iqn: "{{ vol_iqn }}"
        save_as: device

      # Write data A (replicated to both).
      - action: exec
        node: m01
        cmd: "dd if=/dev/urandom of={{ device }} bs=1M count=5 oflag=direct 2>/dev/null && sync"
        root: "true"

      - action: sleep
        duration: 3s

      # Snapshot: md5 of first 5MB.
      - action: exec
        node: m01
        cmd: "dd if={{ device }} bs=1M count=5 iflag=direct 2>/dev/null | md5sum | head -c 32"
        root: "true"
        save_as: md5_data_a

      - action: print
        msg: "Data A (5MB): {{ md5_data_a }}"

  - name: partition-and-write
    actions:
      - action: print
        msg: "=== Injecting network partition (m02 → m01 repl ports) ==="

      # Block replication traffic from primary (m02) to replica (m01).
      # Block both data port and volume port to prevent WAL shipping.
      - action: inject_partition
        node: m02
        target_ip: "192.168.1.181"
        ports: "3295,18480"

      - action: sleep
        duration: 3s

      - action: print
        msg: "=== Writing data B during partition (gap data) ==="

      # Write data B at offset 5MB. Primary accepts (best_effort),
      # replica doesn't receive (partitioned).
      - action: exec
        node: m01
        cmd: "timeout 15 dd if=/dev/urandom of={{ device }} bs=1M count=3 seek=5 oflag=direct 2>/dev/null && timeout 5 sync; echo done"
        root: "true"
        timeout: 25s

      # Snapshot: md5 of all 8MB (A + B).
      - action: exec
        node: m01
        cmd: "dd if={{ device }} bs=1M count=8 iflag=direct 2>/dev/null | md5sum | head -c 32"
        root: "true"
        save_as: md5_data_ab

      - action: print
        msg: "Data A+B (8MB) on primary: {{ md5_data_ab }}"

  - name: heal-and-wait
    actions:
      - action: print
        msg: "=== Healing partition, waiting for catch-up ==="

      - action: clear_fault
        node: m02
        type: partition

      # Wait for V1.5 background reconnect (5s interval) + catch-up.
      # V1 shipper stays degraded — this wait won't help it.
      - action: sleep
        duration: 20s

      - action: print
        msg: "=== 20s wait complete. Checking replication state ==="

      - action: validate_replication
        volume_name: "{{ volume_name }}"
        require_not_degraded: "false"
        ignore_error: true

  - name: failover-and-verify
    actions:
      - action: print
        msg: "=== Disconnecting iSCSI, killing primary, promoting replica ==="

      - action: iscsi_cleanup
        node: m01
        ignore_error: true

      - action: exec
        node: m02
        cmd: "kill -9 {{ vs1_pid }}"
        root: "true"
        ignore_error: true

      - action: sleep
        duration: 5s

      # The replica (m01) should auto-promote or we force it.
      - action: block_promote
        name: "{{ volume_name }}"
        reason: "partition-catchup-test"
        ignore_error: true

      - action: sleep
        duration: 5s

      # Re-lookup and connect to new primary.
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol2
        ignore_error: true

      - action: iscsi_login_direct
        node: m01
        host: "{{ vol2_iscsi_host }}"
        port: "{{ vol2_iscsi_port }}"
        iqn: "{{ vol2_iqn }}"
        save_as: device2
        ignore_error: true

      # Read 8MB from new primary (former replica).
      - action: exec
        node: m01
        cmd: "dd if={{ device2 }} bs=1M count=8 iflag=direct 2>/dev/null | md5sum | head -c 32"
        root: "true"
        save_as: md5_after_failover
        ignore_error: true

      - action: print
        msg: "Data on former replica (8MB): {{ md5_after_failover }}"
      - action: print
        msg: "Expected (primary had): {{ md5_data_ab }}"

  - name: results
    actions:
      - action: print
        msg: "=== Partition Catch-up Test ==="
      - action: print
        msg: "Data A (pre-partition): {{ md5_data_a }}"
      - action: print
        msg: "Data A+B (post-partition): {{ md5_data_ab }}"
      - action: print
        msg: "After failover: {{ md5_after_failover }}"

      # If md5_after_failover == md5_data_ab: gap data survived (V1.5 caught up)
      # If md5_after_failover == md5_data_a: gap data lost (V1 never shipped it)
      # If md5_after_failover != either: something else went wrong

  - name: cleanup
    always: true
    actions:
      - action: clear_fault
        node: m02
        type: partition
        ignore_error: true
      - action: iscsi_cleanup
        node: m01
        ignore_error: true
      - action: stop_weed
        node: m01
        pid: "{{ vs2_pid }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ vs1_pid }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ master_pid }}"
        ignore_error: true
---
name: robust-reconnect-catchup
timeout: 10m

# Robust dimension: Does the system self-heal after brief replica outage?
#
# V1 (pre-Phase 13): shipper goes degraded permanently, no background
# reconnect. Recovery requires master-driven full rebuild.
#
# V1.5 (post-Phase 13): background reconnect goroutine detects replica
# is back within 5s, WAL catch-up replays the gap.
#
# This scenario writes data DURING the outage to create a WAL gap,
# then measures whether the system self-heals and how.

env:
  master_url: "http://192.168.1.184:9433"
  volume_name: robust-reconn
  vol_size: "1073741824"

topology:
  nodes:
    m01:
      host: 192.168.1.181
      user: testdev
      key: "/opt/work/testdev_key"
    m02:
      host: 192.168.1.184
      user: testdev
      key: "/opt/work/testdev_key"

phases:
  - name: cluster-start
    actions:
      # Kill any leftover processes and wipe state dirs before starting.
      - action: exec
        node: m02
        cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-reconn-master /tmp/sw-reconn-vs1 && mkdir -p /tmp/sw-reconn-master /tmp/sw-reconn-vs1/blocks"
        root: "true"
        ignore_error: true
      - action: exec
        node: m01
        cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-reconn-vs2 && mkdir -p /tmp/sw-reconn-vs2/blocks"
        root: "true"
        ignore_error: true

      - action: start_weed_master
        node: m02
        port: "9433"
        dir: /tmp/sw-reconn-master
        save_as: master_pid

      - action: sleep
        duration: 3s

      # Primary on m02, replica on m01.
      - action: start_weed_volume
        node: m02
        port: "18480"
        master: "localhost:9433"
        dir: /tmp/sw-reconn-vs1
        extra_args: "-block.dir=/tmp/sw-reconn-vs1/blocks -block.listen=:3295 -ip=192.168.1.184"
        save_as: vs1_pid

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-reconn-vs2
        extra_args: "-block.dir=/tmp/sw-reconn-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid

      - action: sleep
        duration: 3s

      - action: wait_cluster_ready
        node: m02
        master_url: "{{ master_url }}"

      - action: wait_block_servers
        count: "2"

  - name: create-volume
    actions:
      - action: create_block_volume
        name: "{{ volume_name }}"
        size_bytes: "{{ vol_size }}"
        replica_factor: "2"
        durability_mode: "sync_all"

      - action: wait_volume_healthy
        name: "{{ volume_name }}"
        timeout: 60s

      - action: validate_replication
        volume_name: "{{ volume_name }}"
        expected_rf: "2"
        expected_durability: "sync_all"
        require_not_degraded: "true"
        require_cross_machine: "true"

  - name: connect-and-write
    actions:
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol

      - action: iscsi_login_direct
        node: m01
        host: "{{ vol_iscsi_host }}"
        port: "{{ vol_iscsi_port }}"
        iqn: "{{ vol_iqn }}"
        save_as: device

      # Write initial data to establish replication.
      - action: fio_json
        node: m01
        device: "{{ device }}"
        rw: randwrite
        bs: 4k
        iodepth: "16"
        runtime: "10"
        time_based: "true"
        name: initial-write
        save_as: fio_initial

      - action: fio_parse
        json_var: fio_initial
        metric: iops
        save_as: iops_initial

      - action: print
        msg: "Initial write (healthy): {{ iops_initial }} IOPS"

  - name: fault-and-write
    actions:
      # Kill replica (SIGKILL — simulates crash).
      - action: exec
        node: m01
        cmd: "kill -9 {{ vs2_pid }}"
        root: "true"
        ignore_error: true

      - action: print
        msg: "=== Replica killed. Writing during outage to create WAL gap ==="

      # Write during outage — creates WAL gap on primary.
      # sync_all: these writes may be slower or fail depending on degraded behavior.
      - action: fio_json
        node: m01
        device: "{{ device }}"
        rw: randwrite
        bs: 4k
        iodepth: "16"
        runtime: "10"
        time_based: "true"
        name: write-during-outage
        save_as: fio_degraded
        ignore_error: true

      - action: fio_parse
        json_var: fio_degraded
        metric: iops
        save_as: iops_degraded
        ignore_error: true

      - action: print
        msg: "Write during outage (degraded): {{ iops_degraded }} IOPS"

      # Restart replica — now the question is: does the system self-heal?
      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-reconn-vs2
        extra_args: "-block.dir=/tmp/sw-reconn-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid_new

      - action: print
        msg: "=== Replica restarted. Measuring self-heal ==="

      # Measure recovery — does the system catch up or stay degraded?
      - action: measure_recovery
        name: "{{ volume_name }}"
        timeout: 90s
        poll_interval: 2s
        fault_type: reconnect
        save_as: rp

  - name: post-recovery-write
    actions:
      # Write after recovery to verify system is fully functional.
      - action: fio_json
        node: m01
        device: "{{ device }}"
        rw: randwrite
        bs: 4k
        iodepth: "16"
        runtime: "10"
        time_based: "true"
        name: post-recovery-write
        save_as: fio_post

      - action: fio_parse
        json_var: fio_post
        metric: iops
        save_as: iops_post

      - action: print
        msg: "Post-recovery write: {{ iops_post }} IOPS"

  - name: results
    actions:
      - action: print
        msg: "=== Robust: Reconnect/Catch-up Test ==="
      - action: print
        msg: "Initial (healthy): {{ iops_initial }} IOPS"
      - action: print
        msg: "During outage: {{ iops_degraded }} IOPS"
      - action: print
        msg: "Post-recovery: {{ iops_post }} IOPS"
      - action: print
        msg: "Recovery duration: {{ rp_duration_ms }} ms"
      - action: print
        msg: "Recovery path: {{ rp_path }}"
      - action: print
        msg: "Transitions: {{ rp_transitions }}"
      - action: print
        msg: "Degraded window: {{ rp_degraded_ms }} ms"

      - action: collect_results
        title: "Robust: Reconnect/Catch-up"
        volume_name: "{{ volume_name }}"
        recovery_profile: rp

  - name: cleanup
    always: true
    actions:
      - action: iscsi_cleanup
        node: m01
        ignore_error: true
      - action: stop_weed
        node: m01
        pid: "{{ vs2_pid_new }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ vs1_pid }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ master_pid }}"
        ignore_error: true
---
name: robust-shipper-reconnect
timeout: 5m

# Robust dimension: Does the shipper reconnect after replica crash+restart?
#
# V1.5: background reconnect → replica_degraded clears within ~10s
# V1: shipper stays degraded permanently after crash
#
# No writes during outage needed. Just checks the degraded flag.

env:
  master_url: "http://192.168.1.184:9433"
  volume_name: robust-ship
  vol_size: "1073741824"

topology:
  nodes:
    m01:
      host: 192.168.1.181
      user: testdev
      key: "/opt/work/testdev_key"
    m02:
      host: 192.168.1.184
      user: testdev
      key: "/opt/work/testdev_key"

phases:
  - name: cluster-start
    actions:
      # Kill any leftover processes and wipe state dirs before starting.
      - action: exec
        node: m02
        cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ship-master /tmp/sw-ship-vs1 && mkdir -p /tmp/sw-ship-master /tmp/sw-ship-vs1/blocks"
        root: "true"
        ignore_error: true
      - action: exec
        node: m01
        cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ship-vs2 && mkdir -p /tmp/sw-ship-vs2/blocks"
        root: "true"
        ignore_error: true

      - action: start_weed_master
        node: m02
        port: "9433"
        dir: /tmp/sw-ship-master
        save_as: master_pid

      - action: sleep
        duration: 3s

      # Primary on m02, replica on m01.
      - action: start_weed_volume
        node: m02
        port: "18480"
        master: "localhost:9433"
        dir: /tmp/sw-ship-vs1
        extra_args: "-block.dir=/tmp/sw-ship-vs1/blocks -block.listen=:3295 -ip=192.168.1.184"
        save_as: vs1_pid

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-ship-vs2
        extra_args: "-block.dir=/tmp/sw-ship-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid

      - action: sleep
        duration: 3s

      - action: wait_cluster_ready
        node: m02
        master_url: "{{ master_url }}"

      - action: wait_block_servers
        count: "2"

  - name: create-and-verify
    actions:
      - action: create_block_volume
        name: "{{ volume_name }}"
        size_bytes: "{{ vol_size }}"
        replica_factor: "2"
        durability_mode: "best_effort"

      - action: wait_volume_healthy
        name: "{{ volume_name }}"
        timeout: 60s

      - action: print
        msg: "=== Volume healthy, replica connected ==="

  - name: crash-replica
    actions:
      - action: print
        msg: "=== SIGKILL replica (m01 VS) ==="

      - action: exec
        node: m01
        cmd: "kill -9 {{ vs2_pid }}"
        root: "true"
        ignore_error: true

      # Wait for primary to detect broken connection.
      - action: sleep
        duration: 5s

      # Check: primary should report degraded now.
      - action: lookup_block_volume
        name: "{{ volume_name }}"
        save_as: vol_degraded
        ignore_error: true

      - action: print
        msg: "After crash (before restart): degraded={{ vol_degraded_degraded }}"

  - name: restart-and-check
    actions:
      - action: print
        msg: "=== Restarting replica ==="

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-ship-vs2
        extra_args: "-block.dir=/tmp/sw-ship-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid_new

      # Check at 5s — heartbeat re-registers but shipper may still be degraded.
      - action: sleep
        duration: 5s

      - action: assert_block_field
        name: "{{ volume_name }}"
        field: replica_degraded
        expected: "false"
        save_as: check_5s
        ignore_error: true

      - action: print
        msg: "5s after restart: degraded check = {{ check_5s }}"

      # Check at 15s — V1.5 background reconnect (5s interval) should have caught up.
      - action: sleep
        duration: 10s

      - action: assert_block_field
        name: "{{ volume_name }}"
        field: replica_degraded
        expected: "false"
        save_as: check_15s
        ignore_error: true

      - action: print
        msg: "15s after restart: degraded check = {{ check_15s }}"

      # Check at 30s — give V1 every chance.
      - action: sleep
        duration: 15s

      - action: assert_block_field
        name: "{{ volume_name }}"
        field: replica_degraded
        expected: "false"
        save_as: check_30s
        ignore_error: true

      - action: print
        msg: "30s after restart: degraded check = {{ check_30s }}"

  - name: results
    actions:
      - action: print
        msg: "=== Shipper Reconnect Test ==="
      - action: print
        msg: "5s: {{ check_5s }}"
      - action: print
        msg: "15s: {{ check_15s }}"
      - action: print
        msg: "30s: {{ check_30s }}"
      - action: print
        msg: "(empty = not degraded, error = still degraded)"

  - name: cleanup
    always: true
    actions:
      - action: stop_weed
        node: m01
        pid: "{{ vs2_pid_new }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ vs1_pid }}"
        ignore_error: true
      - action: stop_weed
        node: m02
        pid: "{{ master_pid }}"
        ignore_error: true
# ==== robust-slow-replica.yaml ====
name: robust-slow-replica
timeout: 5m

# Robust dimension: What happens when replica replication stalls?
#
# Block the replication port so the barrier (5s timeout) fires and times
# out → shipper goes degraded. Clear the fault → check if shipper recovers.
#
# NOTE(review): an earlier comment described injecting a 10s netem delay,
# but the fault actually injected below is a port partition (3295). The
# effect on the barrier is equivalent — replication never completes in
# time — confirm which fault the harness is expected to use.
#
# V1: shipper stays degraded after barrier timeout (no background reconnect)
# V1.5: background reconnect catches up after the fault clears
#
# This is the scenario that directly demonstrates the Phase 13 improvement.

env:
  master_url: "http://192.168.1.184:9433"
  volume_name: robust-slow
  vol_size: "1073741824"

topology:
  nodes:
    m01:
      host: 192.168.1.181
      user: testdev
      key: "/opt/work/testdev_key"
    m02:
      host: 192.168.1.184
      user: testdev
      key: "/opt/work/testdev_key"
phases:
  # Kill any stale listeners, wipe state dirs, then bring up one master and
  # two volume servers (vs1 on m02 local to the master, vs2 on m01).
  - name: cluster-start
    actions:
      - action: exec
        node: m02
        cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-slow-master /tmp/sw-slow-vs1 && mkdir -p /tmp/sw-slow-master /tmp/sw-slow-vs1/blocks"
        root: "true"
        ignore_error: true
      - action: exec
        node: m01
        cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-slow-vs2 && mkdir -p /tmp/sw-slow-vs2/blocks"
        root: "true"
        ignore_error: true

      - action: start_weed_master
        node: m02
        port: "9433"
        dir: /tmp/sw-slow-master
        save_as: master_pid

      - action: sleep
        duration: 3s

      # Both volume servers expose the block replication listener on :3295.
      - action: start_weed_volume
        node: m02
        port: "18480"
        master: "localhost:9433"
        dir: /tmp/sw-slow-vs1
        extra_args: "-block.dir=/tmp/sw-slow-vs1/blocks -block.listen=:3295 -ip=192.168.1.184"
        save_as: vs1_pid

      - action: start_weed_volume
        node: m01
        port: "18480"
        master: "192.168.1.184:9433"
        dir: /tmp/sw-slow-vs2
        extra_args: "-block.dir=/tmp/sw-slow-vs2/blocks -block.listen=:3295 -ip=192.168.1.181"
        save_as: vs2_pid

      - action: sleep
        duration: 3s

      - action: wait_cluster_ready
        node: m02
        master_url: "{{ master_url }}"

      - action: wait_block_servers
        count: "2"
- name: create-volume |
||||
|
actions: |
||||
|
- action: create_block_volume |
||||
|
name: "{{ volume_name }}" |
||||
|
size_bytes: "{{ vol_size }}" |
||||
|
replica_factor: "2" |
||||
|
durability_mode: "best_effort" |
||||
|
|
||||
|
- action: wait_volume_healthy |
||||
|
name: "{{ volume_name }}" |
||||
|
timeout: 60s |
||||
|
|
||||
|
- action: print |
||||
|
msg: "=== Volume healthy, replication established ===" |
||||
|
|
||||
|
- name: connect-and-write |
||||
|
actions: |
||||
|
- action: lookup_block_volume |
||||
|
name: "{{ volume_name }}" |
||||
|
save_as: vol |
||||
|
|
||||
|
- action: iscsi_login_direct |
||||
|
node: m01 |
||||
|
host: "{{ vol_iscsi_host }}" |
||||
|
port: "{{ vol_iscsi_port }}" |
||||
|
iqn: "{{ vol_iqn }}" |
||||
|
save_as: device |
||||
|
|
||||
|
- name: inject-partition |
||||
|
actions: |
||||
|
- action: print |
||||
|
msg: "=== Blocking replication ports (3295) from primary to replica ===" |
||||
|
|
||||
|
# Block only replication port — SSH and master heartbeat still work. |
||||
|
- action: inject_partition |
||||
|
node: m02 |
||||
|
target_ip: "192.168.1.181" |
||||
|
ports: "3295" |
||||
|
|
||||
|
# Trigger a write so barrier fires and times out. |
||||
|
- action: exec |
||||
|
node: m01 |
||||
|
cmd: "timeout 10 dd if=/dev/urandom of={{ device }} bs=4k count=1 oflag=direct 2>/dev/null; true" |
||||
|
root: "true" |
||||
|
ignore_error: true |
||||
|
|
||||
|
# Wait for barrier timeout (5s) + degradation detection. |
||||
|
- action: sleep |
||||
|
duration: 10s |
||||
|
|
||||
|
- action: assert_block_field |
||||
|
name: "{{ volume_name }}" |
||||
|
field: replica_degraded |
||||
|
expected: "true" |
||||
|
save_as: degraded_during |
||||
|
ignore_error: true |
||||
|
|
||||
|
- action: print |
||||
|
msg: "During partition: degraded={{ degraded_during }}" |
||||
|
|
||||
|
- name: clear-and-measure |
||||
|
actions: |
||||
|
- action: print |
||||
|
msg: "=== Clearing partition, measuring shipper recovery ===" |
||||
|
|
||||
|
- action: clear_fault |
||||
|
node: m02 |
||||
|
type: partition |
||||
|
|
||||
|
# Check at 5s — V1.5 background reconnect interval is 5s. |
||||
|
- action: sleep |
||||
|
duration: 5s |
||||
|
|
||||
|
- action: assert_block_field |
||||
|
name: "{{ volume_name }}" |
||||
|
field: replica_degraded |
||||
|
expected: "false" |
||||
|
save_as: check_5s |
||||
|
ignore_error: true |
||||
|
|
||||
|
- action: print |
||||
|
msg: "5s after clear: {{ check_5s }}" |
||||
|
|
||||
|
# Check at 15s. |
||||
|
- action: sleep |
||||
|
duration: 10s |
||||
|
|
||||
|
- action: assert_block_field |
||||
|
name: "{{ volume_name }}" |
||||
|
field: replica_degraded |
||||
|
expected: "false" |
||||
|
save_as: check_15s |
||||
|
ignore_error: true |
||||
|
|
||||
|
- action: print |
||||
|
msg: "15s after clear: {{ check_15s }}" |
||||
|
|
||||
|
# Check at 30s. |
||||
|
- action: sleep |
||||
|
duration: 15s |
||||
|
|
||||
|
- action: assert_block_field |
||||
|
name: "{{ volume_name }}" |
||||
|
field: replica_degraded |
||||
|
expected: "false" |
||||
|
save_as: check_30s |
||||
|
ignore_error: true |
||||
|
|
||||
|
- action: print |
||||
|
msg: "30s after clear: {{ check_30s }}" |
||||
|
|
||||
|
# Check at 60s — give V1 every chance. |
||||
|
- action: sleep |
||||
|
duration: 30s |
||||
|
|
||||
|
- action: assert_block_field |
||||
|
name: "{{ volume_name }}" |
||||
|
field: replica_degraded |
||||
|
expected: "false" |
||||
|
save_as: check_60s |
||||
|
ignore_error: true |
||||
|
|
||||
|
- action: print |
||||
|
msg: "60s after clear: {{ check_60s }}" |
||||
|
|
||||
|
- name: results |
||||
|
actions: |
||||
|
- action: print |
||||
|
msg: "=== Slow Replica Recovery Test ===" |
||||
|
- action: print |
||||
|
msg: "During delay: degraded={{ degraded_during }}" |
||||
|
- action: print |
||||
|
msg: "5s after clear: {{ check_5s }}" |
||||
|
- action: print |
||||
|
msg: "15s after clear: {{ check_15s }}" |
||||
|
- action: print |
||||
|
msg: "30s after clear: {{ check_30s }}" |
||||
|
- action: print |
||||
|
msg: "60s after clear: {{ check_60s }}" |
||||
|
- action: print |
||||
|
msg: "(false = recovered, error msg = still degraded)" |
||||
|
|
||||
|
- name: cleanup |
||||
|
always: true |
||||
|
actions: |
||||
|
- action: clear_fault |
||||
|
node: m02 |
||||
|
type: netem |
||||
|
ignore_error: true |
||||
|
- action: stop_weed |
||||
|
node: m01 |
||||
|
pid: "{{ vs2_pid }}" |
||||
|
ignore_error: true |
||||
|
- action: stop_weed |
||||
|
node: m02 |
||||
|
pid: "{{ vs1_pid }}" |
||||
|
ignore_error: true |
||||
|
- action: stop_weed |
||||
|
node: m02 |
||||
|
pid: "{{ master_pid }}" |
||||
|
ignore_error: true |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue