Browse Source
fix: flusher OOM on multi-block writes + testrunner enhancements
Bug: flusher.go:336 allocated make([]byte, entryLen) per dirty block instead of per unique WAL entry. A 4MB WriteLBA creates 1024 dirty map entries (one per 4KB block), all sharing the same WAL offset. The flusher read the full 4MB WAL entry 1024 times into separate buffers: 1024 × 4MB = 4GB per 4MB write → OOM on mkfs.ext4. Root cause: flusher assumed 1:1 dirty-block-to-WAL-entry mapping. WriteLBA supports multi-block writes but the flusher never deduplicated shared WAL offsets. Fix: deduplicate WAL reads by WalOffset in flushOnceLocked(). Multiple dirty blocks from the same WAL entry share one read buffer and one DecodeWALEntry call. Memory: O(WAL_entries × size) not O(blocks × size). For a 4MB write: 4GB → 4MB. Verified on hardware (m01/M02 25Gbps RoCE): - Before: mkfs.ext4 → VS RSS 100MB→25GB → OOM killed - After: mkfs.ext4 → VS RSS 129MB stable, mkfs succeeds - pgbench TPC-B c=4: 1,248 TPS (RF=1, previously blocked by OOM) Tests added: - flusher_test.go: flush_multiblock_shared_wal_read (16 blocks share one WAL offset, flush dedup verified) - flusher_test.go: flush_multiblock_data_correct (3 mixed multi-block writes, all data correct after flush) - test/component/large_write_test.go: 7 component tests (single 4MB, sequential mkfs sim, concurrent, mixed sizes, production volume, flusher throughput 30s sustained) - iscsi/large_write_mem_test.go: 2 iSCSI session memory tests (4MB R2T flow, slow device) Testrunner enhancements (same commit — all tested on hardware): - discover_primary action: maps primary IP → topology node name, supports alt_ips for multi-NIC (RoCE + management) - NodeSpec.AltIPs field for multi-NIC node identification - 5 new YAML scenarios: ec3, ec5, degraded sync_all/best_effort, pgbench - All 13 hardware-verified scenarios PASS Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>feature/sw-block
12 changed files with 2047 additions and 25 deletions
-
46weed/storage/blockvol/flusher.go
-
153weed/storage/blockvol/flusher_test.go
-
248weed/storage/blockvol/iscsi/large_write_mem_test.go
-
283weed/storage/blockvol/test/component/large_write_test.go
-
112weed/storage/blockvol/testrunner/actions/devops.go
-
20weed/storage/blockvol/testrunner/actions/devops_test.go
-
209weed/storage/blockvol/testrunner/scenarios/internal/benchmark-pgbench.yaml
-
306weed/storage/blockvol/testrunner/scenarios/internal/ec3-fast-reconnect-skips-failover.yaml
-
241weed/storage/blockvol/testrunner/scenarios/internal/ec5-wrong-primary-master-restart.yaml
-
215weed/storage/blockvol/testrunner/scenarios/internal/stable-degraded-best-effort.yaml
-
228weed/storage/blockvol/testrunner/scenarios/internal/stable-degraded-mode.yaml
-
11weed/storage/blockvol/testrunner/types.go
@ -0,0 +1,248 @@ |
|||
package iscsi |
|||
|
|||
import ( |
|||
"bytes" |
|||
"encoding/binary" |
|||
"io" |
|||
"log" |
|||
"net" |
|||
"runtime" |
|||
"sync/atomic" |
|||
"testing" |
|||
"time" |
|||
) |
|||
|
|||
// slowDevice wraps a device and adds configurable delay to WriteAt.
// This simulates a real blockvol that blocks on WAL-full during large writes.
type slowDevice struct {
	inner      BlockDevice   // wrapped device; all calls delegate here
	writeDelay time.Duration // artificial per-WriteAt latency (0 = passthrough)
	writeCalls atomic.Int64  // total WriteAt invocations, read by tests for reporting
}

// ReadAt delegates directly to the wrapped device.
func (d *slowDevice) ReadAt(lba uint64, length uint32) ([]byte, error) {
	return d.inner.ReadAt(lba, length)
}

// WriteAt counts the call, sleeps for writeDelay (if any), then delegates.
func (d *slowDevice) WriteAt(lba uint64, data []byte) error {
	d.writeCalls.Add(1)
	if d.writeDelay > 0 {
		time.Sleep(d.writeDelay)
	}
	return d.inner.WriteAt(lba, data)
}

func (d *slowDevice) Trim(lba uint64, length uint32) error { return d.inner.Trim(lba, length) }
func (d *slowDevice) SyncCache() error                     { return d.inner.SyncCache() }
func (d *slowDevice) BlockSize() uint32                    { return d.inner.BlockSize() }
func (d *slowDevice) VolumeSize() uint64                   { return d.inner.VolumeSize() }

// IsHealthy always reports true. NOTE(review): unlike the other methods this
// does NOT delegate to inner — presumably so artificial write delay can never
// mark the device unhealthy; confirm against BlockDevice health semantics.
func (d *slowDevice) IsHealthy() bool { return true }
|||
|
|||
// TestLargeWriteMemory_4MB sends 4MB WRITE(10) commands through a real
// iSCSI session and measures heap growth. This is the in-process version
// of the hardware test that showed 25GB RSS.
//
// Flow per command: WRITE(10) with 64KB immediate data, then a loop of
// R2T → Data-Out sequences until all 4MB is transferred, then the SCSI
// response. After 10 commands the test fails if the heap grew > 200MB.
func TestLargeWriteMemory_4MB(t *testing.T) {
	// Use a mock device with 64MB (16K blocks).
	mockDev := newMockDevice(64 * 1024 * 1024)
	dev := &slowDevice{inner: mockDev, writeDelay: 0}

	// In-memory full-duplex pipe: client side drives the initiator role,
	// server side is handled by the session goroutine below.
	client, server := net.Pipe()
	defer client.Close()

	config := DefaultTargetConfig()
	config.TargetName = testTargetName
	config.MaxRecvDataSegmentLength = 262144 // 256KB
	config.MaxBurstLength = 4 * 1024 * 1024  // 4MB — allow full 4MB burst
	config.FirstBurstLength = 65536          // 64KB immediate
	config.ImmediateData = true
	config.InitialR2T = true

	resolver := newTestResolverWithDevice(dev)
	logger := log.New(io.Discard, "", 0)

	sess := NewSession(server, config, resolver, resolver, logger)
	done := make(chan error, 1)
	go func() { done <- sess.HandleConnection() }()
	// Closing client unblocks HandleConnection; then wait for it to exit.
	defer func() { client.Close(); <-done }()

	doLogin(t, client)

	// Baseline heap after login so session setup isn't counted.
	runtime.GC()
	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	heapBefore := int64(m.HeapAlloc)

	// Send 10 × 4MB WRITE(10) commands with immediate data + R2T flow.
	writeSize := 4 * 1024 * 1024
	data := make([]byte, writeSize)
	for i := range data {
		data[i] = byte(i)
	}

	cmdSN := uint32(0)
	for i := 0; i < 10; i++ {
		lba := uint32(i * 1024) // 4MB = 1024 blocks of 4KB
		blocks := uint16(1024)

		// Send SCSI WRITE(10) with immediate data (first 64KB).
		// FlagF on the command: no unsolicited Data-Out will follow;
		// the rest of the data is solicited via R2T.
		cmd := &PDU{}
		cmd.SetOpcode(OpSCSICmd)
		cmd.SetOpSpecific1(FlagF | FlagW)
		cmd.SetInitiatorTaskTag(uint32(i + 1))
		cmd.SetExpectedDataTransferLength(uint32(writeSize))
		cmd.SetCmdSN(cmdSN)
		cmdSN++

		// WRITE(10) CDB: LBA in bytes 2-5, transfer length (blocks) in 7-8.
		var cdb [16]byte
		cdb[0] = ScsiWrite10
		binary.BigEndian.PutUint32(cdb[2:6], lba)
		binary.BigEndian.PutUint16(cdb[7:9], blocks)
		cmd.SetCDB(cdb)

		// Immediate data: first 64KB.
		immLen := 65536
		cmd.DataSegment = data[:immLen]

		if err := WritePDU(client, cmd); err != nil {
			t.Fatalf("write cmd %d: %v", i, err)
		}

		// Handle R2T + Data-Out for remaining data.
		sent := immLen
		for sent < writeSize {
			// Read R2T.
			r2t, err := ReadPDU(client)
			if err != nil {
				t.Fatalf("read R2T %d: %v", i, err)
			}
			// An early SCSI response here means the target aborted the
			// transfer mid-flight — fail with the partial-progress detail.
			if r2t.Opcode() == OpSCSIResp {
				status := r2t.SCSIStatus()
				t.Fatalf("write %d: early SCSI response status=0x%02x (sent %d/%d)", i, status, sent, writeSize)
			}
			if r2t.Opcode() != OpR2T {
				t.Fatalf("write %d: expected R2T, got %s", i, OpcodeName(r2t.Opcode()))
			}

			desiredLen := int(r2t.DesiredDataLength())
			ttt := r2t.TargetTransferTag()

			// Send Data-Out PDUs in MaxRecvDataSegmentLength chunks.
			seqSent := 0
			dataSN := uint32(0)
			for seqSent < desiredLen && sent < writeSize {
				// Chunk is clamped to both the R2T's remaining desire
				// and the remaining total payload.
				chunk := config.MaxRecvDataSegmentLength
				if desiredLen-seqSent < chunk {
					chunk = desiredLen - seqSent
				}
				if writeSize-sent < chunk {
					chunk = writeSize - sent
				}

				doPDU := &PDU{}
				doPDU.SetOpcode(OpSCSIDataOut)
				doPDU.SetInitiatorTaskTag(uint32(i + 1))
				doPDU.SetTargetTransferTag(ttt)
				// BufferOffset is the offset within the whole command's
				// data buffer, so the running total `sent` is used.
				doPDU.SetBufferOffset(uint32(sent))
				doPDU.SetDataSN(dataSN)
				dataSN++
				doPDU.DataSegment = data[sent : sent+chunk]

				if seqSent+chunk >= desiredLen || sent+chunk >= writeSize {
					doPDU.SetOpSpecific1(FlagF) // Final
				}

				if err := WritePDU(client, doPDU); err != nil {
					t.Fatalf("write data-out %d: %v", i, err)
				}

				sent += chunk
				seqSent += chunk
			}
		}

		// Read SCSI Response. A 10s deadline turns a hung target into a
		// test failure instead of a stuck test binary.
		client.SetReadDeadline(time.Now().Add(10 * time.Second))
		resp, err := ReadPDU(client)
		client.SetReadDeadline(time.Time{})
		if err != nil {
			t.Fatalf("read response %d: %v", i, err)
		}
		if resp.Opcode() != OpSCSIResp {
			t.Fatalf("write %d: expected SCSIResp, got %s", i, OpcodeName(resp.Opcode()))
		}
		if resp.SCSIStatus() != SCSIStatusGood {
			t.Fatalf("write %d: status=0x%02x", i, resp.SCSIStatus())
		}

		// Per-command heap sample (after GC) to localize any growth.
		runtime.GC()
		runtime.ReadMemStats(&m)
		heap := int64(m.HeapAlloc)
		t.Logf("write %d: heap=%d MB (delta=%d MB)", i, heap/(1024*1024), (heap-heapBefore)/(1024*1024))
	}

	runtime.GC()
	runtime.ReadMemStats(&m)
	heapAfter := int64(m.HeapAlloc)
	deltaMB := (heapAfter - heapBefore) / (1024 * 1024)
	t.Logf("final: heap=%d MB, delta=%d MB, writes=%d", heapAfter/(1024*1024), deltaMB, dev.writeCalls.Load())

	// 40MB of payload was written; 200MB of retained heap growth would
	// indicate per-block buffer duplication (the bug this guards against).
	if deltaMB > 200 {
		t.Errorf("MEMORY LEAK: heap grew %d MB for 10 × 4MB iSCSI writes", deltaMB)
	}
}
|||
|
|||
// TestLargeWriteMemory_SlowDevice simulates WAL-full blocking: writes
|
|||
// take 100ms each (as if WAL admission is throttling). This keeps buffers
|
|||
// alive longer and tests if they accumulate.
|
|||
func TestLargeWriteMemory_SlowDevice(t *testing.T) { |
|||
mockDev := newMockDevice(64 * 1024 * 1024) |
|||
dev := &slowDevice{inner: mockDev, writeDelay: 100 * time.Millisecond} |
|||
|
|||
client, server := net.Pipe() |
|||
defer client.Close() |
|||
|
|||
config := DefaultTargetConfig() |
|||
config.TargetName = testTargetName |
|||
config.ImmediateData = true |
|||
config.InitialR2T = false // allow full immediate data
|
|||
|
|||
resolver := newTestResolverWithDevice(dev) |
|||
logger := log.New(io.Discard, "", 0) |
|||
|
|||
sess := NewSession(server, config, resolver, resolver, logger) |
|||
doneCh := make(chan error, 1) |
|||
go func() { doneCh <- sess.HandleConnection() }() |
|||
defer func() { client.Close(); <-doneCh }() |
|||
|
|||
doLogin(t, client) |
|||
|
|||
runtime.GC() |
|||
var m runtime.MemStats |
|||
runtime.ReadMemStats(&m) |
|||
heapBefore := int64(m.HeapAlloc) |
|||
|
|||
// Send many 4KB writes rapidly (simulating inode table writes).
|
|||
cmdSN := uint32(0) |
|||
for i := 0; i < 200; i++ { |
|||
sendSCSIWriteImmediate(t, client, uint32(i), bytes.Repeat([]byte{0xBB}, 4096), uint32(i+1), cmdSN) |
|||
cmdSN++ |
|||
|
|||
resp, err := ReadPDU(client) |
|||
if err != nil { |
|||
t.Fatalf("write %d response: %v", i, err) |
|||
} |
|||
if resp.SCSIStatus() != SCSIStatusGood { |
|||
t.Fatalf("write %d: status=0x%02x", i, resp.SCSIStatus()) |
|||
} |
|||
} |
|||
|
|||
runtime.GC() |
|||
runtime.ReadMemStats(&m) |
|||
heapAfter := int64(m.HeapAlloc) |
|||
delta := (heapAfter - heapBefore) / (1024 * 1024) |
|||
t.Logf("200 × 4KB writes with 100ms delay: heap delta=%d MB, writes=%d", delta, dev.writeCalls.Load()) |
|||
|
|||
if delta > 100 { |
|||
t.Errorf("MEMORY LEAK with slow device: heap grew %d MB", delta) |
|||
} |
|||
} |
|||
@ -0,0 +1,283 @@ |
|||
package component |
|||
|
|||
import (
	"fmt"
	"path/filepath"
	"runtime"
	"sync"
	"testing"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
|||
|
|||
// These tests verify that multi-block writes (4MB from mkfs.ext4) work
|
|||
// correctly and do not cause excessive memory usage in the flusher.
|
|||
//
|
|||
// Bug: flusher.go allocated make([]byte, 4MB) per dirty block instead of
|
|||
// per unique WAL entry. A 4MB write (1024 blocks) caused 1024 × 4MB = 4GB.
|
|||
// Fix: deduplicate WAL reads by WalOffset.
|
|||
|
|||
func createLargeWriteVol(t *testing.T, volSize, walSize uint64) *blockvol.BlockVol { |
|||
t.Helper() |
|||
path := t.TempDir() + "/test.blk" |
|||
vol, err := blockvol.CreateBlockVol(path, blockvol.CreateOptions{ |
|||
VolumeSize: volSize, |
|||
BlockSize: 4096, |
|||
WALSize: walSize, |
|||
}) |
|||
if err != nil { |
|||
t.Fatalf("CreateBlockVol: %v", err) |
|||
} |
|||
t.Cleanup(func() { vol.Close() }) |
|||
return vol |
|||
} |
|||
|
|||
// heapMB returns current heap usage in MB as int64 (safe for subtraction).
// Callers diff before/after samples, so a signed type avoids unsigned
// underflow when the heap shrinks between samples.
func heapMB() int64 {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)
	return int64(stats.HeapAlloc) >> 20
}
|||
|
|||
// Test 1: Single 4MB write — does it succeed at all?
|
|||
func TestLargeWrite_Single4MB(t *testing.T) { |
|||
vol := createLargeWriteVol(t, 64*1024*1024, 64*1024*1024) |
|||
|
|||
data := make([]byte, 4*1024*1024) |
|||
for i := range data { |
|||
data[i] = 0xAA |
|||
} |
|||
|
|||
if err := vol.WriteLBA(0, data); err != nil { |
|||
t.Fatalf("4MB WriteLBA failed: %v", err) |
|||
} |
|||
|
|||
readBack, err := vol.ReadLBA(0, 4*1024*1024) |
|||
if err != nil { |
|||
t.Fatalf("4MB ReadLBA failed: %v", err) |
|||
} |
|||
if readBack[0] != 0xAA || readBack[4*1024*1024-1] != 0xAA { |
|||
t.Fatal("read data mismatch") |
|||
} |
|||
t.Log("single 4MB write+read: OK") |
|||
} |
|||
|
|||
// Test 2: Sequential 4MB writes — simulates mkfs journal creation.
//
// Issues 20 × 4MB writes at non-overlapping LBA ranges while sampling
// heap usage; fails if retained heap grows by more than 500MB.
func TestLargeWrite_Sequential4MB_MkfsSim(t *testing.T) {
	// 128MB volume with a 64MB WAL: 80MB of writes exercises flush/reuse.
	vol := createLargeWriteVol(t, 128*1024*1024, 64*1024*1024)

	// 4MB payload with a repeating 0..255 byte pattern.
	data := make([]byte, 4*1024*1024)
	for i := range data {
		data[i] = byte(i & 0xFF)
	}

	before := heapMB()
	t.Logf("heap before: %d MB", before)

	start := time.Now()
	const writeCount = 20

	for i := range writeCount {
		// Each 4MB write covers 1024 × 4KB blocks, so LBAs step by 1024.
		lba := uint64(i) * 1024
		if err := vol.WriteLBA(lba, data); err != nil {
			t.Fatalf("write %d at LBA %d failed after %s: %v", i, lba, time.Since(start), err)
		}
		// Periodic progress log so a hang/slowdown is diagnosable.
		if i > 0 && i%5 == 0 {
			t.Logf(" write %d/%d: elapsed=%s heap=%dMB", i, writeCount, time.Since(start), heapMB())
		}
	}

	after := heapMB()
	delta := after - before
	t.Logf("heap after: %d MB (delta=%d MB)", after, delta)
	t.Logf("%d × 4MB writes in %s (%.1f MB/s)", writeCount, time.Since(start),
		float64(writeCount*4)/time.Since(start).Seconds())

	// 80MB written; >500MB heap growth indicates per-block buffer blowup.
	if delta > 500 {
		t.Errorf("MEMORY BLOWUP: heap grew by %d MB for %d MB of writes", delta, writeCount*4)
	}
}
|||
|
|||
// Test 3: Concurrent 4MB writes — worst case for WAL admission.
//
// 16 goroutines each issue 5 × 4MB writes at disjoint LBA ranges.
// Fails on any write error, on a 60s hang, or on >1000MB heap growth.
func TestLargeWrite_Concurrent4MB(t *testing.T) {
	vol := createLargeWriteVol(t, 512*1024*1024, 64*1024*1024)

	data := make([]byte, 4*1024*1024)
	for i := range data {
		data[i] = byte(i & 0xFF)
	}

	before := heapMB()
	t.Logf("heap before concurrent: %d MB", before)

	const writers = 16
	const writesPerWriter = 5
	var wg sync.WaitGroup
	// One error slot per writer — each goroutine writes only its own
	// index, so no mutex is needed; read only after wg.Wait completes.
	errs := make([]error, writers)

	start := time.Now()
	for w := range writers {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := range writesPerWriter {
				// Disjoint LBA ranges: writer w owns blocks
				// [w*writesPerWriter*1024, ...), 1024 blocks per write.
				lba := uint64(w*writesPerWriter+i) * 1024
				if err := vol.WriteLBA(lba, data); err != nil {
					errs[w] = fmt.Errorf("writer %d write %d: %v", w, i, err)
					return
				}
			}
		}()
	}

	// Wait with a timeout so a deadlocked WAL turns into a test failure
	// rather than a stuck test binary.
	done := make(chan struct{})
	go func() { wg.Wait(); close(done) }()

	select {
	case <-done:
	case <-time.After(60 * time.Second):
		t.Fatal("TIMEOUT: concurrent 4MB writes hung for 60s")
	}

	for _, err := range errs {
		if err != nil {
			t.Error(err)
		}
	}

	after := heapMB()
	delta := after - before
	t.Logf("heap after concurrent: %d MB (delta=%d MB)", after, delta)
	t.Logf("%d writers × %d × 4MB = %d MB in %s",
		writers, writesPerWriter, writers*writesPerWriter*4, time.Since(start))

	if delta > 1000 {
		t.Errorf("MEMORY BLOWUP: heap grew by %d MB", delta)
	}
}
|||
|
|||
// Test 4: WAL size vs entry size — 4MB entry in 1MB WAL should fail, not hang.
|
|||
func TestLargeWrite_EntryLargerThanWAL(t *testing.T) { |
|||
vol := createLargeWriteVol(t, 64*1024*1024, 1*1024*1024) |
|||
|
|||
err := vol.WriteLBA(0, make([]byte, 4*1024*1024)) |
|||
if err == nil { |
|||
t.Log("4MB write succeeded in 1MB WAL — entry was split or WAL grew") |
|||
} else { |
|||
t.Logf("4MB write in 1MB WAL returned error (expected): %v", err) |
|||
} |
|||
} |
|||
|
|||
// Test 5: Mixed 4K+1M+4M writes — simulates real mkfs pattern.
//
// Three phases of increasing write size at disjoint LBA regions; any
// write error fails the test. Phase timings and final heap are logged.
func TestLargeWrite_MixedSizes(t *testing.T) {
	vol := createLargeWriteVol(t, 128*1024*1024, 64*1024*1024)

	// Distinct fill bytes per size class so cross-write corruption would
	// be distinguishable if later debugging is needed.
	small := make([]byte, 4096)
	large := make([]byte, 1<<20)
	huge := make([]byte, 4<<20)
	for i := range small {
		small[i] = 0x11
	}
	for i := range large {
		large[i] = 0x22
	}
	for i := range huge {
		huge[i] = 0x33
	}

	// Phase 1: 100 single-block writes at LBAs 0..99.
	start := time.Now()
	for i := range 100 {
		if err := vol.WriteLBA(uint64(i), small); err != nil {
			t.Fatalf("small write %d: %v", i, err)
		}
	}
	t.Logf("phase 1: 100 × 4KB in %s", time.Since(start))

	// Phase 2: 10 × 1MB (256 blocks each), stepped by 256 from LBA 1000.
	phase2 := time.Now()
	for i := range 10 {
		if err := vol.WriteLBA(uint64(1000+i*256), large); err != nil {
			t.Fatalf("large write %d: %v", i, err)
		}
	}
	t.Logf("phase 2: 10 × 1MB in %s", time.Since(phase2))

	// Phase 3: 10 × 4MB (1024 blocks each), stepped by 1024 from LBA 10000.
	phase3 := time.Now()
	for i := range 10 {
		if err := vol.WriteLBA(uint64(10000+i*1024), huge); err != nil {
			t.Fatalf("huge write %d: %v (elapsed %s)", i, err, time.Since(phase3))
		}
	}
	t.Logf("phase 3: 10 × 4MB in %s", time.Since(phase3))
	t.Logf("total: %s, final heap: %d MB", time.Since(start), heapMB())
}
|||
|
|||
// Test 6: Production-sized volume (2GB) — memory stays bounded.
//
// Samples heap after volume creation, after 1000 × 4KB writes, and after
// up to 10 × 4MB writes; fails if the total heap delta exceeds 500MB.
func TestLargeWrite_ProductionVolumeMemory(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping: creates 2GB file on disk")
	}

	runtime.GC()
	before := heapMB()
	t.Logf("heap before create: %d MB", before)

	vol := createLargeWriteVol(t, 2*1024*1024*1024, 64*1024*1024)

	runtime.GC()
	afterCreate := heapMB()
	t.Logf("heap after create: %d MB (delta=%d MB)", afterCreate, afterCreate-before)

	// Scattered single-block writes (LBAs 0, 100, 200, ... 99900).
	small := make([]byte, 4096)
	for i := range 1000 {
		if err := vol.WriteLBA(uint64(i)*100, small); err != nil {
			t.Fatalf("small write %d: %v", i, err)
		}
	}

	runtime.GC()
	afterSmall := heapMB()
	t.Logf("heap after 1000 × 4KB: %d MB (delta=%d MB)", afterSmall, afterSmall-before)

	// Best-effort 4MB writes: a failure is logged and the loop stops
	// (deliberately not fatal — the memory assertion below still runs).
	large := make([]byte, 4*1024*1024)
	for i := range 10 {
		if err := vol.WriteLBA(uint64(200000+i*1024), large); err != nil {
			t.Logf("large write %d failed: %v", i, err)
			break
		}
	}

	runtime.GC()
	afterLarge := heapMB()
	delta := afterLarge - before
	t.Logf("heap after 4MB writes: %d MB (delta=%d MB)", afterLarge, delta)

	if delta > 500 {
		t.Errorf("EXCESSIVE MEMORY: heap grew %d MB for 2GB volume", delta)
	}
}
|||
|
|||
// Test 7: Sustained 4MB writes for 30s — flusher keeps up.
|
|||
func TestLargeWrite_FlusherThroughput(t *testing.T) { |
|||
if testing.Short() { |
|||
t.Skip("30s sustained write test") |
|||
} |
|||
vol := createLargeWriteVol(t, 256*1024*1024, 64*1024*1024) |
|||
|
|||
data := make([]byte, 4*1024*1024) |
|||
start := time.Now() |
|||
deadline := start.Add(30 * time.Second) |
|||
|
|||
writes := 0 |
|||
for time.Now().Before(deadline) { |
|||
lba := uint64(writes%50) * 1024 |
|||
if err := vol.WriteLBA(lba, data); err != nil { |
|||
t.Fatalf("write %d failed after %s: %v", writes, time.Since(start), err) |
|||
} |
|||
writes++ |
|||
} |
|||
|
|||
t.Logf("flusher throughput: %d × 4MB in %s (%.1f writes/s, %.1f MB/s)", |
|||
writes, time.Since(start), float64(writes)/time.Since(start).Seconds(), |
|||
float64(writes*4)/time.Since(start).Seconds()) |
|||
t.Logf("final heap: %d MB", heapMB()) |
|||
} |
|||
@ -0,0 +1,209 @@ |
|||
name: benchmark-pgbench |
|||
timeout: 15m |
|||
|
|||
# Database benchmark: PostgreSQL pgbench on sw-block. |
|||
# |
|||
# Measures TPS for TPC-B (mixed read/write) and SELECT-only at various |
|||
# client counts. This is the most product-relevant benchmark — operators |
|||
# care about database performance on block storage. |
|||
# |
|||
# Competitive reference (March 2024, RF=2 sync, 25Gbps): |
|||
# Ceph RBD: 431 TPS |
|||
# Longhorn V1: 2,754 TPS |
|||
# sw-block: 3,226 TPS (old code, commit e92263b) |
|||
# Mayastor: 5,801 TPS |
|||
# |
|||
# Current sw-block number after syncWithWALProgress fix: unknown. |
|||
|
|||
env: |
|||
master_url: "http://10.0.0.3:9433" |
|||
volume_name: bench-pgbench |
|||
# 2GB is enough for pgbench scale=10 (~150MB data). |
|||
# 4GB caused OOM on VS (20GB RSS → OOM killed). |
|||
vol_size: "2147483648" |
|||
|
|||
topology: |
|||
nodes: |
|||
m01: |
|||
host: 192.168.1.181 |
|||
alt_ips: ["10.0.0.1"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
m02: |
|||
host: 192.168.1.184 |
|||
alt_ips: ["10.0.0.3"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
|
|||
phases: |
|||
- name: cluster-start |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-pgb-master /tmp/sw-pgb-vs1; mkdir -p /tmp/sw-pgb-master /tmp/sw-pgb-vs1/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-pgb-vs2; mkdir -p /tmp/sw-pgb-vs2/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-pgb-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-pgb-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-pgb-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid |
|||
|
|||
- action: start_weed_volume |
|||
node: m01 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-pgb-vs2 |
|||
extra_args: "-block.dir=/tmp/sw-pgb-vs2/blocks -block.listen=:3295 -ip=10.0.0.1" |
|||
save_as: vs2_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
|
|||
- name: create-volume |
|||
actions: |
|||
- action: create_block_volume |
|||
name: "{{ volume_name }}" |
|||
size_bytes: "{{ vol_size }}" |
|||
replica_factor: "2" |
|||
durability_mode: "sync_all" |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- name: connect |
|||
actions: |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol |
|||
|
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_iscsi_host }}" |
|||
port: "{{ vol_iscsi_port }}" |
|||
iqn: "{{ vol_iqn }}" |
|||
save_as: device |
|||
|
|||
# Short sleep for device to settle. |
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- name: pgbench-setup |
|||
actions: |
|||
- action: pgbench_init |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
mount: "/mnt/pgbench" |
|||
port: "5434" |
|||
scale: "10" |
|||
fstype: "ext4" |
|||
|
|||
- name: tpcb-c1 |
|||
actions: |
|||
- action: pgbench_run |
|||
node: m01 |
|||
clients: "1" |
|||
duration: "30" |
|||
save_as: tps_c1 |
|||
|
|||
- action: print |
|||
msg: "TPC-B c=1: {{ tps_c1 }} TPS" |
|||
|
|||
- name: tpcb-c4 |
|||
actions: |
|||
- action: pgbench_run |
|||
node: m01 |
|||
clients: "4" |
|||
duration: "30" |
|||
save_as: tps_c4 |
|||
|
|||
- action: print |
|||
msg: "TPC-B c=4: {{ tps_c4 }} TPS" |
|||
|
|||
- name: tpcb-c8 |
|||
actions: |
|||
- action: pgbench_run |
|||
node: m01 |
|||
clients: "8" |
|||
duration: "30" |
|||
save_as: tps_c8 |
|||
|
|||
- action: print |
|||
msg: "TPC-B c=8: {{ tps_c8 }} TPS" |
|||
|
|||
- name: select-c4 |
|||
actions: |
|||
- action: pgbench_run |
|||
node: m01 |
|||
clients: "4" |
|||
duration: "30" |
|||
select_only: "true" |
|||
save_as: tps_sel |
|||
|
|||
- action: print |
|||
msg: "SELECT-only c=4: {{ tps_sel }} TPS" |
|||
|
|||
- name: results |
|||
actions: |
|||
- action: print |
|||
msg: "=== pgbench: RF=2 sync_all iSCSI/RoCE ===" |
|||
- action: print |
|||
msg: "TPC-B c=1: {{ tps_c1 }} TPS" |
|||
- action: print |
|||
msg: "TPC-B c=4: {{ tps_c4 }} TPS" |
|||
- action: print |
|||
msg: "TPC-B c=8: {{ tps_c8 }} TPS" |
|||
- action: print |
|||
msg: "SELECT-only c=4: {{ tps_sel }} TPS" |
|||
|
|||
- action: collect_results |
|||
title: "pgbench: RF=2 sync_all iSCSI/RoCE" |
|||
volume_name: "{{ volume_name }}" |
|||
|
|||
- name: cleanup |
|||
always: true |
|||
actions: |
|||
- action: pgbench_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m01 |
|||
pid: "{{ vs2_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid }}" |
|||
ignore_error: true |
|||
@ -0,0 +1,306 @@ |
|||
name: ec3-fast-reconnect-skips-failover |
|||
timeout: 10m |
|||
|
|||
# Edge Case #3: Fast reconnect skips failover. |
|||
# |
|||
# Scenario: Kill primary VS and restart it BEFORE lease expires (~30s TTL). |
|||
# Risk: The restarted VS re-registers via heartbeat and master accepts it |
|||
# as primary again without going through the failover+promotion path. |
|||
# The VS may serve with stale epoch/role if the fast reconnect bypasses |
|||
# the assignment pipeline. |
|||
# |
|||
# What we verify: |
|||
# 1. After restart, the primary epoch is >= the original epoch |
|||
# 2. I/O works after the fast restart |
|||
# 3. Replication is healthy (not stuck in degraded) |
|||
|
|||
env: |
|||
master_url: "http://10.0.0.3:9433" |
|||
volume_name: ec3-test |
|||
vol_size: "1073741824" |
|||
|
|||
topology: |
|||
nodes: |
|||
m01: |
|||
host: 192.168.1.181 |
|||
alt_ips: ["10.0.0.1"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
m02: |
|||
host: 192.168.1.184 |
|||
alt_ips: ["10.0.0.3"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
|
|||
phases: |
|||
- name: cluster-start |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ec3-master /tmp/sw-ec3-vs1 && mkdir -p /tmp/sw-ec3-master /tmp/sw-ec3-vs1/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ec3-vs2 && mkdir -p /tmp/sw-ec3-vs2/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-ec3-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-ec3-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-ec3-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid |
|||
|
|||
- action: start_weed_volume |
|||
node: m01 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-ec3-vs2 |
|||
extra_args: "-block.dir=/tmp/sw-ec3-vs2/blocks -block.listen=:3295 -ip=10.0.0.1" |
|||
save_as: vs2_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
|
|||
- name: create-volume |
|||
actions: |
|||
- action: create_block_volume |
|||
name: "{{ volume_name }}" |
|||
size_bytes: "{{ vol_size }}" |
|||
replica_factor: "2" |
|||
durability_mode: "sync_all" |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- name: record-state-before |
|||
actions: |
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: before |
|||
|
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol_before |
|||
|
|||
- action: assert_block_field |
|||
name: "{{ volume_name }}" |
|||
field: epoch |
|||
expected: "1" |
|||
|
|||
# Write data so state is non-trivial. |
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_before_iscsi_host }}" |
|||
port: "{{ vol_before_iscsi_port }}" |
|||
iqn: "{{ vol_before_iqn }}" |
|||
save_as: device |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "16" |
|||
runtime: "10" |
|||
time_based: "true" |
|||
name: pre-write |
|||
|
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
|
|||
- action: print |
|||
msg: "Before: primary={{ before }} server={{ before_server }}" |
|||
|
|||
- name: force-primary-m02 |
|||
actions: |
|||
# Force primary to m02 so we have a known node to kill. |
|||
- action: block_promote |
|||
name: "{{ volume_name }}" |
|||
target_server: "10.0.0.3:18480" |
|||
force: "true" |
|||
reason: "ec3-setup" |
|||
|
|||
- action: sleep |
|||
duration: 5s |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: primary_pre |
|||
|
|||
- action: print |
|||
msg: "Primary forced to m02: {{ primary_pre_server }}" |
|||
|
|||
- name: fast-kill-restart |
|||
actions: |
|||
# Kill PRIMARY VS on m02 with SIGKILL. |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "kill -9 {{ vs1_pid }}" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: print |
|||
msg: "Primary killed on m02. Fast restart in 3s (before 30s lease expires)..." |
|||
|
|||
# CRITICAL: restart FAST, well within the 30s lease TTL. |
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-ec3-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-ec3-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid_new |
|||
|
|||
- action: print |
|||
msg: "VS restarted on m02. Waiting for re-registration..." |
|||
|
|||
# Wait for heartbeat re-registration. |
|||
- action: sleep |
|||
duration: 10s |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
timeout: 30s |
|||
|
|||
- name: verify-state-after |
|||
actions: |
|||
# Wait for volume to become healthy again. |
|||
# Use longer timeout — primary fast-restart may need full re-assignment cycle. |
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 180s |
|||
|
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: after |
|||
|
|||
- action: print |
|||
msg: "After fast restart: primary={{ after }} server={{ after_server }}" |
|||
|
|||
# Verify replication is not stuck degraded. |
|||
- action: assert_block_field |
|||
name: "{{ volume_name }}" |
|||
field: replica_degraded |
|||
expected: "false" |
|||
|
|||
# Verify epoch is valid (>= 1, ideally bumped if re-assignment happened). |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol_after |
|||
|
|||
- action: print |
|||
msg: "Epoch after: checking via lookup..." |
|||
|
|||
- name: verify-io-after |
|||
actions: |
|||
# Fresh lookup to get current primary's iSCSI address |
|||
# (primary moved to m02 after promote, so original vars point to wrong host). |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol_cur |
|||
|
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_cur_iscsi_host }}" |
|||
port: "{{ vol_cur_iscsi_port }}" |
|||
iqn: "{{ vol_cur_iqn }}" |
|||
save_as: device2 |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device2 }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "16" |
|||
runtime: "10" |
|||
time_based: "true" |
|||
name: post-restart-write |
|||
save_as: fio_after |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_after |
|||
metric: iops |
|||
save_as: iops_after |
|||
|
|||
- action: print |
|||
msg: "EC-3 post-restart write IOPS: {{ iops_after }}" |
|||
|
|||
# Also verify read works. |
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device2 }}" |
|||
rw: randread |
|||
bs: 4k |
|||
iodepth: "16" |
|||
runtime: "5" |
|||
time_based: "true" |
|||
name: post-restart-read |
|||
save_as: fio_read |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_read |
|||
metric: iops |
|||
save_as: iops_read |
|||
|
|||
- action: print |
|||
msg: "EC-3 post-restart read IOPS: {{ iops_read }}" |
|||
|
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
|
|||
- action: print |
|||
msg: "EC-3 PASS: fast reconnect did not leave system in broken state" |
|||
|
|||
- name: cleanup |
|||
always: true |
|||
actions: |
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m01 |
|||
pid: "{{ vs2_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid_new }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid }}" |
|||
ignore_error: true |
|||
@ -0,0 +1,241 @@ |
|||
name: ec5-wrong-primary-master-restart |
|||
timeout: 10m |
|||
|
|||
# Edge Case #5: Wrong primary after master restart. |
|||
# |
|||
# Scenario: Create RF=2 volume, note primary. Kill master, restart it. |
|||
# Verify the SAME volume server remains primary (master state survived restart |
|||
# via heartbeat re-registration, not random re-assignment). |
|||
# |
|||
# Risk: If the master loses all state on restart and re-assigns the primary based on |
|||
# the first heartbeat, the "wrong" VS could become primary, causing split-brain |
|||
# or data loss if the old primary still holds unflushed WAL entries. |
|||
|
|||
env: |
|||
master_url: "http://10.0.0.3:9433" |
|||
volume_name: ec5-test |
|||
vol_size: "1073741824" |
|||
|
|||
topology: |
|||
nodes: |
|||
m01: |
|||
host: 192.168.1.181 |
|||
alt_ips: ["10.0.0.1"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
m02: |
|||
host: 192.168.1.184 |
|||
alt_ips: ["10.0.0.3"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
|
|||
phases: |
|||
- name: cluster-start |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ec5-master /tmp/sw-ec5-vs1 && mkdir -p /tmp/sw-ec5-master /tmp/sw-ec5-vs1/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-ec5-vs2 && mkdir -p /tmp/sw-ec5-vs2/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-ec5-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-ec5-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-ec5-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid |
|||
|
|||
- action: start_weed_volume |
|||
node: m01 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-ec5-vs2 |
|||
extra_args: "-block.dir=/tmp/sw-ec5-vs2/blocks -block.listen=:3295 -ip=10.0.0.1" |
|||
save_as: vs2_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
|
|||
- name: create-volume |
|||
actions: |
|||
- action: create_block_volume |
|||
name: "{{ volume_name }}" |
|||
size_bytes: "{{ vol_size }}" |
|||
replica_factor: "2" |
|||
durability_mode: "sync_all" |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- name: record-primary-before |
|||
actions: |
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: before |
|||
|
|||
- action: print |
|||
msg: "Before master restart: primary={{ before }} (server={{ before_server }})" |
|||
|
|||
# Write some data so primary has real state. |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol |
|||
|
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_iscsi_host }}" |
|||
port: "{{ vol_iscsi_port }}" |
|||
iqn: "{{ vol_iqn }}" |
|||
save_as: device |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "16" |
|||
runtime: "10" |
|||
time_based: "true" |
|||
name: pre-write |
|||
|
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
|
|||
- name: kill-master |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "kill -9 {{ master_pid }}" |
|||
root: "true" |
|||
|
|||
- action: print |
|||
msg: "Master killed. Waiting 5s before restart..." |
|||
|
|||
- action: sleep |
|||
duration: 5s |
|||
|
|||
- name: restart-master |
|||
actions: |
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-ec5-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid_new |
|||
|
|||
- action: sleep |
|||
duration: 5s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
# Wait for volume servers to re-register via heartbeat. |
|||
- action: wait_block_servers |
|||
count: "2" |
|||
timeout: 30s |
|||
|
|||
# Give time for block volume re-registration via heartbeat. |
|||
- action: sleep |
|||
duration: 10s |
|||
|
|||
- name: verify-primary-after |
|||
actions: |
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: after |
|||
|
|||
- action: print |
|||
msg: "After master restart: primary={{ after }} (server={{ after_server }})" |
|||
|
|||
# CRITICAL ASSERTION: primary must be the same node as before. |
|||
- action: assert_equal |
|||
actual: "{{ after_server }}" |
|||
expected: "{{ before_server }}" |
|||
msg: "EC-5 FAIL: primary changed from {{ before_server }} to {{ after_server }} after master restart" |
|||
|
|||
- action: print |
|||
msg: "EC-5 PASS: primary unchanged after master restart ({{ before_server }})" |
|||
|
|||
- name: verify-io-after |
|||
actions: |
|||
# Verify I/O still works after master restart. |
|||
# Use the original lookup vars (vol_*) since the re-registered volume |
|||
# may not have its iSCSI address populated until the VS re-reports it. |
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_iscsi_host }}" |
|||
port: "{{ vol_iscsi_port }}" |
|||
iqn: "{{ vol_iqn }}" |
|||
save_as: device2 |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device2 }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "16" |
|||
runtime: "10" |
|||
time_based: "true" |
|||
name: post-restart-write |
|||
save_as: fio_after |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_after |
|||
metric: iops |
|||
save_as: iops_after |
|||
|
|||
- action: print |
|||
msg: "Post-restart write IOPS: {{ iops_after }}" |
|||
|
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
|
|||
- name: cleanup |
|||
always: true |
|||
actions: |
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m01 |
|||
pid: "{{ vs2_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid_new }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid }}" |
|||
ignore_error: true |
|||
@ -0,0 +1,215 @@ |
|||
name: stable-degraded-best-effort |
|||
timeout: 10m |
|||
|
|||
# Stable dimension: RF=2 best_effort with a replica down. |
|||
# best_effort writes succeed without a replication barrier — so they should show ~0% IOPS impact |
|||
# when the replica dies. Contrast with the sync_all degraded mode. |
|||
|
|||
env: |
|||
master_url: "http://10.0.0.3:9433" |
|||
volume_name: stable-be |
|||
vol_size: "1073741824" |
|||
|
|||
topology: |
|||
nodes: |
|||
m01: |
|||
host: 192.168.1.181 |
|||
alt_ips: ["10.0.0.1"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
m02: |
|||
host: 192.168.1.184 |
|||
alt_ips: ["10.0.0.3"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
|
|||
phases: |
|||
- name: cluster-start |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-be-master /tmp/sw-be-vs1 && mkdir -p /tmp/sw-be-master /tmp/sw-be-vs1/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-be-vs2 && mkdir -p /tmp/sw-be-vs2/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-be-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-be-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-be-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid |
|||
|
|||
- action: start_weed_volume |
|||
node: m01 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-be-vs2 |
|||
extra_args: "-block.dir=/tmp/sw-be-vs2/blocks -block.listen=:3295 -ip=10.0.0.1" |
|||
save_as: vs2_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
|
|||
- name: create-volume |
|||
actions: |
|||
- action: create_block_volume |
|||
name: "{{ volume_name }}" |
|||
size_bytes: "{{ vol_size }}" |
|||
replica_factor: "2" |
|||
durability_mode: "best_effort" |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
# Force primary to m02 so we can kill m01 (replica) cleanly. |
|||
- action: block_promote |
|||
name: "{{ volume_name }}" |
|||
target_server: "10.0.0.3:18480" |
|||
force: "true" |
|||
reason: "be-degraded-setup" |
|||
|
|||
- action: sleep |
|||
duration: 5s |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: pri |
|||
|
|||
- action: print |
|||
msg: "Primary on {{ pri }} ({{ pri_server }}), replica on {{ pri_replica_node }}" |
|||
|
|||
- name: connect |
|||
actions: |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol |
|||
|
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_iscsi_host }}" |
|||
port: "{{ vol_iscsi_port }}" |
|||
iqn: "{{ vol_iqn }}" |
|||
save_as: device |
|||
|
|||
- name: baseline |
|||
actions: |
|||
- action: print |
|||
msg: "=== Baseline: RF=2 best_effort, healthy ===" |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "32" |
|||
runtime: "30" |
|||
time_based: "true" |
|||
name: baseline |
|||
save_as: fio_baseline |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_baseline |
|||
metric: iops |
|||
save_as: iops_baseline |
|||
|
|||
- action: print |
|||
msg: "Healthy: {{ iops_baseline }} IOPS" |
|||
|
|||
- name: kill-replica |
|||
actions: |
|||
- action: print |
|||
msg: "=== Killing replica VS on m01 ===" |
|||
|
|||
- action: exec |
|||
node: m01 |
|||
cmd: "kill -9 {{ vs2_pid }}" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
# Wait for shipper to detect dead replica. |
|||
- action: sleep |
|||
duration: 10s |
|||
|
|||
- name: degraded-write |
|||
actions: |
|||
- action: print |
|||
msg: "=== Degraded: best_effort, primary only ===" |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "32" |
|||
runtime: "30" |
|||
time_based: "true" |
|||
name: degraded |
|||
save_as: fio_degraded |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_degraded |
|||
metric: iops |
|||
save_as: iops_degraded |
|||
|
|||
- action: print |
|||
msg: "Degraded: {{ iops_degraded }} IOPS" |
|||
|
|||
- name: results |
|||
actions: |
|||
- action: print |
|||
msg: "=== Stable: Degraded Mode (best_effort RF=2, iSCSI/RoCE) ===" |
|||
- action: print |
|||
msg: "Healthy: {{ iops_baseline }} IOPS" |
|||
- action: print |
|||
msg: "Degraded: {{ iops_degraded }} IOPS" |
|||
|
|||
- action: collect_results |
|||
title: "Stable: Degraded Mode (best_effort RF=2)" |
|||
volume_name: "{{ volume_name }}" |
|||
|
|||
- name: cleanup |
|||
always: true |
|||
actions: |
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m01 |
|||
pid: "{{ vs2_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid }}" |
|||
ignore_error: true |
|||
@ -0,0 +1,228 @@ |
|||
name: stable-degraded-mode |
|||
timeout: 10m |
|||
|
|||
# Stable dimension: write IOPS with a replica down (sync_all RF=2). |
|||
# |
|||
# Answers: how much does primary write performance change when the replication |
|||
# barrier has no healthy replica to wait for? |
|||
# |
|||
# Flow: |
|||
# 1. Create volume, force primary to m02 |
|||
# 2. Baseline: healthy RF=2 sync_all fio → IOPS |
|||
# 3. Kill replica (m01 VS) |
|||
# 4. Degraded: fio → IOPS (sync_all with dead replica) |
|||
# 5. Compare |
|||
|
|||
env: |
|||
master_url: "http://10.0.0.3:9433" |
|||
volume_name: stable-deg |
|||
vol_size: "1073741824" |
|||
|
|||
topology: |
|||
nodes: |
|||
m01: |
|||
host: 192.168.1.181 |
|||
alt_ips: ["10.0.0.1"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
m02: |
|||
host: 192.168.1.184 |
|||
alt_ips: ["10.0.0.3"] |
|||
user: testdev |
|||
key: "/opt/work/testdev_key" |
|||
|
|||
phases: |
|||
- name: cluster-start |
|||
actions: |
|||
- action: exec |
|||
node: m02 |
|||
cmd: "fuser -k 9433/tcp 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-deg-master /tmp/sw-deg-vs1 && mkdir -p /tmp/sw-deg-master /tmp/sw-deg-vs1/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "fuser -k 18480/tcp 2>/dev/null; sleep 1; rm -rf /tmp/sw-deg-vs2 && mkdir -p /tmp/sw-deg-vs2/blocks" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
- action: start_weed_master |
|||
node: m02 |
|||
port: "9433" |
|||
dir: /tmp/sw-deg-master |
|||
extra_args: "-ip=10.0.0.3" |
|||
save_as: master_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: start_weed_volume |
|||
node: m02 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-deg-vs1 |
|||
extra_args: "-block.dir=/tmp/sw-deg-vs1/blocks -block.listen=:3295 -ip=10.0.0.3" |
|||
save_as: vs1_pid |
|||
|
|||
- action: start_weed_volume |
|||
node: m01 |
|||
port: "18480" |
|||
master: "10.0.0.3:9433" |
|||
dir: /tmp/sw-deg-vs2 |
|||
extra_args: "-block.dir=/tmp/sw-deg-vs2/blocks -block.listen=:3295 -ip=10.0.0.1" |
|||
save_as: vs2_pid |
|||
|
|||
- action: sleep |
|||
duration: 3s |
|||
|
|||
- action: wait_cluster_ready |
|||
node: m02 |
|||
master_url: "{{ master_url }}" |
|||
|
|||
- action: wait_block_servers |
|||
count: "2" |
|||
|
|||
- name: create-volume |
|||
actions: |
|||
- action: create_block_volume |
|||
name: "{{ volume_name }}" |
|||
size_bytes: "{{ vol_size }}" |
|||
replica_factor: "2" |
|||
durability_mode: "sync_all" |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
# Force primary to m02 so we know which node to keep alive. |
|||
- action: block_promote |
|||
name: "{{ volume_name }}" |
|||
target_server: "10.0.0.3:18480" |
|||
force: "true" |
|||
reason: "degraded-setup" |
|||
|
|||
- action: sleep |
|||
duration: 5s |
|||
|
|||
- action: wait_volume_healthy |
|||
name: "{{ volume_name }}" |
|||
timeout: 60s |
|||
|
|||
- action: discover_primary |
|||
name: "{{ volume_name }}" |
|||
save_as: pri |
|||
|
|||
- action: print |
|||
msg: "Primary on {{ pri }} ({{ pri_server }}), replica on {{ pri_replica_node }}" |
|||
|
|||
- name: connect |
|||
actions: |
|||
- action: lookup_block_volume |
|||
name: "{{ volume_name }}" |
|||
save_as: vol |
|||
|
|||
# Connect iSCSI client (on m01) to primary (on m02). |
|||
- action: iscsi_login_direct |
|||
node: m01 |
|||
host: "{{ vol_iscsi_host }}" |
|||
port: "{{ vol_iscsi_port }}" |
|||
iqn: "{{ vol_iqn }}" |
|||
save_as: device |
|||
|
|||
- name: baseline-healthy |
|||
actions: |
|||
- action: print |
|||
msg: "=== Baseline: RF=2 sync_all, both replicas healthy ===" |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "32" |
|||
runtime: "30" |
|||
time_based: "true" |
|||
name: baseline-healthy |
|||
save_as: fio_healthy |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_healthy |
|||
metric: iops |
|||
save_as: iops_healthy |
|||
|
|||
- action: print |
|||
msg: "Healthy: {{ iops_healthy }} IOPS" |
|||
|
|||
- name: kill-replica |
|||
actions: |
|||
- action: print |
|||
msg: "=== Killing replica VS on m01 ===" |
|||
|
|||
# Kill replica (m01 VS). Primary (m02) keeps serving. |
|||
- action: exec |
|||
node: m01 |
|||
cmd: "kill -9 {{ vs2_pid }}" |
|||
root: "true" |
|||
ignore_error: true |
|||
|
|||
# Wait for shipper to detect dead replica and enter degraded mode. |
|||
- action: sleep |
|||
duration: 10s |
|||
|
|||
- action: print |
|||
msg: "Replica killed. Shipper should be degraded now." |
|||
|
|||
- name: degraded-write |
|||
actions: |
|||
- action: print |
|||
msg: "=== Degraded: sync_all primary only, replica dead ===" |
|||
|
|||
- action: fio_json |
|||
node: m01 |
|||
device: "{{ device }}" |
|||
rw: randwrite |
|||
bs: 4k |
|||
iodepth: "32" |
|||
runtime: "30" |
|||
time_based: "true" |
|||
name: degraded-write |
|||
save_as: fio_degraded |
|||
|
|||
- action: fio_parse |
|||
json_var: fio_degraded |
|||
metric: iops |
|||
save_as: iops_degraded |
|||
|
|||
- action: print |
|||
msg: "Degraded: {{ iops_degraded }} IOPS" |
|||
|
|||
- name: results |
|||
actions: |
|||
- action: print |
|||
msg: "=== Stable: Degraded Mode (sync_all RF=2, iSCSI/RoCE) ===" |
|||
- action: print |
|||
msg: "Healthy: {{ iops_healthy }} IOPS" |
|||
- action: print |
|||
msg: "Degraded: {{ iops_degraded }} IOPS" |
|||
|
|||
- action: collect_results |
|||
title: "Stable: Degraded Mode (sync_all RF=2)" |
|||
volume_name: "{{ volume_name }}" |
|||
|
|||
- name: cleanup |
|||
always: true |
|||
actions: |
|||
- action: iscsi_cleanup |
|||
node: m01 |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m01 |
|||
pid: "{{ vs2_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ vs1_pid }}" |
|||
ignore_error: true |
|||
- action: stop_weed |
|||
node: m02 |
|||
pid: "{{ master_pid }}" |
|||
ignore_error: true |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue