You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

474 lines
15 KiB

package weed_server
import (
"context"
"fmt"
"path/filepath"
"strings"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
)
// ============================================================
// CP8-3-1 QA Durability Test Suite
//
// 12 adversarial tests proving ACK contract per durability mode.
// ============================================================
// qaDurabilityMaster builds a MasterServer wired with stub volume-server
// callbacks and three block-capable servers, suitable for exercising the
// durability-mode paths without any real volume servers.
func qaDurabilityMaster(t *testing.T) *MasterServer {
	t.Helper()
	master := &MasterServer{
		blockRegistry:        NewBlockVolumeRegistry(),
		blockAssignmentQueue: NewBlockAssignmentQueue(),
		blockFailover:        newBlockFailoverState(),
	}
	// Allocation stub: fabricate deterministic paths/addresses from the
	// volume name and server address.
	master.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
		return &blockAllocResult{
			Path:              fmt.Sprintf("/data/%s.blk", name),
			IQN:               fmt.Sprintf("iqn.2024.test:%s", name),
			ISCSIAddr:         server + ":3260",
			ReplicaDataAddr:   server + ":14260",
			ReplicaCtrlAddr:   server + ":14261",
			RebuildListenAddr: server + ":15000",
		}, nil
	}
	// Remaining stubs simply report success.
	master.blockVSDelete = func(ctx context.Context, server string, name string) error {
		return nil
	}
	master.blockVSExpand = func(ctx context.Context, server string, name string, newSize uint64) (uint64, error) {
		return newSize, nil
	}
	master.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
		return nil
	}
	master.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
		return 2 << 30, nil
	}
	master.blockVSCancelExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) error {
		return nil
	}
	for _, vs := range []string{"vs1:9333", "vs2:9333", "vs3:9333"} {
		master.blockRegistry.MarkBlockCapable(vs)
	}
	return master
}
// T8-1: BestEffort barrier fail does NOT error writes.
func TestDurability_BestEffort_BarrierFail_WriteSucceeds(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "be.blk")
	vol, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize:     4 * 1024 * 1024,
		DurabilityMode: blockvol.DurabilityBestEffort,
	})
	if err != nil {
		t.Fatal(err)
	}
	defer vol.Close()
	if got := vol.DurabilityMode(); got != blockvol.DurabilityBestEffort {
		t.Fatalf("expected best_effort, got %s", got)
	}
	// With no replica group configured (nil group), the distributed sync
	// must complete without error.
	syncFn := blockvol.MakeDistributedSync(func() error { return nil }, nil, vol)
	if err := syncFn(); err != nil {
		t.Fatalf("best_effort sync should succeed with nil group: %v", err)
	}
}
// T8-2: SyncAll barrier fail returns error.
// Here we verify the mode is recorded on the volume and that a standalone
// volume (nil ShipperGroup) still syncs cleanly; the failing-barrier error
// path itself is covered by the engine-level tests.
func TestDurability_SyncAll_BarrierFail_WriteErrors(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "sa.blk")
	vol, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize:     4 * 1024 * 1024,
		DurabilityMode: blockvol.DurabilitySyncAll,
	})
	if err != nil {
		t.Fatal(err)
	}
	defer vol.Close()
	if got := vol.DurabilityMode(); got != blockvol.DurabilitySyncAll {
		t.Fatalf("expected sync_all, got %s", got)
	}
	syncFn := blockvol.MakeDistributedSync(func() error { return nil }, nil, vol)
	if err := syncFn(); err != nil {
		t.Fatalf("sync_all standalone (nil group) should succeed: %v", err)
	}
}
// T8-3: SyncAll all barriers succeed → write OK.
func TestDurability_SyncAll_AllSucceed_WriteOK(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "sa-ok.blk")
	vol, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize:     4 * 1024 * 1024,
		DurabilityMode: blockvol.DurabilitySyncAll,
	})
	if err != nil {
		t.Fatal(err)
	}
	defer vol.Close()
	// A standalone volume (nil group) under sync_all must sync cleanly.
	syncFn := blockvol.MakeDistributedSync(func() error { return nil }, nil, vol)
	if err := syncFn(); err != nil {
		t.Fatalf("expected success: %v", err)
	}
}
// T8-4: SyncAll with zero-length group (standalone) → OK (no replicas configured).
// When replicas are truly all-degraded (Len>0 but AllDegraded==true), the engine
// test in dist_group_commit_test.go covers the error path.
func TestDurability_SyncAll_ZeroGroupStandalone(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "sa-standalone.blk")
	vol, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize:     4 * 1024 * 1024,
		DurabilityMode: blockvol.DurabilitySyncAll,
	})
	if err != nil {
		t.Fatal(err)
	}
	defer vol.Close()
	// An empty ShipperGroup means standalone: there is nothing to barrier
	// against, so sync_all must succeed.
	emptyGroup := blockvol.NewShipperGroup(nil)
	syncFn := blockvol.MakeDistributedSync(func() error { return nil }, emptyGroup, vol)
	if err := syncFn(); err != nil {
		t.Fatalf("sync_all standalone should succeed: %v", err)
	}
}
// T8-5: SyncQuorum RF3 one fail → quorum met → OK.
// Validates the quorum arithmetic at the type level.
func TestDurability_SyncQuorum_RF3_OneFailOK(t *testing.T) {
	mode := blockvol.DurabilitySyncQuorum
	if err := mode.Validate(3); err != nil {
		t.Fatalf("sync_quorum should be valid with RF=3: %v", err)
	}
	// With RF=3 the quorum is 2; the primary counts toward it, so only a
	// single replica ack is required.
	if got := mode.RequiredReplicas(3); got != 1 {
		t.Fatalf("expected RequiredReplicas=1 for sync_quorum RF=3, got %d", got)
	}
}
// T8-6: SyncQuorum RF3 all fail → quorum lost → error.
// With RF=3, quorum=2: if both replicas fail, only the primary is durable
// (1 of 2 needed), so the quorum is lost. This verifies the mode validates
// for RF=3 and is classified as strict (barrier failures must surface).
func TestDurability_SyncQuorum_RF3_TwoFail_Error(t *testing.T) {
	mode := blockvol.DurabilitySyncQuorum
	if err := mode.Validate(3); err != nil {
		t.Fatalf("sync_quorum should be valid with RF=3: %v", err)
	}
	// Idiomatic boolean check (was `IsStrict() != true`, staticcheck S1002).
	if !mode.IsStrict() {
		t.Fatal("sync_quorum should be strict")
	}
}
// T8-7: SyncQuorum RF=2 rejected at create time.
func TestDurability_SyncQuorum_RF2_Rejected(t *testing.T) {
	ms := qaDurabilityMaster(t)
	// sync_quorum needs an odd replica factor ≥ 3; RF=2 must be refused.
	_, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
		Name:           "sq-rf2",
		SizeBytes:      1 << 30,
		DurabilityMode: "sync_quorum",
		ReplicaFactor:  2,
	})
	if err == nil {
		t.Fatal("expected error: sync_quorum + RF=2 should be rejected")
	}
	if !strings.Contains(err.Error(), "incompatible") {
		t.Fatalf("expected incompatibility error, got: %v", err)
	}
}
// T8-8: Superblock persistence — create sync_all, close, reopen, mode preserved.
func TestDurability_SuperblockPersistence(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "persist.blk")
	created, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize:     4 * 1024 * 1024,
		DurabilityMode: blockvol.DurabilitySyncAll,
	})
	if err != nil {
		t.Fatal(err)
	}
	if got := created.DurabilityMode(); got != blockvol.DurabilitySyncAll {
		t.Fatalf("expected sync_all after create, got %s", got)
	}
	created.Close()
	// The durability mode must survive the round trip through disk.
	reopened, err := blockvol.OpenBlockVol(volPath)
	if err != nil {
		t.Fatal(err)
	}
	defer reopened.Close()
	if got := reopened.DurabilityMode(); got != blockvol.DurabilitySyncAll {
		t.Fatalf("expected sync_all after reopen, got %s", got)
	}
}
// T8-9: V1 compat — V1 volume (mode byte=0) reads as best_effort.
func TestDurability_V1Compat_DefaultBestEffort(t *testing.T) {
	volPath := filepath.Join(t.TempDir(), "v1.blk")
	// DurabilityMode left at its zero value: should map to best_effort.
	created, err := blockvol.CreateBlockVol(volPath, blockvol.CreateOptions{
		VolumeSize: 4 * 1024 * 1024,
	})
	if err != nil {
		t.Fatal(err)
	}
	if got := created.DurabilityMode(); got != blockvol.DurabilityBestEffort {
		t.Fatalf("expected best_effort for zero-value, got %s", got)
	}
	created.Close()
	// After reopen the mode must still read as best_effort.
	reopened, err := blockvol.OpenBlockVol(volPath)
	if err != nil {
		t.Fatal(err)
	}
	defer reopened.Close()
	if got := reopened.DurabilityMode(); got != blockvol.DurabilityBestEffort {
		t.Fatalf("expected best_effort for V1 file, got %s", got)
	}
}
// T8-10: SyncAll degraded recovery — degrade → rebuild + SetReplicaAddrs → writes resume.
// Exercised at the registry level: a sync_all + RF=2 create must record the
// mode and provision at least one replica.
func TestDurability_SyncAll_DegradedRecovery(t *testing.T) {
	ms := qaDurabilityMaster(t)
	resp, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
		Name:           "sync-all-recovery",
		SizeBytes:      1 << 30,
		DurabilityMode: "sync_all",
		ReplicaFactor:  2,
	})
	if err != nil {
		t.Fatalf("create sync_all: %v", err)
	}
	// The registry entry must carry the durability mode and its replicas.
	entry, found := ms.blockRegistry.Lookup(resp.VolumeId)
	switch {
	case !found:
		t.Fatal("volume not in registry")
	case entry.DurabilityMode != "sync_all":
		t.Fatalf("expected sync_all in registry, got %q", entry.DurabilityMode)
	case len(entry.Replicas) < 1:
		t.Fatal("expected at least 1 replica for sync_all RF=2")
	}
}
// T8-11: Heartbeat reports correct durability mode.
func TestDurability_Heartbeat_ReportsMode(t *testing.T) {
	ms := qaDurabilityMaster(t)
	// Register the volume so heartbeats have an entry to update.
	if _, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
		Name:           "hb-vol",
		SizeBytes:      1 << 30,
		DurabilityMode: "sync_all",
	}); err != nil {
		t.Fatalf("create: %v", err)
	}
	e, found := ms.blockRegistry.Lookup("hb-vol")
	if !found {
		t.Fatal("volume not in registry")
	}
	if e.DurabilityMode != "sync_all" {
		t.Fatalf("expected sync_all from create, got %q", e.DurabilityMode)
	}
	// F3: a heartbeat that omits DurabilityMode must NOT clobber the mode
	// recorded at create time.
	ms.blockRegistry.UpdateFullHeartbeat(e.VolumeServer, []*master_pb.BlockVolumeInfoMessage{{
		Path:           e.Path,
		VolumeSize:     1 << 30,
		DurabilityMode: "",
	}})
	e, found = ms.blockRegistry.Lookup("hb-vol")
	if !found {
		t.Fatal("volume not in registry after heartbeat")
	}
	if e.DurabilityMode != "sync_all" {
		t.Fatalf("F3: empty heartbeat should NOT overwrite mode, got %q", e.DurabilityMode)
	}
	// A heartbeat that does carry a mode is applied.
	ms.blockRegistry.UpdateFullHeartbeat(e.VolumeServer, []*master_pb.BlockVolumeInfoMessage{{
		Path:           e.Path,
		VolumeSize:     1 << 30,
		DurabilityMode: "sync_all",
	}})
	e, found = ms.blockRegistry.Lookup("hb-vol")
	if !found {
		t.Fatal("volume not in registry after second heartbeat")
	}
	if e.DurabilityMode != "sync_all" {
		t.Fatalf("expected sync_all after non-empty heartbeat, got %q", e.DurabilityMode)
	}
}
// T8-12: Mixed modes on same server — two volumes, one best_effort + one sync_all.
func TestDurability_MixedModes_SameServer(t *testing.T) {
	ms := qaDurabilityMaster(t)
	ctx := context.Background()
	// Default (no mode) volume → best_effort.
	beResp, err := ms.CreateBlockVolume(ctx, &master_pb.CreateBlockVolumeRequest{
		Name:      "mixed-be",
		SizeBytes: 1 << 30,
	})
	if err != nil {
		t.Fatalf("create best_effort: %v", err)
	}
	// Explicit sync_all volume on the same servers.
	saResp, err := ms.CreateBlockVolume(ctx, &master_pb.CreateBlockVolumeRequest{
		Name:           "mixed-sa",
		SizeBytes:      1 << 30,
		DurabilityMode: "sync_all",
	})
	if err != nil {
		t.Fatalf("create sync_all: %v", err)
	}
	beEntry, _ := ms.blockRegistry.Lookup(beResp.VolumeId)
	saEntry, _ := ms.blockRegistry.Lookup(saResp.VolumeId)
	// An empty mode in the registry is the implicit default, best_effort.
	beMode := beEntry.DurabilityMode
	if beMode == "" {
		beMode = "best_effort"
	}
	if beMode != "best_effort" {
		t.Fatalf("expected best_effort for mixed-be, got %q", beMode)
	}
	if saEntry.DurabilityMode != "sync_all" {
		t.Fatalf("expected sync_all for mixed-sa, got %q", saEntry.DurabilityMode)
	}
}
// T8-extra: SyncAll partial replica creation fails and cleans up (F1).
func TestDurability_SyncAll_PartialReplicaFails(t *testing.T) {
	ms := qaDurabilityMaster(t)
	// First (primary) allocation succeeds; every replica allocation fails.
	primaryAllocated := false
	ms.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
		if primaryAllocated {
			return nil, fmt.Errorf("disk full on replica")
		}
		primaryAllocated = true
		return &blockAllocResult{
			Path:              fmt.Sprintf("/data/%s.blk", name),
			IQN:               fmt.Sprintf("iqn.2024.test:%s", name),
			ISCSIAddr:         server + ":3260",
			ReplicaDataAddr:   server + ":14260",
			ReplicaCtrlAddr:   server + ":14261",
			RebuildListenAddr: server + ":15000",
		}, nil
	}
	// Track which servers get a delete call during rollback.
	var deleted []string
	ms.blockVSDelete = func(ctx context.Context, server string, name string) error {
		deleted = append(deleted, server)
		return nil
	}
	_, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
		Name:           "sync-all-partial",
		SizeBytes:      1 << 30,
		DurabilityMode: "sync_all",
		ReplicaFactor:  2,
	})
	if err == nil {
		t.Fatal("expected error: sync_all should fail when replicas can't be provisioned")
	}
	if !strings.Contains(err.Error(), "requires") {
		t.Fatalf("expected requires error, got: %v", err)
	}
	// The partially created primary must have been rolled back...
	if len(deleted) == 0 {
		t.Fatal("expected cleanup of partial create")
	}
	// ...and no registry entry should remain.
	if _, ok := ms.blockRegistry.Lookup("sync-all-partial"); ok {
		t.Fatal("partially created volume should not remain in registry")
	}
}
// T8-extra: BestEffort partial replica OK (F1 backward compat).
func TestDurability_BestEffort_PartialReplicaOK(t *testing.T) {
	ms := qaDurabilityMaster(t)
	// First (primary) allocation succeeds; replica allocations fail.
	primaryAllocated := false
	ms.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
		if primaryAllocated {
			return nil, fmt.Errorf("disk full")
		}
		primaryAllocated = true
		return &blockAllocResult{
			Path:              fmt.Sprintf("/data/%s.blk", name),
			IQN:               fmt.Sprintf("iqn.2024.test:%s", name),
			ISCSIAddr:         server + ":3260",
			ReplicaDataAddr:   server + ":14260",
			ReplicaCtrlAddr:   server + ":14261",
			RebuildListenAddr: server + ":15000",
		}, nil
	}
	ms.blockVSDelete = func(ctx context.Context, server string, name string) error {
		return nil
	}
	// best_effort tolerates missing replicas: creation must still succeed.
	if _, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
		Name:      "be-partial",
		SizeBytes: 1 << 30,
	}); err != nil {
		t.Fatalf("best_effort should succeed without replicas: %v", err)
	}
}
// Anchor the "time" import so the file compiles even though no test in this
// chunk currently references the package directly.
var _ = time.Second