You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1902 lines
57 KiB
1902 lines
57 KiB
package weed_server
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
|
|
)
|
|
|
|
func TestRegistry_RegisterLookup(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
entry := &BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "server1:9333",
|
|
Path: "/data/vol1.blk",
|
|
IQN: "iqn.2024.com.seaweedfs:vol1",
|
|
ISCSIAddr: "10.0.0.1:3260",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 1,
|
|
Role: 1,
|
|
Status: StatusPending,
|
|
}
|
|
if err := r.Register(entry); err != nil {
|
|
t.Fatalf("Register: %v", err)
|
|
}
|
|
got, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("Lookup: not found")
|
|
}
|
|
if got.Name != "vol1" || got.VolumeServer != "server1:9333" || got.Path != "/data/vol1.blk" {
|
|
t.Fatalf("Lookup: unexpected entry: %+v", got)
|
|
}
|
|
if got.Status != StatusPending {
|
|
t.Fatalf("Status: got %d, want %d", got.Status, StatusPending)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_Unregister(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/vol1.blk"})
|
|
removed := r.Unregister("vol1")
|
|
if removed == nil {
|
|
t.Fatal("Unregister returned nil")
|
|
}
|
|
if _, ok := r.Lookup("vol1"); ok {
|
|
t.Fatal("vol1 should not be found after Unregister")
|
|
}
|
|
// Double unregister returns nil.
|
|
if r.Unregister("vol1") != nil {
|
|
t.Fatal("double Unregister should return nil")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_DuplicateRegister(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/vol1.blk"})
|
|
err := r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s2", Path: "/vol1.blk"})
|
|
if err == nil {
|
|
t.Fatal("duplicate Register should return error")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_ListByServer(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
r.Register(&BlockVolumeEntry{Name: "vol2", VolumeServer: "s1", Path: "/v2.blk"})
|
|
r.Register(&BlockVolumeEntry{Name: "vol3", VolumeServer: "s2", Path: "/v3.blk"})
|
|
|
|
s1Vols := r.ListByServer("s1")
|
|
if len(s1Vols) != 2 {
|
|
t.Fatalf("ListByServer(s1): got %d, want 2", len(s1Vols))
|
|
}
|
|
s2Vols := r.ListByServer("s2")
|
|
if len(s2Vols) != 1 {
|
|
t.Fatalf("ListByServer(s2): got %d, want 1", len(s2Vols))
|
|
}
|
|
s3Vols := r.ListByServer("s3")
|
|
if len(s3Vols) != 0 {
|
|
t.Fatalf("ListByServer(s3): got %d, want 0", len(s3Vols))
|
|
}
|
|
}
|
|
|
|
func TestRegistry_UpdateFullHeartbeat(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// Register two volumes on server s1.
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk", Status: StatusPending})
|
|
r.Register(&BlockVolumeEntry{Name: "vol2", VolumeServer: "s1", Path: "/v2.blk", Status: StatusPending})
|
|
|
|
// Full heartbeat reports only vol1 (vol2 is stale).
|
|
r.UpdateFullHeartbeat("s1", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/v1.blk", Epoch: 5, Role: 1},
|
|
}, "")
|
|
|
|
// vol1 should be Active.
|
|
e1, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 should exist after full heartbeat")
|
|
}
|
|
if e1.Status != StatusActive {
|
|
t.Fatalf("vol1 status: got %d, want %d", e1.Status, StatusActive)
|
|
}
|
|
if e1.Epoch != 5 {
|
|
t.Fatalf("vol1 epoch: got %d, want 5", e1.Epoch)
|
|
}
|
|
|
|
// vol2 should be removed (stale).
|
|
if _, ok := r.Lookup("vol2"); ok {
|
|
t.Fatal("vol2 should have been removed as stale")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_UpdateDeltaHeartbeat(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk", Status: StatusPending})
|
|
r.Register(&BlockVolumeEntry{Name: "vol2", VolumeServer: "s1", Path: "/v2.blk", Status: StatusActive})
|
|
|
|
// Delta: vol1 newly appeared, vol2 deleted.
|
|
r.UpdateDeltaHeartbeat("s1",
|
|
[]*master_pb.BlockVolumeShortInfoMessage{{Path: "/v1.blk"}},
|
|
[]*master_pb.BlockVolumeShortInfoMessage{{Path: "/v2.blk"}},
|
|
)
|
|
|
|
// vol1 should be Active.
|
|
e1, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 should exist")
|
|
}
|
|
if e1.Status != StatusActive {
|
|
t.Fatalf("vol1 status: got %d, want Active", e1.Status)
|
|
}
|
|
|
|
// vol2 should be removed.
|
|
if _, ok := r.Lookup("vol2"); ok {
|
|
t.Fatal("vol2 should have been removed by delta")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PendingToActive(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "s1", Path: "/v1.blk",
|
|
Status: StatusPending, Epoch: 1,
|
|
})
|
|
|
|
// Full heartbeat confirms the volume.
|
|
r.UpdateFullHeartbeat("s1", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/v1.blk", Epoch: 1, Role: 1},
|
|
}, "")
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.Status != StatusActive {
|
|
t.Fatalf("expected Active after heartbeat, got %d", e.Status)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PickServer(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// s1 has 2 volumes, s2 has 1, s3 has 0.
|
|
r.Register(&BlockVolumeEntry{Name: "v1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
r.Register(&BlockVolumeEntry{Name: "v2", VolumeServer: "s1", Path: "/v2.blk"})
|
|
r.Register(&BlockVolumeEntry{Name: "v3", VolumeServer: "s2", Path: "/v3.blk"})
|
|
|
|
got, err := r.PickServer([]string{"s1", "s2", "s3"})
|
|
if err != nil {
|
|
t.Fatalf("PickServer: %v", err)
|
|
}
|
|
if got != "s3" {
|
|
t.Fatalf("PickServer: got %q, want s3 (fewest volumes)", got)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PickServerEmpty(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
_, err := r.PickServer(nil)
|
|
if err == nil {
|
|
t.Fatal("PickServer with no servers should return error")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_InflightLock(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First acquire succeeds.
|
|
if !r.AcquireInflight("vol1") {
|
|
t.Fatal("first AcquireInflight should succeed")
|
|
}
|
|
|
|
// Second acquire for same name fails.
|
|
if r.AcquireInflight("vol1") {
|
|
t.Fatal("second AcquireInflight for same name should fail")
|
|
}
|
|
|
|
// Different name succeeds.
|
|
if !r.AcquireInflight("vol2") {
|
|
t.Fatal("AcquireInflight for different name should succeed")
|
|
}
|
|
|
|
// Release and re-acquire.
|
|
r.ReleaseInflight("vol1")
|
|
if !r.AcquireInflight("vol1") {
|
|
t.Fatal("AcquireInflight after release should succeed")
|
|
}
|
|
|
|
r.ReleaseInflight("vol1")
|
|
r.ReleaseInflight("vol2")
|
|
}
|
|
|
|
func TestRegistry_UnmarkDeadServer(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("s1")
|
|
r.MarkBlockCapable("s2")
|
|
|
|
servers := r.BlockCapableServers()
|
|
if len(servers) != 2 {
|
|
t.Fatalf("expected 2 servers, got %d", len(servers))
|
|
}
|
|
|
|
// Simulate s1 disconnect.
|
|
r.UnmarkBlockCapable("s1")
|
|
|
|
servers = r.BlockCapableServers()
|
|
if len(servers) != 1 {
|
|
t.Fatalf("expected 1 server after unmark, got %d", len(servers))
|
|
}
|
|
if servers[0] != "s2" {
|
|
t.Fatalf("expected s2, got %s", servers[0])
|
|
}
|
|
}
|
|
|
|
func TestRegistry_FullHeartbeatUpdatesSizeBytes(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "s1", Path: "/v1.blk",
|
|
SizeBytes: 1 << 30, Status: StatusPending,
|
|
})
|
|
|
|
// Heartbeat with updated size (online resize).
|
|
r.UpdateFullHeartbeat("s1", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/v1.blk", VolumeSize: 2 << 30, Epoch: 1, Role: 1},
|
|
}, "")
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.SizeBytes != 2<<30 {
|
|
t.Fatalf("SizeBytes: got %d, want %d", e.SizeBytes, 2<<30)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_ConcurrentAccess(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
var wg sync.WaitGroup
|
|
n := 50
|
|
|
|
// Concurrent register.
|
|
for i := 0; i < n; i++ {
|
|
wg.Add(1)
|
|
go func(i int) {
|
|
defer wg.Done()
|
|
name := fmt.Sprintf("vol%d", i)
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: name, VolumeServer: "s1",
|
|
Path: fmt.Sprintf("/v%d.blk", i),
|
|
})
|
|
}(i)
|
|
}
|
|
wg.Wait()
|
|
|
|
// All should be findable.
|
|
for i := 0; i < n; i++ {
|
|
name := fmt.Sprintf("vol%d", i)
|
|
if _, ok := r.Lookup(name); !ok {
|
|
t.Fatalf("vol%d not found after concurrent register", i)
|
|
}
|
|
}
|
|
|
|
// Concurrent unregister.
|
|
for i := 0; i < n; i++ {
|
|
wg.Add(1)
|
|
go func(i int) {
|
|
defer wg.Done()
|
|
r.Unregister(fmt.Sprintf("vol%d", i))
|
|
}(i)
|
|
}
|
|
wg.Wait()
|
|
|
|
// All should be gone.
|
|
for i := 0; i < n; i++ {
|
|
if _, ok := r.Lookup(fmt.Sprintf("vol%d", i)); ok {
|
|
t.Fatalf("vol%d found after concurrent unregister", i)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRegistry_SetReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
err := r.SetReplica("vol1", "s2", "/replica/v1.blk", "10.0.0.2:3260", "iqn.2024.test:vol1-replica")
|
|
if err != nil {
|
|
t.Fatalf("SetReplica: %v", err)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.ReplicaServer != "s2" {
|
|
t.Fatalf("ReplicaServer: got %q, want s2", e.ReplicaServer)
|
|
}
|
|
if e.ReplicaPath != "/replica/v1.blk" {
|
|
t.Fatalf("ReplicaPath: got %q", e.ReplicaPath)
|
|
}
|
|
if e.ReplicaISCSIAddr != "10.0.0.2:3260" {
|
|
t.Fatalf("ReplicaISCSIAddr: got %q", e.ReplicaISCSIAddr)
|
|
}
|
|
if e.ReplicaIQN != "iqn.2024.test:vol1-replica" {
|
|
t.Fatalf("ReplicaIQN: got %q", e.ReplicaIQN)
|
|
}
|
|
|
|
// Replica server should appear in byServer index.
|
|
s2Vols := r.ListByServer("s2")
|
|
if len(s2Vols) != 1 || s2Vols[0].Name != "vol1" {
|
|
t.Fatalf("ListByServer(s2): got %v, want [vol1]", s2Vols)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_ClearReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
r.SetReplica("vol1", "s2", "/replica/v1.blk", "10.0.0.2:3260", "iqn.2024.test:vol1-replica")
|
|
|
|
err := r.ClearReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("ClearReplica: %v", err)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.ReplicaServer != "" {
|
|
t.Fatalf("ReplicaServer should be empty, got %q", e.ReplicaServer)
|
|
}
|
|
if e.ReplicaPath != "" || e.ReplicaISCSIAddr != "" || e.ReplicaIQN != "" {
|
|
t.Fatal("replica fields should be empty after ClearReplica")
|
|
}
|
|
|
|
// Replica server should be gone from byServer index.
|
|
s2Vols := r.ListByServer("s2")
|
|
if len(s2Vols) != 0 {
|
|
t.Fatalf("ListByServer(s2) after clear: got %d, want 0", len(s2Vols))
|
|
}
|
|
}
|
|
|
|
func TestRegistry_SetReplicaNotFound(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
err := r.SetReplica("nonexistent", "s2", "/r.blk", "addr", "iqn")
|
|
if err == nil {
|
|
t.Fatal("SetReplica on nonexistent volume should return error")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_SwapPrimaryReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1",
|
|
Path: "/v1.blk",
|
|
IQN: "iqn:vol1-primary",
|
|
ISCSIAddr: "10.0.0.1:3260",
|
|
ReplicaServer: "s2",
|
|
ReplicaPath: "/replica/v1.blk",
|
|
ReplicaIQN: "iqn:vol1-replica",
|
|
ReplicaISCSIAddr: "10.0.0.2:3260",
|
|
Epoch: 3,
|
|
Role: 1,
|
|
})
|
|
|
|
newEpoch, err := r.SwapPrimaryReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("SwapPrimaryReplica: %v", err)
|
|
}
|
|
if newEpoch != 4 {
|
|
t.Fatalf("newEpoch: got %d, want 4", newEpoch)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
// New primary should be the old replica.
|
|
if e.VolumeServer != "s2" {
|
|
t.Fatalf("VolumeServer after swap: got %q, want s2", e.VolumeServer)
|
|
}
|
|
if e.Path != "/replica/v1.blk" {
|
|
t.Fatalf("Path after swap: got %q", e.Path)
|
|
}
|
|
if e.Epoch != 4 {
|
|
t.Fatalf("Epoch after swap: got %d, want 4", e.Epoch)
|
|
}
|
|
// Old primary should become replica.
|
|
if e.ReplicaServer != "s1" {
|
|
t.Fatalf("ReplicaServer after swap: got %q, want s1", e.ReplicaServer)
|
|
}
|
|
if e.ReplicaPath != "/v1.blk" {
|
|
t.Fatalf("ReplicaPath after swap: got %q", e.ReplicaPath)
|
|
}
|
|
}
|
|
|
|
func TestFullHeartbeat_UpdatesReplicaAddrs(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "server1",
|
|
Path: "/data/vol1.blk",
|
|
SizeBytes: 1 << 30,
|
|
Status: StatusPending,
|
|
})
|
|
|
|
// Full heartbeat includes replica addresses.
|
|
r.UpdateFullHeartbeat("server1", []*master_pb.BlockVolumeInfoMessage{
|
|
{
|
|
Path: "/data/vol1.blk",
|
|
VolumeSize: 1 << 30,
|
|
Epoch: 5,
|
|
Role: 1,
|
|
ReplicaDataAddr: "10.0.0.2:14260",
|
|
ReplicaCtrlAddr: "10.0.0.2:14261",
|
|
},
|
|
}, "")
|
|
|
|
entry, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 not found after heartbeat")
|
|
}
|
|
if entry.Status != StatusActive {
|
|
t.Fatalf("expected Active, got %v", entry.Status)
|
|
}
|
|
if entry.ReplicaDataAddr != "10.0.0.2:14260" {
|
|
t.Fatalf("ReplicaDataAddr: got %q, want 10.0.0.2:14260", entry.ReplicaDataAddr)
|
|
}
|
|
if entry.ReplicaCtrlAddr != "10.0.0.2:14261" {
|
|
t.Fatalf("ReplicaCtrlAddr: got %q, want 10.0.0.2:14261", entry.ReplicaCtrlAddr)
|
|
}
|
|
}
|
|
|
|
// --- CP8-2 new tests ---
|
|
|
|
func TestRegistry_AddReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
err := r.AddReplica("vol1", ReplicaInfo{
|
|
Server: "s2",
|
|
Path: "/replica/v1.blk",
|
|
ISCSIAddr: "10.0.0.2:3260",
|
|
IQN: "iqn:vol1-r1",
|
|
DataAddr: "s2:14260",
|
|
CtrlAddr: "s2:14261",
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("AddReplica: %v", err)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("Replicas len: got %d, want 1", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Server != "s2" {
|
|
t.Fatalf("Replicas[0].Server: got %q", e.Replicas[0].Server)
|
|
}
|
|
// Deprecated scalar should be synced.
|
|
if e.ReplicaServer != "s2" {
|
|
t.Fatalf("ReplicaServer (deprecated): got %q", e.ReplicaServer)
|
|
}
|
|
// byServer index should include replica.
|
|
if len(r.ListByServer("s2")) != 1 {
|
|
t.Fatalf("ListByServer(s2): got %d, want 1", len(r.ListByServer("s2")))
|
|
}
|
|
}
|
|
|
|
func TestRegistry_AddReplica_TwoRF3(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk", ReplicaFactor: 3})
|
|
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1"})
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2"})
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if len(e.Replicas) != 2 {
|
|
t.Fatalf("Replicas len: got %d, want 2", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Server != "s2" || e.Replicas[1].Server != "s3" {
|
|
t.Fatalf("Replicas: got %+v", e.Replicas)
|
|
}
|
|
// byServer index should include both.
|
|
if len(r.ListByServer("s2")) != 1 || len(r.ListByServer("s3")) != 1 {
|
|
t.Fatal("byServer should include both replica servers")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_AddReplica_Upsert(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s2", Path: "/r1.blk"})
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s2", Path: "/r1-new.blk"})
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("Replicas len: got %d, want 1 (upsert, not duplicate)", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Path != "/r1-new.blk" {
|
|
t.Fatalf("Replicas[0].Path: got %q, want /r1-new.blk", e.Replicas[0].Path)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_RemoveReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s2", Path: "/r1.blk"})
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s3", Path: "/r2.blk"})
|
|
|
|
err := r.RemoveReplica("vol1", "s2")
|
|
if err != nil {
|
|
t.Fatalf("RemoveReplica: %v", err)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("Replicas len: got %d, want 1", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Server != "s3" {
|
|
t.Fatalf("remaining replica should be s3, got %q", e.Replicas[0].Server)
|
|
}
|
|
// Deprecated scalar should sync to first remaining replica.
|
|
if e.ReplicaServer != "s3" {
|
|
t.Fatalf("ReplicaServer (deprecated): got %q, want s3", e.ReplicaServer)
|
|
}
|
|
// s2 should be removed from byServer.
|
|
if len(r.ListByServer("s2")) != 0 {
|
|
t.Fatalf("ListByServer(s2): got %d, want 0", len(r.ListByServer("s2")))
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("s2")
|
|
r.MarkBlockCapable("s3")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1",
|
|
Path: "/v1.blk",
|
|
Epoch: 5,
|
|
Role: 1,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
},
|
|
})
|
|
// Add to byServer for s2 and s3.
|
|
r.mu.Lock()
|
|
r.addToServer("s2", "vol1")
|
|
r.addToServer("s3", "vol1")
|
|
r.mu.Unlock()
|
|
|
|
newEpoch, err := r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("PromoteBestReplica: %v", err)
|
|
}
|
|
if newEpoch != 6 {
|
|
t.Fatalf("newEpoch: got %d, want 6", newEpoch)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
// s3 had higher health score → promoted.
|
|
if e.VolumeServer != "s3" {
|
|
t.Fatalf("VolumeServer: got %q, want s3 (higher health)", e.VolumeServer)
|
|
}
|
|
if e.Path != "/r2.blk" {
|
|
t.Fatalf("Path: got %q", e.Path)
|
|
}
|
|
// s2 should remain in Replicas.
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("Replicas len: got %d, want 1 (s2 stays)", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Server != "s2" {
|
|
t.Fatalf("remaining replica: got %q, want s2", e.Replicas[0].Server)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PromoteBestReplica_NoReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("PromoteBestReplica with no replicas should return error")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("s2")
|
|
r.MarkBlockCapable("s3")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1",
|
|
Path: "/v1.blk",
|
|
Epoch: 3,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
},
|
|
})
|
|
r.mu.Lock()
|
|
r.addToServer("s2", "vol1")
|
|
r.addToServer("s3", "vol1")
|
|
r.mu.Unlock()
|
|
|
|
newEpoch, err := r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("PromoteBestReplica: %v", err)
|
|
}
|
|
if newEpoch != 4 {
|
|
t.Fatalf("newEpoch: got %d, want 4", newEpoch)
|
|
}
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
// Same health → tie-break by WALHeadLSN → s3 wins.
|
|
if e.VolumeServer != "s3" {
|
|
t.Fatalf("VolumeServer: got %q, want s3 (higher LSN)", e.VolumeServer)
|
|
}
|
|
if len(e.Replicas) != 1 || e.Replicas[0].Server != "s2" {
|
|
t.Fatalf("remaining replica: got %+v, want [s2]", e.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_PromoteBestReplica_KeepsOthers(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("s2")
|
|
r.MarkBlockCapable("s3")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1",
|
|
Path: "/v1.blk",
|
|
Epoch: 1,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
|
},
|
|
})
|
|
r.mu.Lock()
|
|
r.addToServer("s2", "vol1")
|
|
r.addToServer("s3", "vol1")
|
|
r.mu.Unlock()
|
|
|
|
r.PromoteBestReplica("vol1")
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
// s2 promoted, s3 stays.
|
|
if e.VolumeServer != "s2" {
|
|
t.Fatalf("VolumeServer: got %q, want s2", e.VolumeServer)
|
|
}
|
|
if len(e.Replicas) != 1 || e.Replicas[0].Server != "s3" {
|
|
t.Fatalf("remaining replicas: got %+v, want [s3]", e.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_BackwardCompatAccessors(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.HasReplica() {
|
|
t.Fatal("HasReplica should be false with no replicas")
|
|
}
|
|
if e.FirstReplica() != nil {
|
|
t.Fatal("FirstReplica should be nil")
|
|
}
|
|
if e.BestReplicaForPromotion() != nil {
|
|
t.Fatal("BestReplicaForPromotion should be nil")
|
|
}
|
|
|
|
r.AddReplica("vol1", ReplicaInfo{Server: "s2", Path: "/r.blk", HealthScore: 0.9})
|
|
|
|
e, _ = r.Lookup("vol1")
|
|
if !e.HasReplica() {
|
|
t.Fatal("HasReplica should be true after AddReplica")
|
|
}
|
|
if e.FirstReplica() == nil || e.FirstReplica().Server != "s2" {
|
|
t.Fatal("FirstReplica should return s2")
|
|
}
|
|
if e.ReplicaByServer("s2") == nil {
|
|
t.Fatal("ReplicaByServer(s2) should not be nil")
|
|
}
|
|
if e.ReplicaByServer("s3") != nil {
|
|
t.Fatal("ReplicaByServer(s3) should be nil")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_ReplicaFactorDefault(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{Name: "vol1", VolumeServer: "s1", Path: "/v1.blk"})
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
// ReplicaFactor defaults to 0 (zero value). API handler defaults to 2.
|
|
if e.ReplicaFactor != 0 {
|
|
t.Fatalf("default ReplicaFactor: got %d, want 0", e.ReplicaFactor)
|
|
}
|
|
|
|
// Explicit RF=3.
|
|
r.Register(&BlockVolumeEntry{Name: "vol2", VolumeServer: "s1", Path: "/v2.blk", ReplicaFactor: 3})
|
|
e2, _ := r.Lookup("vol2")
|
|
if e2.ReplicaFactor != 3 {
|
|
t.Fatalf("ReplicaFactor: got %d, want 3", e2.ReplicaFactor)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_FullHeartbeat_UpdatesHealthScore(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1",
|
|
Path: "/v1.blk",
|
|
Status: StatusPending,
|
|
})
|
|
|
|
r.UpdateFullHeartbeat("s1", []*master_pb.BlockVolumeInfoMessage{
|
|
{
|
|
Path: "/v1.blk",
|
|
VolumeSize: 1 << 30,
|
|
Epoch: 1,
|
|
Role: 1,
|
|
HealthScore: 0.85,
|
|
ScrubErrors: 2,
|
|
WalHeadLsn: 500,
|
|
},
|
|
}, "")
|
|
|
|
e, _ := r.Lookup("vol1")
|
|
if e.HealthScore != 0.85 {
|
|
t.Fatalf("HealthScore: got %f, want 0.85", e.HealthScore)
|
|
}
|
|
if e.WALHeadLSN != 500 {
|
|
t.Fatalf("WALHeadLSN: got %d, want 500", e.WALHeadLSN)
|
|
}
|
|
}
|
|
|
|
// Fix #1: Replica heartbeat must NOT delete the volume.
|
|
func TestRegistry_ReplicaHeartbeat_DoesNotDeleteVolume(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Status: StatusActive,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "replica1", Path: "/data/vol1.blk"},
|
|
},
|
|
})
|
|
|
|
// Replica sends heartbeat reporting its path.
|
|
r.UpdateFullHeartbeat("replica1", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 1, Role: 2},
|
|
}, "")
|
|
|
|
// Volume must still exist with primary intact.
|
|
e, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 should not be deleted when replica sends heartbeat")
|
|
}
|
|
if e.VolumeServer != "primary" {
|
|
t.Fatalf("primary should remain 'primary', got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// Fix #1: Replica path NOT reported → replica removed, volume preserved.
|
|
func TestRegistry_ReplicaHeartbeat_StaleReplicaRemoved(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Status: StatusActive,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "replica1", Path: "/data/vol1.blk"},
|
|
{Server: "replica2", Path: "/data/vol1.blk"},
|
|
},
|
|
})
|
|
|
|
// replica1 heartbeat does NOT report vol1 path → stale replica.
|
|
r.UpdateFullHeartbeat("replica1", []*master_pb.BlockVolumeInfoMessage{}, "")
|
|
|
|
// Volume still exists, but replica1 removed.
|
|
e, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 should exist (only replica removed)")
|
|
}
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("expected 1 replica after stale removal, got %d", len(e.Replicas))
|
|
}
|
|
if e.Replicas[0].Server != "replica2" {
|
|
t.Fatalf("remaining replica should be replica2, got %q", e.Replicas[0].Server)
|
|
}
|
|
}
|
|
|
|
// Fix #3: Replica heartbeat after master restart reconstructs ReplicaInfo.
|
|
func TestRegistry_ReplicaHeartbeat_ReconstructsAfterRestart(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// Simulate master restart: primary heartbeat re-created entry (epoch 1).
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
WALHeadLSN: 100,
|
|
Status: StatusActive,
|
|
})
|
|
|
|
// Replica heartbeat arrives — vol1 exists but has no record of this server.
|
|
// Same epoch, lower LSN, Role=2 (replica) → added as replica.
|
|
r.UpdateFullHeartbeat("replica1", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 1, Role: 2, HealthScore: 0.95, WalHeadLsn: 42},
|
|
}, "")
|
|
|
|
// vol1 should now have replica1 in Replicas[].
|
|
e, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 should exist")
|
|
}
|
|
if len(e.Replicas) != 1 {
|
|
t.Fatalf("expected 1 replica after reconstruction, got %d", len(e.Replicas))
|
|
}
|
|
ri := e.Replicas[0]
|
|
if ri.Server != "replica1" {
|
|
t.Fatalf("replica server: got %q, want replica1", ri.Server)
|
|
}
|
|
if ri.HealthScore != 0.95 {
|
|
t.Fatalf("replica health: got %f, want 0.95", ri.HealthScore)
|
|
}
|
|
if ri.WALHeadLSN != 42 {
|
|
t.Fatalf("replica WALHeadLSN: got %d, want 42", ri.WALHeadLSN)
|
|
}
|
|
// byServer index should include replica1.
|
|
entries := r.ListByServer("replica1")
|
|
if len(entries) != 1 || entries[0].Name != "vol1" {
|
|
t.Fatalf("ListByServer(replica1) should return vol1, got %+v", entries)
|
|
}
|
|
}
|
|
|
|
// Fix #2: Stale replica (old heartbeat) not eligible for promotion.
|
|
func TestRegistry_PromoteBestReplica_StaleHeartbeatIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 5 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "stale-replica",
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now().Add(-30 * time.Second), // stale (>2×5s)
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: stale replica should not be eligible")
|
|
}
|
|
}
|
|
|
|
// Fix #2: Replica with WAL lag too large is not eligible.
|
|
func TestRegistry_PromoteBestReplica_WALLagIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 1000,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "lagging",
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 800, // lag=200, tolerance=100
|
|
LastHeartbeat: time.Now(),
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: lagging replica should not be eligible")
|
|
}
|
|
}
|
|
|
|
// Fix #2: Rebuilding replica is not eligible for promotion.
|
|
func TestRegistry_PromoteBestReplica_RebuildingIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "rebuilding",
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(),
|
|
Role: blockvol.RoleToWire(blockvol.RoleRebuilding),
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: rebuilding replica should not be eligible")
|
|
}
|
|
}
|
|
|
|
// Fix #2: Among eligible replicas, best (health+LSN) wins.
|
|
func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("stale")
|
|
r.MarkBlockCapable("good")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "stale", // ineligible: old heartbeat
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now().Add(-2 * time.Minute),
|
|
},
|
|
{
|
|
Server: "good", // eligible
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 0.8,
|
|
WALHeadLSN: 95,
|
|
LastHeartbeat: time.Now(),
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("expected promotion to succeed: %v", err)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "good" {
|
|
t.Fatalf("expected 'good' promoted (only eligible), got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// Configurable tolerance: widen tolerance to allow lagging replicas.
|
|
func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("lagging")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 1000,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "lagging",
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 800, // lag=200
|
|
LastHeartbeat: time.Now(),
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
},
|
|
},
|
|
})
|
|
|
|
// Default tolerance (100): lag 200 > tolerance → ineligible.
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error with default tolerance")
|
|
}
|
|
|
|
// Widen tolerance to 250: lag 200 < tolerance → eligible.
|
|
r.SetPromotionLSNTolerance(250)
|
|
_, err = r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("expected success with widened tolerance: %v", err)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "lagging" {
|
|
t.Fatalf("expected 'lagging' promoted, got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// B-12: PromoteBestReplica rejects dead replica (server not in blockServers).
|
|
func TestRegistry_PromoteBestReplica_DeadServerIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// Intentionally do NOT mark "dead-replica" as block-capable.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "dead-replica",
|
|
Path: "/data/vol1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(),
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: dead replica should be rejected")
|
|
}
|
|
if !strings.Contains(err.Error(), "server_dead") {
|
|
t.Fatalf("error should mention server_dead, got: %v", err)
|
|
}
|
|
}
|
|
|
|
// B-12: Dead replica rejected but alive replica promoted when both exist.
|
|
func TestRegistry_PromoteBestReplica_DeadSkipped_AlivePromoted(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// Only mark s3 as alive.
|
|
r.MarkBlockCapable("s3")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "s2-dead", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
{Server: "s3", Path: "/r2.blk", HealthScore: 0.8, WALHeadLSN: 95, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
newEpoch, err := r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("PromoteBestReplica: %v", err)
|
|
}
|
|
if newEpoch != 2 {
|
|
t.Fatalf("newEpoch: got %d, want 2", newEpoch)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "s3" {
|
|
t.Fatalf("expected alive s3 promoted, got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// EvaluatePromotion returns read-only preflight without mutating registry.
|
|
func TestRegistry_EvaluatePromotion_Basic(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("replica1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 5,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "replica1", Path: "/r1.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
pf, err := r.EvaluatePromotion("vol1")
|
|
if err != nil {
|
|
t.Fatalf("EvaluatePromotion: %v", err)
|
|
}
|
|
if !pf.Promotable {
|
|
t.Fatalf("expected promotable, got reason: %s", pf.Reason)
|
|
}
|
|
if pf.Candidate == nil || pf.Candidate.Server != "replica1" {
|
|
t.Fatalf("expected candidate replica1, got %+v", pf.Candidate)
|
|
}
|
|
|
|
// Registry must be unmutated.
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "primary" {
|
|
t.Fatal("EvaluatePromotion should not mutate the registry")
|
|
}
|
|
if e.Epoch != 5 {
|
|
t.Fatal("EvaluatePromotion should not bump epoch")
|
|
}
|
|
}
|
|
|
|
// EvaluatePromotion with all replicas rejected.
|
|
func TestRegistry_EvaluatePromotion_AllRejected(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// No servers marked as block-capable.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "dead1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
{Server: "dead2", Path: "/r2.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
pf, err := r.EvaluatePromotion("vol1")
|
|
if err != nil {
|
|
t.Fatalf("EvaluatePromotion: %v", err)
|
|
}
|
|
if pf.Promotable {
|
|
t.Fatal("expected not promotable")
|
|
}
|
|
if len(pf.Rejections) != 2 {
|
|
t.Fatalf("expected 2 rejections, got %d", len(pf.Rejections))
|
|
}
|
|
for _, rej := range pf.Rejections {
|
|
if rej.Reason != "server_dead" {
|
|
t.Fatalf("expected server_dead rejection, got %q", rej.Reason)
|
|
}
|
|
}
|
|
}
|
|
|
|
// EvaluatePromotion for nonexistent volume.
|
|
func TestRegistry_EvaluatePromotion_NotFound(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
_, err := r.EvaluatePromotion("nonexistent")
|
|
if err == nil {
|
|
t.Fatal("expected error for nonexistent volume")
|
|
}
|
|
}
|
|
|
|
// Replica created but never heartbeated is not promotable.
|
|
func TestRegistry_PromoteBestReplica_NoHeartbeatIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("replica1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "replica1",
|
|
Path: "/r1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
// LastHeartbeat: zero — never heartbeated
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: replica with no heartbeat should be rejected")
|
|
}
|
|
if !strings.Contains(err.Error(), "no_heartbeat") {
|
|
t.Fatalf("error should mention no_heartbeat, got: %v", err)
|
|
}
|
|
}
|
|
|
|
// Replica with unset (zero) role is not promotable.
|
|
func TestRegistry_PromoteBestReplica_UnsetRoleIneligible(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("replica1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
LeaseTTL: 30 * time.Second,
|
|
WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{
|
|
Server: "replica1",
|
|
Path: "/r1.blk",
|
|
HealthScore: 1.0,
|
|
WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(),
|
|
// Role: 0 — unset/RoleNone
|
|
},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err == nil {
|
|
t.Fatal("expected error: replica with unset role should be rejected")
|
|
}
|
|
if !strings.Contains(err.Error(), "wrong_role") {
|
|
t.Fatalf("error should mention wrong_role, got: %v", err)
|
|
}
|
|
}
|
|
|
|
// PromoteBestReplica clears RebuildListenAddr on promotion (B-11 partial fix).
|
|
func TestRegistry_PromoteBestReplica_ClearsRebuildAddr(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("replica1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "primary",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
RebuildListenAddr: "primary:15000",
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "replica1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
_, err := r.PromoteBestReplica("vol1")
|
|
if err != nil {
|
|
t.Fatalf("PromoteBestReplica: %v", err)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.RebuildListenAddr != "" {
|
|
t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", e.RebuildListenAddr)
|
|
}
|
|
}
|
|
|
|
// --- LeaseGrants ---
|
|
|
|
func TestRegistry_LeaseGrants_PrimaryOnly(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// Register a primary volume.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "prim1",
|
|
VolumeServer: "s1:18080",
|
|
Path: "/data/prim1.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 5,
|
|
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
LeaseTTL: 30 * time.Second,
|
|
})
|
|
|
|
// Register a replica volume on the same server.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "repl1",
|
|
VolumeServer: "s2:18080",
|
|
Path: "/data/repl1.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 3,
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
|
Status: StatusActive,
|
|
})
|
|
r.AddReplica("repl1", ReplicaInfo{Server: "s1:18080", Path: "/data/repl1-replica.blk"})
|
|
|
|
// Register a none-role volume.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "none1",
|
|
VolumeServer: "s1:18080",
|
|
Path: "/data/none1.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 1,
|
|
Role: blockvol.RoleToWire(blockvol.RoleNone),
|
|
Status: StatusActive,
|
|
})
|
|
|
|
// LeaseGrants for s1 should only include prim1 (the primary).
|
|
grants := r.LeaseGrants("s1:18080", nil)
|
|
if len(grants) != 1 {
|
|
t.Fatalf("expected 1 grant, got %d: %+v", len(grants), grants)
|
|
}
|
|
if grants[0].Path != "/data/prim1.blk" {
|
|
t.Errorf("expected prim1 path, got %q", grants[0].Path)
|
|
}
|
|
if grants[0].Epoch != 5 {
|
|
t.Errorf("expected epoch 5, got %d", grants[0].Epoch)
|
|
}
|
|
if grants[0].LeaseTtlMs != 30000 {
|
|
t.Errorf("expected 30000ms TTL, got %d", grants[0].LeaseTtlMs)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_LeaseGrants_PendingExcluded(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "s1:18080",
|
|
Path: "/data/vol1.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 2,
|
|
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
LeaseTTL: 30 * time.Second,
|
|
})
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol2",
|
|
VolumeServer: "s1:18080",
|
|
Path: "/data/vol2.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 1,
|
|
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
LeaseTTL: 30 * time.Second,
|
|
})
|
|
|
|
// vol1 has a pending assignment — should be excluded.
|
|
pending := map[string]bool{"/data/vol1.blk": true}
|
|
grants := r.LeaseGrants("s1:18080", pending)
|
|
if len(grants) != 1 {
|
|
t.Fatalf("expected 1 grant (vol2 only), got %d: %+v", len(grants), grants)
|
|
}
|
|
if grants[0].Path != "/data/vol2.blk" {
|
|
t.Errorf("expected vol2 path, got %q", grants[0].Path)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_LeaseGrants_InactiveExcluded(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "pending-vol",
|
|
VolumeServer: "s1:18080",
|
|
Path: "/data/pending.blk",
|
|
SizeBytes: 1 << 30,
|
|
Epoch: 1,
|
|
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusPending, // not yet confirmed by heartbeat
|
|
LeaseTTL: 30 * time.Second,
|
|
})
|
|
|
|
grants := r.LeaseGrants("s1:18080", nil)
|
|
if len(grants) != 0 {
|
|
t.Fatalf("expected 0 grants for pending volume, got %d", len(grants))
|
|
}
|
|
}
|
|
|
|
func TestRegistry_LeaseGrants_UnknownServer(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
grants := r.LeaseGrants("unknown:18080", nil)
|
|
if grants != nil {
|
|
t.Fatalf("expected nil for unknown server, got %+v", grants)
|
|
}
|
|
}
|
|
|
|
// ============================================================
|
|
// CP11B-3 T2: IsBlockCapable + VolumesWithDeadPrimary
|
|
// ============================================================
|
|
|
|
func TestRegistry_IsBlockCapable(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("vs1:8080")
|
|
|
|
if !r.IsBlockCapable("vs1:8080") {
|
|
t.Fatal("vs1 should be block-capable")
|
|
}
|
|
if r.IsBlockCapable("vs2:8080") {
|
|
t.Fatal("vs2 should NOT be block-capable")
|
|
}
|
|
|
|
r.UnmarkBlockCapable("vs1:8080")
|
|
if r.IsBlockCapable("vs1:8080") {
|
|
t.Fatal("vs1 should no longer be block-capable after unmark")
|
|
}
|
|
}
|
|
|
|
func TestRegistry_VolumesWithDeadPrimary_Basic(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("vs1")
|
|
r.MarkBlockCapable("vs2")
|
|
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
|
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
Replicas: []ReplicaInfo{{Server: "vs2", Path: "/data/vol1.blk"}},
|
|
})
|
|
|
|
// Both alive → no orphans.
|
|
orphaned := r.VolumesWithDeadPrimary("vs2")
|
|
if len(orphaned) != 0 {
|
|
t.Fatalf("expected 0 orphaned volumes, got %d", len(orphaned))
|
|
}
|
|
|
|
// Kill primary.
|
|
r.UnmarkBlockCapable("vs1")
|
|
orphaned = r.VolumesWithDeadPrimary("vs2")
|
|
if len(orphaned) != 1 || orphaned[0] != "vol1" {
|
|
t.Fatalf("expected [vol1], got %v", orphaned)
|
|
}
|
|
}
|
|
|
|
func TestRegistry_VolumesWithDeadPrimary_PrimaryServer_NotIncluded(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("vs1")
|
|
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
|
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
})
|
|
|
|
// vs1 is the primary for vol1 — should NOT appear in orphaned list for vs1.
|
|
orphaned := r.VolumesWithDeadPrimary("vs1")
|
|
if len(orphaned) != 0 {
|
|
t.Fatalf("primary server should not appear in its own orphan list, got %v", orphaned)
|
|
}
|
|
}
|
|
|
|
// T6: EvaluatePromotion preflight includes primary liveness.
|
|
func TestRegistry_EvaluatePromotion_PrimaryDead_StillShowsCandidate(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("vs1")
|
|
r.MarkBlockCapable("vs2")
|
|
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
|
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{{
|
|
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
|
}},
|
|
})
|
|
|
|
// Kill primary but keep vs2 alive.
|
|
r.UnmarkBlockCapable("vs1")
|
|
|
|
pf, err := r.EvaluatePromotion("vol1")
|
|
if err != nil {
|
|
t.Fatalf("EvaluatePromotion: %v", err)
|
|
}
|
|
if !pf.Promotable {
|
|
t.Fatalf("should be promotable (vs2 alive), reason=%s", pf.Reason)
|
|
}
|
|
if pf.Candidate.Server != "vs2" {
|
|
t.Fatalf("candidate should be vs2, got %q", pf.Candidate.Server)
|
|
}
|
|
}
|
|
|
|
// ============================================================
|
|
// CP11B-3 T5: ManualPromote Dev Tests
|
|
// ============================================================
|
|
|
|
// T5: ManualPromote with empty target → auto-picks best candidate.
|
|
func TestRegistry_ManualPromote_AutoTarget(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("best")
|
|
r.MarkBlockCapable("worse")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second, WALHeadLSN: 100,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "worse", Path: "/r1.blk", HealthScore: 0.5, WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
{Server: "best", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
// Primary not block-capable → non-force should still pass (primary_alive gate won't trigger).
|
|
|
|
newEpoch, _, _, pf, err := r.ManualPromote("vol1", "", false)
|
|
if err != nil {
|
|
t.Fatalf("ManualPromote: %v", err)
|
|
}
|
|
if newEpoch != 2 {
|
|
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
|
}
|
|
if !pf.Promotable {
|
|
t.Fatal("should be promotable")
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "best" {
|
|
t.Fatalf("expected 'best' promoted, got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// T5: ManualPromote targets a specific replica (not the best by health).
|
|
func TestRegistry_ManualPromote_SpecificTarget(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("r1")
|
|
r.MarkBlockCapable("r2")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
{Server: "r2", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 50,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
// Target r2 specifically (worse health).
|
|
newEpoch, _, _, _, err := r.ManualPromote("vol1", "r2", false)
|
|
if err != nil {
|
|
t.Fatalf("ManualPromote: %v", err)
|
|
}
|
|
if newEpoch != 2 {
|
|
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "r2" {
|
|
t.Fatalf("expected r2 promoted (specific target), got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// T5: ManualPromote with non-existent target → error.
|
|
func TestRegistry_ManualPromote_TargetNotFound(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("r1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
_, _, _, pf, err := r.ManualPromote("vol1", "nonexistent", false)
|
|
if err == nil {
|
|
t.Fatal("expected error for nonexistent target")
|
|
}
|
|
if pf.Reason != "target_not_found" {
|
|
t.Fatalf("expected target_not_found, got %q", pf.Reason)
|
|
}
|
|
}
|
|
|
|
// T5: ManualPromote non-force with alive primary → rejected.
|
|
func TestRegistry_ManualPromote_PrimaryAlive_Rejected(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("primary")
|
|
r.MarkBlockCapable("r1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
_, _, _, pf, err := r.ManualPromote("vol1", "", false)
|
|
if err == nil {
|
|
t.Fatal("expected rejection when primary alive and !force")
|
|
}
|
|
if pf.Reason != "primary_alive" {
|
|
t.Fatalf("expected primary_alive, got %q", pf.Reason)
|
|
}
|
|
// Verify no mutation.
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "primary" {
|
|
t.Fatalf("primary should not change, got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// T5: Force bypasses stale heartbeat and primary_alive gates.
|
|
func TestRegistry_ManualPromote_Force_StaleHeartbeat(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.MarkBlockCapable("primary")
|
|
r.MarkBlockCapable("r1")
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
|
LastHeartbeat: time.Now().Add(-10 * time.Minute), // stale
|
|
Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
// Non-force: would fail on primary_alive.
|
|
// Force: bypasses primary_alive AND stale_heartbeat.
|
|
newEpoch, _, _, _, err := r.ManualPromote("vol1", "", true)
|
|
if err != nil {
|
|
t.Fatalf("force ManualPromote should succeed: %v", err)
|
|
}
|
|
if newEpoch != 2 {
|
|
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
|
}
|
|
e, _ := r.Lookup("vol1")
|
|
if e.VolumeServer != "r1" {
|
|
t.Fatalf("expected r1 promoted via force, got %q", e.VolumeServer)
|
|
}
|
|
}
|
|
|
|
// T5: Force does NOT bypass server_dead (hard gate).
|
|
func TestRegistry_ManualPromote_Force_StillRejectsDeadServer(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
// "dead" is NOT marked block-capable.
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
|
Epoch: 1, LeaseTTL: 30 * time.Second,
|
|
Replicas: []ReplicaInfo{
|
|
{Server: "dead", Path: "/r1.blk", HealthScore: 1.0,
|
|
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
|
},
|
|
})
|
|
|
|
_, _, _, pf, err := r.ManualPromote("vol1", "dead", true)
|
|
if err == nil {
|
|
t.Fatal("force should NOT bypass server_dead")
|
|
}
|
|
if len(pf.Rejections) == 0 || pf.Rejections[0].Reason != "server_dead" {
|
|
t.Fatalf("expected server_dead rejection, got %+v", pf.Rejections)
|
|
}
|
|
}
|
|
|
|
// --- Master restart reconciliation tests ---
|
|
|
|
func TestMasterRestart_HigherEpochWins(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First heartbeat from stale primary (epoch 5).
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 not found after first heartbeat")
|
|
}
|
|
if entry.VolumeServer != "vs1:9333" {
|
|
t.Fatalf("expected vs1 as initial primary, got %q", entry.VolumeServer)
|
|
}
|
|
|
|
// Second heartbeat from real primary (epoch 6 — post-failover).
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 6, Role: 1, WalHeadLsn: 150, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ = r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs2:9333" {
|
|
t.Fatalf("expected vs2 (higher epoch) as primary, got %q", entry.VolumeServer)
|
|
}
|
|
if entry.Epoch != 6 {
|
|
t.Fatalf("expected epoch 6, got %d", entry.Epoch)
|
|
}
|
|
// Old primary should be a replica.
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs1:9333" {
|
|
t.Fatalf("expected vs1 demoted to replica, got replicas=%+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_LowerEpochBecomesReplica(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First heartbeat from real primary (epoch 6).
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 6, Role: 1, WalHeadLsn: 150, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Second heartbeat from stale server (epoch 5).
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs2:9333" {
|
|
t.Fatalf("expected vs2 (higher epoch) to stay primary, got %q", entry.VolumeServer)
|
|
}
|
|
if entry.Epoch != 6 {
|
|
t.Fatalf("expected epoch 6, got %d", entry.Epoch)
|
|
}
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs1:9333" {
|
|
t.Fatalf("expected vs1 added as replica, got replicas=%+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_SameEpoch_HigherLSNWins(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First heartbeat: epoch 5, LSN 100.
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Second heartbeat: same epoch 5, higher LSN 200 — heuristic: this server is more recent.
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 200, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs2:9333" {
|
|
t.Fatalf("expected vs2 (higher LSN) as primary, got %q", entry.VolumeServer)
|
|
}
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs1:9333" {
|
|
t.Fatalf("expected vs1 demoted to replica, got replicas=%+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_SameEpoch_SameLSN_ExistingWins(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First heartbeat: epoch 5, LSN 100.
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Second heartbeat: same epoch 5, same LSN 100 — existing wins (deterministic).
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs1:9333" {
|
|
t.Fatalf("expected vs1 (existing, same LSN) to stay primary, got %q", entry.VolumeServer)
|
|
}
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs2:9333" {
|
|
t.Fatalf("expected vs2 added as replica, got replicas=%+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_ReplicaHeartbeat_AddedCorrectly(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// Primary heartbeat first.
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Replica heartbeat: same volume, lower epoch (stale replica never got promoted).
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 4, Role: 2, WalHeadLsn: 90, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs1:9333" {
|
|
t.Fatalf("primary should be vs1, got %q", entry.VolumeServer)
|
|
}
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs2:9333" {
|
|
t.Fatalf("expected vs2 as replica, got %+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_SameEpoch_RoleTrusted(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// First heartbeat: vs1 claims primary, epoch 5, LSN 50.
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 50, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Second heartbeat: vs2 claims replica (Role=2), same epoch 5, HIGHER LSN.
|
|
// Even though LSN is higher, it reports Role=replica, so it should NOT become primary.
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 2, WalHeadLsn: 200, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.VolumeServer != "vs1:9333" {
|
|
t.Fatalf("expected vs1 (claims primary) to stay primary, got %q", entry.VolumeServer)
|
|
}
|
|
if len(entry.Replicas) != 1 || entry.Replicas[0].Server != "vs2:9333" {
|
|
t.Fatalf("expected vs2 as replica, got %+v", entry.Replicas)
|
|
}
|
|
}
|
|
|
|
func TestMasterRestart_DuplicateReplicaHeartbeat_NoDuplicate(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
|
|
// Primary heartbeat.
|
|
r.UpdateFullHeartbeat("vs1:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 1, WalHeadLsn: 100, VolumeSize: 1 << 30},
|
|
}, "")
|
|
|
|
// Replica heartbeat — first time.
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 2, WalHeadLsn: 90, VolumeSize: 1 << 30, HealthScore: 0.8},
|
|
}, "")
|
|
|
|
// Same replica heartbeat again — should update, not duplicate.
|
|
r.UpdateFullHeartbeat("vs2:9333", []*master_pb.BlockVolumeInfoMessage{
|
|
{Path: "/data/vol1.blk", Epoch: 5, Role: 2, WalHeadLsn: 95, VolumeSize: 1 << 30, HealthScore: 0.9},
|
|
}, "")
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if len(entry.Replicas) != 1 {
|
|
t.Fatalf("expected 1 replica (no duplicates), got %d", len(entry.Replicas))
|
|
}
|
|
// Should have the updated values from the second heartbeat.
|
|
if entry.Replicas[0].WALHeadLSN != 95 {
|
|
t.Fatalf("expected updated LSN 95, got %d", entry.Replicas[0].WALHeadLSN)
|
|
}
|
|
if entry.Replicas[0].HealthScore != 0.9 {
|
|
t.Fatalf("expected updated health 0.9, got %f", entry.Replicas[0].HealthScore)
|
|
}
|
|
}
|
|
|
|
// --- Copy semantics tests (pointer escape fix) ---
|
|
|
|
func TestLookup_ReturnsCopy(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "vs1:9333",
|
|
Path: "/data/vol1.blk",
|
|
Epoch: 1,
|
|
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
|
Status: StatusActive,
|
|
})
|
|
|
|
// Get a copy via Lookup.
|
|
entry, ok := r.Lookup("vol1")
|
|
if !ok {
|
|
t.Fatal("vol1 not found")
|
|
}
|
|
|
|
// Mutate the copy.
|
|
entry.Epoch = 999
|
|
entry.VolumeServer = "mutated:9333"
|
|
|
|
// Registry must be unaffected.
|
|
original, _ := r.Lookup("vol1")
|
|
if original.Epoch != 1 {
|
|
t.Fatalf("Lookup copy mutation leaked: Epoch=%d, want 1", original.Epoch)
|
|
}
|
|
if original.VolumeServer != "vs1:9333" {
|
|
t.Fatalf("Lookup copy mutation leaked: VolumeServer=%q", original.VolumeServer)
|
|
}
|
|
}
|
|
|
|
func TestLookup_ReplicaSliceCopy(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1",
|
|
VolumeServer: "vs1:9333",
|
|
Path: "/data/vol1.blk",
|
|
Status: StatusActive,
|
|
Replicas: []ReplicaInfo{{Server: "vs2:9333", HealthScore: 1.0}},
|
|
})
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
// Mutate replica slice on the copy.
|
|
entry.Replicas[0].HealthScore = 0.0
|
|
entry.Replicas = append(entry.Replicas, ReplicaInfo{Server: "vs3:9333"})
|
|
|
|
// Registry must be unaffected.
|
|
original, _ := r.Lookup("vol1")
|
|
if len(original.Replicas) != 1 {
|
|
t.Fatalf("Replica slice mutation leaked: len=%d, want 1", len(original.Replicas))
|
|
}
|
|
if original.Replicas[0].HealthScore != 1.0 {
|
|
t.Fatalf("Replica HealthScore mutation leaked: %f", original.Replicas[0].HealthScore)
|
|
}
|
|
}
|
|
|
|
func TestListAll_ReturnsCopies(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "vs1:9333", Path: "/data/vol1.blk", Status: StatusActive,
|
|
})
|
|
|
|
entries := r.ListAll()
|
|
entries[0].Epoch = 999
|
|
|
|
original, _ := r.Lookup("vol1")
|
|
if original.Epoch != 0 {
|
|
t.Fatalf("ListAll copy mutation leaked: Epoch=%d", original.Epoch)
|
|
}
|
|
}
|
|
|
|
func TestUpdateEntry_MutatesRegistry(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
r.Register(&BlockVolumeEntry{
|
|
Name: "vol1", VolumeServer: "vs1:9333", Path: "/data/vol1.blk", Status: StatusActive,
|
|
})
|
|
|
|
r.UpdateEntry("vol1", func(e *BlockVolumeEntry) {
|
|
e.Preset = "database"
|
|
})
|
|
|
|
entry, _ := r.Lookup("vol1")
|
|
if entry.Preset != "database" {
|
|
t.Fatalf("UpdateEntry did not mutate: Preset=%q", entry.Preset)
|
|
}
|
|
}
|
|
|
|
func TestUpdateEntry_NotFound(t *testing.T) {
|
|
r := NewBlockVolumeRegistry()
|
|
err := r.UpdateEntry("nonexistent", func(e *BlockVolumeEntry) {
|
|
e.Epoch = 99
|
|
})
|
|
if err == nil {
|
|
t.Fatal("expected error for nonexistent volume")
|
|
}
|
|
}
|