Browse Source
Add cluster.raft.leader.transfer command for graceful leader change (#7819)
Add cluster.raft.leader.transfer command for graceful leader change (#7819)
* proto: add RaftLeadershipTransfer RPC for forced leader change Add new gRPC RPC and messages for leadership transfer: - RaftLeadershipTransferRequest: optional target_id and target_address - RaftLeadershipTransferResponse: previous_leader and new_leader This enables graceful leadership transfer before master maintenance, reducing errors in filers during planned maintenance windows. Ref: https://github.com/seaweedfs/seaweedfs/issues/7527 * proto: regenerate Go files for RaftLeadershipTransfer Generated from master.proto changes. * master: implement RaftLeadershipTransfer gRPC handler Add gRPC handler for leadership transfer with support for: - Transfer to any eligible follower (when target_id is empty) - Transfer to a specific server (when target_id and target_address are provided) Uses hashicorp/raft LeadershipTransfer() and LeadershipTransferToServer() APIs. Returns the previous and new leader in the response. * shell: add cluster.raft.leader.transfer command Add weed shell command for graceful leadership transfer: - Displays current cluster status before transfer - Supports auto-selection of target (any eligible follower) - Supports targeted transfer with -id and -address flags - Provides clear feedback on success/failure with troubleshooting tips Usage: cluster.raft.leader.transfer cluster.raft.leader.transfer -id <server_id> -address <grpc_address> * master: add unit tests for raft gRPC handlers Add tests covering: - RaftLeadershipTransfer with no raft initialized - RaftLeadershipTransfer with target_id but no address - RaftListClusterServers with no raft initialized - RaftAddServer with no raft initialized - RaftRemoveServer with no raft initialized These tests verify error handling when raft is not configured. 
* shell: add tests for cluster.raft.leader.transfer command Add tests covering: - Command name and help text validation - HasTag returns false for ResourceHeavy - Validation of -id without -address - Argument parsing with unknown flags * master: clarify that leadership transfer requires -raftHashicorp The default raft implementation (seaweedfs/raft, a goraft fork) does not support graceful leadership transfer. This feature is only available when using hashicorp raft (-raftHashicorp=true). Update error messages and help text to make this requirement clear: - gRPC handler returns specific error for goraft users - Shell command help text notes the requirement - Added test for goraft case * test: use strings.Contains instead of custom helper Replace custom contains/containsHelper functions with the standard library strings.Contains for better maintainability. * shell: return flag parsing errors instead of swallowing them - Return the error from flag.Parse() instead of returning nil - Update test to explicitly assert error for unknown flags * test: document integration test scenarios for Raft leadership transfer Add comments explaining: - Why these unit tests only cover 'Raft not initialized' scenarios - What integration tests should cover (with multi-master cluster) - hashicorp/raft uses concrete types that cannot be easily mocked * fix: address reviewer feedback on tests and leader routing - Remove misleading tests that couldn't properly validate their documented behavior without a real Raft cluster: - TestRaftLeadershipTransfer_GoraftNotSupported - TestRaftLeadershipTransfer_ValidationTargetIdWithoutAddress - Change WithClient(false) to WithClient(true) for RaftLeadershipTransfer RPC to ensure the request is routed to the current leader * Improve cluster.raft.transferLeader command - Rename command from cluster.raft.leader.transfer to cluster.raft.transferLeader - Add symmetric validation: -id and -address must be specified together - Handle case where same leader is 
re-elected after transfer - Add test for -address without -id validation - Add docker compose file for 5-master raft cluster testing
master
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 621 additions and 67 deletions
-
11weed/pb/master.proto
-
250weed/pb/master_pb/master.pb.go
-
38weed/pb/master_pb/master_grpc.pb.go
-
47weed/server/master_grpc_server_raft.go
-
109weed/server/master_grpc_server_raft_test.go
-
144weed/shell/command_cluster_raft_leader_transfer.go
-
89weed/shell/command_cluster_raft_leader_transfer_test.go
@ -0,0 +1,109 @@ |
|||
package weed_server |
|||
|
|||
// These tests cover the Raft gRPC handlers in scenarios where Raft is not initialized
|
|||
// (single master mode). Testing with an initialized Raft cluster requires integration
|
|||
// tests with a multi-master setup, as hashicorp/raft uses concrete types that cannot
|
|||
// be easily mocked.
|
|||
//
|
|||
// Integration tests for RaftLeadershipTransfer should cover:
|
|||
// - Successful leadership transfer to any follower (auto-selection)
|
|||
// - Successful leadership transfer to a specific target server
|
|||
// - Error when caller is not the current leader
|
|||
// - Error when target server is not a voting member
|
|||
// - Error when target server is unreachable
|
|||
//
|
|||
// These scenarios are best tested with test/multi_master/ integration tests
|
|||
// using a real 3-node master cluster with -raftHashicorp=true.
|
|||
|
|||
import ( |
|||
"context" |
|||
"strings" |
|||
"testing" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/topology" |
|||
) |
|||
|
|||
func TestRaftLeadershipTransfer_NoRaft(t *testing.T) { |
|||
// Test case: raft not initialized (single master mode)
|
|||
ms := &MasterServer{ |
|||
Topo: topology.NewTopology("test", nil, 0, 0, false), |
|||
} |
|||
|
|||
ctx := context.Background() |
|||
req := &master_pb.RaftLeadershipTransferRequest{} |
|||
|
|||
_, err := ms.RaftLeadershipTransfer(ctx, req) |
|||
if err == nil { |
|||
t.Error("expected error when raft is not initialized") |
|||
} |
|||
|
|||
expectedMsg := "single master mode" |
|||
if err != nil && !strings.Contains(err.Error(), expectedMsg) { |
|||
t.Errorf("expected error message to contain %q, got %q", expectedMsg, err.Error()) |
|||
} |
|||
} |
|||
|
|||
func TestRaftListClusterServers_NoRaft(t *testing.T) { |
|||
// Test case: raft not initialized returns empty response
|
|||
ms := &MasterServer{ |
|||
Topo: topology.NewTopology("test", nil, 0, 0, false), |
|||
} |
|||
|
|||
ctx := context.Background() |
|||
req := &master_pb.RaftListClusterServersRequest{} |
|||
|
|||
resp, err := ms.RaftListClusterServers(ctx, req) |
|||
if err != nil { |
|||
t.Errorf("unexpected error: %v", err) |
|||
} |
|||
if resp == nil { |
|||
t.Error("expected non-nil response") |
|||
} |
|||
if len(resp.ClusterServers) != 0 { |
|||
t.Errorf("expected empty cluster servers, got %d", len(resp.ClusterServers)) |
|||
} |
|||
} |
|||
|
|||
func TestRaftAddServer_NoRaft(t *testing.T) { |
|||
// Test case: raft not initialized returns empty response
|
|||
ms := &MasterServer{ |
|||
Topo: topology.NewTopology("test", nil, 0, 0, false), |
|||
} |
|||
|
|||
ctx := context.Background() |
|||
req := &master_pb.RaftAddServerRequest{ |
|||
Id: "test-server", |
|||
Address: "localhost:19333", |
|||
Voter: true, |
|||
} |
|||
|
|||
resp, err := ms.RaftAddServer(ctx, req) |
|||
if err != nil { |
|||
t.Errorf("unexpected error: %v", err) |
|||
} |
|||
if resp == nil { |
|||
t.Error("expected non-nil response") |
|||
} |
|||
} |
|||
|
|||
func TestRaftRemoveServer_NoRaft(t *testing.T) { |
|||
// Test case: raft not initialized returns empty response
|
|||
ms := &MasterServer{ |
|||
Topo: topology.NewTopology("test", nil, 0, 0, false), |
|||
} |
|||
|
|||
ctx := context.Background() |
|||
req := &master_pb.RaftRemoveServerRequest{ |
|||
Id: "test-server", |
|||
Force: true, |
|||
} |
|||
|
|||
resp, err := ms.RaftRemoveServer(ctx, req) |
|||
if err != nil { |
|||
t.Errorf("unexpected error: %v", err) |
|||
} |
|||
if resp == nil { |
|||
t.Error("expected non-nil response") |
|||
} |
|||
} |
|||
@ -0,0 +1,144 @@ |
|||
package shell |
|||
|
|||
import ( |
|||
"context" |
|||
"flag" |
|||
"fmt" |
|||
"io" |
|||
"time" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|||
) |
|||
|
|||
func init() { |
|||
Commands = append(Commands, &commandRaftLeaderTransfer{}) |
|||
} |
|||
|
|||
// commandRaftLeaderTransfer implements the shell command that requests a raft
// leadership transfer. It carries no state.
type commandRaftLeaderTransfer struct{}

// Name returns the name under which the command is invoked in the shell.
func (c *commandRaftLeaderTransfer) Name() string {
	return "cluster.raft.transferLeader"
}
|||
|
|||
func (c *commandRaftLeaderTransfer) Help() string { |
|||
return `transfer raft leadership to another master server |
|||
|
|||
This command initiates a graceful leadership transfer from the current |
|||
leader to another server. Use this before performing maintenance on |
|||
the current leader to reduce errors in filers and other components. |
|||
|
|||
Examples: |
|||
# Transfer to any eligible follower (auto-selection) |
|||
cluster.raft.transferLeader |
|||
|
|||
# Transfer to a specific server |
|||
cluster.raft.transferLeader -id <server_id> -address <server_grpc_address> |
|||
|
|||
Notes: |
|||
- Requires hashicorp raft (-raftHashicorp=true on master) |
|||
- This command must be sent to the current leader |
|||
- The target server must be a voting member of the raft cluster |
|||
- Use 'cluster.raft.ps' to list available servers and identify the leader |
|||
` |
|||
} |
|||
|
|||
func (c *commandRaftLeaderTransfer) HasTag(CommandTag) bool { |
|||
return false |
|||
} |
|||
|
|||
func (c *commandRaftLeaderTransfer) Do(args []string, commandEnv *CommandEnv, writer io.Writer) error { |
|||
leaderTransferCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError) |
|||
targetId := leaderTransferCommand.String("id", "", "target server id (must be used with -address)") |
|||
targetAddress := leaderTransferCommand.String("address", "", "target server grpc address (must be used with -id)") |
|||
|
|||
if err := leaderTransferCommand.Parse(args); err != nil { |
|||
return err |
|||
} |
|||
|
|||
// Validate: id and address must be specified together
|
|||
if *targetId != "" && *targetAddress == "" { |
|||
return fmt.Errorf("-address is required when -id is specified") |
|||
} |
|||
if *targetAddress != "" && *targetId == "" { |
|||
return fmt.Errorf("-id is required when -address is specified") |
|||
} |
|||
|
|||
// First, show current cluster status
|
|||
fmt.Fprintf(writer, "Checking current raft cluster status...\n") |
|||
|
|||
var currentLeader string |
|||
err := commandEnv.MasterClient.WithClient(false, func(client master_pb.SeaweedClient) error { |
|||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) |
|||
defer cancel() |
|||
|
|||
resp, err := client.RaftListClusterServers(ctx, &master_pb.RaftListClusterServersRequest{}) |
|||
if err != nil { |
|||
return fmt.Errorf("failed to list cluster servers: %v", err) |
|||
} |
|||
|
|||
if len(resp.ClusterServers) == 0 { |
|||
fmt.Fprintf(writer, "No raft cluster configured (single master mode)\n") |
|||
return fmt.Errorf("leadership transfer not available in single master mode") |
|||
} |
|||
|
|||
fmt.Fprintf(writer, "Raft cluster has %d servers:\n", len(resp.ClusterServers)) |
|||
for _, server := range resp.ClusterServers { |
|||
suffix := "" |
|||
if server.IsLeader { |
|||
suffix = " <- current leader" |
|||
currentLeader = server.Id |
|||
} |
|||
fmt.Fprintf(writer, " %s %s [%s]%s\n", server.Id, server.Address, server.Suffrage, suffix) |
|||
} |
|||
return nil |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
if currentLeader == "" { |
|||
return fmt.Errorf("no leader found in cluster") |
|||
} |
|||
|
|||
// Perform the transfer
|
|||
targetDesc := "any eligible follower" |
|||
if *targetId != "" { |
|||
targetDesc = fmt.Sprintf("server %s (%s)", *targetId, *targetAddress) |
|||
} |
|||
fmt.Fprintf(writer, "\nTransferring leadership from %s to %s...\n", currentLeader, targetDesc) |
|||
|
|||
err = commandEnv.MasterClient.WithClient(true, func(client master_pb.SeaweedClient) error { |
|||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) |
|||
defer cancel() |
|||
|
|||
resp, err := client.RaftLeadershipTransfer(ctx, &master_pb.RaftLeadershipTransferRequest{ |
|||
TargetId: *targetId, |
|||
TargetAddress: *targetAddress, |
|||
}) |
|||
if err != nil { |
|||
return fmt.Errorf("leadership transfer failed: %v", err) |
|||
} |
|||
|
|||
if resp.PreviousLeader != resp.NewLeader { |
|||
fmt.Fprintf(writer, "Leadership successfully transferred.\n") |
|||
fmt.Fprintf(writer, " Previous leader: %s\n", resp.PreviousLeader) |
|||
fmt.Fprintf(writer, " New leader: %s\n", resp.NewLeader) |
|||
} else { |
|||
fmt.Fprintf(writer, "Leadership transfer initiated, but the same leader was re-elected.\n") |
|||
fmt.Fprintf(writer, " Current leader: %s\n", resp.NewLeader) |
|||
} |
|||
return nil |
|||
}) |
|||
|
|||
if err != nil { |
|||
fmt.Fprintf(writer, "\nLeadership transfer failed: %v\n", err) |
|||
fmt.Fprintf(writer, "\nTroubleshooting:\n") |
|||
fmt.Fprintf(writer, " - Ensure you are connected to the current leader\n") |
|||
fmt.Fprintf(writer, " - Ensure target server is a voting member (use 'cluster.raft.ps')\n") |
|||
fmt.Fprintf(writer, " - Ensure target server is healthy and reachable\n") |
|||
return err |
|||
} |
|||
|
|||
return nil |
|||
} |
|||
|
|||
@ -0,0 +1,89 @@ |
|||
package shell |
|||
|
|||
import ( |
|||
"bytes" |
|||
"strings" |
|||
"testing" |
|||
) |
|||
|
|||
func TestRaftLeaderTransfer_Name(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
expected := "cluster.raft.transferLeader" |
|||
if cmd.Name() != expected { |
|||
t.Errorf("expected name %q, got %q", expected, cmd.Name()) |
|||
} |
|||
} |
|||
|
|||
func TestRaftLeaderTransfer_Help(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
help := cmd.Help() |
|||
|
|||
// Verify help text contains key information
|
|||
expectedPhrases := []string{ |
|||
"transfer raft leadership", |
|||
"cluster.raft.transferLeader", |
|||
"-id", |
|||
"-address", |
|||
"cluster.raft.ps", |
|||
"-raftHashicorp", |
|||
} |
|||
|
|||
for _, phrase := range expectedPhrases { |
|||
if !strings.Contains(help, phrase) { |
|||
t.Errorf("help text should contain %q", phrase) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestRaftLeaderTransfer_HasTag(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
// The command should not have any special tags
|
|||
if cmd.HasTag(ResourceHeavy) { |
|||
t.Error("expected HasTag to return false for ResourceHeavy") |
|||
} |
|||
} |
|||
|
|||
func TestRaftLeaderTransfer_ValidateTargetIdWithoutAddress(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
var buf bytes.Buffer |
|||
|
|||
// Create a mock command environment - this will fail because no master client
|
|||
// but we can verify argument parsing
|
|||
err := cmd.Do([]string{"-id", "test-server"}, nil, &buf) |
|||
|
|||
// Should fail because -address is required when -id is specified
|
|||
if err == nil { |
|||
t.Error("expected error when -id is specified without -address") |
|||
} |
|||
if err != nil && !strings.Contains(err.Error(), "-address is required") { |
|||
t.Errorf("expected error about missing -address, got: %v", err) |
|||
} |
|||
} |
|||
|
|||
func TestRaftLeaderTransfer_ValidateTargetAddressWithoutId(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
var buf bytes.Buffer |
|||
|
|||
// Verify argument parsing - address without id should fail
|
|||
err := cmd.Do([]string{"-address", "localhost:19333"}, nil, &buf) |
|||
|
|||
// Should fail because -id is required when -address is specified
|
|||
if err == nil { |
|||
t.Error("expected error when -address is specified without -id") |
|||
} |
|||
if err != nil && !strings.Contains(err.Error(), "-id is required") { |
|||
t.Errorf("expected error about missing -id, got: %v", err) |
|||
} |
|||
} |
|||
|
|||
func TestRaftLeaderTransfer_UnknownFlag(t *testing.T) { |
|||
cmd := &commandRaftLeaderTransfer{} |
|||
var buf bytes.Buffer |
|||
|
|||
// Unknown flag should return an error
|
|||
err := cmd.Do([]string{"-unknown-flag"}, nil, &buf) |
|||
if err == nil { |
|||
t.Error("expected error for unknown flag") |
|||
} |
|||
} |
|||
|
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue