Browse Source

fix(s3): start KeepConnectedToMaster for filer discovery with filerGroup (#7732)

Fixes #7721

When S3 server is configured with a filerGroup, it creates a MasterClient
to enable dynamic filer discovery. However, the KeepConnectedToMaster()
background goroutine was never started, causing GetMaster() to block
indefinitely in WaitUntilConnected().

This resulted in the log message:
  WaitUntilConnected still waiting for master connection (attempt N)...

being logged repeatedly every ~20 seconds.

The fix adds the missing 'go masterClient.KeepConnectedToMaster(ctx)'
call to properly establish the connection to master servers.

Also adds unit tests to verify WaitUntilConnected respects context
cancellation.
pull/7736/head
Chris Lu 4 weeks ago
committed by GitHub
parent
commit
d80d8be012
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 2
      weed/s3api/s3api_server.go
  2. 104
      weed/wdclient/masterclient_test.go

2
weed/s3api/s3api_server.go

@ -115,6 +115,8 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
masterMap[fmt.Sprintf("master%d", i)] = addr masterMap[fmt.Sprintf("master%d", i)] = addr
} }
masterClient := wdclient.NewMasterClient(option.GrpcDialOption, option.FilerGroup, cluster.S3Type, "", "", "", *pb.NewServiceDiscoveryFromMap(masterMap)) masterClient := wdclient.NewMasterClient(option.GrpcDialOption, option.FilerGroup, cluster.S3Type, "", "", "", *pb.NewServiceDiscoveryFromMap(masterMap))
// Start the master client connection loop - required for GetMaster() to work
go masterClient.KeepConnectedToMaster(context.Background())
filerClient = wdclient.NewFilerClient(option.Filers, option.GrpcDialOption, option.DataCenter, &wdclient.FilerClientOption{ filerClient = wdclient.NewFilerClient(option.Filers, option.GrpcDialOption, option.DataCenter, &wdclient.FilerClientOption{
MasterClient: masterClient, MasterClient: masterClient,

104
weed/wdclient/masterclient_test.go

@ -0,0 +1,104 @@
package wdclient
import (
"context"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb"
"google.golang.org/grpc"
)
// TestWaitUntilConnectedWithoutKeepConnected verifies that WaitUntilConnected
// respects context cancellation when KeepConnectedToMaster is not running.
// This tests the fix for https://github.com/seaweedfs/seaweedfs/issues/7721
func TestWaitUntilConnectedWithoutKeepConnected(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// Without KeepConnectedToMaster running, WaitUntilConnected should
// respect context cancellation and not block forever
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
start := time.Now()
mc.WaitUntilConnected(ctx)
elapsed := time.Since(start)
// Should have returned due to context timeout, not blocked forever
if elapsed > 200*time.Millisecond {
t.Errorf("WaitUntilConnected blocked for %v, expected to return on context timeout", elapsed)
}
// GetMaster should return empty since we never connected
master := mc.getCurrentMaster()
if master != "" {
t.Errorf("Expected empty master, got %s", master)
}
}
// TestWaitUntilConnectedReturnsImmediatelyWhenConnected verifies that
// WaitUntilConnected returns immediately when a master is already set.
func TestWaitUntilConnectedReturnsImmediatelyWhenConnected(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// Simulate that KeepConnectedToMaster has already established a connection
mc.setCurrentMaster("localhost:9333")
ctx := context.Background()
start := time.Now()
mc.WaitUntilConnected(ctx)
elapsed := time.Since(start)
// Should return almost immediately (< 10ms)
if elapsed > 10*time.Millisecond {
t.Errorf("WaitUntilConnected took %v when master was already set, expected immediate return", elapsed)
}
// Verify master is returned
master := mc.getCurrentMaster()
if master != "localhost:9333" {
t.Errorf("Expected master localhost:9333, got %s", master)
}
}
// TestGetMasterRespectsContextCancellation verifies that GetMaster
// respects context cancellation and doesn't block forever.
func TestGetMasterRespectsContextCancellation(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// GetMaster calls WaitUntilConnected internally
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
start := time.Now()
master := mc.GetMaster(ctx)
elapsed := time.Since(start)
// Should return on context timeout
if elapsed > 200*time.Millisecond {
t.Errorf("GetMaster blocked for %v, expected to return on context timeout", elapsed)
}
// Master should be empty since we never connected
if master != "" {
t.Errorf("Expected empty master when not connected, got %s", master)
}
}
// TestMasterClientFilerGroupLogging verifies the FilerGroup is properly set
// and would be logged correctly (regression test for issue #7721 log message format)
func TestMasterClientFilerGroupLogging(t *testing.T) {
filerGroup := "filer_1"
clientType := "s3"
mc := NewMasterClient(grpc.EmptyDialOption{}, filerGroup, clientType, "", "", "", pb.ServerDiscovery{})
if mc.FilerGroup != filerGroup {
t.Errorf("Expected FilerGroup %s, got %s", filerGroup, mc.FilerGroup)
}
if mc.clientType != clientType {
t.Errorf("Expected clientType %s, got %s", clientType, mc.clientType)
}
}
Loading…
Cancel
Save