From b0058af417e281032004d824602d0b276ba64360 Mon Sep 17 00:00:00 2001 From: chrislu Date: Thu, 20 Nov 2025 16:34:06 -0800 Subject: [PATCH] fix: IAM server must start KeepConnectedToMaster for masterClient usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IAM server creates and uses a MasterClient but never started KeepConnectedToMaster, which could cause blocking if IAM config files have chunks requiring volume lookups. Problem flow: NewIamApiServerWithStore() → creates masterClient → ❌ NEVER starts KeepConnectedToMaster GetS3ApiConfigurationFromFiler() → filer.ReadEntry(iama.masterClient, ...) → StreamContent(masterClient, ...) if file has chunks → masterClient.GetLookupFileIdFunction() → GetMaster(ctx) ← BLOCKS indefinitely waiting for connection! While IAM config files (identity & policies) are typically small and stored inline without chunks, the code path exists and would block if the files ever had chunks. Fix: Start KeepConnectedToMaster in background goroutine right after creating masterClient, following the documented pattern: mc := wdclient.NewMasterClient(...) go mc.KeepConnectedToMaster(ctx) This ensures masterClient is usable if ReadEntry ever needs to stream chunked content from volume servers. Note: This bug was dormant because IAM config files are small (<256 bytes) and SeaweedFS stores small files inline in Entry.Content, not as chunks. The bug would only manifest if: - IAM config grew > 256 bytes (inline threshold) - Config was stored as chunks on volume servers - ReadEntry called StreamContent - GetMaster blocked indefinitely Now all 9 production MasterClient instances correctly follow the pattern. --- weed/iamapi/iamapi_server.go | 9 ++++++++- weed/wdclient/filer_client.go | 10 +++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/weed/iamapi/iamapi_server.go b/weed/iamapi/iamapi_server.go index cf507ee82..82977458d 100644 --- a/weed/iamapi/iamapi_server.go +++ b/weed/iamapi/iamapi_server.go @@ -56,9 +56,16 @@ func NewIamApiServer(router *mux.Router, option *IamServerOption) (iamApiServer } func NewIamApiServerWithStore(router *mux.Router, option *IamServerOption, explicitStore string) (iamApiServer *IamApiServer, err error) { + masterClient := wdclient.NewMasterClient(option.GrpcDialOption, "", "iam", "", "", "", *pb.NewServiceDiscoveryFromMap(option.Masters)) + + // Start KeepConnectedToMaster for volume location lookups + // IAM config files are typically small and inline, but if they ever have chunks, + // ReadEntry→StreamContent needs masterClient for volume lookups + go masterClient.KeepConnectedToMaster(context.Background()) + configure := &IamS3ApiConfigure{ option: option, - masterClient: wdclient.NewMasterClient(option.GrpcDialOption, "", "iam", "", "", "", *pb.NewServiceDiscoveryFromMap(option.Masters)), + masterClient: masterClient, } s3ApiConfigure = configure diff --git a/weed/wdclient/filer_client.go b/weed/wdclient/filer_client.go index 038432d1a..bb2e9e5a4 100644 --- a/weed/wdclient/filer_client.go +++ b/weed/wdclient/filer_client.go @@ -156,7 +156,7 @@ func (fc *FilerClient) GetLookupFileIdFunction() LookupFileIdFunctionType { } // isRetryableGrpcError checks if a gRPC error is transient and should be retried -// +// // Note on codes.Aborted: While Aborted can indicate application-level conflicts // (e.g., transaction failures), in the context of volume location lookups (which // are simple read-only operations with no transactions), Aborted is more likely @@ -166,14 +166,14 @@ func isRetryableGrpcError(err error) bool { if err == nil { return false } - + // Check gRPC status code st, ok := status.FromError(err) if ok { switch st.Code() { - case codes.Unavailable: // Server unavailable (temporary) + case codes.Unavailable: // Server unavailable (temporary) return true - case codes.DeadlineExceeded: // Request timeout + case codes.DeadlineExceeded: // Request timeout return true case codes.ResourceExhausted: // Rate limited or overloaded return true @@ -184,7 +184,7 @@ func isRetryableGrpcError(err error) bool { return true } } - + // Fallback to string matching for non-gRPC errors (e.g., network errors) errStr := err.Error() return strings.Contains(errStr, "transport") ||