Browse Source

fix: FilerClient supports multiple filer addresses for high availability

Critical fix: FilerClient now accepts []ServerAddress instead of single address
- Prevents mount failure when first filer is down (regression fix)
- Implements automatic failover to remaining filers
- Uses round-robin with atomic index tracking (same pattern as WFS.WithFilerClient)
- Retries all configured filers before giving up
- Updates successful filer index for future requests

Changes:
- NewFilerClient([]pb.ServerAddress, ...) instead of (pb.ServerAddress, ...)
- filerVolumeProvider references FilerClient for failover access
- LookupVolumeIds tries all filers with util.Retry pattern
- Mount passes all option.FilerAddresses for HA
- S3 wraps single filer in slice for API consistency

This restores the high availability that existed in the old implementation
where mount would automatically failover between configured filers.
pull/7518/head
chrislu 2 weeks ago
parent
commit
01b9b68ac5
  1. 3
      weed/mount/weedfs.go
  2. 3
      weed/s3api/s3api_server.go
  3. 82
      weed/wdclient/filer_client.go

3
weed/mount/weedfs.go

@@ -102,6 +102,7 @@ type WFS struct {
 func NewSeaweedFileSystem(option *Option) *WFS {
 	// Create FilerClient for efficient volume location caching
+	// Pass all filer addresses for high availability with automatic failover
 	// Configure URL preference based on VolumeServerAccess option
 	var opts *wdclient.FilerClientOption
 	if option.VolumeServerAccess == "publicUrl" {
@@ -111,7 +112,7 @@ func NewSeaweedFileSystem(option *Option) *WFS {
 	}
 	filerClient := wdclient.NewFilerClient(
-		option.FilerAddresses[0],
+		option.FilerAddresses, // Pass all filer addresses for HA
 		option.GrpcDialOption,
 		option.DataCenter,
 		opts,

3
weed/s3api/s3api_server.go

@@ -95,7 +95,8 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
 	// Initialize FilerClient for volume location caching
 	// Uses the battle-tested vidMap with filer-based lookups
-	filerClient := wdclient.NewFilerClient(option.Filer, option.GrpcDialOption, option.DataCenter)
+	// S3 API typically connects to a single filer, but wrap in slice for consistency
+	filerClient := wdclient.NewFilerClient([]pb.ServerAddress{option.Filer}, option.GrpcDialOption, option.DataCenter)
 	glog.V(0).Infof("S3 API initialized FilerClient for volume location caching")
 	s3ApiServer = &S3ApiServer{

82
weed/wdclient/filer_client.go

@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"strings"
+	"sync/atomic"
 	"time"

 	"google.golang.org/grpc"
@@ -11,6 +12,7 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb"
 	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
+	"github.com/seaweedfs/seaweedfs/weed/util"
 )

 // UrlPreference controls which URL to use for volume access
@@ -23,9 +25,11 @@ const (

 // FilerClient provides volume location services by querying a filer
 // It uses the shared vidMap cache for efficient lookups
+// Supports multiple filer addresses with automatic failover for high availability
 type FilerClient struct {
 	*vidMapClient
-	filerAddress   pb.ServerAddress
+	filerAddresses []pb.ServerAddress
+	filerIndex     int32 // atomic: current filer index for round-robin
 	grpcDialOption grpc.DialOption
 	urlPreference  UrlPreference
 	grpcTimeout    time.Duration
@@ -33,10 +37,9 @@ type FilerClient struct {
 }

 // filerVolumeProvider implements VolumeLocationProvider by querying filer
+// Supports multiple filer addresses with automatic failover
 type filerVolumeProvider struct {
-	filerAddress   pb.ServerAddress
-	grpcDialOption grpc.DialOption
-	grpcTimeout    time.Duration
+	filerClient *FilerClient
 }
// FilerClientOption holds optional configuration for FilerClient // FilerClientOption holds optional configuration for FilerClient
@@ -46,9 +49,14 @@ type FilerClientOption struct {
 	CacheSize   int           // Number of historical vidMap snapshots (0 = use default)
 }

-// NewFilerClient creates a new client that queries filer for volume locations
+// NewFilerClient creates a new client that queries filer(s) for volume locations
+// Supports multiple filer addresses for high availability with automatic failover
 // Uses sensible defaults: 5-second gRPC timeout, PreferUrl, DefaultVidMapCacheSize
-func NewFilerClient(filerAddress pb.ServerAddress, grpcDialOption grpc.DialOption, dataCenter string, opts ...*FilerClientOption) *FilerClient {
+func NewFilerClient(filerAddresses []pb.ServerAddress, grpcDialOption grpc.DialOption, dataCenter string, opts ...*FilerClientOption) *FilerClient {
+	if len(filerAddresses) == 0 {
+		glog.Fatal("NewFilerClient requires at least one filer address")
+	}
 	// Apply defaults
 	grpcTimeout := 5 * time.Second
 	urlPref := PreferUrl
@@ -68,20 +76,23 @@ func NewFilerClient(filerAddress pb.ServerAddress, grpcDialOption grpc.DialOptio
 		}
 	}

-	provider := &filerVolumeProvider{
-		filerAddress:   filerAddress,
-		grpcDialOption: grpcDialOption,
-		grpcTimeout:    grpcTimeout,
-	}
-
-	return &FilerClient{
-		vidMapClient:   newVidMapClient(provider, dataCenter, cacheSize),
-		filerAddress:   filerAddress,
+	fc := &FilerClient{
+		filerAddresses: filerAddresses,
+		filerIndex:     0,
 		grpcDialOption: grpcDialOption,
 		urlPreference:  urlPref,
 		grpcTimeout:    grpcTimeout,
 		cacheSize:      cacheSize,
 	}
+
+	// Create provider that references this FilerClient for failover support
+	provider := &filerVolumeProvider{
+		filerClient: fc,
+	}
+	fc.vidMapClient = newVidMapClient(provider, dataCenter, cacheSize)
+	return fc
 }
// GetLookupFileIdFunction returns a lookup function with URL preference handling // GetLookupFileIdFunction returns a lookup function with URL preference handling
@@ -123,21 +134,32 @@ func (fc *FilerClient) GetLookupFileIdFunction() LookupFileIdFunctionType {
 	}
 }

-// LookupVolumeIds queries the filer for volume locations
+// LookupVolumeIds queries the filer for volume locations with automatic failover
+// Tries all configured filer addresses until one succeeds (high availability)
 // Note: Unlike master's VolumeIdLocation, filer's Locations message doesn't currently have
 // an Error field. This implementation handles the current structure while being prepared
 // for future error reporting enhancements.
 func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []string) (map[string][]Location, error) {
+	fc := p.filerClient
 	result := make(map[string][]Location)

 	// Create a timeout context for the gRPC call
-	timeoutCtx, cancel := context.WithTimeout(ctx, p.grpcTimeout)
+	timeoutCtx, cancel := context.WithTimeout(ctx, fc.grpcTimeout)
 	defer cancel()

 	// Convert grpcTimeout to milliseconds for the signature parameter
-	timeoutMs := int32(p.grpcTimeout.Milliseconds())
+	timeoutMs := int32(fc.grpcTimeout.Milliseconds())

-	err := pb.WithGrpcFilerClient(false, timeoutMs, p.filerAddress, p.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
+	// Try all filer addresses with round-robin starting from current index
+	var lastErr error
+	err := util.Retry("filer volume lookup", func() error {
+		i := atomic.LoadInt32(&fc.filerIndex)
+		n := int32(len(fc.filerAddresses))
+		for x := int32(0); x < n; x++ {
+			filerAddress := fc.filerAddresses[i]
+			err := pb.WithGrpcFilerClient(false, timeoutMs, filerAddress, fc.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
 				resp, err := client.LookupVolume(timeoutCtx, &filer_pb.LookupVolumeRequest{
 					VolumeIds: volumeIds,
 				})
@@ -180,10 +202,28 @@ func (p *filerVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []s
 			})
 			if err != nil {
+				// gRPC error - this is a communication or server failure
+				glog.V(1).Infof("FilerClient: filer %s lookup failed (attempt %d/%d): %v", filerAddress, x+1, n, err)
+				lastErr = err
+				i++
+				if i >= n {
+					i = 0
+				}
+				continue
+			}
+
+			// Success - update the preferred filer index for next time
+			atomic.StoreInt32(&fc.filerIndex, i)
+			glog.V(3).Infof("FilerClient: looked up %d volumes on %s, found %d", len(volumeIds), filerAddress, len(result))
+			return nil
+		}
+
+		// All filers failed
+		return fmt.Errorf("all %d filer(s) failed, last error: %w", n, lastErr)
+	})
+
+	if err != nil {
 		return nil, fmt.Errorf("filer volume lookup failed for %d volume(s): %w", len(volumeIds), err)
 	}

-	glog.V(3).Infof("FilerClient: looked up %d volumes, found %d", len(volumeIds), len(result))
 	return result, nil
 }
Loading…
Cancel
Save