From 3e6ec1e7271786ff94d1411f019a2c8ea700697f Mon Sep 17 00:00:00 2001
From: chrislu
Date: Thu, 20 Nov 2025 13:54:02 -0800
Subject: [PATCH] fix: restore observability instrumentation in MasterClient

During the refactoring, several important stats counters and logging
statements were accidentally removed from tryConnectToMaster. These are
critical for monitoring and debugging the health of master client
connections.

Restored instrumentation:
1. stats.MasterClientConnectCounter("total") - tracks all connection attempts
2. stats.MasterClientConnectCounter(FailedToKeepConnected) - when KeepConnected stream fails
3. stats.MasterClientConnectCounter(FailedToReceive) - when Recv() fails in loop
4. stats.MasterClientConnectCounter(Failed) - when overall gprcErr occurs
5. stats.MasterClientConnectCounter(OnPeerUpdate) - when peer updates detected

Additionally restored peer update logging:
- "+ filer@host noticed group.type address" for node additions
- "- filer@host noticed group.type address" for node removals
- Only logs updates matching the client's FilerGroup for noise reduction

This information is valuable for:
- Monitoring cluster health and connection stability
- Debugging cluster membership changes
- Tracking master failover and reconnection patterns
- Identifying network issues between clients and masters

No functional changes - purely observability restoration.
--- weed/wdclient/masterclient.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/weed/wdclient/masterclient.go b/weed/wdclient/masterclient.go index 2672bb5b4..5a1b4f57c 100644 --- a/weed/wdclient/masterclient.go +++ b/weed/wdclient/masterclient.go @@ -157,6 +157,8 @@ func (mc *MasterClient) tryAllMasters(ctx context.Context) { } func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.ServerAddress) (nextHintedLeader pb.ServerAddress) { + glog.V(1).Infof("%s.%s masterClient Connecting to master %v", mc.FilerGroup, mc.clientType, master) + stats.MasterClientConnectCounter.WithLabelValues("total").Inc() gprcErr := pb.WithMasterClient(true, master, mc.grpcDialOption, false, func(client master_pb.SeaweedClient) error { ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -164,6 +166,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server stream, err := client.KeepConnected(ctx) if err != nil { glog.V(1).Infof("%s.%s masterClient failed to keep connected to %s: %v", mc.FilerGroup, mc.clientType, master, err) + stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToKeepConnected).Inc() return err } @@ -205,6 +208,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server resp, err := stream.Recv() if err != nil { glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err) + stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc() return err } @@ -227,6 +231,14 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server update := resp.ClusterNodeUpdate mc.OnPeerUpdateLock.RLock() if mc.OnPeerUpdate != nil { + if update.FilerGroup == mc.FilerGroup { + if update.IsAdd { + glog.V(0).Infof("+ %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address) + } else { + glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, 
mc.clientHost, update.FilerGroup, update.NodeType, update.Address) + } + stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc() + } mc.OnPeerUpdate(update, time.Now()) } mc.OnPeerUpdateLock.RUnlock() @@ -238,6 +250,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server } }) if gprcErr != nil { + stats.MasterClientConnectCounter.WithLabelValues(stats.Failed).Inc() glog.V(1).Infof("%s.%s masterClient failed to connect with master %v: %v", mc.FilerGroup, mc.clientType, master, gprcErr) } return nextHintedLeader