Browse Source

fix: restore observability instrumentation in MasterClient

During the refactoring, several important stats counters and logging
statements were accidentally removed from tryConnectToMaster. These are
critical for monitoring and debugging the health of master client connections.

Restored instrumentation:
1. stats.MasterClientConnectCounter("total") - tracks all connection attempts
2. stats.MasterClientConnectCounter(FailedToKeepConnected) - when KeepConnected stream fails
3. stats.MasterClientConnectCounter(FailedToReceive) - when Recv() fails in loop
4. stats.MasterClientConnectCounter(Failed) - when overall gprcErr occurs
5. stats.MasterClientConnectCounter(OnPeerUpdate) - when peer updates detected

Additionally restored peer update logging:
- "+ filer@host noticed group.type address" for node additions
- "- filer@host noticed group.type address" for node removals
- Only logs updates matching the client's FilerGroup for noise reduction

This information is valuable for:
- Monitoring cluster health and connection stability
- Debugging cluster membership changes
- Tracking master failover and reconnection patterns
- Identifying network issues between clients and masters

No functional changes - purely observability restoration.
pull/7518/head
chrislu 2 weeks ago
parent
commit
3e6ec1e727
  1. 13
      weed/wdclient/masterclient.go

13
weed/wdclient/masterclient.go

@ -157,6 +157,8 @@ func (mc *MasterClient) tryAllMasters(ctx context.Context) {
} }
func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.ServerAddress) (nextHintedLeader pb.ServerAddress) { func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.ServerAddress) (nextHintedLeader pb.ServerAddress) {
glog.V(1).Infof("%s.%s masterClient Connecting to master %v", mc.FilerGroup, mc.clientType, master)
stats.MasterClientConnectCounter.WithLabelValues("total").Inc()
gprcErr := pb.WithMasterClient(true, master, mc.grpcDialOption, false, func(client master_pb.SeaweedClient) error { gprcErr := pb.WithMasterClient(true, master, mc.grpcDialOption, false, func(client master_pb.SeaweedClient) error {
ctx, cancel := context.WithCancel(ctx) ctx, cancel := context.WithCancel(ctx)
defer cancel() defer cancel()
@ -164,6 +166,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
stream, err := client.KeepConnected(ctx) stream, err := client.KeepConnected(ctx)
if err != nil { if err != nil {
glog.V(1).Infof("%s.%s masterClient failed to keep connected to %s: %v", mc.FilerGroup, mc.clientType, master, err) glog.V(1).Infof("%s.%s masterClient failed to keep connected to %s: %v", mc.FilerGroup, mc.clientType, master, err)
stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToKeepConnected).Inc()
return err return err
} }
@ -205,6 +208,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
resp, err := stream.Recv() resp, err := stream.Recv()
if err != nil { if err != nil {
glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err) glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err)
stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
return err return err
} }
@ -227,6 +231,14 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
update := resp.ClusterNodeUpdate update := resp.ClusterNodeUpdate
mc.OnPeerUpdateLock.RLock() mc.OnPeerUpdateLock.RLock()
if mc.OnPeerUpdate != nil { if mc.OnPeerUpdate != nil {
if update.FilerGroup == mc.FilerGroup {
if update.IsAdd {
glog.V(0).Infof("+ %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
} else {
glog.V(0).Infof("- %s@%s noticed %s.%s %s\n", mc.clientType, mc.clientHost, update.FilerGroup, update.NodeType, update.Address)
}
stats.MasterClientConnectCounter.WithLabelValues(stats.OnPeerUpdate).Inc()
}
mc.OnPeerUpdate(update, time.Now()) mc.OnPeerUpdate(update, time.Now())
} }
mc.OnPeerUpdateLock.RUnlock() mc.OnPeerUpdateLock.RUnlock()
@ -238,6 +250,7 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
} }
}) })
if gprcErr != nil { if gprcErr != nil {
stats.MasterClientConnectCounter.WithLabelValues(stats.Failed).Inc()
glog.V(1).Infof("%s.%s masterClient failed to connect with master %v: %v", mc.FilerGroup, mc.clientType, master, gprcErr) glog.V(1).Infof("%s.%s masterClient failed to connect with master %v: %v", mc.FilerGroup, mc.clientType, master, gprcErr)
} }
return nextHintedLeader return nextHintedLeader

Loading…
Cancel
Save