fix: volume server healthz now checks local conditions only

This fixes issue #6823, where shutting down a single volume server caused other, healthy volume servers to fail their health checks and be restarted by Kubernetes, resulting in a cascading failure.

Previously, the healthz handler checked whether every replicated volume could reach its remote replicas via GetWritableRemoteReplications(). When a volume server went down, the master removed it from the volume location list. The remaining volume servers then failed their healthz checks because they could not find all required replicas, and Kubernetes restarted them.

The healthz endpoint now checks only local conditions:
1. Is the server shutting down?
2. Is the server heartbeating with the master?

This follows the principle that a health check should verify only the health of THIS server, not the overall cluster state.

Fixes #6823
2 months ago · 35c1199690
2 changed files with 22 additions and 13 deletions
--- a/weed/server/volume_server_handlers_admin.go
+++ b/weed/server/volume_server_handlers_admin.go
@@ -4,28 +4,33 @@ import (
 	"net/http"
 	"path/filepath"

-	"github.com/seaweedfs/seaweedfs/weed/topology"
 	"github.com/seaweedfs/seaweedfs/weed/util/version"

 	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
 	"github.com/seaweedfs/seaweedfs/weed/stats"
 )

+// healthzHandler checks the local health of the volume server.
+// It only checks local conditions to avoid cascading failures when remote
+// volume servers go down. Previously, this handler checked if all replicated
+// volumes could reach their remote replicas, which caused healthy volume
+// servers to fail health checks when a peer went down.
+// See https://github.com/seaweedfs/seaweedfs/issues/6823
 func (vs *VolumeServer) healthzHandler(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Server", "SeaweedFS Volume "+version.VERSION)
-	volumeInfos := vs.store.VolumeInfos()
-	for _, vinfo := range volumeInfos {
-		if len(vinfo.Collection) == 0 {
-			continue
-		}
-		if vinfo.ReplicaPlacement.GetCopyCount() > 1 {
-			_, err := topology.GetWritableRemoteReplications(vs.store, vs.grpcDialOption, vinfo.Id, vs.GetMaster)
-			if err != nil {
-				w.WriteHeader(http.StatusServiceUnavailable)
-				return
-			}
-		}
+
+	// Check if the server is shutting down
+	if vs.store.IsStopping() {
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
 	}
+
+	// Check if we are currently heartbeating with the master
+	if !vs.isHeartbeating {
+		w.WriteHeader(http.StatusServiceUnavailable)
+		return
+	}
+
 	w.WriteHeader(http.StatusOK)
 }

--- a/weed/storage/store.go
+++ b/weed/storage/store.go
@@ -469,6 +469,10 @@ func (s *Store) SetStopping() {
 	}
 }

+func (s *Store) IsStopping() bool {
+	return s.isStopping
+}
+
 func (s *Store) LoadNewVolumes() {
 	for _, location := range s.Locations {
 		location.loadExistingVolumes(s.NeedleMapKind, 0)