Browse Source

fix: volume server healthz now checks local conditions only

This fixes issue #6823 where a single volume server shutdown would cause
other healthy volume servers to fail their health checks and get restarted
by Kubernetes, causing a cascading failure.

Previously, the healthz handler checked if all replicated volumes could
reach their remote replicas via GetWritableRemoteReplications(). When a
volume server went down, the master would remove it from the volume
location list. Other volume servers would then fail their healthz checks
because they couldn't find all required replicas, causing Kubernetes to
restart them.

The healthz endpoint now only checks local conditions:
1. Is the server shutting down?
2. Is the server heartbeating with the master?

This follows the principle that a health check should only verify the
health of THIS server, not the overall cluster state.

Fixes #6823
pull/7610/head
chrislu 16 hours ago
parent
commit
35c1199690
  1. 31
      weed/server/volume_server_handlers_admin.go
  2. 4
      weed/storage/store.go

31
weed/server/volume_server_handlers_admin.go

@ -4,28 +4,33 @@ import (
"net/http"
"path/filepath"
"github.com/seaweedfs/seaweedfs/weed/topology"
"github.com/seaweedfs/seaweedfs/weed/util/version"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/stats"
)
// healthzHandler checks the local health of the volume server.
// It only checks local conditions to avoid cascading failures when remote
// volume servers go down. Previously, this handler checked if all replicated
// volumes could reach their remote replicas, which caused healthy volume
// servers to fail health checks when a peer went down.
// See https://github.com/seaweedfs/seaweedfs/issues/6823
func (vs *VolumeServer) healthzHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Server", "SeaweedFS Volume "+version.VERSION)
volumeInfos := vs.store.VolumeInfos()
for _, vinfo := range volumeInfos {
if len(vinfo.Collection) == 0 {
continue
}
if vinfo.ReplicaPlacement.GetCopyCount() > 1 {
_, err := topology.GetWritableRemoteReplications(vs.store, vs.grpcDialOption, vinfo.Id, vs.GetMaster)
if err != nil {
w.WriteHeader(http.StatusServiceUnavailable)
return
}
}
// Check if the server is shutting down
if vs.store.IsStopping() {
w.WriteHeader(http.StatusServiceUnavailable)
return
}
// Check if we can communicate with master
if !vs.isHeartbeating {
w.WriteHeader(http.StatusServiceUnavailable)
return
}
w.WriteHeader(http.StatusOK)
}

4
weed/storage/store.go

@ -469,6 +469,10 @@ func (s *Store) SetStopping() {
}
}
// IsStopping reports the current value of the store's isStopping flag.
func (s *Store) IsStopping() bool {
	return s.isStopping
}
func (s *Store) LoadNewVolumes() {
for _, location := range s.Locations {
location.loadExistingVolumes(s.NeedleMapKind, 0)

Loading…
Cancel
Save