You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
120 lines
3.3 KiB
120 lines
3.3 KiB
package weed_server
|
|
|
|
import (
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
|
|
)
|
|
|
|
// Operator-facing health states for a block volume.
//
// When more than one condition applies, the most severe state is reported:
// unsafe > rebuilding > degraded > healthy.
const (
	// HealthStateHealthy is reported when no other state applies.
	HealthStateHealthy = "healthy"
	// HealthStateDegraded is reported when the volume has fewer replicas than
	// desired or its replica-degraded flag is set.
	HealthStateDegraded = "degraded"
	// HealthStateRebuilding is reported when at least one replica is rebuilding.
	HealthStateRebuilding = "rebuilding"
	// HealthStateUnsafe is the most severe state and outranks all others.
	HealthStateUnsafe = "unsafe"
)
|
|
|
|
// deriveHealthState determines the operator-facing health state for a volume
|
|
// from registry facts. This is the shared derivation used by both per-volume
|
|
// responses and cluster-level summaries.
|
|
//
|
|
// Priority: unsafe > rebuilding > degraded > healthy.
|
|
func deriveHealthState(entry *BlockVolumeEntry) string {
|
|
role := blockvol.RoleFromWire(entry.Role)
|
|
|
|
// unsafe: no primary, primary not alive is handled at cluster level
|
|
// (server liveness is not available on the entry itself).
|
|
// Entry-level unsafe: role is not primary, or volume has failed control state.
|
|
if role != blockvol.RolePrimary {
|
|
return HealthStateUnsafe
|
|
}
|
|
|
|
// unsafe: strict durability below required replica count.
|
|
if entry.DurabilityMode == "sync_all" || entry.DurabilityMode == "sync_quorum" {
|
|
durMode, _ := blockvol.ParseDurabilityMode(entry.DurabilityMode)
|
|
rf := entry.ReplicaFactor
|
|
if rf == 0 {
|
|
rf = 2
|
|
}
|
|
required := durMode.RequiredReplicas(rf)
|
|
if len(entry.Replicas) < required {
|
|
return HealthStateUnsafe
|
|
}
|
|
}
|
|
|
|
// rebuilding: any replica in rebuild state.
|
|
for _, ri := range entry.Replicas {
|
|
riRole := blockvol.RoleFromWire(ri.Role)
|
|
if riRole == blockvol.RoleRebuilding {
|
|
return HealthStateRebuilding
|
|
}
|
|
}
|
|
|
|
// degraded: actual replicas below desired.
|
|
rf := entry.ReplicaFactor
|
|
if rf == 0 {
|
|
rf = 2
|
|
}
|
|
desiredReplicas := rf - 1 // RF includes primary
|
|
if desiredReplicas > 0 && len(entry.Replicas) < desiredReplicas {
|
|
return HealthStateDegraded
|
|
}
|
|
|
|
// degraded: replica degraded flag set.
|
|
if entry.ReplicaDegraded {
|
|
return HealthStateDegraded
|
|
}
|
|
|
|
return HealthStateHealthy
|
|
}
|
|
|
|
// deriveHealthStateWithLiveness adds server-liveness awareness.
|
|
// primaryAlive comes from the registry's blockServers check.
|
|
func deriveHealthStateWithLiveness(entry *BlockVolumeEntry, primaryAlive bool) string {
|
|
if !primaryAlive {
|
|
return HealthStateUnsafe
|
|
}
|
|
return deriveHealthState(entry)
|
|
}
|
|
|
|
// clusterHealthSummary is the cluster-wide tally of volumes per health state,
// one counter for each HealthState* value.
type clusterHealthSummary struct {
	Healthy    int // volumes in the "healthy" state
	Degraded   int // volumes in the "degraded" state
	Rebuilding int // volumes in the "rebuilding" state
	Unsafe     int // volumes in the "unsafe" state
}
|
|
|
|
// computeClusterHealthSummary iterates all volumes and computes health counts.
|
|
// Uses server liveness from the registry for accurate unsafe detection.
|
|
func (r *BlockVolumeRegistry) ComputeClusterHealthSummary() clusterHealthSummary {
|
|
r.mu.RLock()
|
|
defer r.mu.RUnlock()
|
|
var summary clusterHealthSummary
|
|
for _, entry := range r.volumes {
|
|
primaryAlive := r.blockServers[entry.VolumeServer] != nil
|
|
state := deriveHealthStateWithLiveness(entry, primaryAlive)
|
|
switch state {
|
|
case HealthStateHealthy:
|
|
summary.Healthy++
|
|
case HealthStateDegraded:
|
|
summary.Degraded++
|
|
case HealthStateRebuilding:
|
|
summary.Rebuilding++
|
|
case HealthStateUnsafe:
|
|
summary.Unsafe++
|
|
}
|
|
}
|
|
return summary
|
|
}
|
|
|
|
// NvmeCapableServerCount returns the number of servers with NVMe enabled.
|
|
func (r *BlockVolumeRegistry) NvmeCapableServerCount() int {
|
|
r.mu.RLock()
|
|
defer r.mu.RUnlock()
|
|
count := 0
|
|
for _, info := range r.blockServers {
|
|
if info != nil && info.NvmeAddr != "" {
|
|
count++
|
|
}
|
|
}
|
|
return count
|
|
}
|