Browse Source

admin/balance: fix Max Volumes display and balancer source selection (#8583)

* admin: fix Max Volumes column always showing 0

GetClusterVolumeServers() computed DiskCapacity from
diskInfo.MaxVolumeCount but never populated the MaxVolumes field
on the VolumeServer struct, causing the column to always display 0.

* balance: use utilization ratio for source server selection

The balancer selected the source server (to move volumes FROM) by raw
volume count. In clusters with heterogeneous MaxVolumeCount settings,
the server with the highest capacity naturally holds the most volumes
and was always picked as the source, even when it had the lowest
utilization ratio.

Change source selection and imbalance calculation to use utilization
ratio (effectiveCount / maxVolumeCount) so servers are compared by how
full they are relative to their capacity, not by absolute volume count.

This matches how destination scoring already works via
calculateBalanceScore().
pull/8594/head
Chris Lu 4 days ago
committed by GitHub
parent
commit
470075dd90
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 1
      weed/admin/dash/volume_management.go
  2. 61
      weed/worker/tasks/balance/detection.go
  3. 65
      weed/worker/tasks/balance/detection_test.go

1
weed/admin/dash/volume_management.go

@@ -457,6 +457,7 @@ func (s *AdminServer) GetClusterVolumeServers() (*ClusterVolumeServersData, erro
// Process disk information
for _, diskInfo := range node.DiskInfos {
vs.MaxVolumes += int(diskInfo.MaxVolumeCount)
vs.DiskCapacity += int64(diskInfo.MaxVolumeCount) * int64(volumeSizeLimitMB) * 1024 * 1024 // Use actual volume size limit
// Count regular volumes and calculate disk usage

61
weed/worker/tasks/balance/detection.go

@@ -80,16 +80,19 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
// Analyze volume distribution across servers.
// Seed from ActiveTopology so servers with matching disk type but zero
// volumes are included in the count and imbalance calculation.
// Also collect MaxVolumeCount per server to compute utilization ratios.
serverVolumeCounts := make(map[string]int)
serverMaxVolumes := make(map[string]int64)
if clusterInfo.ActiveTopology != nil {
topologyInfo := clusterInfo.ActiveTopology.GetTopologyInfo()
if topologyInfo != nil {
for _, dc := range topologyInfo.DataCenterInfos {
for _, rack := range dc.RackInfos {
for _, node := range rack.DataNodeInfos {
for diskTypeName := range node.DiskInfos {
for diskTypeName, diskInfo := range node.DiskInfos {
if diskTypeName == diskType {
serverVolumeCounts[node.Id] = 0
serverMaxVolumes[node.Id] += diskInfo.MaxVolumeCount
}
}
}
@@ -141,38 +144,62 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
var results []*types.TaskDetectionResult
balanced := false
// Decide upfront whether all servers have MaxVolumeCount info.
// If any server is missing it, fall back to raw counts for ALL servers
// to avoid mixing utilization ratios (0.0–1.0) with raw counts.
allServersHaveMaxInfo := true
for _, server := range sortedServers {
if maxVol, ok := serverMaxVolumes[server]; !ok || maxVol <= 0 {
allServersHaveMaxInfo = false
glog.V(1).Infof("BALANCE [%s]: Server %s is missing MaxVolumeCount info, falling back to raw volume counts for balancing", diskType, server)
break
}
}
var serverUtilization func(server string, effectiveCount int) float64
if allServersHaveMaxInfo {
serverUtilization = func(server string, effectiveCount int) float64 {
return float64(effectiveCount) / float64(serverMaxVolumes[server])
}
} else {
serverUtilization = func(_ string, effectiveCount int) float64 {
return float64(effectiveCount)
}
}
for len(results) < maxResults {
// Compute effective volume counts with adjustments from planned moves
effectiveCounts := make(map[string]int, len(serverVolumeCounts))
totalVolumes := 0
for server, count := range serverVolumeCounts {
effective := count + adjustments[server]
if effective < 0 {
effective = 0
}
effectiveCounts[server] = effective
totalVolumes += effective
}
avgVolumesPerServer := float64(totalVolumes) / float64(len(effectiveCounts))
maxVolumes := 0
minVolumes := totalVolumes
// Find the most and least utilized servers using utilization ratio
// (volumes / maxVolumes) so that servers with higher capacity are
// expected to hold proportionally more volumes.
maxUtilization := -1.0
minUtilization := math.Inf(1)
maxServer := ""
minServer := ""
for _, server := range sortedServers {
count := effectiveCounts[server]
util := serverUtilization(server, count)
// Min is calculated across all servers for an accurate imbalance ratio
if count < minVolumes {
minVolumes = count
if util < minUtilization {
minUtilization = util
minServer = server
}
// Max is only among non-exhausted servers since we can only move from them
if exhaustedServers[server] {
continue
}
if count > maxVolumes {
maxVolumes = count
if util > maxUtilization {
maxUtilization = util
maxServer = server
}
}
@@ -183,12 +210,18 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
break
}
// Check if imbalance exceeds threshold
imbalanceRatio := float64(maxVolumes-minVolumes) / avgVolumesPerServer
// Check if utilization imbalance exceeds threshold.
// imbalanceRatio is the difference between the most and least utilized
// servers, expressed as a fraction of mean utilization.
avgUtilization := (maxUtilization + minUtilization) / 2.0
var imbalanceRatio float64
if avgUtilization > 0 {
imbalanceRatio = (maxUtilization - minUtilization) / avgUtilization
}
if imbalanceRatio <= balanceConfig.ImbalanceThreshold {
if len(results) == 0 {
glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). Max=%d volumes on %s, Min=%d on %s, Avg=%.1f",
diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxVolumes, maxServer, minVolumes, minServer, avgVolumesPerServer)
glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). MaxUtil=%.1f%% on %s, MinUtil=%.1f%% on %s",
diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxUtilization*100, maxServer, minUtilization*100, minServer)
} else {
glog.Infof("BALANCE [%s]: Created %d task(s), cluster now balanced. Imbalance=%.1f%% (threshold=%.1f%%)",
diskType, len(results), imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100)

65
weed/worker/tasks/balance/detection_test.go

@@ -852,3 +852,68 @@ func TestDetection_ExhaustedServerFallsThrough(t *testing.T) {
assertNoDuplicateVolumes(t, tasks)
t.Logf("Created %d tasks from node-b after node-a exhausted", len(tasks))
}
// TestDetection_HeterogeneousCapacity verifies that the balancer compares
// servers by utilization ratio (volumes/maxVolumes) rather than by raw
// volume counts: a high-capacity server that holds many volumes but sits at
// a low fill ratio must not be chosen as the move source over a nearly-full
// smaller server.
func TestDetection_HeterogeneousCapacity(t *testing.T) {
	// Cluster layout:
	//   server-1: 600/700  → 85.7% utilized
	//   server-2: 690/700  → 98.6% utilized
	//   server-3: 695/700  → 99.3% utilized (highest)
	//   server-4: 900/1260 → 71.4% utilized (lowest)
	//
	// Raw-count selection would pick server-4 as source (900 > 695);
	// ratio-based selection must pick server-3 (or server-2) instead.
	specs := []serverSpec{
		{id: "server-1", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
		{id: "server-2", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
		{id: "server-3", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
		{id: "server-4", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 1260},
	}
	counts := map[string]int{
		"server-1": 600,
		"server-2": 690,
		"server-3": 695,
		"server-4": 900,
	}

	// Build the volume metrics with contiguous, non-overlapping volume IDs,
	// in a fixed server order so the test is deterministic.
	var metrics []*types.VolumeHealthMetrics
	nextID := uint32(1)
	for _, name := range []string{"server-1", "server-2", "server-3", "server-4"} {
		n := counts[name]
		metrics = append(metrics, makeVolumes(name, "hdd", "dc1", "rack1", "", nextID, n)...)
		nextID += uint32(n)
	}

	clusterInfo := &types.ClusterInfo{ActiveTopology: buildTopology(specs, metrics)}
	cfg := &Config{
		BaseConfig:         base.BaseConfig{Enabled: true},
		ImbalanceThreshold: 0.20,
		MinServerCount:     2,
	}

	tasks, _, err := Detection(metrics, clusterInfo, cfg, 5)
	if err != nil {
		t.Fatalf("Detection failed: %v", err)
	}
	if len(tasks) == 0 {
		t.Fatal("Expected balance tasks but got none")
	}

	// The first task must originate from the most utilized server
	// (server-3 at 99.3%, or server-2 at 98.6%) — never server-4.
	src := tasks[0].Server
	if src == "server-4" {
		t.Errorf("Balancer incorrectly picked server-4 (lowest utilization 71.4%%) as source; should pick server-3 (99.3%%) or server-2 (98.6%%)")
	}
	if src != "server-3" && src != "server-2" {
		t.Errorf("Expected server-3 or server-2 as first source, got %s", src)
	}
	t.Logf("First balance task: move from %s (correct: highest utilization)", src)
}
Loading…
Cancel
Save