From 470075dd90874ef4a35da05eb543976360e4ed15 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 9 Mar 2026 18:34:11 -0700 Subject: [PATCH] admin/balance: fix Max Volumes display and balancer source selection (#8583) * admin: fix Max Volumes column always showing 0 GetClusterVolumeServers() computed DiskCapacity from diskInfo.MaxVolumeCount but never populated the MaxVolumes field on the VolumeServer struct, causing the column to always display 0. * balance: use utilization ratio for source server selection The balancer selected the source server (to move volumes FROM) by raw volume count. In clusters with heterogeneous MaxVolumeCount settings, the server with the highest capacity naturally holds the most volumes and was always picked as the source, even when it had the lowest utilization ratio. Change source selection and imbalance calculation to use utilization ratio (effectiveCount / maxVolumeCount) so servers are compared by how full they are relative to their capacity, not by absolute volume count. This matches how destination scoring already works via calculateBalanceScore(). 
--- weed/admin/dash/volume_management.go | 1 + weed/worker/tasks/balance/detection.go | 61 ++++++++++++++----- weed/worker/tasks/balance/detection_test.go | 65 +++++++++++++++++++++ 3 files changed, 113 insertions(+), 14 deletions(-) diff --git a/weed/admin/dash/volume_management.go b/weed/admin/dash/volume_management.go index 4009922bb..805e891d0 100644 --- a/weed/admin/dash/volume_management.go +++ b/weed/admin/dash/volume_management.go @@ -457,6 +457,7 @@ func (s *AdminServer) GetClusterVolumeServers() (*ClusterVolumeServersData, erro // Process disk information for _, diskInfo := range node.DiskInfos { + vs.MaxVolumes += int(diskInfo.MaxVolumeCount) vs.DiskCapacity += int64(diskInfo.MaxVolumeCount) * int64(volumeSizeLimitMB) * 1024 * 1024 // Use actual volume size limit // Count regular volumes and calculate disk usage diff --git a/weed/worker/tasks/balance/detection.go b/weed/worker/tasks/balance/detection.go index 313ff1ca9..ac1ad488f 100644 --- a/weed/worker/tasks/balance/detection.go +++ b/weed/worker/tasks/balance/detection.go @@ -80,16 +80,19 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics // Analyze volume distribution across servers. // Seed from ActiveTopology so servers with matching disk type but zero // volumes are included in the count and imbalance calculation. + // Also collect MaxVolumeCount per server to compute utilization ratios. 
serverVolumeCounts := make(map[string]int) + serverMaxVolumes := make(map[string]int64) if clusterInfo.ActiveTopology != nil { topologyInfo := clusterInfo.ActiveTopology.GetTopologyInfo() if topologyInfo != nil { for _, dc := range topologyInfo.DataCenterInfos { for _, rack := range dc.RackInfos { for _, node := range rack.DataNodeInfos { - for diskTypeName := range node.DiskInfos { + for diskTypeName, diskInfo := range node.DiskInfos { if diskTypeName == diskType { serverVolumeCounts[node.Id] = 0 + serverMaxVolumes[node.Id] += diskInfo.MaxVolumeCount } } } @@ -141,38 +144,62 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics var results []*types.TaskDetectionResult balanced := false + // Decide upfront whether all servers have MaxVolumeCount info. + // If any server is missing it, fall back to raw counts for ALL servers + // to avoid mixing utilization ratios (0.0–1.0) with raw counts. + allServersHaveMaxInfo := true + for _, server := range sortedServers { + if maxVol, ok := serverMaxVolumes[server]; !ok || maxVol <= 0 { + allServersHaveMaxInfo = false + glog.V(1).Infof("BALANCE [%s]: Server %s is missing MaxVolumeCount info, falling back to raw volume counts for balancing", diskType, server) + break + } + } + + var serverUtilization func(server string, effectiveCount int) float64 + if allServersHaveMaxInfo { + serverUtilization = func(server string, effectiveCount int) float64 { + return float64(effectiveCount) / float64(serverMaxVolumes[server]) + } + } else { + serverUtilization = func(_ string, effectiveCount int) float64 { + return float64(effectiveCount) + } + } + for len(results) < maxResults { // Compute effective volume counts with adjustments from planned moves effectiveCounts := make(map[string]int, len(serverVolumeCounts)) - totalVolumes := 0 for server, count := range serverVolumeCounts { effective := count + adjustments[server] if effective < 0 { effective = 0 } effectiveCounts[server] = effective - totalVolumes += 
effective } - avgVolumesPerServer := float64(totalVolumes) / float64(len(effectiveCounts)) - maxVolumes := 0 - minVolumes := totalVolumes + // Find the most and least utilized servers using utilization ratio + // (volumes / maxVolumes) so that servers with higher capacity are + // expected to hold proportionally more volumes. + maxUtilization := -1.0 + minUtilization := math.Inf(1) maxServer := "" minServer := "" for _, server := range sortedServers { count := effectiveCounts[server] + util := serverUtilization(server, count) // Min is calculated across all servers for an accurate imbalance ratio - if count < minVolumes { - minVolumes = count + if util < minUtilization { + minUtilization = util minServer = server } // Max is only among non-exhausted servers since we can only move from them if exhaustedServers[server] { continue } - if count > maxVolumes { - maxVolumes = count + if util > maxUtilization { + maxUtilization = util maxServer = server } } @@ -183,12 +210,18 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics break } - // Check if imbalance exceeds threshold - imbalanceRatio := float64(maxVolumes-minVolumes) / avgVolumesPerServer + // Check if utilization imbalance exceeds threshold. + // imbalanceRatio is the difference between the most and least utilized + // servers, expressed as a fraction of mean utilization. + avgUtilization := (maxUtilization + minUtilization) / 2.0 + var imbalanceRatio float64 + if avgUtilization > 0 { + imbalanceRatio = (maxUtilization - minUtilization) / avgUtilization + } if imbalanceRatio <= balanceConfig.ImbalanceThreshold { if len(results) == 0 { - glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). 
Max=%d volumes on %s, Min=%d on %s, Avg=%.1f", - diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxVolumes, maxServer, minVolumes, minServer, avgVolumesPerServer) + glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). MaxUtil=%.1f%% on %s, MinUtil=%.1f%% on %s", + diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxUtilization*100, maxServer, minUtilization*100, minServer) } else { glog.Infof("BALANCE [%s]: Created %d task(s), cluster now balanced. Imbalance=%.1f%% (threshold=%.1f%%)", diskType, len(results), imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100) diff --git a/weed/worker/tasks/balance/detection_test.go b/weed/worker/tasks/balance/detection_test.go index 344c63f04..af7b5bc9f 100644 --- a/weed/worker/tasks/balance/detection_test.go +++ b/weed/worker/tasks/balance/detection_test.go @@ -852,3 +852,68 @@ func TestDetection_ExhaustedServerFallsThrough(t *testing.T) { assertNoDuplicateVolumes(t, tasks) t.Logf("Created %d tasks from node-b after node-a exhausted", len(tasks)) } + +// TestDetection_HeterogeneousCapacity verifies that the balancer uses +// utilization ratio (volumes/maxVolumes) rather than raw volume counts. +// A server with more volumes but proportionally lower utilization should +// NOT be picked as the source over a server with fewer volumes but higher +// utilization. +func TestDetection_HeterogeneousCapacity(t *testing.T) { + // Simulate a cluster like: + // server-1: 600 volumes, max 700 → utilization 85.7% + // server-2: 690 volumes, max 700 → utilization 98.6% ← 2nd most utilized + // server-3: 695 volumes, max 700 → utilization 99.3% ← most utilized + // server-4: 900 volumes, max 1260 → utilization 71.4% ← least utilized + // + // The old algorithm (raw counts) would pick server-4 as source (900 > 695). + // The correct behavior is to pick server-3 (or server-2) as source since + // they have the highest utilization ratio. 
+ servers := []serverSpec{ + {id: "server-1", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700}, + {id: "server-2", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700}, + {id: "server-3", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700}, + {id: "server-4", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 1260}, + } + + volCounts := map[string]int{ + "server-1": 600, + "server-2": 690, + "server-3": 695, + "server-4": 900, + } + + var metrics []*types.VolumeHealthMetrics + vid := uint32(1) + for _, server := range []string{"server-1", "server-2", "server-3", "server-4"} { + count := volCounts[server] + metrics = append(metrics, makeVolumes(server, "hdd", "dc1", "rack1", "", vid, count)...) + vid += uint32(count) + } + + at := buildTopology(servers, metrics) + clusterInfo := &types.ClusterInfo{ActiveTopology: at} + cfg := &Config{ + BaseConfig: base.BaseConfig{Enabled: true}, + ImbalanceThreshold: 0.20, + MinServerCount: 2, + } + + tasks, _, err := Detection(metrics, clusterInfo, cfg, 5) + if err != nil { + t.Fatalf("Detection failed: %v", err) + } + if len(tasks) == 0 { + t.Fatal("Expected balance tasks but got none") + } + + // The source of the first task should be the most utilized server + // (server-3 at 99.3% or server-2 at 98.6%), NOT server-4. + firstSource := tasks[0].Server + if firstSource == "server-4" { + t.Errorf("Balancer incorrectly picked server-4 (lowest utilization 71.4%%) as source; should pick server-3 (99.3%%) or server-2 (98.6%%)") + } + if firstSource != "server-3" && firstSource != "server-2" { + t.Errorf("Expected server-3 or server-2 as first source, got %s", firstSource) + } + t.Logf("First balance task: move from %s (correct: highest utilization)", firstSource) +}