diff --git a/weed/admin/dash/volume_management.go b/weed/admin/dash/volume_management.go
index 4009922bb..805e891d0 100644
--- a/weed/admin/dash/volume_management.go
+++ b/weed/admin/dash/volume_management.go
@@ -457,6 +457,7 @@ func (s *AdminServer) GetClusterVolumeServers() (*ClusterVolumeServersData, erro
 
 				// Process disk information
 				for _, diskInfo := range node.DiskInfos {
+					vs.MaxVolumes += int(diskInfo.MaxVolumeCount)
 					vs.DiskCapacity += int64(diskInfo.MaxVolumeCount) * int64(volumeSizeLimitMB) * 1024 * 1024 // Use actual volume size limit
 
 					// Count regular volumes and calculate disk usage
diff --git a/weed/worker/tasks/balance/detection.go b/weed/worker/tasks/balance/detection.go
index 313ff1ca9..ac1ad488f 100644
--- a/weed/worker/tasks/balance/detection.go
+++ b/weed/worker/tasks/balance/detection.go
@@ -80,16 +80,19 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
 	// Analyze volume distribution across servers.
 	// Seed from ActiveTopology so servers with matching disk type but zero
 	// volumes are included in the count and imbalance calculation.
+	// Also collect MaxVolumeCount per server to compute utilization ratios.
 	serverVolumeCounts := make(map[string]int)
+	serverMaxVolumes := make(map[string]int64)
 	if clusterInfo.ActiveTopology != nil {
 		topologyInfo := clusterInfo.ActiveTopology.GetTopologyInfo()
 		if topologyInfo != nil {
 			for _, dc := range topologyInfo.DataCenterInfos {
 				for _, rack := range dc.RackInfos {
 					for _, node := range rack.DataNodeInfos {
-						for diskTypeName := range node.DiskInfos {
+						for diskTypeName, diskInfo := range node.DiskInfos {
 							if diskTypeName == diskType {
 								serverVolumeCounts[node.Id] = 0
+								serverMaxVolumes[node.Id] += diskInfo.MaxVolumeCount
 							}
 						}
 					}
@@ -141,38 +144,62 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
 	var results []*types.TaskDetectionResult
 	balanced := false
 
+	// Decide upfront whether all servers have MaxVolumeCount info.
+	// If any server is missing it, fall back to raw counts for ALL servers
+	// to avoid mixing utilization ratios (0.0–1.0) with raw counts.
+	allServersHaveMaxInfo := true
+	for _, server := range sortedServers {
+		if maxVol, ok := serverMaxVolumes[server]; !ok || maxVol <= 0 {
+			allServersHaveMaxInfo = false
+			glog.V(1).Infof("BALANCE [%s]: Server %s is missing MaxVolumeCount info, falling back to raw volume counts for balancing", diskType, server)
+			break
+		}
+	}
+
+	var serverUtilization func(server string, effectiveCount int) float64
+	if allServersHaveMaxInfo {
+		serverUtilization = func(server string, effectiveCount int) float64 {
+			return float64(effectiveCount) / float64(serverMaxVolumes[server])
+		}
+	} else {
+		serverUtilization = func(_ string, effectiveCount int) float64 {
+			return float64(effectiveCount)
+		}
+	}
+
 	for len(results) < maxResults {
 		// Compute effective volume counts with adjustments from planned moves
 		effectiveCounts := make(map[string]int, len(serverVolumeCounts))
-		totalVolumes := 0
 		for server, count := range serverVolumeCounts {
 			effective := count + adjustments[server]
 			if effective < 0 {
 				effective = 0
 			}
 			effectiveCounts[server] = effective
-			totalVolumes += effective
 		}
 
-		avgVolumesPerServer := float64(totalVolumes) / float64(len(effectiveCounts))
-		maxVolumes := 0
-		minVolumes := totalVolumes
+		// Find the most and least utilized servers using utilization ratio
+		// (volumes / maxVolumes) so that servers with higher capacity are
+		// expected to hold proportionally more volumes.
+		maxUtilization := -1.0
+		minUtilization := math.Inf(1)
 		maxServer := ""
 		minServer := ""
 
 		for _, server := range sortedServers {
 			count := effectiveCounts[server]
+			util := serverUtilization(server, count)
 			// Min is calculated across all servers for an accurate imbalance ratio
-			if count < minVolumes {
-				minVolumes = count
+			if util < minUtilization {
+				minUtilization = util
 				minServer = server
 			}
 			// Max is only among non-exhausted servers since we can only move from them
 			if exhaustedServers[server] {
 				continue
 			}
-			if count > maxVolumes {
-				maxVolumes = count
+			if util > maxUtilization {
+				maxUtilization = util
 				maxServer = server
 			}
 		}
@@ -183,12 +210,18 @@ func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics
 			break
 		}
 
-		// Check if imbalance exceeds threshold
-		imbalanceRatio := float64(maxVolumes-minVolumes) / avgVolumesPerServer
+		// Check if utilization imbalance exceeds threshold.
+		// imbalanceRatio is the difference between the most and least utilized
+		// servers, expressed as a fraction of the mean of the max and min utilization.
+		avgUtilization := (maxUtilization + minUtilization) / 2.0
+		var imbalanceRatio float64
+		if avgUtilization > 0 {
+			imbalanceRatio = (maxUtilization - minUtilization) / avgUtilization
+		}
 		if imbalanceRatio <= balanceConfig.ImbalanceThreshold {
 			if len(results) == 0 {
-				glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). Max=%d volumes on %s, Min=%d on %s, Avg=%.1f",
-					diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxVolumes, maxServer, minVolumes, minServer, avgVolumesPerServer)
+				glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). MaxUtil=%.1f%% on %s, MinUtil=%.1f%% on %s",
+					diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxUtilization*100, maxServer, minUtilization*100, minServer)
 			} else {
 				glog.Infof("BALANCE [%s]: Created %d task(s), cluster now balanced. Imbalance=%.1f%% (threshold=%.1f%%)",
 					diskType, len(results), imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100)
diff --git a/weed/worker/tasks/balance/detection_test.go b/weed/worker/tasks/balance/detection_test.go
index 344c63f04..af7b5bc9f 100644
--- a/weed/worker/tasks/balance/detection_test.go
+++ b/weed/worker/tasks/balance/detection_test.go
@@ -852,3 +852,68 @@ func TestDetection_ExhaustedServerFallsThrough(t *testing.T) {
 	assertNoDuplicateVolumes(t, tasks)
 	t.Logf("Created %d tasks from node-b after node-a exhausted", len(tasks))
 }
+
+// TestDetection_HeterogeneousCapacity verifies that the balancer uses
+// utilization ratio (volumes/maxVolumes) rather than raw volume counts.
+// A server with more volumes but proportionally lower utilization should
+// NOT be picked as the source over a server with fewer volumes but higher
+// utilization.
+func TestDetection_HeterogeneousCapacity(t *testing.T) {
+	// Simulate a cluster like:
+	//   server-1: 600 volumes, max 700  → utilization 85.7%
+	//   server-2: 690 volumes, max 700  → utilization 98.6% ← 2nd most utilized
+	//   server-3: 695 volumes, max 700  → utilization 99.3% ← most utilized
+	//   server-4: 900 volumes, max 1260 → utilization 71.4% ← least utilized
+	//
+	// The old algorithm (raw counts) would pick server-4 as source (900 > 695).
+	// The correct behavior is to pick server-3 (or server-2) as source since
+	// they have the highest utilization ratio.
+	servers := []serverSpec{
+		{id: "server-1", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
+		{id: "server-2", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
+		{id: "server-3", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 700},
+		{id: "server-4", diskType: "hdd", dc: "dc1", rack: "rack1", maxVolumes: 1260},
+	}
+
+	volCounts := map[string]int{
+		"server-1": 600,
+		"server-2": 690,
+		"server-3": 695,
+		"server-4": 900,
+	}
+
+	var metrics []*types.VolumeHealthMetrics
+	vid := uint32(1)
+	for _, server := range []string{"server-1", "server-2", "server-3", "server-4"} {
+		count := volCounts[server]
+		metrics = append(metrics, makeVolumes(server, "hdd", "dc1", "rack1", "", vid, count)...)
+		vid += uint32(count)
+	}
+
+	at := buildTopology(servers, metrics)
+	clusterInfo := &types.ClusterInfo{ActiveTopology: at}
+	cfg := &Config{
+		BaseConfig:         base.BaseConfig{Enabled: true},
+		ImbalanceThreshold: 0.20,
+		MinServerCount:     2,
+	}
+
+	tasks, _, err := Detection(metrics, clusterInfo, cfg, 5)
+	if err != nil {
+		t.Fatalf("Detection failed: %v", err)
+	}
+	if len(tasks) == 0 {
+		t.Fatal("Expected balance tasks but got none")
+	}
+
+	// The source of the first task should be the most utilized server
+	// (server-3 at 99.3% or server-2 at 98.6%), NOT server-4.
+	firstSource := tasks[0].Server
+	if firstSource == "server-4" {
+		t.Errorf("Balancer incorrectly picked server-4 (lowest utilization 71.4%%) as source; should pick server-3 (99.3%%) or server-2 (98.6%%)")
+	}
+	if firstSource != "server-3" && firstSource != "server-2" {
+		t.Errorf("Expected server-3 or server-2 as first source, got %s", firstSource)
+	}
+	t.Logf("First balance task: move from %s (correct: highest utilization)", firstSource)
+}