From 33cfa8c19418a8935c25c76511e32317baba3517 Mon Sep 17 00:00:00 2001 From: chrislu Date: Mon, 27 Oct 2025 21:21:04 -0700 Subject: [PATCH] fix: add defensive bounds checks and compute actual shard counts Critical fixes from code review: 1. topology_ec.go: Add defensive bounds checks to AddShard/DeleteShard - Prevent panic when shardId >= MaxShardCount (32) - Return false instead of crashing on out-of-range shard IDs 2. command_ec_common.go: Fix doBalanceEcShardsAcrossRacks - Was using hardcoded TotalShardsCount (14) for all volumes - Now computes actual totalShardsForVolume from rackToShardCount - Fixes incorrect rebalancing for volumes with custom EC ratios - Example: 5+2=7 shards would incorrectly use 14 as average These fixes improve robustness and prepare for future custom EC ratios without changing current behavior for default 10+4 volumes. Note: MinusParityShards and ec_task.go intentionally NOT changed for seaweedfs repo - these will be enhanced in seaweed-enterprise repo where custom EC ratio configuration is added. 
--- weed/shell/command_ec_common.go | 11 ++++++++--- weed/topology/topology_ec.go | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go index 3616c4e16..f059b4e74 100644 --- a/weed/shell/command_ec_common.go +++ b/weed/shell/command_ec_common.go @@ -678,11 +678,16 @@ func countShardsByRack(vid needle.VolumeId, locations []*EcNode) map[string]int func (ecb *ecBalancer) doBalanceEcShardsAcrossRacks(collection string, vid needle.VolumeId, locations []*EcNode) error { racks := ecb.racks() - // calculate average number of shards an ec rack should have for one volume - averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks)) - // see the volume's shards are in how many racks, and how many in each rack rackToShardCount := countShardsByRack(vid, locations) + + // Calculate actual total shards for this volume (not hardcoded default) + var totalShardsForVolume int + for _, count := range rackToShardCount { + totalShardsForVolume += count + } + // calculate average number of shards an ec rack should have for one volume + averageShardsPerEcRack := ceilDivide(totalShardsForVolume, len(racks)) rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string { return string(ecNode.rack) }) diff --git a/weed/topology/topology_ec.go b/weed/topology/topology_ec.go index 7097cb84b..88fcf5c8d 100644 --- a/weed/topology/topology_ec.go +++ b/weed/topology/topology_ec.go @@ -91,6 +91,10 @@ func NewEcShardLocations(collection string) *EcShardLocations { } func (loc *EcShardLocations) AddShard(shardId erasure_coding.ShardId, dn *DataNode) (added bool) { + // Defensive bounds check to prevent panic with out-of-range shard IDs + if int(shardId) >= erasure_coding.MaxShardCount { + return false + } dataNodes := loc.Locations[shardId] for _, n := range dataNodes { if n.Id() == dn.Id() { @@ -102,6 +106,10 @@ func (loc *EcShardLocations) AddShard(shardId erasure_coding.ShardId, dn 
*DataNode) (added bool) { } func (loc *EcShardLocations) DeleteShard(shardId erasure_coding.ShardId, dn *DataNode) (deleted bool) { + // Defensive bounds check to prevent panic with out-of-range shard IDs + if int(shardId) >= erasure_coding.MaxShardCount { + return false + } dataNodes := loc.Locations[shardId] foundIndex := -1 for index, n := range dataNodes {