You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

861 lines
28 KiB

2 months ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
2 months ago
  1. package shell
  2. import (
  3. "context"
  4. "fmt"
  5. "math/rand/v2"
  6. "github.com/seaweedfs/seaweedfs/weed/glog"
  7. "github.com/seaweedfs/seaweedfs/weed/operation"
  8. "github.com/seaweedfs/seaweedfs/weed/pb"
  9. "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
  10. "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
  11. "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
  12. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  13. "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
  14. "github.com/seaweedfs/seaweedfs/weed/storage/types"
  15. "golang.org/x/exp/slices"
  16. "google.golang.org/grpc"
  17. )
  18. type DataCenterId string
  19. type EcNodeId string
  20. type RackId string
  21. type EcNode struct {
  22. info *master_pb.DataNodeInfo
  23. dc DataCenterId
  24. rack RackId
  25. freeEcSlot int
  26. }
  27. type CandidateEcNode struct {
  28. ecNode *EcNode
  29. shardCount int
  30. }
  31. type EcRack struct {
  32. ecNodes map[EcNodeId]*EcNode
  33. freeEcSlot int
  34. }
  35. func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, applyBalancing bool) (err error) {
  36. if !commandEnv.isLocked() {
  37. return fmt.Errorf("lock is lost")
  38. }
  39. copiedShardIds := []uint32{uint32(shardId)}
  40. if applyBalancing {
  41. existingServerAddress := pb.NewServerAddressFromDataNode(existingLocation.info)
  42. // ask destination node to copy shard and the ecx file from source node, and mount it
  43. copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress)
  44. if err != nil {
  45. return err
  46. }
  47. // unmount the to be deleted shards
  48. err = unmountEcShards(commandEnv.option.GrpcDialOption, vid, existingServerAddress, copiedShardIds)
  49. if err != nil {
  50. return err
  51. }
  52. // ask source node to delete the shard, and maybe the ecx file
  53. err = sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, existingServerAddress, copiedShardIds)
  54. if err != nil {
  55. return err
  56. }
  57. fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id)
  58. }
  59. destinationEcNode.addEcVolumeShards(vid, collection, copiedShardIds)
  60. existingLocation.deleteEcVolumeShards(vid, copiedShardIds)
  61. return nil
  62. }
  63. func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption,
  64. targetServer *EcNode, shardIdsToCopy []uint32,
  65. volumeId needle.VolumeId, collection string, existingLocation pb.ServerAddress) (copiedShardIds []uint32, err error) {
  66. fmt.Printf("allocate %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id)
  67. targetAddress := pb.NewServerAddressFromDataNode(targetServer.info)
  68. err = operation.WithVolumeServerClient(false, targetAddress, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  69. if targetAddress != existingLocation {
  70. fmt.Printf("copy %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id)
  71. _, copyErr := volumeServerClient.VolumeEcShardsCopy(context.Background(), &volume_server_pb.VolumeEcShardsCopyRequest{
  72. VolumeId: uint32(volumeId),
  73. Collection: collection,
  74. ShardIds: shardIdsToCopy,
  75. CopyEcxFile: true,
  76. CopyEcjFile: true,
  77. CopyVifFile: true,
  78. SourceDataNode: string(existingLocation),
  79. })
  80. if copyErr != nil {
  81. return fmt.Errorf("copy %d.%v %s => %s : %v\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id, copyErr)
  82. }
  83. }
  84. fmt.Printf("mount %d.%v on %s\n", volumeId, shardIdsToCopy, targetServer.info.Id)
  85. _, mountErr := volumeServerClient.VolumeEcShardsMount(context.Background(), &volume_server_pb.VolumeEcShardsMountRequest{
  86. VolumeId: uint32(volumeId),
  87. Collection: collection,
  88. ShardIds: shardIdsToCopy,
  89. })
  90. if mountErr != nil {
  91. return fmt.Errorf("mount %d.%v on %s : %v\n", volumeId, shardIdsToCopy, targetServer.info.Id, mountErr)
  92. }
  93. if targetAddress != existingLocation {
  94. copiedShardIds = shardIdsToCopy
  95. glog.V(0).Infof("%s ec volume %d deletes shards %+v", existingLocation, volumeId, copiedShardIds)
  96. }
  97. return nil
  98. })
  99. if err != nil {
  100. return
  101. }
  102. return
  103. }
  104. func eachDataNode(topo *master_pb.TopologyInfo, fn func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo)) {
  105. for _, dc := range topo.DataCenterInfos {
  106. for _, rack := range dc.RackInfos {
  107. for _, dn := range rack.DataNodeInfos {
  108. fn(DataCenterId(dc.Id), RackId(rack.Id), dn)
  109. }
  110. }
  111. }
  112. }
  113. func sortEcNodesByFreeslotsDescending(ecNodes []*EcNode) {
  114. slices.SortFunc(ecNodes, func(a, b *EcNode) int {
  115. return b.freeEcSlot - a.freeEcSlot
  116. })
  117. }
  118. func sortEcNodesByFreeslotsAscending(ecNodes []*EcNode) {
  119. slices.SortFunc(ecNodes, func(a, b *EcNode) int {
  120. return a.freeEcSlot - b.freeEcSlot
  121. })
  122. }
  123. // if the index node changed the freeEcSlot, need to keep every EcNode still sorted
  124. func ensureSortedEcNodes(data []*CandidateEcNode, index int, lessThan func(i, j int) bool) {
  125. for i := index - 1; i >= 0; i-- {
  126. if lessThan(i+1, i) {
  127. swap(data, i, i+1)
  128. } else {
  129. break
  130. }
  131. }
  132. for i := index + 1; i < len(data); i++ {
  133. if lessThan(i, i-1) {
  134. swap(data, i, i-1)
  135. } else {
  136. break
  137. }
  138. }
  139. }
  140. func swap(data []*CandidateEcNode, i, j int) {
  141. t := data[i]
  142. data[i] = data[j]
  143. data[j] = t
  144. }
  145. func countShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) (count int) {
  146. for _, ecShardInfo := range ecShardInfos {
  147. shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
  148. count += shardBits.ShardIdCount()
  149. }
  150. return
  151. }
  152. func countFreeShardSlots(dn *master_pb.DataNodeInfo, diskType types.DiskType) (count int) {
  153. if dn.DiskInfos == nil {
  154. return 0
  155. }
  156. diskInfo := dn.DiskInfos[string(diskType)]
  157. if diskInfo == nil {
  158. return 0
  159. }
  160. return int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countShards(diskInfo.EcShardInfos)
  161. }
  162. func (ecNode *EcNode) localShardIdCount(vid uint32) int {
  163. for _, diskInfo := range ecNode.info.DiskInfos {
  164. for _, ecShardInfo := range diskInfo.EcShardInfos {
  165. if vid == ecShardInfo.Id {
  166. shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
  167. return shardBits.ShardIdCount()
  168. }
  169. }
  170. }
  171. return 0
  172. }
  173. func collectEcNodes(commandEnv *CommandEnv, selectedDataCenter string) (ecNodes []*EcNode, totalFreeEcSlots int, err error) {
  174. // list all possible locations
  175. // collect topology information
  176. topologyInfo, _, err := collectTopologyInfo(commandEnv, 0)
  177. if err != nil {
  178. return
  179. }
  180. // find out all volume servers with one slot left.
  181. ecNodes, totalFreeEcSlots = collectEcVolumeServersByDc(topologyInfo, selectedDataCenter)
  182. sortEcNodesByFreeslotsDescending(ecNodes)
  183. return
  184. }
  185. func collectEcVolumeServersByDc(topo *master_pb.TopologyInfo, selectedDataCenter string) (ecNodes []*EcNode, totalFreeEcSlots int) {
  186. eachDataNode(topo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) {
  187. if selectedDataCenter != "" && selectedDataCenter != string(dc) {
  188. return
  189. }
  190. freeEcSlots := countFreeShardSlots(dn, types.HardDriveType)
  191. ecNodes = append(ecNodes, &EcNode{
  192. info: dn,
  193. dc: dc,
  194. rack: rack,
  195. freeEcSlot: int(freeEcSlots),
  196. })
  197. totalFreeEcSlots += freeEcSlots
  198. })
  199. return
  200. }
  201. func sourceServerDeleteEcShards(grpcDialOption grpc.DialOption, collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeDeletedShardIds []uint32) error {
  202. fmt.Printf("delete %d.%v from %s\n", volumeId, toBeDeletedShardIds, sourceLocation)
  203. return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  204. _, deleteErr := volumeServerClient.VolumeEcShardsDelete(context.Background(), &volume_server_pb.VolumeEcShardsDeleteRequest{
  205. VolumeId: uint32(volumeId),
  206. Collection: collection,
  207. ShardIds: toBeDeletedShardIds,
  208. })
  209. return deleteErr
  210. })
  211. }
  212. func unmountEcShards(grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeUnmountedhardIds []uint32) error {
  213. fmt.Printf("unmount %d.%v from %s\n", volumeId, toBeUnmountedhardIds, sourceLocation)
  214. return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  215. _, deleteErr := volumeServerClient.VolumeEcShardsUnmount(context.Background(), &volume_server_pb.VolumeEcShardsUnmountRequest{
  216. VolumeId: uint32(volumeId),
  217. ShardIds: toBeUnmountedhardIds,
  218. })
  219. return deleteErr
  220. })
  221. }
  222. func mountEcShards(grpcDialOption grpc.DialOption, collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeMountedhardIds []uint32) error {
  223. fmt.Printf("mount %d.%v on %s\n", volumeId, toBeMountedhardIds, sourceLocation)
  224. return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  225. _, mountErr := volumeServerClient.VolumeEcShardsMount(context.Background(), &volume_server_pb.VolumeEcShardsMountRequest{
  226. VolumeId: uint32(volumeId),
  227. Collection: collection,
  228. ShardIds: toBeMountedhardIds,
  229. })
  230. return mountErr
  231. })
  232. }
  233. func ceilDivide(a, b int) int {
  234. var r int
  235. if (a % b) != 0 {
  236. r = 1
  237. }
  238. return (a / b) + r
  239. }
  240. func findEcVolumeShards(ecNode *EcNode, vid needle.VolumeId) erasure_coding.ShardBits {
  241. if diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]; found {
  242. for _, shardInfo := range diskInfo.EcShardInfos {
  243. if needle.VolumeId(shardInfo.Id) == vid {
  244. return erasure_coding.ShardBits(shardInfo.EcIndexBits)
  245. }
  246. }
  247. }
  248. return 0
  249. }
  250. func (ecNode *EcNode) addEcVolumeShards(vid needle.VolumeId, collection string, shardIds []uint32) *EcNode {
  251. foundVolume := false
  252. diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
  253. if found {
  254. for _, shardInfo := range diskInfo.EcShardInfos {
  255. if needle.VolumeId(shardInfo.Id) == vid {
  256. oldShardBits := erasure_coding.ShardBits(shardInfo.EcIndexBits)
  257. newShardBits := oldShardBits
  258. for _, shardId := range shardIds {
  259. newShardBits = newShardBits.AddShardId(erasure_coding.ShardId(shardId))
  260. }
  261. shardInfo.EcIndexBits = uint32(newShardBits)
  262. ecNode.freeEcSlot -= newShardBits.ShardIdCount() - oldShardBits.ShardIdCount()
  263. foundVolume = true
  264. break
  265. }
  266. }
  267. } else {
  268. diskInfo = &master_pb.DiskInfo{
  269. Type: string(types.HardDriveType),
  270. }
  271. ecNode.info.DiskInfos[string(types.HardDriveType)] = diskInfo
  272. }
  273. if !foundVolume {
  274. var newShardBits erasure_coding.ShardBits
  275. for _, shardId := range shardIds {
  276. newShardBits = newShardBits.AddShardId(erasure_coding.ShardId(shardId))
  277. }
  278. diskInfo.EcShardInfos = append(diskInfo.EcShardInfos, &master_pb.VolumeEcShardInformationMessage{
  279. Id: uint32(vid),
  280. Collection: collection,
  281. EcIndexBits: uint32(newShardBits),
  282. DiskType: string(types.HardDriveType),
  283. })
  284. ecNode.freeEcSlot -= len(shardIds)
  285. }
  286. return ecNode
  287. }
  288. func (ecNode *EcNode) deleteEcVolumeShards(vid needle.VolumeId, shardIds []uint32) *EcNode {
  289. if diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]; found {
  290. for _, shardInfo := range diskInfo.EcShardInfos {
  291. if needle.VolumeId(shardInfo.Id) == vid {
  292. oldShardBits := erasure_coding.ShardBits(shardInfo.EcIndexBits)
  293. newShardBits := oldShardBits
  294. for _, shardId := range shardIds {
  295. newShardBits = newShardBits.RemoveShardId(erasure_coding.ShardId(shardId))
  296. }
  297. shardInfo.EcIndexBits = uint32(newShardBits)
  298. ecNode.freeEcSlot -= newShardBits.ShardIdCount() - oldShardBits.ShardIdCount()
  299. }
  300. }
  301. }
  302. return ecNode
  303. }
  304. func groupByCount(data []*EcNode, identifierFn func(*EcNode) (id string, count int)) map[string]int {
  305. countMap := make(map[string]int)
  306. for _, d := range data {
  307. id, count := identifierFn(d)
  308. countMap[id] += count
  309. }
  310. return countMap
  311. }
  312. func groupBy(data []*EcNode, identifierFn func(*EcNode) (id string)) map[string][]*EcNode {
  313. groupMap := make(map[string][]*EcNode)
  314. for _, d := range data {
  315. id := identifierFn(d)
  316. groupMap[id] = append(groupMap[id], d)
  317. }
  318. return groupMap
  319. }
  320. func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
  321. // collect racks info
  322. racks := make(map[RackId]*EcRack)
  323. for _, ecNode := range allEcNodes {
  324. if racks[ecNode.rack] == nil {
  325. racks[ecNode.rack] = &EcRack{
  326. ecNodes: make(map[EcNodeId]*EcNode),
  327. }
  328. }
  329. racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
  330. racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
  331. }
  332. return racks
  333. }
  334. func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  335. fmt.Printf("balanceEcVolumes %s\n", collection)
  336. if err := deleteDuplicatedEcShards(commandEnv, allEcNodes, collection, applyBalancing); err != nil {
  337. return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
  338. }
  339. if err := balanceEcShardsAcrossRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  340. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  341. }
  342. if err := balanceEcShardsWithinRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  343. return fmt.Errorf("balance within racks collection %s ec shards: %v", collection, err)
  344. }
  345. return nil
  346. }
  347. func deleteDuplicatedEcShards(commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
  348. // vid => []ecNode
  349. vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)
  350. // deduplicate ec shards
  351. for vid, locations := range vidLocations {
  352. if err := doDeduplicateEcShards(commandEnv, collection, vid, locations, applyBalancing); err != nil {
  353. return err
  354. }
  355. }
  356. return nil
  357. }
  358. func doDeduplicateEcShards(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {
  359. // check whether this volume has ecNodes that are over average
  360. shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
  361. for _, ecNode := range locations {
  362. shardBits := findEcVolumeShards(ecNode, vid)
  363. for _, shardId := range shardBits.ShardIds() {
  364. shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
  365. }
  366. }
  367. for shardId, ecNodes := range shardToLocations {
  368. if len(ecNodes) <= 1 {
  369. continue
  370. }
  371. sortEcNodesByFreeslotsAscending(ecNodes)
  372. fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
  373. if !applyBalancing {
  374. continue
  375. }
  376. duplicatedShardIds := []uint32{uint32(shardId)}
  377. for _, ecNode := range ecNodes[1:] {
  378. if err := unmountEcShards(commandEnv.option.GrpcDialOption, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
  379. return err
  380. }
  381. if err := sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
  382. return err
  383. }
  384. ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
  385. }
  386. }
  387. return nil
  388. }
  389. func balanceEcShardsAcrossRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  390. // collect vid => []ecNode, since previous steps can change the locations
  391. vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)
  392. // spread the ec shards evenly
  393. for vid, locations := range vidLocations {
  394. if err := doBalanceEcShardsAcrossRacks(commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
  395. return err
  396. }
  397. }
  398. return nil
  399. }
  400. func countShardsByRack(vid needle.VolumeId, locations []*EcNode) map[string]int {
  401. return groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  402. shardBits := findEcVolumeShards(ecNode, vid)
  403. return string(ecNode.rack), shardBits.ShardIdCount()
  404. })
  405. }
  406. func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  407. // calculate average number of shards an ec rack should have for one volume
  408. averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
  409. // see the volume's shards are in how many racks, and how many in each rack
  410. rackToShardCount := countShardsByRack(vid, locations)
  411. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  412. return string(ecNode.rack)
  413. })
  414. // ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  415. ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
  416. for rackId, count := range rackToShardCount {
  417. if count <= averageShardsPerEcRack {
  418. continue
  419. }
  420. possibleEcNodes := rackEcNodesWithVid[rackId]
  421. for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
  422. ecShardsToMove[shardId] = ecNode
  423. }
  424. }
  425. for shardId, ecNode := range ecShardsToMove {
  426. // TODO: consider volume replica info when balancing racks
  427. rackId := pickRackToBalanceShardsInto(racks, rackToShardCount, nil, averageShardsPerEcRack)
  428. if rackId == "" {
  429. fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
  430. continue
  431. }
  432. var possibleDestinationEcNodes []*EcNode
  433. for _, n := range racks[rackId].ecNodes {
  434. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  435. }
  436. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  437. if err != nil {
  438. return err
  439. }
  440. rackToShardCount[string(rackId)] += 1
  441. rackToShardCount[string(ecNode.rack)] -= 1
  442. racks[rackId].freeEcSlot -= 1
  443. racks[ecNode.rack].freeEcSlot += 1
  444. }
  445. return nil
  446. }
  447. func pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, replicaPlacement *super_block.ReplicaPlacement, averageShardsPerEcRack int) RackId {
  448. targets := []RackId{}
  449. targetShards := -1
  450. for _, shards := range rackToShardCount {
  451. if shards > targetShards {
  452. targetShards = shards
  453. }
  454. }
  455. for rackId, rack := range rackToEcNodes {
  456. shards := rackToShardCount[string(rackId)]
  457. if rack.freeEcSlot <= 0 {
  458. // No EC shards slots left :(
  459. continue
  460. }
  461. if replicaPlacement != nil && shards >= replicaPlacement.DiffRackCount {
  462. // Don't select racks with more EC shards for the target volume than the replicaton limit.
  463. continue
  464. }
  465. if shards >= averageShardsPerEcRack {
  466. // Keep EC shards across racks as balanced as possible.
  467. continue
  468. }
  469. if shards < targetShards {
  470. // Favor racks with less shards, to ensure an uniform distribution.
  471. targets = nil
  472. targetShards = shards
  473. }
  474. if shards == targetShards {
  475. targets = append(targets, rackId)
  476. }
  477. }
  478. if len(targets) == 0 {
  479. return ""
  480. }
  481. return targets[rand.IntN(len(targets))]
  482. }
  483. func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  484. // collect vid => []ecNode, since previous steps can change the locations
  485. vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)
  486. // spread the ec shards evenly
  487. for vid, locations := range vidLocations {
  488. // see the volume's shards are in how many racks, and how many in each rack
  489. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  490. shardBits := findEcVolumeShards(ecNode, vid)
  491. return string(ecNode.rack), shardBits.ShardIdCount()
  492. })
  493. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  494. return string(ecNode.rack)
  495. })
  496. for rackId, _ := range rackToShardCount {
  497. var possibleDestinationEcNodes []*EcNode
  498. for _, n := range racks[RackId(rackId)].ecNodes {
  499. if _, found := n.info.DiskInfos[string(types.HardDriveType)]; found {
  500. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  501. }
  502. }
  503. sourceEcNodes := rackEcNodesWithVid[rackId]
  504. averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
  505. if err := doBalanceEcShardsWithinOneRack(commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
  506. return err
  507. }
  508. }
  509. }
  510. return nil
  511. }
  512. func doBalanceEcShardsWithinOneRack(commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  513. for _, ecNode := range existingLocations {
  514. shardBits := findEcVolumeShards(ecNode, vid)
  515. overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode
  516. for _, shardId := range shardBits.ShardIds() {
  517. if overLimitCount <= 0 {
  518. break
  519. }
  520. fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
  521. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  522. if err != nil {
  523. return err
  524. }
  525. overLimitCount--
  526. }
  527. }
  528. return nil
  529. }
  530. func balanceEcRacks(commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {
  531. // balance one rack for all ec shards
  532. for _, ecRack := range racks {
  533. if err := doBalanceEcRack(commandEnv, ecRack, applyBalancing); err != nil {
  534. return err
  535. }
  536. }
  537. return nil
  538. }
  539. func doBalanceEcRack(commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {
  540. if len(ecRack.ecNodes) <= 1 {
  541. return nil
  542. }
  543. var rackEcNodes []*EcNode
  544. for _, node := range ecRack.ecNodes {
  545. rackEcNodes = append(rackEcNodes, node)
  546. }
  547. ecNodeIdToShardCount := groupByCount(rackEcNodes, func(ecNode *EcNode) (id string, count int) {
  548. diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
  549. if !found {
  550. return
  551. }
  552. for _, ecShardInfo := range diskInfo.EcShardInfos {
  553. count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
  554. }
  555. return ecNode.info.Id, count
  556. })
  557. var totalShardCount int
  558. for _, count := range ecNodeIdToShardCount {
  559. totalShardCount += count
  560. }
  561. averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))
  562. hasMove := true
  563. for hasMove {
  564. hasMove = false
  565. slices.SortFunc(rackEcNodes, func(a, b *EcNode) int {
  566. return b.freeEcSlot - a.freeEcSlot
  567. })
  568. emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
  569. emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
  570. if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
  571. emptyNodeIds := make(map[uint32]bool)
  572. if emptyDiskInfo, found := emptyNode.info.DiskInfos[string(types.HardDriveType)]; found {
  573. for _, shards := range emptyDiskInfo.EcShardInfos {
  574. emptyNodeIds[shards.Id] = true
  575. }
  576. }
  577. if fullDiskInfo, found := fullNode.info.DiskInfos[string(types.HardDriveType)]; found {
  578. for _, shards := range fullDiskInfo.EcShardInfos {
  579. if _, found := emptyNodeIds[shards.Id]; !found {
  580. for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
  581. fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
  582. err := moveMountedShardToEcNode(commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
  583. if err != nil {
  584. return err
  585. }
  586. ecNodeIdToShardCount[emptyNode.info.Id]++
  587. ecNodeIdToShardCount[fullNode.info.Id]--
  588. hasMove = true
  589. break
  590. }
  591. break
  592. }
  593. }
  594. }
  595. }
  596. }
  597. return nil
  598. }
  599. func pickOneEcNodeAndMoveOneShard(commandEnv *CommandEnv, averageShardsPerEcNode int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  600. sortEcNodesByFreeslotsDescending(possibleDestinationEcNodes)
  601. skipReason := ""
  602. for _, destEcNode := range possibleDestinationEcNodes {
  603. if destEcNode.info.Id == existingLocation.info.Id {
  604. continue
  605. }
  606. if destEcNode.freeEcSlot <= 0 {
  607. skipReason += fmt.Sprintf(" Skipping %s because it has no free slots\n", destEcNode.info.Id)
  608. continue
  609. }
  610. if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
  611. skipReason += fmt.Sprintf(" Skipping %s because it %d >= avernageShards (%d)\n",
  612. destEcNode.info.Id, findEcVolumeShards(destEcNode, vid).ShardIdCount(), averageShardsPerEcNode)
  613. continue
  614. }
  615. fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)
  616. err := moveMountedShardToEcNode(commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
  617. if err != nil {
  618. return err
  619. }
  620. return nil
  621. }
  622. fmt.Printf("WARNING: Could not find suitable taget node for %d.%d:\n%s", vid, shardId, skipReason)
  623. return nil
  624. }
  625. func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
  626. picked := make(map[erasure_coding.ShardId]*EcNode)
  627. var candidateEcNodes []*CandidateEcNode
  628. for _, ecNode := range ecNodes {
  629. shardBits := findEcVolumeShards(ecNode, vid)
  630. if shardBits.ShardIdCount() > 0 {
  631. candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
  632. ecNode: ecNode,
  633. shardCount: shardBits.ShardIdCount(),
  634. })
  635. }
  636. }
  637. slices.SortFunc(candidateEcNodes, func(a, b *CandidateEcNode) int {
  638. return b.shardCount - a.shardCount
  639. })
  640. for i := 0; i < n; i++ {
  641. selectedEcNodeIndex := -1
  642. for i, candidateEcNode := range candidateEcNodes {
  643. shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
  644. if shardBits > 0 {
  645. selectedEcNodeIndex = i
  646. for _, shardId := range shardBits.ShardIds() {
  647. candidateEcNode.shardCount--
  648. picked[shardId] = candidateEcNode.ecNode
  649. candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
  650. break
  651. }
  652. break
  653. }
  654. }
  655. if selectedEcNodeIndex >= 0 {
  656. ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
  657. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  658. })
  659. }
  660. }
  661. return picked
  662. }
  663. func collectVolumeIdToEcNodes(allEcNodes []*EcNode, collection string) map[needle.VolumeId][]*EcNode {
  664. vidLocations := make(map[needle.VolumeId][]*EcNode)
  665. for _, ecNode := range allEcNodes {
  666. diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
  667. if !found {
  668. continue
  669. }
  670. for _, shardInfo := range diskInfo.EcShardInfos {
  671. // ignore if not in current collection
  672. if shardInfo.Collection == collection {
  673. vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
  674. }
  675. }
  676. }
  677. return vidLocations
  678. }
  679. // TODO: EC volumes have no replica placement info :( Maybe rely on the master's default?
  680. func volumeIdToReplicaPlacement(vid needle.VolumeId, nodes []*EcNode) (*super_block.ReplicaPlacement, error) {
  681. for _, ecNode := range nodes {
  682. for _, diskInfo := range ecNode.info.DiskInfos {
  683. for _, volumeInfo := range diskInfo.VolumeInfos {
  684. if needle.VolumeId(volumeInfo.Id) != vid {
  685. continue
  686. }
  687. return super_block.NewReplicaPlacementFromByte(byte(volumeInfo.ReplicaPlacement))
  688. }
  689. }
  690. }
  691. return nil, fmt.Errorf("failed to resolve replica placement for volume ID %d", vid)
  692. }
  693. func getDefaultReplicaPlacement(commandEnv *CommandEnv) (*super_block.ReplicaPlacement, error) {
  694. var resp *master_pb.GetMasterConfigurationResponse
  695. var err error
  696. err = commandEnv.MasterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
  697. resp, err = client.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{})
  698. return err
  699. })
  700. if err != nil {
  701. return nil, err
  702. }
  703. return super_block.NewReplicaPlacementFromString(resp.DefaultReplication)
  704. }
  705. func EcBalance(commandEnv *CommandEnv, collections []string, dc string, applyBalancing bool) (err error) {
  706. if len(collections) == 0 {
  707. return fmt.Errorf("no collections to balance")
  708. }
  709. // collect all ec nodes
  710. allEcNodes, totalFreeEcSlots, err := collectEcNodes(commandEnv, dc)
  711. if err != nil {
  712. return err
  713. }
  714. if totalFreeEcSlots < 1 {
  715. return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
  716. }
  717. racks := collectRacks(allEcNodes)
  718. for _, c := range collections {
  719. if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, applyBalancing); err != nil {
  720. return err
  721. }
  722. }
  723. if err := balanceEcRacks(commandEnv, racks, applyBalancing); err != nil {
  724. return fmt.Errorf("balance ec racks: %v", err)
  725. }
  726. return nil
  727. }