package shell

import (
	"context"
	"fmt"
	"math"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"
	"golang.org/x/exp/slices"
	"google.golang.org/grpc"
)
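// moveMountedShardToEcNode moves one mounted EC shard from existingLocation to destinationEcNode.
// It requires the shell lock to be held. When applyBalancing is true it performs the actual
// copy and mount on the destination, then unmounts and deletes the shard on the source;
// otherwise it only updates the in-memory bookkeeping on both EcNodes.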
func moveMountedShardToEcNode(commandEnv *CommandEnv, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destinationEcNode *EcNode, applyBalancing bool) (err error) {

	if !commandEnv.isLocked() {
		return fmt.Errorf("lock is lost")
	}

	copiedShardIds := []uint32{uint32(shardId)}

	if applyBalancing {

		existingServerAddress := pb.NewServerAddressFromDataNode(existingLocation.info)

		// ask destination node to copy shard and the ecx file from source node, and mount it
		copiedShardIds, err = oneServerCopyAndMountEcShardsFromSource(commandEnv.option.GrpcDialOption, destinationEcNode, []uint32{uint32(shardId)}, vid, collection, existingServerAddress)
		if err != nil {
			return err
		}

		// unmount the to-be-deleted shards
		err = unmountEcShards(commandEnv.option.GrpcDialOption, vid, existingServerAddress, copiedShardIds)
		if err != nil {
			return err
		}

		// ask source node to delete the shard, and maybe the ecx file
		err = sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, existingServerAddress, copiedShardIds)
		if err != nil {
			return err
		}

		fmt.Printf("moved ec shard %d.%d %s => %s\n", vid, shardId, existingLocation.info.Id, destinationEcNode.info.Id)
	}

	destinationEcNode.addEcVolumeShards(vid, collection, copiedShardIds)
	existingLocation.deleteEcVolumeShards(vid, copiedShardIds)

	return nil
}
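// oneServerCopyAndMountEcShardsFromSource asks targetServer to copy the given shards (plus the
// .ecx/.ecj/.vif files) from existingLocation and mount them. The copy step is skipped when the
// target is the same server as the source; copiedShardIds is only set when a real copy happened.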
func oneServerCopyAndMountEcShardsFromSource(grpcDialOption grpc.DialOption,
	targetServer *EcNode, shardIdsToCopy []uint32,
	volumeId needle.VolumeId, collection string, existingLocation pb.ServerAddress) (copiedShardIds []uint32, err error) {

	fmt.Printf("allocate %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id)

	targetAddress := pb.NewServerAddressFromDataNode(targetServer.info)
	err = operation.WithVolumeServerClient(false, targetAddress, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {

		if targetAddress != existingLocation {

			fmt.Printf("copy %d.%v %s => %s\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id)
			_, copyErr := volumeServerClient.VolumeEcShardsCopy(context.Background(), &volume_server_pb.VolumeEcShardsCopyRequest{
				VolumeId:       uint32(volumeId),
				Collection:     collection,
				ShardIds:       shardIdsToCopy,
				CopyEcxFile:    true,
				CopyEcjFile:    true,
				CopyVifFile:    true,
				SourceDataNode: string(existingLocation),
			})
			if copyErr != nil {
				return fmt.Errorf("copy %d.%v %s => %s : %v\n", volumeId, shardIdsToCopy, existingLocation, targetServer.info.Id, copyErr)
			}
		}

		fmt.Printf("mount %d.%v on %s\n", volumeId, shardIdsToCopy, targetServer.info.Id)
		_, mountErr := volumeServerClient.VolumeEcShardsMount(context.Background(), &volume_server_pb.VolumeEcShardsMountRequest{
			VolumeId:   uint32(volumeId),
			Collection: collection,
			ShardIds:   shardIdsToCopy,
		})
		if mountErr != nil {
			return fmt.Errorf("mount %d.%v on %s : %v\n", volumeId, shardIdsToCopy, targetServer.info.Id, mountErr)
		}

		if targetAddress != existingLocation {
			copiedShardIds = shardIdsToCopy
			glog.V(0).Infof("%s ec volume %d deletes shards %+v", existingLocation, volumeId, copiedShardIds)
		}

		return nil
	})

	if err != nil {
		return
	}

	return
}
func eachDataNode(topo *master_pb.TopologyInfo, fn func(dc string, rack RackId, dn *master_pb.DataNodeInfo)) {
	for _, dc := range topo.DataCenterInfos {
		for _, rack := range dc.RackInfos {
			for _, dn := range rack.DataNodeInfos {
				fn(dc.Id, RackId(rack.Id), dn)
			}
		}
	}
}
func sortEcNodesByFreeslotsDescending(ecNodes []*EcNode) {
	slices.SortFunc(ecNodes, func(a, b *EcNode) int {
		return b.freeEcSlot - a.freeEcSlot
	})
}

func sortEcNodesByFreeslotsAscending(ecNodes []*EcNode) {
	slices.SortFunc(ecNodes, func(a, b *EcNode) int {
		return a.freeEcSlot - b.freeEcSlot
	})
}
type CandidateEcNode struct {
	ecNode     *EcNode
	shardCount int
}

// ensureSortedEcNodes restores the ordering after the element at index changed its sort key,
// bubbling it toward the front or back until the slice satisfies lessThan again.
func ensureSortedEcNodes(data []*CandidateEcNode, index int, lessThan func(i, j int) bool) {
	for i := index - 1; i >= 0; i-- {
		if lessThan(i+1, i) {
			swap(data, i, i+1)
		} else {
			break
		}
	}
	for i := index + 1; i < len(data); i++ {
		if lessThan(i, i-1) {
			swap(data, i, i-1)
		} else {
			break
		}
	}
}

func swap(data []*CandidateEcNode, i, j int) {
	t := data[i]
	data[i] = data[j]
	data[j] = t
}
func countShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) (count int) {
	for _, ecShardInfo := range ecShardInfos {
		shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
		count += shardBits.ShardIdCount()
	}
	return
}
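// countFreeShardSlots estimates how many more EC shards a data node can hold on the given disk
// type: each unused volume slot counts as erasure_coding.DataShardsCount shard slots, minus the
// EC shards already stored on that disk.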
func countFreeShardSlots(dn *master_pb.DataNodeInfo, diskType types.DiskType) (count int) {
	if dn.DiskInfos == nil {
		return 0
	}
	diskInfo := dn.DiskInfos[string(diskType)]
	if diskInfo == nil {
		return 0
	}
	return int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countShards(diskInfo.EcShardInfos)
}
type RackId string
type EcNodeId string

type EcNode struct {
	info       *master_pb.DataNodeInfo
	dc         string
	rack       RackId
	freeEcSlot int
}

func (ecNode *EcNode) localShardIdCount(vid uint32) int {
	for _, diskInfo := range ecNode.info.DiskInfos {
		for _, ecShardInfo := range diskInfo.EcShardInfos {
			if vid == ecShardInfo.Id {
				shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
				return shardBits.ShardIdCount()
			}
		}
	}
	return 0
}

type EcRack struct {
	ecNodes    map[EcNodeId]*EcNode
	freeEcSlot int
}
func collectEcNodes(commandEnv *CommandEnv, selectedDataCenter string) (ecNodes []*EcNode, totalFreeEcSlots int, err error) {

	// collect topology information
	topologyInfo, _, err := collectTopologyInfo(commandEnv, 0)
	if err != nil {
		return
	}

	// find all volume servers with at least one free slot
	ecNodes, totalFreeEcSlots = collectEcVolumeServersByDc(topologyInfo, selectedDataCenter)

	sortEcNodesByFreeslotsDescending(ecNodes)

	return
}

func collectEcVolumeServersByDc(topo *master_pb.TopologyInfo, selectedDataCenter string) (ecNodes []*EcNode, totalFreeEcSlots int) {
	eachDataNode(topo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
		if selectedDataCenter != "" && selectedDataCenter != dc {
			return
		}

		freeEcSlots := countFreeShardSlots(dn, types.HardDriveType)
		ecNodes = append(ecNodes, &EcNode{
			info:       dn,
			dc:         dc,
			rack:       rack,
			freeEcSlot: int(freeEcSlots),
		})
		totalFreeEcSlots += freeEcSlots
	})
	return
}
func sourceServerDeleteEcShards(grpcDialOption grpc.DialOption, collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeDeletedShardIds []uint32) error {

	fmt.Printf("delete %d.%v from %s\n", volumeId, toBeDeletedShardIds, sourceLocation)

	return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		_, deleteErr := volumeServerClient.VolumeEcShardsDelete(context.Background(), &volume_server_pb.VolumeEcShardsDeleteRequest{
			VolumeId:   uint32(volumeId),
			Collection: collection,
			ShardIds:   toBeDeletedShardIds,
		})
		return deleteErr
	})
}

func unmountEcShards(grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeUnmountedhardIds []uint32) error {

	fmt.Printf("unmount %d.%v from %s\n", volumeId, toBeUnmountedhardIds, sourceLocation)

	return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		_, deleteErr := volumeServerClient.VolumeEcShardsUnmount(context.Background(), &volume_server_pb.VolumeEcShardsUnmountRequest{
			VolumeId: uint32(volumeId),
			ShardIds: toBeUnmountedhardIds,
		})
		return deleteErr
	})
}

func mountEcShards(grpcDialOption grpc.DialOption, collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress, toBeMountedhardIds []uint32) error {

	fmt.Printf("mount %d.%v on %s\n", volumeId, toBeMountedhardIds, sourceLocation)

	return operation.WithVolumeServerClient(false, sourceLocation, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		_, mountErr := volumeServerClient.VolumeEcShardsMount(context.Background(), &volume_server_pb.VolumeEcShardsMountRequest{
			VolumeId:   uint32(volumeId),
			Collection: collection,
			ShardIds:   toBeMountedhardIds,
		})
		return mountErr
	})
}
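// ceilDivide returns total/n rounded up; it is used below to compute the target (average) shard
// count per rack or per node when spreading shards evenly.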
func ceilDivide(total, n int) int {
	return int(math.Ceil(float64(total) / float64(n)))
}

func findEcVolumeShards(ecNode *EcNode, vid needle.VolumeId) erasure_coding.ShardBits {

	if diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]; found {
		for _, shardInfo := range diskInfo.EcShardInfos {
			if needle.VolumeId(shardInfo.Id) == vid {
				return erasure_coding.ShardBits(shardInfo.EcIndexBits)
			}
		}
	}

	return 0
}
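// addEcVolumeShards records the given shards of volume vid on this node's hard-drive DiskInfo,
// updating the EcIndexBits bitmap and decreasing freeEcSlot accordingly. deleteEcVolumeShards is
// its bookkeeping inverse; neither touches the actual volume server.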
func (ecNode *EcNode) addEcVolumeShards(vid needle.VolumeId, collection string, shardIds []uint32) *EcNode {

	foundVolume := false
	diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
	if found {
		for _, shardInfo := range diskInfo.EcShardInfos {
			if needle.VolumeId(shardInfo.Id) == vid {
				oldShardBits := erasure_coding.ShardBits(shardInfo.EcIndexBits)
				newShardBits := oldShardBits
				for _, shardId := range shardIds {
					newShardBits = newShardBits.AddShardId(erasure_coding.ShardId(shardId))
				}
				shardInfo.EcIndexBits = uint32(newShardBits)
				ecNode.freeEcSlot -= newShardBits.ShardIdCount() - oldShardBits.ShardIdCount()
				foundVolume = true
				break
			}
		}
	} else {
		diskInfo = &master_pb.DiskInfo{
			Type: string(types.HardDriveType),
		}
		ecNode.info.DiskInfos[string(types.HardDriveType)] = diskInfo
	}

	if !foundVolume {
		var newShardBits erasure_coding.ShardBits
		for _, shardId := range shardIds {
			newShardBits = newShardBits.AddShardId(erasure_coding.ShardId(shardId))
		}
		diskInfo.EcShardInfos = append(diskInfo.EcShardInfos, &master_pb.VolumeEcShardInformationMessage{
			Id:          uint32(vid),
			Collection:  collection,
			EcIndexBits: uint32(newShardBits),
			DiskType:    string(types.HardDriveType),
		})
		ecNode.freeEcSlot -= len(shardIds)
	}

	return ecNode
}

func (ecNode *EcNode) deleteEcVolumeShards(vid needle.VolumeId, shardIds []uint32) *EcNode {

	if diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]; found {
		for _, shardInfo := range diskInfo.EcShardInfos {
			if needle.VolumeId(shardInfo.Id) == vid {
				oldShardBits := erasure_coding.ShardBits(shardInfo.EcIndexBits)
				newShardBits := oldShardBits
				for _, shardId := range shardIds {
					newShardBits = newShardBits.RemoveShardId(erasure_coding.ShardId(shardId))
				}
				shardInfo.EcIndexBits = uint32(newShardBits)
				ecNode.freeEcSlot -= newShardBits.ShardIdCount() - oldShardBits.ShardIdCount()
			}
		}
	}

	return ecNode
}
func groupByCount(data []*EcNode, identifierFn func(*EcNode) (id string, count int)) map[string]int {
	countMap := make(map[string]int)
	for _, d := range data {
		id, count := identifierFn(d)
		countMap[id] += count
	}
	return countMap
}

func groupBy(data []*EcNode, identifierFn func(*EcNode) (id string)) map[string][]*EcNode {
	groupMap := make(map[string][]*EcNode)
	for _, d := range data {
		id := identifierFn(d)
		groupMap[id] = append(groupMap[id], d)
	}
	return groupMap
}

func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
	// collect racks info
	racks := make(map[RackId]*EcRack)
	for _, ecNode := range allEcNodes {
		if racks[ecNode.rack] == nil {
			racks[ecNode.rack] = &EcRack{
				ecNodes: make(map[EcNodeId]*EcNode),
			}
		}
		racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
		racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
	}
	return racks
}
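// balanceEcVolumes rebalances the EC shards of one collection in three passes: remove duplicated
// shard copies, spread shards evenly across racks, then spread them evenly across nodes within
// each rack.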
func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {

	fmt.Printf("balanceEcVolumes %s\n", collection)

	if err := deleteDuplicatedEcShards(commandEnv, allEcNodes, collection, applyBalancing); err != nil {
		return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
	}

	if err := balanceEcShardsAcrossRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
		return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
	}

	if err := balanceEcShardsWithinRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
		return fmt.Errorf("balance within racks collection %s ec shards: %v", collection, err)
	}

	return nil
}

func deleteDuplicatedEcShards(commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
	// vid => []ecNode
	vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)
	// deduplicate ec shards
	for vid, locations := range vidLocations {
		if err := doDeduplicateEcShards(commandEnv, collection, vid, locations, applyBalancing); err != nil {
			return err
		}
	}
	return nil
}
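// doDeduplicateEcShards finds shards of one EC volume that exist on more than one node, keeps the
// copy on the node with the fewest free slots, and unmounts and deletes the extra copies when
// applyBalancing is true.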
func doDeduplicateEcShards(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {

	// check whether this volume has ecNodes that are over average
	shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
	for _, ecNode := range locations {
		shardBits := findEcVolumeShards(ecNode, vid)
		for _, shardId := range shardBits.ShardIds() {
			shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
		}
	}
	for shardId, ecNodes := range shardToLocations {
		if len(ecNodes) <= 1 {
			continue
		}
		sortEcNodesByFreeslotsAscending(ecNodes)
		fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
		if !applyBalancing {
			continue
		}

		duplicatedShardIds := []uint32{uint32(shardId)}
		for _, ecNode := range ecNodes[1:] {
			if err := unmountEcShards(commandEnv.option.GrpcDialOption, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
				return err
			}
			if err := sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
				return err
			}
			ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
		}
	}
	return nil
}
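// balanceEcShardsAcrossRacks re-collects the per-volume shard locations and, for each volume,
// moves shards out of racks that hold more than the per-rack average into racks that are below
// the average and still have free slots.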
func balanceEcShardsAcrossRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
	// collect vid => []ecNode, since previous steps can change the locations
	vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)
	// spread the ec shards evenly
	for vid, locations := range vidLocations {
		if err := doBalanceEcShardsAcrossRacks(commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
			return err
		}
	}
	return nil
}

func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {

	// calculate average number of shards an ec rack should have for one volume
	averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))

	// count how many of this volume's shards each rack currently holds
	rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
		shardBits := findEcVolumeShards(ecNode, vid)
		return string(ecNode.rack), shardBits.ShardIdCount()
	})
	rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
		return string(ecNode.rack)
	})

	// ecShardsToMove = select overflowing ec shards from racks with ec shard counts > averageShardsPerEcRack
	ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
	for rackId, count := range rackToShardCount {
		if count > averageShardsPerEcRack {
			possibleEcNodes := rackEcNodesWithVid[rackId]
			for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
				ecShardsToMove[shardId] = ecNode
			}
		}
	}

	for shardId, ecNode := range ecShardsToMove {
		rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
		if rackId == "" {
			fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
			continue
		}
		var possibleDestinationEcNodes []*EcNode
		for _, n := range racks[rackId].ecNodes {
			possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
		}
		err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
		if err != nil {
			return err
		}

		rackToShardCount[string(rackId)] += 1
		rackToShardCount[string(ecNode.rack)] -= 1
		racks[rackId].freeEcSlot -= 1
		racks[ecNode.rack].freeEcSlot += 1
	}

	return nil
}
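// pickOneRack returns the first rack that is still below averageShardsPerEcRack for this volume
// and has free EC slots, or "" when no rack qualifies.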
func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {

	// TODO later may need to add some randomness

	for rackId, rack := range rackToEcNodes {
		if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
			continue
		}
		if rack.freeEcSlot <= 0 {
			continue
		}
		return rackId
	}

	return ""
}
func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
	// collect vid => []ecNode, since previous steps can change the locations
	vidLocations := collectVolumeIdToEcNodes(allEcNodes, collection)

	// spread the ec shards evenly
	for vid, locations := range vidLocations {

		// count how many of this volume's shards each rack currently holds
		rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
			shardBits := findEcVolumeShards(ecNode, vid)
			return string(ecNode.rack), shardBits.ShardIdCount()
		})
		rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
			return string(ecNode.rack)
		})

		for rackId := range rackToShardCount {

			var possibleDestinationEcNodes []*EcNode
			for _, n := range racks[RackId(rackId)].ecNodes {
				if _, found := n.info.DiskInfos[string(types.HardDriveType)]; found {
					possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
				}
			}
			sourceEcNodes := rackEcNodesWithVid[rackId]
			averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
			if err := doBalanceEcShardsWithinOneRack(commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
				return err
			}
		}
	}
	return nil
}

func doBalanceEcShardsWithinOneRack(commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {

	for _, ecNode := range existingLocations {

		shardBits := findEcVolumeShards(ecNode, vid)
		overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode

		for _, shardId := range shardBits.ShardIds() {

			if overLimitCount <= 0 {
				break
			}

			fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)

			err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
			if err != nil {
				return err
			}

			overLimitCount--
		}
	}

	return nil
}
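// balanceEcRacks evens out the total EC shard count across the nodes of each rack, regardless of
// which volume or collection the shards belong to. doBalanceEcRack repeatedly moves one shard
// from the fullest node to the emptiest node until no further move is possible.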
func balanceEcRacks(commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {

	// balance one rack for all ec shards
	for _, ecRack := range racks {
		if err := doBalanceEcRack(commandEnv, ecRack, applyBalancing); err != nil {
			return err
		}
	}
	return nil
}

func doBalanceEcRack(commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {

	if len(ecRack.ecNodes) <= 1 {
		return nil
	}

	var rackEcNodes []*EcNode
	for _, node := range ecRack.ecNodes {
		rackEcNodes = append(rackEcNodes, node)
	}

	ecNodeIdToShardCount := groupByCount(rackEcNodes, func(ecNode *EcNode) (id string, count int) {
		diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
		if !found {
			return
		}
		for _, ecShardInfo := range diskInfo.EcShardInfos {
			count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
		}
		return ecNode.info.Id, count
	})

	var totalShardCount int
	for _, count := range ecNodeIdToShardCount {
		totalShardCount += count
	}

	averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))

	hasMove := true
	for hasMove {
		hasMove = false
		slices.SortFunc(rackEcNodes, func(a, b *EcNode) int {
			return b.freeEcSlot - a.freeEcSlot
		})
		emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
		emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
		if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {

			emptyNodeIds := make(map[uint32]bool)
			if emptyDiskInfo, found := emptyNode.info.DiskInfos[string(types.HardDriveType)]; found {
				for _, shards := range emptyDiskInfo.EcShardInfos {
					emptyNodeIds[shards.Id] = true
				}
			}
			if fullDiskInfo, found := fullNode.info.DiskInfos[string(types.HardDriveType)]; found {
				for _, shards := range fullDiskInfo.EcShardInfos {
					if _, found := emptyNodeIds[shards.Id]; !found {
						for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {

							fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)

							err := moveMountedShardToEcNode(commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
							if err != nil {
								return err
							}

							ecNodeIdToShardCount[emptyNode.info.Id]++
							ecNodeIdToShardCount[fullNode.info.Id]--
							hasMove = true
							break
						}
						break
					}
				}
			}
		}
	}

	return nil
}
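// pickOneEcNodeAndMoveOneShard tries the destination candidates in order of most free slots and
// moves the shard to the first node that is not the source, has a free slot, and holds fewer
// than averageShardsPerEcNode shards of this volume; if none qualifies it only prints a warning.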
func pickOneEcNodeAndMoveOneShard(commandEnv *CommandEnv, averageShardsPerEcNode int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {

	sortEcNodesByFreeslotsDescending(possibleDestinationEcNodes)
	skipReason := ""
	for _, destEcNode := range possibleDestinationEcNodes {

		if destEcNode.info.Id == existingLocation.info.Id {
			continue
		}

		if destEcNode.freeEcSlot <= 0 {
			skipReason += fmt.Sprintf("  Skipping %s because it has no free slots\n", destEcNode.info.Id)
			continue
		}
		if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
			skipReason += fmt.Sprintf("  Skipping %s because it %d >= averageShards (%d)\n",
				destEcNode.info.Id, findEcVolumeShards(destEcNode, vid).ShardIdCount(), averageShardsPerEcNode)
			continue
		}

		fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)

		err := moveMountedShardToEcNode(commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
		if err != nil {
			return err
		}

		return nil
	}

	fmt.Printf("WARNING: Could not find suitable target node for %d.%d:\n%s", vid, shardId, skipReason)
	return nil
}
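// pickNEcShardsToMoveFrom picks n shards of volume vid to move away, repeatedly taking one shard
// from the candidate node that currently holds the most shards, so donor nodes are drained evenly.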
func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
	picked := make(map[erasure_coding.ShardId]*EcNode)
	var candidateEcNodes []*CandidateEcNode
	for _, ecNode := range ecNodes {
		shardBits := findEcVolumeShards(ecNode, vid)
		if shardBits.ShardIdCount() > 0 {
			candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
				ecNode:     ecNode,
				shardCount: shardBits.ShardIdCount(),
			})
		}
	}
	slices.SortFunc(candidateEcNodes, func(a, b *CandidateEcNode) int {
		return b.shardCount - a.shardCount
	})
	for i := 0; i < n; i++ {
		selectedEcNodeIndex := -1
		for i, candidateEcNode := range candidateEcNodes {
			shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
			if shardBits > 0 {
				selectedEcNodeIndex = i
				for _, shardId := range shardBits.ShardIds() {
					candidateEcNode.shardCount--
					picked[shardId] = candidateEcNode.ecNode
					candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
					break
				}
				break
			}
		}
		if selectedEcNodeIndex >= 0 {
			ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
				return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
			})
		}
	}
	return picked
}
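// collectVolumeIdToEcNodes builds a volume id => nodes index for one collection from the
// hard-drive DiskInfos, so the balancing passes can look up every location of a volume's shards.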
func collectVolumeIdToEcNodes(allEcNodes []*EcNode, collection string) map[needle.VolumeId][]*EcNode {
	vidLocations := make(map[needle.VolumeId][]*EcNode)
	for _, ecNode := range allEcNodes {
		diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
		if !found {
			continue
		}
		for _, shardInfo := range diskInfo.EcShardInfos {
			// ignore if not in current collection
			if shardInfo.Collection == collection {
				vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
			}
		}
	}
	return vidLocations
}
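// EcBalance collects the EC-capable volume servers (optionally restricted to one data center),
// rebalances each listed collection, and then evens out shard counts across nodes within each
// rack. It requires at least one free EC shard slot cluster-wide.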
func EcBalance(commandEnv *CommandEnv, collections []string, dc string, applyBalancing bool) (err error) {
	if len(collections) == 0 {
		return fmt.Errorf("no collections to balance")
	}

	// collect all ec nodes
	allEcNodes, totalFreeEcSlots, err := collectEcNodes(commandEnv, dc)
	if err != nil {
		return err
	}
	if totalFreeEcSlots < 1 {
		return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
	}

	racks := collectRacks(allEcNodes)
	for _, c := range collections {
		if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, applyBalancing); err != nil {
			return err
		}
	}
	if err := balanceEcRacks(commandEnv, racks, applyBalancing); err != nil {
		return fmt.Errorf("balance ec racks: %v", err)
	}
	return nil
}