535 lines
17 KiB

4 years ago
6 years ago
  1. package shell
  2. import (
  3. "flag"
  4. "fmt"
  5. "github.com/seaweedfs/seaweedfs/weed/pb"
  6. "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
  7. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  8. "github.com/seaweedfs/seaweedfs/weed/storage/types"
  9. "golang.org/x/exp/slices"
  10. "io"
  11. )
  12. func init() {
  13. Commands = append(Commands, &commandEcBalance{})
  14. }
  15. type commandEcBalance struct {
  16. }
  17. func (c *commandEcBalance) Name() string {
  18. return "ec.balance"
  19. }
  20. func (c *commandEcBalance) Help() string {
  21. return `balance all ec shards among all racks and volume servers
  22. ec.balance [-c EACH_COLLECTION|<collection_name>] [-force] [-dataCenter <data_center>]
  23. Algorithm:
  24. func EcBalance() {
  25. for each collection:
  26. balanceEcVolumes(collectionName)
  27. for each rack:
  28. balanceEcRack(rack)
  29. }
  30. func balanceEcVolumes(collectionName){
  31. for each volume:
  32. doDeduplicateEcShards(volumeId)
  33. tracks rack~shardCount mapping
  34. for each volume:
  35. doBalanceEcShardsAcrossRacks(volumeId)
  36. for each volume:
  37. doBalanceEcShardsWithinRacks(volumeId)
  38. }
  39. // spread ec shards into more racks
  40. func doBalanceEcShardsAcrossRacks(volumeId){
  41. tracks rack~volumeIdShardCount mapping
  42. averageShardsPerEcRack = totalShardNumber / numRacks // totalShardNumber is 14 for now, later could varies for each dc
  43. ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  44. for each ecShardsToMove {
  45. destRack = pickOneRack(rack~shardCount, rack~volumeIdShardCount, averageShardsPerEcRack)
  46. destVolumeServers = volume servers on the destRack
  47. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  48. }
  49. }
  50. func doBalanceEcShardsWithinRacks(volumeId){
  51. racks = collect all racks that the volume id is on
  52. for rack, shards := range racks
  53. doBalanceEcShardsWithinOneRack(volumeId, shards, rack)
  54. }
  55. // move ec shards
  56. func doBalanceEcShardsWithinOneRack(volumeId, shards, rackId){
  57. tracks volumeServer~volumeIdShardCount mapping
  58. averageShardCount = len(shards) / numVolumeServers
  59. volumeServersOverAverage = volume servers with volumeId's ec shard counts > averageShardsPerEcRack
  60. ecShardsToMove = select overflown ec shards from volumeServersOverAverage
  61. for each ecShardsToMove {
  62. destVolumeServer = pickOneVolumeServer(volumeServer~shardCount, volumeServer~volumeIdShardCount, averageShardCount)
  63. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  64. }
  65. }
  66. // move ec shards while keeping shard distribution for the same volume unchanged or more even
  67. func balanceEcRack(rack){
  68. averageShardCount = total shards / numVolumeServers
  69. for hasMovedOneEcShard {
  70. sort all volume servers ordered by the number of local ec shards
  71. pick the volume server A with the lowest number of ec shards x
  72. pick the volume server B with the highest number of ec shards y
  73. if y > averageShardCount and x +1 <= averageShardCount {
  74. if B has a ec shard with volume id v that A does not have {
  75. move one ec shard v from B to A
  76. hasMovedOneEcShard = true
  77. }
  78. }
  79. }
  80. }
  81. `
  82. }
  83. func (c *commandEcBalance) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  84. balanceCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  85. collection := balanceCommand.String("collection", "EACH_COLLECTION", "collection name, or \"EACH_COLLECTION\" for each collection")
  86. dc := balanceCommand.String("dataCenter", "", "only apply the balancing for this dataCenter")
  87. applyBalancing := balanceCommand.Bool("force", false, "apply the balancing plan")
  88. if err = balanceCommand.Parse(args); err != nil {
  89. return nil
  90. }
  91. infoAboutSimulationMode(writer, *applyBalancing, "-force")
  92. if err = commandEnv.confirmIsLocked(args); err != nil {
  93. return
  94. }
  95. // collect all ec nodes
  96. allEcNodes, totalFreeEcSlots, err := collectEcNodes(commandEnv, *dc)
  97. if err != nil {
  98. return err
  99. }
  100. if totalFreeEcSlots < 1 {
  101. return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
  102. }
  103. racks := collectRacks(allEcNodes)
  104. if *collection == "EACH_COLLECTION" {
  105. collections, err := ListCollectionNames(commandEnv, false, true)
  106. if err != nil {
  107. return err
  108. }
  109. fmt.Printf("balanceEcVolumes collections %+v\n", len(collections))
  110. for _, c := range collections {
  111. fmt.Printf("balanceEcVolumes collection %+v\n", c)
  112. if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, *applyBalancing); err != nil {
  113. return err
  114. }
  115. }
  116. } else {
  117. if err = balanceEcVolumes(commandEnv, *collection, allEcNodes, racks, *applyBalancing); err != nil {
  118. return err
  119. }
  120. }
  121. if err := balanceEcRacks(commandEnv, racks, *applyBalancing); err != nil {
  122. return fmt.Errorf("balance ec racks: %v", err)
  123. }
  124. return nil
  125. }
  126. func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
  127. // collect racks info
  128. racks := make(map[RackId]*EcRack)
  129. for _, ecNode := range allEcNodes {
  130. if racks[ecNode.rack] == nil {
  131. racks[ecNode.rack] = &EcRack{
  132. ecNodes: make(map[EcNodeId]*EcNode),
  133. }
  134. }
  135. racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
  136. racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
  137. }
  138. return racks
  139. }
  140. func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  141. fmt.Printf("balanceEcVolumes %s\n", collection)
  142. if err := deleteDuplicatedEcShards(commandEnv, allEcNodes, collection, applyBalancing); err != nil {
  143. return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
  144. }
  145. if err := balanceEcShardsAcrossRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  146. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  147. }
  148. if err := balanceEcShardsWithinRacks(commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  149. return fmt.Errorf("balance within racks collection %s ec shards: %v", collection, err)
  150. }
  151. return nil
  152. }
  153. func deleteDuplicatedEcShards(commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
  154. // vid => []ecNode
  155. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  156. // deduplicate ec shards
  157. for vid, locations := range vidLocations {
  158. if err := doDeduplicateEcShards(commandEnv, collection, vid, locations, applyBalancing); err != nil {
  159. return err
  160. }
  161. }
  162. return nil
  163. }
  164. func doDeduplicateEcShards(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {
  165. // check whether this volume has ecNodes that are over average
  166. shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
  167. for _, ecNode := range locations {
  168. shardBits := findEcVolumeShards(ecNode, vid)
  169. for _, shardId := range shardBits.ShardIds() {
  170. shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
  171. }
  172. }
  173. for shardId, ecNodes := range shardToLocations {
  174. if len(ecNodes) <= 1 {
  175. continue
  176. }
  177. sortEcNodesByFreeslotsAscending(ecNodes)
  178. fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
  179. if !applyBalancing {
  180. continue
  181. }
  182. duplicatedShardIds := []uint32{uint32(shardId)}
  183. for _, ecNode := range ecNodes[1:] {
  184. if err := unmountEcShards(commandEnv.option.GrpcDialOption, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
  185. return err
  186. }
  187. if err := sourceServerDeleteEcShards(commandEnv.option.GrpcDialOption, collection, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil {
  188. return err
  189. }
  190. ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
  191. }
  192. }
  193. return nil
  194. }
  195. func balanceEcShardsAcrossRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  196. // collect vid => []ecNode, since previous steps can change the locations
  197. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  198. // spread the ec shards evenly
  199. for vid, locations := range vidLocations {
  200. if err := doBalanceEcShardsAcrossRacks(commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
  201. return err
  202. }
  203. }
  204. return nil
  205. }
  206. func doBalanceEcShardsAcrossRacks(commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  207. // calculate average number of shards an ec rack should have for one volume
  208. averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
  209. // see the volume's shards are in how many racks, and how many in each rack
  210. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  211. shardBits := findEcVolumeShards(ecNode, vid)
  212. return string(ecNode.rack), shardBits.ShardIdCount()
  213. })
  214. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  215. return string(ecNode.rack)
  216. })
  217. // ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  218. ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
  219. for rackId, count := range rackToShardCount {
  220. if count > averageShardsPerEcRack {
  221. possibleEcNodes := rackEcNodesWithVid[rackId]
  222. for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
  223. ecShardsToMove[shardId] = ecNode
  224. }
  225. }
  226. }
  227. for shardId, ecNode := range ecShardsToMove {
  228. rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
  229. if rackId == "" {
  230. fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
  231. continue
  232. }
  233. var possibleDestinationEcNodes []*EcNode
  234. for _, n := range racks[rackId].ecNodes {
  235. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  236. }
  237. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  238. if err != nil {
  239. return err
  240. }
  241. rackToShardCount[string(rackId)] += 1
  242. rackToShardCount[string(ecNode.rack)] -= 1
  243. racks[rackId].freeEcSlot -= 1
  244. racks[ecNode.rack].freeEcSlot += 1
  245. }
  246. return nil
  247. }
  248. func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {
  249. // TODO later may need to add some randomness
  250. for rackId, rack := range rackToEcNodes {
  251. if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
  252. continue
  253. }
  254. if rack.freeEcSlot <= 0 {
  255. continue
  256. }
  257. return rackId
  258. }
  259. return ""
  260. }
  261. func balanceEcShardsWithinRacks(commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  262. // collect vid => []ecNode, since previous steps can change the locations
  263. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  264. // spread the ec shards evenly
  265. for vid, locations := range vidLocations {
  266. // see the volume's shards are in how many racks, and how many in each rack
  267. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  268. shardBits := findEcVolumeShards(ecNode, vid)
  269. return string(ecNode.rack), shardBits.ShardIdCount()
  270. })
  271. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  272. return string(ecNode.rack)
  273. })
  274. for rackId, _ := range rackToShardCount {
  275. var possibleDestinationEcNodes []*EcNode
  276. for _, n := range racks[RackId(rackId)].ecNodes {
  277. if _, found := n.info.DiskInfos[string(types.HardDriveType)]; found {
  278. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  279. }
  280. }
  281. sourceEcNodes := rackEcNodesWithVid[rackId]
  282. averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
  283. if err := doBalanceEcShardsWithinOneRack(commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
  284. return err
  285. }
  286. }
  287. }
  288. return nil
  289. }
  290. func doBalanceEcShardsWithinOneRack(commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  291. for _, ecNode := range existingLocations {
  292. shardBits := findEcVolumeShards(ecNode, vid)
  293. overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode
  294. for _, shardId := range shardBits.ShardIds() {
  295. if overLimitCount <= 0 {
  296. break
  297. }
  298. fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
  299. err := pickOneEcNodeAndMoveOneShard(commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  300. if err != nil {
  301. return err
  302. }
  303. overLimitCount--
  304. }
  305. }
  306. return nil
  307. }
  308. func balanceEcRacks(commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {
  309. // balance one rack for all ec shards
  310. for _, ecRack := range racks {
  311. if err := doBalanceEcRack(commandEnv, ecRack, applyBalancing); err != nil {
  312. return err
  313. }
  314. }
  315. return nil
  316. }
  317. func doBalanceEcRack(commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {
  318. if len(ecRack.ecNodes) <= 1 {
  319. return nil
  320. }
  321. var rackEcNodes []*EcNode
  322. for _, node := range ecRack.ecNodes {
  323. rackEcNodes = append(rackEcNodes, node)
  324. }
  325. ecNodeIdToShardCount := groupByCount(rackEcNodes, func(ecNode *EcNode) (id string, count int) {
  326. diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
  327. if !found {
  328. return
  329. }
  330. for _, ecShardInfo := range diskInfo.EcShardInfos {
  331. count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
  332. }
  333. return ecNode.info.Id, count
  334. })
  335. var totalShardCount int
  336. for _, count := range ecNodeIdToShardCount {
  337. totalShardCount += count
  338. }
  339. averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))
  340. hasMove := true
  341. for hasMove {
  342. hasMove = false
  343. slices.SortFunc(rackEcNodes, func(a, b *EcNode) int {
  344. return b.freeEcSlot - a.freeEcSlot
  345. })
  346. emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
  347. emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
  348. if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
  349. emptyNodeIds := make(map[uint32]bool)
  350. if emptyDiskInfo, found := emptyNode.info.DiskInfos[string(types.HardDriveType)]; found {
  351. for _, shards := range emptyDiskInfo.EcShardInfos {
  352. emptyNodeIds[shards.Id] = true
  353. }
  354. }
  355. if fullDiskInfo, found := fullNode.info.DiskInfos[string(types.HardDriveType)]; found {
  356. for _, shards := range fullDiskInfo.EcShardInfos {
  357. if _, found := emptyNodeIds[shards.Id]; !found {
  358. for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
  359. fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
  360. err := moveMountedShardToEcNode(commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
  361. if err != nil {
  362. return err
  363. }
  364. ecNodeIdToShardCount[emptyNode.info.Id]++
  365. ecNodeIdToShardCount[fullNode.info.Id]--
  366. hasMove = true
  367. break
  368. }
  369. break
  370. }
  371. }
  372. }
  373. }
  374. }
  375. return nil
  376. }
  377. func pickOneEcNodeAndMoveOneShard(commandEnv *CommandEnv, averageShardsPerEcNode int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  378. sortEcNodesByFreeslotsDescending(possibleDestinationEcNodes)
  379. for _, destEcNode := range possibleDestinationEcNodes {
  380. if destEcNode.info.Id == existingLocation.info.Id {
  381. continue
  382. }
  383. if destEcNode.freeEcSlot <= 0 {
  384. continue
  385. }
  386. if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
  387. continue
  388. }
  389. fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)
  390. err := moveMountedShardToEcNode(commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
  391. if err != nil {
  392. return err
  393. }
  394. return nil
  395. }
  396. return nil
  397. }
  398. func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
  399. picked := make(map[erasure_coding.ShardId]*EcNode)
  400. var candidateEcNodes []*CandidateEcNode
  401. for _, ecNode := range ecNodes {
  402. shardBits := findEcVolumeShards(ecNode, vid)
  403. if shardBits.ShardIdCount() > 0 {
  404. candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
  405. ecNode: ecNode,
  406. shardCount: shardBits.ShardIdCount(),
  407. })
  408. }
  409. }
  410. slices.SortFunc(candidateEcNodes, func(a, b *CandidateEcNode) int {
  411. return b.shardCount - a.shardCount
  412. })
  413. for i := 0; i < n; i++ {
  414. selectedEcNodeIndex := -1
  415. for i, candidateEcNode := range candidateEcNodes {
  416. shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
  417. if shardBits > 0 {
  418. selectedEcNodeIndex = i
  419. for _, shardId := range shardBits.ShardIds() {
  420. candidateEcNode.shardCount--
  421. picked[shardId] = candidateEcNode.ecNode
  422. candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
  423. break
  424. }
  425. break
  426. }
  427. }
  428. if selectedEcNodeIndex >= 0 {
  429. ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
  430. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  431. })
  432. }
  433. }
  434. return picked
  435. }
  436. func collectVolumeIdToEcNodes(allEcNodes []*EcNode) map[needle.VolumeId][]*EcNode {
  437. vidLocations := make(map[needle.VolumeId][]*EcNode)
  438. for _, ecNode := range allEcNodes {
  439. diskInfo, found := ecNode.info.DiskInfos[string(types.HardDriveType)]
  440. if !found {
  441. continue
  442. }
  443. for _, shardInfo := range diskInfo.EcShardInfos {
  444. vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
  445. }
  446. }
  447. return vidLocations
  448. }