You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

517 lines
17 KiB

  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "io"
  7. "sort"
  8. "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
  9. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  10. )
  11. func init() {
  12. Commands = append(Commands, &commandEcBalance{})
  13. }
  // commandEcBalance implements the "ec.balance" shell command.
  // It carries no fields; all inputs arrive through Do.
  type commandEcBalance struct{}
  16. func (c *commandEcBalance) Name() string {
  17. return "ec.balance"
  18. }
  19. func (c *commandEcBalance) Help() string {
  20. return `balance all ec shards among all racks and volume servers
  21. ec.balance [-c EACH_COLLECTION|<collection_name>] [-force] [-dataCenter <data_center>]
  22. Algorithm:
  23. For each type of volume server (different max volume count limit){
  24. for each collection:
  25. balanceEcVolumes(collectionName)
  26. for each rack:
  27. balanceEcRack(rack)
  28. }
  29. func balanceEcVolumes(collectionName){
  30. for each volume:
  31. doDeduplicateEcShards(volumeId)
  32. tracks rack~shardCount mapping
  33. for each volume:
  34. doBalanceEcShardsAcrossRacks(volumeId)
  35. for each volume:
  36. doBalanceEcShardsWithinRacks(volumeId)
  37. }
  38. // spread ec shards into more racks
  39. func doBalanceEcShardsAcrossRacks(volumeId){
  40. tracks rack~volumeIdShardCount mapping
  41. averageShardsPerEcRack = totalShardNumber / numRacks // totalShardNumber is 14 for now, later could varies for each dc
  42. ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  43. for each ecShardsToMove {
  44. destRack = pickOneRack(rack~shardCount, rack~volumeIdShardCount, averageShardsPerEcRack)
  45. destVolumeServers = volume servers on the destRack
  46. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  47. }
  48. }
  49. func doBalanceEcShardsWithinRacks(volumeId){
  50. racks = collect all racks that the volume id is on
  51. for rack, shards := range racks
  52. doBalanceEcShardsWithinOneRack(volumeId, shards, rack)
  53. }
  54. // move ec shards
  55. func doBalanceEcShardsWithinOneRack(volumeId, shards, rackId){
  56. tracks volumeServer~volumeIdShardCount mapping
  57. averageShardCount = len(shards) / numVolumeServers
  58. volumeServersOverAverage = volume servers with volumeId's ec shard counts > averageShardsPerEcRack
  59. ecShardsToMove = select overflown ec shards from volumeServersOverAverage
  60. for each ecShardsToMove {
  61. destVolumeServer = pickOneVolumeServer(volumeServer~shardCount, volumeServer~volumeIdShardCount, averageShardCount)
  62. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  63. }
  64. }
  65. // move ec shards while keeping shard distribution for the same volume unchanged or more even
  66. func balanceEcRack(rack){
  67. averageShardCount = total shards / numVolumeServers
  68. for hasMovedOneEcShard {
  69. sort all volume servers ordered by the number of local ec shards
  70. pick the volume server A with the lowest number of ec shards x
  71. pick the volume server B with the highest number of ec shards y
  72. if y > averageShardCount and x +1 <= averageShardCount {
  73. if B has a ec shard with volume id v that A does not have {
  74. move one ec shard v from B to A
  75. hasMovedOneEcShard = true
  76. }
  77. }
  78. }
  79. }
  80. `
  81. }
  82. func (c *commandEcBalance) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  83. balanceCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  84. collection := balanceCommand.String("collection", "EACH_COLLECTION", "collection name, or \"EACH_COLLECTION\" for each collection")
  85. dc := balanceCommand.String("dataCenter", "", "only apply the balancing for this dataCenter")
  86. applyBalancing := balanceCommand.Bool("force", false, "apply the balancing plan")
  87. if err = balanceCommand.Parse(args); err != nil {
  88. return nil
  89. }
  90. ctx := context.Background()
  91. // collect all ec nodes
  92. allEcNodes, totalFreeEcSlots, err := collectEcNodes(ctx, commandEnv, *dc)
  93. if err != nil {
  94. return err
  95. }
  96. if totalFreeEcSlots < 1 {
  97. return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
  98. }
  99. racks := collectRacks(allEcNodes)
  100. if *collection == "EACH_COLLECTION" {
  101. collections, err := ListCollectionNames(commandEnv, false, true)
  102. if err != nil {
  103. return err
  104. }
  105. fmt.Printf("balanceEcVolumes collections %+v\n", len(collections))
  106. for _, c := range collections {
  107. fmt.Printf("balanceEcVolumes collection %+v\n", c)
  108. if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, *applyBalancing); err != nil {
  109. return err
  110. }
  111. }
  112. } else {
  113. if err = balanceEcVolumes(commandEnv, *collection, allEcNodes, racks, *applyBalancing); err != nil {
  114. return err
  115. }
  116. }
  117. if err := balanceEcRacks(ctx, commandEnv, racks, *applyBalancing); err != nil {
  118. return fmt.Errorf("balance ec racks: %v", err)
  119. }
  120. return nil
  121. }
  122. func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
  123. // collect racks info
  124. racks := make(map[RackId]*EcRack)
  125. for _, ecNode := range allEcNodes {
  126. if racks[ecNode.rack] == nil {
  127. racks[ecNode.rack] = &EcRack{
  128. ecNodes: make(map[EcNodeId]*EcNode),
  129. }
  130. }
  131. racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
  132. racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
  133. }
  134. return racks
  135. }
  136. func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  137. ctx := context.Background()
  138. fmt.Printf("balanceEcVolumes %s\n", collection)
  139. if err := deleteDuplicatedEcShards(ctx, commandEnv, allEcNodes, collection, applyBalancing); err != nil {
  140. return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
  141. }
  142. if err := balanceEcShardsAcrossRacks(ctx, commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  143. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  144. }
  145. if err := balanceEcShardsWithinRacks(ctx, commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  146. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  147. }
  148. return nil
  149. }
  150. func deleteDuplicatedEcShards(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
  151. // vid => []ecNode
  152. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  153. // deduplicate ec shards
  154. for vid, locations := range vidLocations {
  155. if err := doDeduplicateEcShards(ctx, commandEnv, collection, vid, locations, applyBalancing); err != nil {
  156. return err
  157. }
  158. }
  159. return nil
  160. }
  161. func doDeduplicateEcShards(ctx context.Context, commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {
  162. // check whether this volume has ecNodes that are over average
  163. shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
  164. for _, ecNode := range locations {
  165. shardBits := findEcVolumeShards(ecNode, vid)
  166. for _, shardId := range shardBits.ShardIds() {
  167. shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
  168. }
  169. }
  170. for shardId, ecNodes := range shardToLocations {
  171. if len(ecNodes) <= 1 {
  172. continue
  173. }
  174. sortEcNodes(ecNodes)
  175. fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
  176. if !applyBalancing {
  177. continue
  178. }
  179. duplicatedShardIds := []uint32{uint32(shardId)}
  180. for _, ecNode := range ecNodes[1:] {
  181. if err := unmountEcShards(ctx, commandEnv.option.GrpcDialOption, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  182. return err
  183. }
  184. if err := sourceServerDeleteEcShards(ctx, commandEnv.option.GrpcDialOption, collection, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  185. return err
  186. }
  187. ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
  188. }
  189. }
  190. return nil
  191. }
  192. func balanceEcShardsAcrossRacks(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  193. // collect vid => []ecNode, since previous steps can change the locations
  194. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  195. // spread the ec shards evenly
  196. for vid, locations := range vidLocations {
  197. if err := doBalanceEcShardsAcrossRacks(ctx, commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
  198. return err
  199. }
  200. }
  201. return nil
  202. }
  203. func doBalanceEcShardsAcrossRacks(ctx context.Context, commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  204. // calculate average number of shards an ec rack should have for one volume
  205. averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
  206. // see the volume's shards are in how many racks, and how many in each rack
  207. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  208. shardBits := findEcVolumeShards(ecNode, vid)
  209. return string(ecNode.rack), shardBits.ShardIdCount()
  210. })
  211. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  212. return string(ecNode.rack)
  213. })
  214. // ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  215. ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
  216. for rackId, count := range rackToShardCount {
  217. if count > averageShardsPerEcRack {
  218. possibleEcNodes := rackEcNodesWithVid[rackId]
  219. for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
  220. ecShardsToMove[shardId] = ecNode
  221. }
  222. }
  223. }
  224. for shardId, ecNode := range ecShardsToMove {
  225. rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
  226. var possibleDestinationEcNodes []*EcNode
  227. for _, n := range racks[rackId].ecNodes {
  228. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  229. }
  230. err := pickOneEcNodeAndMoveOneShard(ctx, commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  231. if err != nil {
  232. return err
  233. }
  234. rackToShardCount[string(rackId)] += 1
  235. rackToShardCount[string(ecNode.rack)] -= 1
  236. racks[rackId].freeEcSlot -= 1
  237. racks[ecNode.rack].freeEcSlot += 1
  238. }
  239. return nil
  240. }
  241. func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {
  242. // TODO later may need to add some randomness
  243. for rackId, rack := range rackToEcNodes {
  244. if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
  245. continue
  246. }
  247. if rack.freeEcSlot <= 0 {
  248. continue
  249. }
  250. return rackId
  251. }
  252. return ""
  253. }
  254. func balanceEcShardsWithinRacks(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  255. // collect vid => []ecNode, since previous steps can change the locations
  256. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  257. // spread the ec shards evenly
  258. for vid, locations := range vidLocations {
  259. // see the volume's shards are in how many racks, and how many in each rack
  260. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  261. shardBits := findEcVolumeShards(ecNode, vid)
  262. return string(ecNode.rack), shardBits.ShardIdCount()
  263. })
  264. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  265. return string(ecNode.rack)
  266. })
  267. for rackId, _ := range rackToShardCount {
  268. var possibleDestinationEcNodes []*EcNode
  269. for _, n := range racks[RackId(rackId)].ecNodes {
  270. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  271. }
  272. sourceEcNodes := rackEcNodesWithVid[rackId]
  273. averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
  274. if err := doBalanceEcShardsWithinOneRack(ctx, commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
  275. return err
  276. }
  277. }
  278. }
  279. return nil
  280. }
  281. func doBalanceEcShardsWithinOneRack(ctx context.Context, commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  282. for _, ecNode := range existingLocations {
  283. shardBits := findEcVolumeShards(ecNode, vid)
  284. overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode
  285. for _, shardId := range shardBits.ShardIds() {
  286. if overLimitCount <= 0 {
  287. break
  288. }
  289. fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
  290. err := pickOneEcNodeAndMoveOneShard(ctx, commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  291. if err != nil {
  292. return err
  293. }
  294. overLimitCount--
  295. }
  296. }
  297. return nil
  298. }
  299. func balanceEcRacks(ctx context.Context, commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {
  300. // balance one rack for all ec shards
  301. for _, ecRack := range racks {
  302. if err := doBalanceEcRack(ctx, commandEnv, ecRack, applyBalancing); err != nil {
  303. return err
  304. }
  305. }
  306. return nil
  307. }
  308. func doBalanceEcRack(ctx context.Context, commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {
  309. if len(ecRack.ecNodes) <= 1 {
  310. return nil
  311. }
  312. var rackEcNodes []*EcNode
  313. for _, node := range ecRack.ecNodes {
  314. rackEcNodes = append(rackEcNodes, node)
  315. }
  316. ecNodeIdToShardCount := groupByCount(rackEcNodes, func(node *EcNode) (id string, count int) {
  317. for _, ecShardInfo := range node.info.EcShardInfos {
  318. count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
  319. }
  320. return node.info.Id, count
  321. })
  322. var totalShardCount int
  323. for _, count := range ecNodeIdToShardCount {
  324. totalShardCount += count
  325. }
  326. averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))
  327. hasMove := true
  328. for hasMove {
  329. hasMove = false
  330. sort.Slice(rackEcNodes, func(i, j int) bool {
  331. return rackEcNodes[i].freeEcSlot > rackEcNodes[j].freeEcSlot
  332. })
  333. emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
  334. emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
  335. if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
  336. emptyNodeIds := make(map[uint32]bool)
  337. for _, shards := range emptyNode.info.EcShardInfos {
  338. emptyNodeIds[shards.Id] = true
  339. }
  340. for _, shards := range fullNode.info.EcShardInfos {
  341. if _, found := emptyNodeIds[shards.Id]; !found {
  342. for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
  343. fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
  344. err := moveMountedShardToEcNode(ctx, commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
  345. if err != nil {
  346. return err
  347. }
  348. ecNodeIdToShardCount[emptyNode.info.Id]++
  349. ecNodeIdToShardCount[fullNode.info.Id]--
  350. hasMove = true
  351. break
  352. }
  353. break
  354. }
  355. }
  356. }
  357. }
  358. return nil
  359. }
  360. func pickOneEcNodeAndMoveOneShard(ctx context.Context, commandEnv *CommandEnv, expectedTotalEcShards int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  361. sortEcNodes(possibleDestinationEcNodes)
  362. averageShardsPerEcNode := ceilDivide(expectedTotalEcShards, len(possibleDestinationEcNodes))
  363. for _, destEcNode := range possibleDestinationEcNodes {
  364. if destEcNode.info.Id == existingLocation.info.Id {
  365. continue
  366. }
  367. if destEcNode.freeEcSlot <= 0 {
  368. continue
  369. }
  370. if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
  371. continue
  372. }
  373. fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)
  374. err := moveMountedShardToEcNode(ctx, commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
  375. if err != nil {
  376. return err
  377. }
  378. return nil
  379. }
  380. return nil
  381. }
  382. func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) (map[erasure_coding.ShardId]*EcNode) {
  383. picked := make(map[erasure_coding.ShardId]*EcNode)
  384. var candidateEcNodes []*CandidateEcNode
  385. for _, ecNode := range ecNodes {
  386. shardBits := findEcVolumeShards(ecNode, vid)
  387. if shardBits.ShardIdCount() > 0 {
  388. candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
  389. ecNode: ecNode,
  390. shardCount: shardBits.ShardIdCount(),
  391. })
  392. }
  393. }
  394. sort.Slice(candidateEcNodes, func(i, j int) bool {
  395. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  396. })
  397. for i := 0; i < n; i++ {
  398. selectedEcNodeIndex := -1
  399. for i, candidateEcNode := range candidateEcNodes {
  400. shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
  401. if shardBits > 0 {
  402. selectedEcNodeIndex = i
  403. for _, shardId := range shardBits.ShardIds() {
  404. candidateEcNode.shardCount--
  405. picked[shardId] = candidateEcNode.ecNode
  406. candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
  407. break
  408. }
  409. break
  410. }
  411. }
  412. if selectedEcNodeIndex >= 0 {
  413. ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
  414. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  415. })
  416. }
  417. }
  418. return picked
  419. }
  420. func collectVolumeIdToEcNodes(allEcNodes []*EcNode) map[needle.VolumeId][]*EcNode {
  421. vidLocations := make(map[needle.VolumeId][]*EcNode)
  422. for _, ecNode := range allEcNodes {
  423. for _, shardInfo := range ecNode.info.EcShardInfos {
  424. vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
  425. }
  426. }
  427. return vidLocations
  428. }