You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

521 lines
17 KiB

6 years ago
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "io"
  7. "sort"
  8. "github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
  9. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  10. )
  11. func init() {
  12. Commands = append(Commands, &commandEcBalance{})
  13. }
  // commandEcBalance is the receiver type for the "ec.balance" shell command.
  // It has no fields.
  type commandEcBalance struct{}
  16. func (c *commandEcBalance) Name() string {
  17. return "ec.balance"
  18. }
  19. func (c *commandEcBalance) Help() string {
  20. return `balance all ec shards among all racks and volume servers
  21. ec.balance [-c EACH_COLLECTION|<collection_name>] [-force] [-dataCenter <data_center>]
  22. Algorithm:
  23. For each type of volume server (different max volume count limit){
  24. for each collection:
  25. balanceEcVolumes(collectionName)
  26. for each rack:
  27. balanceEcRack(rack)
  28. }
  29. func balanceEcVolumes(collectionName){
  30. for each volume:
  31. doDeduplicateEcShards(volumeId)
  32. tracks rack~shardCount mapping
  33. for each volume:
  34. doBalanceEcShardsAcrossRacks(volumeId)
  35. for each volume:
  36. doBalanceEcShardsWithinRacks(volumeId)
  37. }
  38. // spread ec shards into more racks
  39. func doBalanceEcShardsAcrossRacks(volumeId){
  40. tracks rack~volumeIdShardCount mapping
  41. averageShardsPerEcRack = totalShardNumber / numRacks // totalShardNumber is 14 for now, later could varies for each dc
  42. ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  43. for each ecShardsToMove {
  44. destRack = pickOneRack(rack~shardCount, rack~volumeIdShardCount, averageShardsPerEcRack)
  45. destVolumeServers = volume servers on the destRack
  46. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  47. }
  48. }
  49. func doBalanceEcShardsWithinRacks(volumeId){
  50. racks = collect all racks that the volume id is on
  51. for rack, shards := range racks
  52. doBalanceEcShardsWithinOneRack(volumeId, shards, rack)
  53. }
  54. // move ec shards
  55. func doBalanceEcShardsWithinOneRack(volumeId, shards, rackId){
  56. tracks volumeServer~volumeIdShardCount mapping
  57. averageShardCount = len(shards) / numVolumeServers
  58. volumeServersOverAverage = volume servers with volumeId's ec shard counts > averageShardsPerEcRack
  59. ecShardsToMove = select overflown ec shards from volumeServersOverAverage
  60. for each ecShardsToMove {
  61. destVolumeServer = pickOneVolumeServer(volumeServer~shardCount, volumeServer~volumeIdShardCount, averageShardCount)
  62. pickOneEcNodeAndMoveOneShard(destVolumeServers)
  63. }
  64. }
  65. // move ec shards while keeping shard distribution for the same volume unchanged or more even
  66. func balanceEcRack(rack){
  67. averageShardCount = total shards / numVolumeServers
  68. for hasMovedOneEcShard {
  69. sort all volume servers ordered by the number of local ec shards
  70. pick the volume server A with the lowest number of ec shards x
  71. pick the volume server B with the highest number of ec shards y
  72. if y > averageShardCount and x +1 <= averageShardCount {
  73. if B has a ec shard with volume id v that A does not have {
  74. move one ec shard v from B to A
  75. hasMovedOneEcShard = true
  76. }
  77. }
  78. }
  79. }
  80. `
  81. }
  82. func (c *commandEcBalance) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  83. balanceCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  84. collection := balanceCommand.String("collection", "EACH_COLLECTION", "collection name, or \"EACH_COLLECTION\" for each collection")
  85. dc := balanceCommand.String("dataCenter", "", "only apply the balancing for this dataCenter")
  86. applyBalancing := balanceCommand.Bool("force", false, "apply the balancing plan")
  87. if err = balanceCommand.Parse(args); err != nil {
  88. return nil
  89. }
  90. ctx := context.Background()
  91. // collect all ec nodes
  92. allEcNodes, totalFreeEcSlots, err := collectEcNodes(ctx, commandEnv, *dc)
  93. if err != nil {
  94. return err
  95. }
  96. if totalFreeEcSlots < 1 {
  97. return fmt.Errorf("no free ec shard slots. only %d left", totalFreeEcSlots)
  98. }
  99. racks := collectRacks(allEcNodes)
  100. if *collection == "EACH_COLLECTION" {
  101. collections, err := ListCollectionNames(commandEnv, false, true)
  102. if err != nil {
  103. return err
  104. }
  105. fmt.Printf("balanceEcVolumes collections %+v\n", len(collections))
  106. for _, c := range collections {
  107. fmt.Printf("balanceEcVolumes collection %+v\n", c)
  108. if err = balanceEcVolumes(commandEnv, c, allEcNodes, racks, *applyBalancing); err != nil {
  109. return err
  110. }
  111. }
  112. } else {
  113. if err = balanceEcVolumes(commandEnv, *collection, allEcNodes, racks, *applyBalancing); err != nil {
  114. return err
  115. }
  116. }
  117. if err := balanceEcRacks(ctx, commandEnv, racks, *applyBalancing); err != nil {
  118. return fmt.Errorf("balance ec racks: %v", err)
  119. }
  120. return nil
  121. }
  122. func collectRacks(allEcNodes []*EcNode) map[RackId]*EcRack {
  123. // collect racks info
  124. racks := make(map[RackId]*EcRack)
  125. for _, ecNode := range allEcNodes {
  126. if racks[ecNode.rack] == nil {
  127. racks[ecNode.rack] = &EcRack{
  128. ecNodes: make(map[EcNodeId]*EcNode),
  129. }
  130. }
  131. racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode
  132. racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot
  133. }
  134. return racks
  135. }
  136. func balanceEcVolumes(commandEnv *CommandEnv, collection string, allEcNodes []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  137. ctx := context.Background()
  138. fmt.Printf("balanceEcVolumes %s\n", collection)
  139. if err := deleteDuplicatedEcShards(ctx, commandEnv, allEcNodes, collection, applyBalancing); err != nil {
  140. return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err)
  141. }
  142. if err := balanceEcShardsAcrossRacks(ctx, commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  143. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  144. }
  145. if err := balanceEcShardsWithinRacks(ctx, commandEnv, allEcNodes, racks, collection, applyBalancing); err != nil {
  146. return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err)
  147. }
  148. return nil
  149. }
  150. func deleteDuplicatedEcShards(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, collection string, applyBalancing bool) error {
  151. // vid => []ecNode
  152. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  153. // deduplicate ec shards
  154. for vid, locations := range vidLocations {
  155. if err := doDeduplicateEcShards(ctx, commandEnv, collection, vid, locations, applyBalancing); err != nil {
  156. return err
  157. }
  158. }
  159. return nil
  160. }
  161. func doDeduplicateEcShards(ctx context.Context, commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, applyBalancing bool) error {
  162. // check whether this volume has ecNodes that are over average
  163. shardToLocations := make([][]*EcNode, erasure_coding.TotalShardsCount)
  164. for _, ecNode := range locations {
  165. shardBits := findEcVolumeShards(ecNode, vid)
  166. for _, shardId := range shardBits.ShardIds() {
  167. shardToLocations[shardId] = append(shardToLocations[shardId], ecNode)
  168. }
  169. }
  170. for shardId, ecNodes := range shardToLocations {
  171. if len(ecNodes) <= 1 {
  172. continue
  173. }
  174. sortEcNodes(ecNodes)
  175. fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id)
  176. if !applyBalancing {
  177. continue
  178. }
  179. duplicatedShardIds := []uint32{uint32(shardId)}
  180. for _, ecNode := range ecNodes[1:] {
  181. if err := unmountEcShards(ctx, commandEnv.option.GrpcDialOption, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  182. return err
  183. }
  184. if err := sourceServerDeleteEcShards(ctx, commandEnv.option.GrpcDialOption, collection, vid, ecNode.info.Id, duplicatedShardIds); err != nil {
  185. return err
  186. }
  187. ecNode.deleteEcVolumeShards(vid, duplicatedShardIds)
  188. }
  189. }
  190. return nil
  191. }
  192. func balanceEcShardsAcrossRacks(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  193. // collect vid => []ecNode, since previous steps can change the locations
  194. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  195. // spread the ec shards evenly
  196. for vid, locations := range vidLocations {
  197. if err := doBalanceEcShardsAcrossRacks(ctx, commandEnv, collection, vid, locations, racks, applyBalancing); err != nil {
  198. return err
  199. }
  200. }
  201. return nil
  202. }
  203. func doBalanceEcShardsAcrossRacks(ctx context.Context, commandEnv *CommandEnv, collection string, vid needle.VolumeId, locations []*EcNode, racks map[RackId]*EcRack, applyBalancing bool) error {
  204. // calculate average number of shards an ec rack should have for one volume
  205. averageShardsPerEcRack := ceilDivide(erasure_coding.TotalShardsCount, len(racks))
  206. // see the volume's shards are in how many racks, and how many in each rack
  207. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  208. shardBits := findEcVolumeShards(ecNode, vid)
  209. return string(ecNode.rack), shardBits.ShardIdCount()
  210. })
  211. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  212. return string(ecNode.rack)
  213. })
  214. // ecShardsToMove = select overflown ec shards from racks with ec shard counts > averageShardsPerEcRack
  215. ecShardsToMove := make(map[erasure_coding.ShardId]*EcNode)
  216. for rackId, count := range rackToShardCount {
  217. if count > averageShardsPerEcRack {
  218. possibleEcNodes := rackEcNodesWithVid[rackId]
  219. for shardId, ecNode := range pickNEcShardsToMoveFrom(possibleEcNodes, vid, count-averageShardsPerEcRack) {
  220. ecShardsToMove[shardId] = ecNode
  221. }
  222. }
  223. }
  224. for shardId, ecNode := range ecShardsToMove {
  225. rackId := pickOneRack(racks, rackToShardCount, averageShardsPerEcRack)
  226. if rackId == "" {
  227. fmt.Printf("ec shard %d.%d at %s can not find a destination rack\n", vid, shardId, ecNode.info.Id)
  228. continue
  229. }
  230. var possibleDestinationEcNodes []*EcNode
  231. for _, n := range racks[rackId].ecNodes {
  232. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  233. }
  234. err := pickOneEcNodeAndMoveOneShard(ctx, commandEnv, averageShardsPerEcRack, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  235. if err != nil {
  236. return err
  237. }
  238. rackToShardCount[string(rackId)] += 1
  239. rackToShardCount[string(ecNode.rack)] -= 1
  240. racks[rackId].freeEcSlot -= 1
  241. racks[ecNode.rack].freeEcSlot += 1
  242. }
  243. return nil
  244. }
  245. func pickOneRack(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int, averageShardsPerEcRack int) RackId {
  246. // TODO later may need to add some randomness
  247. for rackId, rack := range rackToEcNodes {
  248. if rackToShardCount[string(rackId)] >= averageShardsPerEcRack {
  249. continue
  250. }
  251. if rack.freeEcSlot <= 0 {
  252. continue
  253. }
  254. return rackId
  255. }
  256. return ""
  257. }
  258. func balanceEcShardsWithinRacks(ctx context.Context, commandEnv *CommandEnv, allEcNodes []*EcNode, racks map[RackId]*EcRack, collection string, applyBalancing bool) error {
  259. // collect vid => []ecNode, since previous steps can change the locations
  260. vidLocations := collectVolumeIdToEcNodes(allEcNodes)
  261. // spread the ec shards evenly
  262. for vid, locations := range vidLocations {
  263. // see the volume's shards are in how many racks, and how many in each rack
  264. rackToShardCount := groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
  265. shardBits := findEcVolumeShards(ecNode, vid)
  266. return string(ecNode.rack), shardBits.ShardIdCount()
  267. })
  268. rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
  269. return string(ecNode.rack)
  270. })
  271. for rackId, _ := range rackToShardCount {
  272. var possibleDestinationEcNodes []*EcNode
  273. for _, n := range racks[RackId(rackId)].ecNodes {
  274. possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
  275. }
  276. sourceEcNodes := rackEcNodesWithVid[rackId]
  277. averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
  278. if err := doBalanceEcShardsWithinOneRack(ctx, commandEnv, averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes, applyBalancing); err != nil {
  279. return err
  280. }
  281. }
  282. }
  283. return nil
  284. }
  285. func doBalanceEcShardsWithinOneRack(ctx context.Context, commandEnv *CommandEnv, averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  286. for _, ecNode := range existingLocations {
  287. shardBits := findEcVolumeShards(ecNode, vid)
  288. overLimitCount := shardBits.ShardIdCount() - averageShardsPerEcNode
  289. for _, shardId := range shardBits.ShardIds() {
  290. if overLimitCount <= 0 {
  291. break
  292. }
  293. fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
  294. err := pickOneEcNodeAndMoveOneShard(ctx, commandEnv, averageShardsPerEcNode, ecNode, collection, vid, shardId, possibleDestinationEcNodes, applyBalancing)
  295. if err != nil {
  296. return err
  297. }
  298. overLimitCount--
  299. }
  300. }
  301. return nil
  302. }
  303. func balanceEcRacks(ctx context.Context, commandEnv *CommandEnv, racks map[RackId]*EcRack, applyBalancing bool) error {
  304. // balance one rack for all ec shards
  305. for _, ecRack := range racks {
  306. if err := doBalanceEcRack(ctx, commandEnv, ecRack, applyBalancing); err != nil {
  307. return err
  308. }
  309. }
  310. return nil
  311. }
  312. func doBalanceEcRack(ctx context.Context, commandEnv *CommandEnv, ecRack *EcRack, applyBalancing bool) error {
  313. if len(ecRack.ecNodes) <= 1 {
  314. return nil
  315. }
  316. var rackEcNodes []*EcNode
  317. for _, node := range ecRack.ecNodes {
  318. rackEcNodes = append(rackEcNodes, node)
  319. }
  320. ecNodeIdToShardCount := groupByCount(rackEcNodes, func(node *EcNode) (id string, count int) {
  321. for _, ecShardInfo := range node.info.EcShardInfos {
  322. count += erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIdCount()
  323. }
  324. return node.info.Id, count
  325. })
  326. var totalShardCount int
  327. for _, count := range ecNodeIdToShardCount {
  328. totalShardCount += count
  329. }
  330. averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes))
  331. hasMove := true
  332. for hasMove {
  333. hasMove = false
  334. sort.Slice(rackEcNodes, func(i, j int) bool {
  335. return rackEcNodes[i].freeEcSlot > rackEcNodes[j].freeEcSlot
  336. })
  337. emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
  338. emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
  339. if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
  340. emptyNodeIds := make(map[uint32]bool)
  341. for _, shards := range emptyNode.info.EcShardInfos {
  342. emptyNodeIds[shards.Id] = true
  343. }
  344. for _, shards := range fullNode.info.EcShardInfos {
  345. if _, found := emptyNodeIds[shards.Id]; !found {
  346. for _, shardId := range erasure_coding.ShardBits(shards.EcIndexBits).ShardIds() {
  347. fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
  348. err := moveMountedShardToEcNode(ctx, commandEnv, fullNode, shards.Collection, needle.VolumeId(shards.Id), shardId, emptyNode, applyBalancing)
  349. if err != nil {
  350. return err
  351. }
  352. ecNodeIdToShardCount[emptyNode.info.Id]++
  353. ecNodeIdToShardCount[fullNode.info.Id]--
  354. hasMove = true
  355. break
  356. }
  357. break
  358. }
  359. }
  360. }
  361. }
  362. return nil
  363. }
  364. func pickOneEcNodeAndMoveOneShard(ctx context.Context, commandEnv *CommandEnv, expectedTotalEcShards int, existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode, applyBalancing bool) error {
  365. sortEcNodes(possibleDestinationEcNodes)
  366. averageShardsPerEcNode := ceilDivide(expectedTotalEcShards, len(possibleDestinationEcNodes))
  367. for _, destEcNode := range possibleDestinationEcNodes {
  368. if destEcNode.info.Id == existingLocation.info.Id {
  369. continue
  370. }
  371. if destEcNode.freeEcSlot <= 0 {
  372. continue
  373. }
  374. if findEcVolumeShards(destEcNode, vid).ShardIdCount() >= averageShardsPerEcNode {
  375. continue
  376. }
  377. fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destEcNode.info.Id)
  378. err := moveMountedShardToEcNode(ctx, commandEnv, existingLocation, collection, vid, shardId, destEcNode, applyBalancing)
  379. if err != nil {
  380. return err
  381. }
  382. return nil
  383. }
  384. return nil
  385. }
  386. func pickNEcShardsToMoveFrom(ecNodes []*EcNode, vid needle.VolumeId, n int) map[erasure_coding.ShardId]*EcNode {
  387. picked := make(map[erasure_coding.ShardId]*EcNode)
  388. var candidateEcNodes []*CandidateEcNode
  389. for _, ecNode := range ecNodes {
  390. shardBits := findEcVolumeShards(ecNode, vid)
  391. if shardBits.ShardIdCount() > 0 {
  392. candidateEcNodes = append(candidateEcNodes, &CandidateEcNode{
  393. ecNode: ecNode,
  394. shardCount: shardBits.ShardIdCount(),
  395. })
  396. }
  397. }
  398. sort.Slice(candidateEcNodes, func(i, j int) bool {
  399. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  400. })
  401. for i := 0; i < n; i++ {
  402. selectedEcNodeIndex := -1
  403. for i, candidateEcNode := range candidateEcNodes {
  404. shardBits := findEcVolumeShards(candidateEcNode.ecNode, vid)
  405. if shardBits > 0 {
  406. selectedEcNodeIndex = i
  407. for _, shardId := range shardBits.ShardIds() {
  408. candidateEcNode.shardCount--
  409. picked[shardId] = candidateEcNode.ecNode
  410. candidateEcNode.ecNode.deleteEcVolumeShards(vid, []uint32{uint32(shardId)})
  411. break
  412. }
  413. break
  414. }
  415. }
  416. if selectedEcNodeIndex >= 0 {
  417. ensureSortedEcNodes(candidateEcNodes, selectedEcNodeIndex, func(i, j int) bool {
  418. return candidateEcNodes[i].shardCount > candidateEcNodes[j].shardCount
  419. })
  420. }
  421. }
  422. return picked
  423. }
  424. func collectVolumeIdToEcNodes(allEcNodes []*EcNode) map[needle.VolumeId][]*EcNode {
  425. vidLocations := make(map[needle.VolumeId][]*EcNode)
  426. for _, ecNode := range allEcNodes {
  427. for _, shardInfo := range ecNode.info.EcShardInfos {
  428. vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode)
  429. }
  430. }
  431. return vidLocations
  432. }