package shell

import (
	"context"
	"flag"
	"fmt"
	"io"
	"sort"
	"sync"

	"github.com/chrislusf/seaweedfs/weed/operation"
	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
	"github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
	"github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
	"github.com/chrislusf/seaweedfs/weed/storage/needle"
	"github.com/chrislusf/seaweedfs/weed/wdclient"

	"google.golang.org/grpc"
)
func init() {
	commands = append(commands, &commandEcEncode{})
}

type commandEcEncode struct {
}

func (c *commandEcEncode) Name() string {
	return "ec.encode"
}
func (c *commandEcEncode) Help() string {
	return `apply erasure coding to a volume

	This command will:
	1. freeze one volume
	2. apply erasure coding to the volume
	3. move the encoded shards to multiple volume servers

	The erasure coding is 10.4: each volume becomes 10 data shards plus 4 parity shards.
	So ideally you have more than 14 volume servers, and you can afford to lose 4 volume servers.

	If the number of volume servers is not high, the worst case is that you only have 4 volume servers,
	and the shards are spread as 4,4,3,3 respectively. Then you can afford to lose one volume server.

	If you have fewer than 4 volume servers, erasure coding still lets you tolerate up to
	4 corrupted shard files.
`
}
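// Do parses the volumeId and collection flags, locates the volume on the master,
// asks the first replica's volume server to generate ec shards, and then spreads
// those shards across the cluster.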
func (c *commandEcEncode) Do(args []string, commandEnv *commandEnv, writer io.Writer) (err error) {

	encodeCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	volumeId := encodeCommand.Int("volumeId", 0, "the volume id")
	collection := encodeCommand.String("collection", "", "the collection name")
	if err = encodeCommand.Parse(args); err != nil {
		// the flag set has already reported the parse error; keep the shell running
		return nil
	}

	ctx := context.Background()

	// find volume location
	locations := commandEnv.masterClient.GetLocations(uint32(*volumeId))
	if len(locations) == 0 {
		return fmt.Errorf("volume %d not found", *volumeId)
	}

	// generate ec shards
	err = generateEcShards(ctx, commandEnv.option.GrpcDialOption, needle.VolumeId(*volumeId), *collection, locations[0].Url)
	if err != nil {
		return fmt.Errorf("generate ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
	}

	// balance the ec shards to current cluster
	err = balanceEcShards(ctx, commandEnv, needle.VolumeId(*volumeId), *collection, locations)
	if err != nil {
		return fmt.Errorf("balance ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
	}

	return err
}
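// generateEcShards asks the source volume server to generate the ec shards for the
// given volume via the VolumeEcShardsGenerate rpc.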
func generateEcShards(ctx context.Context, grpcDialOption grpc.DialOption, volumeId needle.VolumeId, collection string, sourceVolumeServer string) error {

	err := operation.WithVolumeServerClient(sourceVolumeServer, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		_, genErr := volumeServerClient.VolumeEcShardsGenerate(ctx, &volume_server_pb.VolumeEcShardsGenerateRequest{
			VolumeId:   uint32(volumeId),
			Collection: collection,
		})
		return genErr
	})

	return err
}
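// balanceEcShards collects the cluster topology from the master, picks volume servers
// with free ec shard slots, copies the generated shards to them, and finally removes
// the copied shards and the original volume from the source servers.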
func balanceEcShards(ctx context.Context, commandEnv *commandEnv, volumeId needle.VolumeId, collection string, existingLocations []wdclient.Location) (err error) {

	// list all possible locations
	var resp *master_pb.VolumeListResponse
	err = commandEnv.masterClient.WithClient(ctx, func(client master_pb.SeaweedClient) error {
		resp, err = client.VolumeList(ctx, &master_pb.VolumeListRequest{})
		return err
	})
	if err != nil {
		return err
	}

	// find out all volume servers with free ec shard slots
	var allDataNodes []*master_pb.DataNodeInfo
	var totalFreeEcSlots uint32
	eachDataNode(resp.TopologyInfo, func(dn *master_pb.DataNodeInfo) {
		if freeEcSlots := countFreeShardSlots(dn); freeEcSlots > 0 {
			allDataNodes = append(allDataNodes, dn)
			totalFreeEcSlots += freeEcSlots
		}
	})
	if totalFreeEcSlots < erasure_coding.TotalShardsCount {
		return fmt.Errorf("not enough free ec shard slots. only %d left", totalFreeEcSlots)
	}
	// prefer the servers with the most free ec shard slots
	sort.Slice(allDataNodes, func(i, j int) bool {
		return countFreeShardSlots(allDataNodes[j]) < countFreeShardSlots(allDataNodes[i])
	})
	if len(allDataNodes) > erasure_coding.TotalShardsCount {
		allDataNodes = allDataNodes[:erasure_coding.TotalShardsCount]
	}

	// calculate how many shards to allocate for these servers
	allocated := balancedEcDistribution(allDataNodes)

	// ask the data nodes to copy from the source volume server
	copiedShardIds, err := parallelCopyEcShardsFromSource(ctx, commandEnv.option.GrpcDialOption, allDataNodes, allocated, volumeId, collection, existingLocations[0])
	if err != nil {
		return err
	}

	// ask the source volume server to clean up copied ec shards
	err = sourceServerDeleteEcShards(ctx, commandEnv.option.GrpcDialOption, volumeId, existingLocations[0], copiedShardIds)
	if err != nil {
		return fmt.Errorf("sourceServerDeleteEcShards %s %d.%v: %v", existingLocations[0], volumeId, copiedShardIds, err)
	}

	// ask the source volume server to delete the original volume
	for _, location := range existingLocations {
		err = deleteVolume(ctx, commandEnv.option.GrpcDialOption, volumeId, location.Url)
		if err != nil {
			return fmt.Errorf("deleteVolume %s volume %d: %v", location.Url, volumeId, err)
		}
	}

	return err
}
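// parallelCopyEcShardsFromSource copies each allocated shard range to its target
// server concurrently, one goroutine per target, and returns the shard ids that were
// actually copied.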
func parallelCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
	targetServers []*master_pb.DataNodeInfo, allocated []uint32,
	volumeId needle.VolumeId, collection string, existingLocation wdclient.Location) (actuallyCopied []uint32, err error) {

	// parallelize
	shardIdChan := make(chan []uint32, len(targetServers))
	var wg sync.WaitGroup
	var errLock sync.Mutex
	startFromShardId := uint32(0)
	for i, server := range targetServers {
		if allocated[i] <= 0 {
			continue
		}
		wg.Add(1)
		go func(server *master_pb.DataNodeInfo, startFromShardId uint32, shardCount uint32) {
			defer wg.Done()
			copiedShardIds, copyErr := oneServerCopyEcShardsFromSource(ctx, grpcDialOption, server,
				startFromShardId, shardCount, volumeId, collection, existingLocation)
			if copyErr != nil {
				// remember the last error; the mutex avoids a data race between goroutines
				errLock.Lock()
				err = copyErr
				errLock.Unlock()
			} else {
				shardIdChan <- copiedShardIds
			}
		}(server, startFromShardId, allocated[i])
		startFromShardId += allocated[i]
	}
	wg.Wait()
	close(shardIdChan)

	if err != nil {
		return nil, err
	}

	for shardIds := range shardIdChan {
		actuallyCopied = append(actuallyCopied, shardIds...)
	}

	return
}
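// oneServerCopyEcShardsFromSource copies a contiguous range of shard ids from the
// source volume server to one target server (skipping the copy when the target is the
// source itself) and then mounts those shards on the target.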
func oneServerCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
	targetServer *master_pb.DataNodeInfo, startFromShardId uint32, shardCount uint32,
	volumeId needle.VolumeId, collection string, existingLocation wdclient.Location) (copiedShardIds []uint32, err error) {

	for shardId := startFromShardId; shardId < startFromShardId+shardCount; shardId++ {
		fmt.Printf("allocate %d.%d %s => %s\n", volumeId, shardId, existingLocation.Url, targetServer.Id)
		copiedShardIds = append(copiedShardIds, shardId)
	}

	err = operation.WithVolumeServerClient(targetServer.Id, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {

		if targetServer.Id != existingLocation.Url {
			_, copyErr := volumeServerClient.VolumeEcShardsCopy(ctx, &volume_server_pb.VolumeEcShardsCopyRequest{
				VolumeId:       uint32(volumeId),
				Collection:     collection,
				ShardIds:       copiedShardIds,
				SourceDataNode: existingLocation.Url,
			})
			if copyErr != nil {
				return copyErr
			}
		}

		_, mountErr := volumeServerClient.VolumeEcShardsMount(ctx, &volume_server_pb.VolumeEcShardsMountRequest{
			VolumeId:   uint32(volumeId),
			Collection: collection,
			ShardIds:   copiedShardIds,
		})
		if mountErr != nil {
			return mountErr
		}

		return nil
	})
	if err != nil {
		return
	}

	return
}
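// sourceServerDeleteEcShards asks the source volume server to delete the ec shards
// that have been copied away; the ec index (ecx) is only deleted when all shards are
// being removed from the source.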
func sourceServerDeleteEcShards(ctx context.Context, grpcDialOption grpc.DialOption,
	volumeId needle.VolumeId, sourceLocation wdclient.Location, toBeDeletedShardIds []uint32) error {

	shouldDeleteEcx := len(toBeDeletedShardIds) == erasure_coding.TotalShardsCount

	return operation.WithVolumeServerClient(sourceLocation.Url, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
		_, deleteErr := volumeServerClient.VolumeEcShardsDelete(ctx, &volume_server_pb.VolumeEcShardsDeleteRequest{
			VolumeId:        uint32(volumeId),
			ShardIds:        toBeDeletedShardIds,
			ShouldDeleteEcx: shouldDeleteEcx,
		})
		return deleteErr
	})
}
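// balancedEcDistribution assigns the TotalShardsCount shards round-robin across the
// given servers, never allocating more shards to a server than it has free slots.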
func balancedEcDistribution(servers []*master_pb.DataNodeInfo) (allocated []uint32) {
	freeSlots := make([]uint32, len(servers))
	allocated = make([]uint32, len(servers))
	for i, server := range servers {
		freeSlots[i] = countFreeShardSlots(server)
	}
	allocatedCount := 0
	for allocatedCount < erasure_coding.TotalShardsCount {
		for i := range servers {
			if freeSlots[i]-allocated[i] > 0 {
				allocated[i] += 1
				allocatedCount += 1
			}
			if allocatedCount >= erasure_coding.TotalShardsCount {
				break
			}
		}
	}

	return allocated
}
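// eachDataNode walks the topology and invokes fn for every data node in every rack of
// every data center.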
func eachDataNode(topo *master_pb.TopologyInfo, fn func(*master_pb.DataNodeInfo)) {
	for _, dc := range topo.DataCenterInfos {
		for _, rack := range dc.RackInfos {
			for _, dn := range rack.DataNodeInfos {
				fn(dn)
			}
		}
	}
}
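// countShards sums the number of ec shards currently held by a data node, decoding the
// per-volume shard id bits reported by the master.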
func countShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) (count uint32) {
	for _, ecShardInfo := range ecShardInfos {
		shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
		count += uint32(shardBits.ShardIdCount())
	}
	return
}
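// countFreeShardSlots estimates how many more ec shards a data node can take:
// each free volume slot is counted as 10 ec shard slots, minus the ec shards the node
// already holds.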
func countFreeShardSlots(dn *master_pb.DataNodeInfo) (count uint32) {
	return uint32(dn.FreeVolumeCount)*10 - countShards(dn.EcShardInfos)
}