You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

497 lines
16 KiB

3 years ago
6 years ago
4 years ago
6 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "github.com/chrislusf/seaweedfs/weed/pb"
  7. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  8. "github.com/chrislusf/seaweedfs/weed/storage/types"
  9. "io"
  10. "path/filepath"
  11. "sort"
  12. "strconv"
  13. "time"
  14. "github.com/chrislusf/seaweedfs/weed/operation"
  15. "github.com/chrislusf/seaweedfs/weed/pb/master_pb"
  16. "github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
  17. "github.com/chrislusf/seaweedfs/weed/storage/super_block"
  18. )
  19. func init() {
  20. Commands = append(Commands, &commandVolumeFixReplication{})
  21. }
  22. type commandVolumeFixReplication struct {
  23. collectionPattern *string
  24. }
  25. func (c *commandVolumeFixReplication) Name() string {
  26. return "volume.fix.replication"
  27. }
  28. func (c *commandVolumeFixReplication) Help() string {
  29. return `add or remove replicas to volumes that are missing replicas or over-replicated
  30. This command finds all over-replicated volumes. If found, it will purge the oldest copies and stop.
  31. This command also finds all under-replicated volumes, and finds volume servers with free slots.
  32. If the free slots satisfy the replication requirement, the volume content is copied over and mounted.
  33. volume.fix.replication -n # do not take action
  34. volume.fix.replication # actually deleting or copying the volume files and mount the volume
  35. volume.fix.replication -collectionPattern=important* # fix any collections with prefix "important"
  36. Note:
  37. * each time this will only add back one replica for each volume id that is under replicated.
  38. If there are multiple replicas are missing, e.g. replica count is > 2, you may need to run this multiple times.
  39. * do not run this too quickly within seconds, since the new volume replica may take a few seconds
  40. to register itself to the master.
  41. `
  42. }
  43. func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  44. volFixReplicationCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  45. c.collectionPattern = volFixReplicationCommand.String("collectionPattern", "", "match with wildcard characters '*' and '?'")
  46. skipChange := volFixReplicationCommand.Bool("n", false, "skip the changes")
  47. retryCount := volFixReplicationCommand.Int("retry", 0, "how many times to retry")
  48. volumesPerStep := volFixReplicationCommand.Int("volumesPerStep", 0, "how many volumes to fix in one cycle")
  49. if err = volFixReplicationCommand.Parse(args); err != nil {
  50. return nil
  51. }
  52. if err = commandEnv.confirmIsLocked(); err != nil {
  53. return
  54. }
  55. takeAction := !*skipChange
  56. underReplicatedVolumeIdsCount := 1
  57. for underReplicatedVolumeIdsCount > 0 {
  58. fixedVolumeReplicas := map[string]int{}
  59. // collect topology information
  60. topologyInfo, _, err := collectTopologyInfo(commandEnv)
  61. if err != nil {
  62. return err
  63. }
  64. // find all volumes that needs replication
  65. // collect all data nodes
  66. volumeReplicas, allLocations := collectVolumeReplicaLocations(topologyInfo)
  67. if len(allLocations) == 0 {
  68. return fmt.Errorf("no data nodes at all")
  69. }
  70. // find all under replicated volumes
  71. var underReplicatedVolumeIds, overReplicatedVolumeIds []uint32
  72. for vid, replicas := range volumeReplicas {
  73. replica := replicas[0]
  74. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  75. if replicaPlacement.GetCopyCount() > len(replicas) {
  76. underReplicatedVolumeIds = append(underReplicatedVolumeIds, vid)
  77. } else if replicaPlacement.GetCopyCount() < len(replicas) {
  78. overReplicatedVolumeIds = append(overReplicatedVolumeIds, vid)
  79. fmt.Fprintf(writer, "volume %d replication %s, but over replicated %+d\n", replica.info.Id, replicaPlacement, len(replicas))
  80. }
  81. }
  82. if len(overReplicatedVolumeIds) > 0 {
  83. if err := c.fixOverReplicatedVolumes(commandEnv, writer, takeAction, overReplicatedVolumeIds, volumeReplicas, allLocations); err != nil {
  84. return err
  85. }
  86. }
  87. underReplicatedVolumeIdsCount = len(underReplicatedVolumeIds)
  88. if underReplicatedVolumeIdsCount > 0 {
  89. // find the most under populated data nodes
  90. fixedVolumeReplicas, err = c.fixUnderReplicatedVolumes(commandEnv, writer, takeAction, underReplicatedVolumeIds, volumeReplicas, allLocations, *retryCount, *volumesPerStep)
  91. if err != nil {
  92. return err
  93. }
  94. }
  95. if *skipChange {
  96. break
  97. }
  98. // check that the topology has been updated
  99. if len(fixedVolumeReplicas) > 0 {
  100. fixedVolumes := make([]string, 0, len(fixedVolumeReplicas))
  101. for k, _ := range fixedVolumeReplicas {
  102. fixedVolumes = append(fixedVolumes, k)
  103. }
  104. volumeIdLocations, err := lookupVolumeIds(commandEnv, fixedVolumes)
  105. if err != nil {
  106. return err
  107. }
  108. for _, volumeIdLocation := range volumeIdLocations {
  109. volumeId := volumeIdLocation.VolumeOrFileId
  110. volumeIdLocationCount := len(volumeIdLocation.Locations)
  111. i := 0
  112. for fixedVolumeReplicas[volumeId] >= volumeIdLocationCount {
  113. fmt.Fprintf(writer, "the number of locations for volume %s has not increased yet, let's wait\n", volumeId)
  114. time.Sleep(time.Duration(i+1) * time.Second * 7)
  115. volumeLocIds, err := lookupVolumeIds(commandEnv, []string{volumeId})
  116. if err != nil {
  117. return err
  118. }
  119. volumeIdLocationCount = len(volumeLocIds[0].Locations)
  120. if *retryCount > i {
  121. return fmt.Errorf("replicas volume %s mismatch in topology", volumeId)
  122. }
  123. i += 1
  124. }
  125. }
  126. }
  127. }
  128. return nil
  129. }
  130. func collectVolumeReplicaLocations(topologyInfo *master_pb.TopologyInfo) (map[uint32][]*VolumeReplica, []location) {
  131. volumeReplicas := make(map[uint32][]*VolumeReplica)
  132. var allLocations []location
  133. eachDataNode(topologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
  134. loc := newLocation(dc, string(rack), dn)
  135. for _, diskInfo := range dn.DiskInfos {
  136. for _, v := range diskInfo.VolumeInfos {
  137. volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
  138. location: &loc,
  139. info: v,
  140. })
  141. }
  142. }
  143. allLocations = append(allLocations, loc)
  144. })
  145. return volumeReplicas, allLocations
  146. }
  147. func (c *commandVolumeFixReplication) fixOverReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location) error {
  148. for _, vid := range overReplicatedVolumeIds {
  149. replicas := volumeReplicas[vid]
  150. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replicas[0].info.ReplicaPlacement))
  151. replica := pickOneReplicaToDelete(replicas, replicaPlacement)
  152. // check collection name pattern
  153. if *c.collectionPattern != "" {
  154. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  155. if err != nil {
  156. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  157. }
  158. if !matched {
  159. break
  160. }
  161. }
  162. fmt.Fprintf(writer, "deleting volume %d from %s ...\n", replica.info.Id, replica.location.dataNode.Id)
  163. if !takeAction {
  164. break
  165. }
  166. if err := deleteVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(replica.info.Id), pb.NewServerAddressFromDataNode(replica.location.dataNode)); err != nil {
  167. return fmt.Errorf("deleting volume %d from %s : %v", replica.info.Id, replica.location.dataNode.Id, err)
  168. }
  169. }
  170. return nil
  171. }
  172. func (c *commandVolumeFixReplication) fixUnderReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, underReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, retryCount int, volumesPerStep int) (fixedVolumes map[string]int, err error) {
  173. fixedVolumes = map[string]int{}
  174. if len(underReplicatedVolumeIds) > volumesPerStep && volumesPerStep > 0 {
  175. underReplicatedVolumeIds = underReplicatedVolumeIds[0:volumesPerStep]
  176. }
  177. for _, vid := range underReplicatedVolumeIds {
  178. for i := 0; i < retryCount+1; i++ {
  179. if err = c.fixOneUnderReplicatedVolume(commandEnv, writer, takeAction, volumeReplicas, vid, allLocations); err == nil {
  180. if takeAction {
  181. fixedVolumes[strconv.FormatUint(uint64(vid), 10)] = len(volumeReplicas[vid])
  182. }
  183. break
  184. }
  185. }
  186. }
  187. return fixedVolumes, nil
  188. }
  189. func (c *commandVolumeFixReplication) fixOneUnderReplicatedVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, volumeReplicas map[uint32][]*VolumeReplica, vid uint32, allLocations []location) error {
  190. replicas := volumeReplicas[vid]
  191. replica := pickOneReplicaToCopyFrom(replicas)
  192. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  193. foundNewLocation := false
  194. hasSkippedCollection := false
  195. keepDataNodesSorted(allLocations, types.ToDiskType(replica.info.DiskType))
  196. fn := capacityByFreeVolumeCount(types.ToDiskType(replica.info.DiskType))
  197. for _, dst := range allLocations {
  198. // check whether data nodes satisfy the constraints
  199. if fn(dst.dataNode) > 0 && satisfyReplicaPlacement(replicaPlacement, replicas, dst) {
  200. // check collection name pattern
  201. if *c.collectionPattern != "" {
  202. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  203. if err != nil {
  204. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  205. }
  206. if !matched {
  207. hasSkippedCollection = true
  208. break
  209. }
  210. }
  211. // ask the volume server to replicate the volume
  212. foundNewLocation = true
  213. fmt.Fprintf(writer, "replicating volume %d %s from %s to dataNode %s ...\n", replica.info.Id, replicaPlacement, replica.location.dataNode.Id, dst.dataNode.Id)
  214. if !takeAction {
  215. // adjust free volume count
  216. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  217. break
  218. }
  219. err := operation.WithVolumeServerClient(pb.NewServerAddressFromDataNode(dst.dataNode), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  220. stream, replicateErr := volumeServerClient.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
  221. VolumeId: replica.info.Id,
  222. SourceDataNode: string(pb.NewServerAddressFromDataNode(replica.location.dataNode)),
  223. })
  224. if replicateErr != nil {
  225. return fmt.Errorf("copying from %s => %s : %v", replica.location.dataNode.Id, dst.dataNode.Id, replicateErr)
  226. }
  227. for {
  228. resp, recvErr := stream.Recv()
  229. if recvErr != nil {
  230. if recvErr == io.EOF {
  231. break
  232. } else {
  233. return recvErr
  234. }
  235. }
  236. if resp.ProcessedBytes > 0 {
  237. fmt.Fprintf(writer, "volume %d processed %d bytes\n", replica.info.Id, resp.ProcessedBytes)
  238. }
  239. }
  240. return nil
  241. })
  242. if err != nil {
  243. return err
  244. }
  245. // adjust free volume count
  246. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  247. break
  248. }
  249. }
  250. if !foundNewLocation && !hasSkippedCollection {
  251. fmt.Fprintf(writer, "failed to place volume %d replica as %s, existing:%+v\n", replica.info.Id, replicaPlacement, len(replicas))
  252. }
  253. return nil
  254. }
  255. func keepDataNodesSorted(dataNodes []location, diskType types.DiskType) {
  256. fn := capacityByFreeVolumeCount(diskType)
  257. sort.Slice(dataNodes, func(i, j int) bool {
  258. return fn(dataNodes[i].dataNode) > fn(dataNodes[j].dataNode)
  259. })
  260. }
  261. /*
  262. if on an existing data node {
  263. return false
  264. }
  265. if different from existing dcs {
  266. if lack on different dcs {
  267. return true
  268. }else{
  269. return false
  270. }
  271. }
  272. if not on primary dc {
  273. return false
  274. }
  275. if different from existing racks {
  276. if lack on different racks {
  277. return true
  278. }else{
  279. return false
  280. }
  281. }
  282. if not on primary rack {
  283. return false
  284. }
  285. if lacks on same rack {
  286. return true
  287. } else {
  288. return false
  289. }
  290. */
  291. func satisfyReplicaPlacement(replicaPlacement *super_block.ReplicaPlacement, replicas []*VolumeReplica, possibleLocation location) bool {
  292. existingDataCenters, _, existingDataNodes := countReplicas(replicas)
  293. if _, found := existingDataNodes[possibleLocation.String()]; found {
  294. // avoid duplicated volume on the same data node
  295. return false
  296. }
  297. primaryDataCenters, _ := findTopKeys(existingDataCenters)
  298. // ensure data center count is within limit
  299. if _, found := existingDataCenters[possibleLocation.DataCenter()]; !found {
  300. // different from existing dcs
  301. if len(existingDataCenters) < replicaPlacement.DiffDataCenterCount+1 {
  302. // lack on different dcs
  303. return true
  304. } else {
  305. // adding this would go over the different dcs limit
  306. return false
  307. }
  308. }
  309. // now this is same as one of the existing data center
  310. if !isAmong(possibleLocation.DataCenter(), primaryDataCenters) {
  311. // not on one of the primary dcs
  312. return false
  313. }
  314. // now this is one of the primary dcs
  315. primaryDcRacks := make(map[string]int)
  316. for _, replica := range replicas {
  317. if replica.location.DataCenter() != possibleLocation.DataCenter() {
  318. continue
  319. }
  320. primaryDcRacks[replica.location.Rack()] += 1
  321. }
  322. primaryRacks, _ := findTopKeys(primaryDcRacks)
  323. sameRackCount := primaryDcRacks[possibleLocation.Rack()]
  324. // ensure rack count is within limit
  325. if _, found := primaryDcRacks[possibleLocation.Rack()]; !found {
  326. // different from existing racks
  327. if len(primaryDcRacks) < replicaPlacement.DiffRackCount+1 {
  328. // lack on different racks
  329. return true
  330. } else {
  331. // adding this would go over the different racks limit
  332. return false
  333. }
  334. }
  335. // now this is same as one of the existing racks
  336. if !isAmong(possibleLocation.Rack(), primaryRacks) {
  337. // not on the primary rack
  338. return false
  339. }
  340. // now this is on the primary rack
  341. // different from existing data nodes
  342. if sameRackCount < replicaPlacement.SameRackCount+1 {
  343. // lack on same rack
  344. return true
  345. } else {
  346. // adding this would go over the same data node limit
  347. return false
  348. }
  349. }
  350. func findTopKeys(m map[string]int) (topKeys []string, max int) {
  351. for k, c := range m {
  352. if max < c {
  353. topKeys = topKeys[:0]
  354. topKeys = append(topKeys, k)
  355. max = c
  356. } else if max == c {
  357. topKeys = append(topKeys, k)
  358. }
  359. }
  360. return
  361. }
  362. func isAmong(key string, keys []string) bool {
  363. for _, k := range keys {
  364. if k == key {
  365. return true
  366. }
  367. }
  368. return false
  369. }
  370. type VolumeReplica struct {
  371. location *location
  372. info *master_pb.VolumeInformationMessage
  373. }
  374. type location struct {
  375. dc string
  376. rack string
  377. dataNode *master_pb.DataNodeInfo
  378. }
  379. func newLocation(dc, rack string, dataNode *master_pb.DataNodeInfo) location {
  380. return location{
  381. dc: dc,
  382. rack: rack,
  383. dataNode: dataNode,
  384. }
  385. }
  386. func (l location) String() string {
  387. return fmt.Sprintf("%s %s %s", l.dc, l.rack, l.dataNode.Id)
  388. }
  389. func (l location) Rack() string {
  390. return fmt.Sprintf("%s %s", l.dc, l.rack)
  391. }
  392. func (l location) DataCenter() string {
  393. return l.dc
  394. }
  395. func pickOneReplicaToCopyFrom(replicas []*VolumeReplica) *VolumeReplica {
  396. mostRecent := replicas[0]
  397. for _, replica := range replicas {
  398. if replica.info.ModifiedAtSecond > mostRecent.info.ModifiedAtSecond {
  399. mostRecent = replica
  400. }
  401. }
  402. return mostRecent
  403. }
  404. func countReplicas(replicas []*VolumeReplica) (diffDc, diffRack, diffNode map[string]int) {
  405. diffDc = make(map[string]int)
  406. diffRack = make(map[string]int)
  407. diffNode = make(map[string]int)
  408. for _, replica := range replicas {
  409. diffDc[replica.location.DataCenter()] += 1
  410. diffRack[replica.location.Rack()] += 1
  411. diffNode[replica.location.String()] += 1
  412. }
  413. return
  414. }
  415. func pickOneReplicaToDelete(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica {
  416. sort.Slice(replicas, func(i, j int) bool {
  417. a, b := replicas[i], replicas[j]
  418. if a.info.Size != b.info.Size {
  419. return a.info.Size < b.info.Size
  420. }
  421. if a.info.ModifiedAtSecond != b.info.ModifiedAtSecond {
  422. return a.info.ModifiedAtSecond < b.info.ModifiedAtSecond
  423. }
  424. if a.info.CompactRevision != b.info.CompactRevision {
  425. return a.info.CompactRevision < b.info.CompactRevision
  426. }
  427. return false
  428. })
  429. return replicas[0]
  430. }