You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

574 lines
18 KiB

3 years ago
6 years ago
4 years ago
6 years ago
2 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
2 years ago
4 years ago
4 years ago
4 years ago
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "github.com/seaweedfs/seaweedfs/weed/pb"
  7. "github.com/seaweedfs/seaweedfs/weed/storage/needle"
  8. "github.com/seaweedfs/seaweedfs/weed/storage/types"
  9. "golang.org/x/exp/slices"
  10. "io"
  11. "path/filepath"
  12. "strconv"
  13. "time"
  14. "github.com/seaweedfs/seaweedfs/weed/operation"
  15. "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
  16. "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
  17. "github.com/seaweedfs/seaweedfs/weed/storage/super_block"
  18. )
  19. func init() {
  20. Commands = append(Commands, &commandVolumeFixReplication{})
  21. }
// commandVolumeFixReplication implements the "volume.fix.replication" shell command.
type commandVolumeFixReplication struct {
	// collectionPattern is assigned by Do from the -collectionPattern flag.
	// It is dereferenced by deleteOneVolume and fixOneUnderReplicatedVolume,
	// so those must only run after Do has parsed the flags.
	collectionPattern *string
}
  25. func (c *commandVolumeFixReplication) Name() string {
  26. return "volume.fix.replication"
  27. }
// Help returns the usage text for the command: what it does, example
// invocations, and notes about running it repeatedly.
func (c *commandVolumeFixReplication) Help() string {
	return `add or remove replicas to volumes that are missing replicas or over-replicated
This command finds all over-replicated volumes. If found, it will purge the oldest copies and stop.
This command also finds all under-replicated volumes, and finds volume servers with free slots.
If the free slots satisfy the replication requirement, the volume content is copied over and mounted.
volume.fix.replication -n # do not take action
volume.fix.replication # actually deleting or copying the volume files and mount the volume
volume.fix.replication -collectionPattern=important* # fix any collections with prefix "important"
Note:
* each time this will only add back one replica for each volume id that is under replicated.
If there are multiple replicas are missing, e.g. replica count is > 2, you may need to run this multiple times.
* do not run this too quickly within seconds, since the new volume replica may take a few seconds
to register itself to the master.
`
}
  43. func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  44. volFixReplicationCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  45. c.collectionPattern = volFixReplicationCommand.String("collectionPattern", "", "match with wildcard characters '*' and '?'")
  46. skipChange := volFixReplicationCommand.Bool("n", false, "skip the changes")
  47. noDelete := volFixReplicationCommand.Bool("noDelete", false, "Do not delete over-replicated volumes, only fix under-replication")
  48. retryCount := volFixReplicationCommand.Int("retry", 5, "how many times to retry")
  49. volumesPerStep := volFixReplicationCommand.Int("volumesPerStep", 0, "how many volumes to fix in one cycle")
  50. if err = volFixReplicationCommand.Parse(args); err != nil {
  51. return nil
  52. }
  53. if err = commandEnv.confirmIsLocked(args); err != nil {
  54. return
  55. }
  56. takeAction := !*skipChange
  57. doDeletes := !*noDelete
  58. underReplicatedVolumeIdsCount := 1
  59. for underReplicatedVolumeIdsCount > 0 {
  60. fixedVolumeReplicas := map[string]int{}
  61. // collect topology information
  62. topologyInfo, _, err := collectTopologyInfo(commandEnv, 15*time.Second)
  63. if err != nil {
  64. return err
  65. }
  66. // find all volumes that needs replication
  67. // collect all data nodes
  68. volumeReplicas, allLocations := collectVolumeReplicaLocations(topologyInfo)
  69. if len(allLocations) == 0 {
  70. return fmt.Errorf("no data nodes at all")
  71. }
  72. // find all under replicated volumes
  73. var underReplicatedVolumeIds, overReplicatedVolumeIds, misplacedVolumeIds []uint32
  74. for vid, replicas := range volumeReplicas {
  75. replica := replicas[0]
  76. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  77. switch {
  78. case replicaPlacement.GetCopyCount() > len(replicas):
  79. underReplicatedVolumeIds = append(underReplicatedVolumeIds, vid)
  80. case isMisplaced(replicas, replicaPlacement):
  81. misplacedVolumeIds = append(misplacedVolumeIds, vid)
  82. fmt.Fprintf(writer, "volume %d replication %s is not well placed %+v\n", replica.info.Id, replicaPlacement, replica)
  83. case replicaPlacement.GetCopyCount() < len(replicas):
  84. overReplicatedVolumeIds = append(overReplicatedVolumeIds, vid)
  85. fmt.Fprintf(writer, "volume %d replication %s, but over replicated %+d\n", replica.info.Id, replicaPlacement, len(replicas))
  86. }
  87. }
  88. if !commandEnv.isLocked() {
  89. return fmt.Errorf("lock is lost")
  90. }
  91. if len(overReplicatedVolumeIds) > 0 && doDeletes {
  92. if err := c.deleteOneVolume(commandEnv, writer, takeAction, overReplicatedVolumeIds, volumeReplicas, allLocations, pickOneReplicaToDelete); err != nil {
  93. return err
  94. }
  95. }
  96. if len(misplacedVolumeIds) > 0 && doDeletes {
  97. if err := c.deleteOneVolume(commandEnv, writer, takeAction, misplacedVolumeIds, volumeReplicas, allLocations, pickOneMisplacedVolume); err != nil {
  98. return err
  99. }
  100. }
  101. underReplicatedVolumeIdsCount = len(underReplicatedVolumeIds)
  102. if underReplicatedVolumeIdsCount > 0 {
  103. // find the most under populated data nodes
  104. fixedVolumeReplicas, err = c.fixUnderReplicatedVolumes(commandEnv, writer, takeAction, underReplicatedVolumeIds, volumeReplicas, allLocations, *retryCount, *volumesPerStep)
  105. if err != nil {
  106. return err
  107. }
  108. }
  109. if *skipChange {
  110. break
  111. }
  112. // check that the topology has been updated
  113. if len(fixedVolumeReplicas) > 0 {
  114. fixedVolumes := make([]string, 0, len(fixedVolumeReplicas))
  115. for k, _ := range fixedVolumeReplicas {
  116. fixedVolumes = append(fixedVolumes, k)
  117. }
  118. volumeIdLocations, err := lookupVolumeIds(commandEnv, fixedVolumes)
  119. if err != nil {
  120. return err
  121. }
  122. for _, volumeIdLocation := range volumeIdLocations {
  123. volumeId := volumeIdLocation.VolumeOrFileId
  124. volumeIdLocationCount := len(volumeIdLocation.Locations)
  125. i := 0
  126. for fixedVolumeReplicas[volumeId] >= volumeIdLocationCount {
  127. fmt.Fprintf(writer, "the number of locations for volume %s has not increased yet, let's wait\n", volumeId)
  128. time.Sleep(time.Duration(i+1) * time.Second * 7)
  129. volumeLocIds, err := lookupVolumeIds(commandEnv, []string{volumeId})
  130. if err != nil {
  131. return err
  132. }
  133. volumeIdLocationCount = len(volumeLocIds[0].Locations)
  134. if *retryCount <= i {
  135. return fmt.Errorf("replicas volume %s mismatch in topology", volumeId)
  136. }
  137. i += 1
  138. }
  139. }
  140. }
  141. }
  142. return nil
  143. }
  144. func collectVolumeReplicaLocations(topologyInfo *master_pb.TopologyInfo) (map[uint32][]*VolumeReplica, []location) {
  145. volumeReplicas := make(map[uint32][]*VolumeReplica)
  146. var allLocations []location
  147. eachDataNode(topologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
  148. loc := newLocation(dc, string(rack), dn)
  149. for _, diskInfo := range dn.DiskInfos {
  150. for _, v := range diskInfo.VolumeInfos {
  151. volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
  152. location: &loc,
  153. info: v,
  154. })
  155. }
  156. }
  157. allLocations = append(allLocations, loc)
  158. })
  159. return volumeReplicas, allLocations
  160. }
// SelectOneVolumeFunc chooses one replica out of replicas, given the volume's
// replica placement. deleteOneVolume uses it to decide which replica to delete.
type SelectOneVolumeFunc func(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica
  162. func (c *commandVolumeFixReplication) deleteOneVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, selectOneVolumeFn SelectOneVolumeFunc) error {
  163. for _, vid := range overReplicatedVolumeIds {
  164. replicas := volumeReplicas[vid]
  165. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replicas[0].info.ReplicaPlacement))
  166. replica := selectOneVolumeFn(replicas, replicaPlacement)
  167. // check collection name pattern
  168. if *c.collectionPattern != "" {
  169. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  170. if err != nil {
  171. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  172. }
  173. if !matched {
  174. break
  175. }
  176. }
  177. collectionIsMismatch := false
  178. for _, volumeReplica := range replicas {
  179. if volumeReplica.info.Collection != replica.info.Collection {
  180. fmt.Fprintf(writer, "skip delete volume %d as collection %s is mismatch: %s\n", replica.info.Id, replica.info.Collection, volumeReplica.info.Collection)
  181. collectionIsMismatch = true
  182. }
  183. }
  184. if collectionIsMismatch {
  185. continue
  186. }
  187. fmt.Fprintf(writer, "deleting volume %d from %s ...\n", replica.info.Id, replica.location.dataNode.Id)
  188. if !takeAction {
  189. break
  190. }
  191. if err := deleteVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(replica.info.Id),
  192. pb.NewServerAddressFromDataNode(replica.location.dataNode), false); err != nil {
  193. return fmt.Errorf("deleting volume %d from %s : %v", replica.info.Id, replica.location.dataNode.Id, err)
  194. }
  195. }
  196. return nil
  197. }
  198. func (c *commandVolumeFixReplication) fixUnderReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, underReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, retryCount int, volumesPerStep int) (fixedVolumes map[string]int, err error) {
  199. fixedVolumes = map[string]int{}
  200. if len(underReplicatedVolumeIds) > volumesPerStep && volumesPerStep > 0 {
  201. underReplicatedVolumeIds = underReplicatedVolumeIds[0:volumesPerStep]
  202. }
  203. for _, vid := range underReplicatedVolumeIds {
  204. for i := 0; i < retryCount+1; i++ {
  205. if err = c.fixOneUnderReplicatedVolume(commandEnv, writer, takeAction, volumeReplicas, vid, allLocations); err == nil {
  206. if takeAction {
  207. fixedVolumes[strconv.FormatUint(uint64(vid), 10)] = len(volumeReplicas[vid])
  208. }
  209. break
  210. } else {
  211. fmt.Fprintf(writer, "fixing under replicated volume %d: %v\n", vid, err)
  212. }
  213. }
  214. }
  215. return fixedVolumes, nil
  216. }
  217. func (c *commandVolumeFixReplication) fixOneUnderReplicatedVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, volumeReplicas map[uint32][]*VolumeReplica, vid uint32, allLocations []location) error {
  218. replicas := volumeReplicas[vid]
  219. replica := pickOneReplicaToCopyFrom(replicas)
  220. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  221. foundNewLocation := false
  222. hasSkippedCollection := false
  223. keepDataNodesSorted(allLocations, types.ToDiskType(replica.info.DiskType))
  224. fn := capacityByFreeVolumeCount(types.ToDiskType(replica.info.DiskType))
  225. for _, dst := range allLocations {
  226. // check whether data nodes satisfy the constraints
  227. if fn(dst.dataNode) > 0 && satisfyReplicaPlacement(replicaPlacement, replicas, dst) {
  228. // check collection name pattern
  229. if *c.collectionPattern != "" {
  230. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  231. if err != nil {
  232. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  233. }
  234. if !matched {
  235. hasSkippedCollection = true
  236. break
  237. }
  238. }
  239. // ask the volume server to replicate the volume
  240. foundNewLocation = true
  241. fmt.Fprintf(writer, "replicating volume %d %s from %s to dataNode %s ...\n", replica.info.Id, replicaPlacement, replica.location.dataNode.Id, dst.dataNode.Id)
  242. if !takeAction {
  243. // adjust free volume count
  244. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  245. break
  246. }
  247. err := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(dst.dataNode), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  248. stream, replicateErr := volumeServerClient.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
  249. VolumeId: replica.info.Id,
  250. SourceDataNode: string(pb.NewServerAddressFromDataNode(replica.location.dataNode)),
  251. })
  252. if replicateErr != nil {
  253. return fmt.Errorf("copying from %s => %s : %v", replica.location.dataNode.Id, dst.dataNode.Id, replicateErr)
  254. }
  255. for {
  256. resp, recvErr := stream.Recv()
  257. if recvErr != nil {
  258. if recvErr == io.EOF {
  259. break
  260. } else {
  261. return recvErr
  262. }
  263. }
  264. if resp.ProcessedBytes > 0 {
  265. fmt.Fprintf(writer, "volume %d processed %d bytes\n", replica.info.Id, resp.ProcessedBytes)
  266. }
  267. }
  268. return nil
  269. })
  270. if err != nil {
  271. return err
  272. }
  273. // adjust free volume count
  274. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  275. break
  276. }
  277. }
  278. if !foundNewLocation && !hasSkippedCollection {
  279. fmt.Fprintf(writer, "failed to place volume %d replica as %s, existing:%+v\n", replica.info.Id, replicaPlacement, len(replicas))
  280. }
  281. return nil
  282. }
  283. func keepDataNodesSorted(dataNodes []location, diskType types.DiskType) {
  284. fn := capacityByFreeVolumeCount(diskType)
  285. slices.SortFunc(dataNodes, func(a, b location) int {
  286. return int(fn(b.dataNode) - fn(a.dataNode))
  287. })
  288. }
  289. /*
  290. if on an existing data node {
  291. return false
  292. }
  293. if different from existing dcs {
  294. if lack on different dcs {
  295. return true
  296. }else{
  297. return false
  298. }
  299. }
  300. if not on primary dc {
  301. return false
  302. }
  303. if different from existing racks {
  304. if lack on different racks {
  305. return true
  306. }else{
  307. return false
  308. }
  309. }
  310. if not on primary rack {
  311. return false
  312. }
  313. if lacks on same rack {
  314. return true
  315. } else {
  316. return false
  317. }
  318. */
  319. func satisfyReplicaPlacement(replicaPlacement *super_block.ReplicaPlacement, replicas []*VolumeReplica, possibleLocation location) bool {
  320. existingDataCenters, _, existingDataNodes := countReplicas(replicas)
  321. if _, found := existingDataNodes[possibleLocation.String()]; found {
  322. // avoid duplicated volume on the same data node
  323. return false
  324. }
  325. primaryDataCenters, _ := findTopKeys(existingDataCenters)
  326. // ensure data center count is within limit
  327. if _, found := existingDataCenters[possibleLocation.DataCenter()]; !found {
  328. // different from existing dcs
  329. if len(existingDataCenters) < replicaPlacement.DiffDataCenterCount+1 {
  330. // lack on different dcs
  331. return true
  332. } else {
  333. // adding this would go over the different dcs limit
  334. return false
  335. }
  336. }
  337. // now this is same as one of the existing data center
  338. if !isAmong(possibleLocation.DataCenter(), primaryDataCenters) {
  339. // not on one of the primary dcs
  340. return false
  341. }
  342. // now this is one of the primary dcs
  343. primaryDcRacks := make(map[string]int)
  344. for _, replica := range replicas {
  345. if replica.location.DataCenter() != possibleLocation.DataCenter() {
  346. continue
  347. }
  348. primaryDcRacks[replica.location.Rack()] += 1
  349. }
  350. primaryRacks, _ := findTopKeys(primaryDcRacks)
  351. sameRackCount := primaryDcRacks[possibleLocation.Rack()]
  352. // ensure rack count is within limit
  353. if _, found := primaryDcRacks[possibleLocation.Rack()]; !found {
  354. // different from existing racks
  355. if len(primaryDcRacks) < replicaPlacement.DiffRackCount+1 {
  356. // lack on different racks
  357. return true
  358. } else {
  359. // adding this would go over the different racks limit
  360. return false
  361. }
  362. }
  363. // now this is same as one of the existing racks
  364. if !isAmong(possibleLocation.Rack(), primaryRacks) {
  365. // not on the primary rack
  366. return false
  367. }
  368. // now this is on the primary rack
  369. // different from existing data nodes
  370. if sameRackCount < replicaPlacement.SameRackCount+1 {
  371. // lack on same rack
  372. return true
  373. } else {
  374. // adding this would go over the same data node limit
  375. return false
  376. }
  377. }
  378. func findTopKeys(m map[string]int) (topKeys []string, max int) {
  379. for k, c := range m {
  380. if max < c {
  381. topKeys = topKeys[:0]
  382. topKeys = append(topKeys, k)
  383. max = c
  384. } else if max == c {
  385. topKeys = append(topKeys, k)
  386. }
  387. }
  388. return
  389. }
  390. func isAmong(key string, keys []string) bool {
  391. for _, k := range keys {
  392. if k == key {
  393. return true
  394. }
  395. }
  396. return false
  397. }
  398. type VolumeReplica struct {
  399. location *location
  400. info *master_pb.VolumeInformationMessage
  401. }
  402. type location struct {
  403. dc string
  404. rack string
  405. dataNode *master_pb.DataNodeInfo
  406. }
  407. func newLocation(dc, rack string, dataNode *master_pb.DataNodeInfo) location {
  408. return location{
  409. dc: dc,
  410. rack: rack,
  411. dataNode: dataNode,
  412. }
  413. }
  414. func (l location) String() string {
  415. return fmt.Sprintf("%s %s %s", l.dc, l.rack, l.dataNode.Id)
  416. }
  417. func (l location) Rack() string {
  418. return fmt.Sprintf("%s %s", l.dc, l.rack)
  419. }
  420. func (l location) DataCenter() string {
  421. return l.dc
  422. }
  423. func pickOneReplicaToCopyFrom(replicas []*VolumeReplica) *VolumeReplica {
  424. mostRecent := replicas[0]
  425. for _, replica := range replicas {
  426. if replica.info.ModifiedAtSecond > mostRecent.info.ModifiedAtSecond {
  427. mostRecent = replica
  428. }
  429. }
  430. return mostRecent
  431. }
  432. func countReplicas(replicas []*VolumeReplica) (diffDc, diffRack, diffNode map[string]int) {
  433. diffDc = make(map[string]int)
  434. diffRack = make(map[string]int)
  435. diffNode = make(map[string]int)
  436. for _, replica := range replicas {
  437. diffDc[replica.location.DataCenter()] += 1
  438. diffRack[replica.location.Rack()] += 1
  439. diffNode[replica.location.String()] += 1
  440. }
  441. return
  442. }
  443. func pickOneReplicaToDelete(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica {
  444. slices.SortFunc(replicas, func(a, b *VolumeReplica) int {
  445. if a.info.Size != b.info.Size {
  446. return int(a.info.Size - b.info.Size)
  447. }
  448. if a.info.ModifiedAtSecond != b.info.ModifiedAtSecond {
  449. return int(a.info.ModifiedAtSecond - b.info.ModifiedAtSecond)
  450. }
  451. if a.info.CompactRevision != b.info.CompactRevision {
  452. return int(a.info.CompactRevision - b.info.CompactRevision)
  453. }
  454. return 0
  455. })
  456. return replicas[0]
  457. }
  458. // check and fix misplaced volumes
  459. func isMisplaced(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) bool {
  460. for i := 0; i < len(replicas); i++ {
  461. others := otherThan(replicas, i)
  462. if satisfyReplicaPlacement(replicaPlacement, others, *replicas[i].location) {
  463. return false
  464. }
  465. }
  466. return true
  467. }
  468. func otherThan(replicas []*VolumeReplica, index int) (others []*VolumeReplica) {
  469. for i := 0; i < len(replicas); i++ {
  470. if index != i {
  471. others = append(others, replicas[i])
  472. }
  473. }
  474. return
  475. }
  476. func pickOneMisplacedVolume(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) (toDelete *VolumeReplica) {
  477. var deletionCandidates []*VolumeReplica
  478. for i := 0; i < len(replicas); i++ {
  479. others := otherThan(replicas, i)
  480. if !isMisplaced(others, replicaPlacement) {
  481. deletionCandidates = append(deletionCandidates, replicas[i])
  482. }
  483. }
  484. if len(deletionCandidates) > 0 {
  485. return pickOneReplicaToDelete(deletionCandidates, replicaPlacement)
  486. }
  487. return pickOneReplicaToDelete(replicas, replicaPlacement)
  488. }