558 lines
18 KiB

3 years ago
6 years ago
4 years ago
6 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "github.com/chrislusf/seaweedfs/weed/pb"
  7. "github.com/chrislusf/seaweedfs/weed/storage/needle"
  8. "github.com/chrislusf/seaweedfs/weed/storage/types"
  9. "golang.org/x/exp/slices"
  10. "io"
  11. "path/filepath"
  12. "strconv"
  13. "time"
  14. "github.com/chrislusf/seaweedfs/weed/operation"
  15. "github.com/chrislusf/seaweedfs/weed/pb/master_pb"
  16. "github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
  17. "github.com/chrislusf/seaweedfs/weed/storage/super_block"
  18. )
// init registers the volume.fix.replication command in the shared Commands list.
func init() {
	Commands = append(Commands, &commandVolumeFixReplication{})
}
// commandVolumeFixReplication implements the "volume.fix.replication" shell command.
type commandVolumeFixReplication struct {
	// collectionPattern is the -collectionPattern flag value; it is set by Do
	// and matched against collection names with filepath.Match.
	collectionPattern *string
}
// Name returns the command name, which Do also uses as the flag set name.
func (c *commandVolumeFixReplication) Name() string {
	return "volume.fix.replication"
}
// Help returns the usage text of the command.
func (c *commandVolumeFixReplication) Help() string {
	return `add or remove replicas to volumes that are missing replicas or over-replicated
This command finds all over-replicated volumes. If found, it will purge the oldest copies and stop.
This command also finds all under-replicated volumes, and finds volume servers with free slots.
If the free slots satisfy the replication requirement, the volume content is copied over and mounted.
volume.fix.replication -n # do not take action
volume.fix.replication # actually deleting or copying the volume files and mount the volume
volume.fix.replication -collectionPattern=important* # fix any collections with prefix "important"
Note:
* each time this will only add back one replica for each volume id that is under replicated.
If there are multiple replicas are missing, e.g. replica count is > 2, you may need to run this multiple times.
* do not run this too quickly within seconds, since the new volume replica may take a few seconds
to register itself to the master.
`
}
  43. func (c *commandVolumeFixReplication) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  44. volFixReplicationCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  45. c.collectionPattern = volFixReplicationCommand.String("collectionPattern", "", "match with wildcard characters '*' and '?'")
  46. skipChange := volFixReplicationCommand.Bool("n", false, "skip the changes")
  47. retryCount := volFixReplicationCommand.Int("retry", 0, "how many times to retry")
  48. volumesPerStep := volFixReplicationCommand.Int("volumesPerStep", 0, "how many volumes to fix in one cycle")
  49. if err = volFixReplicationCommand.Parse(args); err != nil {
  50. return nil
  51. }
  52. if err = commandEnv.confirmIsLocked(args); err != nil {
  53. return
  54. }
  55. takeAction := !*skipChange
  56. underReplicatedVolumeIdsCount := 1
  57. for underReplicatedVolumeIdsCount > 0 {
  58. fixedVolumeReplicas := map[string]int{}
  59. // collect topology information
  60. topologyInfo, _, err := collectTopologyInfo(commandEnv, 15*time.Second)
  61. if err != nil {
  62. return err
  63. }
  64. // find all volumes that needs replication
  65. // collect all data nodes
  66. volumeReplicas, allLocations := collectVolumeReplicaLocations(topologyInfo)
  67. if len(allLocations) == 0 {
  68. return fmt.Errorf("no data nodes at all")
  69. }
  70. // find all under replicated volumes
  71. var underReplicatedVolumeIds, overReplicatedVolumeIds, misplacedVolumeIds []uint32
  72. for vid, replicas := range volumeReplicas {
  73. replica := replicas[0]
  74. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  75. if replicaPlacement.GetCopyCount() > len(replicas) {
  76. underReplicatedVolumeIds = append(underReplicatedVolumeIds, vid)
  77. } else if replicaPlacement.GetCopyCount() < len(replicas) {
  78. overReplicatedVolumeIds = append(overReplicatedVolumeIds, vid)
  79. fmt.Fprintf(writer, "volume %d replication %s, but over replicated %+d\n", replica.info.Id, replicaPlacement, len(replicas))
  80. } else if isMisplaced(replicas, replicaPlacement) {
  81. misplacedVolumeIds = append(misplacedVolumeIds, vid)
  82. fmt.Fprintf(writer, "volume %d replication %s is not well placed %+v\n", replica.info.Id, replicaPlacement, replicas)
  83. }
  84. }
  85. if len(overReplicatedVolumeIds) > 0 {
  86. if err := c.deleteOneVolume(commandEnv, writer, takeAction, overReplicatedVolumeIds, volumeReplicas, allLocations, pickOneReplicaToDelete); err != nil {
  87. return err
  88. }
  89. }
  90. if len(misplacedVolumeIds) > 0 {
  91. if err := c.deleteOneVolume(commandEnv, writer, takeAction, misplacedVolumeIds, volumeReplicas, allLocations, pickOneMisplacedVolume); err != nil {
  92. return err
  93. }
  94. }
  95. underReplicatedVolumeIdsCount = len(underReplicatedVolumeIds)
  96. if underReplicatedVolumeIdsCount > 0 {
  97. // find the most under populated data nodes
  98. fixedVolumeReplicas, err = c.fixUnderReplicatedVolumes(commandEnv, writer, takeAction, underReplicatedVolumeIds, volumeReplicas, allLocations, *retryCount, *volumesPerStep)
  99. if err != nil {
  100. return err
  101. }
  102. }
  103. if *skipChange {
  104. break
  105. }
  106. // check that the topology has been updated
  107. if len(fixedVolumeReplicas) > 0 {
  108. fixedVolumes := make([]string, 0, len(fixedVolumeReplicas))
  109. for k, _ := range fixedVolumeReplicas {
  110. fixedVolumes = append(fixedVolumes, k)
  111. }
  112. volumeIdLocations, err := lookupVolumeIds(commandEnv, fixedVolumes)
  113. if err != nil {
  114. return err
  115. }
  116. for _, volumeIdLocation := range volumeIdLocations {
  117. volumeId := volumeIdLocation.VolumeOrFileId
  118. volumeIdLocationCount := len(volumeIdLocation.Locations)
  119. i := 0
  120. for fixedVolumeReplicas[volumeId] >= volumeIdLocationCount {
  121. fmt.Fprintf(writer, "the number of locations for volume %s has not increased yet, let's wait\n", volumeId)
  122. time.Sleep(time.Duration(i+1) * time.Second * 7)
  123. volumeLocIds, err := lookupVolumeIds(commandEnv, []string{volumeId})
  124. if err != nil {
  125. return err
  126. }
  127. volumeIdLocationCount = len(volumeLocIds[0].Locations)
  128. if *retryCount <= i {
  129. return fmt.Errorf("replicas volume %s mismatch in topology", volumeId)
  130. }
  131. i += 1
  132. }
  133. }
  134. }
  135. }
  136. return nil
  137. }
  138. func collectVolumeReplicaLocations(topologyInfo *master_pb.TopologyInfo) (map[uint32][]*VolumeReplica, []location) {
  139. volumeReplicas := make(map[uint32][]*VolumeReplica)
  140. var allLocations []location
  141. eachDataNode(topologyInfo, func(dc string, rack RackId, dn *master_pb.DataNodeInfo) {
  142. loc := newLocation(dc, string(rack), dn)
  143. for _, diskInfo := range dn.DiskInfos {
  144. for _, v := range diskInfo.VolumeInfos {
  145. volumeReplicas[v.Id] = append(volumeReplicas[v.Id], &VolumeReplica{
  146. location: &loc,
  147. info: v,
  148. })
  149. }
  150. }
  151. allLocations = append(allLocations, loc)
  152. })
  153. return volumeReplicas, allLocations
  154. }
// SelectOneVolumeFunc chooses one replica out of replicas for the given
// replica placement; deleteOneVolume uses it to pick the replica to delete.
type SelectOneVolumeFunc func(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica
  156. func (c *commandVolumeFixReplication) deleteOneVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, overReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, selectOneVolumeFn SelectOneVolumeFunc) error {
  157. for _, vid := range overReplicatedVolumeIds {
  158. replicas := volumeReplicas[vid]
  159. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replicas[0].info.ReplicaPlacement))
  160. replica := selectOneVolumeFn(replicas, replicaPlacement)
  161. // check collection name pattern
  162. if *c.collectionPattern != "" {
  163. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  164. if err != nil {
  165. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  166. }
  167. if !matched {
  168. break
  169. }
  170. }
  171. collectionIsMismatch := false
  172. for _, volumeReplica := range replicas {
  173. if volumeReplica.info.Collection != replica.info.Collection {
  174. fmt.Fprintf(writer, "skip delete volume %d as collection %s is mismatch: %s\n", replica.info.Id, replica.info.Collection, volumeReplica.info.Collection)
  175. collectionIsMismatch = true
  176. }
  177. }
  178. if collectionIsMismatch {
  179. continue
  180. }
  181. fmt.Fprintf(writer, "deleting volume %d from %s ...\n", replica.info.Id, replica.location.dataNode.Id)
  182. if !takeAction {
  183. break
  184. }
  185. if err := deleteVolume(commandEnv.option.GrpcDialOption, needle.VolumeId(replica.info.Id), pb.NewServerAddressFromDataNode(replica.location.dataNode)); err != nil {
  186. return fmt.Errorf("deleting volume %d from %s : %v", replica.info.Id, replica.location.dataNode.Id, err)
  187. }
  188. }
  189. return nil
  190. }
  191. func (c *commandVolumeFixReplication) fixUnderReplicatedVolumes(commandEnv *CommandEnv, writer io.Writer, takeAction bool, underReplicatedVolumeIds []uint32, volumeReplicas map[uint32][]*VolumeReplica, allLocations []location, retryCount int, volumesPerStep int) (fixedVolumes map[string]int, err error) {
  192. fixedVolumes = map[string]int{}
  193. if len(underReplicatedVolumeIds) > volumesPerStep && volumesPerStep > 0 {
  194. underReplicatedVolumeIds = underReplicatedVolumeIds[0:volumesPerStep]
  195. }
  196. for _, vid := range underReplicatedVolumeIds {
  197. for i := 0; i < retryCount+1; i++ {
  198. if err = c.fixOneUnderReplicatedVolume(commandEnv, writer, takeAction, volumeReplicas, vid, allLocations); err == nil {
  199. if takeAction {
  200. fixedVolumes[strconv.FormatUint(uint64(vid), 10)] = len(volumeReplicas[vid])
  201. }
  202. break
  203. }
  204. }
  205. }
  206. return fixedVolumes, nil
  207. }
  208. func (c *commandVolumeFixReplication) fixOneUnderReplicatedVolume(commandEnv *CommandEnv, writer io.Writer, takeAction bool, volumeReplicas map[uint32][]*VolumeReplica, vid uint32, allLocations []location) error {
  209. replicas := volumeReplicas[vid]
  210. replica := pickOneReplicaToCopyFrom(replicas)
  211. replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(replica.info.ReplicaPlacement))
  212. foundNewLocation := false
  213. hasSkippedCollection := false
  214. keepDataNodesSorted(allLocations, types.ToDiskType(replica.info.DiskType))
  215. fn := capacityByFreeVolumeCount(types.ToDiskType(replica.info.DiskType))
  216. for _, dst := range allLocations {
  217. // check whether data nodes satisfy the constraints
  218. if fn(dst.dataNode) > 0 && satisfyReplicaPlacement(replicaPlacement, replicas, dst) {
  219. // check collection name pattern
  220. if *c.collectionPattern != "" {
  221. matched, err := filepath.Match(*c.collectionPattern, replica.info.Collection)
  222. if err != nil {
  223. return fmt.Errorf("match pattern %s with collection %s: %v", *c.collectionPattern, replica.info.Collection, err)
  224. }
  225. if !matched {
  226. hasSkippedCollection = true
  227. break
  228. }
  229. }
  230. // ask the volume server to replicate the volume
  231. foundNewLocation = true
  232. fmt.Fprintf(writer, "replicating volume %d %s from %s to dataNode %s ...\n", replica.info.Id, replicaPlacement, replica.location.dataNode.Id, dst.dataNode.Id)
  233. if !takeAction {
  234. // adjust free volume count
  235. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  236. break
  237. }
  238. err := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(dst.dataNode), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  239. stream, replicateErr := volumeServerClient.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
  240. VolumeId: replica.info.Id,
  241. SourceDataNode: string(pb.NewServerAddressFromDataNode(replica.location.dataNode)),
  242. })
  243. if replicateErr != nil {
  244. return fmt.Errorf("copying from %s => %s : %v", replica.location.dataNode.Id, dst.dataNode.Id, replicateErr)
  245. }
  246. for {
  247. resp, recvErr := stream.Recv()
  248. if recvErr != nil {
  249. if recvErr == io.EOF {
  250. break
  251. } else {
  252. return recvErr
  253. }
  254. }
  255. if resp.ProcessedBytes > 0 {
  256. fmt.Fprintf(writer, "volume %d processed %d bytes\n", replica.info.Id, resp.ProcessedBytes)
  257. }
  258. }
  259. return nil
  260. })
  261. if err != nil {
  262. return err
  263. }
  264. // adjust free volume count
  265. dst.dataNode.DiskInfos[replica.info.DiskType].FreeVolumeCount--
  266. break
  267. }
  268. }
  269. if !foundNewLocation && !hasSkippedCollection {
  270. fmt.Fprintf(writer, "failed to place volume %d replica as %s, existing:%+v\n", replica.info.Id, replicaPlacement, len(replicas))
  271. }
  272. return nil
  273. }
  274. func keepDataNodesSorted(dataNodes []location, diskType types.DiskType) {
  275. fn := capacityByFreeVolumeCount(diskType)
  276. slices.SortFunc(dataNodes, func(a, b location) bool {
  277. return fn(a.dataNode) > fn(b.dataNode)
  278. })
  279. }
  280. /*
  281. if on an existing data node {
  282. return false
  283. }
  284. if different from existing dcs {
  285. if lack on different dcs {
  286. return true
  287. }else{
  288. return false
  289. }
  290. }
  291. if not on primary dc {
  292. return false
  293. }
  294. if different from existing racks {
  295. if lack on different racks {
  296. return true
  297. }else{
  298. return false
  299. }
  300. }
  301. if not on primary rack {
  302. return false
  303. }
  304. if lacks on same rack {
  305. return true
  306. } else {
  307. return false
  308. }
  309. */
  310. func satisfyReplicaPlacement(replicaPlacement *super_block.ReplicaPlacement, replicas []*VolumeReplica, possibleLocation location) bool {
  311. existingDataCenters, _, existingDataNodes := countReplicas(replicas)
  312. if _, found := existingDataNodes[possibleLocation.String()]; found {
  313. // avoid duplicated volume on the same data node
  314. return false
  315. }
  316. primaryDataCenters, _ := findTopKeys(existingDataCenters)
  317. // ensure data center count is within limit
  318. if _, found := existingDataCenters[possibleLocation.DataCenter()]; !found {
  319. // different from existing dcs
  320. if len(existingDataCenters) < replicaPlacement.DiffDataCenterCount+1 {
  321. // lack on different dcs
  322. return true
  323. } else {
  324. // adding this would go over the different dcs limit
  325. return false
  326. }
  327. }
  328. // now this is same as one of the existing data center
  329. if !isAmong(possibleLocation.DataCenter(), primaryDataCenters) {
  330. // not on one of the primary dcs
  331. return false
  332. }
  333. // now this is one of the primary dcs
  334. primaryDcRacks := make(map[string]int)
  335. for _, replica := range replicas {
  336. if replica.location.DataCenter() != possibleLocation.DataCenter() {
  337. continue
  338. }
  339. primaryDcRacks[replica.location.Rack()] += 1
  340. }
  341. primaryRacks, _ := findTopKeys(primaryDcRacks)
  342. sameRackCount := primaryDcRacks[possibleLocation.Rack()]
  343. // ensure rack count is within limit
  344. if _, found := primaryDcRacks[possibleLocation.Rack()]; !found {
  345. // different from existing racks
  346. if len(primaryDcRacks) < replicaPlacement.DiffRackCount+1 {
  347. // lack on different racks
  348. return true
  349. } else {
  350. // adding this would go over the different racks limit
  351. return false
  352. }
  353. }
  354. // now this is same as one of the existing racks
  355. if !isAmong(possibleLocation.Rack(), primaryRacks) {
  356. // not on the primary rack
  357. return false
  358. }
  359. // now this is on the primary rack
  360. // different from existing data nodes
  361. if sameRackCount < replicaPlacement.SameRackCount+1 {
  362. // lack on same rack
  363. return true
  364. } else {
  365. // adding this would go over the same data node limit
  366. return false
  367. }
  368. }
  369. func findTopKeys(m map[string]int) (topKeys []string, max int) {
  370. for k, c := range m {
  371. if max < c {
  372. topKeys = topKeys[:0]
  373. topKeys = append(topKeys, k)
  374. max = c
  375. } else if max == c {
  376. topKeys = append(topKeys, k)
  377. }
  378. }
  379. return
  380. }
  381. func isAmong(key string, keys []string) bool {
  382. for _, k := range keys {
  383. if k == key {
  384. return true
  385. }
  386. }
  387. return false
  388. }
// VolumeReplica is one copy of a volume together with the place it lives.
type VolumeReplica struct {
	// location is the data center, rack and data node holding this replica.
	location *location
	// info is the volume information message reported for this replica.
	info *master_pb.VolumeInformationMessage
}
// location identifies a data node by data center, rack and node info.
type location struct {
	dc       string
	rack     string
	dataNode *master_pb.DataNodeInfo
}
  398. func newLocation(dc, rack string, dataNode *master_pb.DataNodeInfo) location {
  399. return location{
  400. dc: dc,
  401. rack: rack,
  402. dataNode: dataNode,
  403. }
  404. }
  405. func (l location) String() string {
  406. return fmt.Sprintf("%s %s %s", l.dc, l.rack, l.dataNode.Id)
  407. }
  408. func (l location) Rack() string {
  409. return fmt.Sprintf("%s %s", l.dc, l.rack)
  410. }
// DataCenter returns the data center name of the location.
func (l location) DataCenter() string {
	return l.dc
}
  414. func pickOneReplicaToCopyFrom(replicas []*VolumeReplica) *VolumeReplica {
  415. mostRecent := replicas[0]
  416. for _, replica := range replicas {
  417. if replica.info.ModifiedAtSecond > mostRecent.info.ModifiedAtSecond {
  418. mostRecent = replica
  419. }
  420. }
  421. return mostRecent
  422. }
  423. func countReplicas(replicas []*VolumeReplica) (diffDc, diffRack, diffNode map[string]int) {
  424. diffDc = make(map[string]int)
  425. diffRack = make(map[string]int)
  426. diffNode = make(map[string]int)
  427. for _, replica := range replicas {
  428. diffDc[replica.location.DataCenter()] += 1
  429. diffRack[replica.location.Rack()] += 1
  430. diffNode[replica.location.String()] += 1
  431. }
  432. return
  433. }
  434. func pickOneReplicaToDelete(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) *VolumeReplica {
  435. slices.SortFunc(replicas, func(a, b *VolumeReplica) bool {
  436. if a.info.Size != b.info.Size {
  437. return a.info.Size < b.info.Size
  438. }
  439. if a.info.ModifiedAtSecond != b.info.ModifiedAtSecond {
  440. return a.info.ModifiedAtSecond < b.info.ModifiedAtSecond
  441. }
  442. if a.info.CompactRevision != b.info.CompactRevision {
  443. return a.info.CompactRevision < b.info.CompactRevision
  444. }
  445. return false
  446. })
  447. return replicas[0]
  448. }
  449. // check and fix misplaced volumes
  450. func isMisplaced(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) bool {
  451. for i := 0; i < len(replicas); i++ {
  452. others := otherThan(replicas, i)
  453. if satisfyReplicaPlacement(replicaPlacement, others, *replicas[i].location) {
  454. return false
  455. }
  456. }
  457. return true
  458. }
  459. func otherThan(replicas []*VolumeReplica, index int) (others []*VolumeReplica) {
  460. for i := 0; i < len(replicas); i++ {
  461. if index != i {
  462. others = append(others, replicas[i])
  463. }
  464. }
  465. return
  466. }
  467. func pickOneMisplacedVolume(replicas []*VolumeReplica, replicaPlacement *super_block.ReplicaPlacement) (toDelete *VolumeReplica) {
  468. var deletionCandidates []*VolumeReplica
  469. for i := 0; i < len(replicas); i++ {
  470. others := otherThan(replicas, i)
  471. if !isMisplaced(others, replicaPlacement) {
  472. deletionCandidates = append(deletionCandidates, replicas[i])
  473. }
  474. }
  475. if len(deletionCandidates) > 0 {
  476. return pickOneReplicaToDelete(deletionCandidates, replicaPlacement)
  477. }
  478. return pickOneReplicaToDelete(replicas, replicaPlacement)
  479. }