You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

271 lines
8.7 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. package shell
  2. import (
  3. "bytes"
  4. "context"
  5. "flag"
  6. "fmt"
  7. "github.com/chrislusf/seaweedfs/weed/operation"
  8. "github.com/chrislusf/seaweedfs/weed/pb"
  9. "github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
  10. "github.com/chrislusf/seaweedfs/weed/storage/needle_map"
  11. "io"
  12. "math"
  13. "sort"
  14. )
  15. func init() {
  16. Commands = append(Commands, &commandVolumeCheckDisk{})
  17. }
  // commandVolumeCheckDisk is the receiver type for the "volume.check.disk"
  // shell command defined in this file.
  18. type commandVolumeCheckDisk struct {
  // env is assigned from the CommandEnv passed to Do; the gRPC helpers in this
  // file read env.option.GrpcDialOption from it.
  19. env *CommandEnv
  20. }
  21. func (c *commandVolumeCheckDisk) Name() string {
  22. return "volume.check.disk"
  23. }
  // Help returns the help text of the command as a single raw string literal:
  // a one-line summary followed by a "How it works" outline.
  // NOTE(review): this view of the file has lost the literal's original
  // whitespace and blank lines, so the text is left untouched here.
  24. func (c *commandVolumeCheckDisk) Help() string {
  25. return `check all replicated volumes to find and fix inconsistencies
  26. How it works:
  27. find all volumes that are replicated
  28. for each volume id, if there are more than 2 replicas, find one pair with the largest 2 in file count.
  29. for the pair volume A and B
  30. append entries in A and not in B to B
  31. append entries in B and not in A to A
  32. `
  33. }
  34. func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  35. if err = commandEnv.confirmIsLocked(); err != nil {
  36. return
  37. }
  38. fsckCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  39. slowMode := fsckCommand.Bool("slow", false, "slow mode checks all replicas even file counts are the same")
  40. verbose := fsckCommand.Bool("v", false, "verbose mode")
  41. applyChanges := fsckCommand.Bool("force", false, "apply the fix")
  42. nonRepairThreshold := fsckCommand.Float64("nonRepairThreshold", 0.3, "repair when missing keys is not more than this limit")
  43. if err = fsckCommand.Parse(args); err != nil {
  44. return nil
  45. }
  46. c.env = commandEnv
  47. // collect topology information
  48. topologyInfo, _, err := collectTopologyInfo(commandEnv)
  49. if err != nil {
  50. return err
  51. }
  52. volumeReplicas, _ := collectVolumeReplicaLocations(topologyInfo)
  53. // pick 1 pairs of volume replica
  54. fileCount := func(replica *VolumeReplica) uint64 {
  55. return replica.info.FileCount - replica.info.DeleteCount
  56. }
  57. aDB, bDB := needle_map.NewMemDb(), needle_map.NewMemDb()
  58. defer aDB.Close()
  59. defer bDB.Close()
  60. for _, replicas := range volumeReplicas {
  61. sort.Slice(replicas, func(i, j int) bool {
  62. return fileCount(replicas[i]) > fileCount(replicas[j])
  63. })
  64. for len(replicas) >= 2 {
  65. a, b := replicas[0], replicas[1]
  66. if !*slowMode {
  67. if fileCount(a) == fileCount(b) {
  68. replicas = replicas[1:]
  69. continue
  70. }
  71. }
  72. if a.info.ReadOnly || b.info.ReadOnly {
  73. fmt.Fprintf(writer, "skipping readonly volume %d on %s and %s\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id)
  74. replicas = replicas[1:]
  75. continue
  76. }
  77. if err := c.syncTwoReplicas(aDB, bDB, a, verbose, writer, b, err, applyChanges, nonRepairThreshold); err != nil {
  78. fmt.Fprintf(writer, "sync volume %d on %s and %s: %v\n", a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, err)
  79. }
  80. replicas = replicas[1:]
  81. }
  82. }
  83. return nil
  84. }
  85. func (c *commandVolumeCheckDisk) syncTwoReplicas(aDB *needle_map.MemDb, bDB *needle_map.MemDb, a *VolumeReplica, verbose *bool, writer io.Writer, b *VolumeReplica, err error, applyChanges *bool, nonRepairThreshold *float64) error {
  86. aHasChanges, bHasChanges := true, true
  87. for aHasChanges || bHasChanges {
  88. // reset index db
  89. aDB.Close()
  90. bDB.Close()
  91. aDB, bDB = needle_map.NewMemDb(), needle_map.NewMemDb()
  92. // read index db
  93. if err := c.readIndexDatabase(aDB, a.info.Collection, a.info.Id, pb.NewServerAddressFromDataNode(a.location.dataNode), *verbose, writer); err != nil {
  94. return err
  95. }
  96. if err := c.readIndexDatabase(bDB, b.info.Collection, b.info.Id, pb.NewServerAddressFromDataNode(b.location.dataNode), *verbose, writer); err != nil {
  97. return err
  98. }
  99. // find and make up the differences
  100. if aHasChanges, err = c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges, *nonRepairThreshold); err != nil {
  101. return err
  102. }
  103. if bHasChanges, err = c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *nonRepairThreshold); err != nil {
  104. return err
  105. }
  106. }
  107. return nil
  108. }
  109. func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, nonRepairThreshold float64) (hasChanges bool, err error) {
  110. // find missing keys
  111. // hash join, can be more efficient
  112. var missingNeedles []needle_map.NeedleValue
  113. var counter int
  114. subtrahend.AscendingVisit(func(value needle_map.NeedleValue) error {
  115. counter++
  116. if _, found := minuend.Get(value.Key); !found {
  117. missingNeedles = append(missingNeedles, value)
  118. }
  119. return nil
  120. })
  121. fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles))
  122. if counter == 0 || len(missingNeedles) == 0 {
  123. return false, nil
  124. }
  125. missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter)
  126. if missingNeedlesFraction > nonRepairThreshold {
  127. return false, fmt.Errorf(
  128. "failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f",
  129. source.info.Id, missingNeedlesFraction, nonRepairThreshold)
  130. }
  131. for _, needleValue := range missingNeedles {
  132. needleBlob, err := c.readSourceNeedleBlob(pb.NewServerAddressFromDataNode(source.location.dataNode), source.info.Id, needleValue)
  133. if err != nil {
  134. return hasChanges, err
  135. }
  136. if !applyChanges {
  137. continue
  138. }
  139. if verbose {
  140. fmt.Fprintf(writer, "read %d,%x %s => %s \n", source.info.Id, needleValue.Key, source.location.dataNode.Id, target.location.dataNode.Id)
  141. }
  142. hasChanges = true
  143. if err = c.writeNeedleBlobToTarget(pb.NewServerAddressFromDataNode(target.location.dataNode), source.info.Id, needleValue, needleBlob); err != nil {
  144. return hasChanges, err
  145. }
  146. }
  147. return
  148. }
  149. func (c *commandVolumeCheckDisk) readSourceNeedleBlob(sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {
  150. err = operation.WithVolumeServerClient(sourceVolumeServer, c.env.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  151. resp, err := client.ReadNeedleBlob(context.Background(), &volume_server_pb.ReadNeedleBlobRequest{
  152. VolumeId: volumeId,
  153. NeedleId: uint64(needleValue.Key),
  154. Offset: needleValue.Offset.ToActualOffset(),
  155. Size: int32(needleValue.Size),
  156. })
  157. if err != nil {
  158. return err
  159. }
  160. needleBlob = resp.NeedleBlob
  161. return nil
  162. })
  163. return
  164. }
  165. func (c *commandVolumeCheckDisk) writeNeedleBlobToTarget(targetVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue, needleBlob []byte) error {
  166. return operation.WithVolumeServerClient(targetVolumeServer, c.env.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  167. _, err := client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{
  168. VolumeId: volumeId,
  169. NeedleId: uint64(needleValue.Key),
  170. Size: int32(needleValue.Size),
  171. NeedleBlob: needleBlob,
  172. })
  173. return err
  174. })
  175. }
  176. func (c *commandVolumeCheckDisk) readIndexDatabase(db *needle_map.MemDb, collection string, volumeId uint32, volumeServer pb.ServerAddress, verbose bool, writer io.Writer) error {
  177. var buf bytes.Buffer
  178. if err := c.copyVolumeIndexFile(collection, volumeId, volumeServer, &buf, verbose, writer); err != nil {
  179. return err
  180. }
  181. if verbose {
  182. fmt.Fprintf(writer, "load collection %s volume %d index size %d from %s ...\n", collection, volumeId, buf.Len(), volumeServer)
  183. }
  184. return db.LoadFromReaderAt(bytes.NewReader(buf.Bytes()))
  185. }
  186. func (c *commandVolumeCheckDisk) copyVolumeIndexFile(collection string, volumeId uint32, volumeServer pb.ServerAddress, buf *bytes.Buffer, verbose bool, writer io.Writer) error {
  187. return operation.WithVolumeServerClient(volumeServer, c.env.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
  188. ext := ".idx"
  189. copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{
  190. VolumeId: volumeId,
  191. Ext: ".idx",
  192. CompactionRevision: math.MaxUint32,
  193. StopOffset: math.MaxInt64,
  194. Collection: collection,
  195. IsEcVolume: false,
  196. IgnoreSourceFileNotFound: false,
  197. })
  198. if err != nil {
  199. return fmt.Errorf("failed to start copying volume %d%s: %v", volumeId, ext, err)
  200. }
  201. err = writeToBuffer(copyFileClient, buf)
  202. if err != nil {
  203. return fmt.Errorf("failed to copy %d%s from %s: %v", volumeId, ext, volumeServer, err)
  204. }
  205. return nil
  206. })
  207. }
  208. func writeToBuffer(client volume_server_pb.VolumeServer_CopyFileClient, buf *bytes.Buffer) error {
  209. for {
  210. resp, receiveErr := client.Recv()
  211. if receiveErr == io.EOF {
  212. break
  213. }
  214. if receiveErr != nil {
  215. return fmt.Errorf("receiving: %v", receiveErr)
  216. }
  217. buf.Write(resp.FileContent)
  218. }
  219. return nil
  220. }