You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

265 lines
8.4 KiB

3 months ago
  1. package shell
  2. import (
  3. "context"
  4. "flag"
  5. "fmt"
  6. "io"
  7. "github.com/seaweedfs/seaweedfs/weed/cluster"
  8. "github.com/seaweedfs/seaweedfs/weed/pb"
  9. "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
  10. "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
  11. "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
  12. )
  13. func init() {
  14. Commands = append(Commands, &commandClusterCheck{})
  15. }
  // commandClusterCheck is the stateless receiver for the cluster connectivity
  // check command; it carries no fields.
  type commandClusterCheck struct{}
  18. func (c *commandClusterCheck) Name() string {
  19. return "cluster.check"
  20. }
  21. func (c *commandClusterCheck) Help() string {
  22. return `check current cluster network connectivity
  23. cluster.check
  24. `
  25. }
  26. func (c *commandClusterCheck) HasTag(CommandTag) bool {
  27. return false
  28. }
  29. func (c *commandClusterCheck) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
  30. clusterPsCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
  31. if err = clusterPsCommand.Parse(args); err != nil {
  32. return nil
  33. }
  34. // collect topology information
  35. topologyInfo, volumeSizeLimitMb, err := collectTopologyInfo(commandEnv, 0)
  36. if err != nil {
  37. return err
  38. }
  39. fmt.Fprintf(writer, "Topology volumeSizeLimit:%d MB%s\n", volumeSizeLimitMb, diskInfosToString(topologyInfo.DiskInfos))
  40. if len(topologyInfo.DiskInfos) == 0 {
  41. return fmt.Errorf("no disk type defined")
  42. }
  43. for diskType, diskInfo := range topologyInfo.DiskInfos {
  44. if diskInfo.MaxVolumeCount == 0 {
  45. return fmt.Errorf("no volume available for \"%s\" disk type", diskType)
  46. }
  47. }
  48. // collect filers
  49. var filers []pb.ServerAddress
  50. err = commandEnv.MasterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
  51. resp, err := client.ListClusterNodes(context.Background(), &master_pb.ListClusterNodesRequest{
  52. ClientType: cluster.FilerType,
  53. FilerGroup: *commandEnv.option.FilerGroup,
  54. })
  55. for _, node := range resp.ClusterNodes {
  56. filers = append(filers, pb.ServerAddress(node.Address))
  57. }
  58. return err
  59. })
  60. if err != nil {
  61. return
  62. }
  63. fmt.Fprintf(writer, "the cluster has %d filers: %+v\n", len(filers), filers)
  64. if len(filers) > 0 {
  65. genericDiskInfo, genericDiskInfoOk := topologyInfo.DiskInfos[""]
  66. hddDiskInfo, hddDiskInfoOk := topologyInfo.DiskInfos["hdd"]
  67. if !genericDiskInfoOk && !hddDiskInfoOk {
  68. return fmt.Errorf("filer metadata logs need generic or hdd disk type to be defined")
  69. }
  70. if (genericDiskInfoOk && genericDiskInfo.MaxVolumeCount == 0) || (hddDiskInfoOk && hddDiskInfo.MaxVolumeCount == 0) {
  71. return fmt.Errorf("filer metadata logs need generic or hdd volumes to be available")
  72. }
  73. }
  74. // collect volume servers
  75. var volumeServers []pb.ServerAddress
  76. t, _, err := collectTopologyInfo(commandEnv, 0)
  77. if err != nil {
  78. return err
  79. }
  80. for _, dc := range t.DataCenterInfos {
  81. for _, r := range dc.RackInfos {
  82. for _, dn := range r.DataNodeInfos {
  83. volumeServers = append(volumeServers, pb.NewServerAddressFromDataNode(dn))
  84. }
  85. }
  86. }
  87. fmt.Fprintf(writer, "the cluster has %d volume servers: %+v\n", len(volumeServers), volumeServers)
  88. // collect all masters
  89. var masters []pb.ServerAddress
  90. masters = append(masters, commandEnv.MasterClient.GetMasters(context.Background())...)
  91. // check from master to volume servers
  92. for _, master := range masters {
  93. for _, volumeServer := range volumeServers {
  94. fmt.Fprintf(writer, "checking master %s to volume server %s ... ", string(master), string(volumeServer))
  95. err := pb.WithMasterClient(false, master, commandEnv.option.GrpcDialOption, false, func(client master_pb.SeaweedClient) error {
  96. pong, err := client.Ping(context.Background(), &master_pb.PingRequest{
  97. Target: string(volumeServer),
  98. TargetType: cluster.VolumeServerType,
  99. })
  100. if err == nil {
  101. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  102. }
  103. return err
  104. })
  105. if err != nil {
  106. fmt.Fprintf(writer, "%v\n", err)
  107. }
  108. }
  109. }
  110. // check between masters
  111. for _, sourceMaster := range masters {
  112. for _, targetMaster := range masters {
  113. if sourceMaster == targetMaster {
  114. continue
  115. }
  116. fmt.Fprintf(writer, "checking master %s to %s ... ", string(sourceMaster), string(targetMaster))
  117. err := pb.WithMasterClient(false, sourceMaster, commandEnv.option.GrpcDialOption, false, func(client master_pb.SeaweedClient) error {
  118. pong, err := client.Ping(context.Background(), &master_pb.PingRequest{
  119. Target: string(targetMaster),
  120. TargetType: cluster.MasterType,
  121. })
  122. if err == nil {
  123. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  124. }
  125. return err
  126. })
  127. if err != nil {
  128. fmt.Fprintf(writer, "%v\n", err)
  129. }
  130. }
  131. }
  132. // check from volume servers to masters
  133. for _, volumeServer := range volumeServers {
  134. for _, master := range masters {
  135. fmt.Fprintf(writer, "checking volume server %s to master %s ... ", string(volumeServer), string(master))
  136. err := pb.WithVolumeServerClient(false, volumeServer, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  137. pong, err := client.Ping(context.Background(), &volume_server_pb.PingRequest{
  138. Target: string(master),
  139. TargetType: cluster.MasterType,
  140. })
  141. if err == nil {
  142. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  143. }
  144. return err
  145. })
  146. if err != nil {
  147. fmt.Fprintf(writer, "%v\n", err)
  148. }
  149. }
  150. }
  151. // check from filers to masters
  152. for _, filer := range filers {
  153. for _, master := range masters {
  154. fmt.Fprintf(writer, "checking filer %s to master %s ... ", string(filer), string(master))
  155. err := pb.WithFilerClient(false, 0, filer, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  156. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  157. Target: string(master),
  158. TargetType: cluster.MasterType,
  159. })
  160. if err == nil {
  161. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  162. }
  163. return err
  164. })
  165. if err != nil {
  166. fmt.Fprintf(writer, "%v\n", err)
  167. }
  168. }
  169. }
  170. // check from filers to volume servers
  171. for _, filer := range filers {
  172. for _, volumeServer := range volumeServers {
  173. fmt.Fprintf(writer, "checking filer %s to volume server %s ... ", string(filer), string(volumeServer))
  174. err := pb.WithFilerClient(false, 0, filer, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  175. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  176. Target: string(volumeServer),
  177. TargetType: cluster.VolumeServerType,
  178. })
  179. if err == nil {
  180. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  181. }
  182. return err
  183. })
  184. if err != nil {
  185. fmt.Fprintf(writer, "%v\n", err)
  186. }
  187. }
  188. }
  189. // check between volume servers
  190. for _, sourceVolumeServer := range volumeServers {
  191. for _, targetVolumeServer := range volumeServers {
  192. if sourceVolumeServer == targetVolumeServer {
  193. continue
  194. }
  195. fmt.Fprintf(writer, "checking volume server %s to %s ... ", string(sourceVolumeServer), string(targetVolumeServer))
  196. err := pb.WithVolumeServerClient(false, sourceVolumeServer, commandEnv.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
  197. pong, err := client.Ping(context.Background(), &volume_server_pb.PingRequest{
  198. Target: string(targetVolumeServer),
  199. TargetType: cluster.VolumeServerType,
  200. })
  201. if err == nil {
  202. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  203. }
  204. return err
  205. })
  206. if err != nil {
  207. fmt.Fprintf(writer, "%v\n", err)
  208. }
  209. }
  210. }
  211. // check between filers, and need to connect to itself
  212. for _, sourceFiler := range filers {
  213. for _, targetFiler := range filers {
  214. fmt.Fprintf(writer, "checking filer %s to %s ... ", string(sourceFiler), string(targetFiler))
  215. err := pb.WithFilerClient(false, 0, sourceFiler, commandEnv.option.GrpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
  216. pong, err := client.Ping(context.Background(), &filer_pb.PingRequest{
  217. Target: string(targetFiler),
  218. TargetType: cluster.FilerType,
  219. })
  220. if err == nil {
  221. printTiming(writer, pong.StartTimeNs, pong.RemoteTimeNs, pong.StopTimeNs)
  222. }
  223. return err
  224. })
  225. if err != nil {
  226. fmt.Fprintf(writer, "%v\n", err)
  227. }
  228. }
  229. }
  230. return nil
  231. }
  // printTiming writes the result line of a successful ping to writer.
  //
  // The round trip is stopNs-startNs. The clock delta is remoteNs minus the
  // midpoint of startNs and stopNs. Both are converted from nanoseconds to
  // milliseconds and printed with three decimals.
  func printTiming(writer io.Writer, startNs, remoteNs, stopNs int64) {
  	const nsPerMs = 1000000

  	elapsedNs := stopNs - startNs
  	midpointNs := (startNs + stopNs) / 2
  	skewNs := remoteNs - midpointNs

  	fmt.Fprintf(
  		writer,
  		"ok round trip %.3fms clock delta %.3fms\n",
  		float32(elapsedNs)/nsPerMs,
  		float32(skewNs)/nsPerMs,
  	)
  }