package weed_server

// https://yusufs.medium.com/creating-distributed-kv-database-by-implementing-raft-consensus-using-golang-d0884eef2e28
// https://github.com/Jille/raft-grpc-example/blob/cd5bcab0218f008e044fbeee4facdd01b06018ad/application.go#L18

import (
	"fmt"
	"math/rand/v2"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strings"
	"time"

	transport "github.com/Jille/raft-grpc-transport"
	"github.com/armon/go-metrics"
	"github.com/armon/go-metrics/prometheus"
	"github.com/hashicorp/raft"
	boltdb "github.com/hashicorp/raft-boltdb/v2"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/stats"
	"google.golang.org/grpc"
)

const (
	ldbFile            = "logs.dat"
	sdbFile            = "stable.dat"
	updatePeersTimeout = 15 * time.Minute
)

// getPeerIdx returns the position of self in the lexicographically sorted
// peer list, or -1 if self is not a known peer. Because every node sorts
// the same peer set, all nodes agree on each other's index; this is used
// below to stagger bootstrap attempts.
func getPeerIdx(self pb.ServerAddress, mapPeers map[string]pb.ServerAddress) int {
	peers := make([]pb.ServerAddress, 0, len(mapPeers))
	for _, peer := range mapPeers {
		peers = append(peers, peer)
	}
	sort.Slice(peers, func(i, j int) bool {
		return strings.Compare(string(peers[i]), string(peers[j])) < 0
	})
	for i, peer := range peers {
		if string(peer) == string(self) {
			return i
		}
	}
	return -1
}
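
// A minimal usage sketch (hypothetical addresses): with three peers, every
// node computes the same sorted order, so each resolves to a stable index.
//
//	peers := map[string]pb.ServerAddress{
//		"m1:9333": pb.ServerAddress("m1:9333"),
//		"m2:9333": pb.ServerAddress("m2:9333"),
//		"m3:9333": pb.ServerAddress("m3:9333"),
//	}
//	idx := getPeerIdx(pb.ServerAddress("m2:9333"), peers) // idx == 1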

// AddPeersConfiguration builds a raft.Configuration that lists every known
// peer as a voting member, using the peer address as the server ID and the
// peer's gRPC address as the raft transport address.
func (s *RaftServer) AddPeersConfiguration() (cfg raft.Configuration) {
	for _, peer := range s.peers {
		cfg.Servers = append(cfg.Servers, raft.Server{
			Suffrage: raft.Voter,
			ID:       raft.ServerID(peer),
			Address:  raft.ServerAddress(peer.ToGrpcAddress()),
		})
	}
	return cfg
}
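
// A minimal sketch of the result, assuming two hypothetical peers "m1:9333"
// and "m2:9333" and the default gRPC port offset used by ToGrpcAddress:
//
//	cfg := s.AddPeersConfiguration()
//	// cfg.Servers == []raft.Server{
//	//	{Suffrage: raft.Voter, ID: "m1:9333", Address: "m1:19333"},
//	//	{Suffrage: raft.Voter, ID: "m2:9333", Address: "m2:19333"},
//	// }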

// monitorLeaderLoop watches raft leadership change events. When this node
// gains leadership it reconciles cluster membership against the configured
// peers (at most once) and raises the topology barrier; when it loses
// leadership it resets the barrier.
func (s *RaftServer) monitorLeaderLoop(updatePeers bool) {
	for {
		prevLeader, _ := s.RaftHashicorp.LeaderWithID()
		select {
		case isLeader := <-s.RaftHashicorp.LeaderCh():
			leader, _ := s.RaftHashicorp.LeaderWithID()
			if isLeader {
				if updatePeers {
					s.updatePeers()
					updatePeers = false
				}
				s.topo.DoBarrier()
				stats.MasterLeaderChangeCounter.WithLabelValues(fmt.Sprintf("%+v", leader)).Inc()
			} else {
				s.topo.BarrierReset()
			}
			glog.V(0).Infof("is leader %+v change event: %+v => %+v", isLeader, prevLeader, leader)
			prevLeader = leader
			s.topo.LastLeaderChangeTime = time.Now()
		}
	}
}
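
// Note: LeaderCh only reports this node acquiring or losing leadership.
// Code that just needs the current role can ask the raft instance directly;
// a minimal sketch:
//
//	if s.RaftHashicorp.State() == raft.Leader {
//		// this node is currently the leader
//	}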

// updatePeers reconciles the raft cluster membership with the statically
// configured peer list: configured peers missing from the raft configuration
// are added as voters, and raft servers no longer in the peer list are
// removed. It runs once, when this node first becomes leader.
func (s *RaftServer) updatePeers() {
	peerLeader := string(s.serverAddr)
	existsPeerName := make(map[string]bool)
	for _, server := range s.RaftHashicorp.GetConfiguration().Configuration().Servers {
		if string(server.ID) == peerLeader {
			continue
		}
		existsPeerName[string(server.ID)] = true
	}
	for _, peer := range s.peers {
		peerName := string(peer)
		if peerName == peerLeader || existsPeerName[peerName] {
			continue
		}
		glog.V(0).Infof("adding new peer: %s", peerName)
		s.RaftHashicorp.AddVoter(
			raft.ServerID(peerName), raft.ServerAddress(peer.ToGrpcAddress()), 0, 0)
	}
	for peer := range existsPeerName {
		if _, found := s.peers[peer]; !found {
			glog.V(0).Infof("removing old peer: %s", peer)
			s.RaftHashicorp.RemoveServer(raft.ServerID(peer), 0, 0)
		}
	}
	if _, found := s.peers[peerLeader]; !found {
		glog.V(0).Infof("removing old leader peer: %s", peerLeader)
		s.RaftHashicorp.RemoveServer(raft.ServerID(peerLeader), 0, 0)
	}
}
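
// The futures returned by AddVoter/RemoveServer are discarded above; a
// caller that wants to confirm a membership change could wait on them.
// A minimal sketch, using the same arguments as above:
//
//	f := s.RaftHashicorp.AddVoter(raft.ServerID(peerName),
//		raft.ServerAddress(peer.ToGrpcAddress()), 0, 0)
//	if err := f.Error(); err != nil {
//		glog.V(0).Infof("add voter %s: %v", peerName, err)
//	}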

// NewHashicorpRaftServer creates a hashicorp/raft based RaftServer: it
// derives the raft config from the option, opens the bolt-backed log and
// stable stores plus a file snapshot store, wires up the gRPC transport,
// and bootstraps the cluster when requested.
func NewHashicorpRaftServer(option *RaftServerOption) (*RaftServer, error) {
	s := &RaftServer{
		peers:      option.Peers,
		serverAddr: option.ServerAddr,
		dataDir:    option.DataDir,
		topo:       option.Topo,
	}

	c := raft.DefaultConfig()
	c.LocalID = raft.ServerID(s.serverAddr) // TODO: maybe the IP:port address will change
	// Add up to 25% random jitter to the heartbeat timeout so that nodes do
	// not time out in lockstep.
	c.HeartbeatTimeout = time.Duration(float64(option.HeartbeatInterval) * (rand.Float64()*0.25 + 1))
	c.ElectionTimeout = option.ElectionTimeout
	if c.LeaderLeaseTimeout > c.HeartbeatTimeout {
		c.LeaderLeaseTimeout = c.HeartbeatTimeout
	}
	// Map glog verbosity to the raft logger's level.
	if glog.V(4) {
		c.LogLevel = "Debug"
	} else if glog.V(2) {
		c.LogLevel = "Info"
	} else if glog.V(1) {
		c.LogLevel = "Warn"
	} else if glog.V(0) {
		c.LogLevel = "Error"
	}
	if err := raft.ValidateConfig(c); err != nil {
		return nil, fmt.Errorf(`raft.ValidateConfig: %v`, err)
	}

	if option.RaftBootstrap {
		// Wipe any previous raft state so the bootstrap starts clean.
		os.RemoveAll(path.Join(s.dataDir, ldbFile))
		os.RemoveAll(path.Join(s.dataDir, sdbFile))
		os.RemoveAll(path.Join(s.dataDir, "snapshots"))
	}
	if err := os.MkdirAll(path.Join(s.dataDir, "snapshots"), os.ModePerm); err != nil {
		return nil, err
	}
	baseDir := s.dataDir

	ldb, err := boltdb.NewBoltStore(filepath.Join(baseDir, ldbFile))
	if err != nil {
		return nil, fmt.Errorf(`boltdb.NewBoltStore(%q): %v`, filepath.Join(baseDir, ldbFile), err)
	}

	sdb, err := boltdb.NewBoltStore(filepath.Join(baseDir, sdbFile))
	if err != nil {
		return nil, fmt.Errorf(`boltdb.NewBoltStore(%q): %v`, filepath.Join(baseDir, sdbFile), err)
	}

	fss, err := raft.NewFileSnapshotStore(baseDir, 3, os.Stderr)
	if err != nil {
		return nil, fmt.Errorf(`raft.NewFileSnapshotStore(%q, ...): %v`, baseDir, err)
	}

	s.TransportManager = transport.New(raft.ServerAddress(s.serverAddr), []grpc.DialOption{option.GrpcDialOption})

	stateMachine := StateMachine{topo: option.Topo}
	s.RaftHashicorp, err = raft.NewRaft(c, &stateMachine, ldb, sdb, fss, s.TransportManager.Transport())
	if err != nil {
		return nil, fmt.Errorf("raft.NewRaft: %v", err)
	}

	updatePeers := false
	if option.RaftBootstrap || len(s.RaftHashicorp.GetConfiguration().Configuration().Servers) == 0 {
		cfg := s.AddPeersConfiguration()
		// There is no cluster-wide lock here; instead, stagger bootstrap
		// attempts by peer index so that servers starting at the same time
		// do not all bootstrap at once.
		peerIdx := getPeerIdx(s.serverAddr, s.peers)
		timeSleep := time.Duration(float64(c.LeaderLeaseTimeout) * (rand.Float64()*0.25 + 1) * float64(peerIdx))
		glog.V(0).Infof("Bootstrapping idx: %d sleep: %v new cluster: %+v", peerIdx, timeSleep, cfg)
		time.Sleep(timeSleep)
		f := s.RaftHashicorp.BootstrapCluster(cfg)
		if err := f.Error(); err != nil {
			return nil, fmt.Errorf("raft.Raft.BootstrapCluster: %v", err)
		}
	} else {
		// Existing raft state was found on disk; reconcile membership with
		// the configured peers once this node becomes leader.
		updatePeers = true
	}

	go s.monitorLeaderLoop(updatePeers)

	ticker := time.NewTicker(c.HeartbeatTimeout * 10)
	if glog.V(4) {
		// Periodically dump the raft configuration for debugging.
		go func() {
			for {
				select {
				case <-ticker.C:
					cfuture := s.RaftHashicorp.GetConfiguration()
					if err := cfuture.Error(); err != nil {
						glog.Fatalf("error getting config: %s", err)
					}
					configuration := cfuture.Configuration()
					glog.V(4).Infof("Showing peers known by %s:\n%+v", s.RaftHashicorp.String(), configuration.Servers)
				}
			}
		}()
	}

	// Configure a prometheus sink as the raft metrics sink.
	if sink, err := prometheus.NewPrometheusSinkFrom(prometheus.PrometheusOpts{
		Registerer: stats.Gather,
	}); err != nil {
		return nil, fmt.Errorf("NewPrometheusSink: %v", err)
	} else {
		metricsConf := metrics.DefaultConfig(stats.Namespace)
		metricsConf.EnableRuntimeMetrics = false
		if _, err = metrics.NewGlobal(metricsConf, sink); err != nil {
			return nil, fmt.Errorf("metrics.NewGlobal: %v", err)
		}
	}

	return s, nil
}
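
// A minimal sketch of how this constructor is typically wired up (field
// values are illustrative; RaftServerOption is defined elsewhere in this
// package, and only fields referenced above are shown):
//
//	option := &RaftServerOption{
//		Peers:             peers,  // map[string]pb.ServerAddress
//		ServerAddr:        myAddr, // this node's pb.ServerAddress
//		DataDir:           "/data/raft", // hypothetical path
//		Topo:              topology,
//		RaftBootstrap:     false, // true only when forming a new cluster
//		HeartbeatInterval: 300 * time.Millisecond,
//		ElectionTimeout:   10 * time.Second,
//		GrpcDialOption:    grpcDialOption,
//	}
//	raftServer, err := NewHashicorpRaftServer(option)
//	if err != nil {
//		glog.Fatalf("NewHashicorpRaftServer: %v", err)
//	}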