You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

447 lines
14 KiB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
4 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
  1. package stats
  import (
  	"log"
  	"net"
  	"net/http"
  	"os"
  	"strconv"
  	"strings"
  	"sync"
  	"time"

  	"github.com/prometheus/client_golang/prometheus"
  	"github.com/prometheus/client_golang/prometheus/collectors"
  	"github.com/prometheus/client_golang/prometheus/promhttp"
  	"github.com/prometheus/client_golang/prometheus/push"
  	"github.com/seaweedfs/seaweedfs/weed/glog"
  )
  // Namespace is the Prometheus namespace set on every metric declared in this
  // package.
  const Namespace = "SeaweedFS"

  // Names of the read-only volume categories; the same four values are collected
  // in readOnlyVolumeTypes.
  const (
  	IsReadOnly       = "IsReadOnly"
  	NoWriteOrDelete  = "noWriteOrDelete"
  	NoWriteCanDelete = "noWriteCanDelete"
  	IsDiskSpaceLow   = "isDiskSpaceLow"
  )

  // bucketAtiveTTL is how long a bucket may go without recorded activity before
  // bucketMetricTTLControl drops its per-bucket S3 metrics. It is also the pause
  // between two sweeps of that loop.
  const bucketAtiveTTL = 10 * time.Minute
  25. var readOnlyVolumeTypes = [4]string{IsReadOnly, NoWriteOrDelete, NoWriteCanDelete, IsDiskSpaceLow}
  26. var bucketLastActiveTsNs map[string]int64 = map[string]int64{}
  27. var (
  28. Gather = prometheus.NewRegistry()
  29. MasterClientConnectCounter = prometheus.NewCounterVec(
  30. prometheus.CounterOpts{
  31. Namespace: Namespace,
  32. Subsystem: "wdclient",
  33. Name: "connect_updates",
  34. Help: "Counter of master client leader updates.",
  35. }, []string{"type"})
  36. MasterRaftIsleader = prometheus.NewGauge(
  37. prometheus.GaugeOpts{
  38. Namespace: Namespace,
  39. Subsystem: "master",
  40. Name: "is_leader",
  41. Help: "is leader",
  42. })
  43. MasterAdminLock = prometheus.NewGaugeVec(
  44. prometheus.GaugeOpts{
  45. Namespace: Namespace,
  46. Subsystem: "master",
  47. Name: "admin_lock",
  48. Help: "admin lock",
  49. }, []string{"client"})
  50. MasterReceivedHeartbeatCounter = prometheus.NewCounterVec(
  51. prometheus.CounterOpts{
  52. Namespace: Namespace,
  53. Subsystem: "master",
  54. Name: "received_heartbeats",
  55. Help: "Counter of master received heartbeat.",
  56. }, []string{"type"})
  57. MasterReplicaPlacementMismatch = prometheus.NewGaugeVec(
  58. prometheus.GaugeOpts{
  59. Namespace: Namespace,
  60. Subsystem: "master",
  61. Name: "replica_placement_mismatch",
  62. Help: "replica placement mismatch",
  63. }, []string{"collection", "id"})
  64. MasterVolumeLayoutWritable = prometheus.NewGaugeVec(
  65. prometheus.GaugeOpts{
  66. Namespace: Namespace,
  67. Subsystem: "master",
  68. Name: "volume_layout_writable",
  69. Help: "Number of writable volumes in volume layouts",
  70. }, []string{"collection", "disk", "rp", "ttl"})
  71. MasterVolumeLayoutCrowded = prometheus.NewGaugeVec(
  72. prometheus.GaugeOpts{
  73. Namespace: Namespace,
  74. Subsystem: "master",
  75. Name: "volume_layout_crowded",
  76. Help: "Number of crowded volumes in volume layouts",
  77. }, []string{"collection", "disk", "rp", "ttl"})
  78. MasterPickForWriteErrorCounter = prometheus.NewCounter(
  79. prometheus.CounterOpts{
  80. Namespace: Namespace,
  81. Subsystem: "master",
  82. Name: "pick_for_write_error",
  83. Help: "Counter of master pick for write error",
  84. })
  85. MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
  86. prometheus.CounterOpts{
  87. Namespace: Namespace,
  88. Subsystem: "master",
  89. Name: "broadcast_to_full",
  90. Help: "Counter of master broadcast send to full message channel err",
  91. })
  92. MasterLeaderChangeCounter = prometheus.NewCounterVec(
  93. prometheus.CounterOpts{
  94. Namespace: Namespace,
  95. Subsystem: "master",
  96. Name: "leader_changes",
  97. Help: "Counter of master leader changes.",
  98. }, []string{"type"})
  99. FilerRequestCounter = prometheus.NewCounterVec(
  100. prometheus.CounterOpts{
  101. Namespace: Namespace,
  102. Subsystem: "filer",
  103. Name: "request_total",
  104. Help: "Counter of filer requests.",
  105. }, []string{"type", "code"})
  106. FilerHandlerCounter = prometheus.NewCounterVec(
  107. prometheus.CounterOpts{
  108. Namespace: Namespace,
  109. Subsystem: "filer",
  110. Name: "handler_total",
  111. Help: "Counter of filer handlers.",
  112. }, []string{"type"})
  113. FilerRequestHistogram = prometheus.NewHistogramVec(
  114. prometheus.HistogramOpts{
  115. Namespace: Namespace,
  116. Subsystem: "filer",
  117. Name: "request_seconds",
  118. Help: "Bucketed histogram of filer request processing time.",
  119. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  120. }, []string{"type"})
  121. FilerInFlightRequestsGauge = prometheus.NewGaugeVec(
  122. prometheus.GaugeOpts{
  123. Namespace: Namespace,
  124. Subsystem: "filer",
  125. Name: "in_flight_requests",
  126. Help: "Current number of in-flight requests being handled by filer.",
  127. }, []string{"type"})
  128. FilerServerLastSendTsOfSubscribeGauge = prometheus.NewGaugeVec(
  129. prometheus.GaugeOpts{
  130. Namespace: Namespace,
  131. Subsystem: "filer",
  132. Name: "last_send_timestamp_of_subscribe",
  133. Help: "The last send timestamp of the filer subscription.",
  134. }, []string{"sourceFiler", "clientName", "path"})
  135. FilerStoreCounter = prometheus.NewCounterVec(
  136. prometheus.CounterOpts{
  137. Namespace: Namespace,
  138. Subsystem: "filerStore",
  139. Name: "request_total",
  140. Help: "Counter of filer store requests.",
  141. }, []string{"store", "type"})
  142. FilerStoreHistogram = prometheus.NewHistogramVec(
  143. prometheus.HistogramOpts{
  144. Namespace: Namespace,
  145. Subsystem: "filerStore",
  146. Name: "request_seconds",
  147. Help: "Bucketed histogram of filer store request processing time.",
  148. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  149. }, []string{"store", "type"})
  150. FilerSyncOffsetGauge = prometheus.NewGaugeVec(
  151. prometheus.GaugeOpts{
  152. Namespace: Namespace,
  153. Subsystem: "filerSync",
  154. Name: "sync_offset",
  155. Help: "The offset of the filer synchronization service.",
  156. }, []string{"sourceFiler", "targetFiler", "clientName", "path"})
  157. VolumeServerRequestCounter = prometheus.NewCounterVec(
  158. prometheus.CounterOpts{
  159. Namespace: Namespace,
  160. Subsystem: "volumeServer",
  161. Name: "request_total",
  162. Help: "Counter of volume server requests.",
  163. }, []string{"type", "code"})
  164. VolumeServerHandlerCounter = prometheus.NewCounterVec(
  165. prometheus.CounterOpts{
  166. Namespace: Namespace,
  167. Subsystem: "volumeServer",
  168. Name: "handler_total",
  169. Help: "Counter of volume server handlers.",
  170. }, []string{"type"})
  171. VolumeServerVacuumingCompactCounter = prometheus.NewCounterVec(
  172. prometheus.CounterOpts{
  173. Namespace: Namespace,
  174. Subsystem: "volumeServer",
  175. Name: "vacuuming_compact_count",
  176. Help: "Counter of volume vacuuming Compact counter",
  177. }, []string{"success"})
  178. VolumeServerVacuumingCommitCounter = prometheus.NewCounterVec(
  179. prometheus.CounterOpts{
  180. Namespace: Namespace,
  181. Subsystem: "volumeServer",
  182. Name: "vacuuming_commit_count",
  183. Help: "Counter of volume vacuuming commit counter",
  184. }, []string{"success"})
  185. VolumeServerVacuumingHistogram = prometheus.NewHistogramVec(
  186. prometheus.HistogramOpts{
  187. Namespace: Namespace,
  188. Subsystem: "volumeServer",
  189. Name: "vacuuming_seconds",
  190. Help: "Bucketed histogram of volume server vacuuming processing time.",
  191. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  192. }, []string{"type"})
  193. VolumeServerRequestHistogram = prometheus.NewHistogramVec(
  194. prometheus.HistogramOpts{
  195. Namespace: Namespace,
  196. Subsystem: "volumeServer",
  197. Name: "request_seconds",
  198. Help: "Bucketed histogram of volume server request processing time.",
  199. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  200. }, []string{"type"})
  201. VolumeServerInFlightRequestsGauge = prometheus.NewGaugeVec(
  202. prometheus.GaugeOpts{
  203. Namespace: Namespace,
  204. Subsystem: "volumeServer",
  205. Name: "in_flight_requests",
  206. Help: "Current number of in-flight requests being handled by volume server.",
  207. }, []string{"type"})
  208. VolumeServerVolumeGauge = prometheus.NewGaugeVec(
  209. prometheus.GaugeOpts{
  210. Namespace: Namespace,
  211. Subsystem: "volumeServer",
  212. Name: "volumes",
  213. Help: "Number of volumes or shards.",
  214. }, []string{"collection", "type"})
  215. VolumeServerReadOnlyVolumeGauge = prometheus.NewGaugeVec(
  216. prometheus.GaugeOpts{
  217. Namespace: Namespace,
  218. Subsystem: "volumeServer",
  219. Name: "read_only_volumes",
  220. Help: "Number of read only volumes.",
  221. }, []string{"collection", "type"})
  222. VolumeServerMaxVolumeCounter = prometheus.NewGauge(
  223. prometheus.GaugeOpts{
  224. Namespace: Namespace,
  225. Subsystem: "volumeServer",
  226. Name: "max_volumes",
  227. Help: "Maximum number of volumes.",
  228. })
  229. VolumeServerDiskSizeGauge = prometheus.NewGaugeVec(
  230. prometheus.GaugeOpts{
  231. Namespace: Namespace,
  232. Subsystem: "volumeServer",
  233. Name: "total_disk_size",
  234. Help: "Actual disk size used by volumes.",
  235. }, []string{"collection", "type"})
  236. VolumeServerResourceGauge = prometheus.NewGaugeVec(
  237. prometheus.GaugeOpts{
  238. Namespace: Namespace,
  239. Subsystem: "volumeServer",
  240. Name: "resource",
  241. Help: "Resource usage",
  242. }, []string{"name", "type"})
  243. S3RequestCounter = prometheus.NewCounterVec(
  244. prometheus.CounterOpts{
  245. Namespace: Namespace,
  246. Subsystem: "s3",
  247. Name: "request_total",
  248. Help: "Counter of s3 requests.",
  249. }, []string{"type", "code", "bucket"})
  250. S3HandlerCounter = prometheus.NewCounterVec(
  251. prometheus.CounterOpts{
  252. Namespace: Namespace,
  253. Subsystem: "s3",
  254. Name: "handler_total",
  255. Help: "Counter of s3 server handlers.",
  256. }, []string{"type"})
  257. S3RequestHistogram = prometheus.NewHistogramVec(
  258. prometheus.HistogramOpts{
  259. Namespace: Namespace,
  260. Subsystem: "s3",
  261. Name: "request_seconds",
  262. Help: "Bucketed histogram of s3 request processing time.",
  263. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  264. }, []string{"type", "bucket"})
  265. S3TimeToFirstByteHistogram = prometheus.NewHistogramVec(
  266. prometheus.HistogramOpts{
  267. Namespace: Namespace,
  268. Subsystem: "s3",
  269. Name: "time_to_first_byte_millisecond",
  270. Help: "Bucketed histogram of s3 time to first byte request processing time.",
  271. Buckets: prometheus.ExponentialBuckets(0.001, 2, 27),
  272. }, []string{"type", "bucket"})
  273. S3InFlightRequestsGauge = prometheus.NewGaugeVec(
  274. prometheus.GaugeOpts{
  275. Namespace: Namespace,
  276. Subsystem: "s3",
  277. Name: "in_flight_requests",
  278. Help: "Current number of in-flight requests being handled by s3.",
  279. }, []string{"type"})
  280. )
  281. func init() {
  282. Gather.MustRegister(MasterClientConnectCounter)
  283. Gather.MustRegister(MasterRaftIsleader)
  284. Gather.MustRegister(MasterAdminLock)
  285. Gather.MustRegister(MasterReceivedHeartbeatCounter)
  286. Gather.MustRegister(MasterLeaderChangeCounter)
  287. Gather.MustRegister(MasterReplicaPlacementMismatch)
  288. Gather.MustRegister(MasterVolumeLayoutWritable)
  289. Gather.MustRegister(MasterVolumeLayoutCrowded)
  290. Gather.MustRegister(MasterPickForWriteErrorCounter)
  291. Gather.MustRegister(MasterBroadcastToFullErrorCounter)
  292. Gather.MustRegister(FilerRequestCounter)
  293. Gather.MustRegister(FilerHandlerCounter)
  294. Gather.MustRegister(FilerRequestHistogram)
  295. Gather.MustRegister(FilerInFlightRequestsGauge)
  296. Gather.MustRegister(FilerStoreCounter)
  297. Gather.MustRegister(FilerStoreHistogram)
  298. Gather.MustRegister(FilerSyncOffsetGauge)
  299. Gather.MustRegister(FilerServerLastSendTsOfSubscribeGauge)
  300. Gather.MustRegister(collectors.NewGoCollector())
  301. Gather.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
  302. Gather.MustRegister(VolumeServerRequestCounter)
  303. Gather.MustRegister(VolumeServerHandlerCounter)
  304. Gather.MustRegister(VolumeServerRequestHistogram)
  305. Gather.MustRegister(VolumeServerInFlightRequestsGauge)
  306. Gather.MustRegister(VolumeServerVacuumingCompactCounter)
  307. Gather.MustRegister(VolumeServerVacuumingCommitCounter)
  308. Gather.MustRegister(VolumeServerVacuumingHistogram)
  309. Gather.MustRegister(VolumeServerVolumeGauge)
  310. Gather.MustRegister(VolumeServerMaxVolumeCounter)
  311. Gather.MustRegister(VolumeServerReadOnlyVolumeGauge)
  312. Gather.MustRegister(VolumeServerDiskSizeGauge)
  313. Gather.MustRegister(VolumeServerResourceGauge)
  314. Gather.MustRegister(S3RequestCounter)
  315. Gather.MustRegister(S3HandlerCounter)
  316. Gather.MustRegister(S3RequestHistogram)
  317. Gather.MustRegister(S3InFlightRequestsGauge)
  318. Gather.MustRegister(S3TimeToFirstByteHistogram)
  319. go bucketMetricTTLControl()
  320. }
  321. func LoopPushingMetric(name, instance, addr string, intervalSeconds int) {
  322. if addr == "" || intervalSeconds == 0 {
  323. return
  324. }
  325. glog.V(0).Infof("%s server sends metrics to %s every %d seconds", name, addr, intervalSeconds)
  326. pusher := push.New(addr, name).Gatherer(Gather).Grouping("instance", instance)
  327. for {
  328. err := pusher.Push()
  329. if err != nil && !strings.HasPrefix(err.Error(), "unexpected status code 200") {
  330. glog.V(0).Infof("could not push metrics to prometheus push gateway %s: %v", addr, err)
  331. }
  332. if intervalSeconds <= 0 {
  333. intervalSeconds = 15
  334. }
  335. time.Sleep(time.Duration(intervalSeconds) * time.Second)
  336. }
  337. }
  // JoinHostPort combines host and port into a "host:port" string.
  //
  // A host that already starts with "[" and ends with "]" is used verbatim and
  // only ":port" is appended. Every other host goes through net.JoinHostPort,
  // which adds brackets itself when the host contains a colon.
  func JoinHostPort(host string, port int) string {
  	if !strings.HasPrefix(host, "[") || !strings.HasSuffix(host, "]") {
  		return net.JoinHostPort(host, strconv.Itoa(port))
  	}
  	// Already bracketed: net.JoinHostPort would wrap it a second time.
  	return host + ":" + strconv.Itoa(port)
  }
  345. func StartMetricsServer(ip string, port int) {
  346. if port == 0 {
  347. return
  348. }
  349. http.Handle("/metrics", promhttp.HandlerFor(Gather, promhttp.HandlerOpts{}))
  350. log.Fatal(http.ListenAndServe(JoinHostPort(ip, port), nil))
  351. }
  // SourceName returns "<hostname>:<port>" built from the OS-reported hostname,
  // or the literal "unknown" when the hostname cannot be determined.
  func SourceName(port uint32) string {
  	if hostname, err := os.Hostname(); err == nil {
  		return net.JoinHostPort(hostname, strconv.Itoa(int(port)))
  	}
  	return "unknown"
  }
  359. func RecordBucketActiveTime(bucket string) {
  360. bucketLastActiveTsNs[bucket] = time.Now().UnixNano()
  361. }
  362. func DeleteCollectionMetrics(collection string) {
  363. labels := prometheus.Labels{"collection": collection}
  364. c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
  365. c += MasterVolumeLayoutWritable.DeletePartialMatch(labels)
  366. c += MasterVolumeLayoutCrowded.DeletePartialMatch(labels)
  367. c += VolumeServerDiskSizeGauge.DeletePartialMatch(labels)
  368. c += VolumeServerVolumeGauge.DeletePartialMatch(labels)
  369. c += VolumeServerReadOnlyVolumeGauge.DeletePartialMatch(labels)
  370. glog.V(0).Infof("delete collection metrics, %s: %d", collection, c)
  371. }
  372. func bucketMetricTTLControl() {
  373. ttlNs := bucketAtiveTTL.Nanoseconds()
  374. for {
  375. now := time.Now().UnixNano()
  376. for bucket, ts := range bucketLastActiveTsNs {
  377. if (now - ts) > ttlNs {
  378. delete(bucketLastActiveTsNs, bucket)
  379. labels := prometheus.Labels{"bucket": bucket}
  380. c := S3RequestCounter.DeletePartialMatch(labels)
  381. c += S3RequestHistogram.DeletePartialMatch(labels)
  382. c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
  383. glog.V(0).Infof("delete inactive bucket metrics, %s: %d", bucket, c)
  384. }
  385. }
  386. time.Sleep(bucketAtiveTTL)
  387. }
  388. }