You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

493 lines
16 KiB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
  1. package stats
  2. import (
  3. "net"
  4. "net/http"
  5. "os"
  6. "strconv"
  7. "strings"
  8. "sync"
  9. "time"
  10. "github.com/prometheus/client_golang/prometheus"
  11. "github.com/prometheus/client_golang/prometheus/collectors"
  12. "github.com/prometheus/client_golang/prometheus/promhttp"
  13. "github.com/prometheus/client_golang/prometheus/push"
  14. "github.com/seaweedfs/seaweedfs/weed/glog"
  15. )
  16. // Readonly volume types
  17. const (
  18. Namespace = "SeaweedFS"
  19. IsReadOnly = "IsReadOnly"
  20. NoWriteOrDelete = "noWriteOrDelete"
  21. NoWriteCanDelete = "noWriteCanDelete"
  22. IsDiskSpaceLow = "isDiskSpaceLow"
  23. bucketAtiveTTL = 10 * time.Minute
  24. )
  25. var readOnlyVolumeTypes = [4]string{IsReadOnly, NoWriteOrDelete, NoWriteCanDelete, IsDiskSpaceLow}
  26. var bucketLastActiveTsNs map[string]int64 = map[string]int64{}
  27. var bucketLastActiveLock sync.Mutex
  28. var (
  29. Gather = prometheus.NewRegistry()
  30. MasterClientConnectCounter = prometheus.NewCounterVec(
  31. prometheus.CounterOpts{
  32. Namespace: Namespace,
  33. Subsystem: "wdclient",
  34. Name: "connect_updates",
  35. Help: "Counter of master client leader updates.",
  36. }, []string{"type"})
  37. MasterRaftIsleader = prometheus.NewGauge(
  38. prometheus.GaugeOpts{
  39. Namespace: Namespace,
  40. Subsystem: "master",
  41. Name: "is_leader",
  42. Help: "is leader",
  43. })
  44. MasterAdminLock = prometheus.NewGaugeVec(
  45. prometheus.GaugeOpts{
  46. Namespace: Namespace,
  47. Subsystem: "master",
  48. Name: "admin_lock",
  49. Help: "admin lock",
  50. }, []string{"client"})
  51. MasterReceivedHeartbeatCounter = prometheus.NewCounterVec(
  52. prometheus.CounterOpts{
  53. Namespace: Namespace,
  54. Subsystem: "master",
  55. Name: "received_heartbeats",
  56. Help: "Counter of master received heartbeat.",
  57. }, []string{"type"})
  58. MasterReplicaPlacementMismatch = prometheus.NewGaugeVec(
  59. prometheus.GaugeOpts{
  60. Namespace: Namespace,
  61. Subsystem: "master",
  62. Name: "replica_placement_mismatch",
  63. Help: "replica placement mismatch",
  64. }, []string{"collection", "id"})
  65. MasterVolumeLayoutWritable = prometheus.NewGaugeVec(
  66. prometheus.GaugeOpts{
  67. Namespace: Namespace,
  68. Subsystem: "master",
  69. Name: "volume_layout_writable",
  70. Help: "Number of writable volumes in volume layouts",
  71. }, []string{"collection", "disk", "rp", "ttl"})
  72. MasterVolumeLayoutCrowded = prometheus.NewGaugeVec(
  73. prometheus.GaugeOpts{
  74. Namespace: Namespace,
  75. Subsystem: "master",
  76. Name: "volume_layout_crowded",
  77. Help: "Number of crowded volumes in volume layouts",
  78. }, []string{"collection", "disk", "rp", "ttl"})
  79. MasterPickForWriteErrorCounter = prometheus.NewCounter(
  80. prometheus.CounterOpts{
  81. Namespace: Namespace,
  82. Subsystem: "master",
  83. Name: "pick_for_write_error",
  84. Help: "Counter of master pick for write error",
  85. })
  86. MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
  87. prometheus.CounterOpts{
  88. Namespace: Namespace,
  89. Subsystem: "master",
  90. Name: "broadcast_to_full",
  91. Help: "Counter of master broadcast send to full message channel err",
  92. })
  93. MasterLeaderChangeCounter = prometheus.NewCounterVec(
  94. prometheus.CounterOpts{
  95. Namespace: Namespace,
  96. Subsystem: "master",
  97. Name: "leader_changes",
  98. Help: "Counter of master leader changes.",
  99. }, []string{"type"})
  100. FilerRequestCounter = prometheus.NewCounterVec(
  101. prometheus.CounterOpts{
  102. Namespace: Namespace,
  103. Subsystem: "filer",
  104. Name: "request_total",
  105. Help: "Counter of filer requests.",
  106. }, []string{"type", "code"})
  107. FilerHandlerCounter = prometheus.NewCounterVec(
  108. prometheus.CounterOpts{
  109. Namespace: Namespace,
  110. Subsystem: "filer",
  111. Name: "handler_total",
  112. Help: "Counter of filer handlers.",
  113. }, []string{"type"})
  114. FilerRequestHistogram = prometheus.NewHistogramVec(
  115. prometheus.HistogramOpts{
  116. Namespace: Namespace,
  117. Subsystem: "filer",
  118. Name: "request_seconds",
  119. Help: "Bucketed histogram of filer request processing time.",
  120. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  121. }, []string{"type"})
  122. FilerInFlightRequestsGauge = prometheus.NewGaugeVec(
  123. prometheus.GaugeOpts{
  124. Namespace: Namespace,
  125. Subsystem: "filer",
  126. Name: "in_flight_requests",
  127. Help: "Current number of in-flight requests being handled by filer.",
  128. }, []string{"type"})
  129. FilerServerLastSendTsOfSubscribeGauge = prometheus.NewGaugeVec(
  130. prometheus.GaugeOpts{
  131. Namespace: Namespace,
  132. Subsystem: "filer",
  133. Name: "last_send_timestamp_of_subscribe",
  134. Help: "The last send timestamp of the filer subscription.",
  135. }, []string{"sourceFiler", "clientName", "path"})
  136. FilerStoreCounter = prometheus.NewCounterVec(
  137. prometheus.CounterOpts{
  138. Namespace: Namespace,
  139. Subsystem: "filerStore",
  140. Name: "request_total",
  141. Help: "Counter of filer store requests.",
  142. }, []string{"store", "type"})
  143. FilerStoreHistogram = prometheus.NewHistogramVec(
  144. prometheus.HistogramOpts{
  145. Namespace: Namespace,
  146. Subsystem: "filerStore",
  147. Name: "request_seconds",
  148. Help: "Bucketed histogram of filer store request processing time.",
  149. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  150. }, []string{"store", "type"})
  151. FilerSyncOffsetGauge = prometheus.NewGaugeVec(
  152. prometheus.GaugeOpts{
  153. Namespace: Namespace,
  154. Subsystem: "filerSync",
  155. Name: "sync_offset",
  156. Help: "The offset of the filer synchronization service.",
  157. }, []string{"sourceFiler", "targetFiler", "clientName", "path"})
  158. VolumeServerRequestCounter = prometheus.NewCounterVec(
  159. prometheus.CounterOpts{
  160. Namespace: Namespace,
  161. Subsystem: "volumeServer",
  162. Name: "request_total",
  163. Help: "Counter of volume server requests.",
  164. }, []string{"type", "code"})
  165. VolumeServerHandlerCounter = prometheus.NewCounterVec(
  166. prometheus.CounterOpts{
  167. Namespace: Namespace,
  168. Subsystem: "volumeServer",
  169. Name: "handler_total",
  170. Help: "Counter of volume server handlers.",
  171. }, []string{"type"})
  172. VolumeServerVacuumingCompactCounter = prometheus.NewCounterVec(
  173. prometheus.CounterOpts{
  174. Namespace: Namespace,
  175. Subsystem: "volumeServer",
  176. Name: "vacuuming_compact_count",
  177. Help: "Counter of volume vacuuming Compact counter",
  178. }, []string{"success"})
  179. VolumeServerVacuumingCommitCounter = prometheus.NewCounterVec(
  180. prometheus.CounterOpts{
  181. Namespace: Namespace,
  182. Subsystem: "volumeServer",
  183. Name: "vacuuming_commit_count",
  184. Help: "Counter of volume vacuuming commit counter",
  185. }, []string{"success"})
  186. VolumeServerVacuumingHistogram = prometheus.NewHistogramVec(
  187. prometheus.HistogramOpts{
  188. Namespace: Namespace,
  189. Subsystem: "volumeServer",
  190. Name: "vacuuming_seconds",
  191. Help: "Bucketed histogram of volume server vacuuming processing time.",
  192. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  193. }, []string{"type"})
  194. VolumeServerRequestHistogram = prometheus.NewHistogramVec(
  195. prometheus.HistogramOpts{
  196. Namespace: Namespace,
  197. Subsystem: "volumeServer",
  198. Name: "request_seconds",
  199. Help: "Bucketed histogram of volume server request processing time.",
  200. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  201. }, []string{"type"})
  202. VolumeServerInFlightRequestsGauge = prometheus.NewGaugeVec(
  203. prometheus.GaugeOpts{
  204. Namespace: Namespace,
  205. Subsystem: "volumeServer",
  206. Name: "in_flight_requests",
  207. Help: "Current number of in-flight requests being handled by volume server.",
  208. }, []string{"type"})
  209. VolumeServerVolumeGauge = prometheus.NewGaugeVec(
  210. prometheus.GaugeOpts{
  211. Namespace: Namespace,
  212. Subsystem: "volumeServer",
  213. Name: "volumes",
  214. Help: "Number of volumes or shards.",
  215. }, []string{"collection", "type"})
  216. VolumeServerReadOnlyVolumeGauge = prometheus.NewGaugeVec(
  217. prometheus.GaugeOpts{
  218. Namespace: Namespace,
  219. Subsystem: "volumeServer",
  220. Name: "read_only_volumes",
  221. Help: "Number of read only volumes.",
  222. }, []string{"collection", "type"})
  223. VolumeServerMaxVolumeCounter = prometheus.NewGauge(
  224. prometheus.GaugeOpts{
  225. Namespace: Namespace,
  226. Subsystem: "volumeServer",
  227. Name: "max_volumes",
  228. Help: "Maximum number of volumes.",
  229. })
  230. VolumeServerDiskSizeGauge = prometheus.NewGaugeVec(
  231. prometheus.GaugeOpts{
  232. Namespace: Namespace,
  233. Subsystem: "volumeServer",
  234. Name: "total_disk_size",
  235. Help: "Actual disk size used by volumes.",
  236. }, []string{"collection", "type"})
  237. VolumeServerResourceGauge = prometheus.NewGaugeVec(
  238. prometheus.GaugeOpts{
  239. Namespace: Namespace,
  240. Subsystem: "volumeServer",
  241. Name: "resource",
  242. Help: "Resource usage",
  243. }, []string{"name", "type"})
  244. S3RequestCounter = prometheus.NewCounterVec(
  245. prometheus.CounterOpts{
  246. Namespace: Namespace,
  247. Subsystem: "s3",
  248. Name: "request_total",
  249. Help: "Counter of s3 requests.",
  250. }, []string{"type", "code", "bucket"})
  251. S3HandlerCounter = prometheus.NewCounterVec(
  252. prometheus.CounterOpts{
  253. Namespace: Namespace,
  254. Subsystem: "s3",
  255. Name: "handler_total",
  256. Help: "Counter of s3 server handlers.",
  257. }, []string{"type"})
  258. S3RequestHistogram = prometheus.NewHistogramVec(
  259. prometheus.HistogramOpts{
  260. Namespace: Namespace,
  261. Subsystem: "s3",
  262. Name: "request_seconds",
  263. Help: "Bucketed histogram of s3 request processing time.",
  264. Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
  265. }, []string{"type", "bucket"})
  266. S3TimeToFirstByteHistogram = prometheus.NewHistogramVec(
  267. prometheus.HistogramOpts{
  268. Namespace: Namespace,
  269. Subsystem: "s3",
  270. Name: "time_to_first_byte_millisecond",
  271. Help: "Bucketed histogram of s3 time to first byte request processing time.",
  272. Buckets: prometheus.ExponentialBuckets(0.001, 2, 27),
  273. }, []string{"type", "bucket"})
  274. S3InFlightRequestsGauge = prometheus.NewGaugeVec(
  275. prometheus.GaugeOpts{
  276. Namespace: Namespace,
  277. Subsystem: "s3",
  278. Name: "in_flight_requests",
  279. Help: "Current number of in-flight requests being handled by s3.",
  280. }, []string{"type"})
  281. S3BucketTrafficReceivedBytesCounter = prometheus.NewCounterVec(
  282. prometheus.CounterOpts{
  283. Namespace: Namespace,
  284. Subsystem: "s3",
  285. Name: "bucket_traffic_received_bytes_total",
  286. Help: "Total number of bytes received by an S3 bucket from clients.",
  287. }, []string{"bucket"})
  288. S3BucketTrafficSentBytesCounter = prometheus.NewCounterVec(
  289. prometheus.CounterOpts{
  290. Namespace: Namespace,
  291. Subsystem: "s3",
  292. Name: "bucket_traffic_sent_bytes_total",
  293. Help: "Total number of bytes sent from an S3 bucket to clients.",
  294. }, []string{"bucket"})
  295. S3DeletedObjectsCounter = prometheus.NewCounterVec(
  296. prometheus.CounterOpts{
  297. Namespace: Namespace,
  298. Subsystem: "s3",
  299. Name: "deleted_objects",
  300. Help: "Number of objects deleted in each bucket.",
  301. }, []string{"bucket"})
  302. S3UploadedObjectsCounter = prometheus.NewCounterVec(
  303. prometheus.CounterOpts{
  304. Namespace: Namespace,
  305. Subsystem: "s3",
  306. Name: "uploaded_objects",
  307. Help: "Number of objects uploaded in each bucket.",
  308. }, []string{"bucket"})
  309. )
  310. func init() {
  311. Gather.MustRegister(MasterClientConnectCounter)
  312. Gather.MustRegister(MasterRaftIsleader)
  313. Gather.MustRegister(MasterAdminLock)
  314. Gather.MustRegister(MasterReceivedHeartbeatCounter)
  315. Gather.MustRegister(MasterLeaderChangeCounter)
  316. Gather.MustRegister(MasterReplicaPlacementMismatch)
  317. Gather.MustRegister(MasterVolumeLayoutWritable)
  318. Gather.MustRegister(MasterVolumeLayoutCrowded)
  319. Gather.MustRegister(MasterPickForWriteErrorCounter)
  320. Gather.MustRegister(MasterBroadcastToFullErrorCounter)
  321. Gather.MustRegister(FilerRequestCounter)
  322. Gather.MustRegister(FilerHandlerCounter)
  323. Gather.MustRegister(FilerRequestHistogram)
  324. Gather.MustRegister(FilerInFlightRequestsGauge)
  325. Gather.MustRegister(FilerStoreCounter)
  326. Gather.MustRegister(FilerStoreHistogram)
  327. Gather.MustRegister(FilerSyncOffsetGauge)
  328. Gather.MustRegister(FilerServerLastSendTsOfSubscribeGauge)
  329. Gather.MustRegister(collectors.NewGoCollector())
  330. Gather.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
  331. Gather.MustRegister(VolumeServerRequestCounter)
  332. Gather.MustRegister(VolumeServerHandlerCounter)
  333. Gather.MustRegister(VolumeServerRequestHistogram)
  334. Gather.MustRegister(VolumeServerInFlightRequestsGauge)
  335. Gather.MustRegister(VolumeServerVacuumingCompactCounter)
  336. Gather.MustRegister(VolumeServerVacuumingCommitCounter)
  337. Gather.MustRegister(VolumeServerVacuumingHistogram)
  338. Gather.MustRegister(VolumeServerVolumeGauge)
  339. Gather.MustRegister(VolumeServerMaxVolumeCounter)
  340. Gather.MustRegister(VolumeServerReadOnlyVolumeGauge)
  341. Gather.MustRegister(VolumeServerDiskSizeGauge)
  342. Gather.MustRegister(VolumeServerResourceGauge)
  343. Gather.MustRegister(S3RequestCounter)
  344. Gather.MustRegister(S3HandlerCounter)
  345. Gather.MustRegister(S3RequestHistogram)
  346. Gather.MustRegister(S3InFlightRequestsGauge)
  347. Gather.MustRegister(S3TimeToFirstByteHistogram)
  348. Gather.MustRegister(S3BucketTrafficReceivedBytesCounter)
  349. Gather.MustRegister(S3BucketTrafficSentBytesCounter)
  350. Gather.MustRegister(S3DeletedObjectsCounter)
  351. Gather.MustRegister(S3UploadedObjectsCounter)
  352. go bucketMetricTTLControl()
  353. }
  354. func LoopPushingMetric(name, instance, addr string, intervalSeconds int) {
  355. if addr == "" || intervalSeconds == 0 {
  356. return
  357. }
  358. glog.V(0).Infof("%s server sends metrics to %s every %d seconds", name, addr, intervalSeconds)
  359. pusher := push.New(addr, name).Gatherer(Gather).Grouping("instance", instance)
  360. for {
  361. err := pusher.Push()
  362. if err != nil && !strings.HasPrefix(err.Error(), "unexpected status code 200") {
  363. glog.V(0).Infof("could not push metrics to prometheus push gateway %s: %v", addr, err)
  364. }
  365. if intervalSeconds <= 0 {
  366. intervalSeconds = 15
  367. }
  368. time.Sleep(time.Duration(intervalSeconds) * time.Second)
  369. }
  370. }
  371. func JoinHostPort(host string, port int) string {
  372. portStr := strconv.Itoa(port)
  373. if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
  374. return host + ":" + portStr
  375. }
  376. return net.JoinHostPort(host, portStr)
  377. }
  378. func StartMetricsServer(ip string, port int) {
  379. if port == 0 {
  380. return
  381. }
  382. http.Handle("/metrics", promhttp.HandlerFor(Gather, promhttp.HandlerOpts{}))
  383. glog.Fatal(http.ListenAndServe(JoinHostPort(ip, port), nil))
  384. }
  385. func SourceName(port uint32) string {
  386. hostname, err := os.Hostname()
  387. if err != nil {
  388. return "unknown"
  389. }
  390. return net.JoinHostPort(hostname, strconv.Itoa(int(port)))
  391. }
  392. func RecordBucketActiveTime(bucket string) {
  393. bucketLastActiveLock.Lock()
  394. bucketLastActiveTsNs[bucket] = time.Now().UnixNano()
  395. bucketLastActiveLock.Unlock()
  396. }
  397. func DeleteCollectionMetrics(collection string) {
  398. labels := prometheus.Labels{"collection": collection}
  399. c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
  400. c += MasterVolumeLayoutWritable.DeletePartialMatch(labels)
  401. c += MasterVolumeLayoutCrowded.DeletePartialMatch(labels)
  402. c += VolumeServerDiskSizeGauge.DeletePartialMatch(labels)
  403. c += VolumeServerVolumeGauge.DeletePartialMatch(labels)
  404. c += VolumeServerReadOnlyVolumeGauge.DeletePartialMatch(labels)
  405. glog.V(0).Infof("delete collection metrics, %s: %d", collection, c)
  406. }
  407. func bucketMetricTTLControl() {
  408. ttlNs := bucketAtiveTTL.Nanoseconds()
  409. for {
  410. now := time.Now().UnixNano()
  411. bucketLastActiveLock.Lock()
  412. for bucket, ts := range bucketLastActiveTsNs {
  413. if (now - ts) > ttlNs {
  414. delete(bucketLastActiveTsNs, bucket)
  415. labels := prometheus.Labels{"bucket": bucket}
  416. c := S3RequestCounter.DeletePartialMatch(labels)
  417. c += S3RequestHistogram.DeletePartialMatch(labels)
  418. c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
  419. c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels)
  420. c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels)
  421. c += S3DeletedObjectsCounter.DeletePartialMatch(labels)
  422. c += S3UploadedObjectsCounter.DeletePartialMatch(labels)
  423. glog.V(0).Infof("delete inactive bucket metrics, %s: %d", bucket, c)
  424. }
  425. }
  426. bucketLastActiveLock.Unlock()
  427. time.Sleep(bucketAtiveTTL)
  428. }
  429. }