From 0a5c5ed4ce8ad55dac79fbe2f028ce6de974824b Mon Sep 17 00:00:00 2001
From: Chris Lu
Date: Tue, 10 Mar 2026 19:00:40 -0700
Subject: [PATCH] Persist S3 bucket counter metrics across idle periods (#8595)

* Stop deleting counter metrics during bucket TTL cleanup

Counter metrics (traffic bytes, request counts, object counts) are
monotonically increasing by design. Deleting them after 10 minutes of
bucket inactivity causes them to vanish from /metrics output and reset
to zero when traffic resumes, breaking Prometheus rate()/increase()
queries and making historical traffic reporting impossible.

Only delete gauges and histograms in the TTL cleanup loop, as these
represent current state and are safely re-populated on next activity.

Fixes https://github.com/seaweedfs/seaweedfs/issues/8521

* Clean up all bucket metrics on bucket deletion

Add DeleteBucketMetrics() to delete all metrics (including counters)
for a bucket when it is explicitly deleted. This prevents unbounded
label cardinality from accumulating for buckets that no longer exist.
Called from DeleteBucketHandler after successful bucket deletion.

* Reduce mutex scope in bucket metrics TTL sweep

Collect expired bucket names under the lock, then release before
calling DeletePartialMatch on Prometheus metrics. This prevents
RecordBucketActiveTime from blocking during the expensive cleanup.
--- weed/s3api/s3api_bucket_handlers.go | 4 +++- weed/stats/metrics.go | 30 +++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/weed/s3api/s3api_bucket_handlers.go b/weed/s3api/s3api_bucket_handlers.go index c4cfd1bd9..249933035 100644 --- a/weed/s3api/s3api_bucket_handlers.go +++ b/weed/s3api/s3api_bucket_handlers.go @@ -20,6 +20,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/filer" "github.com/seaweedfs/seaweedfs/weed/s3api/s3_constants" + stats_collect "github.com/seaweedfs/seaweedfs/weed/stats" "github.com/seaweedfs/seaweedfs/weed/storage/needle" "github.com/seaweedfs/seaweedfs/weed/storage/super_block" @@ -396,8 +397,9 @@ func (s3a *S3ApiServer) DeleteBucketHandler(w http.ResponseWriter, r *http.Reque return } - // Clean up bucket-related caches and locks after successful deletion + // Clean up bucket-related caches, locks, and metrics after successful deletion s3a.invalidateBucketConfigCache(bucket) + stats_collect.DeleteBucketMetrics(bucket) s3err.WriteEmptyResponse(w, r, http.StatusNoContent) } diff --git a/weed/stats/metrics.go b/weed/stats/metrics.go index 9c3b902a3..d07b53d76 100644 --- a/weed/stats/metrics.go +++ b/weed/stats/metrics.go @@ -573,6 +573,26 @@ func RecordBucketActiveTime(bucket string) { bucketLastActiveLock.Unlock() } +func DeleteBucketMetrics(bucket string) { + bucketLastActiveLock.Lock() + delete(bucketLastActiveTsNs, bucket) + bucketLastActiveLock.Unlock() + + labels := prometheus.Labels{"bucket": bucket} + c := S3RequestCounter.DeletePartialMatch(labels) + c += S3RequestHistogram.DeletePartialMatch(labels) + c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels) + c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels) + c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels) + c += S3DeletedObjectsCounter.DeletePartialMatch(labels) + c += S3UploadedObjectsCounter.DeletePartialMatch(labels) + c += S3BucketSizeBytesGauge.DeletePartialMatch(labels) + c += 
S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels) + c += S3BucketObjectCountGauge.DeletePartialMatch(labels) + + glog.V(0).Infof("delete bucket metrics, %s: %d", bucket, c) +} + func DeleteCollectionMetrics(collection string) { labels := prometheus.Labels{"collection": collection} c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels) @@ -605,13 +625,11 @@ func bucketMetricTTLControl() { for _, bucket := range expiredBuckets { labels := prometheus.Labels{"bucket": bucket} - c := S3RequestCounter.DeletePartialMatch(labels) - c += S3RequestHistogram.DeletePartialMatch(labels) + // Only delete gauges and histograms, which represent current state. + // Counters (traffic, requests, objects) must persist for the process + // lifetime so that Prometheus rate()/increase() queries work correctly. + c := S3RequestHistogram.DeletePartialMatch(labels) c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels) - c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels) - c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels) - c += S3DeletedObjectsCounter.DeletePartialMatch(labels) - c += S3UploadedObjectsCounter.DeletePartialMatch(labels) c += S3BucketSizeBytesGauge.DeletePartialMatch(labels) c += S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels) c += S3BucketObjectCountGauge.DeletePartialMatch(labels)