Browse Source

Add Prometheus metric to count upload errors (#8788)

Add Prometheus metric to count upload errors (#8775)

Add SeaweedFS_upload_error_total counter labeled by HTTP status code,
so operators can alert on write/replication failures. Code "0" indicates
a transport error (no HTTP response received).

Also add an "Upload Errors" panel to the Grafana dashboard.
pull/8790/head
Chris Lu 16 hours ago
committed by GitHub
parent
commit
5fa5507234
No known key found for this signature in database. GPG Key ID: B5690EEEBB952194
  1. 156
      k8s/charts/seaweedfs/dashboards/seaweedfs-grafana-dashboard.json
  2. 5
      weed/operation/upload_content.go
  3. 9
      weed/stats/metrics.go

156
k8s/charts/seaweedfs/dashboards/seaweedfs-grafana-dashboard.json

@ -1168,6 +1168,108 @@
"title": "Filer QPS",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short",
"unitScale": true
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 28
},
"id": 91,
"links": [],
"options": {
"legend": {
"calcs": [
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"width": 250
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "10.3.1",
"targets": [
{
"exemplar": true,
"expr": "sum by (code) (rate(SeaweedFS_upload_error_total{namespace=\"$NAMESPACE\"}[$__rate_interval]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{code}}",
"refId": "A",
"step": 30
}
],
"title": "Upload Errors",
"type": "timeseries"
},
{
"collapsed": false,
"datasource": {
@ -1178,7 +1280,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 28
"y": 35
},
"id": 61,
"panels": [],
@ -1251,7 +1353,7 @@
"h": 7,
"w": 8,
"x": 0,
"y": 29
"y": 36
},
"id": 65,
"links": [],
@ -1357,7 +1459,7 @@
"h": 7,
"w": 8,
"x": 8,
"y": 29
"y": 36
},
"id": 56,
"links": [],
@ -1463,7 +1565,7 @@
"h": 7,
"w": 8,
"x": 16,
"y": 29
"y": 36
},
"id": 58,
"links": [],
@ -1520,7 +1622,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 36
"y": 43
},
"id": 84,
"links": [],
@ -1565,7 +1667,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 36
"y": 43
},
"id": 85,
"links": [],
@ -1661,7 +1763,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 41
"y": 48
},
"id": 86,
"options": {
@ -1756,7 +1858,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 41
"y": 48
},
"id": 72,
"links": [],
@ -1874,7 +1976,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 50
"y": 57
},
"id": 73,
"links": [],
@ -2030,7 +2132,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 57
"y": 64
},
"id": 55,
"links": [],
@ -2187,7 +2289,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 64
"y": 71
},
"hideTimeOverride": false,
"id": 59,
@ -2259,7 +2361,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 71
"y": 78
},
"id": 62,
"panels": [],
@ -2331,7 +2433,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 72
"y": 79
},
"id": 47,
"links": [],
@ -2474,7 +2576,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 72
"y": 79
},
"id": 40,
"links": [],
@ -2571,7 +2673,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 79
"y": 86
},
"id": 48,
"links": [],
@ -2681,7 +2783,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 86
"y": 93
},
"id": 50,
"links": [],
@ -2783,7 +2885,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 93
"y": 100
},
"id": 51,
"links": [],
@ -2823,7 +2925,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 93
"y": 100
},
"id": 63,
"panels": [],
@ -2896,7 +2998,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 101
"y": 108
},
"id": 12,
"links": [],
@ -2991,7 +3093,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 101
"y": 108
},
"id": 14,
"links": [],
@ -3033,7 +3135,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 108
"y": 115
},
"id": 64,
"panels": [],
@ -3106,7 +3208,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 109
"y": 116
},
"id": 52,
"links": [],
@ -3234,7 +3336,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 109
"y": 116
},
"id": 54,
"links": [],
@ -3331,7 +3433,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 116
"y": 123
},
"id": 53,
"links": [],
@ -3426,7 +3528,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 48
"y": 55
},
"id": 89,
"options": {
@ -3533,7 +3635,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 48
"y": 55
},
"id": 90,
"options": {
@ -3654,4 +3756,4 @@
"uid": "a24009d7-cbda-4443-a132-1cc1c4677304",
"version": 1,
"weekStart": ""
}
}

5
weed/operation/upload_content.go

@ -11,6 +11,7 @@ import (
"net/http"
"net/textproto"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
@ -415,6 +416,7 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
}
}
if post_err != nil {
stats.UploadErrorCounter.WithLabelValues("0").Inc()
return nil, fmt.Errorf("upload %s %d bytes to %v: %v", option.Filename, originalDataSize, option.UploadUrl, post_err)
}
// print("-")
@ -428,15 +430,18 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
resp_body, ra_err := io.ReadAll(resp.Body)
if ra_err != nil {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
return nil, fmt.Errorf("read response body %v: %w", option.UploadUrl, ra_err)
}
unmarshal_err := json.Unmarshal(resp_body, &ret)
if unmarshal_err != nil {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
glog.ErrorfCtx(ctx, "unmarshal %s: %v", option.UploadUrl, string(resp_body))
return nil, fmt.Errorf("unmarshal %v: %w", option.UploadUrl, unmarshal_err)
}
if ret.Error != "" {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
return nil, fmt.Errorf("unmarshalled error %v: %v", option.UploadUrl, ret.Error)
}
ret.ETag = etag

9
weed/stats/metrics.go

@ -458,6 +458,13 @@ var (
Name: "bucket_object_count",
Help: "Current number of objects in each S3 bucket (logical count, deduplicated across replicas).",
}, []string{"bucket"})
UploadErrorCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Name: "upload_error_total",
Help: "Counter of upload errors by HTTP status code. Code 0 means transport error (no response received).",
}, []string{"code"})
)
func init() {
@ -519,6 +526,8 @@ func init() {
Gather.MustRegister(S3BucketPhysicalSizeBytesGauge)
Gather.MustRegister(S3BucketObjectCountGauge)
Gather.MustRegister(UploadErrorCounter)
go bucketMetricTTLControl()
}

Loading…
Cancel
Save