From ba855f996237b8b669e51d356529354e8ce54f7e Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Fri, 20 Mar 2026 22:15:05 -0700 Subject: [PATCH] fix(telemetry): use correct TopologyId field in integration test (#8714) * fix(telemetry): use correct TopologyId field in integration test The proto field was renamed from cluster_id to topology_id but the integration test was not updated, causing a compilation error. * ci: add telemetry integration test workflow Runs the telemetry integration test (server startup, protobuf marshaling, client send, metrics/stats/instances API checks) on changes to telemetry/ or weed/telemetry/. * fix(telemetry): improve error message specificity in integration test * fix(ci): pre-build telemetry server binary for integration test go run compiles the server on the fly, which exceeds the 15s startup timeout in CI. Build the binary first so the test starts instantly. * fix(telemetry): fix ClusterId references in server and CI build path - Replace ClusterId with TopologyId in server storage and API handler (same rename as the integration test fix) - Fix CI build: telemetry server has its own go.mod, so build from within its directory * ci(telemetry): add least-privilege permissions to workflow Scope the workflow token to read-only repository contents, matching the convention used in go.yml. * fix(telemetry): set TopologyId in client integration test The client only populates TopologyId when SetTopologyId has been called. The test was missing this call, causing the server to reject the request with 400 (missing required field). * fix(telemetry): delete clusterInfo metric on instance cleanup The cleanup loop removed all per-instance metrics except clusterInfo, leaking that label set after eviction. --- .github/workflows/telemetry-integration.yml | 46 +++++++++++++++++++++ .gitignore | 1 + telemetry/server/api/handlers.go | 2 +- telemetry/server/go.mod | 12 +++--- telemetry/server/go.sum | 25 +++++------ telemetry/server/storage/prometheus.go | 9 ++-- telemetry/test/integration.go | 30 +++++++++----- 7 files changed, 89 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/telemetry-integration.yml diff --git a/.github/workflows/telemetry-integration.yml b/.github/workflows/telemetry-integration.yml new file mode 100644 index 000000000..1540256f1 --- /dev/null +++ b/.github/workflows/telemetry-integration.yml @@ -0,0 +1,46 @@ +name: Telemetry Integration Tests + +on: + push: + branches: [ master ] + paths: + - 'telemetry/**' + - 'weed/telemetry/**' + - '.github/workflows/telemetry-integration.yml' + pull_request: + branches: [ master ] + paths: + - 'telemetry/**' + - 'weed/telemetry/**' + - '.github/workflows/telemetry-integration.yml' + +permissions: + contents: read + +jobs: + telemetry-integration-test: + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + + - name: Build telemetry server + run: cd telemetry/server && go build -o telemetry-server . + + - name: Run telemetry integration test + run: go run telemetry/test/integration.go + + - name: Upload test logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: telemetry-test-logs + path: telemetry-server-test.log + retention-days: 7 diff --git a/.gitignore b/.gitignore index 0ea9a06b0..a3ea87971 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,5 @@ test/s3/iam/.test_env /test/erasure_coding/admin_dockertest/tmp /test/erasure_coding/admin_dockertest/task_logs weed_bin +telemetry/server/telemetry-server .aider* diff --git a/telemetry/server/api/handlers.go b/telemetry/server/api/handlers.go index 0ff00330b..c480a9771 100644 --- a/telemetry/server/api/handlers.go +++ b/telemetry/server/api/handlers.go @@ -54,7 +54,7 @@ func (h *Handler) CollectTelemetry(w http.ResponseWriter, r *http.Request) { } // Validate required fields - if data.ClusterId == "" || data.Version == "" || data.Os == "" { + if data.TopologyId == "" || data.Version == "" || data.Os == "" { http.Error(w, "Missing required fields", http.StatusBadRequest) return } diff --git a/telemetry/server/go.mod b/telemetry/server/go.mod index f555d0bba..01f46902c 100644 --- a/telemetry/server/go.mod +++ b/telemetry/server/go.mod @@ -1,8 +1,6 @@ module github.com/seaweedfs/seaweedfs/telemetry/server -go 1.25 - -toolchain go1.25.0 +go 1.25.0 require ( github.com/prometheus/client_golang v1.23.2 @@ -15,10 +13,10 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect - github.com/prometheus/procfs v0.19.2 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/sys v0.39.0 // indirect + github.com/prometheus/common v0.67.2 // indirect + github.com/prometheus/procfs v0.20.1 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + golang.org/x/sys v0.42.0 // indirect ) replace github.com/seaweedfs/seaweedfs => ../.. diff --git a/telemetry/server/go.sum b/telemetry/server/go.sum index b9e086f24..ee6dedcc2 100644 --- a/telemetry/server/go.sum +++ b/telemetry/server/go.sum @@ -6,8 +6,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -22,24 +22,21 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= -github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= -github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= -github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/prometheus/common v0.67.2 h1:PcBAckGFTIHt2+L3I33uNRTlKTplNzFctXcWhPyAEN8= +github.com/prometheus/common v0.67.2/go.mod h1:63W3KZb1JOKgcjlIr64WW/LvFGAqKPj0atm+knVGEko= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= -golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= -google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= diff --git a/telemetry/server/storage/prometheus.go b/telemetry/server/storage/prometheus.go index 0b911227a..7e62ed8ad 100644 --- a/telemetry/server/storage/prometheus.go +++ b/telemetry/server/storage/prometheus.go @@ -82,7 +82,7 @@ func (s *PrometheusStorage) StoreTelemetry(data *proto.TelemetryData) error { // Update Prometheus metrics labels := prometheus.Labels{ - "cluster_id": data.ClusterId, + "cluster_id": data.TopologyId, "version": data.Version, "os": data.Os, } @@ -94,7 +94,7 @@ func (s *PrometheusStorage) StoreTelemetry(data *proto.TelemetryData) error { s.brokerCount.With(labels).Set(float64(data.BrokerCount)) infoLabels := prometheus.Labels{ - "cluster_id": data.ClusterId, + "cluster_id": data.TopologyId, "version": data.Version, "os": data.Os, } @@ -103,7 +103,7 @@ func (s *PrometheusStorage) StoreTelemetry(data *proto.TelemetryData) error { s.telemetryReceived.Inc() // Store in memory for API endpoints - s.instances[data.ClusterId] = &telemetryData{ + s.instances[data.TopologyId] = &telemetryData{ TelemetryData: data, ReceivedAt: time.Now().UTC(), } @@ -219,7 +219,7 @@ func (s *PrometheusStorage) CleanupOldInstances(maxAge time.Duration) { // Remove from Prometheus metrics labels := prometheus.Labels{ - "cluster_id": instance.TelemetryData.ClusterId, + "cluster_id": instance.TelemetryData.TopologyId, "version": instance.TelemetryData.Version, "os": instance.TelemetryData.Os, } @@ -228,6 +228,7 @@ func (s *PrometheusStorage) CleanupOldInstances(maxAge time.Duration) { s.totalVolumeCount.Delete(labels) s.filerCount.Delete(labels) s.brokerCount.Delete(labels) + s.clusterInfo.Delete(labels) } } diff --git a/telemetry/test/integration.go b/telemetry/test/integration.go index 463806f15..f75a3ae89 100644 --- a/telemetry/test/integration.go +++ b/telemetry/test/integration.go @@ -85,16 +85,25 @@ func startTelemetryServer() (*exec.Cmd, error) { return nil, fmt.Errorf("failed to get working directory: %v", err) } - // Navigate to the server directory (from main seaweedfs directory) - serverDir := filepath.Join(testDir, "telemetry", "server") - - cmd := exec.Command("go", "run", ".", - "-port="+serverPort, + // Use pre-built binary if available (faster in CI), otherwise fall back to go run + args := []string{ + "-port=" + serverPort, "-dashboard=false", "-cleanup=1m", - "-max-age=1h") + "-max-age=1h", + } - cmd.Dir = serverDir + serverBin := filepath.Join(testDir, "telemetry", "server", "telemetry-server") + var cmd *exec.Cmd + if _, err := os.Stat(serverBin); err == nil { + fmt.Printf("Using pre-built binary: %s\n", serverBin) + cmd = exec.Command(serverBin, args...) + } else { + fmt.Println("No pre-built binary found, using go run") + serverDir := filepath.Join(testDir, "telemetry", "server") + cmd = exec.Command("go", append([]string{"run", "."}, args...)...) + cmd.Dir = serverDir + } // Create log files for server output logFile, err := os.Create("telemetry-server-test.log") @@ -174,9 +183,9 @@ func testProtobufMarshaling() error { } // Verify data - if testData2.ClusterId != testData.ClusterId { - return fmt.Errorf("protobuf data mismatch: expected %s, got %s", - testData.ClusterId, testData2.ClusterId) + if testData2.TopologyId != testData.TopologyId { + return fmt.Errorf("TopologyId mismatch: expected %s, got %s", + testData.TopologyId, testData2.TopologyId) } if testData2.VolumeServerCount != testData.VolumeServerCount { @@ -190,6 +199,7 @@ func testProtobufMarshaling() error { func testTelemetryClient() error { // Create telemetry client client := telemetry.NewClient(serverURL+"/api/collect", true) + client.SetTopologyId("test-topology-12345") // Create test data using protobuf format testData := &proto.TelemetryData{