From 6a1b9ce8cdfe0556824fbefb27a787f992534ac6 Mon Sep 17 00:00:00 2001 From: Lisandro Pin Date: Thu, 18 Dec 2025 01:40:27 +0100 Subject: [PATCH] Give `cluster.status` detailed file metrics for regular volumes (#7791) * Implement a `weed shell` command to return a status overview of the cluster. Detailed file information will be implemented in a follow-up MR. Note also that masters are currently not reporting back EC shard sizes correctly, via `master_pb.VolumeEcShardInformationMessage.shard_sizes`. F.ex: ``` > status cluster: id: topo status: LOCKED nodes: 10 topology: 1 DC(s)s, 1 disk(s) on 1 rack(s) volumes: total: 3 volumes on 1 collections max size: 31457280000 bytes regular: 2/80 volumes on 6 replicas, 6 writable (100.00%), 0 read-only (0.00%) EC: 1 EC volumes on 14 shards (14.00 shards/volume) storage: total: 186024424 bytes regular volumes: 186024424 bytes EC volumes: 0 bytes raw: 558073152 bytes on volume replicas, 0 bytes on EC shard files ``` * Humanize output for `weed.server` by default. Makes things more readable :) ``` > cluster.status cluster: id: topo status: LOCKED nodes: 10 topology: 1 DC, 10 disks on 1 rack volumes: total: 3 volumes, 1 collection max size: 32 GB regular: 2/80 volumes on 6 replicas, 6 writable (100%), 0 read-only (0%) EC: 1 EC volume on 14 shards (14 shards/volume) storage: total: 172 MB regular volumes: 172 MB EC volumes: 0 B raw: 516 MB on volume replicas, 0 B on EC shards ``` ``` > cluster.status --humanize=false cluster: id: topo status: LOCKED nodes: 10 topology: 1 DC(s), 10 disk(s) on 1 rack(s) volumes: total: 3 volume(s), 1 collection(s) max size: 31457280000 byte(s) regular: 2/80 volume(s) on 6 replica(s), 5 writable (83.33%), 1 read-only (16.67%) EC: 1 EC volume(s) on 14 shard(s) (14.00 shards/volume) storage: total: 172128072 byte(s) regular volumes: 172128072 byte(s) EC volumes: 0 byte(s) raw: 516384216 byte(s) on volume replicas, 0 byte(s) on EC shards ``` Also adds unit tests, and reshuffles test files handling for clarity. * `cluster.status`: Add detailed file metrics for regular volumes. --- weed/shell/command_cluster_status.go | 188 ++++++++++++++++++++-- weed/shell/command_cluster_status_test.go | 75 +++++++++ 2 files changed, 249 insertions(+), 14 deletions(-) diff --git a/weed/shell/command_cluster_status.go b/weed/shell/command_cluster_status.go index 01c45d5a9..746ca4863 100644 --- a/weed/shell/command_cluster_status.go +++ b/weed/shell/command_cluster_status.go @@ -1,13 +1,19 @@ package shell import ( + "context" "flag" "fmt" + "math" "strings" + "sync" "github.com/dustin/go-humanize" "github.com/dustin/go-humanize/english" + "github.com/seaweedfs/seaweedfs/weed/operation" + "github.com/seaweedfs/seaweedfs/weed/pb" "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" "github.com/seaweedfs/seaweedfs/weed/storage/needle" @@ -18,15 +24,29 @@ func init() { Commands = append(Commands, &commandClusterStatus{}) } +// Map of volume_id -> [volume replicas] with stat details. 
+type VolumeReplicaStats struct { + Id string + VolumeId uint32 + + Files uint64 + FilesDeleted uint64 + TotalSize uint64 +} +type RegularVolumeStats map[uint32][]*VolumeReplicaStats + type commandClusterStatus struct{} type ClusterStatusPrinter struct { - writer io.Writer - humanize bool - - locked bool - collections []string - topology *master_pb.TopologyInfo - volumeSizeLimitMb uint64 + writer io.Writer + writerMu sync.Mutex + humanize bool + maxParallelization int + + locked bool + collections []string + topology *master_pb.TopologyInfo + volumeSizeLimitMb uint64 + regularVolumeStats RegularVolumeStats } func (c *commandClusterStatus) Name() string { @@ -44,6 +64,8 @@ func (c *commandClusterStatus) HasTag(CommandTag) bool { func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) { flags := flag.NewFlagSet(c.Name(), flag.ContinueOnError) humanize := flags.Bool("humanize", true, "human-readable output") + includeFiles := flags.Bool("files", false, "include detailed file metrics, from all volume servers") + maxParallelization := flags.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible") if err = flags.Parse(args); err != nil { return err @@ -59,14 +81,21 @@ func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer } sp := &ClusterStatusPrinter{ - writer: writer, - humanize: *humanize, + writer: writer, + humanize: *humanize, + maxParallelization: *maxParallelization, locked: commandEnv.isLocked(), collections: collections, topology: topology, volumeSizeLimitMb: volumeSizeLimitMb, } + if *includeFiles { + if err := sp.loadFileStats(commandEnv); err != nil { + return err + } + } + sp.Print() return nil @@ -83,11 +112,19 @@ func (sp *ClusterStatusPrinter) int(n int) string { return sp.uint64(uint64(n)) } -func (sp *ClusterStatusPrinter) plural(n int, str string) string { +func (sp *ClusterStatusPrinter) uint64Plural(n uint64, str string) string { if !sp.humanize { return fmt.Sprintf("%s(s)", str) } - return english.PluralWord(n, str, "") + uin := math.MaxInt + if n < math.MaxInt { + uin = int(n) + } + return english.PluralWord(int(uin), str, "") +} + +func (sp *ClusterStatusPrinter) plural(n int, str string) string { + return sp.uint64Plural(uint64(n), str) } func (sp *ClusterStatusPrinter) bytes(b uint64) string { @@ -128,16 +165,90 @@ func (sp *ClusterStatusPrinter) intPct(a, b int) string { } func (sp *ClusterStatusPrinter) write(format string, a ...any) { - fmt.Fprintf(sp.writer, strings.TrimRight(format, "\r\n "), a...) - fmt.Fprint(sp.writer, "\n") + sp.writerMu.Lock() + defer sp.writerMu.Unlock() + + format = strings.TrimRight(format, " ") + if len(format) == 0 { + format = "\n" + } + fmt.Fprintf(sp.writer, format, a...) 
+ + last := format[len(format)-1:] + if last != "\n" && last != "\r" { + fmt.Fprint(sp.writer, "\n") + } } -// TODO: add option to collect detailed file stats func (sp *ClusterStatusPrinter) Print() { sp.write("") sp.printClusterInfo() sp.printVolumeInfo() sp.printStorageInfo() + sp.printFilesInfo() +} + +// TODO: collect stats for EC volumes as well +func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error { + sp.regularVolumeStats = RegularVolumeStats{} + + var mu sync.Mutex + var progressTotal, progressDone uint64 + ewg := NewErrorWaitGroup(sp.maxParallelization) + + for _, dci := range sp.topology.DataCenterInfos { + for _, ri := range dci.RackInfos { + for _, dni := range ri.DataNodeInfos { + for _, d := range dni.DiskInfos { + mu.Lock() + progressTotal += uint64(len(d.VolumeInfos)) + mu.Unlock() + for _, v := range d.VolumeInfos { + ewg.Add(func() error { + // Collect regular volume stats + err := operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error { + resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{ + VolumeId: uint32(v.Id), + }) + if reqErr != nil { + return reqErr + } + + mu.Lock() + defer mu.Unlock() + if resp != nil { + if _, ok := sp.regularVolumeStats[v.Id]; !ok { + sp.regularVolumeStats[v.Id] = []*VolumeReplicaStats{} + } + sp.regularVolumeStats[v.Id] = append(sp.regularVolumeStats[v.Id], &VolumeReplicaStats{ + Id: dni.Id, + VolumeId: v.Id, + Files: resp.FileCount, + FilesDeleted: resp.FileDeletedCount, + TotalSize: resp.VolumeSize, + }) + } + progressDone++ + return nil + }) + if err != nil { + return err + } + + mu.Lock() + sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal)) + mu.Unlock() + return nil + }) + } + } + } + } + } + + err := ewg.Wait() + sp.write("") + return err } func (sp *ClusterStatusPrinter) printClusterInfo() { @@ -277,3 +388,52 @@ func (sp *ClusterStatusPrinter) printStorageInfo() { sp.write("\traw: %s on volume replicas, %s on EC shards", sp.bytes(rawVolumeSize), sp.bytes(rawEcVolumeSize)) sp.write("") } + +func (sp *ClusterStatusPrinter) printFilesInfo() { + if len(sp.regularVolumeStats) == 0 { + return + } + + var regularFilesTotal, regularFilesDeleted, regularFilesSize uint64 + var regularFilesTotalRaw, regularFilesDeletedRaw, regularFilesSizeRaw uint64 + + for _, replicaStats := range sp.regularVolumeStats { + rc := uint64(len(replicaStats)) + + var volumeFilesTotal, volumeFilesSize, volumeFilesDeleted uint64 + for _, rs := range replicaStats { + regularFilesTotalRaw += rs.Files + regularFilesSizeRaw += rs.TotalSize + regularFilesDeletedRaw += rs.FilesDeleted + + volumeFilesTotal += rs.Files + volumeFilesSize += rs.TotalSize + volumeFilesDeleted += rs.FilesDeleted + } + regularFilesTotal += (volumeFilesTotal / rc) + regularFilesSize += (volumeFilesSize / rc) + regularFilesDeleted += (volumeFilesDeleted / rc) + } + + regularFiles := regularFilesTotal - regularFilesDeleted + regularFilesRaw := regularFilesTotalRaw - regularFilesDeletedRaw + var avgFileSize uint64 + if regularFilesTotal != 0 { + avgFileSize = regularFilesSize / regularFilesTotal + } + + sp.write("files:") + sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file", + sp.uint64(regularFilesTotal), sp.uint64Plural(regularFilesTotal, "file"), + sp.uint64(regularFiles), sp.uint64Pct(regularFiles, 
regularFilesTotal), + sp.uint64(regularFilesDeleted), sp.uint64Pct(regularFilesDeleted, regularFilesTotal), + sp.bytes(avgFileSize)) + sp.write("\tregular raw: %s %s, %s readable (%s), %s deleted (%s), %s total", + sp.uint64(regularFilesTotalRaw), sp.uint64Plural(regularFilesTotalRaw, "file"), + sp.uint64(regularFilesRaw), sp.uint64Pct(regularFilesRaw, regularFilesTotalRaw), + sp.uint64(regularFilesDeletedRaw), sp.uint64Pct(regularFilesDeletedRaw, regularFilesTotalRaw), + sp.bytes(regularFilesSizeRaw)) + sp.write("\tEC: [no data]") + sp.write("\tEC raw: [no data]") + sp.write("") +} diff --git a/weed/shell/command_cluster_status_test.go b/weed/shell/command_cluster_status_test.go index 92de049c6..6528c026f 100644 --- a/weed/shell/command_cluster_status_test.go +++ b/weed/shell/command_cluster_status_test.go @@ -138,3 +138,78 @@ func TestPrintStorageInfo(t *testing.T) { } } } + +func TestPrintFilesInfo(t *testing.T) { + testCases := []struct { + regularVolumeStats RegularVolumeStats + humanize bool + want string + }{ + { + regularVolumeStats: RegularVolumeStats{ + 1: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704}, + &VolumeReplicaStats{Id: "10.200.17.13:9002", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704}, + &VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704}, + }, + 2: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9003", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632}, + &VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632}, + &VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632}, + }, + 3: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872}, + &VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872}, + &VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872}, + }, + }, + humanize: false, + want: `files: + regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file + regular raw: 1500 file(s), 1413 readable (94.20%), 87 deleted (5.80%), 795585624 byte(s) total + EC: [no data] + EC raw: [no data] + +`, + }, + { + regularVolumeStats: RegularVolumeStats{ + 1: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475}, + &VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475}, + }, + 2: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070}, + &VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070}, + }, + 3: []*VolumeReplicaStats{ + &VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530}, + &VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530}, + }, + }, + humanize: true, + want: `files: + regular: 600 files, 551 readable (91.83%), 49 deleted (8.16%), avg 488 kB per file + regular raw: 1,200 files, 1,102 readable (91.83%), 98 deleted (8.16%), 586 MB total + EC: [no data] + EC raw: [no data] + +`, + }, + } + + for i, tc := 
range testCases { + var buf bytes.Buffer + sp := &ClusterStatusPrinter{ + writer: &buf, + humanize: tc.humanize, + regularVolumeStats: tc.regularVolumeStats, + } + sp.printFilesInfo() + got := buf.String() + + if got != tc.want { + t.Errorf("#%d: got %v, want %v", i, got, tc.want) + } + } +}
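
Reviewer note: the logical vs. raw split in `printFilesInfo` comes down to one aggregation rule: raw counters sum every replica, while logical counters divide each volume's per-replica sums by its replica count, de-duplicating files that exist once per replica. Below is a minimal standalone sketch of that rule using a hypothetical fixture; the type and values are illustrative only, not code from this patch.

```go
package main

import "fmt"

// replicaFileStats mirrors the file counters gathered per replica by loadFileStats
// (hypothetical standalone type, not the VolumeReplicaStats from the patch).
type replicaFileStats struct {
	Files, FilesDeleted uint64
}

func main() {
	// Hypothetical fixture: two volumes, each with three replicas reporting
	// identical counters (as healthy replicas normally would).
	volumes := map[uint32][]replicaFileStats{
		1: {
			{Files: 159, FilesDeleted: 8},
			{Files: 159, FilesDeleted: 8},
			{Files: 159, FilesDeleted: 8},
		},
		2: {
			{Files: 192, FilesDeleted: 21},
			{Files: 192, FilesDeleted: 21},
			{Files: 192, FilesDeleted: 21},
		},
	}

	var logicalFiles, logicalDeleted, rawFiles, rawDeleted uint64
	for _, replicas := range volumes {
		rc := uint64(len(replicas))
		var files, deleted uint64
		for _, rs := range replicas {
			rawFiles += rs.Files // raw counters sum every replica
			rawDeleted += rs.FilesDeleted
			files += rs.Files
			deleted += rs.FilesDeleted
		}
		// Logical counters divide each volume's sums by its replica count,
		// counting each file once regardless of replication.
		logicalFiles += files / rc
		logicalDeleted += deleted / rc
	}

	fmt.Printf("logical: %d files (%d deleted), raw: %d files (%d deleted)\n",
		logicalFiles, logicalDeleted, rawFiles, rawDeleted)
	// Output: logical: 351 files (29 deleted), raw: 1053 files (87 deleted)
}
```

Note that the integer division floors the per-volume average when replica counters disagree, so the logical totals stay conservative.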