From f400fb44a06ceef897237a2b402a8bff03500f26 Mon Sep 17 00:00:00 2001
From: Lisandro Pin
Date: Tue, 10 Feb 2026 02:52:43 +0100
Subject: [PATCH] Update `cluster.status` to resolve file details on EC
 volumes. (#8268)

Also parallelizes file metric collection queries when the `--files` flag is
specified, and improves the readability of the command's output:

```
> cluster.status --files
collecting file stats: 100%

cluster:
	id: topo
	status: LOCKED
	nodes: 10

topology:
	1 DC, 10 disks on 1 rack

volumes:
	total: 3 volumes, 1 collection
	max size: 32 GB
	regular: 1/80 volume on 3 replicas, 3 writable (100%), 0 read-only (0%)
	EC: 2 EC volumes on 28 shards (14 shards/volume)

storage:
	total: 269 MB (522 MB raw, 193.95%)
	regular volumes: 91 MB (272 MB raw, 300%)
	EC volumes: 178 MB (250 MB raw, 140%)

files:
	total: 363 files, 300 readable (82.64%), 63 deleted (17.35%), avg 522 kB per file
	regular: 168 files, 105 readable (62.5%), 63 deleted (37.5%), avg 540 kB per file
	EC: 195 files, 195 readable (100%), 0 deleted (0%), avg 506 kB per file
```
---
 weed/shell/command_cluster_status.go      | 170 ++++++++++++++++------
 weed/shell/command_cluster_status_test.go |  54 ++++---
 2 files changed, 157 insertions(+), 67 deletions(-)

diff --git a/weed/shell/command_cluster_status.go b/weed/shell/command_cluster_status.go
index 7a31c34e5..9bf7b1591 100644
--- a/weed/shell/command_cluster_status.go
+++ b/weed/shell/command_cluster_status.go
@@ -33,7 +33,17 @@ type VolumeReplicaStats struct {
 	FilesDeleted uint64
 	TotalSize    uint64
 }
-type RegularVolumeStats map[uint32][]*VolumeReplicaStats
+type RegularVolumesStats map[uint32][]*VolumeReplicaStats
+
+type EcVolumeStats struct {
+	VolumeId uint32
+
+	Files        uint64
+	FilesDeleted uint64
+	TotalSize    uint64
+}
+// Map of ec_volume_id -> stat details.
+type EcVolumesStats map[uint32]*EcVolumeStats
 
 type commandClusterStatus struct{}
 
 type ClusterStatusPrinter struct {
@@ -42,11 +52,12 @@ type ClusterStatusPrinter struct {
 	humanize           bool
 	maxParallelization int
 
-	locked             bool
-	collections        []string
-	topology           *master_pb.TopologyInfo
-	volumeSizeLimitMb  uint64
-	regularVolumeStats RegularVolumeStats
+	locked              bool
+	collections         []string
+	topology            *master_pb.TopologyInfo
+	volumeSizeLimitMb   uint64
+	regularVolumesStats RegularVolumesStats
+	ecVolumesStats      EcVolumesStats
 }
 
 func (c *commandClusterStatus) Name() string {
@@ -137,7 +148,12 @@ func (sp *ClusterStatusPrinter) bytes(b uint64) string {
 func (sp *ClusterStatusPrinter) uint64Ratio(a, b uint64) string {
 	var p float64
 	if b != 0 {
-		p = float64(a) / float64(b)
+		if a%b == 0 {
+			// Avoid float precision issues on integer ratios.
+			p = float64(a / b)
+		} else {
+			p = float64(a) / float64(b)
+		}
 	}
 	if !sp.humanize {
 		return fmt.Sprintf("%.02f", p)
@@ -151,8 +167,14 @@ func (sp *ClusterStatusPrinter) intRatio(a, b int) string {
 
 func (sp *ClusterStatusPrinter) uint64Pct(a, b uint64) string {
 	var p float64
+
 	if b != 0 {
-		p = 100 * float64(a) / float64(b)
+		if a%b == 0 {
+			// Avoid float precision issues on integer ratios.
+			p = float64(a / b * 100)
+		} else {
+			p = 100 * float64(a) / float64(b)
+		}
 	}
 	if !sp.humanize {
 		return fmt.Sprintf("%.02f%%", p)
@@ -188,21 +210,31 @@ func (sp *ClusterStatusPrinter) Print() {
 	sp.printFilesInfo()
 }
 
-// TODO: collect stats for EC volumes as well
 func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
-	sp.regularVolumeStats = RegularVolumeStats{}
+	sp.regularVolumesStats = RegularVolumesStats{}
+	sp.ecVolumesStats = EcVolumesStats{}
 
 	var mu sync.Mutex
 	var progressTotal, progressDone uint64
 	ewg := NewErrorWaitGroup(sp.maxParallelization)
 
+	updateProgress := func() {
+		mu.Lock()
+		defer mu.Unlock()
+
+		progressDone++
+		sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal))
+	}
+
 	for _, dci := range sp.topology.DataCenterInfos {
 		for _, ri := range dci.RackInfos {
 			for _, dni := range ri.DataNodeInfos {
 				for _, d := range dni.DiskInfos {
 					mu.Lock()
 					progressTotal += uint64(len(d.VolumeInfos))
+					progressTotal += uint64(len(d.EcShardInfos))
 					mu.Unlock()
+
 					for _, v := range d.VolumeInfos {
 						ewg.Add(func() error {
 							// Collect regular volume stats
@@ -217,10 +249,10 @@ func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
 							mu.Lock()
 							defer mu.Unlock()
 							if resp != nil {
-								if _, ok := sp.regularVolumeStats[v.Id]; !ok {
-									sp.regularVolumeStats[v.Id] = []*VolumeReplicaStats{}
+								if _, ok := sp.regularVolumesStats[v.Id]; !ok {
+									sp.regularVolumesStats[v.Id] = []*VolumeReplicaStats{}
 								}
-								sp.regularVolumeStats[v.Id] = append(sp.regularVolumeStats[v.Id], &VolumeReplicaStats{
+								sp.regularVolumesStats[v.Id] = append(sp.regularVolumesStats[v.Id], &VolumeReplicaStats{
 									Id:           dni.Id,
 									VolumeId:     v.Id,
 									Files:        resp.FileCount,
@@ -228,17 +260,51 @@ func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
 									TotalSize:    resp.VolumeSize,
 								})
 							}
-							progressDone++
 							return nil
 						})
-						if err != nil {
-							return err
-						}
+
+						updateProgress()
+						return err
+					})
+				}
+
+				for _, eci := range d.EcShardInfos {
+					ewg.Add(func() error {
+						// Collect EC shard stats
+
+						var err error
 						mu.Lock()
-						sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal))
+						_, ok := sp.ecVolumesStats[eci.Id]
 						mu.Unlock()
-						return nil
+						if ok {
+							// This EC volume has already been processed, likely on a different node.
+							return nil
+						}
+
+						err = operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+							resp, reqErr := volumeServerClient.VolumeEcShardsInfo(context.Background(), &volume_server_pb.VolumeEcShardsInfoRequest{
+								VolumeId: uint32(eci.Id),
+							})
+							if reqErr != nil {
+								return reqErr
+							}
+
+							mu.Lock()
+							defer mu.Unlock()
+							if resp != nil {
+								sp.ecVolumesStats[eci.Id] = &EcVolumeStats{
+									VolumeId:     eci.Id,
+									Files:        resp.FileCount,
+									FilesDeleted: resp.FileDeletedCount,
+									TotalSize:    resp.VolumeSize,
+								}
+							}
+							return nil
+						})
+
+						updateProgress()
+						return err
 					})
 				}
 			}
@@ -380,32 +446,27 @@ func (sp *ClusterStatusPrinter) printStorageInfo() {
 		ecVolumeSize += s
 	}
 	totalSize := volumeSize + ecVolumeSize
+	totalRawSize := rawVolumeSize + rawEcVolumeSize
 
 	sp.write("storage:")
-	sp.write("\ttotal: %s", sp.bytes(totalSize))
-	sp.write("\tregular volumes: %s", sp.bytes(volumeSize))
-	sp.write("\tEC volumes: %s", sp.bytes(ecVolumeSize))
-	sp.write("\traw: %s on volume replicas, %s on EC shards", sp.bytes(rawVolumeSize), sp.bytes(rawEcVolumeSize))
+	sp.write("\ttotal: %s (%s raw, %s)", sp.bytes(totalSize), sp.bytes(totalRawSize), sp.uint64Pct(totalRawSize, totalSize))
+	sp.write("\tregular volumes: %s (%s raw, %s)", sp.bytes(volumeSize), sp.bytes(rawVolumeSize), sp.uint64Pct(rawVolumeSize, volumeSize))
+	sp.write("\tEC volumes: %s (%s raw, %s)", sp.bytes(ecVolumeSize), sp.bytes(rawEcVolumeSize), sp.uint64Pct(rawEcVolumeSize, ecVolumeSize))
 	sp.write("")
 }
 
 func (sp *ClusterStatusPrinter) printFilesInfo() {
-	if len(sp.regularVolumeStats) == 0 {
+	if len(sp.regularVolumesStats) == 0 && len(sp.ecVolumesStats) == 0 {
 		return
 	}
 
 	var regularFilesTotal, regularFilesDeleted, regularFilesSize uint64
-	var regularFilesTotalRaw, regularFilesDeletedRaw, regularFilesSizeRaw uint64
-	for _, replicaStats := range sp.regularVolumeStats {
+	for _, replicaStats := range sp.regularVolumesStats {
 		rc := uint64(len(replicaStats))
 		var volumeFilesTotal, volumeFilesSize, volumeFilesDeleted uint64
 		for _, rs := range replicaStats {
-			regularFilesTotalRaw += rs.Files
-			regularFilesSizeRaw += rs.TotalSize
-			regularFilesDeletedRaw += rs.FilesDeleted
-
 			volumeFilesTotal += rs.Files
 			volumeFilesSize += rs.TotalSize
 			volumeFilesDeleted += rs.FilesDeleted
@@ -414,26 +475,49 @@ func (sp *ClusterStatusPrinter) printFilesInfo() {
 		regularFilesSize += (volumeFilesSize / rc)
 		regularFilesDeleted += (volumeFilesDeleted / rc)
 	}
 	regularFiles := regularFilesTotal - regularFilesDeleted
-	regularFilesRaw := regularFilesTotalRaw - regularFilesDeletedRaw
-	var avgFileSize uint64
+	var avgRegularFileSize uint64
 	if regularFilesTotal != 0 {
-		avgFileSize = regularFilesSize / regularFilesTotal
+		avgRegularFileSize = regularFilesSize / regularFilesTotal
+	}
+
+	var ecFilesTotal, ecFilesDeleted, ecFilesSize uint64
+
+	for _, ecStats := range sp.ecVolumesStats {
+		ecFilesTotal += ecStats.Files
+		ecFilesSize += ecStats.TotalSize
+		ecFilesDeleted += ecStats.FilesDeleted
+	}
+	ecFiles := ecFilesTotal - ecFilesDeleted
+	var avgEcFileSize uint64
+	if ecFilesTotal != 0 {
+		avgEcFileSize = ecFilesSize / ecFilesTotal
+	}
+
+	files := regularFiles + ecFiles
+	filesDeleted := regularFilesDeleted + ecFilesDeleted
+	filesTotal := regularFilesTotal + ecFilesTotal
+	filesSize := regularFilesSize + ecFilesSize
+	var avgFileSize uint64
+	if filesTotal != 0 {
+		avgFileSize = filesSize / filesTotal
 	}
 
 	sp.write("files:")
-	sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
+	sp.write("\ttotal: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
+		sp.uint64(filesTotal), sp.uint64Plural(filesTotal, "file"),
+		sp.uint64(files), sp.uint64Pct(files, filesTotal),
+		sp.uint64(filesDeleted), sp.uint64Pct(filesDeleted, filesTotal),
+		sp.bytes(avgFileSize))
+	sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
 		sp.uint64(regularFilesTotal), sp.uint64Plural(regularFilesTotal, "file"),
 		sp.uint64(regularFiles), sp.uint64Pct(regularFiles, regularFilesTotal),
 		sp.uint64(regularFilesDeleted), sp.uint64Pct(regularFilesDeleted, regularFilesTotal),
-		sp.bytes(avgFileSize))
-	sp.write("\tregular raw: %s %s, %s readable (%s), %s deleted (%s), %s total",
-		sp.uint64(regularFilesTotalRaw), sp.uint64Plural(regularFilesTotalRaw, "file"),
-		sp.uint64(regularFilesRaw), sp.uint64Pct(regularFilesRaw, regularFilesTotalRaw),
-		sp.uint64(regularFilesDeletedRaw), sp.uint64Pct(regularFilesDeletedRaw, regularFilesTotalRaw),
-		sp.bytes(regularFilesSizeRaw))
-	sp.write("\tEC: [no data]")
-	sp.write("\tEC raw: [no data]")
+		sp.bytes(avgRegularFileSize))
+	sp.write("\tEC: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
+		sp.uint64(ecFilesTotal), sp.uint64Plural(ecFilesTotal, "file"),
+		sp.uint64(ecFiles), sp.uint64Pct(ecFiles, ecFilesTotal),
+		sp.uint64(ecFilesDeleted), sp.uint64Pct(ecFilesDeleted, ecFilesTotal),
+		sp.bytes(avgEcFileSize))
 	sp.write("")
 }
 
diff --git a/weed/shell/command_cluster_status_test.go b/weed/shell/command_cluster_status_test.go
index 6528c026f..69ed3f5a9 100644
--- a/weed/shell/command_cluster_status_test.go
+++ b/weed/shell/command_cluster_status_test.go
@@ -104,20 +104,18 @@ func TestPrintStorageInfo(t *testing.T) {
 		{
 			testTopology2,
 			true,
 			`storage:
-	total: 5.9 TB
-	regular volumes: 5.9 TB
-	EC volumes: 0 B
-	raw: 18 TB on volume replicas, 0 B on EC shards
+	total: 5.9 TB (18 TB raw, 299.97%)
+	regular volumes: 5.9 TB (18 TB raw, 299.97%)
+	EC volumes: 0 B (0 B raw, 0%)
 
 `,
 		},
 		{
 			testTopology2,
 			false,
 			`storage:
-	total: 5892610895448 byte(s)
-	regular volumes: 5892610895448 byte(s)
-	EC volumes: 0 byte(s)
-	raw: 17676186754616 byte(s) on volume replicas, 0 byte(s) on EC shards
+	total: 5892610895448 byte(s) (17676186754616 byte(s) raw, 299.97%)
+	regular volumes: 5892610895448 byte(s) (17676186754616 byte(s) raw, 299.97%)
+	EC volumes: 0 byte(s) (0 byte(s) raw, 0.00%)
 
 `,
 		},
@@ -141,12 +139,13 @@ func TestPrintStorageInfo(t *testing.T) {
 
 func TestPrintFilesInfo(t *testing.T) {
 	testCases := []struct {
-		regularVolumeStats RegularVolumeStats
-		humanize           bool
-		want               string
+		regularVolumesStats RegularVolumesStats
+		ecVolumesStats      EcVolumesStats
+		humanize            bool
+		want                string
 	}{
 		{
-			regularVolumeStats: RegularVolumeStats{
+			regularVolumesStats: RegularVolumesStats{
 				1: []*VolumeReplicaStats{
 					&VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
 					&VolumeReplicaStats{Id: "10.200.17.13:9002", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
@@ -163,17 +162,20 @@ func TestPrintFilesInfo(t *testing.T) {
 					&VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
 				},
 			},
+			ecVolumesStats: EcVolumesStats{
+				10: &EcVolumeStats{VolumeId: 10, Files: 30, FilesDeleted: 0, TotalSize: 34879032},
+				11: &EcVolumeStats{VolumeId: 11, Files: 55, FilesDeleted: 5, TotalSize: 55540341},
+			},
 			humanize: false,
 			want: `files:
-	regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file
-	regular raw: 1500 file(s), 1413 readable (94.20%), 87 deleted (5.80%), 795585624 byte(s) total
-	EC: [no data]
-	EC raw: [no data]
+	total: 585 file(s), 551 readable (94.19%), 34 deleted (5.81%), avg 607888 byte(s) per file
+	regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file
+	EC: 85 file(s), 80 readable (94.12%), 5 deleted (5.88%), avg 1063757 byte(s) per file
 
 `,
 		},
 		{
-			regularVolumeStats: RegularVolumeStats{
+			regularVolumesStats: RegularVolumesStats{
 				1: []*VolumeReplicaStats{
 					&VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
 					&VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
@@ -187,12 +189,15 @@ func TestPrintFilesInfo(t *testing.T) {
&VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530}, }, }, + ecVolumesStats: EcVolumesStats{ + 20: &EcVolumeStats{VolumeId: 20, Files: 22, FilesDeleted: 10, TotalSize: 27328233}, + 30: &EcVolumeStats{VolumeId: 30, Files: 16, FilesDeleted: 11, TotalSize: 11193827}, + }, humanize: true, want: `files: - regular: 600 files, 551 readable (91.83%), 49 deleted (8.16%), avg 488 kB per file - regular raw: 1,200 files, 1,102 readable (91.83%), 98 deleted (8.16%), 586 MB total - EC: [no data] - EC raw: [no data] + total: 638 files, 568 readable (89.02%), 70 deleted (10.97%), avg 519 kB per file + regular: 600 files, 551 readable (91.83%), 49 deleted (8.16%), avg 488 kB per file + EC: 38 files, 17 readable (44.73%), 21 deleted (55.26%), avg 1.0 MB per file `, }, @@ -201,9 +206,10 @@ func TestPrintFilesInfo(t *testing.T) { for i, tc := range testCases { var buf bytes.Buffer sp := &ClusterStatusPrinter{ - writer: &buf, - humanize: tc.humanize, - regularVolumeStats: tc.regularVolumeStats, + writer: &buf, + humanize: tc.humanize, + regularVolumesStats: tc.regularVolumesStats, + ecVolumesStats: tc.ecVolumesStats, } sp.printFilesInfo() got := buf.String()
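
Why the `a%b == 0` special case in `uint64Ratio`/`uint64Pct` matters: `float64` represents integers exactly only up to 2^53, so for very large counters `float64(a) / float64(b)` can land just off an exact multiple, while doing the division in integer math first stays exact. A minimal standalone sketch (not part of the patch) demonstrating the drift:

```
package main

import "fmt"

func main() {
	// b sits just above 2^53, the limit of exact integer representation
	// in a float64; a is exactly 3*b.
	b := uint64(1)<<53 + 1
	a := 3 * b

	// Converting both operands rounds them, so the quotient drifts off
	// the exact ratio.
	fmt.Println(float64(a) / float64(b)) // 3.0000000000000004
	// Integer division first keeps exact multiples exact.
	fmt.Println(float64(a / b)) // 3
}
```

Under `%.02f` the drift is normally rounded away; the guard matters for very large values and for higher-precision humanized formatting.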
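The EC branch in `loadFileStats` collects stats once per EC volume even though each volume's shards are reported by several nodes. A minimal sketch of that mutex-guarded dedup pattern; `seen` and the print statement are illustrative stand-ins for `ecVolumesStats` and the `VolumeEcShardsInfo` RPC:

```
package main

import (
	"fmt"
	"sync"
)

func main() {
	// One entry per shard location; volume IDs repeat across nodes.
	shards := []uint32{10, 10, 11, 10, 11}

	var mu sync.Mutex
	seen := map[uint32]bool{}
	var wg sync.WaitGroup

	for _, id := range shards {
		wg.Add(1)
		go func(id uint32) {
			defer wg.Done()

			// Claim the volume ID before doing the work, so concurrent
			// workers handling its other shards skip it.
			mu.Lock()
			if seen[id] {
				mu.Unlock()
				return
			}
			seen[id] = true
			mu.Unlock()

			fmt.Printf("querying stats for EC volume %d\n", id)
		}(id)
	}
	wg.Wait()
}
```

Note this sketch claims the ID before querying, so duplicate work cannot happen; the patch checks before the RPC but records the result only afterwards, so two nodes can occasionally issue the same query, which the patch tolerates since the later result simply overwrites the earlier one.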