
Give `cluster.status` detailed file metrics for regular volumes (#7791)

* Implement a `weed shell` command to return a status overview of the cluster.

Detailed file information will be implemented in a follow-up MR. Note also
that masters currently do not report EC shard sizes correctly via
`master_pb.VolumeEcShardInformationMessage.shard_sizes`; a rough sketch of how
those sizes could eventually be consumed follows the example output below.

For example:

```
> status

cluster:
	id:       topo
	status:   LOCKED
	nodes:    10
	topology: 1 DC(s)s, 1 disk(s) on 1 rack(s)

volumes:
	total:    3 volumes on 1 collections
	max size: 31457280000 bytes
	regular:  2/80 volumes on 6 replicas, 6 writable (100.00%), 0 read-only (0.00%)
	EC:       1 EC volumes on 14 shards (14.00 shards/volume)

storage:
	total:           186024424 bytes
	regular volumes: 186024424 bytes
	EC volumes:      0 bytes
	raw:             558073152 bytes on volume replicas, 0 bytes on EC shard files
```
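On that EC note: once masters populate `shard_sizes`, the follow-up could total EC storage straight from the topology snapshot. The sketch below is hypothetical and not code from this PR; the `EcShardInfos` / `ShardSizes` field names and types are assumptions, only the `shard_sizes` protobuf field itself is named in the commit message.

```go
// Hypothetical follow-up sketch, not part of this PR: sum EC shard bytes from
// the master's topology once VolumeEcShardInformationMessage.shard_sizes is
// reported correctly. EcShardInfos/ShardSizes layout is assumed here.
package shell

import "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"

func sumEcShardBytes(topo *master_pb.TopologyInfo) uint64 {
	var total uint64
	for _, dc := range topo.DataCenterInfos {
		for _, rack := range dc.RackInfos {
			for _, dn := range rack.DataNodeInfos {
				for _, disk := range dn.DiskInfos {
					for _, ecShardInfo := range disk.EcShardInfos {
						for _, size := range ecShardInfo.ShardSizes { // assumed: repeated per-shard byte sizes
							total += uint64(size)
						}
					}
				}
			}
		}
	}
	return total
}
```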

* Humanize `cluster.status` output by default.

Makes things more readable :) A short sketch of the formatting helpers this relies on follows the two examples below.

```
> cluster.status

cluster:
	id:       topo
	status:   LOCKED
	nodes:    10
	topology: 1 DC, 10 disks on 1 rack

volumes:
	total:    3 volumes, 1 collection
	max size: 32 GB
	regular:  2/80 volumes on 6 replicas, 6 writable (100%), 0 read-only (0%)
	EC:       1 EC volume on 14 shards (14 shards/volume)

storage:
	total:           172 MB
	regular volumes: 172 MB
	EC volumes:      0 B
	raw:             516 MB on volume replicas, 0 B on EC shards
```

```
> cluster.status --humanize=false

cluster:
	id:       topo
	status:   LOCKED
	nodes:    10
	topology: 1 DC(s), 10 disk(s) on 1 rack(s)

volumes:
	total:    3 volume(s), 1 collection(s)
	max size: 31457280000 byte(s)
	regular:  2/80 volume(s) on 6 replica(s), 5 writable (83.33%), 1 read-only (16.67%)
	EC:       1 EC volume(s) on 14 shard(s) (14.00 shards/volume)

storage:
	total:           172128072 byte(s)
	regular volumes: 172128072 byte(s)
	EC volumes:      0 byte(s)
	raw:             516384216 byte(s) on volume replicas, 0 byte(s) on EC shards
```
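The humanized/raw toggle leans on `github.com/dustin/go-humanize` and its `english` sub-package, as the diff below shows. Here is a minimal standalone sketch of that idea; the wrapper names `bytesFor` and `pluralFor` are hypothetical, only the library calls mirror what the printer uses.

```go
package main

import (
	"fmt"

	"github.com/dustin/go-humanize"
	"github.com/dustin/go-humanize/english"
)

// bytesFor mirrors the idea behind ClusterStatusPrinter.bytes(): humanized
// sizes ("516 MB") by default, raw byte counts with --humanize=false.
func bytesFor(b uint64, humanized bool) string {
	if !humanized {
		return fmt.Sprintf("%d byte(s)", b)
	}
	return humanize.Bytes(b)
}

// pluralFor mirrors the idea behind plural()/uint64Plural(): a "(s)" suffix
// for machine-friendly output, english.PluralWord for humanized output.
func pluralFor(n int, word string, humanized bool) string {
	if !humanized {
		return fmt.Sprintf("%s(s)", word)
	}
	return english.PluralWord(n, word, "")
}

func main() {
	fmt.Println(bytesFor(516384216, true), "vs.", bytesFor(516384216, false))
	// "516 MB" vs. "516384216 byte(s)"
	fmt.Println(pluralFor(3, "volume", true), "/", pluralFor(1, "volume", true))
	// "volumes" / "volume"
	fmt.Println(humanize.Comma(1200)) // "1,200", as in the humanized file counts
}
```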

Also adds unit tests and reshuffles the test file handling for clarity.
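The new tests are plain Go tests; assuming a full checkout of the source tree, something like the following runs them (test name taken from the diff below):

```
go test ./weed/shell -run TestPrintFilesInfo -v
```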

* `cluster.status`: Add detailed file metrics for regular volumes.
pull/7813/head · Lisandro Pin committed 2 days ago via GitHub · commit 6a1b9ce8cd
weed/shell/command_cluster_status.go (168 changed lines):

```diff
@@ -1,13 +1,19 @@
 package shell
 
 import (
+    "context"
     "flag"
     "fmt"
+    "math"
     "strings"
+    "sync"
 
     "github.com/dustin/go-humanize"
     "github.com/dustin/go-humanize/english"
+    "github.com/seaweedfs/seaweedfs/weed/operation"
+    "github.com/seaweedfs/seaweedfs/weed/pb"
     "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+    "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
     "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
     "github.com/seaweedfs/seaweedfs/weed/storage/needle"
@@ -18,15 +24,29 @@ func init() {
     Commands = append(Commands, &commandClusterStatus{})
 }
 
+// Map of volume_id -> [volume replicas] with stat details.
+type VolumeReplicaStats struct {
+    Id           string
+    VolumeId     uint32
+    Files        uint64
+    FilesDeleted uint64
+    TotalSize    uint64
+}
+type RegularVolumeStats map[uint32][]*VolumeReplicaStats
+
 type commandClusterStatus struct{}
 
 type ClusterStatusPrinter struct {
     writer             io.Writer
+    writerMu           sync.Mutex
     humanize           bool
+    maxParallelization int
     locked             bool
     collections        []string
     topology           *master_pb.TopologyInfo
     volumeSizeLimitMb  uint64
+    regularVolumeStats RegularVolumeStats
 }
 
 func (c *commandClusterStatus) Name() string {
@@ -44,6 +64,8 @@ func (c *commandClusterStatus) HasTag(CommandTag) bool {
 func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
     flags := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
     humanize := flags.Bool("humanize", true, "human-readable output")
+    includeFiles := flags.Bool("files", false, "include detailed file metrics, from all volume servers")
+    maxParallelization := flags.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible")
     if err = flags.Parse(args); err != nil {
         return err
@@ -61,12 +83,19 @@ func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer
     sp := &ClusterStatusPrinter{
         writer:             writer,
         humanize:           *humanize,
+        maxParallelization: *maxParallelization,
         locked:             commandEnv.isLocked(),
         collections:        collections,
         topology:           topology,
         volumeSizeLimitMb:  volumeSizeLimitMb,
     }
 
+    if *includeFiles {
+        if err := sp.loadFileStats(commandEnv); err != nil {
+            return err
+        }
+    }
+
     sp.Print()
 
     return nil
@@ -83,11 +112,19 @@ func (sp *ClusterStatusPrinter) int(n int) string {
     return sp.uint64(uint64(n))
 }
 
-func (sp *ClusterStatusPrinter) plural(n int, str string) string {
+func (sp *ClusterStatusPrinter) uint64Plural(n uint64, str string) string {
     if !sp.humanize {
         return fmt.Sprintf("%s(s)", str)
     }
-    return english.PluralWord(n, str, "")
+    uin := math.MaxInt
+    if n < math.MaxInt {
+        uin = int(n)
+    }
+    return english.PluralWord(int(uin), str, "")
+}
+
+func (sp *ClusterStatusPrinter) plural(n int, str string) string {
+    return sp.uint64Plural(uint64(n), str)
 }
 
 func (sp *ClusterStatusPrinter) bytes(b uint64) string {
@@ -128,16 +165,90 @@ func (sp *ClusterStatusPrinter) intPct(a, b int) string {
 }
 
 func (sp *ClusterStatusPrinter) write(format string, a ...any) {
-    fmt.Fprintf(sp.writer, strings.TrimRight(format, "\r\n "), a...)
+    sp.writerMu.Lock()
+    defer sp.writerMu.Unlock()
+
+    format = strings.TrimRight(format, " ")
+    if len(format) == 0 {
+        format = "\n"
+    }
+    fmt.Fprintf(sp.writer, format, a...)
+
+    last := format[len(format)-1:]
+    if last != "\n" && last != "\r" {
         fmt.Fprint(sp.writer, "\n")
     }
+}
 
-// TODO: add option to collect detailed file stats
 func (sp *ClusterStatusPrinter) Print() {
     sp.write("")
     sp.printClusterInfo()
     sp.printVolumeInfo()
     sp.printStorageInfo()
+    sp.printFilesInfo()
+}
+
+// TODO: collect stats for EC volumes as well
+func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
+    sp.regularVolumeStats = RegularVolumeStats{}
+
+    var mu sync.Mutex
+    var progressTotal, progressDone uint64
+
+    ewg := NewErrorWaitGroup(sp.maxParallelization)
+    for _, dci := range sp.topology.DataCenterInfos {
+        for _, ri := range dci.RackInfos {
+            for _, dni := range ri.DataNodeInfos {
+                for _, d := range dni.DiskInfos {
+                    mu.Lock()
+                    progressTotal += uint64(len(d.VolumeInfos))
+                    mu.Unlock()
+
+                    for _, v := range d.VolumeInfos {
+                        ewg.Add(func() error {
+                            // Collect regular volume stats
+                            err := operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+                                resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{
+                                    VolumeId: uint32(v.Id),
+                                })
+                                if reqErr != nil {
+                                    return reqErr
+                                }
+
+                                mu.Lock()
+                                defer mu.Unlock()
+                                if resp != nil {
+                                    if _, ok := sp.regularVolumeStats[v.Id]; !ok {
+                                        sp.regularVolumeStats[v.Id] = []*VolumeReplicaStats{}
+                                    }
+                                    sp.regularVolumeStats[v.Id] = append(sp.regularVolumeStats[v.Id], &VolumeReplicaStats{
+                                        Id:           dni.Id,
+                                        VolumeId:     v.Id,
+                                        Files:        resp.FileCount,
+                                        FilesDeleted: resp.FileDeletedCount,
+                                        TotalSize:    resp.VolumeSize,
+                                    })
+                                }
+                                progressDone++
+
+                                return nil
+                            })
+                            if err != nil {
+                                return err
+                            }
+
+                            mu.Lock()
+                            sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal))
+                            mu.Unlock()
+                            return nil
+                        })
+                    }
+                }
+            }
+        }
+    }
 
+    err := ewg.Wait()
+    sp.write("")
+
+    return err
 }
 
 func (sp *ClusterStatusPrinter) printClusterInfo() {
@@ -277,3 +388,52 @@ func (sp *ClusterStatusPrinter) printStorageInfo() {
     sp.write("\traw: %s on volume replicas, %s on EC shards", sp.bytes(rawVolumeSize), sp.bytes(rawEcVolumeSize))
     sp.write("")
 }
+
+func (sp *ClusterStatusPrinter) printFilesInfo() {
+    if len(sp.regularVolumeStats) == 0 {
+        return
+    }
+
+    var regularFilesTotal, regularFilesDeleted, regularFilesSize uint64
+    var regularFilesTotalRaw, regularFilesDeletedRaw, regularFilesSizeRaw uint64
+    for _, replicaStats := range sp.regularVolumeStats {
+        rc := uint64(len(replicaStats))
+        var volumeFilesTotal, volumeFilesSize, volumeFilesDeleted uint64
+        for _, rs := range replicaStats {
+            regularFilesTotalRaw += rs.Files
+            regularFilesSizeRaw += rs.TotalSize
+            regularFilesDeletedRaw += rs.FilesDeleted
+            volumeFilesTotal += rs.Files
+            volumeFilesSize += rs.TotalSize
+            volumeFilesDeleted += rs.FilesDeleted
+        }
+        regularFilesTotal += (volumeFilesTotal / rc)
+        regularFilesSize += (volumeFilesSize / rc)
+        regularFilesDeleted += (volumeFilesDeleted / rc)
+    }
+    regularFiles := regularFilesTotal - regularFilesDeleted
+    regularFilesRaw := regularFilesTotalRaw - regularFilesDeletedRaw
+
+    var avgFileSize uint64
+    if regularFilesTotal != 0 {
+        avgFileSize = regularFilesSize / regularFilesTotal
+    }
+
+    sp.write("files:")
+    sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
+        sp.uint64(regularFilesTotal), sp.uint64Plural(regularFilesTotal, "file"),
+        sp.uint64(regularFiles), sp.uint64Pct(regularFiles, regularFilesTotal),
+        sp.uint64(regularFilesDeleted), sp.uint64Pct(regularFilesDeleted, regularFilesTotal),
+        sp.bytes(avgFileSize))
+    sp.write("\tregular raw: %s %s, %s readable (%s), %s deleted (%s), %s total",
+        sp.uint64(regularFilesTotalRaw), sp.uint64Plural(regularFilesTotalRaw, "file"),
+        sp.uint64(regularFilesRaw), sp.uint64Pct(regularFilesRaw, regularFilesTotalRaw),
+        sp.uint64(regularFilesDeletedRaw), sp.uint64Pct(regularFilesDeletedRaw, regularFilesTotalRaw),
+        sp.bytes(regularFilesSizeRaw))
+    sp.write("\tEC: [no data]")
+    sp.write("\tEC raw: [no data]")
+    sp.write("")
+}
```
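For reference, the detailed metrics are opt-in through the new flags registered above; in `weed shell` an invocation would look something like this (illustrative, not captured output):

```
> cluster.status --files --maxParallelization=10
```

The resulting `files:` section is printed by `printFilesInfo()` and follows the layout asserted in the test expectations below.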

weed/shell/command_cluster_status_test.go (75 changed lines):

```diff
@@ -138,3 +138,78 @@ func TestPrintStorageInfo(t *testing.T) {
         }
     }
 }
+
+func TestPrintFilesInfo(t *testing.T) {
+    testCases := []struct {
+        regularVolumeStats RegularVolumeStats
+        humanize           bool
+        want               string
+    }{
+        {
+            regularVolumeStats: RegularVolumeStats{
+                1: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9002", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+                },
+                2: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9003", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+                },
+                3: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+                },
+            },
+            humanize: false,
+            want: `files:
+	regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file
+	regular raw: 1500 file(s), 1413 readable (94.20%), 87 deleted (5.80%), 795585624 byte(s) total
+	EC: [no data]
+	EC raw: [no data]
+`,
+        },
+        {
+            regularVolumeStats: RegularVolumeStats{
+                1: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
+                },
+                2: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070},
+                },
+                3: []*VolumeReplicaStats{
+                    &VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530},
+                    &VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530},
+                },
+            },
+            humanize: true,
+            want: `files:
+	regular: 600 files, 551 readable (91.83%), 49 deleted (8.16%), avg 488 kB per file
+	regular raw: 1,200 files, 1,102 readable (91.83%), 98 deleted (8.16%), 586 MB total
+	EC: [no data]
+	EC raw: [no data]
+`,
+        },
+    }
+
+    for i, tc := range testCases {
+        var buf bytes.Buffer
+        sp := &ClusterStatusPrinter{
+            writer:             &buf,
+            humanize:           tc.humanize,
+            regularVolumeStats: tc.regularVolumeStats,
+        }
+        sp.printFilesInfo()
+
+        got := buf.String()
+        if got != tc.want {
+            t.Errorf("#%d: got %v, want %v", i, got, tc.want)
+        }
+    }
+}
```