From d21a5bf5d0c4b0db1951058934701c142fbd7acc Mon Sep 17 00:00:00 2001 From: Konstantin Lebedev Date: Tue, 23 Mar 2021 15:04:07 +0500 Subject: [PATCH 1/5] add missingKeysThreshold for fsck --- weed/shell/command_volume_check_disk.go | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index 64b510383..b53229dbd 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -49,6 +49,8 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write slowMode := fsckCommand.Bool("slow", false, "slow mode checks all replicas even file counts are the same") verbose := fsckCommand.Bool("v", false, "verbose mode") applyChanges := fsckCommand.Bool("force", false, "apply the fix") + missingKeysThreshold := fsckCommand.Float64("missingKeysThreshold", 0.3, "repair when missing keys is not more than this limit") + if err = fsckCommand.Parse(args); err != nil { return nil } @@ -101,10 +103,10 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write } // find and make up the differnces - if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges); err != nil { + if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges, *missingKeysThreshold); err != nil { return err } - if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges); err != nil { + if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *missingKeysThreshold); err != nil { return err } replicas = replicas[1:] @@ -114,7 +116,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write return nil } -func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool) error { +func (c *commandVolumeCheckDisk) 
doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, missingKeysThreshold float64) error { // find missing keys // hash join, can be more efficient @@ -129,6 +131,12 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m }) fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles)) + missingNeedlesPercent := float64(len(missingNeedles)) / float64(counter) + if missingNeedlesPercent > missingKeysThreshold { + return fmt.Errorf( + "failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f", + source.info.Id, missingNeedlesPercent, missingKeysThreshold) + } for _, needleValue := range missingNeedles { @@ -151,7 +159,6 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m } - return nil } @@ -161,8 +168,8 @@ func (c *commandVolumeCheckDisk) readSourceNeedleBlob(sourceVolumeServer string, resp, err := client.ReadNeedleBlob(context.Background(), &volume_server_pb.ReadNeedleBlobRequest{ VolumeId: volumeId, NeedleId: uint64(needleValue.Key), - Offset: needleValue.Offset.ToActualOffset(), - Size: int32(needleValue.Size), + Offset: needleValue.Offset.ToActualOffset(), + Size: int32(needleValue.Size), }) if err != nil { return err @@ -177,9 +184,9 @@ func (c *commandVolumeCheckDisk) writeNeedleBlobToTarget(targetVolumeServer stri return operation.WithVolumeServerClient(targetVolumeServer, c.env.option.GrpcDialOption, func(client volume_server_pb.VolumeServerClient) error { _, err := client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{ - VolumeId: volumeId, - NeedleId: uint64(needleValue.Key), - Size: int32(needleValue.Size), + VolumeId: volumeId, + NeedleId: uint64(needleValue.Key), + Size: int32(needleValue.Size), NeedleBlob: 
needleBlob, }) return err From 40a9e88c0710dffc780602e55cf14071c384276d Mon Sep 17 00:00:00 2001 From: Konstantin Lebedev Date: Tue, 23 Mar 2021 21:42:35 +0500 Subject: [PATCH 2/5] rename var --- weed/shell/command_volume_check_disk.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index b53229dbd..dc3b8ca06 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -131,11 +131,11 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m }) fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles)) - missingNeedlesPercent := float64(len(missingNeedles)) / float64(counter) - if missingNeedlesPercent > missingKeysThreshold { + missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter) + if missingNeedlesFraction > missingKeysThreshold { return fmt.Errorf( "failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f", - source.info.Id, missingNeedlesPercent, missingKeysThreshold) + source.info.Id, missingNeedlesFraction, missingKeysThreshold) } for _, needleValue := range missingNeedles { From 60972f1c9742f0eb10d712dae0d57459e7018a3d Mon Sep 17 00:00:00 2001 From: Konstantin Lebedev Date: Wed, 24 Mar 2021 13:24:49 +0500 Subject: [PATCH 3/5] rename option repairThreshold --- weed/shell/command_volume_check_disk.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index dc3b8ca06..a27c9a8ac 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -49,8 +49,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write slowMode := 
fsckCommand.Bool("slow", false, "slow mode checks all replicas even file counts are the same") verbose := fsckCommand.Bool("v", false, "verbose mode") applyChanges := fsckCommand.Bool("force", false, "apply the fix") - missingKeysThreshold := fsckCommand.Float64("missingKeysThreshold", 0.3, "repair when missing keys is not more than this limit") - + repairThreshold := fsckCommand.Float64("repairThreshold", 0.3, "repair when missing keys is not more than this limit") if err = fsckCommand.Parse(args); err != nil { return nil } @@ -103,10 +102,10 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write } // find and make up the differnces - if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges, *missingKeysThreshold); err != nil { + if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges, *repairThreshold); err != nil { return err } - if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *missingKeysThreshold); err != nil { + if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *repairThreshold); err != nil { return err } replicas = replicas[1:] @@ -116,7 +115,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write return nil } -func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, missingKeysThreshold float64) error { +func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, repairThreshold float64) error { // find missing keys // hash join, can be more efficient @@ -132,10 +131,10 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, 
counter, target.location.dataNode.Id, len(missingNeedles)) missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter) - if missingNeedlesFraction > missingKeysThreshold { + if missingNeedlesFraction > repairThreshold { return fmt.Errorf( "failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f", - source.info.Id, missingNeedlesFraction, missingKeysThreshold) + source.info.Id, missingNeedlesFraction, repairThreshold) } for _, needleValue := range missingNeedles { From df6cf0a2fabe557a85427a865aa8256783555fae Mon Sep 17 00:00:00 2001 From: Konstantin Lebedev Date: Wed, 24 Mar 2021 22:07:13 +0500 Subject: [PATCH 4/5] nonRepairThreshold --- weed/shell/command_volume_check_disk.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/weed/shell/command_volume_check_disk.go b/weed/shell/command_volume_check_disk.go index a27c9a8ac..05caa72e9 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -49,7 +49,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write slowMode := fsckCommand.Bool("slow", false, "slow mode checks all replicas even file counts are the same") verbose := fsckCommand.Bool("v", false, "verbose mode") applyChanges := fsckCommand.Bool("force", false, "apply the fix") - repairThreshold := fsckCommand.Float64("repairThreshold", 0.3, "repair when missing keys is not more than this limit") + nonRepairThreshold := fsckCommand.Float64("nonRepairThreshold", 0.3, "repair when missing keys is not more than this limit") if err = fsckCommand.Parse(args); err != nil { return nil } @@ -102,10 +102,10 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write } // find and make up the differnces - if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, *applyChanges, *repairThreshold); err != nil { + if err := c.doVolumeCheckDisk(aDB, bDB, a, b, *verbose, writer, 
*applyChanges, *nonRepairThreshold); err != nil { return err } - if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *repairThreshold); err != nil { + if err := c.doVolumeCheckDisk(bDB, aDB, b, a, *verbose, writer, *applyChanges, *nonRepairThreshold); err != nil { return err } replicas = replicas[1:] @@ -115,7 +115,7 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write return nil } -func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, repairThreshold float64) error { +func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_map.MemDb, source, target *VolumeReplica, verbose bool, writer io.Writer, applyChanges bool, nonRepairThreshold float64) error { // find missing keys // hash join, can be more efficient @@ -131,10 +131,10 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles)) missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter) - if missingNeedlesFraction > repairThreshold { + if missingNeedlesFraction > nonRepairThreshold { return fmt.Errorf( "failed to start repair volume %d, percentage of missing keys is greater than the threshold: %.2f > %.2f", - source.info.Id, missingNeedlesFraction, repairThreshold) + source.info.Id, missingNeedlesFraction, nonRepairThreshold) } for _, needleValue := range missingNeedles { From 020a5d40c32d458492a5ef4a960e58f24e12e991 Mon Sep 17 00:00:00 2001 From: Konstantin Lebedev Date: Wed, 24 Mar 2021 22:51:22 +0500 Subject: [PATCH 5/5] avoid division by zero when counter is 0 --- weed/shell/command_volume_check_disk.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/weed/shell/command_volume_check_disk.go
b/weed/shell/command_volume_check_disk.go index 05caa72e9..5a0d46869 100644 --- a/weed/shell/command_volume_check_disk.go +++ b/weed/shell/command_volume_check_disk.go @@ -130,6 +130,11 @@ func (c *commandVolumeCheckDisk) doVolumeCheckDisk(subtrahend, minuend *needle_m }) fmt.Fprintf(writer, "volume %d %s has %d entries, %s missed %d entries\n", source.info.Id, source.location.dataNode.Id, counter, target.location.dataNode.Id, len(missingNeedles)) + + if counter == 0 || len(missingNeedles) == 0 { + return nil + } + missingNeedlesFraction := float64(len(missingNeedles)) / float64(counter) if missingNeedlesFraction > nonRepairThreshold { return fmt.Errorf(