Browse Source

Fix #7307: Prevent infinite loop in volume.check.disk (#7308)

The volume.check.disk command could get stuck in an infinite loop when
syncing replicas that have persistent discrepancies that cannot be
resolved. This happened because the sync loop had no maximum iteration
limit and no detection for when progress stopped being made.

Issues fixed:
1. Infinite loop: Added maxIterations limit (5) to prevent endless looping
2. Progress detection: Added detection for when the hasChanges state does not
   change between consecutive iterations, indicating the sync is stuck
3. Return value bug: Fixed naked return statement that was returning zero
   values instead of the actual hasChanges value, causing incorrect loop
   termination logic

Changes:
- Added maximum iteration limit with clear error messages
- Added progress detection to identify stuck sync situations
- Fixed return statement to properly return hasChanges and error
- Added verbose logging for sync iterations

The fix ensures that:
- Sync will terminate after 5 iterations maximum
- Users get clear messages about why sync stopped
- The hasChanges logic properly reflects deletion sync results

Fixes #7307
pull/7309/head
Chris Lu 3 days ago
committed by GitHub
parent
commit
e90809521b
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 28
      weed/shell/command_volume_check_disk.go

28
weed/shell/command_volume_check_disk.go

@ -183,11 +183,34 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
// syncTwoReplicas repeatedly runs checkBoth on replicas a and b until neither
// side reports changes, for at most maxIterations passes.
//
// applyChanges, doSyncDeletions and nonRepairThreshold are forwarded to
// checkBoth unchanged; verbose is forwarded too and also enables a per-pass
// progress line on c.writer.
//
// It returns nil once both change flags are false. It returns an error when
// checkBoth fails, when two consecutive passes report the same flags while at
// least one is still set, or when the pass limit is reached with a flag still set.
func (c *commandVolumeCheckDisk) syncTwoReplicas(a *VolumeReplica, b *VolumeReplica, applyChanges bool, doSyncDeletions bool, nonRepairThreshold float64, verbose bool) (err error) {
// Both flags start as true so the loop body runs at least once.
aHasChanges, bHasChanges := true, true
// NOTE(review): the next line looks like the pre-change, unbounded loop header
// left over from the diff rendering; the bounded loop a few lines below appears
// to replace it — confirm against the actual file.
for aHasChanges || bHasChanges {
// Upper bound on the number of checkBoth passes for one replica pair.
const maxIterations = 5
iteration := 0
for (aHasChanges || bHasChanges) && iteration < maxIterations {
iteration++
if verbose {
fmt.Fprintf(c.writer, "sync iteration %d for volume %d\n", iteration, a.info.Id)
}
// Snapshot the flags before checkBoth overwrites them, for the comparison below.
prevAHasChanges, prevBHasChanges := aHasChanges, bHasChanges
if aHasChanges, bHasChanges, err = c.checkBoth(a, b, applyChanges, doSyncDeletions, nonRepairThreshold, verbose); err != nil {
return err
}
// Detect if we're stuck in a loop with no progress
// The first pass is skipped because its "previous" flags are only the initial
// true/true seed, not a real result.
// NOTE(review): identical flags on consecutive passes are treated as "stuck";
// whether a set flag can also mean the pass applied real changes depends on
// checkBoth, which is not visible here — confirm this cannot abort a sync that
// is still converging.
if iteration > 1 && prevAHasChanges == aHasChanges && prevBHasChanges == bHasChanges && (aHasChanges || bHasChanges) {
fmt.Fprintf(c.writer, "volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n",
a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, iteration)
return fmt.Errorf("sync not making progress after %d iterations", iteration)
}
}
// Reached with a flag still set only when the flags kept changing from pass to
// pass (so the check above never fired) but never both cleared within the limit.
if iteration >= maxIterations && (aHasChanges || bHasChanges) {
fmt.Fprintf(c.writer, "volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n",
a.info.Id, maxIterations, a.location.dataNode.Id, b.location.dataNode.Id)
return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
}
return nil
}
@ -307,11 +330,10 @@ func doVolumeCheckDisk(minuend, subtrahend *needle_map.MemDb, source, target *Vo
for _, deleteResult := range deleteResults {
if deleteResult.Status == http.StatusAccepted && deleteResult.Size > 0 {
hasChanges = true
return
}
}
}
return
return hasChanges, nil
}
func readSourceNeedleBlob(grpcDialOption grpc.DialOption, sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {

Loading…
Cancel
Save