From 330ba7d9dcfbd6a30e9d83783d3c9e73cc28c12b Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sat, 7 Feb 2026 21:33:02 -0800 Subject: [PATCH] Fix disk errors handling in vacuum compaction (#8244) When a disk reports IO errors during vacuum compaction (e.g., 'read /mnt/d1/weed/oc_xyz.dat: input/output error'), the vacuum task should signal the error to the master so it can: 1. Drop the faulty volume replica 2. Rebuild the replica from healthy copies Changes: - Add checkReadWriteError() calls in vacuum read paths (ReadNeedleBlob, ReadData, ScanVolumeFile) to flag EIO errors in volume.lastIoError - Preserve error wrapping using %w format instead of %v so EIO propagates correctly - The existing heartbeat logic will detect lastIoError and remove the bad volume Fixes issue #8237 --- weed/storage/volume_read.go | 6 +++--- weed/storage/volume_vacuum.go | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/weed/storage/volume_read.go b/weed/storage/volume_read.go index 9a209ced5..376dafb44 100644 --- a/weed/storage/volume_read.go +++ b/weed/storage/volume_read.go @@ -219,10 +219,10 @@ func ScanVolumeFile(dirname string, collection string, id needle.VolumeId, volumeFileScanner VolumeFileScanner) (err error) { var v *Volume if v, err = loadVolumeWithoutIndex(dirname, collection, id, needleMapKind, needle.GetCurrentVersion()); err != nil { - return fmt.Errorf("failed to load volume %d: %v", id, err) + return fmt.Errorf("failed to load volume %d: %w", id, err) } if err = volumeFileScanner.VisitSuperBlock(v.SuperBlock); err != nil { - return fmt.Errorf("failed to process volume %d super block: %v", id, err) + return fmt.Errorf("failed to process volume %d super block: %w", id, err) } defer v.Close() @@ -239,7 +239,7 @@ func ScanVolumeFileFrom(version needle.Version, datBackend backend.BackendStorag if e == io.EOF { return nil } - return fmt.Errorf("cannot read %s at offset %d: %v", datBackend.Name(), offset, e) + return fmt.Errorf("cannot read %s at offset %d: %w", datBackend.Name(), offset, e) } for n != nil { var needleBody []byte diff --git a/weed/storage/volume_vacuum.go b/weed/storage/volume_vacuum.go index e5e0691e3..17c53023d 100644 --- a/weed/storage/volume_vacuum.go +++ b/weed/storage/volume_vacuum.go @@ -326,7 +326,8 @@ func (v *Volume) makeupDiff(newDatFileName, newIdxFileName, oldDatFileName, oldI var needleBytes []byte needleBytes, err = needle.ReadNeedleBlob(oldDatBackend, increIdxEntry.offset.ToActualOffset(), increIdxEntry.size, v.Version()) if err != nil { - return fmt.Errorf("ReadNeedleBlob %s key %d offset %d size %d failed: %v", oldDatFile.Name(), key, increIdxEntry.offset.ToActualOffset(), increIdxEntry.size, err) + v.checkReadWriteError(err) + return fmt.Errorf("ReadNeedleBlob %s key %d offset %d size %d failed: %w", oldDatFile.Name(), key, increIdxEntry.offset.ToActualOffset(), increIdxEntry.size, err) } dstDatBackend.Write(needleBytes) if err := dstDatBackend.Sync(); err != nil { @@ -421,6 +422,7 @@ func (v *Volume) copyDataAndGenerateIndexFile(dstName, idxName string, prealloca } err = ScanVolumeFile(v.dir, v.Collection, v.Id, v.needleMapKind, scanner) if err != nil { + v.checkReadWriteError(err) return err } @@ -476,7 +478,8 @@ func (v *Volume) copyDataBasedOnIndexFile(srcDatName, srcIdxName, dstDatName, da n := new(needle.Needle) if err := n.ReadData(srcDatBackend, offset.ToActualOffset(), size, version); err != nil { - return fmt.Errorf("cannot hydrate needle from file: %s", err) + v.checkReadWriteError(err) + return fmt.Errorf("cannot hydrate needle from file: %w", err) } if n.HasTtl() && now >= n.LastModified+uint64(sb.Ttl.Minutes()*60) {