From af4c3fcb311559e0a8d97c76bc133a09fa0a06df Mon Sep 17 00:00:00 2001
From: Chris Lu
Date: Sat, 7 Mar 2026 09:18:48 -0800
Subject: [PATCH] ec: fall back to data dir when ecx file not found in idx dir (#8541)

* ec: fall back to data dir when ecx file not found in idx dir (#8540)

When -dir.idx is configured after EC encoding, the .ecx/.ecj files
remain in the data directory. NewEcVolume now falls back to the data
directory when the index file is not found in dirIdx.

* ec: add fallback logging and improved error message for ecx lookup

* ec: preserve configured dirIdx, track actual ecx location separately

The previous fallback set ev.dirIdx = dir when finding .ecx in the data
directory, which corrupted IndexBaseFileName() for future writes (e.g.,
WriteIdxFileFromEcIndex during EC-to-volume conversion would write the
.idx file to the data directory instead of the configured index
directory). Introduce ecxActualDir to track where .ecx/.ecj were
actually found, used only by FileName() for cleanup/destroy.
IndexBaseFileName() continues to use the configured dirIdx for new file
creation.

* ec: check both idx and data dirs for .ecx in all cleanup and lookup paths

When -dir.idx is configured after EC encoding, .ecx/.ecj files may
reside in the data directory.
Several code paths only checked l.IdxDirectory, causing them to miss these files: - removeEcVolumeFiles: now removes .ecx/.ecj from both directories - loadExistingVolume: ecx existence check falls back to data dir - deleteEcShardIdsForEachLocation: ecx existence check and cleanup both cover the data directory - VolumeEcShardsRebuild: ecx lookup falls back to data directory so RebuildEcxFile operates on the correct file --- weed/server/volume_grpc_erasure_coding.go | 24 ++++++++++++++++++----- weed/storage/disk_location.go | 4 ++++ weed/storage/disk_location_ec.go | 5 +++++ weed/storage/erasure_coding/ec_volume.go | 17 ++++++++++++++-- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/weed/server/volume_grpc_erasure_coding.go b/weed/server/volume_grpc_erasure_coding.go index af32e801a..803aed937 100644 --- a/weed/server/volume_grpc_erasure_coding.go +++ b/weed/server/volume_grpc_erasure_coding.go @@ -154,7 +154,12 @@ func (vs *VolumeServer) VolumeEcShardsRebuild(ctx context.Context, req *volume_s continue } - if util.FileExists(path.Join(location.IdxDirectory, baseFileName+".ecx")) { + indexBaseFileName := path.Join(location.IdxDirectory, baseFileName) + if !util.FileExists(indexBaseFileName+".ecx") && location.IdxDirectory != location.Directory { + // .ecx may be in the data directory if created before -dir.idx was configured + indexBaseFileName = path.Join(location.Directory, baseFileName) + } + if util.FileExists(indexBaseFileName + ".ecx") { // write .ec00 ~ .ec13 files dataBaseFileName := path.Join(location.Directory, baseFileName) if generatedShardIds, err := erasure_coding.RebuildEcFiles(dataBaseFileName); err != nil { @@ -163,9 +168,8 @@ func (vs *VolumeServer) VolumeEcShardsRebuild(ctx context.Context, req *volume_s rebuiltShardIds = generatedShardIds } - indexBaseFileName := path.Join(location.IdxDirectory, baseFileName) if err := erasure_coding.RebuildEcxFile(indexBaseFileName); err != nil { - return nil, fmt.Errorf("RebuildEcxFile %s: %v", 
dataBaseFileName, err) + return nil, fmt.Errorf("RebuildEcxFile %s: %v", indexBaseFileName, err) } break @@ -283,7 +287,11 @@ func deleteEcShardIdsForEachLocation(bName string, location *storage.DiskLocatio indexBaseFilename := path.Join(location.IdxDirectory, bName) dataBaseFilename := path.Join(location.Directory, bName) - if util.FileExists(path.Join(location.IdxDirectory, bName+".ecx")) { + ecxExists := util.FileExists(path.Join(location.IdxDirectory, bName+".ecx")) + if !ecxExists && location.IdxDirectory != location.Directory { + ecxExists = util.FileExists(path.Join(location.Directory, bName+".ecx")) + } + if ecxExists { for _, shardId := range shardIds { shardFileName := dataBaseFilename + erasure_coding.ToExt(int(shardId)) if util.FileExists(shardFileName) { @@ -303,10 +311,16 @@ func deleteEcShardIdsForEachLocation(bName string, location *storage.DiskLocatio } if hasEcxFile && existingShardCount == 0 { - if err := os.Remove(indexBaseFilename + ".ecx"); err != nil { + // Remove .ecx/.ecj from both idx and data directories + // since they may be in either location depending on when -dir.idx was configured + if err := os.Remove(indexBaseFilename + ".ecx"); err != nil && !os.IsNotExist(err) { return err } os.Remove(indexBaseFilename + ".ecj") + if location.IdxDirectory != location.Directory { + os.Remove(dataBaseFilename + ".ecx") + os.Remove(dataBaseFilename + ".ecj") + } if !hasIdxFile { // .vif is used for ec volumes and normal volumes diff --git a/weed/storage/disk_location.go b/weed/storage/disk_location.go index 9ab6790e4..9ff7d5e5b 100644 --- a/weed/storage/disk_location.go +++ b/weed/storage/disk_location.go @@ -172,6 +172,10 @@ func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind Ne // skip if ec volumes exists, but validate EC files first if skipIfEcVolumesExists { ecxFilePath := filepath.Join(l.IdxDirectory, volumeName+".ecx") + if !util.FileExists(ecxFilePath) && l.IdxDirectory != l.Directory { + // .ecx may have been 
created before -dir.idx was configured + ecxFilePath = filepath.Join(l.Directory, volumeName+".ecx") + } if util.FileExists(ecxFilePath) { // Validate EC volume: shard count, size consistency, and expected size vs .dat file if !l.validateEcVolume(collection, vid) { diff --git a/weed/storage/disk_location_ec.go b/weed/storage/disk_location_ec.go index 035f5bc21..cc6ef8f8a 100644 --- a/weed/storage/disk_location_ec.go +++ b/weed/storage/disk_location_ec.go @@ -476,6 +476,11 @@ func (l *DiskLocation) removeEcVolumeFiles(collection string, vid needle.VolumeI // EC loading for incomplete/missing shards on next startup removeFile(indexBaseFileName+".ecx", "EC index file") removeFile(indexBaseFileName+".ecj", "EC journal file") + // Also try the data directory in case .ecx/.ecj were created before -dir.idx was configured + if l.IdxDirectory != l.Directory { + removeFile(baseFileName+".ecx", "EC index file (fallback)") + removeFile(baseFileName+".ecj", "EC journal file (fallback)") + } // Remove all EC shard files (.ec00 ~ .ec31) from data directory // Use MaxShardCount (32) to support custom EC ratios diff --git a/weed/storage/erasure_coding/ec_volume.go b/weed/storage/erasure_coding/ec_volume.go index f926f1925..effb78f28 100644 --- a/weed/storage/erasure_coding/ec_volume.go +++ b/weed/storage/erasure_coding/ec_volume.go @@ -28,6 +28,7 @@ type EcVolume struct { Collection string dir string dirIdx string + ecxActualDir string // directory where .ecx/.ecj were actually found (may differ from dirIdx after fallback) ecxFile *os.File ecxFileSize int64 ecxCreatedAt time.Time @@ -51,8 +52,20 @@ func NewEcVolume(diskType types.DiskType, dir string, dirIdx string, collection indexBaseFileName := EcShardFileName(collection, dirIdx, int(vid)) // open ecx file + ev.ecxActualDir = dirIdx if ev.ecxFile, err = os.OpenFile(indexBaseFileName+".ecx", os.O_RDWR, 0644); err != nil { - return nil, fmt.Errorf("cannot open ec volume index %s.ecx: %v", indexBaseFileName, err) + if dirIdx != dir 
&& os.IsNotExist(err) { + // fall back to data directory if idx directory does not have the .ecx file + firstErr := err + glog.V(1).Infof("ecx file not found at %s.ecx, falling back to %s.ecx", indexBaseFileName, dataBaseFileName) + if ev.ecxFile, err = os.OpenFile(dataBaseFileName+".ecx", os.O_RDWR, 0644); err != nil { + return nil, fmt.Errorf("open ecx index %s.ecx: %v; fallback %s.ecx: %v", indexBaseFileName, firstErr, dataBaseFileName, err) + } + indexBaseFileName = dataBaseFileName + ev.ecxActualDir = dir + } else { + return nil, fmt.Errorf("cannot open ec volume index %s.ecx: %v", indexBaseFileName, err) + } } ecxFi, statErr := ev.ecxFile.Stat() if statErr != nil { @@ -197,7 +210,7 @@ func (ev *EcVolume) Destroy() { func (ev *EcVolume) FileName(ext string) string { switch ext { case ".ecx", ".ecj": - return ev.IndexBaseFileName() + ext + return EcShardFileName(ev.Collection, ev.ecxActualDir, int(ev.VolumeId)) + ext } // .vif return ev.DataBaseFileName() + ext