From e3d0f9e4ce5dc7d7fe17cb0719f344b137987d45 Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 26 Oct 2025 11:46:22 -0700 Subject: [PATCH] Added Shard Size Validation --- weed/storage/disk_location_ec.go | 39 +++++++++++++---- weed/storage/disk_location_ec_test.go | 62 +++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/weed/storage/disk_location_ec.go b/weed/storage/disk_location_ec.go index c8dc3c444..bb71236c6 100644 --- a/weed/storage/disk_location_ec.go +++ b/weed/storage/disk_location_ec.go @@ -42,6 +42,18 @@ func (l *DiskLocation) DestroyEcVolume(vid needle.VolumeId) { } } +// unloadEcVolume removes an EC volume from memory without deleting its files on disk. +// This is useful for distributed EC volumes where shards may be on other servers. +func (l *DiskLocation) unloadEcVolume(vid needle.VolumeId) { + l.ecVolumesLock.Lock() + defer l.ecVolumesLock.Unlock() + + if ecVolume, found := l.ecVolumes[vid]; found { + ecVolume.Close() + delete(l.ecVolumes, vid) + } +} + func (l *DiskLocation) CollectEcShards(vid needle.VolumeId, shardFileNames []string) (ecVolume *erasure_coding.EcVolume, found bool) { l.ecVolumesLock.RLock() defer l.ecVolumesLock.RUnlock() @@ -202,9 +214,9 @@ func (l *DiskLocation) loadAllEcShards() (err error) { if datExists && len(sameVolumeShards) < erasure_coding.DataShardsCount { glog.Warningf("Incomplete EC encoding for volume %d: .dat exists but only %d shards found (need at least %d), cleaning up EC files...", volumeId, len(sameVolumeShards), erasure_coding.DataShardsCount) - // Clean up any in-memory state before removing files - l.DestroyEcVolume(volumeId) l.removeEcVolumeFiles(collection, volumeId) + // Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles).
+ l.unloadEcVolume(volumeId) sameVolumeShards = nil prevVolumeId = 0 continue @@ -215,14 +227,12 @@ func (l *DiskLocation) loadAllEcShards() (err error) { // If .dat is gone, log error but don't clean up (may be waiting for shards from other servers) if datExists { glog.Warningf("Failed to load EC shards for volume %d and .dat exists: %v, cleaning up EC files to use .dat...", volumeId, err) - // Clean up any partially loaded in-memory state before removing files - l.DestroyEcVolume(volumeId) l.removeEcVolumeFiles(collection, volumeId) } else { glog.Warningf("Failed to load EC shards for volume %d: %v (this may be normal for distributed EC volumes)", volumeId, err) - // Clean up any partially loaded in-memory state even if we don't remove files - l.DestroyEcVolume(volumeId) } + // Clean up any partially loaded in-memory state. This does not delete files. + l.unloadEcVolume(volumeId) sameVolumeShards = nil prevVolumeId = 0 continue @@ -248,9 +258,9 @@ func (l *DiskLocation) loadAllEcShards() (err error) { if util.FileExists(datFileName) { glog.Warningf("Found %d EC shards without .ecx file for volume %d (incomplete encoding interrupted before .ecx creation), cleaning up...", len(sameVolumeShards), volumeId) - // Clean up any in-memory state before removing files - l.DestroyEcVolume(volumeId) l.removeEcVolumeFiles(collection, volumeId) + // Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles).
+ l.unloadEcVolume(volumeId) } } } @@ -300,6 +310,7 @@ func (l *DiskLocation) EcShardCount() int { // validateEcVolume checks if EC volume has enough shards to be functional // For distributed EC volumes (where .dat is deleted), any number of shards is valid // For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards +// Also validates that all shards have the same size (required for Reed-Solomon EC) func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool { baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid)) datFileName := baseFileName + ".dat" @@ -312,13 +323,23 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) return true } - // .dat file exists, so we need to validate shard count for local EC + // .dat file exists, so we need to validate shard count and size for local EC shardCount := 0 + var expectedShardSize int64 = -1 + for i := 0; i < erasure_coding.TotalShardsCount; i++ { shardFileName := baseFileName + erasure_coding.ToExt(i) if fi, err := os.Stat(shardFileName); err == nil { // Check if file has non-zero size if fi.Size() > 0 { + // Validate all shards are the same size (required for Reed-Solomon EC) + if expectedShardSize == -1 { + expectedShardSize = fi.Size() + } else if fi.Size() != expectedShardSize { + glog.V(0).Infof("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)", + vid, i, fi.Size(), expectedShardSize) + return false + } shardCount++ } } else if !os.IsNotExist(err) { diff --git a/weed/storage/disk_location_ec_test.go b/weed/storage/disk_location_ec_test.go index ec125bd30..875bd5bb3 100644 --- a/weed/storage/disk_location_ec_test.go +++ b/weed/storage/disk_location_ec_test.go @@ -426,3 +426,65 @@ func TestEcCleanupWithSeparateIdxDirectory(t *testing.T) { t.Errorf(".dat file should remain but was deleted") } } + +// TestDistributedEcVolumeNoFileDeletion verifies that distributed EC volumes +// (where .dat is deleted) do NOT have their shard files deleted when load fails +// This tests the critical bug fix where DestroyEcVolume was incorrectly deleting files +func TestDistributedEcVolumeNoFileDeletion(t *testing.T) { + tempDir := t.TempDir() + + minFreeSpace := util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"} + diskLocation := &DiskLocation{ + Directory: tempDir, + DirectoryUuid: "test-uuid", + IdxDirectory: tempDir, + DiskType: types.HddType, + MinFreeSpace: minFreeSpace, + ecVolumes: make(map[needle.VolumeId]*erasure_coding.EcVolume), + } + + collection := "" + volumeId := needle.VolumeId(500) + baseFileName := erasure_coding.EcShardFileName(collection, tempDir, int(volumeId)) + + // Create EC shards (only 5 shards - not enough to load, but this is a distributed EC volume) + for i := 0; i < 5; i++ { + shardFile, err := os.Create(baseFileName + erasure_coding.ToExt(i)) + if err != nil { + t.Fatalf("Failed to create shard file: %v", err) + } + shardFile.WriteString("dummy shard data") + shardFile.Close() + } + + // Create .ecx file to trigger EC loading + ecxFile, err := os.Create(baseFileName + ".ecx") + if err != nil { + t.Fatalf("Failed to create .ecx file: %v", err) + } + ecxFile.WriteString("dummy ecx data") + ecxFile.Close() + + // NO .dat file - this is a distributed EC volume + + // Run loadAllEcShards - this should fail but NOT delete shard files + loadErr := diskLocation.loadAllEcShards() + if loadErr != nil { + t.Logf("loadAllEcShards returned error (expected): %v", loadErr) + } + + // CRITICAL CHECK: Verify shard files still exist (should NOT be deleted) + for i := 0; i < 5; i++ { + shardFile := baseFileName + erasure_coding.ToExt(i) + if !util.FileExists(shardFile) { + t.Errorf("CRITICAL BUG: Shard file %s was deleted for distributed EC volume!", shardFile) + } + } + + // Verify .ecx file still exists (should NOT be deleted for distributed EC) + if !util.FileExists(baseFileName + ".ecx") { + t.Errorf("CRITICAL BUG: .ecx file was deleted for distributed EC volume!") + } + + t.Logf("SUCCESS: Distributed EC volume files preserved (not deleted)") +}