diff --git a/weed/storage/disk_location_ec.go b/weed/storage/disk_location_ec.go index 5bedd3c8f..0d39a4223 100644 --- a/weed/storage/disk_location_ec.go +++ b/weed/storage/disk_location_ec.go @@ -209,11 +209,11 @@ func (l *DiskLocation) loadAllEcShards() (err error) { datFileName := baseFileName + ".dat" datExists := util.FileExists(datFileName) - // Only validate shard count if .dat file exists (incomplete EC encoding scenario) + // Validate EC volume if .dat file exists (incomplete EC encoding scenario) + // This checks shard count, shard size consistency, and expected size vs .dat file // If .dat is gone, EC encoding completed and shards are distributed across servers - if datExists && len(sameVolumeShards) < erasure_coding.DataShardsCount { - glog.Warningf("Incomplete EC encoding for volume %d: .dat exists but only %d shards found (need at least %d), cleaning up EC files...", - volumeId, len(sameVolumeShards), erasure_coding.DataShardsCount) + if datExists && !l.validateEcVolume(collection, volumeId) { + glog.Warningf("Incomplete or invalid EC volume %d: .dat exists but validation failed, cleaning up EC files...", volumeId) l.removeEcVolumeFiles(collection, volumeId) // Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles). l.unloadEcVolume(volumeId) @@ -311,13 +311,24 @@ func (l *DiskLocation) EcShardCount() int { // For distributed EC volumes (where .dat is deleted), any number of shards is valid // For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards // Also validates that all shards have the same size (required for Reed-Solomon EC) +// If .dat exists, it also validates shards match the expected size based on .dat file size func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool { baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid)) datFileName := baseFileName + ".dat" - datExists := util.FileExists(datFileName) - shardCount := 0 var expectedShardSize int64 = -1 + datExists := false + + // If .dat file exists, compute expected shard size from it + if datFileInfo, err := os.Stat(datFileName); err == nil { + datExists = true + // Each shard should be approximately datFileSize / DataShardsCount (10 data shards) + // Note: Due to block alignment and padding, actual shard size may be slightly larger + expectedShardSize = datFileInfo.Size() / erasure_coding.DataShardsCount + } + + shardCount := 0 + var actualShardSize int64 = -1 // Count shards and validate they all have the same size (required for Reed-Solomon EC) for i := 0; i < erasure_coding.TotalShardsCount; i++ { @@ -326,11 +337,11 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) // Check if file has non-zero size if fi.Size() > 0 { // Validate all shards are the same size (required for Reed-Solomon EC) - if expectedShardSize == -1 { - expectedShardSize = fi.Size() - } else if fi.Size() != expectedShardSize { + if actualShardSize == -1 { + actualShardSize = fi.Size() + } else if fi.Size() != actualShardSize { glog.V(0).Infof("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)", - vid, i, fi.Size(), expectedShardSize) + vid, i, fi.Size(), actualShardSize) return false } shardCount++ @@ -340,6 +351,18 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) } } + // If .dat file exists, validate shard size is reasonable compared to expected size + // Due to block alignment and padding in EC encoding, actual shard size can be slightly larger + // We allow up to SmallBlockSize (1MB) of padding per shard for block alignment + if datExists && actualShardSize > 0 { + maxExpectedSize := expectedShardSize + erasure_coding.ErasureCodingSmallBlockSize + if actualShardSize < expectedShardSize || actualShardSize > maxExpectedSize { + glog.V(0).Infof("EC volume %d: shard size %d outside expected range [%d, %d] (based on .dat file size with padding)", + vid, actualShardSize, expectedShardSize, maxExpectedSize) + return false + } + } + // If .dat file is gone, this is a distributed EC volume - any shard count is valid if !datExists { glog.V(1).Infof("EC volume %d: distributed EC (.dat removed) with %d shards", vid, shardCount) diff --git a/weed/storage/disk_location_ec_test.go b/weed/storage/disk_location_ec_test.go index 875bd5bb3..f5ff8b281 100644 --- a/weed/storage/disk_location_ec_test.go +++ b/weed/storage/disk_location_ec_test.go @@ -113,13 +113,18 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) { // Setup test files baseFileName := erasure_coding.EcShardFileName(tt.collection, tempDir, int(tt.volumeId)) + // Standard shard size for test cases + shardSize := 1024 // 1KB per shard + // Create .dat file if needed + // .dat file size should be DataShardsCount * shard size (10 shards) if tt.createDatFile { datFile, err := os.Create(baseFileName + ".dat") if err != nil { t.Fatalf("Failed to create .dat file: %v", err) } - datFile.WriteString("dummy data") + datData := make([]byte, erasure_coding.DataShardsCount*shardSize) + datFile.Write(datData) datFile.Close() } @@ -129,7 +134,8 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) { if err != nil { t.Fatalf("Failed to create shard file: %v", err) } - shardFile.WriteString("dummy shard data") + shardData := make([]byte, shardSize) + shardFile.Write(shardData) shardFile.Close() } @@ -256,19 +262,32 @@ func TestValidateEcVolume(t *testing.T) { numShards: 0, // Will create 10 zero-byte files below expectValid: false, }, + { + name: "Invalid: .dat exists with different size shards", + volumeId: 205, + collection: "", + createDatFile: true, + numShards: 10, // Will create shards with varying sizes + expectValid: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { baseFileName := erasure_coding.EcShardFileName(tt.collection, tempDir, int(tt.volumeId)) + // Standard shard size for normal test cases + shardSize := 1024 // 1KB per shard + // Create .dat file if needed + // .dat file size should be DataShardsCount * shard size (10 shards) if tt.createDatFile { datFile, err := os.Create(baseFileName + ".dat") if err != nil { t.Fatalf("Failed to create .dat file: %v", err) } - datFile.WriteString("dummy data") + datData := make([]byte, erasure_coding.DataShardsCount*shardSize) + datFile.Write(datData) datFile.Close() } @@ -278,7 +297,8 @@ func TestValidateEcVolume(t *testing.T) { if err != nil { t.Fatalf("Failed to create shard file: %v", err) } - shardFile.WriteString("dummy shard data") + shardData := make([]byte, shardSize) + shardFile.Write(shardData) shardFile.Close() } @@ -294,6 +314,20 @@ func TestValidateEcVolume(t *testing.T) { } } + // For mismatched shard size test case, create shards with different sizes + if tt.volumeId == 205 { + for i := 0; i < 10; i++ { + shardFile, err := os.Create(baseFileName + erasure_coding.ToExt(i)) + if err != nil { + t.Fatalf("Failed to create shard file: %v", err) + } + // Write different amount of data to each shard + data := make([]byte, 100+i*10) + shardFile.Write(data) + shardFile.Close() + } + } + // Test validation isValid := diskLocation.validateEcVolume(tt.collection, tt.volumeId) if isValid != tt.expectValid {