Browse Source

validate shard size

pull/7384/head
chrislu 1 month ago
parent
commit
96caa76101
  1. 43
      weed/storage/disk_location_ec.go
  2. 42
      weed/storage/disk_location_ec_test.go

43
weed/storage/disk_location_ec.go

@ -209,11 +209,11 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
datFileName := baseFileName + ".dat"
datExists := util.FileExists(datFileName)
// Only validate shard count if .dat file exists (incomplete EC encoding scenario)
// Validate EC volume if .dat file exists (incomplete EC encoding scenario)
// This checks shard count, shard size consistency, and expected size vs .dat file
// If .dat is gone, EC encoding completed and shards are distributed across servers
if datExists && len(sameVolumeShards) < erasure_coding.DataShardsCount {
glog.Warningf("Incomplete EC encoding for volume %d: .dat exists but only %d shards found (need at least %d), cleaning up EC files...",
volumeId, len(sameVolumeShards), erasure_coding.DataShardsCount)
if datExists && !l.validateEcVolume(collection, volumeId) {
glog.Warningf("Incomplete or invalid EC volume %d: .dat exists but validation failed, cleaning up EC files...", volumeId)
l.removeEcVolumeFiles(collection, volumeId)
// Clean up any in-memory state. This does not delete files (already deleted by removeEcVolumeFiles).
l.unloadEcVolume(volumeId)
@ -311,13 +311,24 @@ func (l *DiskLocation) EcShardCount() int {
// For distributed EC volumes (where .dat is deleted), any number of shards is valid
// For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards
// Also validates that all shards have the same size (required for Reed-Solomon EC)
// If .dat exists, it also validates shards match the expected size based on .dat file size
func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool {
baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
datFileName := baseFileName + ".dat"
datExists := util.FileExists(datFileName)
shardCount := 0
var expectedShardSize int64 = -1
datExists := false
// If .dat file exists, compute expected shard size from it
if datFileInfo, err := os.Stat(datFileName); err == nil {
datExists = true
// Each shard should be approximately datFileSize / DataShardsCount (10 data shards)
// Note: Due to block alignment and padding, actual shard size may be slightly larger
expectedShardSize = datFileInfo.Size() / erasure_coding.DataShardsCount
}
shardCount := 0
var actualShardSize int64 = -1
// Count shards and validate they all have the same size (required for Reed-Solomon EC)
for i := 0; i < erasure_coding.TotalShardsCount; i++ {
@ -326,11 +337,11 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId)
// Check if file has non-zero size
if fi.Size() > 0 {
// Validate all shards are the same size (required for Reed-Solomon EC)
if expectedShardSize == -1 {
expectedShardSize = fi.Size()
} else if fi.Size() != expectedShardSize {
if actualShardSize == -1 {
actualShardSize = fi.Size()
} else if fi.Size() != actualShardSize {
glog.V(0).Infof("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)",
vid, i, fi.Size(), expectedShardSize)
vid, i, fi.Size(), actualShardSize)
return false
}
shardCount++
@ -340,6 +351,18 @@ func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId)
}
}
// If .dat file exists, validate shard size is reasonable compared to expected size
// Due to block alignment and padding in EC encoding, actual shard size can be slightly larger
// We allow up to SmallBlockSize (1MB) of padding per shard for block alignment
if datExists && actualShardSize > 0 {
maxExpectedSize := expectedShardSize + erasure_coding.ErasureCodingSmallBlockSize
if actualShardSize < expectedShardSize || actualShardSize > maxExpectedSize {
glog.V(0).Infof("EC volume %d: shard size %d outside expected range [%d, %d] (based on .dat file size with padding)",
vid, actualShardSize, expectedShardSize, maxExpectedSize)
return false
}
}
// If .dat file is gone, this is a distributed EC volume - any shard count is valid
if !datExists {
glog.V(1).Infof("EC volume %d: distributed EC (.dat removed) with %d shards", vid, shardCount)

42
weed/storage/disk_location_ec_test.go

@ -113,13 +113,18 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) {
// Setup test files
baseFileName := erasure_coding.EcShardFileName(tt.collection, tempDir, int(tt.volumeId))
// Standard shard size for test cases
shardSize := 1024 // 1KB per shard
// Create .dat file if needed
// .dat file size should be DataShardsCount * shard size (10 shards)
if tt.createDatFile {
datFile, err := os.Create(baseFileName + ".dat")
if err != nil {
t.Fatalf("Failed to create .dat file: %v", err)
}
datFile.WriteString("dummy data")
datData := make([]byte, erasure_coding.DataShardsCount*shardSize)
datFile.Write(datData)
datFile.Close()
}
@ -129,7 +134,8 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) {
if err != nil {
t.Fatalf("Failed to create shard file: %v", err)
}
shardFile.WriteString("dummy shard data")
shardData := make([]byte, shardSize)
shardFile.Write(shardData)
shardFile.Close()
}
@ -256,19 +262,32 @@ func TestValidateEcVolume(t *testing.T) {
numShards: 0, // Will create 10 zero-byte files below
expectValid: false,
},
{
name: "Invalid: .dat exists with different size shards",
volumeId: 205,
collection: "",
createDatFile: true,
numShards: 10, // Will create shards with varying sizes
expectValid: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
baseFileName := erasure_coding.EcShardFileName(tt.collection, tempDir, int(tt.volumeId))
// Standard shard size for normal test cases
shardSize := 1024 // 1KB per shard
// Create .dat file if needed
// .dat file size should be DataShardsCount * shard size (10 shards)
if tt.createDatFile {
datFile, err := os.Create(baseFileName + ".dat")
if err != nil {
t.Fatalf("Failed to create .dat file: %v", err)
}
datFile.WriteString("dummy data")
datData := make([]byte, erasure_coding.DataShardsCount*shardSize)
datFile.Write(datData)
datFile.Close()
}
@ -278,7 +297,8 @@ func TestValidateEcVolume(t *testing.T) {
if err != nil {
t.Fatalf("Failed to create shard file: %v", err)
}
shardFile.WriteString("dummy shard data")
shardData := make([]byte, shardSize)
shardFile.Write(shardData)
shardFile.Close()
}
@ -294,6 +314,20 @@ func TestValidateEcVolume(t *testing.T) {
}
}
// For mismatched shard size test case, create shards with different sizes
if tt.volumeId == 205 {
for i := 0; i < 10; i++ {
shardFile, err := os.Create(baseFileName + erasure_coding.ToExt(i))
if err != nil {
t.Fatalf("Failed to create shard file: %v", err)
}
// Write different amount of data to each shard
data := make([]byte, 100+i*10)
shardFile.Write(data)
shardFile.Close()
}
}
// Test validation
isValid := diskLocation.validateEcVolume(tt.collection, tt.volumeId)
if isValid != tt.expectValid {

Loading…
Cancel
Save