Browse Source
Implement full scrubbing for EC volumes (#8318)
Implement full scrubbing for EC volumes (#8318)
Implement full scrubbing for EC volumes.pull/6065/merge
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 161 additions and 33 deletions
-
7weed/server/volume_grpc_scrub.go
-
17weed/storage/erasure_coding/ec_shard.go
-
73weed/storage/store_ec.go
-
97weed/storage/store_ec_scrub.go
@ -0,0 +1,97 @@ |
|||
package storage |
|||
|
|||
import ( |
|||
"fmt" |
|||
"slices" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/needle" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/types" |
|||
) |
|||
|
|||
// ScrubEcVolume checks the full integrity of a EC volume, across both local and remote shards.
|
|||
// Returns a count of processed file entries, slice of found broken shards, and slice of found errors.
|
|||
func (s *Store) ScrubEcVolume(vid needle.VolumeId) (int64, []*volume_server_pb.EcShardInfo, []error) { |
|||
ecv, found := s.FindEcVolume(vid) |
|||
if !found { |
|||
return 0, nil, []error{fmt.Errorf("EC volume id %d not found", vid)} |
|||
} |
|||
if err := s.cachedLookupEcShardLocations(ecv); err != nil { |
|||
return 0, nil, []error{fmt.Errorf("failed to locate shard via master grpc %s: %v", s.MasterAddress, err)} |
|||
} |
|||
|
|||
// full scan means verifying indexes as well
|
|||
_, errs := ecv.ScrubIndex() |
|||
|
|||
var count int64 |
|||
// reads for EC chunks can hit the same shard multiple times, so dedupe upon read errors
|
|||
brokenShardsMap := map[erasure_coding.ShardId]*volume_server_pb.EcShardInfo{} |
|||
|
|||
err := ecv.WalkIndex(func(id types.NeedleId, offset types.Offset, size types.Size) error { |
|||
count++ |
|||
if size.IsTombstone() { |
|||
// nothing to do for tombstones...
|
|||
return nil |
|||
} |
|||
|
|||
data := make([]byte, 0, needle.GetActualSize(size, ecv.Version)) |
|||
intervals := ecv.LocateEcShardNeedleInterval(ecv.Version, offset.ToActualOffset(), size) |
|||
|
|||
for i, iv := range intervals { |
|||
chunk := make([]byte, iv.Size) |
|||
shardId, offset := s.IntervalToShardIdAndOffset(iv) |
|||
|
|||
// try a local shard read first...
|
|||
if err := s.readLocalEcShardInterval(ecv, shardId, chunk, offset); err == nil { |
|||
data = append(data, chunk...) |
|||
continue |
|||
} |
|||
|
|||
// ...then remote. note we do not try to recover EC-encoded data upon read failures;
|
|||
// we want check that shards are valid without decoding
|
|||
ecv.ShardLocationsLock.RLock() |
|||
sourceDataNodes, ok := ecv.ShardLocations[shardId] |
|||
ecv.ShardLocationsLock.RUnlock() |
|||
if ok { |
|||
if _, _, err := s.readRemoteEcShardInterval(sourceDataNodes, id, ecv.VolumeId, shardId, chunk, offset); err == nil { |
|||
data = append(data, chunk...) |
|||
continue |
|||
} |
|||
} |
|||
|
|||
// chunk read for shard failed :(
|
|||
errs = append(errs, fmt.Errorf("failed to read EC shard %d for needle %d on volume %d (interval %d/%d)", shardId, id, ecv.VolumeId, i+1, len(intervals))) |
|||
brokenShardsMap[shardId] = &volume_server_pb.EcShardInfo{ |
|||
ShardId: uint32(shardId), |
|||
Size: int64(iv.Size), |
|||
Collection: ecv.Collection, |
|||
VolumeId: uint32(ecv.VolumeId), |
|||
} |
|||
break |
|||
} |
|||
|
|||
if got, want := int64(len(data)), needle.GetActualSize(size, ecv.Version); got != want { |
|||
errs = append(errs, fmt.Errorf("expected %d bytes for needle %d, got %d", want, id, got)) |
|||
return nil |
|||
} |
|||
|
|||
n := needle.Needle{} |
|||
if err := n.ReadBytes(data, 0, size, ecv.Version); err != nil { |
|||
errs = append(errs, fmt.Errorf("needle %d on EC volume %d: %v", id, ecv.VolumeId, err)) |
|||
} |
|||
|
|||
return nil |
|||
}) |
|||
if err != nil { |
|||
errs = append(errs, err) |
|||
} |
|||
|
|||
brokenShards := []*volume_server_pb.EcShardInfo{} |
|||
for _, s := range brokenShardsMap { |
|||
brokenShards = append(brokenShards, s) |
|||
} |
|||
slices.SortFunc(brokenShards, erasure_coding.CmpEcShardInfo) |
|||
|
|||
return count, brokenShards, errs |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue