From 689930f092c5b02bea168cc96ab8762ab353e27d Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Fri, 31 May 2019 00:19:13 -0700 Subject: [PATCH] forget shards that are broken --- weed/storage/store_ec.go | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/weed/storage/store_ec.go b/weed/storage/store_ec.go index e934df841..92a63da8f 100644 --- a/weed/storage/store_ec.go +++ b/weed/storage/store_ec.go @@ -157,13 +157,17 @@ func (s *Store) readOneEcShardInterval(ctx context.Context, ecVolume *erasure_co } } else { ecVolume.ShardLocationsLock.RLock() - sourceDataNodes, _ := ecVolume.ShardLocations[shardId] + sourceDataNodes, hasShardIdLocation := ecVolume.ShardLocations[shardId] ecVolume.ShardLocationsLock.RUnlock() // try reading directly - _, err = s.readRemoteEcShardInterval(ctx, sourceDataNodes, ecVolume.VolumeId, shardId, data, actualOffset) - if err == nil { - return + if hasShardIdLocation { + _, err = s.readRemoteEcShardInterval(ctx, sourceDataNodes, ecVolume.VolumeId, shardId, data, actualOffset) + if err == nil { + return + } + glog.V(0).Infof("clearing ec shard %d.%d locations: %v", ecVolume.VolumeId, shardId, err) + forgetShardId(ecVolume, shardId) } // try reading by recovering from other shards @@ -176,6 +180,13 @@ func (s *Store) readOneEcShardInterval(ctx context.Context, ecVolume *erasure_co return } +func forgetShardId(ecVolume *erasure_coding.EcVolume, shardId erasure_coding.ShardId) { + // failed to access the source data nodes, clear it up + ecVolume.ShardLocationsLock.Lock() + delete(ecVolume.ShardLocations, shardId) + ecVolume.ShardLocationsLock.Unlock() +} + func (s *Store) cachedLookupEcShardLocations(ctx context.Context, ecVolume *erasure_coding.EcVolume) (err error) { if ecVolume.ShardLocationsRefreshTime.Add(10 * time.Minute).After(time.Now()) { @@ -265,7 +276,7 @@ func (s *Store) doReadRemoteEcShardInterval(ctx context.Context, sourceDataNode } func (s *Store) recoverOneRemoteEcShardInterval(ctx context.Context, ecVolume *erasure_coding.EcVolume, shardIdToRecover erasure_coding.ShardId, buf []byte, offset int64) (n int, err error) { - glog.V(1).Infof("recover ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover) + glog.V(4).Infof("recover ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover) enc, err := reedsolomon.New(erasure_coding.DataShardsCount, erasure_coding.ParityShardsCount) if err != nil { @@ -294,7 +305,8 @@ func (s *Store) recoverOneRemoteEcShardInterval(ctx context.Context, ecVolume *e data := make([]byte, len(buf)) nRead, readErr := s.readRemoteEcShardInterval(ctx, locations, ecVolume.VolumeId, shardId, data, offset) if readErr != nil { - glog.V(4).Infof("recover: readRemoteEcShardInterval %d.%d %d bytes from %+v: %v", ecVolume.VolumeId, shardId, nRead, locations, readErr) + glog.V(3).Infof("recover: readRemoteEcShardInterval %d.%d %d bytes from %+v: %v", ecVolume.VolumeId, shardId, nRead, locations, readErr) + forgetShardId(ecVolume, shardId) } if nRead == len(buf) { bufs[shardId] = data @@ -309,7 +321,7 @@ func (s *Store) recoverOneRemoteEcShardInterval(ctx context.Context, ecVolume *e glog.V(3).Infof("recovered ec shard %d.%d failed: %v", ecVolume.VolumeId, shardIdToRecover, err) return 0, err } - glog.V(3).Infof("recovered ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover) + glog.V(4).Infof("recovered ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover) copy(buf, bufs[shardIdToRecover])