Filer: Add retry mechanism for failed file deletions

Implement a retry queue with exponential backoff for handling transient
deletion failures, particularly when volumes are temporarily read-only.

Key features:
- Automatic retry for retryable errors (read-only volumes, network issues)
- Exponential backoff: 5min → 10min → 20min → ... (max 6 hours)
- Maximum 10 retry attempts per file before giving up
- Separate goroutine processing the retry queue every minute
- Map-based retry queue for O(1) lookups and deletions
- Enhanced logging with retry/permanent error classification
- Consistent error detail limiting (max 10 total errors logged)
- Graceful shutdown support with quit channel for both processors

This addresses the issue where file deletions fail while volumes are
temporarily read-only (tiered volumes, maintenance, etc.); previously such
failed deletions were simply lost.
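
For reference, the backoff schedule described above works out as follows. This is a minimal sketch: InitialRetryDelay and MaxRetryAttempts appear in the diff below, while MaxRetryDelay and backoffDelay are names assumed here for illustration only.

```go
package main

import (
	"fmt"
	"time"
)

const (
	InitialRetryDelay = 5 * time.Minute // first retry after 5 minutes
	MaxRetryDelay     = 6 * time.Hour   // cap from the commit message (constant name assumed)
	MaxRetryAttempts  = 10              // give up after 10 attempts
)

// backoffDelay returns the delay before retry attempt n (1-based):
// 5m, 10m, 20m, ... doubling each attempt, capped at MaxRetryDelay.
func backoffDelay(n int) time.Duration {
	delay := InitialRetryDelay << (n - 1) // double per attempt
	if delay > MaxRetryDelay {
		delay = MaxRetryDelay
	}
	return delay
}

func main() {
	for attempt := 1; attempt <= MaxRetryAttempts; attempt++ {
		fmt.Printf("attempt %2d: next retry in %v\n", attempt, backoffDelay(attempt))
	}
}
```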

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Branch: pull/7402/head
Authored by Dmitriy Pavlov 1 month ago, committed by Dimonyga
Commit: e8f52ccd4c

Changed files:
  weed/filer/filer.go           (3 changes)
  weed/filer/filer_deletion.go  (46 changes)

weed/filer/filer.go (3 changes)

@@ -54,6 +54,7 @@ type Filer struct {
 	RemoteStorage     *FilerRemoteStorage
 	Dlm               *lock_manager.DistributedLockManager
 	MaxFilenameLength uint32
+	deletionQuit      chan struct{}
 }

 func NewFiler(masters pb.ServerDiscovery, grpcDialOption grpc.DialOption, filerHost pb.ServerAddress, filerGroup string, collection string, replication string, dataCenter string, maxFilenameLength uint32, notifyFn func()) *Filer {
@@ -66,6 +67,7 @@ func NewFiler(masters pb.ServerDiscovery, grpcDialOption grpc.DialOption, filerH
 		UniqueFilerId:     util.RandomInt32(),
 		Dlm:               lock_manager.NewDistributedLockManager(filerHost),
 		MaxFilenameLength: maxFilenameLength,
+		deletionQuit:      make(chan struct{}),
 	}
 	if f.UniqueFilerId < 0 {
 		f.UniqueFilerId = -f.UniqueFilerId
@@ -379,6 +381,7 @@ func (f *Filer) doListDirectoryEntries(ctx context.Context, p util.FullPath, sta
 }

 func (f *Filer) Shutdown() {
+	close(f.deletionQuit)
 	f.LocalMetaLogBuffer.ShutdownLogBuffer()
 	f.Store.Shutdown()
 }
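
Aside: the deletionQuit channel relies on the fact that closing a channel unblocks every receiver, so the single close() in Shutdown stops both the deletion processor and the retry processor. A self-contained sketch of that pattern (not SeaweedFS code; worker names and timings are made up):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func worker(name string, quit <-chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-quit: // a closed channel is always ready to receive
			fmt.Println(name, "shutting down")
			return
		case <-ticker.C:
			// periodic work would go here
		}
	}
}

func main() {
	quit := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(2)
	go worker("deletion processor", quit, &wg)
	go worker("retry processor", quit, &wg)

	time.Sleep(300 * time.Millisecond)
	close(quit) // broadcast shutdown to both workers
	wg.Wait()
}
```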

weed/filer/filer_deletion.go (46 changes)

@@ -35,14 +35,14 @@ type DeletionRetryItem struct {

 // DeletionRetryQueue manages the queue of failed deletions that need to be retried
 type DeletionRetryQueue struct {
-	items     []*DeletionRetryItem
+	items     map[string]*DeletionRetryItem
 	itemsLock sync.RWMutex
 }

 // NewDeletionRetryQueue creates a new retry queue
 func NewDeletionRetryQueue() *DeletionRetryQueue {
 	return &DeletionRetryQueue{
-		items: make([]*DeletionRetryItem, 0),
+		items: make(map[string]*DeletionRetryItem),
 	}
 }
@@ -52,8 +52,7 @@ func (q *DeletionRetryQueue) AddOrUpdate(fileId string, errorMsg string) {
 	defer q.itemsLock.Unlock()

 	// Check if item already exists
-	for _, item := range q.items {
-		if item.FileId == fileId {
+	if item, exists := q.items[fileId]; exists {
 		item.RetryCount++
 		item.LastError = errorMsg
 		// Calculate next retry time with exponential backoff
@@ -65,17 +64,15 @@ func (q *DeletionRetryQueue) AddOrUpdate(fileId string, errorMsg string) {
 		glog.V(2).Infof("updated retry for %s: attempt %d, next retry in %v", fileId, item.RetryCount, delay)
 		return
 	}
-	}

 	// Add new item
 	delay := InitialRetryDelay
-	newItem := &DeletionRetryItem{
+	q.items[fileId] = &DeletionRetryItem{
 		FileId:      fileId,
 		RetryCount:  1,
 		NextRetryAt: time.Now().Add(delay),
 		LastError:   errorMsg,
 	}
-	q.items = append(q.items, newItem)
 	glog.V(2).Infof("added retry for %s: next retry in %v", fileId, delay)
 }
@@ -86,23 +83,20 @@ func (q *DeletionRetryQueue) GetReadyItems(maxItems int) []*DeletionRetryItem {
 	now := time.Now()
 	var readyItems []*DeletionRetryItem
-	var remainingItems []*DeletionRetryItem
-	for _, item := range q.items {
+	for fileId, item := range q.items {
 		if len(readyItems) < maxItems && item.NextRetryAt.Before(now) {
 			if item.RetryCount < MaxRetryAttempts {
 				readyItems = append(readyItems, item)
+				delete(q.items, fileId)
 			} else {
 				// Max attempts reached, log and discard
 				glog.Warningf("max retry attempts (%d) reached for %s, last error: %s", MaxRetryAttempts, item.FileId, item.LastError)
+				delete(q.items, fileId)
 			}
-		} else {
-			// Keep items that are not ready yet or if the batch is full
-			remainingItems = append(remainingItems, item)
 		}
 	}
-	q.items = remainingItems

 	return readyItems
 }
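
Aside: switching the queue from a slice to a map makes AddOrUpdate a single O(1) lookup, and GetReadyItems can remove entries in place because the Go spec allows delete() on the map being ranged over. A stripped-down, standalone sketch of those semantics (locking and error bookkeeping omitted; the type and method names here are illustrative, not the actual SeaweedFS types):

```go
package main

import (
	"fmt"
	"time"
)

type item struct {
	fileId      string
	retryCount  int
	nextRetryAt time.Time
}

type retryQueue struct {
	items map[string]*item
}

// addOrUpdate is O(1): one map lookup instead of scanning a slice.
func (q *retryQueue) addOrUpdate(fileId string, delay time.Duration) {
	if it, ok := q.items[fileId]; ok {
		it.retryCount++
		it.nextRetryAt = time.Now().Add(delay)
		return
	}
	q.items[fileId] = &item{fileId: fileId, retryCount: 1, nextRetryAt: time.Now().Add(delay)}
}

// readyItems removes and returns due items; delete() inside range is safe in Go.
func (q *retryQueue) readyItems(max int) []*item {
	now := time.Now()
	var ready []*item
	for id, it := range q.items {
		if len(ready) < max && it.nextRetryAt.Before(now) {
			ready = append(ready, it)
			delete(q.items, id)
		}
	}
	return ready
}

func main() {
	q := &retryQueue{items: make(map[string]*item)}
	q.addOrUpdate("file-1", -time.Minute) // already due, for the demo
	q.addOrUpdate("file-1", -time.Minute) // second failure bumps retryCount to 2
	for _, it := range q.readyItems(10) {
		fmt.Printf("%s ready, retryCount=%d\n", it.fileId, it.retryCount)
	}
}
```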
@@ -147,9 +141,15 @@ func (f *Filer) loopProcessingDeletion() {
 	// Start retry processor in a separate goroutine
 	go f.loopProcessingDeletionRetry(lookupFunc, retryQueue)

-	var deletionCount int
+	ticker := time.NewTicker(1123 * time.Millisecond)
+	defer ticker.Stop()
 	for {
-		deletionCount = 0
+		select {
+		case <-f.deletionQuit:
+			glog.V(0).Infof("deletion processor shutting down")
+			return
+		case <-ticker.C:
			f.fileIdDeletionQueue.Consume(func(fileIds []string) {
				for len(fileIds) > 0 {
					var toDeleteFileIds []string
@@ -160,7 +160,6 @@ func (f *Filer) loopProcessingDeletion() {
 					toDeleteFileIds = fileIds
 					fileIds = fileIds[:0]
 				}
-				deletionCount = len(toDeleteFileIds)
 				results := operation.DeleteFileIdsWithLookupVolumeId(f.GrpcDialOption, toDeleteFileIds, lookupFunc)

 				// Process individual results for better error tracking
@@ -177,13 +176,13 @@ func (f *Filer) loopProcessingDeletion() {
 						// Retryable error - add to retry queue
 						retryableErrorCount++
 						retryQueue.AddOrUpdate(result.FileId, result.Error)
-						if retryableErrorCount <= 10 {
+						if len(errorDetails) < 10 {
 							errorDetails = append(errorDetails, result.FileId+": "+result.Error+" (will retry)")
 						}
 					} else {
 						// Permanent error - log but don't retry
 						permanentErrorCount++
-						if permanentErrorCount <= 10 {
+						if len(errorDetails) < 10 {
 							errorDetails = append(errorDetails, result.FileId+": "+result.Error+" (permanent)")
 						}
 					}
@@ -208,9 +207,6 @@ func (f *Filer) loopProcessingDeletion() {
 					}
 				}
 			})
-		if deletionCount == 0 {
-			time.Sleep(1123 * time.Millisecond)
 		}
 	}
 }
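
Aside: the hunks above only show what happens after a deletion result has been classified; the classifier itself is outside this diff. A hypothetical sketch of the retryable/permanent split and the shared 10-entry cap on logged error details (deleteResult, isRetryable, and the substring checks are made-up illustrations, not SeaweedFS APIs):

```go
package main

import (
	"fmt"
	"strings"
)

type deleteResult struct {
	FileId string
	Error  string
}

// isRetryable treats read-only-volume and transient network failures as
// retryable; everything else is considered permanent.
func isRetryable(errMsg string) bool {
	return strings.Contains(errMsg, "read only") ||
		strings.Contains(errMsg, "connection refused") ||
		strings.Contains(errMsg, "deadline exceeded")
}

func classify(results []deleteResult) (retryable, permanent int, details []string) {
	for _, r := range results {
		if r.Error == "" {
			continue // successful deletion
		}
		if isRetryable(r.Error) {
			retryable++
			if len(details) < 10 { // cap total logged details at 10
				details = append(details, r.FileId+": "+r.Error+" (will retry)")
			}
		} else {
			permanent++
			if len(details) < 10 {
				details = append(details, r.FileId+": "+r.Error+" (permanent)")
			}
		}
	}
	return
}

func main() {
	retryable, permanent, details := classify([]deleteResult{
		{FileId: "a", Error: "volume is read only"},
		{FileId: "b", Error: "not found"},
	})
	fmt.Println(retryable, permanent, details)
}
```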
@@ -240,7 +236,12 @@ func (f *Filer) loopProcessingDeletionRetry(lookupFunc func([]string) (map[strin
 	ticker := time.NewTicker(1 * time.Minute)
 	defer ticker.Stop()

-	for range ticker.C {
+	for {
+		select {
+		case <-f.deletionQuit:
+			glog.V(0).Infof("retry processor shutting down, %d items remaining in queue", retryQueue.Size())
+			return
+		case <-ticker.C:
 			// Get items that are ready to retry
 			readyItems := retryQueue.GetReadyItems(1000)
@@ -289,6 +290,7 @@ func (f *Filer) loopProcessingDeletionRetry(lookupFunc func([]string) (map[strin
 				glog.V(1).Infof("retry: %d files still failing, will retry again later", retryCount)
 			}
 		}
+	}
 }

 func (f *Filer) DeleteUncommittedChunks(ctx context.Context, chunks []*filer_pb.FileChunk) {
