diff --git a/weed/filer/filer_deletion.go b/weed/filer/filer_deletion.go
index b3d5d0c9c..1fb071000 100644
--- a/weed/filer/filer_deletion.go
+++ b/weed/filer/filer_deletion.go
@@ -35,6 +35,27 @@ const (
 	DeletionPollInterval = 1123 * time.Millisecond
 )
 
+// retryablePatterns contains error message patterns that indicate temporary/transient conditions
+// that should be retried. These patterns are based on actual error messages from the deletion pipeline.
+var retryablePatterns = []string{
+	"is read only",              // Volume temporarily read-only (tiering, maintenance)
+	"error reading from server", // Network I/O errors
+	"connection reset by peer",  // Network connection issues
+	"closed network connection", // Network connection closed unexpectedly
+	"connection refused",        // Server temporarily unavailable
+	"timeout",                   // Operation timeout (network or server)
+	"deadline exceeded",         // Context deadline exceeded
+	"context canceled",          // Context cancellation (may be transient)
+	"lookup error",              // Volume lookup failures
+	"lookup failed",             // Volume server discovery issues
+	"too many requests",         // Rate limiting / backpressure
+	"service unavailable",       // HTTP 503 errors
+	"temporarily unavailable",   // Temporary service issues
+	"try again",                 // Explicit retry suggestion
+	"i/o timeout",               // Network I/O timeout
+	"broken pipe",               // Connection broken during operation
+}
+
 // DeletionRetryItem represents a file deletion that failed and needs to be retried
 type DeletionRetryItem struct {
 	FileId string
@@ -116,14 +137,17 @@ func NewDeletionRetryQueue() *DeletionRetryQueue {
 // Includes overflow protection and caps at MaxRetryDelay.
 func calculateBackoff(retryCount int) time.Duration {
 	// The first retry is attempt 1, but shift should start at 0
-	shiftAmount := uint(retryCount - 1)
-	if shiftAmount > 63 {
-		// Prevent overflow: use max delay directly
-		return MaxRetryDelay
+	if retryCount <= 1 {
+		return InitialRetryDelay
 	}
-	delay := InitialRetryDelay * time.Duration(1<<shiftAmount)
-	if delay > MaxRetryDelay {
-		delay = MaxRetryDelay
+
+	shiftAmount := uint(retryCount - 1)
+	// Compare against the quotient instead of multiplying, so the
+	// exponential term can never overflow time.Duration (int64).
+	if shiftAmount > 62 || time.Duration(1<<shiftAmount) > MaxRetryDelay/InitialRetryDelay {
+		return MaxRetryDelay
 	}
 
+	// Safe to multiply now; the product is at most MaxRetryDelay.
+	delay := InitialRetryDelay * time.Duration(1<<shiftAmount)
 	return delay
@@ -356,27 +380,6 @@ func isRetryableError(errorMsg string) bool {
 		return false
 	}
 
-	// Known patterns that indicate temporary/transient conditions.
-	// These are based on actual error messages from the deletion pipeline.
-	retryablePatterns := []string{
-		"is read only",              // Volume temporarily read-only (tiering, maintenance)
-		"error reading from server", // Network I/O errors
-		"connection reset by peer",  // Network connection issues
-		"closed network connection", // Network connection closed unexpectedly
-		"connection refused",        // Server temporarily unavailable
-		"timeout",                   // Operation timeout (network or server)
-		"deadline exceeded",         // Context deadline exceeded
-		"context canceled",          // Context cancellation (may be transient)
-		"lookup error",              // Volume lookup failures
-		"lookup failed",             // Volume server discovery issues
-		"too many requests",         // Rate limiting / backpressure
-		"service unavailable",       // HTTP 503 errors
-		"temporarily unavailable",   // Temporary service issues
-		"try again",                 // Explicit retry suggestion
-		"i/o timeout",               // Network I/O timeout
-		"broken pipe",               // Connection broken during operation
-	}
-
 	errorLower := strings.ToLower(errorMsg)
 	for _, pattern := range retryablePatterns {
 		if strings.Contains(errorLower, pattern) {
diff --git a/weed/filer/filer_deletion_test.go b/weed/filer/filer_deletion_test.go
index f31bae84f..5b5c7bc3e 100644
--- a/weed/filer/filer_deletion_test.go
+++ b/weed/filer/filer_deletion_test.go
@@ -95,7 +95,7 @@ func TestDeletionRetryQueue_OverflowProtection(t *testing.T) {
 	// Should not panic and should cap at MaxRetryDelay
 	queue.RequeueForRetry(item, "error")
 
-	delay := item.NextRetryAt.Sub(time.Now())
+	delay := time.Until(item.NextRetryAt)
 	if delay > MaxRetryDelay+time.Second {
 		t.Errorf("Delay exceeded MaxRetryDelay: %v > %v", delay, MaxRetryDelay)
 	}
@@ -218,11 +218,13 @@ func TestDeletionRetryQueue_HeapOrdering(t *testing.T) {
 	}
 	queue.lock.Unlock()
 
-	// Set all items to ready
+	// Set all items to ready while preserving their relative order
 	queue.lock.Lock()
 	for _, item := range queue.itemIndex {
-		item.NextRetryAt = now.Add(-1 * time.Second)
+		// Shift all times back by 40 seconds to make them ready, but preserve order
+		item.NextRetryAt = item.NextRetryAt.Add(-40 * time.Second)
 	}
+	heap.Init(&queue.heap) // Re-establish heap property after modification
 	queue.lock.Unlock()
 
 	// GetReadyItems should return in NextRetryAt order
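
A quick way to sanity-check the reworked backoff is a boundary test alongside the existing ones. The sketch below is not part of the patch: the test name is hypothetical, and it only assumes the package-level InitialRetryDelay / MaxRetryDelay constants and the calculateBackoff signature shown above, placed in a _test.go file in the filer package.

// Sketch (not in the patch): boundary checks for calculateBackoff.
// Assumes the package-level InitialRetryDelay and MaxRetryDelay constants.
func TestCalculateBackoff_Boundaries(t *testing.T) {
	// Attempts 0 and 1 both get the initial delay.
	if got := calculateBackoff(0); got != InitialRetryDelay {
		t.Errorf("retryCount=0: got %v, want %v", got, InitialRetryDelay)
	}
	if got := calculateBackoff(1); got != InitialRetryDelay {
		t.Errorf("retryCount=1: got %v, want %v", got, InitialRetryDelay)
	}

	// Delays grow monotonically and never exceed the cap.
	prev := calculateBackoff(1)
	for count := 2; count <= 100; count++ {
		d := calculateBackoff(count)
		if d < prev || d > MaxRetryDelay {
			t.Errorf("retryCount=%d: delay %v (prev %v, cap %v)", count, d, prev, MaxRetryDelay)
		}
		prev = d
	}

	// Huge counts must hit the cap instead of overflowing.
	if got := calculateBackoff(1 << 30); got != MaxRetryDelay {
		t.Errorf("huge retryCount: got %v, want %v", got, MaxRetryDelay)
	}
}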