@@ -103,15 +103,58 @@ func TestS3IAMDistributedTests(t *testing.T) {
 		})
 	})
 
 	t.Run("distributed_concurrent_operations", func(t *testing.T) {
-		// Test concurrent operations across distributed instances
-		// STRINGENT APPROACH: 8 total operations (4x2) - 33% more than original (6) with rigorous error detection
-		// Target >87.5% success rate to catch concurrency regressions while allowing minimal CI infrastructure issues
+		// Test concurrent operations across distributed instances with robust retry mechanisms.
+		// This approach implements proper retry logic instead of tolerating errors, so real concurrency issues are caught.
 		const numGoroutines = 4             // Optimal concurrency for CI reliability
 		const numOperationsPerGoroutine = 2 // Minimal operations per goroutine
+		const maxRetries = 3                // Maximum retry attempts for transient failures
+		const retryDelay = 100 * time.Millisecond
 
 		var wg sync.WaitGroup
 		errors := make(chan error, numGoroutines*numOperationsPerGoroutine)
 
+		// Helper function to determine if an error is retryable
+		isRetryableError := func(err error) bool {
+			if err == nil {
+				return false
+			}
+			errorMsg := err.Error()
+			return strings.Contains(errorMsg, "timeout") ||
+				strings.Contains(errorMsg, "connection reset") ||
+				strings.Contains(errorMsg, "temporary failure") ||
+				strings.Contains(errorMsg, "TooManyRequests") ||
+				strings.Contains(errorMsg, "ServiceUnavailable") ||
+				strings.Contains(errorMsg, "InternalError")
+		}
+
+		// Helper function to execute operations with retry logic
+		executeWithRetry := func(operation func() error, operationName string) error {
+			var lastErr error
+			for attempt := 0; attempt <= maxRetries; attempt++ {
+				if attempt > 0 {
+					time.Sleep(retryDelay * time.Duration(attempt)) // Backoff grows linearly with each attempt
+				}
+				lastErr = operation()
+				if lastErr == nil {
+					return nil // Success
+				}
+				if !isRetryableError(lastErr) {
+					// Non-retryable error - fail immediately
+					return fmt.Errorf("%s failed with non-retryable error: %w", operationName, lastErr)
+				}
+				// Retryable error - continue to next attempt
+				if attempt < maxRetries {
+					t.Logf("Retrying %s (attempt %d/%d) after error: %v", operationName, attempt+1, maxRetries, lastErr)
+				}
+			}
+			// All retries exhausted
+			return fmt.Errorf("%s failed after %d retries, last error: %w", operationName, maxRetries, lastErr)
+		}
+
 		for i := 0; i < numGoroutines; i++ {
 			wg.Add(1)
 			go func(goroutineID int) {
@@ -119,55 +162,70 @@ func TestS3IAMDistributedTests(t *testing.T) {
 				client, err := framework.CreateS3ClientWithJWT(fmt.Sprintf("user-%d", goroutineID), "TestAdminRole")
 				if err != nil {
-					errors <- err
+					errors <- fmt.Errorf("failed to create S3 client for goroutine %d: %w", goroutineID, err)
 					return
 				}
 
 				for j := 0; j < numOperationsPerGoroutine; j++ {
 					bucketName := fmt.Sprintf("test-concurrent-%d-%d", goroutineID, j)
+					objectKey := "test-object.txt"
+					objectContent := fmt.Sprintf("content-%d-%d", goroutineID, j)
 
-					// Create bucket
-					if err := framework.CreateBucket(client, bucketName); err != nil {
-						errors <- err
-						continue
-					}
-
-					// Moderate delay to reduce server load and improve CI stability
-					time.Sleep(200 * time.Millisecond)
-
-					// Put object
-					objectKey := "test-object.txt"
-					if err := framework.PutTestObject(client, bucketName, objectKey, fmt.Sprintf("content-%d-%d", goroutineID, j)); err != nil {
-						errors <- err
-						continue
-					}
-
-					// Moderate delay to reduce server load and improve CI stability
-					time.Sleep(200 * time.Millisecond)
-
-					// Get object
-					if _, err := framework.GetTestObject(client, bucketName, objectKey); err != nil {
-						errors <- err
-						continue
-					}
-
-					// Moderate delay to reduce server load and improve CI stability
-					time.Sleep(200 * time.Millisecond)
-
-					// Delete object
-					if err := framework.DeleteTestObject(client, bucketName, objectKey); err != nil {
-						errors <- err
-						continue
-					}
-
-					// Moderate delay to reduce server load and improve CI stability
-					time.Sleep(200 * time.Millisecond)
-
-					// Delete bucket
-					if _, err := client.DeleteBucket(&s3.DeleteBucketInput{
-						Bucket: aws.String(bucketName),
-					}); err != nil {
-						errors <- err
-						continue
-					}
-
-					// Moderate delay to reduce server load and improve CI stability
-					time.Sleep(200 * time.Millisecond)
+					// Execute full operation sequence with individual retries
+					operationFailed := false
+
+					// 1. Create bucket with retry
+					if err := executeWithRetry(func() error {
+						return framework.CreateBucket(client, bucketName)
+					}, fmt.Sprintf("CreateBucket-%s", bucketName)); err != nil {
+						errors <- err
+						operationFailed = true
+					}
+
+					if !operationFailed {
+						// 2. Put object with retry
+						if err := executeWithRetry(func() error {
+							return framework.PutTestObject(client, bucketName, objectKey, objectContent)
+						}, fmt.Sprintf("PutObject-%s/%s", bucketName, objectKey)); err != nil {
+							errors <- err
+							operationFailed = true
+						}
+					}
+
+					if !operationFailed {
+						// 3. Get object with retry
+						if err := executeWithRetry(func() error {
+							_, err := framework.GetTestObject(client, bucketName, objectKey)
+							return err
+						}, fmt.Sprintf("GetObject-%s/%s", bucketName, objectKey)); err != nil {
+							errors <- err
+							operationFailed = true
+						}
+					}
+
+					if !operationFailed {
+						// 4. Delete object with retry
+						if err := executeWithRetry(func() error {
+							return framework.DeleteTestObject(client, bucketName, objectKey)
+						}, fmt.Sprintf("DeleteObject-%s/%s", bucketName, objectKey)); err != nil {
+							errors <- err
+							operationFailed = true
+						}
+					}
+
+					// 5. Always attempt bucket cleanup, even if previous operations failed
+					if err := executeWithRetry(func() error {
+						_, err := client.DeleteBucket(&s3.DeleteBucketInput{
+							Bucket: aws.String(bucketName),
+						})
+						return err
+					}, fmt.Sprintf("DeleteBucket-%s", bucketName)); err != nil {
+						// Only log cleanup failures, don't fail the test
+						t.Logf("Warning: Failed to cleanup bucket %s: %v", bucketName, err)
+					}
+
+					// Small delay between operation sequences to reduce server load
+					time.Sleep(50 * time.Millisecond)
 				}
 			}(i)
 		}
@@ -175,74 +233,37 @@ func TestS3IAMDistributedTests(t *testing.T) {
 		wg.Wait()
 		close(errors)
 
-		// Analyze errors with categorization for better diagnostics
+		// Collect and analyze errors - with retry logic, we should see very few errors
 		var errorList []error
-		var transientErrors []error
-		var seriousErrors []error
 		for err := range errors {
 			errorList = append(errorList, err)
-			errorMsg := err.Error()
-			// Categorize errors: transient vs serious
-			if strings.Contains(errorMsg, "timeout") ||
-				strings.Contains(errorMsg, "connection reset") ||
-				strings.Contains(errorMsg, "temporary failure") ||
-				strings.Contains(errorMsg, "TooManyRequests") {
-				transientErrors = append(transientErrors, err)
-			} else {
-				seriousErrors = append(seriousErrors, err)
-			}
 		}
 
 		totalOperations := numGoroutines * numOperationsPerGoroutine
-		errorRate := float64(len(errorList)) / float64(totalOperations)
-		seriousErrorRate := float64(len(seriousErrors)) / float64(totalOperations)
-		transientErrorRate := float64(len(transientErrors)) / float64(totalOperations)
 
-		// Detailed error reporting
-		if len(errorList) > 0 {
+		// Report results
+		if len(errorList) == 0 {
+			t.Logf("🎉 All %d concurrent operations completed successfully with retry mechanisms!", totalOperations)
+		} else {
 			t.Logf("Concurrent operations summary:")
 			t.Logf("  Total operations: %d", totalOperations)
-			t.Logf("  Failed operations: %d (%.1f%% error rate)", len(errorList), errorRate*100)
-			t.Logf("  Serious errors: %d (%.1f%% rate)", len(seriousErrors), seriousErrorRate*100)
-			t.Logf("  Transient errors: %d (%.1f%% rate)", len(transientErrors), transientErrorRate*100)
+			t.Logf("  Failed operations: %d (%.1f%% error rate)", len(errorList), float64(len(errorList))/float64(totalOperations)*100)
 
-			if len(seriousErrors) > 0 {
-				t.Logf("  First serious error: %v", seriousErrors[0])
-			}
-			if len(transientErrors) > 0 {
-				t.Logf("  First transient error: %v", transientErrors[0])
-			}
+			// Log first few errors for debugging
+			for i, err := range errorList {
+				if i >= 3 { // Limit to first 3 errors
+					t.Logf("  ... and %d more errors", len(errorList)-3)
+					break
+				}
+				t.Logf("  Error %d: %v", i+1, err)
+			}
 		}
 
-		// STRINGENT CONCURRENCY TESTING: More rigorous thresholds to catch regressions while accounting for CI variability
-		// For totalOperations=8, target >87.5% success rate (≤12.5% error rate) to detect concurrency issues
-		// Serious errors (race conditions, deadlocks) should be very limited - allow only 1 for CI infrastructure issues
-		// Based on observed data: 1-3 errors due to volume allocation constraints, not actual concurrency bugs
-		maxSeriousErrors := 1 // Allow 1 serious error (12.5%) for CI infrastructure limitations only
-		if len(seriousErrors) > maxSeriousErrors {
-			t.Errorf("❌ %d serious error(s) detected (%.1f%%), exceeding threshold of %d. This indicates potential concurrency bugs. First error: %v",
-				len(seriousErrors), float64(len(seriousErrors))/float64(totalOperations)*100, maxSeriousErrors, seriousErrors[0])
-		}
-
-		// For total errors, use stringent thresholds to catch regressions while allowing minimal CI infrastructure issues
-		// Target >87.5% success rate to ensure system reliability and catch concurrency problems early
-		maxTotalErrorsStrict := 1  // Allow max 1 total error (12.5% rate) - excellent performance target
-		maxTotalErrorsRelaxed := 2 // Allow max 2 total errors (25% rate) - acceptable with infrastructure constraints
-		if len(errorList) > maxTotalErrorsRelaxed {
-			t.Errorf("❌ Too many total errors: %d (%.1f%%) - exceeds threshold of %d (%.1f%%). System may have concurrency issues.",
-				len(errorList), errorRate*100, maxTotalErrorsRelaxed, float64(maxTotalErrorsRelaxed)/float64(totalOperations)*100)
-		} else if len(errorList) > maxTotalErrorsStrict {
-			t.Logf("⚠️ Concurrent operations completed with %d errors (%.1f%%) - acceptable but monitor for patterns.",
-				len(errorList), errorRate*100)
-		} else if len(errorList) > 0 {
-			t.Logf("✅ Concurrent operations completed with %d errors (%.1f%%) - excellent performance!",
-				len(errorList), errorRate*100)
-		} else {
-			t.Logf("🎉 All %d concurrent operations completed successfully - perfect concurrency handling!", totalOperations)
-		}
+		// With proper retry mechanisms, we should expect near-zero failures
+		// Any remaining errors likely indicate real concurrency issues or system problems
+		if len(errorList) > 0 {
+			t.Errorf("❌ %d operation(s) failed even after retry mechanisms (%.1f%% failure rate). This indicates potential system issues or race conditions that need investigation.",
+				len(errorList), float64(len(errorList))/float64(totalOperations)*100)
+		}
 	})
 }