From 15ca301e43d3a4231180b8aba3ad668c7254b7bc Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 5 Jan 2026 12:05:31 -0800 Subject: [PATCH] Fix flaky EC integration tests by collecting server logs on failure (#7969) * Fix flaky EC integration tests by collecting server logs on failure The EC Integration Tests were experiencing flaky timeouts with errors like "error reading from server: EOF" and master client reconnection attempts. When tests failed, server logs were not collected, making debugging difficult. Changes: - Updated all test functions to use t.TempDir() instead of os.MkdirTemp() and manual cleanup. The intent is for the t.TempDir() directories to remain available when tests fail, so that logs can be collected for debugging (NOTE(review): the Go testing documentation states that t.TempDir() directories are removed automatically when the test and all its subtests complete, so verify that they actually persist on failure). - Modified GitHub Actions workflow to collect server logs from temp directories when tests fail, including master.log and volume*.log files. - Added explicit log collection step that searches for test temp directories and copies them to artifacts for upload. This will make debugging flaky test failures much easier by providing access to the actual server logs showing what went wrong. * Fix find command precedence in log collection The -type d flag only applied to the first -name predicate because -o has lower precedence than the implicit AND. Grouped the -name predicates with escaped parentheses so -type d applies to all directory name patterns. --- .github/workflows/ec-integration-tests.yml | 22 +++++++++++- test/erasure_coding/ec_integration_test.go | 40 +++++++++------------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ec-integration-tests.yml b/.github/workflows/ec-integration-tests.yml index 7a6188886..d8d8fdfd3 100644 --- a/.github/workflows/ec-integration-tests.yml +++ b/.github/workflows/ec-integration-tests.yml @@ -33,9 +33,29 @@ jobs: run: | go test -v + - name: Collect server logs on failure + if: failure() + run: | + echo "Collecting server logs from temp directories..." 
+ mkdir -p /tmp/ec-test-logs + # Find all temp directories created by the tests (they persist on failure with t.TempDir()) + find /tmp -maxdepth 1 -type d \( -name "TestEC*" -o -name "TestDisk*" -o -name "TestCross*" -o -name "TestEvacuation*" \) 2>/dev/null | while read dir; do + if [ -d "$dir" ]; then + echo "Found test directory: $dir" + # Copy the entire directory structure to preserve organization + cp -r "$dir" /tmp/ec-test-logs/ 2>/dev/null || true + fi + done + # List what we collected + echo "Collected logs:" + find /tmp/ec-test-logs -type f -name "*.log" 2>/dev/null || echo "No logs found" + - name: Archive logs if: failure() uses: actions/upload-artifact@v6 with: name: ec-integration-test-logs - path: test/erasure_coding \ No newline at end of file + path: | + /tmp/ec-test-logs/ + test/erasure_coding/ + if-no-files-found: warn \ No newline at end of file diff --git a/test/erasure_coding/ec_integration_test.go b/test/erasure_coding/ec_integration_test.go index 25693cb75..f42e586cc 100644 --- a/test/erasure_coding/ec_integration_test.go +++ b/test/erasure_coding/ec_integration_test.go @@ -33,9 +33,8 @@ func TestECEncodingVolumeLocationTimingBug(t *testing.T) { } // Create temporary directory for test data - testDir, err := os.MkdirTemp("", "seaweedfs_ec_integration_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() // Start SeaweedFS cluster with multiple volume servers ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) @@ -244,9 +243,8 @@ func TestECEncodingMasterTimingRaceCondition(t *testing.T) { } // Create temporary directory for test data - testDir, err := os.MkdirTemp("", "seaweedfs_ec_race_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() // Start SeaweedFS cluster ctx, cancel := 
context.WithTimeout(context.Background(), 60*time.Second) @@ -783,9 +781,8 @@ func TestDiskAwareECRebalancing(t *testing.T) { t.Skip("Skipping disk-aware integration test in short mode") } - testDir, err := os.MkdirTemp("", "seaweedfs_disk_aware_ec_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) defer cancel() @@ -1217,9 +1214,8 @@ func TestECDiskTypeSupport(t *testing.T) { t.Skip("Skipping disk type integration test in short mode") } - testDir, err := os.MkdirTemp("", "seaweedfs_ec_disktype_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) defer cancel() @@ -1558,9 +1554,8 @@ func TestECDiskTypeMixedCluster(t *testing.T) { t.Skip("Skipping mixed disk type integration test in short mode") } - testDir, err := os.MkdirTemp("", "seaweedfs_ec_mixed_disktype_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) defer cancel() @@ -1748,9 +1743,8 @@ func TestEvacuationFallbackBehavior(t *testing.T) { t.Skip("Skipping evacuation fallback test in short mode") } - testDir, err := os.MkdirTemp("", "seaweedfs_evacuation_fallback_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) defer cancel() @@ -1842,9 +1836,8 @@ func TestCrossRackECPlacement(t *testing.T) { t.Skip("Skipping cross-rack EC placement test in short mode") } - testDir, err := os.MkdirTemp("", 
"seaweedfs_cross_rack_ec_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) defer cancel() @@ -2196,9 +2189,8 @@ func TestECEncodeReplicatedVolumeSync(t *testing.T) { } // Create temporary directory for test data - testDir, err := os.MkdirTemp("", "seaweedfs_ec_replica_sync_test_") - require.NoError(t, err) - defer os.RemoveAll(testDir) + // Using t.TempDir() automatically preserves logs when tests fail + testDir := t.TempDir() // Start SeaweedFS cluster ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)