From 09b0a2505ccfb671c008e6ef31d22ff4946cb6c3 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Sun, 23 Nov 2025 21:12:52 -0800
Subject: [PATCH] fix: poll for files to appear instead of fixed sleep

The issue: Fixed 5-second sleep was too short - files not written yet

The solution: Poll every second for up to 30 seconds
- Check if files exist in employees directory
- Download immediately when they appear
- Log progress every 5 seconds

Hardening of the polling loop:
- curl -f, so an HTTP error page is never saved and reported as a
  successful parquet download
- "|| true" on the listing pipeline, so an empty listing (grep exit 1)
  cannot abort a step that runs under "bash -e"
- sort -u on the listing, so a name listed twice is downloaded once
- keep polling until a download actually succeeds
- test.parquet is copied from the first good file only, as the inline
  comment always claimed

This gives us a 30-second window to catch the file between:
- Write (file appears)
- Read (EOF error)

The file should appear within a few seconds of SparkSQLTest starting,
and we'll grab it immediately!
---
NOTE(review): the leading whitespace of every hunk line below was
reconstructed; confirm it against the workflow file before applying, or
apply with --ignore-whitespace. The "index" line still names the blobs of
the original revision of this patch.

 .github/workflows/spark-integration-tests.yml | 56 +++++++++++++------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 2cdca458e..ca23cc2eb 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -135,24 +135,48 @@ jobs:
             if docker compose logs spark-tests 2>&1 | grep -q "Running seaweed.spark.SparkSQLTest"; then
               if [ "$DOWNLOADED" = "false" ]; then
                 echo ""
-                echo "=== SparkSQLTest started! Waiting for employees file creation ==="
-                sleep 5 # Give it time to write the file
+                echo "=== SparkSQLTest started! Polling for employees file creation ==="
 
-                # List all files in employees directory
-                echo "Listing employees directory..."
-                EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet')
+                # Poll once per second (max 30 attempts) so the file is grabbed as
+                # soon as it is listed, before the test reads it and hits the EOF error.
+                for i in {1..30}; do
+                  # grep exits 1 while nothing matches; "|| true" keeps that from
+                  # aborting a step run under "bash -e" (the GitHub Actions default).
+                  # sort -u de-duplicates in case the listing names a file twice.
+                  EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" 2>/dev/null | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet' | sort -u || true)
+
+                  if [ -n "$EMPLOYEES_FILES" ]; then
+                    echo "Files appeared after $i seconds!"
+                    echo "Found employees files, downloading ALL of them..."
+                    for FILE in $EMPLOYEES_FILES; do
+                      echo "Downloading: $FILE"
+                      # -f: an HTTP error must fail the transfer instead of saving the
+                      # error page, which would otherwise pass the non-empty check.
+                      if curl -f -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}" && [ -f "$FILE" ] && [ -s "$FILE" ]; then
+                        FILE_SIZE=$(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null)
+                        echo "SUCCESS: Downloaded $FILE_SIZE bytes"
+                        if [ "$DOWNLOADED" = "false" ]; then
+                          cp "$FILE" test.parquet  # Only the first good file is analysed
+                        fi
+                        DOWNLOADED=true
+                      else
+                        echo "WARNING: Download of $FILE failed"
+                      fi
+                    done
+                    # Stop only once something was saved; otherwise keep polling.
+                    if [ "$DOWNLOADED" = "true" ]; then
+                      break
+                    fi
+                  fi
+
+                  if [ $((i % 5)) -eq 0 ]; then
+                    echo "Still waiting for files... ($i/30)"
+                  fi
+                  sleep 1
+                done
 
-                if [ -n "$EMPLOYEES_FILES" ]; then
-                  echo "Found employees files, downloading ALL of them..."
-                  for FILE in $EMPLOYEES_FILES; do
-                    echo "Downloading: $FILE"
-                    curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}"
-                    if [ -f "$FILE" ] && [ -s "$FILE" ]; then
-                      echo "SUCCESS: Downloaded $(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null) bytes"
-                      cp "$FILE" test.parquet # Use first file for analysis
-                      DOWNLOADED=true
-                    fi
-                  done
+                if [ "$DOWNLOADED" = "false" ]; then
+                  echo "WARNING: No files downloaded after 30 seconds of polling"
                 fi
               fi
             fi