From 09b0a2505ccfb671c008e6ef31d22ff4946cb6c3 Mon Sep 17 00:00:00 2001
From: chrislu <chris.lu@gmail.com>
Date: Sun, 23 Nov 2025 21:12:52 -0800
Subject: [PATCH] fix: poll for files to appear instead of fixed sleep

The issue: the fixed 5-second sleep was too short - the files had not been written yet when the employees directory was listed.

The solution: Poll every second for up to 30 seconds
- Check if files exist in employees directory
- Download immediately when they appear
- Log progress every 5 seconds

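This gives us up to 30 seconds of polling to catch the file in the window between:
  - Write (the file appears in the employees directory)
  - Read (the point where the EOF error occurs)

The file should appear within a few seconds of SparkSQLTest starting, and polling once per second lets us download it as soon as it is listed.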
---
 .github/workflows/spark-integration-tests.yml | 44 ++++++++++++-------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 2cdca458e..ca23cc2eb 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -135,24 +135,36 @@ jobs:
             if docker compose logs spark-tests 2>&1 | grep -q "Running seaweed.spark.SparkSQLTest"; then
               if [ "$DOWNLOADED" = "false" ]; then
                 echo ""
-                echo "=== SparkSQLTest started! Waiting for employees file creation ==="
-                sleep 5  # Give it time to write the file
+                echo "=== SparkSQLTest started! Polling for employees file creation ==="
                 
-                # List all files in employees directory
-                echo "Listing employees directory..."
-                EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet')
+                # Poll for files to appear (max 30 seconds)
+                for i in {1..30}; do
+                  EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" 2>/dev/null | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet')
+                  
+                  if [ -n "$EMPLOYEES_FILES" ]; then
+                    echo "Files appeared after $i seconds!"
+                    echo "Found employees files, downloading ALL of them..."
+                    for FILE in $EMPLOYEES_FILES; do
+                      echo "Downloading: $FILE"
+                      curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}"
+                      if [ -f "$FILE" ] && [ -s "$FILE" ]; then
+                        FILE_SIZE=$(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null)
+                        echo "SUCCESS: Downloaded $FILE_SIZE bytes"
+                        cp "$FILE" test.parquet  # Overwritten per file: the last successful download is kept for analysis
+                        DOWNLOADED=true
+                      fi
+                    done
+                    break
+                  fi
+                  
+                  if [ $((i % 5)) -eq 0 ]; then
+                    echo "Still waiting for files... ($i/30)"
+                  fi
+                  sleep 1
+                done
                 
-                if [ -n "$EMPLOYEES_FILES" ]; then
-                  echo "Found employees files, downloading ALL of them..."
-                  for FILE in $EMPLOYEES_FILES; do
-                    echo "Downloading: $FILE"
-                    curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}"
-                    if [ -f "$FILE" ] && [ -s "$FILE" ]; then
-                      echo "SUCCESS: Downloaded $(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null) bytes"
-                      cp "$FILE" test.parquet  # Use first file for analysis
-                      DOWNLOADED=true
-                    fi
-                  done
+                if [ "$DOWNLOADED" = "false" ]; then
+                  echo "WARNING: No files found after 30 seconds of polling"
                 fi
               fi
             fi