From 588e29ae57fc655af188d5bbc6785eeed68db16c Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 14:14:45 -0800 Subject: [PATCH] debug: improve file download with better diagnostics and fallbacks Problem: File download step shows 'No Parquet files found' even though ports are exposed (8888:8888) and services are running. Improvements: 1. Show the raw curl output to see the actual API response 2. Use an improved grep pattern with -oP for better parsing 3. Add a fallback that fetches the file with 'weed filer.cat' via docker compose exec if the HTTP download leaves test.parquet missing or empty 4. If no files are found via HTTP, list the directory with curl from inside the filer container (docker compose exec) 5. In that same case, also list the directory with weed shell 'fs.ls' This will help us understand: - Is the HTTP API returning files in an unexpected format? - Are files accessible from inside the container but not outside? - Are files in a different path than expected? At least one of these methods should reveal where the files are. --- .github/workflows/spark-integration-tests.yml | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml index f78ab6c8b..365a7210d 100644 --- a/.github/workflows/spark-integration-tests.yml +++ b/.github/workflows/spark-integration-tests.yml @@ -157,14 +157,29 @@ jobs: # List available files echo "Available Parquet files:" + echo "Checking: http://localhost:8888/test-spark/employees/" curl -s http://localhost:8888/test-spark/employees/?pretty=y | tee files.json + echo "" + echo "Raw file listing:" + curl -s http://localhost:8888/test-spark/employees/ + + echo "" + echo "Searching for .parquet files..." 
# Download a Parquet file - PARQUET_FILE=$(curl -s http://localhost:8888/test-spark/employees/?pretty=y | grep -o 'part-[^"]*\.parquet' | head -1) + PARQUET_FILE=$(curl -s http://localhost:8888/test-spark/employees/ | grep -oP '(?<=")part-[^"]*\.parquet(?=")' | head -1) + echo "Found file: $PARQUET_FILE" + if [ -n "$PARQUET_FILE" ]; then echo "Downloading: $PARQUET_FILE" curl -o test.parquet "http://localhost:8888/test-spark/employees/$PARQUET_FILE" + if [ ! -f test.parquet ] || [ ! -s test.parquet ]; then + echo "⚠️ Failed to download via HTTP, trying direct volume access..." + # Find the actual file ID from filer + docker compose exec -T seaweedfs-filer weed filer.cat -dir=/test-spark/employees/ -name="$PARQUET_FILE" > test.parquet + fi + echo "" echo "=== File Size ===" ls -lh test.parquet @@ -208,7 +223,17 @@ jobs: echo "✗ INVALID Parquet trailer" fi else - echo "No Parquet files found" + echo "❌ No Parquet files found via HTTP API" + echo "" + echo "Trying alternative: list files via docker exec..." + docker compose exec -T seaweedfs-filer sh -c 'curl -s http://localhost:8888/test-spark/employees/' || echo "Docker exec failed" + + echo "" + echo "Trying: weed shell to list files..." + docker compose exec -T seaweedfs-master weed shell <<'SHELL_EOF' || echo "weed shell failed" +fs.ls /test-spark/employees/ +exit +SHELL_EOF fi - name: Stop test services