From c774b807e10024c0ef770808a820335b995af35e Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 21:23:33 -0800 Subject: [PATCH] fix: search temporary directories for Parquet files The issue: Files written to employees/ but immediately moved/deleted by Spark Spark's file commit process: 1. Write to: employees/_temporary/0/_temporary/attempt_xxx/part-xxx.parquet 2. Commit/rename to: employees/part-xxx.parquet 3. Read and delete (on failure) By the time we check employees/, the file is already gone! Solution: Search multiple locations - employees/ (final location) - employees/_temporary/ (intermediate) - employees/_temporary/0/_temporary/ (write location) - Recursive search as fallback Also: - Extract exact filename from write log - Try all locations until we find the file - Show directory listings for debugging This should catch files in their temporary location before Spark moves them! --- .github/workflows/spark-integration-tests.yml | 52 ++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml index fe489e506..bbf4c1d5e 100644 --- a/.github/workflows/spark-integration-tests.yml +++ b/.github/workflows/spark-integration-tests.yml @@ -135,36 +135,50 @@ jobs: if docker compose logs spark-tests 2>&1 | grep -q "PARQUET FILE WRITTEN TO EMPLOYEES"; then if [ "$DOWNLOADED" = "false" ]; then echo "" - echo "=== EMPLOYEES FILE WRITTEN! Downloading immediately ===" + echo "=== EMPLOYEES FILE WRITTEN! Extracting from logs and downloading ===" - # Poll for files to appear (max 30 seconds) - for i in {1..30}; do - EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" 2>/dev/null | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet') + # Extract the EXACT filename from the write log + FULL_LOG=$(docker compose logs spark-tests 2>&1) + WRITTEN_FILE=$(echo "$FULL_LOG" | grep "PARQUET FILE WRITTEN TO EMPLOYEES" | tail -1 | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet') + + echo "Filename from log: $WRITTEN_FILE" + + if [ -z "$WRITTEN_FILE" ]; then + echo "ERROR: Could not extract filename from write log" + echo "Showing write log lines:" + echo "$FULL_LOG" | grep "PARQUET FILE WRITTEN" | tail -5 + fi + + # Try multiple locations: final, temporary, _temporary subdirs + for BASE_DIR in "employees" "employees/_temporary" "employees/_temporary/0/_temporary"; do + echo "Trying: /test-spark/$BASE_DIR/" - if [ -n "$EMPLOYEES_FILES" ]; then - echo "Files appeared after $i seconds!" - echo "Found employees files, downloading ALL of them..." - for FILE in $EMPLOYEES_FILES; do - echo "Downloading: $FILE" - curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}" + # List what's actually there + LISTING=$(curl -s "http://localhost:8888/test-spark/$BASE_DIR/" 2>/dev/null) + FILES=$(echo "$LISTING" | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet' || true) + + if [ -n "$FILES" ]; then + echo "Found files in $BASE_DIR:" + echo "$FILES" + + for FILE in $FILES; do + echo "Downloading: $FILE from $BASE_DIR" + curl -o "${FILE}" "http://localhost:8888/test-spark/$BASE_DIR/$FILE" if [ -f "$FILE" ] && [ -s "$FILE" ]; then FILE_SIZE=$(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null) - echo "SUCCESS: Downloaded $FILE_SIZE bytes" - cp "$FILE" test.parquet # Use first file for analysis + echo "SUCCESS: Downloaded $FILE_SIZE bytes from $BASE_DIR" + cp "$FILE" test.parquet DOWNLOADED=true + break 2 # Break out of both loops fi done - break - fi - - if [ $((i % 5)) -eq 0 ]; then - echo "Still waiting for files... ($i/30)" fi - sleep 1 done if [ "$DOWNLOADED" = "false" ]; then - echo "WARNING: No files found after 30 seconds of polling" + echo "WARNING: File not found in any location" + echo "Trying recursive search..." + curl -s "http://localhost:8888/test-spark/employees/?recursive=true" | grep parquet || true fi fi fi