name: Spark Integration Tests

on:
  push:
    paths:
      - 'test/java/spark/**'
      - 'other/java/hdfs2/**'
      - 'other/java/hdfs3/**'
      - 'other/java/client/**'
      - '.github/workflows/spark-integration-tests.yml'
  pull_request:
    paths:
      - 'test/java/spark/**'
      - 'other/java/hdfs2/**'
      - 'other/java/hdfs3/**'
      - 'other/java/client/**'
      - '.github/workflows/spark-integration-tests.yml'
  workflow_dispatch:

permissions:
  contents: read
  checks: write
  pull-requests: write

jobs:
  spark-integration-tests:
    name: Spark Integration Tests
    runs-on: ubuntu-latest
    timeout-minutes: 45

    steps:
      # ========================================
      # SETUP & BUILD
      # ========================================
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
          cache: maven

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'

      - name: Build SeaweedFS binary
        run: |
          echo "Building SeaweedFS binary (statically linked for Alpine)..."
          cd weed
          CGO_ENABLED=0 go build -o ../docker/weed
          cd ../docker
          ls -la weed filer.toml entrypoint.sh
          file weed
          echo "OK SeaweedFS binary built"

      - name: Build SeaweedFS Java dependencies
        run: |
          echo "Building Java client..."
          cd other/java/client
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK Java client built"
          cd ../../..

          echo "Building HDFS2 client..."
          cd other/java/hdfs2
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK HDFS2 client built"
          cd ../../..

          echo "Building HDFS3 client..."
          cd other/java/hdfs3
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK HDFS3 client built"

          echo ""
          echo "All Java dependencies installed to ~/.m2/repository"

      # ========================================
      # SPARK INTEGRATION TESTS (DOCKER)
      # ========================================
      - name: Start SeaweedFS services for tests
        working-directory: test/java/spark
        run: |
          echo "=== Starting SeaweedFS Services for Tests ==="
          docker compose down -v || true
          docker compose build --no-cache
          docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer

          echo "Waiting for services..."
          for i in {1..30}; do
            if curl -f http://localhost:8888/ > /dev/null 2>&1; then
              echo "OK SeaweedFS filer is ready!"
              break
            fi
            if [ $i -eq 30 ]; then
              echo "FAILED Services failed to start"
              docker compose ps -a
              docker compose logs
              exit 1
            fi
            echo "Waiting... ($i/30)"
            sleep 2
          done

          curl -f http://localhost:9333/cluster/status || exit 1
          echo "OK All services healthy"

      - name: Prepare Maven repository for Docker
        working-directory: test/java/spark
        run: |
          echo "Copying Maven artifacts for Docker container..."
          mkdir -p .m2/repository/com
          cp -r ~/.m2/repository/com/seaweedfs .m2/repository/com/
          echo "OK Maven artifacts ready"

      - name: Run Spark integration tests
        working-directory: test/java/spark
        continue-on-error: true
        id: test-run
        run: |
          echo "=== Running Spark Integration Tests ==="

          # Run tests in detached mode
          docker compose up -d spark-tests

          echo "Real-time monitoring: Will download file the instant EOF error appears..."

          # Monitor logs and download ALL employees files BEFORE they're deleted
          (
            DOWNLOADED=false
            while docker ps | grep -q seaweedfs-spark-tests; do
              # Check if an employees Parquet file has been written (we log this explicitly)
              if docker compose logs spark-tests 2>&1 | grep -q "PARQUET FILE WRITTEN TO EMPLOYEES"; then
                if [ "$DOWNLOADED" = "false" ]; then
                  echo ""
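                  # Downloading the part files now, while they still exist on the filer,
                  # lets the later analysis compare the file's actual size against the
                  # offsets recorded in its footer metadata to locate the 78 bytes the
                  # EOFException reports as missing.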
                  echo "=== EMPLOYEES FILE WRITTEN! Downloading immediately ==="

                  # Poll for files to appear (max 30 seconds)
                  for i in {1..30}; do
                    EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" 2>/dev/null | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet')
                    if [ -n "$EMPLOYEES_FILES" ]; then
                      echo "Files appeared after $i seconds!"
                      echo "Found employees files, downloading ALL of them..."

                      for FILE in $EMPLOYEES_FILES; do
                        echo "Downloading: $FILE"
                        curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}"
                        if [ -f "$FILE" ] && [ -s "$FILE" ]; then
                          FILE_SIZE=$(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null)
                          echo "SUCCESS: Downloaded $FILE_SIZE bytes"
                          cp "$FILE" test.parquet  # Use first file for analysis
                          DOWNLOADED=true
                        fi
                      done
                      break
                    fi

                    if [ $((i % 5)) -eq 0 ]; then
                      echo "Still waiting for files... ($i/30)"
                    fi
                    sleep 1
                  done

                  if [ "$DOWNLOADED" = "false" ]; then
                    echo "WARNING: No files found after 30 seconds of polling"
                  fi
                fi
              fi

              # Check if EOF error has appeared
              if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
                echo ""
                echo "=== EOF ERROR DETECTED! ==="

                if [ "$DOWNLOADED" = "true" ] && [ -f test.parquet ] && [ -s test.parquet ]; then
                  echo "File was already downloaded proactively!"
                  FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
                  echo "File size: $FILE_SIZE bytes"

                  # Analyze it
                  echo ""
                  echo "Installing parquet-tools..."
                  pip3 install -q parquet-tools

                  echo ""
                  echo "=== File Header (first 100 bytes) ==="
                  hexdump -C test.parquet | head -10

                  echo ""
                  echo "=== File Footer (last 200 bytes) ==="
                  tail -c 200 test.parquet | hexdump -C

                  echo ""
                  echo "=== Magic bytes check ==="
                  echo "First 4 bytes (should be PAR1):"
                  head -c 4 test.parquet | xxd
                  echo "Last 4 bytes (should be PAR1):"
                  tail -c 4 test.parquet | xxd

                  echo ""
                  echo "=== Parquet metadata ==="
                  parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"

                  echo ""
                  echo "=== Try reading data ==="
                  parquet-tools show test.parquet || echo "parquet-tools show failed"

                  echo ""
                  echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
                  echo "Actual file size: $FILE_SIZE bytes"
                  echo ""
                  echo "Examining column chunk offsets from metadata..."
                  parquet-tools meta test.parquet > meta.txt 2>&1 || true
                  cat meta.txt

                  echo ""
                  echo "Analyzing offset pattern..."
                  grep -i "offset" meta.txt || echo "No offset info"
                else
                  echo "ERROR: File was not downloaded proactively!"
                fi
                break
              fi

              sleep 1
            done
          ) &
          MONITOR_PID=$!

          # Wait for tests to complete
          docker wait seaweedfs-spark-tests
          TEST_EXIT_CODE=$(docker inspect seaweedfs-spark-tests --format='{{.State.ExitCode}}')

          # Give monitor time to finish
          sleep 3
          kill $MONITOR_PID 2>/dev/null || true

          # Show full logs
          echo ""
          echo "=== Test Logs ==="
          docker compose logs spark-tests | tail -100

          echo ""
          echo "Tests completed with exit code: $TEST_EXIT_CODE"
          echo "exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
          exit $TEST_EXIT_CODE

      - name: Examine Parquet file
        if: steps.test-run.outcome == 'failure'
        working-directory: test/java/spark
        run: |
          echo "=== Examining Parquet file for analysis ==="

          # Check if file was already downloaded
          if [ ! -f test.parquet ] || [ ! -s test.parquet ]; then
            echo "ERROR: test.parquet not found or empty"
            echo "File was not successfully downloaded during test run"
            exit 1
          fi

          echo "Found test.parquet, proceeding with analysis..."
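          # Parquet layout reminder for the dumps below: a well-formed file starts
          # with the 4-byte magic "PAR1" and ends with the Thrift-encoded footer
          # metadata, a 4-byte little-endian footer length, and a trailing "PAR1".
          # A missing or truncated trailer usually means the writer never completed
          # its final flush.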
          # Install parquet-tools
          pip3 install parquet-tools

          echo ""
          echo "=== File Size ==="
          ls -lh test.parquet
          FILE_SIZE=$(stat -f%z test.parquet 2>/dev/null || stat -c%s test.parquet)
          echo "Actual file size: $FILE_SIZE bytes"

          echo ""
          echo "=== File Header (first 100 bytes) ==="
          hexdump -C test.parquet | head -10

          echo ""
          echo "=== File Footer (last 200 bytes) ==="
          tail -c 200 test.parquet | hexdump -C

          echo ""
          echo "=== Magic Bytes Check ==="
          echo "First 4 bytes (should be PAR1):"
          head -c 4 test.parquet | xxd
          echo "Last 4 bytes (should be PAR1):"
          tail -c 4 test.parquet | xxd

          echo ""
          echo "=== Parquet Metadata ==="
          parquet-tools inspect test.parquet || echo "parquet-tools failed"

          echo ""
          echo "=== Try Reading with Parquet Tools ==="
          parquet-tools show test.parquet || echo "Failed to read file"

          echo ""
          echo "=== File Validation ==="
          if head -c 4 test.parquet | grep -q "PAR1"; then
            echo "OK Valid Parquet header"
          else
            echo "FAILED INVALID Parquet header"
          fi

          if tail -c 4 test.parquet | grep -q "PAR1"; then
            echo "OK Valid Parquet trailer"
          else
            echo "FAILED INVALID Parquet trailer"
          fi

      - name: Stop test services
        if: always()
        working-directory: test/java/spark
        run: docker compose down -v

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: spark-test-results
          path: test/java/spark/target/surefire-reports/
          retention-days: 30

      - name: Upload Parquet file for analysis
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: failed-parquet-file
          path: test/java/spark/test.parquet
          retention-days: 7
          if-no-files-found: ignore

      - name: Publish test report
        if: always()
        uses: dorny/test-reporter@v1
        with:
          name: Spark Test Results
          path: test/java/spark/target/surefire-reports/*.xml
          reporter: java-junit
          fail-on-error: true

      - name: Check test results
        if: steps.test-run.outcome == 'failure'
        run: |
          echo "ERROR Tests failed with exit code: ${{ steps.test-run.outputs.exit_code }}"
          echo "But file analysis was completed above."
          exit 1

      # ========================================
      # SPARK EXAMPLE (HOST-BASED)
      # ========================================
      - name: Cache Apache Spark
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        id: cache-spark
        uses: actions/cache@v4
        with:
          path: spark-3.5.0-bin-hadoop3
          key: spark-3.5.0-hadoop3

      - name: Download Apache Spark
        if: (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && steps.cache-spark.outputs.cache-hit != 'true'
        run: |
          echo "Downloading Apache Spark 3.5.0..."
          wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
          tar xzf spark-3.5.0-bin-hadoop3.tgz
          echo "OK Spark downloaded"

      - name: Start SeaweedFS services for example
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        working-directory: test/java/spark
        run: |
          echo "=== Starting SeaweedFS Services for Example ==="
          docker compose down -v || true
          docker compose build --no-cache
          docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer

          echo "Waiting for services..."
          for i in {1..30}; do
            if curl -f http://localhost:8888/ > /dev/null 2>&1; then
              echo "OK SeaweedFS filer is ready!"
              break
            fi
            if [ $i -eq 30 ]; then
              echo "FAILED Services failed to start"
              docker compose ps -a
              docker compose logs
              exit 1
            fi
            echo "Waiting... ($i/30)"
            sleep 2
          done

          curl -f http://localhost:9333/cluster/status || exit 1
          echo "OK All services healthy"
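      # The example steps below run spark-submit from the host against the same
      # compose stack; the spark.hadoop.fs.* settings map the seaweedfs:// scheme
      # onto seaweed.hdfs.SeaweedFileSystem (presumably provided by the hdfs3
      # client built earlier) and point it at the filer's HTTP port 8888 and
      # gRPC port 18888.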
($i/30)" sleep 2 done curl -f http://localhost:9333/cluster/status || exit 1 echo "OK All services healthy" - name: Build project for example if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' working-directory: test/java/spark run: mvn clean package -DskipTests - name: Run Spark example application if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' working-directory: test/java/spark run: | echo "=== Running Spark Example Application ===" export SPARK_HOME=$(pwd)/../../../spark-3.5.0-bin-hadoop3 $SPARK_HOME/bin/spark-submit \ --class seaweed.spark.SparkSeaweedFSExample \ --master local[2] \ --conf spark.hadoop.fs.seaweedfs.impl=seaweed.hdfs.SeaweedFileSystem \ --conf spark.hadoop.fs.seaweed.filer.host=localhost \ --conf spark.hadoop.fs.seaweed.filer.port=8888 \ --conf spark.hadoop.fs.seaweed.filer.port.grpc=18888 \ --conf spark.hadoop.fs.seaweed.replication="" \ target/seaweedfs-spark-integration-tests-1.0-SNAPSHOT.jar \ seaweedfs://localhost:8888/ci-spark-output echo "OK Example completed" - name: Verify example output if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' run: | echo "Verifying output..." curl -s http://localhost:8888/ci-spark-output/ || echo "Output listing unavailable" - name: Stop example services if: always() && (github.event_name == 'push' || github.event_name == 'workflow_dispatch') working-directory: test/java/spark run: docker compose down -v # ======================================== # DIAGNOSTICS # ======================================== - name: Display diagnostics on failure if: failure() working-directory: test/java/spark run: | echo "=== Container Status ===" docker compose ps -a echo "" echo "=== Master Logs ===" docker compose logs seaweedfs-master echo "" echo "=== Volume Logs ===" docker compose logs seaweedfs-volume echo "" echo "=== Filer Logs ===" docker compose logs seaweedfs-filer echo "" echo "=== Volume List ===" docker compose exec -T seaweedfs-master weed shell <