name: Spark Integration Tests

on:
  push:
    paths:
      - 'test/java/spark/**'
      - 'other/java/hdfs2/**'
      - 'other/java/hdfs3/**'
      - 'other/java/client/**'
      - '.github/workflows/spark-integration-tests.yml'
  pull_request:
    paths:
      - 'test/java/spark/**'
      - 'other/java/hdfs2/**'
      - 'other/java/hdfs3/**'
      - 'other/java/client/**'
      - '.github/workflows/spark-integration-tests.yml'
  workflow_dispatch:

permissions:
  contents: read
  checks: write
  pull-requests: write
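
# The checks/pull-requests write permissions let the dorny/test-reporter step
# below publish check runs and annotations on pull requests.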
jobs:
  spark-integration-tests:
    name: Spark Integration Tests
    runs-on: ubuntu-latest
    timeout-minutes: 45

    steps:
      # ========================================
      # SETUP & BUILD
      # ========================================
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
          cache: maven

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'

      - name: Build SeaweedFS binary
        run: |
          echo "Building SeaweedFS binary (statically linked for Alpine)..."
          cd weed
          CGO_ENABLED=0 go build -o ../docker/weed
          cd ../docker
          ls -la weed filer.toml entrypoint.sh
          file weed
          echo "OK SeaweedFS binary built"

      - name: Build SeaweedFS Java dependencies
        run: |
          echo "Building Java client..."
          cd other/java/client
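          # -Dgpg.skip and -Dcentral.publishing.skip stop Maven from trying to
          # sign or publish the artifacts; CI only needs them installed to ~/.m2.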
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK Java client built"
          cd ../../..

          echo "Building HDFS2 client..."
          cd other/java/hdfs2
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK HDFS2 client built"
          cd ../../..

          echo "Building HDFS3 client..."
          cd other/java/hdfs3
          mvn clean install -U -DskipTests -Dgpg.skip=true -Dcentral.publishing.skip=true
          echo "OK HDFS3 client built"
          echo ""
          echo "All Java dependencies installed to ~/.m2/repository"

      # ========================================
      # SPARK INTEGRATION TESTS (DOCKER)
      # ========================================
      - name: Start SeaweedFS services for tests
        working-directory: test/java/spark
        run: |
          echo "=== Starting SeaweedFS Services for Tests ==="
          docker compose down -v || true
          docker compose build --no-cache
          docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer

          echo "Waiting for services..."
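          # Poll the filer HTTP port for up to ~60s (30 tries x 2s).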
          for i in {1..30}; do
            if curl -f http://localhost:8888/ > /dev/null 2>&1; then
              echo "OK SeaweedFS filer is ready!"
              break
            fi
            if [ $i -eq 30 ]; then
              echo "FAILED Services failed to start"
              docker compose ps -a
              docker compose logs
              exit 1
            fi
            echo "Waiting... ($i/30)"
            sleep 2
          done

          curl -f http://localhost:9333/cluster/status || exit 1
          echo "OK All services healthy"

      - name: Prepare Maven repository for Docker
        working-directory: test/java/spark
        run: |
          echo "Copying Maven artifacts for Docker container..."
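          # The compose file is assumed to bind-mount ./.m2 into the test
          # container, so the freshly built SeaweedFS artifacts resolve there.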
          mkdir -p .m2/repository/com
          cp -r ~/.m2/repository/com/seaweedfs .m2/repository/com/
          echo "OK Maven artifacts ready"
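
      # continue-on-error lets the analysis and artifact-upload steps below run
      # even when the tests fail; the "Check test results" step re-raises the failure.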
      - name: Run Spark integration tests
        working-directory: test/java/spark
        continue-on-error: true
        id: test-run
        run: |
          echo "=== Running Spark Integration Tests ==="
          # Run tests in detached mode
          docker compose up -d spark-tests

          echo "Real-time monitoring: will download the file the instant the EOF error appears..."

          # Monitor logs and download chunk data DIRECTLY from the volume server
          (
            while docker ps | grep -q seaweedfs-spark-tests; do
              # Check if the EOF error has appeared
              if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
                echo ""
                echo "=== EOF ERROR DETECTED! Extracting chunk data ==="

                # Get the full log and extract the EXACT file causing the error
                FULL_LOG=$(docker compose logs spark-tests 2>&1)

                # Extract the failing filename from the EOF error message
                # The error message format: "...seaweedfs://seaweedfs-filer:8888/test-spark/employees/part-xxx.parquet..."
                FAILING_FILE=$(echo "$FULL_LOG" | grep -B 5 "EOFException.*78 bytes" | grep "seaweedfs://" | grep -oP 'part-[a-f0-9-]+\.c000\.snappy\.parquet' | head -1)
                echo "Failing file: $FAILING_FILE"

                if [ -z "$FAILING_FILE" ]; then
                  echo "ERROR: Could not extract failing filename from error message"
                  echo "Searching for error message pattern..."
                  echo "$FULL_LOG" | grep -A 2 "EOFException.*78 bytes" | head -20
                  break
                fi

                # Now find the chunk info for THIS SPECIFIC FILE
                # The file is being READ when the error occurs, so look for SeaweedInputStream opening it
                echo "Searching logs for when $FAILING_FILE was opened for reading..."

                # Find all instances where this file is mentioned and get nearby chunk info
                # Strategy: search for the filename, then look for "chunks {" blocks near it
                CHUNK_CONTEXT=$(echo "$FULL_LOG" | grep -A 100 "new path:.*$FAILING_FILE")

                if [ -n "$CHUNK_CONTEXT" ]; then
                  echo "Found read context for file"
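                  # grep -oP '"\K[^"]+' keeps only the text inside the quotes, i.e. the chunk file_id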
                  CHUNK_ID=$(echo "$CHUNK_CONTEXT" | head -30 | grep ' file_id: "' | head -1 | grep -oP '"\K[^"]+')
                else
                  echo "No read context, trying write context..."
                  # Maybe it's in the write logs
                  CHUNK_CONTEXT=$(echo "$FULL_LOG" | grep -B 50 -A 20 "$FAILING_FILE")
                  CHUNK_ID=$(echo "$CHUNK_CONTEXT" | grep ' file_id: "' | head -1 | grep -oP '"\K[^"]+')
                fi
                echo "Found chunk ID: $CHUNK_ID"

                if [ -n "$CHUNK_ID" ]; then
                  # Download directly from the volume server (chunk data persists there even after the filer metadata is deleted)
                  echo "Downloading chunk from volume server: http://localhost:8080/$CHUNK_ID"
                  curl -v -o test.parquet "http://localhost:8080/$CHUNK_ID"

                  if [ -f test.parquet ] && [ -s test.parquet ]; then
                    FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
                    echo "SUCCESS: Downloaded $FILE_SIZE bytes from volume!"
                    ls -lh test.parquet

                    # Quick analysis
                    echo ""
                    echo "Installing parquet-tools..."
                    pip3 install -q parquet-tools

                    echo ""
                    echo "=== File Header (first 100 bytes) ==="
                    hexdump -C test.parquet | head -10

                    echo ""
                    echo "=== File Footer (last 200 bytes) ==="
                    tail -c 200 test.parquet | hexdump -C

                    echo ""
                    echo "=== Magic bytes check ==="
                    echo "First 4 bytes (should be PAR1):"
                    head -c 4 test.parquet | xxd
                    echo "Last 4 bytes (should be PAR1):"
                    tail -c 4 test.parquet | xxd

                    echo ""
                    echo "=== Parquet metadata ==="
                    parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"

                    echo ""
                    echo "=== Try reading data ==="
                    parquet-tools show test.parquet || echo "parquet-tools show failed"

                    echo ""
                    echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
                    echo "Actual file size: $FILE_SIZE bytes"

                    # Parse the footer to find what size Parquet thinks the file should be
                    echo ""
                    echo "Reading footer length (last 8 bytes)..."
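                    # The last 8 bytes of a Parquet file are a 4-byte little-endian
                    # footer length plus the "PAR1" magic, so read this hex value byte-reversed.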
                    FOOTER_LEN_HEX=$(tail -c 8 test.parquet | head -c 4 | xxd -p)
                    echo "Footer length (hex): $FOOTER_LEN_HEX"

                    # Get the highest offset from the column metadata
                    echo ""
                    echo "Examining column chunk offsets from metadata..."
                    parquet-tools meta test.parquet > meta.txt 2>&1 || true
                    cat meta.txt

                    echo ""
                    echo "Analyzing offset pattern..."
                    grep -i "offset" meta.txt || echo "No offset info"

                    echo ""
                    echo "Expected file size based on Parquet metadata:"
                    echo "  If the Parquet reader expects $((FILE_SIZE + 78)) bytes,"
                    echo "  then the column chunks claim offsets beyond the actual data"

                    echo ""
                    echo "=== Download the file as artifact for local analysis ==="
                    ls -lh test.parquet
                  else
                    echo "FAILED: Could not download chunk"
                  fi
                else
                  echo "ERROR: Could not extract chunk ID from logs"
                fi
                break
              fi
              sleep 1
            done
          ) &
          MONITOR_PID=$!

          # Wait for tests to complete
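          # docker wait blocks until the container exits; the exit code is then
          # read back via docker inspect.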
          docker wait seaweedfs-spark-tests
          TEST_EXIT_CODE=$(docker inspect seaweedfs-spark-tests --format='{{.State.ExitCode}}')

          # Give the monitor time to finish
          sleep 3
          kill $MONITOR_PID 2>/dev/null || true

          # Show the full logs
          echo ""
          echo "=== Test Logs ==="
          docker compose logs spark-tests | tail -100

          echo ""
          echo "Tests completed with exit code: $TEST_EXIT_CODE"
          echo "exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT

          exit $TEST_EXIT_CODE

      - name: Examine Parquet file
        if: steps.test-run.outcome == 'failure'
        working-directory: test/java/spark
        run: |
          echo "=== Examining Parquet file for analysis ==="

          # Check whether the file was already downloaded by the monitor
          if [ ! -f test.parquet ] || [ ! -s test.parquet ]; then
            echo "ERROR: test.parquet not found or empty"
            echo "File was not successfully downloaded during the test run"
            exit 1
          fi

          echo "Found test.parquet, proceeding with analysis..."

          # Install parquet-tools
          pip3 install parquet-tools

          echo ""
          echo "=== File Size ==="
          ls -lh test.parquet
          FILE_SIZE=$(stat -f%z test.parquet 2>/dev/null || stat -c%s test.parquet)
          echo "Actual file size: $FILE_SIZE bytes"

          echo ""
          echo "=== File Header (first 100 bytes) ==="
          hexdump -C test.parquet | head -10

          echo ""
          echo "=== File Footer (last 200 bytes) ==="
          tail -c 200 test.parquet | hexdump -C

          echo ""
          echo "=== Magic Bytes Check ==="
          echo "First 4 bytes (should be PAR1):"
          head -c 4 test.parquet | xxd
          echo "Last 4 bytes (should be PAR1):"
          tail -c 4 test.parquet | xxd

          echo ""
          echo "=== Parquet Metadata ==="
          parquet-tools inspect test.parquet || echo "parquet-tools failed"

          echo ""
          echo "=== Try Reading with Parquet Tools ==="
          parquet-tools show test.parquet || echo "Failed to read file"

          echo ""
          echo "=== File Validation ==="
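          # A structurally intact Parquet file starts and ends with the 4-byte "PAR1" magic.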
          if head -c 4 test.parquet | grep -q "PAR1"; then
            echo "OK Valid Parquet header"
          else
            echo "FAILED INVALID Parquet header"
          fi

          if tail -c 4 test.parquet | grep -q "PAR1"; then
            echo "OK Valid Parquet trailer"
          else
            echo "FAILED INVALID Parquet trailer"
          fi

      - name: Stop test services
        if: always()
        working-directory: test/java/spark
        run: docker compose down -v

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: spark-test-results
          path: test/java/spark/target/surefire-reports/
          retention-days: 30

      - name: Upload Parquet file for analysis
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: failed-parquet-file
          path: test/java/spark/test.parquet
          retention-days: 7
          if-no-files-found: ignore

      - name: Publish test report
        if: always()
        uses: dorny/test-reporter@v1
        with:
          name: Spark Test Results
          path: test/java/spark/target/surefire-reports/*.xml
          reporter: java-junit
          fail-on-error: true

      - name: Check test results
        if: steps.test-run.outcome == 'failure'
        run: |
          echo "ERROR Tests failed with exit code: ${{ steps.test-run.outputs.exit_code }}"
          echo "But file analysis was completed above."
          exit 1

      # ========================================
      # SPARK EXAMPLE (HOST-BASED)
      # ========================================
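      # The host-based example below runs only on push/workflow_dispatch,
      # presumably to keep pull-request runs limited to the Docker-based tests.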
      - name: Cache Apache Spark
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        id: cache-spark
        uses: actions/cache@v4
        with:
          path: spark-3.5.0-bin-hadoop3
          key: spark-3.5.0-hadoop3

      - name: Download Apache Spark
        if: (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && steps.cache-spark.outputs.cache-hit != 'true'
        run: |
          echo "Downloading Apache Spark 3.5.0..."
          wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
          tar xzf spark-3.5.0-bin-hadoop3.tgz
          echo "OK Spark downloaded"

      - name: Start SeaweedFS services for example
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        working-directory: test/java/spark
        run: |
          echo "=== Starting SeaweedFS Services for Example ==="
          docker compose down -v || true
          docker compose build --no-cache
          docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer

          echo "Waiting for services..."
          for i in {1..30}; do
            if curl -f http://localhost:8888/ > /dev/null 2>&1; then
              echo "OK SeaweedFS filer is ready!"
              break
            fi
            if [ $i -eq 30 ]; then
              echo "FAILED Services failed to start"
              docker compose ps -a
              docker compose logs
              exit 1
            fi
            echo "Waiting... ($i/30)"
            sleep 2
          done

          curl -f http://localhost:9333/cluster/status || exit 1
          echo "OK All services healthy"

      - name: Build project for example
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        working-directory: test/java/spark
        run: mvn clean package -DskipTests

      - name: Run Spark example application
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        working-directory: test/java/spark
        run: |
          echo "=== Running Spark Example Application ==="
          export SPARK_HOME=$(pwd)/../../../spark-3.5.0-bin-hadoop3
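          # fs.seaweed.filer.port.grpc follows SeaweedFS's usual convention of
          # gRPC port = HTTP port + 10000 (8888 -> 18888).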
          $SPARK_HOME/bin/spark-submit \
            --class seaweed.spark.SparkSeaweedFSExample \
            --master local[2] \
            --conf spark.hadoop.fs.seaweedfs.impl=seaweed.hdfs.SeaweedFileSystem \
            --conf spark.hadoop.fs.seaweed.filer.host=localhost \
            --conf spark.hadoop.fs.seaweed.filer.port=8888 \
            --conf spark.hadoop.fs.seaweed.filer.port.grpc=18888 \
            --conf spark.hadoop.fs.seaweed.replication="" \
            target/seaweedfs-spark-integration-tests-1.0-SNAPSHOT.jar \
            seaweedfs://localhost:8888/ci-spark-output
          echo "OK Example completed"

      - name: Verify example output
        if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
        run: |
          echo "Verifying output..."
          curl -s http://localhost:8888/ci-spark-output/ || echo "Output listing unavailable"

      - name: Stop example services
        if: always() && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
        working-directory: test/java/spark
        run: docker compose down -v

      # ========================================
      # DIAGNOSTICS
      # ========================================
      - name: Display diagnostics on failure
        if: failure()
        working-directory: test/java/spark
        run: |
          echo "=== Container Status ==="
          docker compose ps -a
          echo ""
          echo "=== Master Logs ==="
          docker compose logs seaweedfs-master
          echo ""
          echo "=== Volume Logs ==="
          docker compose logs seaweedfs-volume
          echo ""
          echo "=== Filer Logs ==="
          docker compose logs seaweedfs-filer
          echo ""
          echo "=== Volume List ==="
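          # Run volume.list through a scripted "weed shell" session on the master.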
          docker compose exec -T seaweedfs-master weed shell <<EOF || echo "Failed"
          volume.list
          exit
          EOF
          echo ""
          echo "=== Cluster Status ==="
          curl -s http://localhost:9333/dir/status | jq '.' || curl -s http://localhost:9333/dir/status