10 changed files with 1 addition and 841 deletions
- 226  .github/workflows/spark-integration-tests.yml
-  39  test/java/spark/ReadParquetMeta.java
-  38  test/java/spark/TEST_ALL_THREE_MODES.sh
-  50  test/java/spark/capture-parquet.sh
- 180  test/java/spark/download_and_test.sh
-  34  test/java/spark/patch-parquet.sh
-  40  test/java/spark/test-one.sh
-  55  test/java/spark/test_parquet_external_read.sh
-  60  test/java/spark/test_parquet_readability.sh
- 120  test/java/spark/test_with_readers.sh
test/java/spark/ReadParquetMeta.java
@@ -1,39 +0,0 @@
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ReadParquetMeta {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        HadoopInputFile inputFile = HadoopInputFile.fromPath(path, conf);

        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            ParquetMetadata meta = reader.getFooter();

            System.out.println("=== Parquet File Metadata ===");
            System.out.println("Blocks (row groups): " + meta.getBlocks().size());
            System.out.println("File size from footer: " + inputFile.getLength());
            System.out.println("");

            meta.getBlocks().forEach(block -> {
                System.out.println("Row Group:");
                System.out.println(" Rows: " + block.getRowCount());
                System.out.println(" Total byte size: " + block.getTotalByteSize());
                System.out.println(" Columns: " + block.getColumns().size());
                System.out.println("");

                block.getColumns().forEach(col -> {
                    System.out.println(" Column: " + col.getPath());
                    System.out.println(" First data page offset: " + col.getFirstDataPageOffset());
                    System.out.println(" Dictionary page offset: " + col.getDictionaryPageOffset());
                    System.out.println(" Total size: " + col.getTotalSize());
                    System.out.println(" Total uncompressed size: " + col.getTotalUncompressedSize());
                    System.out.println("");
                });
            });
        }
    }
}

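For reference, the inspector above can be compiled and run on its own against a downloaded file. A rough sketch, assuming the parquet-hadoop and Hadoop client jars are already present in the local Maven repository; the classpath construction here is illustrative and not part of the test project:

# illustrative only: build a classpath from whatever matching jars ~/.m2 already holds
CP=$(find ~/.m2/repository \( -name 'parquet-*.jar' -o -name 'hadoop-*.jar' \) | tr '\n' ':')
javac -cp "$CP" ReadParquetMeta.java
java -cp ".:$CP" ReadParquetMeta /tmp/test.parquet
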
test/java/spark/TEST_ALL_THREE_MODES.sh
@@ -1,38 +0,0 @@
#!/bin/bash
set -e

echo "=========================================="
echo "Testing All Three Debug Modes"
echo "=========================================="
echo ""

cd /Users/chrislu/go/src/github.com/seaweedfs/seaweedfs/test/java/spark

# Mode 1: SEAWEED_ONLY (default)
echo "=== MODE 1: SEAWEED_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  spark-tests bash -c 'cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""

# Mode 2: LOCAL_ONLY
echo "=== MODE 2: LOCAL_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  -e SEAWEEDFS_DEBUG_MODE=LOCAL_ONLY \
  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-local \
  spark-tests bash -c 'mkdir -p /workspace/target/debug-local && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException|length is too low" | tail -5
echo ""

# Mode 3: DUAL_COMPARE
echo "=== MODE 3: DUAL_COMPARE ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  -e SEAWEEDFS_DEBUG_MODE=DUAL_COMPARE \
  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-dual \
  spark-tests bash -c 'mkdir -p /workspace/target/debug-dual && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""

echo "=========================================="
echo "Test Summary"
echo "=========================================="

test/java/spark/capture-parquet.sh
@@ -1,50 +0,0 @@
#!/bin/bash
# Run Spark test and capture the Parquet file before cleanup

echo "Starting SeaweedFS services..."
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer
sleep 10

echo "Running Spark test in background..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c "mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1" > /tmp/spark-test-capture.log &
TEST_PID=$!

echo "Monitoring for Parquet file creation..."
while kill -0 $TEST_PID 2>/dev/null; do
  # Check if employees directory exists
  FILES=$(curl -s http://localhost:8888/test-spark/employees/ 2>/dev/null | grep -o 'part-[^"]*\.parquet' || echo "")
  if [ -n "$FILES" ]; then
    echo "Found Parquet file(s)!"
    for FILE in $FILES; do
      echo "Downloading: $FILE"
      curl -s "http://localhost:8888/test-spark/employees/$FILE" > "/tmp/$FILE"
      FILE_SIZE=$(stat -f%z "/tmp/$FILE" 2>/dev/null || stat --format=%s "/tmp/$FILE" 2>/dev/null)
      echo "Downloaded $FILE: $FILE_SIZE bytes"

      if [ -f "/tmp/$FILE" ] && [ $FILE_SIZE -gt 0 ]; then
        echo "SUCCESS: Captured $FILE"
        echo "Installing parquet-tools..."
        pip3 install -q parquet-tools 2>/dev/null || echo "parquet-tools might already be installed"

        echo ""
        echo "=== Parquet File Metadata ==="
        python3 -m parquet_tools meta "/tmp/$FILE" || echo "parquet-tools failed"

        echo ""
        echo "=== File Header (first 100 bytes) ==="
        hexdump -C "/tmp/$FILE" | head -10

        echo ""
        echo "=== File Footer (last 100 bytes) ==="
        tail -c 100 "/tmp/$FILE" | hexdump -C

        kill $TEST_PID 2>/dev/null
        exit 0
      fi
    done
  fi
  sleep 0.5
done

echo "Test completed, checking logs..."
tail -50 /tmp/spark-test-capture.log

test/java/spark/download_and_test.sh
@@ -1,180 +0,0 @@
#!/bin/bash
set -e

echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""

# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running"
sleep 3

# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!

# Wait a bit for file to be written
sleep 8

# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')

if [ -z "$TEMP_FILE" ]; then
  echo "Waiting for file to be written..."
  sleep 5
  TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
fi

if [ -z "$TEMP_FILE" ]; then
  echo "ERROR: No Parquet file found!"
  echo "Checking what files exist..."
  docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
  wait $WRITE_PID
  exit 1
fi

echo "Found: $TEMP_FILE"

# Copy file from container
echo "3. Copying file from container..."
docker compose cp seaweedfs-filer:$TEMP_FILE /tmp/spark_written.parquet 2>&1 | grep -v "Successfully"

# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo $TEMP_FILE | sed 's|/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1

# Use whichever file is larger/valid
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
  cp /tmp/spark_written.parquet /tmp/test.parquet
  echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
  cp /tmp/spark_written_http.parquet /tmp/test.parquet
  echo "Using file downloaded via HTTP"
else
  echo "ERROR: Failed to get file!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""

# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true

# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""

# 1. Check magic bytes
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p

FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
  echo " ✅ Valid PAR1 magic bytes"
else
  echo " ❌ Invalid magic bytes!"
fi
echo ""

# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f" Schema: {table.schema}")
    print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f" ✅ SUCCESS: Read {len(result)} rows")
    print(f" Data: {result}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f" Columns: {list(df.columns)}")
    print(f" Data:\n{df}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");

        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println(" Row " + count + ": " + group);
                count++;
            }
            System.out.println(" ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println(" ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF

# Copy the file into container
cat > /tmp/test.parquet
' < /tmp/test.parquet 2>&1 | head -1

echo ""
echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."

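The summary above separates "file is valid" from "footer metadata is corrupted". One more quick check in the same spirit, not part of the original script: the 4 bytes immediately before the trailing PAR1 magic store the footer (Thrift metadata) length as a little-endian integer, which is what "length is too low" style errors refer to. A minimal bash sketch; the path and variable names are illustrative:

FILE=/tmp/test.parquet
SIZE=$(stat -f%z "$FILE" 2>/dev/null || stat --format=%s "$FILE" 2>/dev/null)
# last 8 bytes = 4-byte little-endian footer length + "PAR1"
LEN_HEX=$(tail -c 8 "$FILE" | head -c 4 | xxd -p)
FOOTER_LEN=$((16#$(echo "$LEN_HEX" | sed 's/../& /g' | awk '{print $4$3$2$1}')))
echo "File size: $SIZE, footer length: $FOOTER_LEN"
# the footer must fit between the leading magic (4 bytes) and the trailing 8 bytes
if [ "$FOOTER_LEN" -gt 0 ] && [ $((FOOTER_LEN + 12)) -le "$SIZE" ]; then
  echo " ✅ Footer length is consistent with file size"
else
  echo " ❌ Footer length is inconsistent with file size (possible footer corruption)"
fi
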
test/java/spark/patch-parquet.sh
@@ -1,34 +0,0 @@
#!/bin/bash
# This script patches the Parquet JAR to use LinkedHashSet instead of HashSet

JAR_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar"
BACKUP_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar.backup"

echo "Patching Parquet JAR at: $JAR_PATH"

# Backup original JAR
if [ ! -f "$BACKUP_PATH" ]; then
  cp "$JAR_PATH" "$BACKUP_PATH"
  echo "Created backup at: $BACKUP_PATH"
fi

# Extract the JAR
TEMP_DIR=$(mktemp -d)
cd "$TEMP_DIR"
jar xf "$JAR_PATH"

# Find and patch the class file
# We need to modify the bytecode to change HashSet to LinkedHashSet
# This is complex, so let's document what needs to be done

echo "JAR extracted to: $TEMP_DIR"
echo "To patch, we need to:"
echo "1. Decompile ParquetFileWriter.class"
echo "2. Change HashSet to LinkedHashSet"
echo "3. Recompile"
echo "4. Repackage JAR"
echo ""
echo "This requires javap, javac with all dependencies, and jar"
echo "Simpler approach: Use the patched source to rebuild the module"

rm -rf "$TEMP_DIR"

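The "simpler approach" the script points to, rebuilding parquet-hadoop from patched source instead of editing bytecode, could look roughly like the following. This is a sketch only; the repository URL, tag name, and source path are assumptions about the upstream Parquet project rather than anything set up by this repository:

# assumed workflow, not part of this repository
git clone https://github.com/apache/parquet-java.git
cd parquet-java
git checkout apache-parquet-1.14.4   # tag name assumed for the 1.14.4 release
# change HashSet to LinkedHashSet in
#   parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
# then rebuild and install just that module into the local Maven repository
mvn -pl parquet-hadoop -am install -DskipTests
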
test/java/spark/test-one.sh
@@ -1,40 +0,0 @@
#!/bin/bash

# Run a single test method for quick iteration

set -e

if [ $# -eq 0 ]; then
  echo "Usage: ./test-one.sh <TestClass>#<methodName>"
  echo ""
  echo "Examples:"
  echo "  ./test-one.sh SparkReadWriteTest#testWriteAndReadParquet"
  echo "  ./test-one.sh SparkSQLTest#testCreateTableAndQuery"
  echo ""
  exit 1
fi

# Check if SeaweedFS is running
if ! curl -f http://localhost:8888/ > /dev/null 2>&1; then
  echo "✗ SeaweedFS filer is not accessible at http://localhost:8888"
  echo ""
  echo "Please start SeaweedFS first:"
  echo "  docker-compose up -d"
  echo ""
  exit 1
fi

echo "✓ SeaweedFS filer is accessible"
echo ""
echo "Running test: $1"
echo ""

# Set environment variables
export SEAWEEDFS_TEST_ENABLED=true
export SEAWEEDFS_FILER_HOST=localhost
export SEAWEEDFS_FILER_PORT=8888
export SEAWEEDFS_FILER_GRPC_PORT=18888

# Run the specific test
mvn test -Dtest="$1"

test/java/spark/test_parquet_external_read.sh
@@ -1,55 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing if Parquet file can be read by external tools ==="

# Use our working ParquetMemoryComparisonTest to write a file
echo "1. Writing Parquet file with ParquetWriter (known to work)..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=ParquetMemoryComparisonTest#testCompareMemoryVsSeaweedFSParquet -q 2>&1 | tail -10
' > /tmp/write_test.log 2>&1

# The test writes to: /test-spark/comparison-test.parquet
echo "2. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/comparison-test.parquet" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Failed to download file!"
  echo "Checking if file exists..."
  curl -s "http://localhost:8888/test-spark/?pretty=y"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"

# Install parquet-tools if needed
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C /tmp/test.parquet | head -10

echo ""
echo "=== File Footer (last 100 bytes) ==="
tail -c 100 /tmp/test.parquet | hexdump -C

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 | head -20 || echo "FAILED to read data"

echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
  echo "✅ SUCCESS: File written to SeaweedFS can be read by parquet-tools!"
  echo "This proves the file format is valid."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file may be corrupted."
fi

test/java/spark/test_parquet_readability.sh
@@ -1,60 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="

# Run the test to write a Parquet file
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
' > /tmp/write_test.log 2>&1 || true

# Find the Parquet file that was written
echo "2. Finding Parquet file..."
PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
' 2>&1 | grep -v "Creating" | grep "\.parquet" | head -1)

if [ -z "$PARQUET_FILE" ]; then
  echo "ERROR: No Parquet file found!"
  exit 1
fi

echo "Found file: $PARQUET_FILE"

# Download the file
echo "3. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/employees/$PARQUET_FILE" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Failed to download file!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"

# Try to read with parquet-tools
echo "4. Reading with parquet-tools..."
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 || echo "FAILED to read data"

echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
  echo "✅ SUCCESS: File can be read by parquet-tools!"
  echo "The file itself is VALID Parquet format."
  echo "The issue is specific to how Spark reads it back."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file is CORRUPTED or has invalid Parquet format."
fi

test/java/spark/test_with_readers.sh
@@ -1,120 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing Parquet file with multiple readers ==="
echo ""

# Start services
docker compose up -d 2>&1 | grep -v "Running"
sleep 2

# Run test and capture chunk ID
echo "1. Writing Parquet file and capturing chunk ID..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1
' 2>&1 | tee /tmp/test_output.log | tail -20 &
TEST_PID=$!

# Wait for the file to be written
echo "2. Waiting for file write..."
sleep 10

# Extract chunk ID from logs
CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)

if [ -z "$CHUNK_ID" ]; then
  echo "Waiting more..."
  sleep 5
  CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
fi

if [ -z "$CHUNK_ID" ]; then
  echo "ERROR: Could not find chunk ID in logs"
  echo "Log excerpt:"
  grep -E "PARQUET|CHUNKS|employees" /tmp/test_output.log | tail -20
  kill $TEST_PID 2>/dev/null || true
  exit 1
fi

echo "Found chunk ID: $CHUNK_ID"

# Download directly from volume server
echo "3. Downloading from volume server..."
curl -s "http://localhost:8080/$CHUNK_ID" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Download failed!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded: $FILE_SIZE bytes"
echo ""

# Kill test process
kill $TEST_PID 2>/dev/null || true
wait $TEST_PID 2>/dev/null || true

# Test with readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""

# Check magic bytes
echo "1. Magic Bytes:"
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
echo " First 4 bytes: $FIRST"
echo " Last 4 bytes: $LAST"
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
  echo " ✅ Valid PAR1 magic"
else
  echo " ❌ Invalid magic!"
fi
echo ""

# Python pyarrow
echo "2. Python pyarrow:"
python3 -c "
import pyarrow.parquet as pq
try:
    table = pq.read_table('/tmp/test.parquet')
    print(f' ✅ Read {table.num_rows} rows, {table.num_columns} columns')
    print(f' Data: {table.to_pandas().to_dict(\"records\")}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

# Pandas
echo "3. Pandas:"
python3 -c "
import pandas as pd
try:
    df = pd.read_parquet('/tmp/test.parquet')
    print(f' ✅ Read {len(df)} rows')
    print(f' Data:\n{df}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

# DuckDB
echo "4. DuckDB:"
python3 -c "
import duckdb
try:
    conn = duckdb.connect(':memory:')
    result = conn.execute('SELECT * FROM \"/tmp/test.parquet\"').fetchall()
    print(f' ✅ Read {len(result)} rows')
    print(f' Data: {result}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

echo "=== Summary ==="
echo "File: $FILE_SIZE bytes"
echo "If readers succeeded: File is VALID ✅"
echo "If readers failed: Footer metadata is corrupted ❌"

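A possible follow-up to the summary above, not part of the original script: when the file consists of a single chunk, the bytes served by the filer (port 8888) and by the volume server (port 8080) can be compared directly to localize where any corruption is introduced. $PARQUET_FILE and $CHUNK_ID below are placeholders reusing the naming from the scripts above:

# hypothetical check; $PARQUET_FILE and $CHUNK_ID are placeholders
curl -s "http://localhost:8888/test-spark/employees/$PARQUET_FILE" -o /tmp/via_filer.parquet
curl -s "http://localhost:8080/$CHUNK_ID" -o /tmp/via_volume.parquet
cmp /tmp/via_filer.parquet /tmp/via_volume.parquet \
  && echo "filer and volume server return identical bytes" \
  || echo "filer and volume server bytes differ"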