10 changed files with 1 addition and 841 deletions

226  .github/workflows/spark-integration-tests.yml
 39  test/java/spark/ReadParquetMeta.java
 38  test/java/spark/TEST_ALL_THREE_MODES.sh
 50  test/java/spark/capture-parquet.sh
180  test/java/spark/download_and_test.sh
 34  test/java/spark/patch-parquet.sh
 40  test/java/spark/test-one.sh
 55  test/java/spark/test_parquet_external_read.sh
 60  test/java/spark/test_parquet_readability.sh
120  test/java/spark/test_with_readers.sh
test/java/spark/ReadParquetMeta.java
@@ -1,39 +0,0 @@
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ReadParquetMeta {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        HadoopInputFile inputFile = HadoopInputFile.fromPath(path, conf);

        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            ParquetMetadata meta = reader.getFooter();

            System.out.println("=== Parquet File Metadata ===");
            System.out.println("Blocks (row groups): " + meta.getBlocks().size());
            System.out.println("File size from footer: " + inputFile.getLength());
            System.out.println("");

            meta.getBlocks().forEach(block -> {
                System.out.println("Row Group:");
                System.out.println(" Rows: " + block.getRowCount());
                System.out.println(" Total byte size: " + block.getTotalByteSize());
                System.out.println(" Columns: " + block.getColumns().size());
                System.out.println("");

                block.getColumns().forEach(col -> {
                    System.out.println(" Column: " + col.getPath());
                    System.out.println(" First data page offset: " + col.getFirstDataPageOffset());
                    System.out.println(" Dictionary page offset: " + col.getDictionaryPageOffset());
                    System.out.println(" Total size: " + col.getTotalSize());
                    System.out.println(" Total uncompressed size: " + col.getTotalUncompressedSize());
                    System.out.println("");
                });
            });
        }
    }
}
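The ReadParquetMeta helper above dumps row-group and column-chunk offsets from a Parquet footer, but none of the deleted scripts show how it was compiled or invoked. A minimal sketch of one way to run it by hand, assuming it is executed from the Maven test module that already declares parquet-hadoop and the Hadoop client libraries as dependencies (dependency:build-classpath is standard maven-dependency-plugin usage; the classpath file name is arbitrary):

# Hedged sketch: build a classpath from the module's declared dependencies,
# then compile and run the footer dumper against a downloaded Parquet file.
mvn -q dependency:build-classpath -Dmdep.outputFile=cp.txt
javac -cp "$(cat cp.txt)" ReadParquetMeta.java
java -cp "$(cat cp.txt):." ReadParquetMeta /tmp/test.parquet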
test/java/spark/TEST_ALL_THREE_MODES.sh
@@ -1,38 +0,0 @@
#!/bin/bash
set -e

echo "=========================================="
echo "Testing All Three Debug Modes"
echo "=========================================="
echo ""

cd /Users/chrislu/go/src/github.com/seaweedfs/seaweedfs/test/java/spark

# Mode 1: SEAWEED_ONLY (default)
echo "=== MODE 1: SEAWEED_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  spark-tests bash -c 'cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""

# Mode 2: LOCAL_ONLY
echo "=== MODE 2: LOCAL_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  -e SEAWEEDFS_DEBUG_MODE=LOCAL_ONLY \
  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-local \
  spark-tests bash -c 'mkdir -p /workspace/target/debug-local && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException|length is too low" | tail -5
echo ""

# Mode 3: DUAL_COMPARE
echo "=== MODE 3: DUAL_COMPARE ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
  -e SEAWEEDFS_DEBUG_MODE=DUAL_COMPARE \
  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-dual \
  spark-tests bash -c 'mkdir -p /workspace/target/debug-dual && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""

echo "=========================================="
echo "Test Summary"
echo "=========================================="
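TEST_ALL_THREE_MODES.sh above drives the three debug modes purely through environment variables (SEAWEEDFS_DEBUG_MODE, SEAWEEDFS_DEBUG_DIR), so the same run could also be done outside docker compose. A hedged sketch of the LOCAL_ONLY case run directly on the host, assuming the module builds locally and a filer is reachable with the same settings that test-one.sh further down exports:

# Hedged sketch: LOCAL_ONLY debug mode without docker compose (host paths assumed).
export SEAWEEDFS_TEST_ENABLED=true
export SEAWEEDFS_FILER_HOST=localhost
export SEAWEEDFS_FILER_PORT=8888
export SEAWEEDFS_FILER_GRPC_PORT=18888
export SEAWEEDFS_DEBUG_MODE=LOCAL_ONLY
export SEAWEEDFS_DEBUG_DIR="$PWD/target/debug-local"
mkdir -p "$SEAWEEDFS_DEBUG_DIR"
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery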
test/java/spark/capture-parquet.sh
@@ -1,50 +0,0 @@
#!/bin/bash
# Run Spark test and capture the Parquet file before cleanup

echo "Starting SeaweedFS services..."
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer
sleep 10

echo "Running Spark test in background..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c "mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1" > /tmp/spark-test-capture.log &
TEST_PID=$!

echo "Monitoring for Parquet file creation..."
while kill -0 $TEST_PID 2>/dev/null; do
  # Check if employees directory exists
  FILES=$(curl -s http://localhost:8888/test-spark/employees/ 2>/dev/null | grep -o 'part-[^"]*\.parquet' || echo "")
  if [ -n "$FILES" ]; then
    echo "Found Parquet file(s)!"
    for FILE in $FILES; do
      echo "Downloading: $FILE"
      curl -s "http://localhost:8888/test-spark/employees/$FILE" > "/tmp/$FILE"
      FILE_SIZE=$(stat -f%z "/tmp/$FILE" 2>/dev/null || stat --format=%s "/tmp/$FILE" 2>/dev/null)
      echo "Downloaded $FILE: $FILE_SIZE bytes"

      if [ -f "/tmp/$FILE" ] && [ "$FILE_SIZE" -gt 0 ]; then
        echo "SUCCESS: Captured $FILE"
        echo "Installing parquet-tools..."
        pip3 install -q parquet-tools 2>/dev/null || echo "parquet-tools might already be installed"

        echo ""
        echo "=== Parquet File Metadata ==="
        python3 -m parquet_tools meta "/tmp/$FILE" || echo "parquet-tools failed"

        echo ""
        echo "=== File Header (first 100 bytes) ==="
        hexdump -C "/tmp/$FILE" | head -10

        echo ""
        echo "=== File Footer (last 100 bytes) ==="
        tail -c 100 "/tmp/$FILE" | hexdump -C

        kill $TEST_PID 2>/dev/null
        exit 0
      fi
    done
  fi
  sleep 0.5
done

echo "Test completed, checking logs..."
tail -50 /tmp/spark-test-capture.log
test/java/spark/download_and_test.sh
@@ -1,180 +0,0 @@
#!/bin/bash
set -e

echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""

# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running"
sleep 3

# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!

# Wait a bit for file to be written
sleep 8

# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')

if [ -z "$TEMP_FILE" ]; then
  echo "Waiting for file to be written..."
  sleep 5
  TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
  find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
  ' 2>&1 | tr -d '\r')
fi

if [ -z "$TEMP_FILE" ]; then
  echo "ERROR: No Parquet file found!"
  echo "Checking what files exist..."
  docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
  wait $WRITE_PID
  exit 1
fi

echo "Found: $TEMP_FILE"

# Copy file from container
echo "3. Copying file from container..."
docker compose cp seaweedfs-filer:$TEMP_FILE /tmp/spark_written.parquet 2>&1 | grep -v "Successfully"

# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo $TEMP_FILE | sed 's|/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1

# Use whichever file is larger/valid
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
  cp /tmp/spark_written.parquet /tmp/test.parquet
  echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
  cp /tmp/spark_written_http.parquet /tmp/test.parquet
  echo "Using file downloaded via HTTP"
else
  echo "ERROR: Failed to get file!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""

# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true

# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""

# 1. Check magic bytes
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p

FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
  echo " ✅ Valid PAR1 magic bytes"
else
  echo " ❌ Invalid magic bytes!"
fi
echo ""

# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f" Schema: {table.schema}")
    print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f" ✅ SUCCESS: Read {len(result)} rows")
    print(f" Data: {result}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f" Columns: {list(df.columns)}")
    print(f" Data:\n{df}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""

# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");

        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println(" Row " + count + ": " + group);
                count++;
            }
            System.out.println(" ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println(" ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF

# Copy the file into container
cat > /tmp/test.parquet
' < /tmp/test.parquet 2>&1 | head -1

echo ""
echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."
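The PAR1 magic-byte check in download_and_test.sh above is repeated almost verbatim in test_with_readers.sh at the end of this diff; a small shared helper would remove the duplication. A sketch built from the same head/tail/xxd calls (function and path names are illustrative):

# Hedged sketch of a reusable PAR1 check using the commands already shown above.
check_par1() {
  local f="$1"
  [ "$(head -c 4 "$f" | xxd -p)" = "50415231" ] && \
  [ "$(tail -c 4 "$f" | xxd -p)" = "50415231" ]
}
check_par1 /tmp/test.parquet && echo "✅ Valid PAR1 magic" || echo "❌ Invalid magic!"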
test/java/spark/patch-parquet.sh
@@ -1,34 +0,0 @@
#!/bin/bash
# This script patches the Parquet JAR to use LinkedHashSet instead of HashSet

JAR_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar"
BACKUP_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar.backup"

echo "Patching Parquet JAR at: $JAR_PATH"

# Backup original JAR
if [ ! -f "$BACKUP_PATH" ]; then
  cp "$JAR_PATH" "$BACKUP_PATH"
  echo "Created backup at: $BACKUP_PATH"
fi

# Extract the JAR
TEMP_DIR=$(mktemp -d)
cd "$TEMP_DIR"
jar xf "$JAR_PATH"

# Find and patch the class file
# We need to modify the bytecode to change HashSet to LinkedHashSet
# This is complex, so let's document what needs to be done

echo "JAR extracted to: $TEMP_DIR"
echo "To patch, we need to:"
echo "1. Decompile ParquetFileWriter.class"
echo "2. Change HashSet to LinkedHashSet"
echo "3. Recompile"
echo "4. Repackage JAR"
echo ""
echo "This requires javap, javac with all dependencies, and jar"
echo "Simpler approach: Use the patched source to rebuild the module"

rm -rf "$TEMP_DIR"
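The "simpler approach" that patch-parquet.sh prints, rebuilding parquet-hadoop from patched source instead of editing bytecode, would look roughly like the sketch below. It assumes a local checkout of the Apache Parquet Java sources at a tag matching the 1.14.4 artifact referenced above; the repository URL, tag name, and edited file path are assumptions to verify, not something this change pins down.

# Hedged sketch of the rebuild-from-source route (URL, tag, and path assumed).
git clone https://github.com/apache/parquet-java.git
cd parquet-java
git checkout apache-parquet-1.14.4   # tag name assumed; check the repo's tags
# Edit parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java,
# changing the relevant HashSet to a LinkedHashSet, then install just that module:
mvn -pl parquet-hadoop -am -DskipTests install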
test/java/spark/test-one.sh
@@ -1,40 +0,0 @@
#!/bin/bash

# Run a single test method for quick iteration

set -e

if [ $# -eq 0 ]; then
  echo "Usage: ./test-one.sh <TestClass>#<methodName>"
  echo ""
  echo "Examples:"
  echo " ./test-one.sh SparkReadWriteTest#testWriteAndReadParquet"
  echo " ./test-one.sh SparkSQLTest#testCreateTableAndQuery"
  echo ""
  exit 1
fi

# Check if SeaweedFS is running
if ! curl -f http://localhost:8888/ > /dev/null 2>&1; then
  echo "✗ SeaweedFS filer is not accessible at http://localhost:8888"
  echo ""
  echo "Please start SeaweedFS first:"
  echo " docker-compose up -d"
  echo ""
  exit 1
fi

echo "✓ SeaweedFS filer is accessible"
echo ""
echo "Running test: $1"
echo ""

# Set environment variables
export SEAWEEDFS_TEST_ENABLED=true
export SEAWEEDFS_FILER_HOST=localhost
export SEAWEEDFS_FILER_PORT=8888
export SEAWEEDFS_FILER_GRPC_PORT=18888

# Run the specific test
mvn test -Dtest="$1"
test/java/spark/test_parquet_external_read.sh
@@ -1,55 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing if Parquet file can be read by external tools ==="

# Use our working ParquetMemoryComparisonTest to write a file
echo "1. Writing Parquet file with ParquetWriter (known to work)..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=ParquetMemoryComparisonTest#testCompareMemoryVsSeaweedFSParquet -q 2>&1 | tail -10
' > /tmp/write_test.log 2>&1

# The test writes to: /test-spark/comparison-test.parquet
echo "2. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/comparison-test.parquet" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Failed to download file!"
  echo "Checking if file exists..."
  curl -s "http://localhost:8888/test-spark/?pretty=y"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"

# Install parquet-tools if needed
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C /tmp/test.parquet | head -10

echo ""
echo "=== File Footer (last 100 bytes) ==="
tail -c 100 /tmp/test.parquet | hexdump -C

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 | head -20 || echo "FAILED to read data"

echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
  echo "✅ SUCCESS: File written to SeaweedFS can be read by parquet-tools!"
  echo "This proves the file format is valid."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file may be corrupted."
fi
test/java/spark/test_parquet_readability.sh
@@ -1,60 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="

# Run the test to write a Parquet file
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
' > /tmp/write_test.log 2>&1 || true

# Find the Parquet file that was written
echo "2. Finding Parquet file..."
PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
' 2>&1 | grep -v "Creating" | grep "\.parquet" | head -1)

if [ -z "$PARQUET_FILE" ]; then
  echo "ERROR: No Parquet file found!"
  exit 1
fi

echo "Found file: $PARQUET_FILE"

# Download the file
echo "3. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/employees/$PARQUET_FILE" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Failed to download file!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"

# Try to read with parquet-tools
echo "4. Reading with parquet-tools..."
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 || echo "FAILED to read data"

echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
  echo "✅ SUCCESS: File can be read by parquet-tools!"
  echo "The file itself is VALID Parquet format."
  echo "The issue is specific to how Spark reads it back."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file is CORRUPTED or has invalid Parquet format."
fi
test/java/spark/test_with_readers.sh
@@ -1,120 +0,0 @@
#!/bin/bash
set -e

echo "=== Testing Parquet file with multiple readers ==="
echo ""

# Start services
docker compose up -d 2>&1 | grep -v "Running"
sleep 2

# Run test and capture chunk ID
echo "1. Writing Parquet file and capturing chunk ID..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1
' 2>&1 | tee /tmp/test_output.log | tail -20 &
TEST_PID=$!

# Wait for the file to be written
echo "2. Waiting for file write..."
sleep 10

# Extract chunk ID from logs
CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)

if [ -z "$CHUNK_ID" ]; then
  echo "Waiting more..."
  sleep 5
  CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
fi

if [ -z "$CHUNK_ID" ]; then
  echo "ERROR: Could not find chunk ID in logs"
  echo "Log excerpt:"
  grep -E "PARQUET|CHUNKS|employees" /tmp/test_output.log | tail -20
  kill $TEST_PID 2>/dev/null || true
  exit 1
fi

echo "Found chunk ID: $CHUNK_ID"

# Download directly from volume server
echo "3. Downloading from volume server..."
curl -s "http://localhost:8080/$CHUNK_ID" -o /tmp/test.parquet

if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
  echo "ERROR: Download failed!"
  exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded: $FILE_SIZE bytes"
echo ""

# Kill test process
kill $TEST_PID 2>/dev/null || true
wait $TEST_PID 2>/dev/null || true

# Test with readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""

# Check magic bytes
echo "1. Magic Bytes:"
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
echo " First 4 bytes: $FIRST"
echo " Last 4 bytes: $LAST"
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
  echo " ✅ Valid PAR1 magic"
else
  echo " ❌ Invalid magic!"
fi
echo ""

# Python pyarrow
echo "2. Python pyarrow:"
python3 -c "
import pyarrow.parquet as pq
try:
    table = pq.read_table('/tmp/test.parquet')
    print(f' ✅ Read {table.num_rows} rows, {table.num_columns} columns')
    print(f' Data: {table.to_pandas().to_dict(\"records\")}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

# Pandas
echo "3. Pandas:"
python3 -c "
import pandas as pd
try:
    df = pd.read_parquet('/tmp/test.parquet')
    print(f' ✅ Read {len(df)} rows')
    print(f' Data:\n{df}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

# DuckDB
echo "4. DuckDB:"
python3 -c "
import duckdb
try:
    conn = duckdb.connect(':memory:')
    result = conn.execute('SELECT * FROM \"/tmp/test.parquet\"').fetchall()
    print(f' ✅ Read {len(result)} rows')
    print(f' Data: {result}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1
echo ""

echo "=== Summary ==="
echo "File: $FILE_SIZE bytes"
echo "If readers succeeded: File is VALID ✅"
echo "If readers failed: Footer metadata is corrupted ❌"