From 785dbc6077f27ff9307db76a9aaedaf991ab8dc7 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Mon, 24 Nov 2025 21:29:49 -0800
Subject: [PATCH] clean up tests

---
 .github/workflows/spark-integration-tests.yml | 226 +-----------------
 test/java/spark/ReadParquetMeta.java          |  39 ---
 test/java/spark/TEST_ALL_THREE_MODES.sh       |  38 ---
 test/java/spark/capture-parquet.sh            |  50 ----
 test/java/spark/download_and_test.sh          | 180 --------------
 test/java/spark/patch-parquet.sh              |  34 ---
 test/java/spark/test-one.sh                   |  40 ----
 test/java/spark/test_parquet_external_read.sh |  55 -----
 test/java/spark/test_parquet_readability.sh   |  60 -----
 test/java/spark/test_with_readers.sh          | 120 ----------
 10 files changed, 1 insertion(+), 841 deletions(-)
 delete mode 100644 test/java/spark/ReadParquetMeta.java
 delete mode 100755 test/java/spark/TEST_ALL_THREE_MODES.sh
 delete mode 100755 test/java/spark/capture-parquet.sh
 delete mode 100755 test/java/spark/download_and_test.sh
 delete mode 100755 test/java/spark/patch-parquet.sh
 delete mode 100755 test/java/spark/test-one.sh
 delete mode 100755 test/java/spark/test_parquet_external_read.sh
 delete mode 100755 test/java/spark/test_parquet_readability.sh
 delete mode 100755 test/java/spark/test_with_readers.sh

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 66ea0a528..c98061d92 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -115,224 +115,16 @@ jobs:
           mkdir -p .m2/repository/com
           cp -r ~/.m2/repository/com/seaweedfs .m2/repository/com/
           echo "OK Maven artifacts copied"
-
-          echo ""
-          echo "=== VERIFYING NEW CODE IS IN JARS ==="
-          # Check if SeaweedOutputStream contains our new constructor log
-          JAR_PATH=".m2/repository/com/seaweedfs/seaweedfs-client/3.80.1-SNAPSHOT/seaweedfs-client-3.80.1-SNAPSHOT.jar"
-          if [ -f "$JAR_PATH" ]; then
-            if unzip -p "$JAR_PATH" seaweedfs/client/SeaweedOutputStream.class | strings | grep -q "SeaweedOutputStream BASE constructor called"; then
-              echo "OK SeaweedOutputStream contains new constructor log"
-            else
-              echo "ERROR SeaweedOutputStream JAR is STALE - does not contain constructor log!"
-              echo "Listing JAR contents:"
-              unzip -l "$JAR_PATH" | grep SeaweedOutputStream
-              exit 1
-            fi
-          else
-            echo "ERROR JAR not found at $JAR_PATH"
-            ls -la .m2/repository/com/seaweedfs/seaweedfs-client/3.80.1-SNAPSHOT/
-            exit 1
-          fi
-          echo "OK Maven artifacts ready and verified"
 
       - name: Run Spark integration tests
         working-directory: test/java/spark
-        continue-on-error: true
-        id: test-run
         run: |
           echo "=== Running Spark Integration Tests ==="
-          # Run tests in detached mode
-          docker compose up -d spark-tests
-
-          echo "Real-time monitoring: Will download file the instant EOF error appears..."
+          docker compose up --abort-on-container-exit spark-tests
 
-          # Monitor logs and download ALL employees files BEFORE they're deleted
-          (
-            DOWNLOADED=false
-            while docker ps | grep -q seaweedfs-spark-tests; do
-              # Check if an employees Parquet file has been written (we log this explicitly)
-              if docker compose logs spark-tests 2>&1 | grep -q "PARQUET FILE WRITTEN TO EMPLOYEES"; then
-                if [ "$DOWNLOADED" = "false" ]; then
-                  echo ""
-                  echo "=== EMPLOYEES FILE WRITTEN! Extracting chunk IDs and downloading from volume ==="
-
-                  # Extract chunk IDs directly from the write log (bypasses filer entirely!)
-                  FULL_LOG=$(docker compose logs spark-tests 2>&1)
-                  WRITE_LOG=$(echo "$FULL_LOG" | grep "PARQUET FILE WRITTEN TO EMPLOYEES" | tail -1)
-
-                  echo "Write log: $WRITE_LOG"
-
-                  # Extract chunk IDs from CHUNKS: [id1,id2,...] in the log
-                  CHUNK_IDS=$(echo "$WRITE_LOG" | grep -oP 'CHUNKS: \[\K[^\]]+')
-                  echo "Chunk IDs: $CHUNK_IDS"
-
-                  if [ -z "$CHUNK_IDS" ]; then
-                    echo "ERROR: No chunk IDs in write log - using old format?"
-                    # Fallback: try to find any chunk ID near the write log
-                    CHUNK_IDS=$(echo "$FULL_LOG" | grep -B 20 "PARQUET FILE WRITTEN TO EMPLOYEES" | grep 'file_id: "' | tail -1 | grep -oP '"\K[^"]+')
-                    echo "Fallback chunk ID: $CHUNK_IDS"
-                  fi
-
-                  if [ -n "$CHUNK_IDS" ]; then
-                    # CHUNK_IDS might have multiple chunks, but usually just one
-                    # Format: "3,abc123" or "3,abc123,4,def456" (comma WITHIN each ID!)
-                    # We need to split by space or handle single chunk
-                    echo "Downloading chunk from volume server: http://localhost:8080/$CHUNK_IDS"
-                    curl -o "test.parquet" "http://localhost:8080/$CHUNK_IDS"
-
-                    if [ -f test.parquet ] && [ -s test.parquet ]; then
-                      FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
-                      echo "SUCCESS: Downloaded $FILE_SIZE bytes from volume server!"
-                      DOWNLOADED=true
-                    else
-                      echo "FAILED: Chunk $CHUNK_IDS returned 404 or empty"
-                    fi
-                  else
-                    echo "ERROR: Could not extract chunk IDs"
-                  fi
-                fi
-              fi
-
-              # Check if EOF error has appeared
-              if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
-                echo ""
-                echo "=== EOF ERROR DETECTED! ==="
-
-                if [ "$DOWNLOADED" = "true" ] && [ -f test.parquet ] && [ -s test.parquet ]; then
-                  echo "File was already downloaded proactively!"
-                  FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
-                  echo "File size: $FILE_SIZE bytes"
-
-                  # Analyze it
-                  echo ""
-                  echo "Installing parquet-tools..."
-                  pip3 install -q parquet-tools
-
-                  echo ""
-                  echo "=== File Header (first 100 bytes) ==="
-                  hexdump -C test.parquet | head -10
-
-                  echo ""
-                  echo "=== File Footer (last 200 bytes) ==="
-                  tail -c 200 test.parquet | hexdump -C
-
-                  echo ""
-                  echo "=== Magic bytes check ==="
-                  echo "First 4 bytes (should be PAR1):"
-                  head -c 4 test.parquet | xxd
-                  echo "Last 4 bytes (should be PAR1):"
-                  tail -c 4 test.parquet | xxd
-
-                  echo ""
-                  echo "=== Parquet metadata ==="
-                  parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"
-
-                  echo ""
-                  echo "=== Try reading data ==="
-                  parquet-tools show test.parquet || echo "parquet-tools show failed"
-
-                  echo ""
-                  echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
-                  echo "Actual file size: $FILE_SIZE bytes"
-
-                  echo ""
-                  echo "Examining column chunk offsets from metadata..."
-                  parquet-tools meta test.parquet > meta.txt 2>&1 || true
-                  cat meta.txt
-
-                  echo ""
-                  echo "Analyzing offset pattern..."
-                  grep -i "offset" meta.txt || echo "No offset info"
-                else
-                  echo "ERROR: File was not downloaded proactively!"
-                fi
-                break
-              fi
-              sleep 1
-            done
-          ) &
-          MONITOR_PID=$!
-
-          # Wait for tests to complete
-          docker wait seaweedfs-spark-tests
-          TEST_EXIT_CODE=$(docker inspect seaweedfs-spark-tests --format='{{.State.ExitCode}}')
-
-          # Give monitor time to finish
-          sleep 3
-          kill $MONITOR_PID 2>/dev/null || true
-
-          # Show full logs
           echo ""
           echo "=== Test Logs ==="
           docker compose logs spark-tests | tail -100
-
-          echo ""
-          echo "Tests completed with exit code: $TEST_EXIT_CODE"
-          echo "exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
-
-          exit $TEST_EXIT_CODE
-
-      - name: Examine Parquet file
-        if: steps.test-run.outcome == 'failure'
-        working-directory: test/java/spark
-        run: |
-          echo "=== Examining Parquet file for analysis ==="
-
-          # Check if file was already downloaded
-          if [ ! -f test.parquet ] || [ ! -s test.parquet ]; then
-            echo "ERROR: test.parquet not found or empty"
-            echo "File was not successfully downloaded during test run"
-            exit 1
-          fi
-
-          echo "Found test.parquet, proceeding with analysis..."
-
-          # Install parquet-tools
-          pip3 install parquet-tools
-
-          echo ""
-          echo "=== File Size ==="
-          ls -lh test.parquet
-          FILE_SIZE=$(stat -f%z test.parquet 2>/dev/null || stat -c%s test.parquet)
-          echo "Actual file size: $FILE_SIZE bytes"
-
-          echo ""
-          echo "=== File Header (first 100 bytes) ==="
-          hexdump -C test.parquet | head -10
-
-          echo ""
-          echo "=== File Footer (last 200 bytes) ==="
-          tail -c 200 test.parquet | hexdump -C
-
-          echo ""
-          echo "=== Magic Bytes Check ==="
-          echo "First 4 bytes (should be PAR1):"
-          head -c 4 test.parquet | xxd
-          echo "Last 4 bytes (should be PAR1):"
-          tail -c 4 test.parquet | xxd
-
-          echo ""
-          echo "=== Parquet Metadata ==="
-          parquet-tools inspect test.parquet || echo "parquet-tools failed"
-
-          echo ""
-          echo "=== Try Reading with Parquet Tools ==="
-          parquet-tools show test.parquet || echo "Failed to read file"
-
-          echo ""
-          echo "=== File Validation ==="
-          if head -c 4 test.parquet | grep -q "PAR1"; then
-            echo "OK Valid Parquet header"
-          else
-            echo "FAILED INVALID Parquet header"
-          fi
-
-          if tail -c 4 test.parquet | grep -q "PAR1"; then
-            echo "OK Valid Parquet trailer"
-          else
-            echo "FAILED INVALID Parquet trailer"
-          fi
 
       - name: Stop test services
         if: always()
@@ -347,15 +139,6 @@ jobs:
           path: test/java/spark/target/surefire-reports/
           retention-days: 30
 
-      - name: Upload Parquet file for analysis
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: failed-parquet-file
-          path: test/java/spark/test.parquet
-          retention-days: 7
-          if-no-files-found: ignore
-
       - name: Publish test report
         if: always()
         uses: dorny/test-reporter@v1
          ...
          reporter: java-junit
          fail-on-error: true
 
-      - name: Check test results
-        if: steps.test-run.outcome == 'failure'
-        run: |
-          echo "ERROR Tests failed with exit code: ${{ steps.test-run.outputs.exit_code }}"
-          echo "But file analysis was completed above."
-          exit 1
-
       # ========================================
       # SPARK EXAMPLE (HOST-BASED)
       # ========================================
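
Note on the removed monitor step: its core trick was scraping the CHUNKS: [...] marker out of the test logs and fetching the raw chunk straight from the volume server, bypassing the filer. A minimal standalone sketch of that trick, assuming the tests still emit the "PARQUET FILE WRITTEN TO EMPLOYEES ... CHUNKS: [...]" log line and the volume server listens on localhost:8080:

# Pull the chunk ID from the most recent write log line.
# A chunk ID itself contains a comma (e.g. "3,abc123"), so do not split on commas.
CHUNK_ID=$(docker compose logs spark-tests 2>&1 \
  | grep "PARQUET FILE WRITTEN TO EMPLOYEES" | tail -1 \
  | grep -oP 'CHUNKS: \[\K[^\]]+')
# Fetch the chunk body directly from the volume server, before any cleanup deletes it.
curl -fsS -o test.parquet "http://localhost:8080/$CHUNK_ID"
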
diff --git a/test/java/spark/ReadParquetMeta.java b/test/java/spark/ReadParquetMeta.java
deleted file mode 100644
index 74641a485..000000000
--- a/test/java/spark/ReadParquetMeta.java
+++ /dev/null
@@ -1,39 +0,0 @@
-import org.apache.parquet.hadoop.ParquetFileReader;
-import org.apache.parquet.hadoop.metadata.ParquetMetadata;
-import org.apache.parquet.hadoop.util.HadoopInputFile;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-public class ReadParquetMeta {
-    public static void main(String[] args) throws Exception {
-        Configuration conf = new Configuration();
-        Path path = new Path(args[0]);
-        HadoopInputFile inputFile = HadoopInputFile.fromPath(path, conf);
-
-        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
-            ParquetMetadata meta = reader.getFooter();
-
-            System.out.println("=== Parquet File Metadata ===");
-            System.out.println("Blocks (row groups): " + meta.getBlocks().size());
-            System.out.println("File size from footer: " + inputFile.getLength());
-            System.out.println("");
-
-            meta.getBlocks().forEach(block -> {
-                System.out.println("Row Group:");
-                System.out.println("  Rows: " + block.getRowCount());
-                System.out.println("  Total byte size: " + block.getTotalByteSize());
-                System.out.println("  Columns: " + block.getColumns().size());
-                System.out.println("");
-
-                block.getColumns().forEach(col -> {
-                    System.out.println("  Column: " + col.getPath());
-                    System.out.println("    First data page offset: " + col.getFirstDataPageOffset());
-                    System.out.println("    Dictionary page offset: " + col.getDictionaryPageOffset());
-                    System.out.println("    Total size: " + col.getTotalSize());
-                    System.out.println("    Total uncompressed size: " + col.getTotalUncompressedSize());
-                    System.out.println("");
-                });
-            });
-        }
-    }
-}
diff --git a/test/java/spark/TEST_ALL_THREE_MODES.sh b/test/java/spark/TEST_ALL_THREE_MODES.sh
deleted file mode 100755
index a5886e503..000000000
--- a/test/java/spark/TEST_ALL_THREE_MODES.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "=========================================="
-echo "Testing All Three Debug Modes"
-echo "=========================================="
-echo ""
-
-cd /Users/chrislu/go/src/github.com/seaweedfs/seaweedfs/test/java/spark
-
-# Mode 1: SEAWEED_ONLY (default)
-echo "=== MODE 1: SEAWEED_ONLY ==="
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
-  spark-tests bash -c 'cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
-  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
-echo ""
-
-# Mode 2: LOCAL_ONLY
-echo "=== MODE 2: LOCAL_ONLY ==="
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
-  -e SEAWEEDFS_DEBUG_MODE=LOCAL_ONLY \
-  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-local \
-  spark-tests bash -c 'mkdir -p /workspace/target/debug-local && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
-  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException|length is too low" | tail -5
-echo ""
-
-# Mode 3: DUAL_COMPARE
-echo "=== MODE 3: DUAL_COMPARE ==="
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
-  -e SEAWEEDFS_DEBUG_MODE=DUAL_COMPARE \
-  -e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-dual \
-  spark-tests bash -c 'mkdir -p /workspace/target/debug-dual && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
-  | grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
-echo ""
-
-echo "=========================================="
-echo "Test Summary"
-echo "=========================================="
diff --git a/test/java/spark/capture-parquet.sh b/test/java/spark/capture-parquet.sh
deleted file mode 100755
index 18e608fda..000000000
--- a/test/java/spark/capture-parquet.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Run Spark test and capture the Parquet file before cleanup
-
-echo "Starting SeaweedFS services..."
-docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer
-sleep 10
-
-echo "Running Spark test in background..."
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c "mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1" > /tmp/spark-test-capture.log &
-TEST_PID=$!
-
-echo "Monitoring for Parquet file creation..."
-while kill -0 $TEST_PID 2>/dev/null; do
-  # Check if employees directory exists
-  FILES=$(curl -s http://localhost:8888/test-spark/employees/ 2>/dev/null | grep -o 'part-[^"]*\.parquet' || echo "")
-  if [ -n "$FILES" ]; then
-    echo "Found Parquet file(s)!"
-    for FILE in $FILES; do
-      echo "Downloading: $FILE"
-      curl -s "http://localhost:8888/test-spark/employees/$FILE" > "/tmp/$FILE"
-      FILE_SIZE=$(stat -f%z "/tmp/$FILE" 2>/dev/null || stat --format=%s "/tmp/$FILE" 2>/dev/null)
-      echo "Downloaded $FILE: $FILE_SIZE bytes"
-
-      if [ -f "/tmp/$FILE" ] && [ $FILE_SIZE -gt 0 ]; then
-        echo "SUCCESS: Captured $FILE"
-        echo "Installing parquet-tools..."
-        pip3 install -q parquet-tools 2>/dev/null || echo "parquet-tools might already be installed"
-
-        echo ""
-        echo "=== Parquet File Metadata ==="
-        python3 -m parquet_tools meta "/tmp/$FILE" || echo "parquet-tools failed"
-
-        echo ""
-        echo "=== File Header (first 100 bytes) ==="
-        hexdump -C "/tmp/$FILE" | head -10
-
-        echo ""
-        echo "=== File Footer (last 100 bytes) ==="
-        tail -c 100 "/tmp/$FILE" | hexdump -C
-
-        kill $TEST_PID 2>/dev/null
-        exit 0
-      fi
-    done
-  fi
-  sleep 0.5
-done
-
-echo "Test completed, checking logs..."
-tail -50 /tmp/spark-test-capture.log
diff --git a/test/java/spark/download_and_test.sh b/test/java/spark/download_and_test.sh
deleted file mode 100755
index 998e9ad85..000000000
--- a/test/java/spark/download_and_test.sh
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "=== Downloading Parquet file and testing with multiple readers ==="
-echo ""
-
-# Start services if not running
-docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running"
-sleep 3
-
-# Write a file using Spark
-echo "1. Writing Parquet file with Spark..."
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
-cd /workspace
-# Run the test that writes a file
-mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
-' > /tmp/spark_write.log 2>&1 &
-WRITE_PID=$!
-
-# Wait a bit for file to be written
-sleep 8
-
-# Find and download the file from the temporary directory
-echo "2. Finding Parquet file in temporary directory..."
-TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
-find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
-' 2>&1 | tr -d '\r')
-
-if [ -z "$TEMP_FILE" ]; then
-  echo "Waiting for file to be written..."
-  sleep 5
-  TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
-  find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
-  ' 2>&1 | tr -d '\r')
-fi
-
-if [ -z "$TEMP_FILE" ]; then
-  echo "ERROR: No Parquet file found!"
- echo "Checking what files exist..." - docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20' - wait $WRITE_PID - exit 1 -fi - -echo "Found: $TEMP_FILE" - -# Copy file from container -echo "3. Copying file from container..." -docker compose cp seaweedfs-filer:$TEMP_FILE /tmp/spark_written.parquet 2>&1 | grep -v "Successfully" - -# Also try to get it via HTTP -echo "4. Also downloading via HTTP API..." -# Get the file path relative to /data -REL_PATH=$(echo $TEMP_FILE | sed 's|/data||') -curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1 - -# Use whichever file is larger/valid -if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then - cp /tmp/spark_written.parquet /tmp/test.parquet - echo "Using file copied from container" -elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then - cp /tmp/spark_written_http.parquet /tmp/test.parquet - echo "Using file downloaded via HTTP" -else - echo "ERROR: Failed to get file!" - exit 1 -fi - -FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null) -echo "Got file: $FILE_SIZE bytes" -echo "" - -# Kill the write process -kill $WRITE_PID 2>/dev/null || true -wait $WRITE_PID 2>/dev/null || true - -# Now test with various readers -echo "=== Testing with Multiple Parquet Readers ===" -echo "" - -# 1. Check magic bytes -echo "1. Magic Bytes Check:" -echo -n " First 4 bytes: " -head -c 4 /tmp/test.parquet | xxd -p -echo -n " Last 4 bytes: " -tail -c 4 /tmp/test.parquet | xxd -p - -FIRST=$(head -c 4 /tmp/test.parquet | xxd -p) -LAST=$(tail -c 4 /tmp/test.parquet | xxd -p) -if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then - echo " ✅ Valid PAR1 magic bytes" -else - echo " ❌ Invalid magic bytes!" -fi -echo "" - -# 2. Python pyarrow -echo "2. Testing with Python pyarrow:" -python3 << 'PYEOF' -try: - import pyarrow.parquet as pq - table = pq.read_table('/tmp/test.parquet') - print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns") - print(f" Schema: {table.schema}") - print(f" First row: {table.to_pandas().iloc[0].to_dict()}") -except Exception as e: - print(f" ❌ FAILED: {e}") -PYEOF -echo "" - -# 3. DuckDB -echo "3. Testing with DuckDB:" -python3 << 'PYEOF' -try: - import duckdb - conn = duckdb.connect(':memory:') - result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall() - print(f" ✅ SUCCESS: Read {len(result)} rows") - print(f" Data: {result}") -except Exception as e: - print(f" ❌ FAILED: {e}") -PYEOF -echo "" - -# 4. Pandas -echo "4. Testing with Pandas:" -python3 << 'PYEOF' -try: - import pandas as pd - df = pd.read_parquet('/tmp/test.parquet') - print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns") - print(f" Columns: {list(df.columns)}") - print(f" Data:\n{df}") -except Exception as e: - print(f" ❌ FAILED: {e}") -PYEOF -echo "" - -# 5. Java ParquetReader (using our test container) -echo "5. 
-docker compose run --rm spark-tests bash -c '
-cat > /tmp/ReadParquet.java << "JAVAEOF"
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.parquet.hadoop.ParquetReader;
-import org.apache.parquet.hadoop.example.GroupReadSupport;
-import org.apache.parquet.example.data.Group;
-
-public class ReadParquet {
-    public static void main(String[] args) throws Exception {
-        Configuration conf = new Configuration();
-        Path path = new Path("/tmp/test.parquet");
-
-        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
-                .withConf(conf).build()) {
-            Group group;
-            int count = 0;
-            while ((group = reader.read()) != null && count < 5) {
-                System.out.println("   Row " + count + ": " + group);
-                count++;
-            }
-            System.out.println("   ✅ SUCCESS: Read " + count + " rows");
-        } catch (Exception e) {
-            System.out.println("   ❌ FAILED: " + e.getMessage());
-            e.printStackTrace();
-        }
-    }
-}
-JAVAEOF
-
-# Copy the file into container
-cat > /tmp/test.parquet
-' < /tmp/test.parquet 2>&1 | head -1
-
-echo ""
-echo "=== Summary ==="
-echo "File size: $FILE_SIZE bytes"
-echo "If all readers succeeded, the file is VALID."
-echo "If readers failed, the footer metadata is corrupted."
-
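
A faster way to chase the "Still have: 78 bytes left" symptom than eyeballing hexdumps is to compare the footer length recorded in the file against the actual file size. A minimal sketch, assuming a standard Parquet layout (the last 8 bytes are a 4-byte little-endian footer length followed by the PAR1 magic) and a little-endian host for od:

FILE=/tmp/test.parquet
SIZE=$(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE")
# The footer length is the 4 bytes immediately before the trailing PAR1 magic.
FOOTER_LEN=$(tail -c 8 "$FILE" | head -c 4 | od -An -t u4 | tr -d ' ')
echo "file size: $SIZE, footer length: $FOOTER_LEN"
# Layout: 4-byte header magic + data pages + footer + 4-byte length + 4-byte magic,
# so the footer should begin at this offset; a mismatch points at truncated data.
echo "footer starts at byte offset: $((SIZE - 8 - FOOTER_LEN))"
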
diff --git a/test/java/spark/patch-parquet.sh b/test/java/spark/patch-parquet.sh
deleted file mode 100755
index 0cffb0879..000000000
--- a/test/java/spark/patch-parquet.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# This script patches the Parquet JAR to use LinkedHashSet instead of HashSet
-
-JAR_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar"
-BACKUP_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar.backup"
-
-echo "Patching Parquet JAR at: $JAR_PATH"
-
-# Backup original JAR
-if [ ! -f "$BACKUP_PATH" ]; then
-  cp "$JAR_PATH" "$BACKUP_PATH"
-  echo "Created backup at: $BACKUP_PATH"
-fi
-
-# Extract the JAR
-TEMP_DIR=$(mktemp -d)
-cd "$TEMP_DIR"
-jar xf "$JAR_PATH"
-
-# Find and patch the class file
-# We need to modify the bytecode to change HashSet to LinkedHashSet
-# This is complex, so let's document what needs to be done
-
-echo "JAR extracted to: $TEMP_DIR"
-echo "To patch, we need to:"
-echo "1. Decompile ParquetFileWriter.class"
-echo "2. Change HashSet to LinkedHashSet"
-echo "3. Recompile"
-echo "4. Repackage JAR"
-echo ""
-echo "This requires javap, javac with all dependencies, and jar"
-echo "Simpler approach: Use the patched source to rebuild the module"
-
-rm -rf "$TEMP_DIR"
diff --git a/test/java/spark/test-one.sh b/test/java/spark/test-one.sh
deleted file mode 100755
index aff6f15bf..000000000
--- a/test/java/spark/test-one.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-# Run a single test method for quick iteration
-
-set -e
-
-if [ $# -eq 0 ]; then
-  echo "Usage: ./test-one.sh TestClass#testMethod"
-  echo ""
-  echo "Examples:"
-  echo "  ./test-one.sh SparkReadWriteTest#testWriteAndReadParquet"
-  echo "  ./test-one.sh SparkSQLTest#testCreateTableAndQuery"
-  echo ""
-  exit 1
-fi
-
-# Check if SeaweedFS is running
-if ! curl -f http://localhost:8888/ > /dev/null 2>&1; then
-  echo "✗ SeaweedFS filer is not accessible at http://localhost:8888"
-  echo ""
-  echo "Please start SeaweedFS first:"
-  echo "  docker-compose up -d"
-  echo ""
-  exit 1
-fi
-
-echo "✓ SeaweedFS filer is accessible"
-echo ""
-echo "Running test: $1"
-echo ""
-
-# Set environment variables
-export SEAWEEDFS_TEST_ENABLED=true
-export SEAWEEDFS_FILER_HOST=localhost
-export SEAWEEDFS_FILER_PORT=8888
-export SEAWEEDFS_FILER_GRPC_PORT=18888
-
-# Run the specific test
-mvn test -Dtest="$1"
-
diff --git a/test/java/spark/test_parquet_external_read.sh b/test/java/spark/test_parquet_external_read.sh
deleted file mode 100755
index ebb697996..000000000
--- a/test/java/spark/test_parquet_external_read.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "=== Testing if Parquet file can be read by external tools ==="
-
-# Use our working ParquetMemoryComparisonTest to write a file
-echo "1. Writing Parquet file with ParquetWriter (known to work)..."
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
-cd /workspace
-mvn test -Dtest=ParquetMemoryComparisonTest#testCompareMemoryVsSeaweedFSParquet -q 2>&1 | tail -10
-' > /tmp/write_test.log 2>&1
-
-# The test writes to: /test-spark/comparison-test.parquet
-echo "2. Downloading file from SeaweedFS..."
-curl -s "http://localhost:8888/test-spark/comparison-test.parquet" -o /tmp/test.parquet
-
-if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
-  echo "ERROR: Failed to download file!"
-  echo "Checking if file exists..."
-  curl -s "http://localhost:8888/test-spark/?pretty=y"
-  exit 1
-fi
-
-FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
-echo "Downloaded $FILE_SIZE bytes"
-
-# Install parquet-tools if needed
-pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true
-
-echo ""
-echo "=== File Header (first 100 bytes) ==="
-hexdump -C /tmp/test.parquet | head -10
-
-echo ""
-echo "=== File Footer (last 100 bytes) ==="
-tail -c 100 /tmp/test.parquet | hexdump -C
-
-echo ""
-echo "=== Parquet Metadata ==="
-parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"
-
-echo ""
-echo "=== Try to read data ==="
-parquet-tools show /tmp/test.parquet 2>&1 | head -20 || echo "FAILED to read data"
-
-echo ""
-echo "=== Conclusion ==="
-if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
-  echo "✅ SUCCESS: File written to SeaweedFS can be read by parquet-tools!"
-  echo "This proves the file format is valid."
-else
-  echo "❌ FAILED: File cannot be read by parquet-tools"
-  echo "The file may be corrupted."
-fi
-
diff --git a/test/java/spark/test_parquet_readability.sh b/test/java/spark/test_parquet_readability.sh
deleted file mode 100755
index 9addbda9c..000000000
--- a/test/java/spark/test_parquet_readability.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="
-
-# Run the test to write a Parquet file
-echo "1. Writing Parquet file with Spark..."
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
-cd /workspace
-mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
-' > /tmp/write_test.log 2>&1 || true
-
-# Find the Parquet file that was written
-echo "2. Finding Parquet file..."
-PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
-curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
-' 2>&1 | grep -v "Creating" | grep "\.parquet" | head -1)
-
-if [ -z "$PARQUET_FILE" ]; then
-  echo "ERROR: No Parquet file found!"
-  exit 1
-fi
-
-echo "Found file: $PARQUET_FILE"
-
-# Download the file
-echo "3. Downloading file from SeaweedFS..."
-curl -s "http://localhost:8888/test-spark/employees/$PARQUET_FILE" -o /tmp/test.parquet
-
-if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
-  echo "ERROR: Failed to download file!"
-  exit 1
-fi
-
-FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
-echo "Downloaded $FILE_SIZE bytes"
-
-# Try to read with parquet-tools
-echo "4. Reading with parquet-tools..."
-pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true
-
-echo ""
-echo "=== Parquet Metadata ==="
-parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"
-
-echo ""
-echo "=== Try to read data ==="
-parquet-tools show /tmp/test.parquet 2>&1 || echo "FAILED to read data"
-
-echo ""
-echo "=== Conclusion ==="
-if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
-  echo "✅ SUCCESS: File can be read by parquet-tools!"
-  echo "The file itself is VALID Parquet format."
-  echo "The issue is specific to how Spark reads it back."
-else
-  echo "❌ FAILED: File cannot be read by parquet-tools"
-  echo "The file is CORRUPTED or has invalid Parquet format."
-fi
-
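
The deleted ReadParquetMeta.java dumped per-column offsets through Hadoop's ParquetFileReader; the same view is available without a JVM via pyarrow, in the same bash-plus-heredoc style the scripts above use. A minimal sketch, assuming pyarrow is installed and the file was downloaded to /tmp/test.parquet:

python3 << 'PYEOF'
import pyarrow.parquet as pq

# Read only the footer metadata, not the data pages.
md = pq.ParquetFile('/tmp/test.parquet').metadata
print(f"row groups: {md.num_row_groups}")
for i in range(md.num_row_groups):
    rg = md.row_group(i)
    print(f"row group {i}: rows={rg.num_rows} total_byte_size={rg.total_byte_size}")
    for j in range(rg.num_columns):
        col = rg.column(j)
        # These offsets are what the "missing 78 bytes" analysis compared
        # against the downloaded file size.
        print(f"  {col.path_in_schema}: data_page_offset={col.data_page_offset} "
              f"dictionary_page_offset={col.dictionary_page_offset} "
              f"total_compressed_size={col.total_compressed_size}")
PYEOF
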
diff --git a/test/java/spark/test_with_readers.sh b/test/java/spark/test_with_readers.sh
deleted file mode 100755
index f36fc5d97..000000000
--- a/test/java/spark/test_with_readers.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "=== Testing Parquet file with multiple readers ==="
-echo ""
-
-# Start services
-docker compose up -d 2>&1 | grep -v "Running"
-sleep 2
-
-# Run test and capture chunk ID
-echo "1. Writing Parquet file and capturing chunk ID..."
-docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
-cd /workspace
-mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1
-' 2>&1 | tee /tmp/test_output.log | tail -20 &
-TEST_PID=$!
-
-# Wait for the file to be written
-echo "2. Waiting for file write..."
-sleep 10
-
-# Extract chunk ID from logs
-CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
-
-if [ -z "$CHUNK_ID" ]; then
-  echo "Waiting more..."
-  sleep 5
-  CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
-fi
-
-if [ -z "$CHUNK_ID" ]; then
-  echo "ERROR: Could not find chunk ID in logs"
-  echo "Log excerpt:"
-  grep -E "PARQUET|CHUNKS|employees" /tmp/test_output.log | tail -20
-  kill $TEST_PID 2>/dev/null || true
-  exit 1
-fi
-
-echo "Found chunk ID: $CHUNK_ID"
-
-# Download directly from volume server
-echo "3. Downloading from volume server..."
-curl -s "http://localhost:8080/$CHUNK_ID" -o /tmp/test.parquet
-
-if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
-  echo "ERROR: Download failed!"
-  exit 1
-fi
-
-FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
-echo "Downloaded: $FILE_SIZE bytes"
-echo ""
-
-# Kill test process
-kill $TEST_PID 2>/dev/null || true
-wait $TEST_PID 2>/dev/null || true
-
-# Test with readers
-echo "=== Testing with Multiple Parquet Readers ==="
-echo ""
-
-# Check magic bytes
-echo "1. Magic Bytes:"
-FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
-LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
-echo "   First 4 bytes: $FIRST"
-echo "   Last 4 bytes:  $LAST"
-if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
-  echo "   ✅ Valid PAR1 magic"
-else
-  echo "   ❌ Invalid magic!"
-fi
-echo ""
-
-# Python pyarrow
-echo "2. Python pyarrow:"
-python3 -c "
-import pyarrow.parquet as pq
-try:
-    table = pq.read_table('/tmp/test.parquet')
-    print(f'   ✅ Read {table.num_rows} rows, {table.num_columns} columns')
-    print(f'   Data: {table.to_pandas().to_dict(\"records\")}')
-except Exception as e:
-    print(f'   ❌ FAILED: {e}')
-" 2>&1
-echo ""
-
-# Pandas
-echo "3. Pandas:"
-python3 -c "
-import pandas as pd
-try:
-    df = pd.read_parquet('/tmp/test.parquet')
-    print(f'   ✅ Read {len(df)} rows')
-    print(f'   Data:\n{df}')
-except Exception as e:
-    print(f'   ❌ FAILED: {e}')
-" 2>&1
-echo ""
-
-# DuckDB
-echo "4. DuckDB:"
-python3 -c "
-import duckdb
-try:
-    conn = duckdb.connect(':memory:')
-    result = conn.execute('SELECT * FROM \"/tmp/test.parquet\"').fetchall()
-    print(f'   ✅ Read {len(result)} rows')
-    print(f'   Data: {result}')
-except Exception as e:
-    print(f'   ❌ FAILED: {e}')
-" 2>&1
-echo ""
-
-echo "=== Summary ==="
-echo "File: $FILE_SIZE bytes"
-echo "If readers succeeded: File is VALID ✅"
-echo "If readers failed: Footer metadata is corrupted ❌"
-
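
With test-one.sh gone, the quick-iteration recipe it encoded is worth keeping at hand. A minimal equivalent, assuming a local docker compose stack with the filer on the default ports used throughout these scripts:

# Point the tests at the local filer (the defaults from the removed test-one.sh)
export SEAWEEDFS_TEST_ENABLED=true
export SEAWEEDFS_FILER_HOST=localhost
export SEAWEEDFS_FILER_PORT=8888
export SEAWEEDFS_FILER_GRPC_PORT=18888
# Run a single test method, e.g.:
mvn test -Dtest="SparkSQLTest#testCreateTableAndQuery"
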