
clean up tests

pull/7526/head
chrislu 1 week ago
parent
commit 785dbc6077
  1. 226  .github/workflows/spark-integration-tests.yml
  2. 39   test/java/spark/ReadParquetMeta.java
  3. 38   test/java/spark/TEST_ALL_THREE_MODES.sh
  4. 50   test/java/spark/capture-parquet.sh
  5. 180  test/java/spark/download_and_test.sh
  6. 34   test/java/spark/patch-parquet.sh
  7. 40   test/java/spark/test-one.sh
  8. 55   test/java/spark/test_parquet_external_read.sh
  9. 60   test/java/spark/test_parquet_readability.sh
  10. 120 test/java/spark/test_with_readers.sh

226 .github/workflows/spark-integration-tests.yml

@@ -115,224 +115,16 @@ jobs:
mkdir -p .m2/repository/com
cp -r ~/.m2/repository/com/seaweedfs .m2/repository/com/
echo "OK Maven artifacts copied"
echo ""
echo "=== VERIFYING NEW CODE IS IN JARS ==="
# Check if SeaweedOutputStream contains our new constructor log
JAR_PATH=".m2/repository/com/seaweedfs/seaweedfs-client/3.80.1-SNAPSHOT/seaweedfs-client-3.80.1-SNAPSHOT.jar"
if [ -f "$JAR_PATH" ]; then
if unzip -p "$JAR_PATH" seaweedfs/client/SeaweedOutputStream.class | strings | grep -q "SeaweedOutputStream BASE constructor called"; then
echo "OK SeaweedOutputStream contains new constructor log"
else
echo "ERROR SeaweedOutputStream JAR is STALE - does not contain constructor log!"
echo "Listing JAR contents:"
unzip -l "$JAR_PATH" | grep SeaweedOutputStream
exit 1
fi
else
echo "ERROR JAR not found at $JAR_PATH"
ls -la .m2/repository/com/seaweedfs/seaweedfs-client/3.80.1-SNAPSHOT/
exit 1
fi
echo "OK Maven artifacts ready and verified"
- name: Run Spark integration tests
working-directory: test/java/spark
continue-on-error: true
id: test-run
run: |
echo "=== Running Spark Integration Tests ==="
# Run tests in detached mode
docker compose up -d spark-tests
echo "Real-time monitoring: Will download file the instant EOF error appears..."
docker compose up --abort-on-container-exit spark-tests
# Monitor logs and download ALL employees files BEFORE they're deleted
(
DOWNLOADED=false
while docker ps | grep -q seaweedfs-spark-tests; do
# Check if an employees Parquet file has been written (we log this explicitly)
if docker compose logs spark-tests 2>&1 | grep -q "PARQUET FILE WRITTEN TO EMPLOYEES"; then
if [ "$DOWNLOADED" = "false" ]; then
echo ""
echo "=== EMPLOYEES FILE WRITTEN! Extracting chunk IDs and downloading from volume ==="
# Extract chunk IDs directly from the write log (bypasses filer entirely!)
FULL_LOG=$(docker compose logs spark-tests 2>&1)
WRITE_LOG=$(echo "$FULL_LOG" | grep "PARQUET FILE WRITTEN TO EMPLOYEES" | tail -1)
echo "Write log: $WRITE_LOG"
# Extract chunk IDs from CHUNKS: [id1,id2,...] in the log
CHUNK_IDS=$(echo "$WRITE_LOG" | grep -oP 'CHUNKS: \[\K[^\]]+')
echo "Chunk IDs: $CHUNK_IDS"
if [ -z "$CHUNK_IDS" ]; then
echo "ERROR: No chunk IDs in write log - using old format?"
# Fallback: try to find any chunk ID near the write log
CHUNK_IDS=$(echo "$FULL_LOG" | grep -B 20 "PARQUET FILE WRITTEN TO EMPLOYEES" | grep 'file_id: "' | tail -1 | grep -oP '"\K[^"]+')
echo "Fallback chunk ID: $CHUNK_IDS"
fi
if [ -n "$CHUNK_IDS" ]; then
# CHUNK_IDS might have multiple chunks, but usually just one
# Format: "3,abc123" or "3,abc123,4,def456" (comma WITHIN each ID!)
# We need to split by space or handle single chunk
echo "Downloading chunk from volume server: http://localhost:8080/$CHUNK_IDS"
curl -o "test.parquet" "http://localhost:8080/$CHUNK_IDS"
if [ -f test.parquet ] && [ -s test.parquet ]; then
FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
echo "SUCCESS: Downloaded $FILE_SIZE bytes from volume server!"
DOWNLOADED=true
else
echo "FAILED: Chunk $CHUNK_IDS returned 404 or empty"
fi
else
echo "ERROR: Could not extract chunk IDs"
fi
fi
fi
# Check if EOF error has appeared
if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
echo ""
echo "=== EOF ERROR DETECTED! ==="
if [ "$DOWNLOADED" = "true" ] && [ -f test.parquet ] && [ -s test.parquet ]; then
echo "File was already downloaded proactively!"
FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
echo "File size: $FILE_SIZE bytes"
# Analyze it
echo ""
echo "Installing parquet-tools..."
pip3 install -q parquet-tools
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C test.parquet | head -10
echo ""
echo "=== File Footer (last 200 bytes) ==="
tail -c 200 test.parquet | hexdump -C
echo ""
echo "=== Magic bytes check ==="
echo "First 4 bytes (should be PAR1):"
head -c 4 test.parquet | xxd
echo "Last 4 bytes (should be PAR1):"
tail -c 4 test.parquet | xxd
echo ""
echo "=== Parquet metadata ==="
parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"
echo ""
echo "=== Try reading data ==="
parquet-tools show test.parquet || echo "parquet-tools show failed"
echo ""
echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
echo "Actual file size: $FILE_SIZE bytes"
echo ""
echo "Examining column chunk offsets from metadata..."
parquet-tools meta test.parquet > meta.txt 2>&1 || true
cat meta.txt
echo ""
echo "Analyzing offset pattern..."
grep -i "offset" meta.txt || echo "No offset info"
else
echo "ERROR: File was not downloaded proactively!"
fi
break
fi
sleep 1
done
) &
MONITOR_PID=$!
# Wait for tests to complete
docker wait seaweedfs-spark-tests
TEST_EXIT_CODE=$(docker inspect seaweedfs-spark-tests --format='{{.State.ExitCode}}')
# Give monitor time to finish
sleep 3
kill $MONITOR_PID 2>/dev/null || true
# Show full logs
echo ""
echo "=== Test Logs ==="
docker compose logs spark-tests | tail -100
echo ""
echo "Tests completed with exit code: $TEST_EXIT_CODE"
echo "exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
exit $TEST_EXIT_CODE
- name: Examine Parquet file
if: steps.test-run.outcome == 'failure'
working-directory: test/java/spark
run: |
echo "=== Examining Parquet file for analysis ==="
# Check if file was already downloaded
if [ ! -f test.parquet ] || [ ! -s test.parquet ]; then
echo "ERROR: test.parquet not found or empty"
echo "File was not successfully downloaded during test run"
exit 1
fi
echo "Found test.parquet, proceeding with analysis..."
# Install parquet-tools
pip3 install parquet-tools
echo ""
echo "=== File Size ==="
ls -lh test.parquet
FILE_SIZE=$(stat -f%z test.parquet 2>/dev/null || stat -c%s test.parquet)
echo "Actual file size: $FILE_SIZE bytes"
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C test.parquet | head -10
echo ""
echo "=== File Footer (last 200 bytes) ==="
tail -c 200 test.parquet | hexdump -C
echo ""
echo "=== Magic Bytes Check ==="
echo "First 4 bytes (should be PAR1):"
head -c 4 test.parquet | xxd
echo "Last 4 bytes (should be PAR1):"
tail -c 4 test.parquet | xxd
echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect test.parquet || echo "parquet-tools failed"
echo ""
echo "=== Try Reading with Parquet Tools ==="
parquet-tools show test.parquet || echo "Failed to read file"
echo ""
echo "=== File Validation ==="
if head -c 4 test.parquet | grep -q "PAR1"; then
echo "OK Valid Parquet header"
else
echo "FAILED INVALID Parquet header"
fi
if tail -c 4 test.parquet | grep -q "PAR1"; then
echo "OK Valid Parquet trailer"
else
echo "FAILED INVALID Parquet trailer"
fi
- name: Stop test services
if: always()
@@ -347,15 +139,6 @@
path: test/java/spark/target/surefire-reports/
retention-days: 30
- name: Upload Parquet file for analysis
if: failure()
uses: actions/upload-artifact@v4
with:
name: failed-parquet-file
path: test/java/spark/test.parquet
retention-days: 7
if-no-files-found: ignore
- name: Publish test report
if: always()
uses: dorny/test-reporter@v1
@@ -365,13 +148,6 @@
reporter: java-junit
fail-on-error: true
- name: Check test results
if: steps.test-run.outcome == 'failure'
run: |
echo "ERROR Tests failed with exit code: ${{ steps.test-run.outputs.exit_code }}"
echo "But file analysis was completed above."
exit 1
# ========================================
# SPARK EXAMPLE (HOST-BASED)
# ========================================
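
Side note on the removed monitoring step: it scrapes the "PARQUET FILE WRITTEN TO EMPLOYEES ... CHUNKS: [...]" line out of the container logs and fetches the chunk straight from the volume server, bypassing the filer. A minimal standalone sketch of that extraction, assuming the same log line format and a volume server on localhost:8080:

#!/bin/bash
# Sketch only: parse chunk IDs from a captured test log and download each one
# directly from the volume server, as the deleted workflow step did.
set -euo pipefail
LOG_FILE="${1:-spark-tests.log}"                 # assumed log capture path
VOLUME_URL="${VOLUME_URL:-http://localhost:8080}"
WRITE_LOG=$(grep 'PARQUET FILE WRITTEN TO EMPLOYEES' "$LOG_FILE" | tail -1 || true)
CHUNK_IDS=$(echo "$WRITE_LOG" | grep -oP 'CHUNKS: \[\K[^\]]+' || true)
[ -n "$CHUNK_IDS" ] || { echo "no chunk IDs found in $LOG_FILE" >&2; exit 1; }
# A SeaweedFS file id already contains a comma (volumeId,fileKey), so only
# split on whitespace if several ids were logged on the same line.
for id in $CHUNK_IDS; do
  echo "fetching $VOLUME_URL/$id"
  curl -fsS -o "chunk-${id//,/_}.parquet" "$VOLUME_URL/$id"
done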

39 test/java/spark/ReadParquetMeta.java

@@ -1,39 +0,0 @@
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
public class ReadParquetMeta {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path path = new Path(args[0]);
HadoopInputFile inputFile = HadoopInputFile.fromPath(path, conf);
try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
ParquetMetadata meta = reader.getFooter();
System.out.println("=== Parquet File Metadata ===");
System.out.println("Blocks (row groups): " + meta.getBlocks().size());
System.out.println("File size from footer: " + inputFile.getLength());
System.out.println("");
meta.getBlocks().forEach(block -> {
System.out.println("Row Group:");
System.out.println(" Rows: " + block.getRowCount());
System.out.println(" Total byte size: " + block.getTotalByteSize());
System.out.println(" Columns: " + block.getColumns().size());
System.out.println("");
block.getColumns().forEach(col -> {
System.out.println(" Column: " + col.getPath());
System.out.println(" First data page offset: " + col.getFirstDataPageOffset());
System.out.println(" Dictionary page offset: " + col.getDictionaryPageOffset());
System.out.println(" Total size: " + col.getTotalSize());
System.out.println(" Total uncompressed size: " + col.getTotalUncompressedSize());
System.out.println("");
});
});
}
}
}
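
ReadParquetMeta had no build wiring of its own; it was compiled ad hoc against the test module's classpath. A rough sketch of how it could be run against a local file, assuming the module's Maven build resolves the Hadoop and Parquet dependencies (cp.txt is just a scratch file):

#!/bin/bash
# Sketch only: compile and run the footer-dumping helper against a local file.
cd test/java/spark
mvn -q dependency:build-classpath -Dmdep.outputFile=cp.txt
javac -cp "$(cat cp.txt)" ReadParquetMeta.java
java -cp "$(cat cp.txt):." ReadParquetMeta file:///tmp/test.parquet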

38 test/java/spark/TEST_ALL_THREE_MODES.sh

@@ -1,38 +0,0 @@
#!/bin/bash
set -e
echo "=========================================="
echo "Testing All Three Debug Modes"
echo "=========================================="
echo ""
cd /Users/chrislu/go/src/github.com/seaweedfs/seaweedfs/test/java/spark
# Mode 1: SEAWEED_ONLY (default)
echo "=== MODE 1: SEAWEED_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
spark-tests bash -c 'cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
| grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""
# Mode 2: LOCAL_ONLY
echo "=== MODE 2: LOCAL_ONLY ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
-e SEAWEEDFS_DEBUG_MODE=LOCAL_ONLY \
-e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-local \
spark-tests bash -c 'mkdir -p /workspace/target/debug-local && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
| grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException|length is too low" | tail -5
echo ""
# Mode 3: DUAL_COMPARE
echo "=== MODE 3: DUAL_COMPARE ==="
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true \
-e SEAWEEDFS_DEBUG_MODE=DUAL_COMPARE \
-e SEAWEEDFS_DEBUG_DIR=/workspace/target/debug-dual \
spark-tests bash -c 'mkdir -p /workspace/target/debug-dual && cd /workspace && mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1' \
| grep -E "Tests run|BUILD SUCCESS|BUILD FAILURE|EOFException" | tail -5
echo ""
echo "=========================================="
echo "Test Summary"
echo "=========================================="

50 test/java/spark/capture-parquet.sh

@@ -1,50 +0,0 @@
#!/bin/bash
# Run Spark test and capture the Parquet file before cleanup
echo "Starting SeaweedFS services..."
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer
sleep 10
echo "Running Spark test in background..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c "mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1" > /tmp/spark-test-capture.log &
TEST_PID=$!
echo "Monitoring for Parquet file creation..."
while kill -0 $TEST_PID 2>/dev/null; do
# Check if employees directory exists
FILES=$(curl -s http://localhost:8888/test-spark/employees/ 2>/dev/null | grep -o 'part-[^"]*\.parquet' || echo "")
if [ -n "$FILES" ]; then
echo "Found Parquet file(s)!"
for FILE in $FILES; do
echo "Downloading: $FILE"
curl -s "http://localhost:8888/test-spark/employees/$FILE" > "/tmp/$FILE"
FILE_SIZE=$(stat -f%z "/tmp/$FILE" 2>/dev/null || stat --format=%s "/tmp/$FILE" 2>/dev/null)
echo "Downloaded $FILE: $FILE_SIZE bytes"
if [ -f "/tmp/$FILE" ] && [ $FILE_SIZE -gt 0 ]; then
echo "SUCCESS: Captured $FILE"
echo "Installing parquet-tools..."
pip3 install -q parquet-tools 2>/dev/null || echo "parquet-tools might already be installed"
echo ""
echo "=== Parquet File Metadata ==="
python3 -m parquet_tools meta "/tmp/$FILE" || echo "parquet-tools failed"
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C "/tmp/$FILE" | head -10
echo ""
echo "=== File Footer (last 100 bytes) ==="
tail -c 100 "/tmp/$FILE" | hexdump -C
kill $TEST_PID 2>/dev/null
exit 0
fi
done
fi
sleep 0.5
done
echo "Test completed, checking logs..."
tail -50 /tmp/spark-test-capture.log
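
capture-parquet.sh greps the filer's HTML listing for part-*.parquet names; the filer can also return the listing as JSON, which is less brittle to scrape. A small sketch needing jq, where the response fields ("Entries", "FullPath") are assumptions about the filer listing API:

#!/bin/bash
# Sketch only: list the employees directory via the filer's JSON listing and
# download every Parquet part file before the test cleans the directory up.
FILER=http://localhost:8888
curl -s -H 'Accept: application/json' "$FILER/test-spark/employees/" \
  | jq -r '.Entries[]?.FullPath | select(endswith(".parquet"))' \
  | while read -r path; do
      curl -s -o "/tmp/$(basename "$path")" "$FILER$path"
    done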

180 test/java/spark/download_and_test.sh

@@ -1,180 +0,0 @@
#!/bin/bash
set -e
echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""
# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running"
sleep 3
# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!
# Wait a bit for file to be written
sleep 8
# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
if [ -z "$TEMP_FILE" ]; then
echo "Waiting for file to be written..."
sleep 5
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
fi
if [ -z "$TEMP_FILE" ]; then
echo "ERROR: No Parquet file found!"
echo "Checking what files exist..."
docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
wait $WRITE_PID
exit 1
fi
echo "Found: $TEMP_FILE"
# Copy file from container
echo "3. Copying file from container..."
docker compose cp seaweedfs-filer:$TEMP_FILE /tmp/spark_written.parquet 2>&1 | grep -v "Successfully"
# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo $TEMP_FILE | sed 's|/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1
# Use whichever file is larger/valid
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
cp /tmp/spark_written.parquet /tmp/test.parquet
echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
cp /tmp/spark_written_http.parquet /tmp/test.parquet
echo "Using file downloaded via HTTP"
else
echo "ERROR: Failed to get file!"
exit 1
fi
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""
# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true
# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""
# 1. Check magic bytes
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
echo " ✅ Valid PAR1 magic bytes"
else
echo " ❌ Invalid magic bytes!"
fi
echo ""
# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
import pyarrow.parquet as pq
table = pq.read_table('/tmp/test.parquet')
print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
print(f" Schema: {table.schema}")
print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
import duckdb
conn = duckdb.connect(':memory:')
result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
print(f" ✅ SUCCESS: Read {len(result)} rows")
print(f" Data: {result}")
except Exception as e:
print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
import pandas as pd
df = pd.read_parquet('/tmp/test.parquet')
print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
print(f" Columns: {list(df.columns)}")
print(f" Data:\n{df}")
except Exception as e:
print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;
public class ReadParquet {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path path = new Path("/tmp/test.parquet");
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
.withConf(conf).build()) {
Group group;
int count = 0;
while ((group = reader.read()) != null && count < 5) {
System.out.println(" Row " + count + ": " + group);
count++;
}
System.out.println(" ✅ SUCCESS: Read " + count + " rows");
} catch (Exception e) {
System.out.println(" ❌ FAILED: " + e.getMessage());
e.printStackTrace();
}
}
}
JAVAEOF
# Copy the file into container
cat > /tmp/test.parquet
' < /tmp/test.parquet 2>&1 | head -1
echo ""
echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."

34 test/java/spark/patch-parquet.sh

@@ -1,34 +0,0 @@
#!/bin/bash
# This script patches the Parquet JAR to use LinkedHashSet instead of HashSet
JAR_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar"
BACKUP_PATH="$HOME/.m2/repository/org/apache/parquet/parquet-hadoop/1.14.4/parquet-hadoop-1.14.4.jar.backup"
echo "Patching Parquet JAR at: $JAR_PATH"
# Backup original JAR
if [ ! -f "$BACKUP_PATH" ]; then
cp "$JAR_PATH" "$BACKUP_PATH"
echo "Created backup at: $BACKUP_PATH"
fi
# Extract the JAR
TEMP_DIR=$(mktemp -d)
cd "$TEMP_DIR"
jar xf "$JAR_PATH"
# Find and patch the class file
# We need to modify the bytecode to change HashSet to LinkedHashSet
# This is complex, so let's document what needs to be done
echo "JAR extracted to: $TEMP_DIR"
echo "To patch, we need to:"
echo "1. Decompile ParquetFileWriter.class"
echo "2. Change HashSet to LinkedHashSet"
echo "3. Recompile"
echo "4. Repackage JAR"
echo ""
echo "This requires javap, javac with all dependencies, and jar"
echo "Simpler approach: Use the patched source to rebuild the module"
rm -rf "$TEMP_DIR"
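
The script stops at documenting the bytecode-patch route and recommends rebuilding from patched source instead. That route would look roughly like the following; the repository name, release tag, and module layout are assumptions about upstream Parquet, not something the script pins down:

#!/bin/bash
# Sketch only: rebuild parquet-hadoop from a patched source tree instead of
# patching the JAR's bytecode. Repo URL, tag, and module names are assumptions.
git clone --depth 1 --branch apache-parquet-1.14.4 \
  https://github.com/apache/parquet-java.git
cd parquet-java
# apply the HashSet -> LinkedHashSet change here, e.g. in
# parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
mvn -q -pl parquet-hadoop -am install -DskipTests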

40 test/java/spark/test-one.sh

@@ -1,40 +0,0 @@
#!/bin/bash
# Run a single test method for quick iteration
set -e
if [ $# -eq 0 ]; then
echo "Usage: ./test-one.sh <TestClass>#<methodName>"
echo ""
echo "Examples:"
echo " ./test-one.sh SparkReadWriteTest#testWriteAndReadParquet"
echo " ./test-one.sh SparkSQLTest#testCreateTableAndQuery"
echo ""
exit 1
fi
# Check if SeaweedFS is running
if ! curl -f http://localhost:8888/ > /dev/null 2>&1; then
echo "✗ SeaweedFS filer is not accessible at http://localhost:8888"
echo ""
echo "Please start SeaweedFS first:"
echo " docker-compose up -d"
echo ""
exit 1
fi
echo "✓ SeaweedFS filer is accessible"
echo ""
echo "Running test: $1"
echo ""
# Set environment variables
export SEAWEEDFS_TEST_ENABLED=true
export SEAWEEDFS_FILER_HOST=localhost
export SEAWEEDFS_FILER_PORT=8888
export SEAWEEDFS_FILER_GRPC_PORT=18888
# Run the specific test
mvn test -Dtest="$1"

55 test/java/spark/test_parquet_external_read.sh

@@ -1,55 +0,0 @@
#!/bin/bash
set -e
echo "=== Testing if Parquet file can be read by external tools ==="
# Use our working ParquetMemoryComparisonTest to write a file
echo "1. Writing Parquet file with ParquetWriter (known to work)..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=ParquetMemoryComparisonTest#testCompareMemoryVsSeaweedFSParquet -q 2>&1 | tail -10
' > /tmp/write_test.log 2>&1
# The test writes to: /test-spark/comparison-test.parquet
echo "2. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/comparison-test.parquet" -o /tmp/test.parquet
if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
echo "ERROR: Failed to download file!"
echo "Checking if file exists..."
curl -s "http://localhost:8888/test-spark/?pretty=y"
exit 1
fi
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"
# Install parquet-tools if needed
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C /tmp/test.parquet | head -10
echo ""
echo "=== File Footer (last 100 bytes) ==="
tail -c 100 /tmp/test.parquet | hexdump -C
echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"
echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 | head -20 || echo "FAILED to read data"
echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
echo "✅ SUCCESS: File written to SeaweedFS can be read by parquet-tools!"
echo "This proves the file format is valid."
else
echo "❌ FAILED: File cannot be read by parquet-tools"
echo "The file may be corrupted."
fi

60 test/java/spark/test_parquet_readability.sh

@@ -1,60 +0,0 @@
#!/bin/bash
set -e
echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="
# Run the test to write a Parquet file
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
' > /tmp/write_test.log 2>&1 || true
# Find the Parquet file that was written
echo "2. Finding Parquet file..."
PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
' 2>&1 | grep -v "Creating" | grep "\.parquet" | head -1)
if [ -z "$PARQUET_FILE" ]; then
echo "ERROR: No Parquet file found!"
exit 1
fi
echo "Found file: $PARQUET_FILE"
# Download the file
echo "3. Downloading file from SeaweedFS..."
curl -s "http://localhost:8888/test-spark/employees/$PARQUET_FILE" -o /tmp/test.parquet
if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
echo "ERROR: Failed to download file!"
exit 1
fi
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded $FILE_SIZE bytes"
# Try to read with parquet-tools
echo "4. Reading with parquet-tools..."
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true
echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect /tmp/test.parquet 2>&1 || echo "FAILED to inspect"
echo ""
echo "=== Try to read data ==="
parquet-tools show /tmp/test.parquet 2>&1 || echo "FAILED to read data"
echo ""
echo "=== Conclusion ==="
if parquet-tools show /tmp/test.parquet > /dev/null 2>&1; then
echo "✅ SUCCESS: File can be read by parquet-tools!"
echo "The file itself is VALID Parquet format."
echo "The issue is specific to how Spark reads it back."
else
echo "❌ FAILED: File cannot be read by parquet-tools"
echo "The file is CORRUPTED or has invalid Parquet format."
fi

120 test/java/spark/test_with_readers.sh

@@ -1,120 +0,0 @@
#!/bin/bash
set -e
echo "=== Testing Parquet file with multiple readers ==="
echo ""
# Start services
docker compose up -d 2>&1 | grep -v "Running"
sleep 2
# Run test and capture chunk ID
echo "1. Writing Parquet file and capturing chunk ID..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1
' 2>&1 | tee /tmp/test_output.log | tail -20 &
TEST_PID=$!
# Wait for the file to be written
echo "2. Waiting for file write..."
sleep 10
# Extract chunk ID from logs
CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
if [ -z "$CHUNK_ID" ]; then
echo "Waiting more..."
sleep 5
CHUNK_ID=$(grep "PARQUET FILE WRITTEN TO EMPLOYEES" /tmp/test_output.log | grep -oP 'CHUNKS: \[\K[^\]]+' | head -1)
fi
if [ -z "$CHUNK_ID" ]; then
echo "ERROR: Could not find chunk ID in logs"
echo "Log excerpt:"
grep -E "PARQUET|CHUNKS|employees" /tmp/test_output.log | tail -20
kill $TEST_PID 2>/dev/null || true
exit 1
fi
echo "Found chunk ID: $CHUNK_ID"
# Download directly from volume server
echo "3. Downloading from volume server..."
curl -s "http://localhost:8080/$CHUNK_ID" -o /tmp/test.parquet
if [ ! -f /tmp/test.parquet ] || [ ! -s /tmp/test.parquet ]; then
echo "ERROR: Download failed!"
exit 1
fi
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Downloaded: $FILE_SIZE bytes"
echo ""
# Kill test process
kill $TEST_PID 2>/dev/null || true
wait $TEST_PID 2>/dev/null || true
# Test with readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""
# Check magic bytes
echo "1. Magic Bytes:"
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
echo " First 4 bytes: $FIRST"
echo " Last 4 bytes: $LAST"
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
echo " ✅ Valid PAR1 magic"
else
echo " ❌ Invalid magic!"
fi
echo ""
# Python pyarrow
echo "2. Python pyarrow:"
python3 -c "
import pyarrow.parquet as pq
try:
table = pq.read_table('/tmp/test.parquet')
print(f' ✅ Read {table.num_rows} rows, {table.num_columns} columns')
print(f' Data: {table.to_pandas().to_dict(\"records\")}')
except Exception as e:
print(f' ❌ FAILED: {e}')
" 2>&1
echo ""
# Pandas
echo "3. Pandas:"
python3 -c "
import pandas as pd
try:
df = pd.read_parquet('/tmp/test.parquet')
print(f' ✅ Read {len(df)} rows')
print(f' Data:\n{df}')
except Exception as e:
print(f' ❌ FAILED: {e}')
" 2>&1
echo ""
# DuckDB
echo "4. DuckDB:"
python3 -c "
import duckdb
try:
conn = duckdb.connect(':memory:')
result = conn.execute('SELECT * FROM \"/tmp/test.parquet\"').fetchall()
print(f' ✅ Read {len(result)} rows')
print(f' Data: {result}')
except Exception as e:
print(f' ❌ FAILED: {e}')
" 2>&1
echo ""
echo "=== Summary ==="
echo "File: $FILE_SIZE bytes"
echo "If readers succeeded: File is VALID ✅"
echo "If readers failed: Footer metadata is corrupted ❌"