#!/bin/bash
set -e
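# Flow: (1) write a Parquet file through Spark on SeaweedFS, (2) pull that file
# out of the filer container (or over HTTP), (3) validate it with several
# independent Parquet readers, (4) print a summary.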
echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""
# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running" || true
sleep 3
# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!
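# The write runs in the background so we can grab a Parquet part file from
# Spark's temporary output directory while the job is still in flight.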
# Wait a bit for file to be written
sleep 8
# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
if [ -z "$TEMP_FILE" ]; then
echo "Waiting for file to be written..."
sleep 5
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
fi
if [ -z "$TEMP_FILE" ]; then
echo "ERROR: No Parquet file found!"
echo "Checking what files exist..."
docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
wait $WRITE_PID
exit 1
fi
echo "Found: $TEMP_FILE"
# Copy file from container
echo "3. Copying file from container..."
docker compose cp "seaweedfs-filer:$TEMP_FILE" /tmp/spark_written.parquet 2>&1 | grep -v "Successfully" || true
# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo "$TEMP_FILE" | sed 's|^/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1
# Use whichever file is larger/valid
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
cp /tmp/spark_written.parquet /tmp/test.parquet
echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
cp /tmp/spark_written_http.parquet /tmp/test.parquet
echo "Using file downloaded via HTTP"
else
echo "ERROR: Failed to get file!"
exit 1
fi
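# stat -f%z is the BSD/macOS form and stat --format=%s the GNU coreutils form;
# trying both keeps the script portable across hosts.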
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""
# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true
# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""
# 1. Check magic bytes
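# A valid Parquet file begins and ends with the 4-byte ASCII magic "PAR1"
# (hex 50415231); the footer length and metadata sit just before the trailing magic.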
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
echo " ✅ Valid PAR1 magic bytes"
else
echo " ❌ Invalid magic bytes!"
fi
echo ""
# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f" Schema: {table.schema}")
    print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f" ✅ SUCCESS: Read {len(result)} rows")
    print(f" Data: {result}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f" Columns: {list(df.columns)}")
    print(f" Data:\n{df}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");
        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println(" Row " + count + ": " + group);
                count++;
            }
            System.out.println(" ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println(" ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF
# Copy the file into container
cat > /tmp/test.parquet
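# Compile and run the reader. NOTE: the classpath below is an assumption -- it
# expects the Spark image to ship its Hadoop/Parquet jars under /opt/spark/jars;
# adjust the path if the image lays them out differently.
javac -cp "/opt/spark/jars/*" -d /tmp /tmp/ReadParquet.java \
  && java -cp "/tmp:/opt/spark/jars/*" ReadParquet \
  || echo " ❌ FAILED: could not compile/run ReadParquet (check the classpath)"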
' < /tmp/test.parquet 2>&1
echo ""
echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."