#!/bin/bash
set -e

echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""

# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running" || true
sleep 3

# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
    cd /workspace
    # Run the test that writes a file
    mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!

# Wait a bit for the file to be written
sleep 8

# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
    find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')

if [ -z "$TEMP_FILE" ]; then
    echo "Waiting for file to be written..."
    sleep 5
    TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
        find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
    ' 2>&1 | tr -d '\r')
fi

if [ -z "$TEMP_FILE" ]; then
    echo "ERROR: No Parquet file found!"
    echo "Checking what files exist..."
    docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
    wait $WRITE_PID
    exit 1
fi

echo "Found: $TEMP_FILE"

# Copy the file out of the container
echo "3. Copying file from container..."
docker compose cp "seaweedfs-filer:$TEMP_FILE" /tmp/spark_written.parquet 2>&1 | grep -v "Successfully" || true

# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo "$TEMP_FILE" | sed 's|^/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1

# Use whichever copy is non-empty
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
    cp /tmp/spark_written.parquet /tmp/test.parquet
    echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
    cp /tmp/spark_written_http.parquet /tmp/test.parquet
    echo "Using file downloaded via HTTP"
else
    echo "ERROR: Failed to get file!"
    exit 1
fi

FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""

# Kill the background write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true

# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""

# 1. Check magic bytes
echo "1. Magic Bytes Check:"
echo -n "   First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n "   Last 4 bytes:  "
tail -c 4 /tmp/test.parquet | xxd -p

FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
    echo "   ✅ Valid PAR1 magic bytes"
else
    echo "   ❌ Invalid magic bytes!"
fi
echo ""

# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f"   ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f"   Schema: {table.schema}")
    print(f"   First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f"   ❌ FAILED: {e}")
PYEOF
echo ""

# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f"   ✅ SUCCESS: Read {len(result)} rows")
    print(f"   Data: {result}")
except Exception as e:
    print(f"   ❌ FAILED: {e}")
PYEOF
echo ""

# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f"   ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f"   Columns: {list(df.columns)}")
    print(f"   Data:\n{df}")
except Exception as e:
    print(f"   ❌ FAILED: {e}")
PYEOF
echo ""

# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm -T spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");
        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println("   Row " + count + ": " + group);
                count++;
            }
            System.out.println("   ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println("   ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF
# Copy the Parquet file into the container via stdin
cat > /tmp/test.parquet
# Compile and run the reader; the classpath assumes the image ships the
# Spark (and therefore Hadoop/Parquet) jars under /opt/spark/jars -- adjust
# if the spark-tests image lays them out differently.
javac -cp "/opt/spark/jars/*" -d /tmp /tmp/ReadParquet.java
java -cp "/tmp:/opt/spark/jars/*" ReadParquet
' < /tmp/test.parquet 2>&1
echo ""

echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."
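
# Optional extra diagnostic: decode the 4-byte little-endian footer length that a
# Parquet file stores immediately before the trailing PAR1 magic. A length of zero
# or one larger than the file itself is a strong hint that the footer metadata is
# what the readers above are choking on. (Sketch only; assumes the same local
# python3 used for the reader checks.)
echo ""
echo "Footer length check:"
python3 << 'PYEOF'
import os

path = '/tmp/test.parquet'
size = os.path.getsize(path)
with open(path, 'rb') as f:
    f.seek(-8, os.SEEK_END)  # last 8 bytes: footer length (uint32 LE) + "PAR1"
    footer_len_bytes, magic = f.read(4), f.read(4)
footer_len = int.from_bytes(footer_len_bytes, 'little')
print(f"   Footer length: {footer_len} bytes (file size: {size} bytes), trailing magic: {magic!r}")
if magic == b'PAR1' and 0 < footer_len < size:
    print("   ✅ Footer length looks plausible")
else:
    print("   ❌ Footer length or trailing magic looks wrong")
PYEOF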