#!/bin/bash
set -e
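# Flow: (1) write a Parquet file through Spark on SeaweedFS, (2) pull that file
# out of the filer container (or over HTTP), (3) validate it with several
# independent Parquet readers, (4) print a summary.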
echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""
# Start services if not running
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running" || true
sleep 3
# Write a file using Spark
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!
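# The write runs in the background so we can grab a Parquet part file from
# Spark's temporary output directory while the job is still in flight.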
# Wait a bit for file to be written
sleep 8
# Find and download the file from the temporary directory
echo "2. Finding Parquet file in temporary directory..."
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
if [ -z "$TEMP_FILE" ]; then
echo "Waiting for file to be written..."
sleep 5
TEMP_FILE=$(docker compose exec -T seaweedfs-filer sh -c '
find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1
' 2>&1 | tr -d '\r')
fi
if [ -z "$TEMP_FILE" ]; then
echo "ERROR: No Parquet file found!"
echo "Checking what files exist..."
docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
wait $WRITE_PID
exit 1
fi
echo "Found: $TEMP_FILE"
# Copy file from container
echo "3. Copying file from container..."
docker compose cp "seaweedfs-filer:$TEMP_FILE" /tmp/spark_written.parquet 2>&1 | grep -v "Successfully" || true
# Also try to get it via HTTP
echo "4. Also downloading via HTTP API..."
# Get the file path relative to /data
REL_PATH=$(echo "$TEMP_FILE" | sed 's|^/data||')
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet 2>&1
# Use whichever file is larger/valid
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
cp /tmp/spark_written.parquet /tmp/test.parquet
echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
cp /tmp/spark_written_http.parquet /tmp/test.parquet
echo "Using file downloaded via HTTP"
else
echo "ERROR: Failed to get file!"
exit 1
fi
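# stat -f%z is the BSD/macOS form and stat --format=%s the GNU coreutils form;
# trying both keeps the script portable across hosts.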
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""
# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true
# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""
# 1. Check magic bytes
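# A valid Parquet file begins and ends with the 4-byte ASCII magic "PAR1"
# (hex 50415231); the footer length and metadata sit just before the trailing magic.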
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
echo " ✅ Valid PAR1 magic bytes"
else
echo " ❌ Invalid magic bytes!"
fi
echo ""
# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f" Schema: {table.schema}")
    print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f" ✅ SUCCESS: Read {len(result)} rows")
    print(f" Data: {result}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f" Columns: {list(df.columns)}")
    print(f" Data:\n{df}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 5. Java ParquetReader (using our test container)
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");
        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println(" Row " + count + ": " + group);
                count++;
            }
            System.out.println(" ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println(" ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF
# Copy the file into container
cat > /tmp/test.parquet
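# Compile and run the reader. NOTE: the classpath below is an assumption -- it
# expects the Spark image to ship its Hadoop/Parquet jars under /opt/spark/jars;
# adjust the path if the image lays them out differently.
javac -cp "/opt/spark/jars/*" -d /tmp /tmp/ReadParquet.java \
  && java -cp "/tmp:/opt/spark/jars/*" ReadParquet \
  || echo " ❌ FAILED: could not compile/run ReadParquet (check the classpath)"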
' < /tmp/test.parquet 2>&1
echo ""
echo "=== Summary ==="
echo "File size: $FILE_SIZE bytes"
echo "If all readers succeeded, the file is VALID."
echo "If readers failed, the footer metadata is corrupted."