#!/bin/bash
# Write a Parquet file to SeaweedFS via Spark, download it, and validate it
# with several independent readers (pyarrow, DuckDB, Pandas, Java).

set -e

echo "=== Downloading Parquet file and testing with multiple readers ==="
echo ""

# Start services if not running
# (|| true keeps set -e from aborting when grep filters out every line)
docker compose up -d seaweedfs-master seaweedfs-volume seaweedfs-filer 2>&1 | grep -v "Running" || true
sleep 3
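# Optional readiness poll (an added sketch, not part of the original flow):
# wait for the filer HTTP endpoint on localhost:8888, which the download step
# below relies on, rather than trusting the fixed sleep alone.
for _ in $(seq 1 30); do
    curl -sf http://localhost:8888/ > /dev/null && break
    sleep 1
done
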
# Write a file using Spark (runs in the background; output goes to a log)
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
# Run the test that writes a file
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1 | tail -20
' > /tmp/spark_write.log 2>&1 &
WRITE_PID=$!
# Give the background Spark job time to write the file
sleep 8

# Find the Parquet file in the filer's data directory
echo "2. Finding Parquet file in temporary directory..."
find_parquet() {
    docker compose exec -T seaweedfs-filer sh -c \
        'find /data -name "*.parquet" -type f 2>/dev/null | grep -v "_SUCCESS" | head -1' \
        2>&1 | tr -d '\r'
}

TEMP_FILE=$(find_parquet)

if [ -z "$TEMP_FILE" ]; then
    echo "Waiting for file to be written..."
    sleep 5
    TEMP_FILE=$(find_parquet)
fi
if [ -z "$TEMP_FILE" ]; then
    echo "ERROR: No Parquet file found!"
    echo "Checking what files exist..."
    docker compose exec -T seaweedfs-filer sh -c 'find /data -type f 2>/dev/null | head -20'
    wait $WRITE_PID
    exit 1
fi

echo "Found: $TEMP_FILE"
# Copy file from container
echo "3. Copying file from container..."
docker compose cp "seaweedfs-filer:$TEMP_FILE" /tmp/spark_written.parquet 2>&1 | grep -v "Successfully" || true

# Also fetch it via the filer HTTP API as a cross-check
echo "4. Also downloading via HTTP API..."
# Strip the /data prefix to get the path the filer serves
REL_PATH="${TEMP_FILE#/data}"
curl -s "http://localhost:8888${REL_PATH}" -o /tmp/spark_written_http.parquet
# Prefer the copy from the container; fall back to the HTTP download
if [ -f /tmp/spark_written.parquet ] && [ -s /tmp/spark_written.parquet ]; then
    cp /tmp/spark_written.parquet /tmp/test.parquet
    echo "Using file copied from container"
elif [ -f /tmp/spark_written_http.parquet ] && [ -s /tmp/spark_written_http.parquet ]; then
    cp /tmp/spark_written_http.parquet /tmp/test.parquet
    echo "Using file downloaded via HTTP"
else
    echo "ERROR: Failed to get file!"
    exit 1
fi

# stat flags differ: -f%z on BSD/macOS, --format=%s on GNU coreutils
FILE_SIZE=$(stat -f%z /tmp/test.parquet 2>/dev/null || stat --format=%s /tmp/test.parquet 2>/dev/null)
echo "Got file: $FILE_SIZE bytes"
echo ""
# Kill the write process
kill $WRITE_PID 2>/dev/null || true
wait $WRITE_PID 2>/dev/null || true

# Now test with various readers
echo "=== Testing with Multiple Parquet Readers ==="
echo ""
# 1. Check magic bytes: a valid Parquet file both begins and ends with the
# 4-byte magic "PAR1" (hex 50415231)
echo "1. Magic Bytes Check:"
echo -n " First 4 bytes: "
head -c 4 /tmp/test.parquet | xxd -p
echo -n " Last 4 bytes: "
tail -c 4 /tmp/test.parquet | xxd -p

FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
if [ "$FIRST" = "50415231" ] && [ "$LAST" = "50415231" ]; then
    echo " ✅ Valid PAR1 magic bytes"
else
    echo " ❌ Invalid magic bytes!"
fi
echo ""
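# Deeper structural check (an added sketch, not one of the original readers):
# the 4 bytes before the trailing PAR1 magic hold the footer length as a
# little-endian uint32, so leading magic (4) + footer + length field (4) +
# trailing magic (4) must fit inside the file.
python3 << 'PYEOF'
import os
import struct

size = os.path.getsize('/tmp/test.parquet')
with open('/tmp/test.parquet', 'rb') as f:
    # The last 8 bytes are: 4-byte footer length, then the PAR1 magic
    f.seek(-8, os.SEEK_END)
    footer_len = struct.unpack('<I', f.read(4))[0]

if footer_len + 12 <= size:
    print(f" ✅ Footer length {footer_len} is consistent with file size {size}")
else:
    print(f" ❌ Footer length {footer_len} cannot fit in a {size}-byte file")
PYEOF
echo ""
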
# 2. Python pyarrow
echo "2. Testing with Python pyarrow:"
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {table.num_rows} rows, {table.num_columns} columns")
    print(f" Schema: {table.schema}")
    # Only sample a row if there is one, so an empty table is not reported as a failure
    if table.num_rows > 0:
        print(f" First row: {table.to_pandas().iloc[0].to_dict()}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
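# Illustrative extra (same pyarrow dependency as above): read only the footer
# metadata, which is exactly the part the summary at the end worries about.
python3 << 'PYEOF'
try:
    import pyarrow.parquet as pq
    meta = pq.ParquetFile('/tmp/test.parquet').metadata
    print(f" created_by: {meta.created_by}")
    print(f" row groups: {meta.num_row_groups}, rows: {meta.num_rows}")
except Exception as e:
    print(f" (footer metadata read failed: {e})")
PYEOF
echo ""
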
# 3. DuckDB
echo "3. Testing with DuckDB:"
python3 << 'PYEOF'
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute("SELECT * FROM '/tmp/test.parquet'").fetchall()
    print(f" ✅ SUCCESS: Read {len(result)} rows")
    print(f" Data: {result}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
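# Illustrative extra (uses the same duckdb module as above): DuckDB can also
# dump per-column footer metadata, which helps localize corruption to a
# specific row group or column chunk.
python3 << 'PYEOF'
try:
    import duckdb
    rows = duckdb.execute(
        "SELECT row_group_id, path_in_schema, num_values, total_compressed_size "
        "FROM parquet_metadata('/tmp/test.parquet')").fetchall()
    for r in rows[:5]:
        print(f" {r}")
except Exception as e:
    print(f" (parquet_metadata inspection failed: {e})")
PYEOF
echo ""
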
# 4. Pandas
echo "4. Testing with Pandas:"
python3 << 'PYEOF'
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f" ✅ SUCCESS: Read {len(df)} rows, {len(df.columns)} columns")
    print(f" Columns: {list(df.columns)}")
    print(f" Data:\n{df}")
except Exception as e:
    print(f" ❌ FAILED: {e}")
PYEOF
echo ""
# 5. Java ParquetReader (using our test container). The original version only
# wrote the source file; this also compiles and runs it so the test reports.
echo "5. Testing with Java ParquetReader:"
docker compose run --rm spark-tests bash -c '
cat > /tmp/ReadParquet.java << "JAVAEOF"
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.example.data.Group;

public class ReadParquet {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/test.parquet");

        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(conf).build()) {
            Group group;
            int count = 0;
            while ((group = reader.read()) != null && count < 5) {
                System.out.println(" Row " + count + ": " + group);
                count++;
            }
            System.out.println(" ✅ SUCCESS: Read " + count + " rows");
        } catch (Exception e) {
            System.out.println(" ❌ FAILED: " + e.getMessage());
            e.printStackTrace();
        }
    }
}
JAVAEOF

# Copy the file into the container (stdin is redirected from the host copy)
cat > /tmp/test.parquet

# Compile and run the reader. NOTE: assumes the test project in /workspace
# can supply the Parquet/Hadoop jars via Maven; adjust if they live elsewhere
# in the image.
cd /workspace
mvn -q dependency:build-classpath -Dmdep.outputFile=/tmp/cp.txt > /dev/null 2>&1
CP=$(cat /tmp/cp.txt)
javac -cp "$CP" -d /tmp /tmp/ReadParquet.java
java -cp "/tmp:$CP" ReadParquet
' < /tmp/test.parquet || echo " ❌ FAILED: Java reader step exited non-zero"
echo ""
|
|
echo "=== Summary ==="
|
|
echo "File size: $FILE_SIZE bytes"
|
|
echo "If all readers succeeded, the file is VALID."
|
|
echo "If readers failed, the footer metadata is corrupted."
|
|
|