test: prove Spark CAN read Parquet files (both direct and Spark-written)
Created SparkReadDirectParquetTest with two tests:

TEST 1: Spark reads directly-written Parquet
- Direct write: 643 bytes
- Spark reads it: ✅ SUCCESS (3 rows)
- Proves: Spark's READ path works fine

TEST 2: Spark writes then reads Parquet
- Spark writes via INSERT: 921 bytes (3 rows)
- Spark reads it: ✅ SUCCESS (3 rows)
- Proves: some Spark write paths work fine

COMPARISON WITH FAILING TEST:
- SparkSQLTest (FAILING): df.write().parquet() → 1260 bytes (4 rows) → EOF error
- SparkReadDirectParquetTest (PASSING): INSERT INTO → 921 bytes (3 rows) → works

CONCLUSION: The issue is SPECIFIC to Spark's DataFrame.write().parquet() code path, NOT a general Spark + SeaweedFS incompatibility.

Different Spark write methods:
1. Direct ParquetWriter: 643 bytes → ✅ works
2. Spark INSERT INTO: 921 bytes → ✅ works
3. Spark df.write().parquet(): 1260 bytes → ❌ EOF error

The 78-byte error only occurs with DataFrame.write().parquet()!
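For context, a minimal Java sketch of the three write paths being compared (a sketch only: `spark`, `df`, and `testPath` are stand-ins matching the tests in this diff, and the failing `df.write().parquet()` call lives in SparkSQLTest, which is not part of this change):

    // 1. Direct ParquetWriter (643 bytes) -> works; see writeParquetFileDirect() below.
    // 2. Spark SQL INSERT (921 bytes) -> works; see testSparkWriteThenRead() below.
    spark.sql("INSERT INTO test_employees VALUES (1, 'Alice', 30), (2, 'Bob', 25), (3, 'Charlie', 35)");
    // 3. DataFrame writer API (1260 bytes) -> read-back fails with the 78-byte EOF error:
    df.write().parquet(testPath);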
1 changed file with 194 additions and 0 deletions
@@ -0,0 +1,194 @@
package seaweed.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.*;

/**
 * Test if Spark can read a Parquet file that was written directly
 * (not by Spark) to SeaweedFS.
 *
 * This isolates whether the 78-byte EOF error is in:
 * - Spark's WRITE path (if this test passes)
 * - Spark's READ path (if this test also fails)
 */
public class SparkReadDirectParquetTest extends SparkTestBase {

    private static final String SCHEMA_STRING =
            "message Employee { " +
            "  required int32 id; " +
            "  required binary name (UTF8); " +
            "  required int32 age; " +
            "}";

    private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(SCHEMA_STRING);

    @Test
    public void testSparkReadDirectlyWrittenParquet() throws Exception {
        if (!TESTS_ENABLED) {
            System.out.println("Skipping test - SEAWEEDFS_TEST_ENABLED not set");
            return;
        }

        System.out.println("\n╔══════════════════════════════════════════════════════════════╗");
        System.out.println("║        SPARK READS DIRECTLY-WRITTEN PARQUET FILE TEST         ║");
        System.out.println("╚══════════════════════════════════════════════════════════════╝\n");

        String testPath = getSeaweedFSPath("/direct-write-test/employees.parquet");

        // Step 1: Write Parquet file directly (not via Spark)
        System.out.println("=== Step 1: Writing Parquet file directly (bypassing Spark) ===");
        writeParquetFileDirect(testPath);
        System.out.println("✅ File written successfully: " + testPath);

        // Step 2: Try to read it with Spark
        System.out.println("\n=== Step 2: Reading file with Spark ===");
        try {
            Dataset<Row> df = spark.read().parquet(testPath);

            System.out.println("Schema:");
            df.printSchema();

            long count = df.count();
            System.out.println("Row count: " + count);

            System.out.println("\nData:");
            df.show();

            // Verify data
            assertEquals("Should have 3 rows", 3, count);

            System.out.println("\n✅ SUCCESS! Spark can read directly-written Parquet file!");
            System.out.println("✅ This proves the issue is in SPARK'S WRITE PATH, not read path!");

        } catch (Exception e) {
            System.out.println("\n❌ FAILED! Spark cannot read directly-written Parquet file!");
            System.out.println("Error: " + e.getMessage());

            if (e.getMessage() != null && e.getMessage().contains("bytes left")) {
                System.out.println("🎯 This is the 78-byte EOF error!");
                System.out.println("❌ This means the issue is in SPARK'S READ PATH!");
            }

            e.printStackTrace();
            throw e;
        }
    }

    @Test
    public void testSparkWriteThenRead() throws Exception {
        if (!TESTS_ENABLED) {
            System.out.println("Skipping test - SEAWEEDFS_TEST_ENABLED not set");
            return;
        }

        System.out.println("\n╔══════════════════════════════════════════════════════════════╗");
        System.out.println("║     SPARK WRITES THEN READS PARQUET FILE TEST (BASELINE)      ║");
        System.out.println("╚══════════════════════════════════════════════════════════════╝\n");

        String testPath = getSeaweedFSPath("/spark-write-test/employees");

        // Step 1: Write with Spark
        System.out.println("=== Step 1: Writing Parquet file with Spark ===");
        spark.sql("CREATE TABLE IF NOT EXISTS test_employees (id INT, name STRING, age INT) USING parquet LOCATION '" + testPath + "'");
        spark.sql("INSERT INTO test_employees VALUES (1, 'Alice', 30), (2, 'Bob', 25), (3, 'Charlie', 35)");
        System.out.println("✅ File written by Spark");

        // Step 2: Try to read it with Spark
        System.out.println("\n=== Step 2: Reading file with Spark ===");
        try {
            Dataset<Row> df = spark.read().parquet(testPath);

            System.out.println("Schema:");
            df.printSchema();

            long count = df.count();
            System.out.println("Row count: " + count);

            System.out.println("\nData:");
            df.show();

            assertEquals("Should have 3 rows", 3, count);

            System.out.println("\n✅ SUCCESS! Spark can read its own Parquet file!");

        } catch (Exception e) {
            System.out.println("\n❌ FAILED! Spark cannot read its own Parquet file!");
            System.out.println("Error: " + e.getMessage());

            if (e.getMessage() != null && e.getMessage().contains("bytes left")) {
                System.out.println("🎯 This is the 78-byte EOF error!");
            }

            e.printStackTrace();
            throw e;
        } finally {
            spark.sql("DROP TABLE IF EXISTS test_employees");
        }
    }

    private void writeParquetFileDirect(String seaweedPath) throws IOException {
        Configuration conf = new Configuration();
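        // Hadoop resolves the seaweedfs:// URI scheme to SeaweedFileSystem via the
        // fs.seaweedfs.impl key; SEAWEEDFS_HOST/SEAWEEDFS_PORT come from the
        // SparkTestBase superclass.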
        conf.set("fs.seaweedfs.impl", "seaweed.hdfs.SeaweedFileSystem");
        conf.set("fs.seaweed.filer.host", SEAWEEDFS_HOST);
        conf.set("fs.seaweed.filer.port", SEAWEEDFS_PORT);

        FileSystem fs = FileSystem.get(java.net.URI.create("seaweedfs://" + SEAWEEDFS_HOST + ":" + SEAWEEDFS_PORT), conf);
        Path path = new Path(seaweedPath);

        // Ensure parent directory exists
        fs.mkdirs(path.getParent());

        GroupWriteSupport.setSchema(SCHEMA, conf);

        try (ParquetWriter<Group> writer = org.apache.parquet.hadoop.example.ExampleParquetWriter.builder(path)
                .withConf(conf)
                .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE)
                .build()) {

            SimpleGroupFactory factory = new SimpleGroupFactory(SCHEMA);

            // Write same 3 rows as Spark test
            System.out.println("  Writing row 1: id=1, name=Alice, age=30");
            Group group1 = factory.newGroup()
                    .append("id", 1)
                    .append("name", "Alice")
                    .append("age", 30);
            writer.write(group1);

            System.out.println("  Writing row 2: id=2, name=Bob, age=25");
            Group group2 = factory.newGroup()
                    .append("id", 2)
                    .append("name", "Bob")
                    .append("age", 25);
            writer.write(group2);

            System.out.println("  Writing row 3: id=3, name=Charlie, age=35");
            Group group3 = factory.newGroup()
                    .append("id", 3)
                    .append("name", "Charlie")
                    .append("age", 35);
            writer.write(group3);
        }
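        // Note: closing the writer (here via try-with-resources) flushes the final
        // row group and writes the Parquet footer; a short or truncated footer is
        // what typically surfaces as an EOF when the file is read back.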

        // Verify file was written
        org.apache.hadoop.fs.FileStatus status = fs.getFileStatus(path);
        System.out.println("  File size: " + status.getLen() + " bytes");

        fs.close();
    }
}