6 changed files with 0 additions and 1498 deletions

- 72  test/java/spark/src/test/java/seaweed/spark/DirectFileReadTest.java
- 299 test/java/spark/src/test/java/seaweed/spark/ParquetMemoryComparisonTest.java
- 214 test/java/spark/src/test/java/seaweed/spark/ShadowVsLocalOnlyComparisonTest.java
- 264 test/java/spark/src/test/java/seaweed/spark/SparkSQLReadDifferenceTest.java
- 306 test/java/spark/src/test/java/seaweed/spark/SparkShadowComparisonTest.java
- 343 test/java/spark/src/test/java/seaweed/spark/SparkShadowReadComparisonTest.java
@@ -1,72 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/DirectFileReadTest.java

package seaweed.spark;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.Test;

import static org.junit.Assert.*;

/**
 * Test reading LOCAL_ONLY files directly via the file:// protocol
 * to verify the files themselves are valid.
 */
public class DirectFileReadTest extends SparkTestBase {

    @Test
    public void testReadLocalOnlyFileDirectly() {
        skipIfTestsDisabled();

        // First write using LOCAL_ONLY mode (through the SeaweedFS path)
        java.util.List<SparkSQLTest.Employee> employees = java.util.Arrays.asList(
                new SparkSQLTest.Employee(1, "Alice", "Engineering", 100000),
                new SparkSQLTest.Employee(2, "Bob", "Sales", 80000),
                new SparkSQLTest.Employee(3, "Charlie", "Engineering", 120000),
                new SparkSQLTest.Employee(4, "David", "Sales", 75000));

        Dataset<Row> df = spark.createDataFrame(employees, SparkSQLTest.Employee.class);

        String tablePath = getTestPath("employees_direct_test");
        df.write().mode(org.apache.spark.sql.SaveMode.Overwrite).parquet(tablePath);

        System.out.println("✅ Write completed to: " + tablePath);

        // Now try to read the LOCAL_ONLY .debug file directly using the file:// protocol.
        // This bypasses LocalOnlyInputStream and uses the native file system.
        String debugFilePath = "file:///workspace/target/debug-local/";

        try {
            // List files in the debug directory
            java.io.File debugDir = new java.io.File("/workspace/target/debug-local/");
            java.io.File[] files = debugDir.listFiles((dir, name) -> name.endsWith(".parquet.debug"));

            if (files != null && files.length > 0) {
                String localFile = "file://" + files[0].getAbsolutePath();
                System.out.println("📁 Found LOCAL_ONLY file: " + localFile);
                System.out.println("📁 File size: " + files[0].length() + " bytes");

                // Try to read it directly
                Dataset<Row> directRead = spark.read().parquet(localFile);
                long count = directRead.count();
                System.out.println("✅ Direct read successful! Row count: " + count);

                // Try a SQL query on it
                directRead.createOrReplaceTempView("employees_direct");
                Dataset<Row> filtered = spark.sql(
                        "SELECT name, salary FROM employees_direct WHERE department = 'Engineering'");
                long engineeringCount = filtered.count();
                System.out.println("✅ SQL query successful! Engineering employees: " + engineeringCount);

                assertEquals("Should have 2 engineering employees", 2, engineeringCount);

            } else {
                fail("No .debug files found in /workspace/target/debug-local/");
            }

        } catch (Exception e) {
            System.err.println("❌ Direct read failed: " + e.getMessage());
            e.printStackTrace();
            throw new RuntimeException("Direct file read failed", e);
        }
    }
}

@@ -1,299 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/ParquetMemoryComparisonTest.java

package seaweed.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

import static org.junit.Assert.*;

/**
 * Test to compare an in-memory Parquet file with a SeaweedFS-stored Parquet file
 * to identify what metadata differences cause the 78-byte EOF error.
 */
public class ParquetMemoryComparisonTest extends SparkTestBase {

    private static final String SCHEMA_STRING =
            "message Employee { " +
            "  required int32 id; " +
            "  required binary name (UTF8); " +
            "  required int32 age; " +
            "}";

    private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(SCHEMA_STRING);

    private FileSystem localFs;
    private FileSystem seaweedFs;

    @Before
    public void setUp() throws Exception {
        if (!TESTS_ENABLED) {
            return;
        }

        Configuration conf = new Configuration();

        // Local filesystem
        localFs = FileSystem.getLocal(conf);

        // SeaweedFS
        conf.set("fs.seaweedfs.impl", "seaweed.hdfs.SeaweedFileSystem");
        conf.set("fs.seaweed.filer.host", SEAWEEDFS_HOST);
        conf.set("fs.seaweed.filer.port", SEAWEEDFS_PORT);
        seaweedFs = FileSystem.get(java.net.URI.create("seaweedfs://" + SEAWEEDFS_HOST + ":" + SEAWEEDFS_PORT), conf);

        System.out.println("=== Test Setup Complete ===");
        System.out.println("Local FS: " + localFs.getClass().getName());
        System.out.println("SeaweedFS: " + seaweedFs.getClass().getName());
    }

    @After
    public void tearDown() throws Exception {
        if (localFs != null) {
            localFs.close();
        }
        if (seaweedFs != null) {
            seaweedFs.close();
        }
    }

    @Test
    public void testCompareMemoryVsSeaweedFSParquet() throws Exception {
        if (!TESTS_ENABLED) {
            System.out.println("Skipping test - SEAWEEDFS_TEST_ENABLED not set");
            return;
        }

        System.out.println("\n=== PARQUET MEMORY vs SEAWEEDFS COMPARISON TEST ===\n");

        // 1. Write identical Parquet files to local temp and SeaweedFS
        Path localPath = new Path("/tmp/test-local-" + System.currentTimeMillis() + ".parquet");
        Path seaweedPath = new Path("seaweedfs://" + SEAWEEDFS_HOST + ":" + SEAWEEDFS_PORT +
                "/test-spark/comparison-test.parquet");

        System.out.println("Writing to local: " + localPath);
        System.out.println("Writing to SeaweedFS: " + seaweedPath);

        // Write the same data to both locations
        writeTestParquetFile(localFs, localPath);
        writeTestParquetFile(seaweedFs, seaweedPath);

        System.out.println("\n=== Files Written Successfully ===\n");

        // 2. Read raw bytes from both files
        byte[] localBytes = readAllBytes(localFs, localPath);
        byte[] seaweedBytes = readAllBytes(seaweedFs, seaweedPath);

        System.out.println("Local file size: " + localBytes.length + " bytes");
        System.out.println("SeaweedFS file size: " + seaweedBytes.length + " bytes");

        // 3. Compare byte-by-byte
        if (localBytes.length != seaweedBytes.length) {
            System.out.println("\n❌ SIZE MISMATCH!");
            System.out.println("Difference: " + Math.abs(localBytes.length - seaweedBytes.length) + " bytes");
        } else {
            System.out.println("\n✅ Sizes match!");
        }

        // Find the first difference
        int firstDiff = -1;
        int minLen = Math.min(localBytes.length, seaweedBytes.length);
        for (int i = 0; i < minLen; i++) {
            if (localBytes[i] != seaweedBytes[i]) {
                firstDiff = i;
                break;
            }
        }

        if (firstDiff >= 0) {
            System.out.println("\n❌ CONTENT DIFFERS at byte offset: " + firstDiff);
            System.out.println("Context (20 bytes before and after):");
            printByteContext(localBytes, seaweedBytes, firstDiff, 20);
        } else if (localBytes.length == seaweedBytes.length) {
            System.out.println("\n✅ Files are IDENTICAL!");
        }

        // 4. Parse Parquet metadata from both
        System.out.println("\n=== Parquet Metadata Comparison ===\n");

        ParquetMetadata localMeta = readParquetMetadata(localFs, localPath);
        ParquetMetadata seaweedMeta = readParquetMetadata(seaweedFs, seaweedPath);

        System.out.println("Local metadata:");
        printParquetMetadata(localMeta);

        System.out.println("\nSeaweedFS metadata:");
        printParquetMetadata(seaweedMeta);

        // 5. Try reading both files with the Parquet reader
        System.out.println("\n=== Reading Files with ParquetFileReader ===\n");

        try {
            System.out.println("Reading local file...");
            int localRows = countParquetRows(localFs, localPath);
            System.out.println("✅ Local file: " + localRows + " rows read successfully");
        } catch (Exception e) {
            System.out.println("❌ Local file read failed: " + e.getMessage());
            e.printStackTrace();
        }

        try {
            System.out.println("\nReading SeaweedFS file...");
            int seaweedRows = countParquetRows(seaweedFs, seaweedPath);
            System.out.println("✅ SeaweedFS file: " + seaweedRows + " rows read successfully");
        } catch (Exception e) {
            System.out.println("❌ SeaweedFS file read failed: " + e.getMessage());
            System.out.println("Error type: " + e.getClass().getName());
            if (e.getMessage() != null && e.getMessage().contains("bytes left")) {
                System.out.println("🎯 THIS IS THE 78-BYTE EOF ERROR!");
            }
            e.printStackTrace();
        }

        // Cleanup
        localFs.delete(localPath, false);
        seaweedFs.delete(seaweedPath, false);

        System.out.println("\n=== Test Complete ===\n");
    }

    private void writeTestParquetFile(FileSystem fs, Path path) throws IOException {
        Configuration conf = fs.getConf();
        GroupWriteSupport.setSchema(SCHEMA, conf);

        try (ParquetWriter<Group> writer = org.apache.parquet.hadoop.example.ExampleParquetWriter.builder(path)
                .withConf(conf)
                .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE)
                .build()) {

            SimpleGroupFactory factory = new SimpleGroupFactory(SCHEMA);

            // Write 3 rows (same as the Spark test)
            Group group1 = factory.newGroup()
                    .append("id", 1)
                    .append("name", "Alice")
                    .append("age", 30);
            writer.write(group1);

            Group group2 = factory.newGroup()
                    .append("id", 2)
                    .append("name", "Bob")
                    .append("age", 25);
            writer.write(group2);

            Group group3 = factory.newGroup()
                    .append("id", 3)
                    .append("name", "Charlie")
                    .append("age", 35);
            writer.write(group3);
        }
    }

    private byte[] readAllBytes(FileSystem fs, Path path) throws IOException {
        try (FSDataInputStream in = fs.open(path)) {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int bytesRead;
            while ((bytesRead = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, bytesRead);
            }
            return buffer.toByteArray();
        }
    }

    private ParquetMetadata readParquetMetadata(FileSystem fs, Path path) throws IOException {
        Configuration conf = fs.getConf();
        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
            return reader.getFooter();
        }
    }

    private void printParquetMetadata(ParquetMetadata meta) {
        System.out.println("  Blocks: " + meta.getBlocks().size());
        meta.getBlocks().forEach(block -> {
            System.out.println("  Block rows: " + block.getRowCount());
            System.out.println("  Block total size: " + block.getTotalByteSize());
            System.out.println("  Block compressed size: " + block.getCompressedSize());
            System.out.println("  Columns: " + block.getColumns().size());
            block.getColumns().forEach(col -> {
                System.out.println("    Column: " + col.getPath());
                System.out.println("      Starting pos: " + col.getStartingPos());
                System.out.println("      Total size: " + col.getTotalSize());
                System.out.println("      Total uncompressed: " + col.getTotalUncompressedSize());
            });
        });
    }

    private int countParquetRows(FileSystem fs, Path path) throws IOException {
        Configuration conf = fs.getConf();
        int rowCount = 0;
        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
            org.apache.parquet.column.page.PageReadStore pages;
            while ((pages = reader.readNextRowGroup()) != null) {
                rowCount += pages.getRowCount();
            }
        }
        return rowCount;
    }

    private void printByteContext(byte[] local, byte[] seaweed, int offset, int context) {
        int start = Math.max(0, offset - context);
        int endLocal = Math.min(local.length, offset + context);
        int endSeaweed = Math.min(seaweed.length, offset + context);

        System.out.println("\nLocal bytes [" + start + " to " + endLocal + "]:");
        printHexDump(local, start, endLocal, offset);

        System.out.println("\nSeaweedFS bytes [" + start + " to " + endSeaweed + "]:");
        printHexDump(seaweed, start, endSeaweed, offset);
    }

    private void printHexDump(byte[] bytes, int start, int end, int highlight) {
        StringBuilder hex = new StringBuilder();
        StringBuilder ascii = new StringBuilder();

        for (int i = start; i < end; i++) {
            if (i > start && i % 16 == 0) {
                System.out.printf("%04x: %-48s %s\n", i - 16, hex.toString(), ascii.toString());
                hex.setLength(0);
                ascii.setLength(0);
            }

            byte b = bytes[i];
            String hexStr = String.format("%02x ", b & 0xFF);
            if (i == highlight) {
                hexStr = "[" + hexStr.trim() + "] ";
            }
            hex.append(hexStr);

            char c = (b >= 32 && b < 127) ? (char) b : '.';
            if (i == highlight) {
                ascii.append('[').append(c).append(']');
            } else {
                ascii.append(c);
            }
        }

        if (hex.length() > 0) {
            System.out.printf("%04x: %-48s %s\n", (end / 16) * 16, hex.toString(), ascii.toString());
        }
    }
}

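Note: the "78 bytes left" failure above is reported against the Parquet trailer, so a quick standalone sanity check (not part of the deleted tests, and assuming a plain local copy of the file) is to inspect the last 8 bytes directly: a Parquet file ends with a 4-byte little-endian footer length followed by the 4-byte magic "PAR1". A minimal sketch:

// Hypothetical helper, not from the deleted suite: prints the Parquet trailer of a local file.
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class ParquetTrailerCheck {
    public static void main(String[] args) throws Exception {
        try (RandomAccessFile raf = new RandomAccessFile(args[0], "r")) {
            byte[] trailer = new byte[8];
            raf.seek(raf.length() - 8);          // last 8 bytes: footer length + magic
            raf.readFully(trailer);
            String magic = new String(trailer, 4, 4, "US-ASCII");
            int footerLen = ByteBuffer.wrap(trailer, 0, 4)
                    .order(ByteOrder.LITTLE_ENDIAN).getInt();
            System.out.println("magic=" + magic + " footerLength=" + footerLen
                    + " fileLength=" + raf.length());
        }
    }
}

If the magic is intact and the footer length is consistent with the file length, the bytes on disk are fine and the EOF is coming from how the stream reports length or position, not from the file itself.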
@@ -1,214 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/ShadowVsLocalOnlyComparisonTest.java

package seaweed.spark;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.*;

/**
 * CRITICAL TEST: Compare the shadow file (reference) with LOCAL_ONLY mode output.
 *
 * This test:
 * 1. Writes with SHADOW mode enabled -> produces the reference file
 * 2. Writes with LOCAL_ONLY mode -> produces the local-only file
 * 3. Compares the two files byte-by-byte
 * 4. Attempts to read both with Spark SQL
 */
public class ShadowVsLocalOnlyComparisonTest extends SparkTestBase {

    private String shadowDir;
    private String localOnlyDir;

    @Before
    public void setUp() throws Exception {
        super.setUpSpark();
        shadowDir = "/workspace/target/shadow-comparison";
        localOnlyDir = "/workspace/target/local-only-comparison";

        // Clean up previous runs
        deleteDirectory(new File(shadowDir));
        deleteDirectory(new File(localOnlyDir));

        new File(shadowDir).mkdirs();
        new File(localOnlyDir).mkdirs();
    }

    @After
    public void tearDown() throws Exception {
        super.tearDownSpark();
    }

    @Test
    public void testShadowVsLocalOnlyComparison() throws IOException {
        skipIfTestsDisabled();

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  CRITICAL: Shadow vs LOCAL_ONLY Comparison");
        System.out.println("════════════════════════════════════════════════════════════════");

        List<Employee> employees = Arrays.asList(
                new Employee(1, "Alice", "Engineering", 100000),
                new Employee(2, "Bob", "Sales", 80000),
                new Employee(3, "Charlie", "Engineering", 120000),
                new Employee(4, "David", "Sales", 75000));

        Dataset<Row> df = spark.createDataFrame(employees, Employee.class);

        // PHASE 1: Write with SHADOW mode
        System.out.println("\n=== PHASE 1: Write with SHADOW mode (creates reference) ===");
        System.setProperty("SEAWEEDFS_SHADOW_MODE", "true");
        System.setProperty("SEAWEEDFS_DEBUG_MODE", "SEAWEED_ONLY");
        spark.conf().set("fs.seaweedfs.shadow.dir", shadowDir);

        String shadowOutputPath = "seaweedfs://seaweedfs-filer:8888/test-spark/shadow-test/employees";
        df.write().mode(SaveMode.Overwrite).parquet(shadowOutputPath);

        File[] shadowFiles = new File(shadowDir).listFiles((dir, name) -> name.endsWith(".shadow"));
        assertNotNull("Shadow files should exist", shadowFiles);
        assertTrue("Should have at least one shadow file", shadowFiles.length > 0);
        File shadowFile = shadowFiles[0];
        System.out.println("Shadow file: " + shadowFile.getName() + " (" + shadowFile.length() + " bytes)");

        // PHASE 2: Write with LOCAL_ONLY mode
        System.out.println("\n=== PHASE 2: Write with LOCAL_ONLY mode ===");
        System.setProperty("SEAWEEDFS_SHADOW_MODE", "false");
        System.setProperty("SEAWEEDFS_DEBUG_MODE", "LOCAL_ONLY");
        spark.conf().set("fs.seaweedfs.debug.dir", localOnlyDir);

        String localOnlyOutputPath = "seaweedfs://seaweedfs-filer:8888/test-spark/local-only-test/employees";
        df.write().mode(SaveMode.Overwrite).parquet(localOnlyOutputPath);

        File[] localOnlyFiles = new File(localOnlyDir).listFiles((dir, name) -> name.endsWith(".debug"));
        assertNotNull("LOCAL_ONLY files should exist", localOnlyFiles);
        assertTrue("Should have at least one LOCAL_ONLY file", localOnlyFiles.length > 0);
        File localOnlyFile = localOnlyFiles[0];
        System.out.println("LOCAL_ONLY file: " + localOnlyFile.getName() + " (" + localOnlyFile.length() + " bytes)");

        // PHASE 3: Compare files byte-by-byte
        System.out.println("\n=== PHASE 3: Compare files byte-by-byte ===");
        assertEquals("File sizes should match", shadowFile.length(), localOnlyFile.length());

        byte[] shadowBytes = Files.readAllBytes(shadowFile.toPath());
        byte[] localOnlyBytes = Files.readAllBytes(localOnlyFile.toPath());

        System.out.println("Comparing " + shadowBytes.length + " bytes...");

        // Compare byte-by-byte and report the first difference
        boolean identical = true;
        for (int i = 0; i < shadowBytes.length; i++) {
            if (shadowBytes[i] != localOnlyBytes[i]) {
                identical = false;
                System.err.println("❌ FIRST DIFFERENCE at byte " + i + ":");
                System.err.println("  Shadow:     0x" + String.format("%02x", shadowBytes[i] & 0xFF));
                System.err.println("  LOCAL_ONLY: 0x" + String.format("%02x", localOnlyBytes[i] & 0xFF));

                // Show context
                int contextStart = Math.max(0, i - 10);
                int contextEnd = Math.min(shadowBytes.length, i + 10);
                System.err.println("  Context (shadow):");
                for (int j = contextStart; j < contextEnd; j++) {
                    System.err.print(String.format("%02x ", shadowBytes[j] & 0xFF));
                }
                System.err.println();
                System.err.println("  Context (local_only):");
                for (int j = contextStart; j < contextEnd; j++) {
                    System.err.print(String.format("%02x ", localOnlyBytes[j] & 0xFF));
                }
                System.err.println();
                break;
            }
        }

        if (identical) {
            System.out.println("✅ Files are IDENTICAL!");
        } else {
            fail("Files are NOT identical");
        }

        // PHASE 4: Try reading the shadow file with Spark
        System.out.println("\n=== PHASE 4: Try reading shadow file with Spark ===");
        try {
            // Copy the shadow file to a location Spark can read
            String testPath = "file://" + shadowDir + "/test.parquet";
            Files.copy(shadowFile.toPath(), new File(shadowDir + "/test.parquet").toPath());

            Dataset<Row> shadowDf = spark.read().parquet(testPath);
            shadowDf.createOrReplaceTempView("shadow_test");
            Dataset<Row> shadowResult = spark.sql("SELECT * FROM shadow_test WHERE department = 'Engineering'");
            System.out.println("✅ Shadow file SQL query: " + shadowResult.count() + " rows");
        } catch (Exception e) {
            System.err.println("❌ Shadow file SQL query FAILED: " + e.getMessage());
            e.printStackTrace();
        }

        // PHASE 5: Try reading the LOCAL_ONLY file with Spark
        System.out.println("\n=== PHASE 5: Try reading LOCAL_ONLY file with Spark ===");
        try {
            Dataset<Row> localOnlyDf = spark.read().parquet(localOnlyOutputPath);
            localOnlyDf.createOrReplaceTempView("local_only_test");
            Dataset<Row> localOnlyResult = spark.sql("SELECT * FROM local_only_test WHERE department = 'Engineering'");
            System.out.println("✅ LOCAL_ONLY SQL query: " + localOnlyResult.count() + " rows");
        } catch (Exception e) {
            System.err.println("❌ LOCAL_ONLY SQL query FAILED: " + e.getMessage());
            assertTrue("Expected 78-byte EOF error", e.getMessage().contains("78 bytes left"));
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  Comparison complete. See logs for details.");
        System.out.println("════════════════════════════════════════════════════════════════");
    }

    private void deleteDirectory(File dir) {
        if (dir.exists()) {
            File[] files = dir.listFiles();
            if (files != null) {
                for (File file : files) {
                    if (file.isDirectory()) {
                        deleteDirectory(file);
                    } else {
                        file.delete();
                    }
                }
            }
            dir.delete();
        }
    }

    public static class Employee implements java.io.Serializable {
        private int id;
        private String name;
        private String department;
        private int salary;

        public Employee() {}

        public Employee(int id, String name, String department, int salary) {
            this.id = id;
            this.name = name;
            this.department = department;
            this.salary = salary;
        }

        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getDepartment() { return department; }
        public void setDepartment(String department) { this.department = department; }
        public int getSalary() { return salary; }
        public void setSalary(int salary) { this.salary = salary; }
    }
}

@@ -1,264 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/SparkSQLReadDifferenceTest.java

package seaweed.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.*;

/**
 * CRITICAL DIAGNOSTIC TEST: Compare the exact sequence of FileSystem operations
 * between RawLocalFS (works) and LOCAL_ONLY (fails) during SQL query execution.
 *
 * This test will help us understand what's different about how Spark SQL
 * interacts with SeaweedFileSystem vs RawLocalFileSystem.
 */
public class SparkSQLReadDifferenceTest extends SparkTestBase {

    private String rawLocalDir;
    private String localOnlyDir;
    private FileSystem rawLocalFs;

    @Before
    public void setUp() throws Exception {
        // Enable detailed logging
        System.setProperty("seaweedfs.detailed.logging", "true");
        super.setUpSpark();

        // Set up the RawLocalFileSystem directory
        rawLocalDir = "/tmp/spark-sql-diff-rawlocal-" + System.currentTimeMillis();
        new File(rawLocalDir).mkdirs();

        Configuration conf = spark.sparkContext().hadoopConfiguration();
        rawLocalFs = new RawLocalFileSystem();
        rawLocalFs.initialize(new URI("file:///"), conf);
        rawLocalFs.delete(new Path(rawLocalDir), true);
        rawLocalFs.mkdirs(new Path(rawLocalDir));

        // Set up the LOCAL_ONLY directory
        localOnlyDir = "/workspace/target/debug-sql-diff";
        new File(localOnlyDir).mkdirs();
        for (File f : new File(localOnlyDir).listFiles()) {
            f.delete();
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  SQL READ DIFFERENCE TEST: RawLocalFS vs LOCAL_ONLY");
        System.out.println("════════════════════════════════════════════════════════════════");
    }

    @After
    public void tearDown() throws Exception {
        if (rawLocalFs != null) {
            rawLocalFs.delete(new Path(rawLocalDir), true);
            rawLocalFs.close();
        }
        super.tearDownSpark();
    }

    @Test
    public void testSQLReadDifference() throws IOException {
        // Create test data
        List<Employee> employees = Arrays.asList(
                new Employee(1, "Alice", "Engineering", 100000),
                new Employee(2, "Bob", "Sales", 80000),
                new Employee(3, "Charlie", "Engineering", 120000),
                new Employee(4, "David", "Sales", 75000));

        Dataset<Row> df = spark.createDataFrame(employees, Employee.class);

        // ========================================================================
        // PART 1: RawLocalFS - SQL Query (WORKS)
        // ========================================================================
        System.out.println("\n" + "=".repeat(70));
        System.out.println("PART 1: RawLocalFS - SQL Query (Expected to WORK)");
        System.out.println("=".repeat(70));

        String rawLocalPath = "file://" + rawLocalDir + "/employees";
        System.out.println("Writing to: " + rawLocalPath);
        df.write().mode(SaveMode.Overwrite).parquet(rawLocalPath);
        System.out.println("✅ Write completed\n");

        System.out.println("--- Executing SQL Query on RawLocalFS ---");
        try {
            Dataset<Row> rawDf = spark.read().parquet(rawLocalPath);
            System.out.println("✅ Initial read successful");

            rawDf.createOrReplaceTempView("employees_raw");
            System.out.println("✅ Temp view created");

            System.out.println("\nExecuting: SELECT name, salary FROM employees_raw WHERE department = 'Engineering'");
            Dataset<Row> rawResult = spark.sql("SELECT name, salary FROM employees_raw WHERE department = 'Engineering'");

            System.out.println("Triggering execution with count()...");
            long rawCount = rawResult.count();

            System.out.println("✅ RawLocalFS SQL query SUCCESSFUL! Row count: " + rawCount);
            assertEquals("Should have 2 engineering employees", 2, rawCount);

            System.out.println("\n✅ ✅ ✅ RawLocalFS: ALL OPERATIONS SUCCESSFUL ✅ ✅ ✅\n");
        } catch (Exception e) {
            System.err.println("❌ RawLocalFS SQL query FAILED (unexpected!): " + e.getMessage());
            e.printStackTrace();
            fail("RawLocalFS should not fail!");
        }

        // ========================================================================
        // PART 2: LOCAL_ONLY - SQL Query (FAILS)
        // ========================================================================
        System.out.println("\n" + "=".repeat(70));
        System.out.println("PART 2: LOCAL_ONLY - SQL Query (Expected to FAIL with 78-byte error)");
        System.out.println("=".repeat(70));

        // Enable LOCAL_ONLY mode
        System.setProperty("SEAWEEDFS_DEBUG_MODE", "LOCAL_ONLY");
        spark.sparkContext().hadoopConfiguration().set("fs.seaweedfs.debug.dir", localOnlyDir);

        String localOnlyPath = getTestPath("employees_localonly");
        System.out.println("Writing to: " + localOnlyPath);
        df.write().mode(SaveMode.Overwrite).parquet(localOnlyPath);
        System.out.println("✅ Write completed\n");

        System.out.println("--- Executing SQL Query on LOCAL_ONLY ---");
        try {
            Dataset<Row> localDf = spark.read().parquet(localOnlyPath);
            System.out.println("✅ Initial read successful");

            localDf.createOrReplaceTempView("employees_local");
            System.out.println("✅ Temp view created");

            System.out.println("\nExecuting: SELECT name, salary FROM employees_local WHERE department = 'Engineering'");
            Dataset<Row> localResult = spark.sql("SELECT name, salary FROM employees_local WHERE department = 'Engineering'");

            System.out.println("Triggering execution with count()...");
            long localCount = localResult.count();

            System.out.println("✅ LOCAL_ONLY SQL query SUCCESSFUL! Row count: " + localCount);
            assertEquals("Should have 2 engineering employees", 2, localCount);

            System.out.println("\n✅ ✅ ✅ LOCAL_ONLY: ALL OPERATIONS SUCCESSFUL ✅ ✅ ✅\n");
        } catch (Exception e) {
            System.err.println("\n❌❌❌ LOCAL_ONLY SQL query FAILED ❌❌❌");
            System.err.println("Error: " + e.getMessage());

            if (e.getMessage() != null && e.getMessage().contains("78 bytes")) {
                System.err.println("\n🎯 CONFIRMED: 78-byte EOF error!");
                System.err.println("This error occurs during SQL query execution on LOCAL_ONLY mode.");
            }

            System.err.println("\nFull stack trace:");
            e.printStackTrace();

            System.err.println("\n" + "=".repeat(70));
            System.err.println("ANALYSIS: Comparing RawLocalFS (works) vs LOCAL_ONLY (fails)");
            System.err.println("=".repeat(70));
            System.err.println();
            System.err.println("Both tests:");
            System.err.println("  - Write identical data (same DataFrame)");
            System.err.println("  - Execute identical SQL query");
            System.err.println("  - Use identical Spark configuration");
            System.err.println();
            System.err.println("Key differences:");
            System.err.println("  1. Path scheme:");
            System.err.println("     - RawLocalFS: file:///tmp/...");
            System.err.println("     - LOCAL_ONLY: seaweedfs://seaweedfs-filer:8888/...");
            System.err.println();
            System.err.println("  2. FileSystem implementation:");
            System.err.println("     - RawLocalFS: Hadoop's native RawLocalFileSystem");
            System.err.println("     - LOCAL_ONLY: SeaweedFileSystem (but writes to local disk)");
            System.err.println();
            System.err.println("  3. InputStream type:");
            System.err.println("     - RawLocalFS: LocalFSFileInputStream");
            System.err.println("     - LOCAL_ONLY: SeaweedHadoopInputStream -> LocalOnlyInputStream");
            System.err.println();
            System.err.println("The 78-byte error suggests that:");
            System.err.println("  - Spark SQL expects to read 78 more bytes");
            System.err.println("  - But the InputStream reports EOF");
            System.err.println("  - This happens even though the file is correct (1260 bytes)");
            System.err.println();
            System.err.println("Possible causes:");
            System.err.println("  1. getFileStatus() returns wrong file size");
            System.err.println("  2. InputStream.available() returns wrong value");
            System.err.println("  3. Seek operations don't work correctly");
            System.err.println("  4. Multiple InputStreams interfere with each other");
            System.err.println("  5. Metadata is cached incorrectly between operations");
            System.err.println();

            // Don't fail the test - we want to see the full output
            // fail("LOCAL_ONLY failed as expected");
        }

        // ========================================================================
        // PART 3: Compare Files
        // ========================================================================
        System.out.println("\n" + "=".repeat(70));
        System.out.println("PART 3: File Comparison");
        System.out.println("=".repeat(70));

        File rawLocalParquetDir = new File(rawLocalDir + "/employees");
        File[] rawLocalFiles = rawLocalParquetDir.listFiles((dir, name) -> name.endsWith(".parquet"));

        File[] localOnlyFiles = new File(localOnlyDir).listFiles((dir, name) -> name.endsWith(".parquet.debug"));

        if (rawLocalFiles != null && rawLocalFiles.length > 0 &&
                localOnlyFiles != null && localOnlyFiles.length > 0) {

            File rawFile = rawLocalFiles[0];
            File localFile = localOnlyFiles[0];

            System.out.println("\nRawLocalFS file: " + rawFile.getName() + " (" + rawFile.length() + " bytes)");
            System.out.println("LOCAL_ONLY file: " + localFile.getName() + " (" + localFile.length() + " bytes)");

            if (rawFile.length() == localFile.length()) {
                System.out.println("✅ File sizes match!");
            } else {
                System.out.println("❌ File size mismatch: " + (rawFile.length() - localFile.length()) + " bytes");
            }
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  TEST COMPLETE - Check logs above for differences");
        System.out.println("════════════════════════════════════════════════════════════════");
    }

    // Employee class for the Spark DataFrame
    public static class Employee implements java.io.Serializable {
        private int id;
        private String name;
        private String department;
        private int salary;

        public Employee() {} // Required for Spark

        public Employee(int id, String name, String department, int salary) {
            this.id = id;
            this.name = name;
            this.department = department;
            this.salary = salary;
        }

        // Getters and setters (required for Spark)
        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getDepartment() { return department; }
        public void setDepartment(String department) { this.department = department; }
        public int getSalary() { return salary; }
        public void setSalary(int salary) { this.salary = salary; }
    }
}

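The first possible cause listed in the analysis above (getFileStatus() reporting the wrong length) can be checked in isolation with a small standalone Hadoop sketch, separate from the deleted test and assuming the same reachable filer at seaweedfs://seaweedfs-filer:8888; the file path argument is a placeholder:

// Hypothetical cross-check: compare the length the FileSystem reports with the bytes the stream delivers.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

public class ReportedVsReadableLength {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("seaweedfs://seaweedfs-filer:8888"), conf);
        Path p = new Path(args[0]);                       // e.g. a part-*.parquet file
        long reported = fs.getFileStatus(p).getLen();     // what the Parquet reader plans around
        long readable = 0;
        byte[] buf = new byte[8192];
        try (FSDataInputStream in = fs.open(p)) {
            int n;
            while ((n = in.read(buf)) != -1) {
                readable += n;                            // what the stream actually yields
            }
        }
        // A non-zero delta would explain an EOF "with N bytes left" during footer reads.
        System.out.println("reported=" + reported + " readable=" + readable
                + " delta=" + (reported - readable));
    }
}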
@@ -1,306 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/SparkShadowComparisonTest.java

package seaweed.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.*;

/**
 * CRITICAL COMPARISON TEST: Use RawLocalFileSystem as a "shadow" to compare
 * all I/O operations with LOCAL_ONLY mode.
 *
 * This test writes the same data to both:
 * 1. RawLocalFileSystem (file://) - known to work
 * 2. SeaweedFS LOCAL_ONLY mode (seaweedfs://) - has the 78-byte error
 *
 * Then it compares the resulting files byte-by-byte to find the exact difference.
 */
public class SparkShadowComparisonTest extends SparkTestBase {

    private String rawLocalDir;
    private String localOnlyDir;
    private FileSystem rawLocalFs;

    @Before
    public void setUp() throws Exception {
        super.setUpSpark();

        // Set up the RawLocalFileSystem directory
        rawLocalDir = "/tmp/spark-shadow-rawlocal-" + System.currentTimeMillis();
        new File(rawLocalDir).mkdirs();

        Configuration conf = spark.sparkContext().hadoopConfiguration();
        rawLocalFs = new RawLocalFileSystem();
        rawLocalFs.initialize(new URI("file:///"), conf);
        rawLocalFs.delete(new Path(rawLocalDir), true);
        rawLocalFs.mkdirs(new Path(rawLocalDir));

        // Set up the LOCAL_ONLY directory (will be in the debug dir)
        localOnlyDir = "/workspace/target/debug-shadow";
        new File(localOnlyDir).mkdirs();

        // Clean up previous runs
        for (File f : new File(localOnlyDir).listFiles()) {
            f.delete();
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  SHADOW COMPARISON: RawLocalFS vs LOCAL_ONLY");
        System.out.println("════════════════════════════════════════════════════════════════");
        System.out.println("RawLocalFS directory: " + rawLocalDir);
        System.out.println("LOCAL_ONLY directory: " + localOnlyDir);
    }

    @After
    public void tearDown() throws Exception {
        if (rawLocalFs != null) {
            rawLocalFs.delete(new Path(rawLocalDir), true);
            rawLocalFs.close();
        }
        super.tearDownSpark();
    }

    @Test
    public void testShadowComparison() throws IOException {
        System.out.println("\n=== PHASE 1: Write to RawLocalFileSystem ===");

        // Create test data
        List<Employee> employees = Arrays.asList(
                new Employee(1, "Alice", "Engineering", 100000),
                new Employee(2, "Bob", "Sales", 80000),
                new Employee(3, "Charlie", "Engineering", 120000),
                new Employee(4, "David", "Sales", 75000));

        Dataset<Row> df = spark.createDataFrame(employees, Employee.class);

        // Write to RawLocalFileSystem
        String rawLocalPath = "file://" + rawLocalDir + "/employees";
        System.out.println("Writing to RawLocalFS: " + rawLocalPath);

        try {
            df.write().mode(SaveMode.Overwrite).parquet(rawLocalPath);
            System.out.println("✅ RawLocalFS write completed successfully!");
        } catch (Exception e) {
            System.err.println("❌ RawLocalFS write FAILED: " + e.getMessage());
            e.printStackTrace();
            fail("RawLocalFS write should not fail!");
        }

        // List files written by RawLocalFS
        File rawLocalParquetDir = new File(rawLocalDir + "/employees");
        File[] rawLocalFiles = rawLocalParquetDir.listFiles((dir, name) -> name.endsWith(".parquet"));
        assertNotNull("RawLocalFS should have written files", rawLocalFiles);
        assertTrue("RawLocalFS should have at least one parquet file", rawLocalFiles.length > 0);

        System.out.println("RawLocalFS wrote " + rawLocalFiles.length + " parquet file(s):");
        for (File f : rawLocalFiles) {
            System.out.println("  - " + f.getName() + " (" + f.length() + " bytes)");
        }

        System.out.println("\n=== PHASE 2: Write to LOCAL_ONLY mode ===");

        // Set the environment for LOCAL_ONLY mode
        System.setProperty("SEAWEEDFS_DEBUG_MODE", "LOCAL_ONLY");
        spark.sparkContext().hadoopConfiguration().set("fs.seaweedfs.debug.dir", localOnlyDir);

        // Write to LOCAL_ONLY
        String localOnlyPath = getTestPath("employees_localonly");
        System.out.println("Writing to LOCAL_ONLY: " + localOnlyPath);

        boolean localOnlyWriteSucceeded = false;
        try {
            df.write().mode(SaveMode.Overwrite).parquet(localOnlyPath);
            System.out.println("✅ LOCAL_ONLY write completed successfully!");
            localOnlyWriteSucceeded = true;
        } catch (Exception e) {
            System.err.println("⚠️ LOCAL_ONLY write completed but may have issues: " + e.getMessage());
            // Don't fail here - we want to compare files even if the write "succeeded"
        }

        // List files written by LOCAL_ONLY
        File[] localOnlyFiles = new File(localOnlyDir).listFiles((dir, name) -> name.endsWith(".debug"));
        if (localOnlyFiles == null || localOnlyFiles.length == 0) {
            System.err.println("❌ LOCAL_ONLY did not write any .debug files!");
            fail("LOCAL_ONLY should have written .debug files");
        }

        System.out.println("LOCAL_ONLY wrote " + localOnlyFiles.length + " .debug file(s):");
        for (File f : localOnlyFiles) {
            System.out.println("  - " + f.getName() + " (" + f.length() + " bytes)");
        }

        System.out.println("\n=== PHASE 3: Compare Files Byte-by-Byte ===");

        // Match files by pattern (both should have part-00000-*.snappy.parquet)
        File rawFile = rawLocalFiles[0]; // Should only be one file
        File localOnlyFile = null;

        // Find the .debug file that looks like a parquet file
        for (File f : localOnlyFiles) {
            if (f.getName().contains("part-") && f.getName().endsWith(".parquet.debug")) {
                localOnlyFile = f;
                break;
            }
        }

        if (localOnlyFile == null) {
            System.out.println("❌ Could not find LOCAL_ONLY parquet file!");
            System.out.println("Available .debug files:");
            for (File f : localOnlyFiles) {
                System.out.println("  - " + f.getName());
            }
            fail("LOCAL_ONLY should have written a parquet .debug file");
        }

        System.out.println("\nComparing:");
        System.out.println("  RawLocalFS: " + rawFile.getName() + " (" + rawFile.length() + " bytes)");
        System.out.println("  LOCAL_ONLY: " + localOnlyFile.getName() + " (" + localOnlyFile.length() + " bytes)");

        // Compare file sizes
        long sizeDiff = rawFile.length() - localOnlyFile.length();
        if (sizeDiff != 0) {
            System.out.println("  ⚠️ SIZE DIFFERENCE: " + sizeDiff + " bytes");
            System.out.println("  RawLocalFS is " + (sizeDiff > 0 ? "LARGER" : "SMALLER") + " by " + Math.abs(sizeDiff) + " bytes");

            if (Math.abs(sizeDiff) == 78) {
                System.out.println("  🎯 THIS IS THE 78-BYTE DIFFERENCE!");
            }
        } else {
            System.out.println("  ✅ File sizes match!");
        }

        // Compare file contents byte-by-byte
        byte[] rawBytes = Files.readAllBytes(rawFile.toPath());
        byte[] localOnlyBytes = Files.readAllBytes(localOnlyFile.toPath());

        int minLen = Math.min(rawBytes.length, localOnlyBytes.length);
        int firstDiffIndex = -1;

        for (int i = 0; i < minLen; i++) {
            if (rawBytes[i] != localOnlyBytes[i]) {
                firstDiffIndex = i;
                break;
            }
        }

        if (firstDiffIndex >= 0) {
            System.out.println("  ⚠️ CONTENT DIFFERS at byte offset: " + firstDiffIndex);
            System.out.println("  Showing 32 bytes around difference:");

            int start = Math.max(0, firstDiffIndex - 16);
            int end = Math.min(minLen, firstDiffIndex + 16);

            System.out.print("  RawLocalFS: ");
            for (int i = start; i < end; i++) {
                System.out.printf("%02X ", rawBytes[i]);
                if (i == firstDiffIndex) System.out.print("| ");
            }
            System.out.println();

            System.out.print("  LOCAL_ONLY: ");
            for (int i = start; i < end; i++) {
                System.out.printf("%02X ", localOnlyBytes[i]);
                if (i == firstDiffIndex) System.out.print("| ");
            }
            System.out.println();
        } else if (rawBytes.length == localOnlyBytes.length) {
            System.out.println("  ✅ File contents are IDENTICAL!");
        } else {
            System.out.println("  ⚠️ Files match up to " + minLen + " bytes, but differ in length");

            // Show the extra bytes
            if (rawBytes.length > localOnlyBytes.length) {
                System.out.println("  RawLocalFS has " + (rawBytes.length - minLen) + " extra bytes at end:");
                System.out.print("  ");
                for (int i = minLen; i < Math.min(rawBytes.length, minLen + 32); i++) {
                    System.out.printf("%02X ", rawBytes[i]);
                }
                System.out.println();
            } else {
                System.out.println("  LOCAL_ONLY has " + (localOnlyBytes.length - minLen) + " extra bytes at end:");
                System.out.print("  ");
                for (int i = minLen; i < Math.min(localOnlyBytes.length, minLen + 32); i++) {
                    System.out.printf("%02X ", localOnlyBytes[i]);
                }
                System.out.println();
            }
        }

        System.out.println("\n=== PHASE 4: Try Reading Both Files ===");

        // Try reading the RawLocalFS file
        System.out.println("\nReading from RawLocalFS:");
        try {
            Dataset<Row> rawDf = spark.read().parquet(rawLocalPath);
            long rawCount = rawDf.count();
            System.out.println("✅ RawLocalFS read successful! Row count: " + rawCount);
            assertEquals("Should have 4 employees", 4, rawCount);
        } catch (Exception e) {
            System.err.println("❌ RawLocalFS read FAILED: " + e.getMessage());
            e.printStackTrace();
            fail("RawLocalFS read should not fail!");
        }

        // Try reading the LOCAL_ONLY file
        System.out.println("\nReading from LOCAL_ONLY:");
        try {
            Dataset<Row> localOnlyDf = spark.read().parquet(localOnlyPath);
            long localOnlyCount = localOnlyDf.count();
            System.out.println("✅ LOCAL_ONLY read successful! Row count: " + localOnlyCount);
            assertEquals("Should have 4 employees", 4, localOnlyCount);
        } catch (Exception e) {
            System.err.println("❌ LOCAL_ONLY read FAILED: " + e.getMessage());
            if (e.getMessage() != null && e.getMessage().contains("78 bytes")) {
                System.err.println("🎯 CONFIRMED: 78-byte error occurs during READ, not WRITE!");
            }
            // Don't fail - we expect this to fail
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  SHADOW COMPARISON COMPLETE");
        System.out.println("════════════════════════════════════════════════════════════════");
    }

    // Employee class for the Spark DataFrame
    public static class Employee implements java.io.Serializable {
        private int id;
        private String name;
        private String department;
        private int salary;

        public Employee() {} // Required for Spark

        public Employee(int id, String name, String department, int salary) {
            this.id = id;
            this.name = name;
            this.department = department;
            this.salary = salary;
        }

        // Getters and setters (required for Spark)
        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getDepartment() { return department; }
        public void setDepartment(String department) { this.department = department; }
        public int getSalary() { return salary; }
        public void setSalary(int salary) { this.salary = salary; }
    }
}

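As an aside on the manual first-difference loop in PHASE 3 above: on Java 12 or newer the same check can be expressed with java.nio.file.Files.mismatch, which returns -1 for identical files and otherwise the offset of the first differing byte (or the shorter file's length when one file is a prefix of the other). A minimal standalone sketch, with placeholder paths:

// Hypothetical helper, not part of the deleted suite.
import java.nio.file.Files;
import java.nio.file.Path;

public class FirstDiff {
    public static void main(String[] args) throws Exception {
        Path raw = Path.of(args[0]);        // e.g. the RawLocalFS part file
        Path localOnly = Path.of(args[1]);  // e.g. the matching .parquet.debug file
        long mismatch = Files.mismatch(raw, localOnly);
        if (mismatch < 0) {
            System.out.println("Files are identical (" + Files.size(raw) + " bytes)");
        } else {
            System.out.println("First difference at byte offset " + mismatch);
        }
    }
}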
@@ -1,343 +0,0 @@ test/java/spark/src/test/java/seaweed/spark/SparkShadowReadComparisonTest.java

package seaweed.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.*;

/**
 * CRITICAL READ COMPARISON TEST: Compare all read operations between RawLocalFileSystem
 * and SeaweedFS LOCAL_ONLY mode.
 *
 * This test:
 * 1. Writes identical data to both RawLocalFS and LOCAL_ONLY
 * 2. Performs the same read operations on both
 * 3. Compares the results of each read operation
 * 4. Identifies where the divergence happens
 */
public class SparkShadowReadComparisonTest extends SparkTestBase {

    private String rawLocalDir;
    private String localOnlyDir;
    private FileSystem rawLocalFs;
    private FileSystem seaweedFs;
    private String rawLocalParquetFile;
    private String localOnlyParquetFile;

    @Before
    public void setUp() throws Exception {
        super.setUpSpark();

        // Set up the RawLocalFileSystem directory
        rawLocalDir = "/tmp/spark-shadow-read-rawlocal-" + System.currentTimeMillis();
        new File(rawLocalDir).mkdirs();

        Configuration conf = spark.sparkContext().hadoopConfiguration();
        rawLocalFs = new RawLocalFileSystem();
        rawLocalFs.initialize(new URI("file:///"), conf);
        rawLocalFs.delete(new Path(rawLocalDir), true);
        rawLocalFs.mkdirs(new Path(rawLocalDir));

        // Set up the LOCAL_ONLY directory
        localOnlyDir = "/workspace/target/debug-shadow-read";
        new File(localOnlyDir).mkdirs();
        for (File f : new File(localOnlyDir).listFiles()) {
            f.delete();
        }

        // Get the SeaweedFS instance
        seaweedFs = FileSystem.get(URI.create("seaweedfs://seaweedfs-filer:8888"), conf);

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  SHADOW READ COMPARISON: RawLocalFS vs LOCAL_ONLY");
        System.out.println("════════════════════════════════════════════════════════════════");
        System.out.println("RawLocalFS directory: " + rawLocalDir);
        System.out.println("LOCAL_ONLY directory: " + localOnlyDir);
    }

    @After
    public void tearDown() throws Exception {
        if (rawLocalFs != null) {
            rawLocalFs.delete(new Path(rawLocalDir), true);
            rawLocalFs.close();
        }
        super.tearDownSpark();
    }

    @Test
    public void testShadowReadComparison() throws IOException {
        System.out.println("\n=== PHASE 1: Write Identical Data to Both FileSystems ===");

        // Create test data
        List<Employee> employees = Arrays.asList(
                new Employee(1, "Alice", "Engineering", 100000),
                new Employee(2, "Bob", "Sales", 80000),
                new Employee(3, "Charlie", "Engineering", 120000),
                new Employee(4, "David", "Sales", 75000));

        Dataset<Row> df = spark.createDataFrame(employees, Employee.class);

        // Write to RawLocalFileSystem
        String rawLocalPath = "file://" + rawLocalDir + "/employees";
        System.out.println("Writing to RawLocalFS: " + rawLocalPath);
        df.write().mode(SaveMode.Overwrite).parquet(rawLocalPath);
        System.out.println("✅ RawLocalFS write completed");

        // Set the environment for LOCAL_ONLY mode
        System.setProperty("SEAWEEDFS_DEBUG_MODE", "LOCAL_ONLY");
        spark.sparkContext().hadoopConfiguration().set("fs.seaweedfs.debug.dir", localOnlyDir);

        // Write to LOCAL_ONLY
        String localOnlyPath = getTestPath("employees_read_test");
        System.out.println("Writing to LOCAL_ONLY: " + localOnlyPath);
        df.write().mode(SaveMode.Overwrite).parquet(localOnlyPath);
        System.out.println("✅ LOCAL_ONLY write completed");

        // Find the parquet files
        File rawLocalParquetDir = new File(rawLocalDir + "/employees");
        File[] rawLocalFiles = rawLocalParquetDir.listFiles((dir, name) -> name.endsWith(".parquet"));
        assertNotNull("RawLocalFS should have written files", rawLocalFiles);
        assertTrue("RawLocalFS should have at least one parquet file", rawLocalFiles.length > 0);
        rawLocalParquetFile = rawLocalFiles[0].getAbsolutePath();

        File[] localOnlyFiles = new File(localOnlyDir).listFiles((dir, name) -> name.endsWith(".parquet.debug"));
        assertNotNull("LOCAL_ONLY should have written files", localOnlyFiles);
        assertTrue("LOCAL_ONLY should have at least one parquet file", localOnlyFiles.length > 0);
        localOnlyParquetFile = localOnlyFiles[0].getAbsolutePath();

        System.out.println("RawLocalFS file: " + rawLocalParquetFile);
        System.out.println("LOCAL_ONLY file: " + localOnlyParquetFile);

        System.out.println("\n=== PHASE 2: Compare Low-Level Read Operations ===");

        // Open both files for reading
        FSDataInputStream rawStream = rawLocalFs.open(new Path(rawLocalParquetFile));

        // For LOCAL_ONLY, we read the .debug file directly using RawLocalFS
        // because it's just a local file
        FSDataInputStream localOnlyStream = rawLocalFs.open(new Path(localOnlyParquetFile));

        try {
            // Test 1: Read file length
            System.out.println("\n--- Test 1: File Length ---");
            long rawLength = rawLocalFs.getFileStatus(new Path(rawLocalParquetFile)).getLen();
            long localOnlyLength = rawLocalFs.getFileStatus(new Path(localOnlyParquetFile)).getLen();
            System.out.println("RawLocalFS length: " + rawLength);
            System.out.println("LOCAL_ONLY length: " + localOnlyLength);
            if (rawLength == localOnlyLength) {
                System.out.println("✅ Lengths match!");
            } else {
                System.out.println("❌ Length mismatch: " + (rawLength - localOnlyLength) + " bytes");
            }
            assertEquals("File lengths should match", rawLength, localOnlyLength);

            // Test 2: Read first 100 bytes
            System.out.println("\n--- Test 2: Read First 100 Bytes ---");
            byte[] rawBuffer1 = new byte[100];
            byte[] localOnlyBuffer1 = new byte[100];
            rawStream.readFully(0, rawBuffer1);
            localOnlyStream.readFully(0, localOnlyBuffer1);
            boolean firstBytesMatch = Arrays.equals(rawBuffer1, localOnlyBuffer1);
            System.out.println("First 100 bytes match: " + (firstBytesMatch ? "✅" : "❌"));
            if (!firstBytesMatch) {
                System.out.println("First difference at byte: " + findFirstDifference(rawBuffer1, localOnlyBuffer1));
            }
            assertTrue("First 100 bytes should match", firstBytesMatch);

            // Test 3: Read last 100 bytes (Parquet footer)
            System.out.println("\n--- Test 3: Read Last 100 Bytes (Parquet Footer) ---");
            byte[] rawBuffer2 = new byte[100];
            byte[] localOnlyBuffer2 = new byte[100];
            rawStream.readFully(rawLength - 100, rawBuffer2);
            localOnlyStream.readFully(localOnlyLength - 100, localOnlyBuffer2);
            boolean lastBytesMatch = Arrays.equals(rawBuffer2, localOnlyBuffer2);
            System.out.println("Last 100 bytes match: " + (lastBytesMatch ? "✅" : "❌"));
            if (!lastBytesMatch) {
                System.out.println("First difference at byte: " + findFirstDifference(rawBuffer2, localOnlyBuffer2));
                System.out.println("RawLocalFS last 20 bytes:");
                printHex(rawBuffer2, 80, 100);
                System.out.println("LOCAL_ONLY last 20 bytes:");
                printHex(localOnlyBuffer2, 80, 100);
            }
            assertTrue("Last 100 bytes should match", lastBytesMatch);

            // Test 4: Read the entire file
            System.out.println("\n--- Test 4: Read Entire File ---");
            byte[] rawFull = new byte[(int) rawLength];
            byte[] localOnlyFull = new byte[(int) localOnlyLength];
            rawStream.readFully(0, rawFull);
            localOnlyStream.readFully(0, localOnlyFull);
            boolean fullMatch = Arrays.equals(rawFull, localOnlyFull);
            System.out.println("Full file match: " + (fullMatch ? "✅" : "❌"));
            if (!fullMatch) {
                int firstDiff = findFirstDifference(rawFull, localOnlyFull);
                System.out.println("First difference at byte: " + firstDiff);
            }
            assertTrue("Full file should match", fullMatch);

            // Test 5: Sequential reads
            System.out.println("\n--- Test 5: Sequential Reads (10 bytes at a time) ---");
            rawStream.seek(0);
            localOnlyStream.seek(0);
            boolean sequentialMatch = true;
            int chunkSize = 10;
            int chunksRead = 0;
            while (rawStream.getPos() < rawLength && localOnlyStream.getPos() < localOnlyLength) {
                byte[] rawChunk = new byte[chunkSize];
                byte[] localOnlyChunk = new byte[chunkSize];
                int rawRead = rawStream.read(rawChunk);
                int localOnlyRead = localOnlyStream.read(localOnlyChunk);

                if (rawRead != localOnlyRead) {
                    System.out.println("❌ Read size mismatch at chunk " + chunksRead + ": raw=" + rawRead + " localOnly=" + localOnlyRead);
                    sequentialMatch = false;
                    break;
                }

                if (!Arrays.equals(rawChunk, localOnlyChunk)) {
                    System.out.println("❌ Content mismatch at chunk " + chunksRead + " (byte offset " + (chunksRead * chunkSize) + ")");
                    sequentialMatch = false;
                    break;
                }
                chunksRead++;
            }
            System.out.println("Sequential reads (" + chunksRead + " chunks): " + (sequentialMatch ? "✅" : "❌"));
            assertTrue("Sequential reads should match", sequentialMatch);

        } finally {
            rawStream.close();
            localOnlyStream.close();
        }

        System.out.println("\n=== PHASE 3: Compare Spark Read Operations ===");

        // Test 6: Spark read from RawLocalFS
        System.out.println("\n--- Test 6: Spark Read from RawLocalFS ---");
        try {
            Dataset<Row> rawDf = spark.read().parquet(rawLocalPath);
            long rawCount = rawDf.count();
            System.out.println("✅ RawLocalFS Spark read successful! Row count: " + rawCount);
            assertEquals("Should have 4 employees", 4, rawCount);
        } catch (Exception e) {
            System.err.println("❌ RawLocalFS Spark read FAILED: " + e.getMessage());
            e.printStackTrace();
            fail("RawLocalFS Spark read should not fail!");
        }

        // Test 7: Spark read from LOCAL_ONLY
        System.out.println("\n--- Test 7: Spark Read from LOCAL_ONLY ---");
        try {
            Dataset<Row> localOnlyDf = spark.read().parquet(localOnlyPath);
            long localOnlyCount = localOnlyDf.count();
            System.out.println("✅ LOCAL_ONLY Spark read successful! Row count: " + localOnlyCount);
            assertEquals("Should have 4 employees", 4, localOnlyCount);
        } catch (Exception e) {
            System.err.println("❌ LOCAL_ONLY Spark read FAILED: " + e.getMessage());
            if (e.getMessage() != null && e.getMessage().contains("78 bytes")) {
                System.err.println("🎯 FOUND IT! 78-byte error occurs during Spark read!");
                System.err.println("But low-level reads worked, so the issue is in Spark's Parquet reader!");
            }
            e.printStackTrace();
            // Don't fail - we want to see the full output
        }

        // Test 8: SQL query on RawLocalFS
        System.out.println("\n--- Test 8: SQL Query on RawLocalFS ---");
        try {
            Dataset<Row> rawDf = spark.read().parquet(rawLocalPath);
            rawDf.createOrReplaceTempView("employees_raw");
            Dataset<Row> rawResult = spark.sql("SELECT name, salary FROM employees_raw WHERE department = 'Engineering'");
            long rawResultCount = rawResult.count();
            System.out.println("✅ RawLocalFS SQL query successful! Row count: " + rawResultCount);
            assertEquals("Should have 2 engineering employees", 2, rawResultCount);
        } catch (Exception e) {
            System.err.println("❌ RawLocalFS SQL query FAILED: " + e.getMessage());
            e.printStackTrace();
            fail("RawLocalFS SQL query should not fail!");
        }

        // Test 9: SQL query on LOCAL_ONLY
        System.out.println("\n--- Test 9: SQL Query on LOCAL_ONLY ---");
        try {
            Dataset<Row> localOnlyDf = spark.read().parquet(localOnlyPath);
            localOnlyDf.createOrReplaceTempView("employees_localonly");
            Dataset<Row> localOnlyResult = spark.sql("SELECT name, salary FROM employees_localonly WHERE department = 'Engineering'");
            long localOnlyResultCount = localOnlyResult.count();
            System.out.println("✅ LOCAL_ONLY SQL query successful! Row count: " + localOnlyResultCount);
            assertEquals("Should have 2 engineering employees", 2, localOnlyResultCount);
        } catch (Exception e) {
            System.err.println("❌ LOCAL_ONLY SQL query FAILED: " + e.getMessage());
            if (e.getMessage() != null && e.getMessage().contains("78 bytes")) {
                System.err.println("🎯 78-byte error in SQL query!");
            }
            e.printStackTrace();
            // Don't fail - we want to see the full output
        }

        System.out.println("\n════════════════════════════════════════════════════════════════");
        System.out.println("  SHADOW READ COMPARISON COMPLETE");
        System.out.println("════════════════════════════════════════════════════════════════");
    }

    private int findFirstDifference(byte[] a, byte[] b) {
        int minLen = Math.min(a.length, b.length);
        for (int i = 0; i < minLen; i++) {
            if (a[i] != b[i]) {
                return i;
            }
        }
        return minLen;
    }

    private void printHex(byte[] data, int start, int end) {
        System.out.print("  ");
        for (int i = start; i < end && i < data.length; i++) {
            System.out.printf("%02X ", data[i]);
        }
        System.out.println();
    }

    // Employee class for the Spark DataFrame
    public static class Employee implements java.io.Serializable {
        private int id;
        private String name;
        private String department;
        private int salary;

        public Employee() {} // Required for Spark

        public Employee(int id, String name, String department, int salary) {
            this.id = id;
            this.name = name;
            this.department = department;
            this.salary = salary;
        }

        // Getters and setters (required for Spark)
        public int getId() { return id; }
        public void setId(int id) { this.id = id; }
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getDepartment() { return department; }
        public void setDepartment(String department) { this.department = department; }
        public int getSalary() { return salary; }
        public void setSalary(int salary) { this.salary = salary; }
    }
}

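One detail worth keeping in mind when reading Tests 2-5 above: Hadoop's FSDataInputStream has two read styles with different semantics. readFully(position, buffer) is a positioned read and does not advance the stream position, while read(buffer) is sequential and does; a wrapper stream that conflates the two can report EOF early even though the file on disk is complete. A minimal standalone sketch against the local filesystem (placeholder path, assumed to be at least 116 bytes long):

// Hypothetical demo, not part of the deleted suite: positioned vs sequential reads.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PositionedReadDemo {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path p = new Path(args[0]);
        byte[] buf = new byte[16];
        try (FSDataInputStream in = fs.open(p)) {
            in.readFully(100, buf);   // positioned read at offset 100
            System.out.println("pos after positioned read: " + in.getPos());  // still 0
            in.read(buf);             // sequential read from the current position
            System.out.println("pos after sequential read: " + in.getPos());  // advanced by 16
        }
    }
}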