You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							174 lines
						
					
					
						
							4.4 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							174 lines
						
					
					
						
							4.4 KiB
						
					
					
				| package schema | |
| 
 | |
| import ( | |
| 	"fmt" | |
| 	"io" | |
| 	"os" | |
| 	"testing" | |
| 
 | |
| 	"github.com/parquet-go/parquet-go" | |
| 	"github.com/parquet-go/parquet-go/compress/zstd" | |
| 	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" | |
| ) | |
| 
 | |
| func TestWriteReadParquet(t *testing.T) { | |
| 	// create a schema_pb.RecordType | |
| 	recordType := RecordTypeBegin(). | |
| 		WithField("ID", TypeInt64). | |
| 		WithField("CreatedAt", TypeInt64). | |
| 		WithRecordField("Person", | |
| 			RecordTypeBegin(). | |
| 				WithField("zName", TypeString). | |
| 				WithField("emails", ListOf(TypeString)). | |
| 				RecordTypeEnd()). | |
| 		WithField("Company", TypeString). | |
| 		WithRecordField("Address", | |
| 			RecordTypeBegin(). | |
| 				WithField("Street", TypeString). | |
| 				WithField("City", TypeString). | |
| 				RecordTypeEnd()). | |
| 		RecordTypeEnd() | |
| 	fmt.Printf("RecordType: %v\n", recordType) | |
| 
 | |
| 	// create a parquet schema | |
| 	parquetSchema, err := ToParquetSchema("example", recordType) | |
| 	if err != nil { | |
| 		t.Fatalf("ToParquetSchema failed: %v", err) | |
| 	} | |
| 	fmt.Printf("ParquetSchema: %v\n", parquetSchema) | |
| 
 | |
| 	fmt.Printf("Go Type: %+v\n", parquetSchema.GoType()) | |
| 
 | |
| 	filename := "example.parquet" | |
| 
 | |
| 	count := 3 | |
| 
 | |
| 	testWritingParquetFile(t, count, filename, parquetSchema, recordType) | |
| 
 | |
| 	total := testReadingParquetFile(t, filename, parquetSchema, recordType) | |
| 
 | |
| 	if total != count { | |
| 		t.Fatalf("total != 128*1024: %v", total) | |
| 	} | |
| 
 | |
| 	if err = os.Remove(filename); err != nil { | |
| 		t.Fatalf("os.Remove failed: %v", err) | |
| 	} | |
| 
 | |
| } | |
| 
 | |
| func testWritingParquetFile(t *testing.T, count int, filename string, parquetSchema *parquet.Schema, recordType *schema_pb.RecordType) { | |
| 	parquetLevels, err := ToParquetLevels(recordType) | |
| 	if err != nil { | |
| 		t.Fatalf("ToParquetLevels failed: %v", err) | |
| 	} | |
| 
 | |
| 	// create a parquet file | |
| 	file, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0664) | |
| 	if err != nil { | |
| 		t.Fatalf("os.Open failed: %v", err) | |
| 	} | |
| 	defer file.Close() | |
| 	writer := parquet.NewWriter(file, parquetSchema, parquet.Compression(&zstd.Codec{Level: zstd.DefaultLevel})) | |
| 	rowBuilder := parquet.NewRowBuilder(parquetSchema) | |
| 	for i := 0; i < count; i++ { | |
| 		rowBuilder.Reset() | |
| 		// generate random data | |
| 		recordValue := RecordBegin(). | |
| 			SetInt64("ID", 1+int64(i)). | |
| 			SetInt64("CreatedAt", 2+2*int64(i)). | |
| 			SetRecord("Person", | |
| 				RecordBegin(). | |
| 					SetString("zName", fmt.Sprintf("john_%d", i)). | |
| 					SetStringList("emails", | |
| 						fmt.Sprintf("john_%d@a.com", i), | |
| 						fmt.Sprintf("john_%d@b.com", i), | |
| 						fmt.Sprintf("john_%d@c.com", i), | |
| 						fmt.Sprintf("john_%d@d.com", i), | |
| 						fmt.Sprintf("john_%d@e.com", i)). | |
| 					RecordEnd()). | |
| 			SetString("Company", fmt.Sprintf("company_%d", i)). | |
| 			RecordEnd() | |
| 		AddRecordValue(rowBuilder, recordType, parquetLevels, recordValue) | |
| 
 | |
| 		if count < 10 { | |
| 			fmt.Printf("Write RecordValue: %v\n", recordValue) | |
| 		} | |
| 
 | |
| 		row := rowBuilder.Row() | |
| 
 | |
| 		if count < 10 { | |
| 			fmt.Printf("Build Row: %+v\n", row) | |
| 		} | |
| 
 | |
| 		if err != nil { | |
| 			t.Fatalf("rowBuilder.Build failed: %v", err) | |
| 		} | |
| 
 | |
| 		if _, err = writer.WriteRows([]parquet.Row{row}); err != nil { | |
| 			t.Fatalf("writer.Write failed: %v", err) | |
| 		} | |
| 	} | |
| 	if err = writer.Close(); err != nil { | |
| 		t.Fatalf("writer.WriteStop failed: %v", err) | |
| 	} | |
| } | |
| 
 | |
| func testReadingParquetFile(t *testing.T, filename string, parquetSchema *parquet.Schema, recordType *schema_pb.RecordType) (total int) { | |
| 	parquetLevels, err := ToParquetLevels(recordType) | |
| 	if err != nil { | |
| 		t.Fatalf("ToParquetLevels failed: %v", err) | |
| 	} | |
| 
 | |
| 	// read the parquet file | |
| 	file, err := os.Open(filename) | |
| 	if err != nil { | |
| 		t.Fatalf("os.Open failed: %v", err) | |
| 	} | |
| 	defer file.Close() | |
| 
 | |
| 	// Get file info to determine size | |
| 	fileInfo, err := file.Stat() | |
| 	if err != nil { | |
| 		t.Fatalf("file.Stat failed: %v", err) | |
| 	} | |
| 
 | |
| 	// Create a parquet file from the opened file | |
| 	parquetFile, err := parquet.OpenFile(file, fileInfo.Size()) | |
| 	if err != nil { | |
| 		t.Fatalf("parquet.OpenFile failed: %v", err) | |
| 	} | |
| 
 | |
| 	reader := parquet.NewReader(parquetFile) | |
| 	rows := make([]parquet.Row, 128) | |
| 	for { | |
| 		rowCount, err := reader.ReadRows(rows) | |
| 
 | |
| 		// Process the rows first, even if EOF is returned | |
| 		for i := 0; i < rowCount; i++ { | |
| 			row := rows[i] | |
| 			// convert parquet row to schema_pb.RecordValue | |
| 			recordValue, err := ToRecordValue(recordType, parquetLevels, row) | |
| 			if err != nil { | |
| 				t.Fatalf("ToRecordValue failed: %v", err) | |
| 			} | |
| 			if rowCount < 10 { | |
| 				fmt.Printf("Read RecordValue: %v\n", recordValue) | |
| 			} | |
| 		} | |
| 		total += rowCount | |
| 
 | |
| 		// Check for end conditions after processing rows | |
| 		if err != nil { | |
| 			if err == io.EOF { | |
| 				break | |
| 			} | |
| 			t.Fatalf("reader.Read failed: %v", err) | |
| 		} | |
| 		if rowCount == 0 { | |
| 			break | |
| 		} | |
| 	} | |
| 	fmt.Printf("total: %v\n", total) | |
| 	return | |
| }
 |