@@ -2,6 +2,7 @@ package engine
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"math"
 	"regexp"
@@ -1399,8 +1400,11 @@ func (e *SQLEngine) tryFastParquetAggregation(ctx context.Context, hybridScanner
 			}
 		}
 
-		// Check if there are live log files and count their rows
-		liveLogRowCount, err := e.countLiveLogRows(partitionPath)
+		// Get parquet source files for deduplication
+		parquetSourceFiles := e.extractParquetSourceFiles(fileStats)
+
+		// Check if there are live log files and count their rows (excluding parquet-converted files)
+		liveLogRowCount, err := e.countLiveLogRowsExcludingParquetSources(partitionPath, parquetSourceFiles)
 		if err != nil {
 			// If we can't count live logs, fall back to full scan
 			return nil, false
@@ -1415,7 +1419,7 @@ func (e *SQLEngine) tryFastParquetAggregation(ctx context.Context, hybridScanner
 
 	// Debug: Show the hybrid optimization results
 	if totalParquetRowCount > 0 || totalLiveLogRowCount > 0 {
-		fmt.Printf("Hybrid fast aggregation: %d parquet rows + %d live log rows from %d partitions with live logs\n",
+		fmt.Printf("Hybrid fast aggregation with deduplication: %d parquet rows + %d deduplicated live log rows from %d partitions\n",
 			totalParquetRowCount, totalLiveLogRowCount, partitionsWithLiveLogs)
 	}
 
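The helpers added in the next hunk read a "sources" list from each parquet file's Extended metadata; the writer side (the log-to-parquet compaction that records those names) is not part of this diff. A minimal sketch of what that step is assumed to store, with a hypothetical function name:

// Hypothetical writer-side sketch, not part of this change: when live log files
// are compacted into a parquet file, record their names on the new filer entry
// so readers can exclude them from live-log row counts.
func recordParquetSources(parquetEntry *filer_pb.Entry, consumedLogFiles []string) error {
	data, err := json.Marshal(consumedLogFiles)
	if err != nil {
		return err
	}
	if parquetEntry.Extended == nil {
		parquetEntry.Extended = make(map[string][]byte)
	}
	parquetEntry.Extended["sources"] = data
	return nil
}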
@@ -1676,6 +1680,101 @@ func (e *SQLEngine) countLiveLogRows(partitionPath string) (int64, error) {
 	return totalRows, err
 }
 
+// extractParquetSourceFiles extracts source log file names from parquet file metadata for deduplication
+func (e *SQLEngine) extractParquetSourceFiles(fileStats []*ParquetFileStats) map[string]bool {
+	sourceFiles := make(map[string]bool)
+
+	for _, fileStat := range fileStats {
+		// Each ParquetFileStats should have a reference to the original file entry,
+		// but we need to get it through the hybrid scanner to access Extended metadata.
+		// This is a simplified approach - in practice we'd need to access the filer entry.
+
+		// For now, we'll use filename-based deduplication as a fallback:
+		// extract the timestamp from the parquet filename (YYYY-MM-DD-HH-MM-SS.parquet)
+		if strings.HasSuffix(fileStat.FileName, ".parquet") {
+			timeStr := strings.TrimSuffix(fileStat.FileName, ".parquet")
+			// Mark this timestamp range as covered by parquet
+			sourceFiles[timeStr] = true
+		}
+	}
+
+	return sourceFiles
+}
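Note that extractParquetSourceFiles only records the parquet base name, so the filename fallback skips a live log file only when its name matches that base name exactly. A rough illustration (the wrapper function is hypothetical; ParquetFileStats fields other than FileName are left at their zero values):

// Illustration only: coverage provided by the filename-based fallback.
func illustrateFilenameFallback(e *SQLEngine) {
	stats := []*ParquetFileStats{{FileName: "2025-01-06-10-30-00.parquet"}}
	covered := e.extractParquetSourceFiles(stats)
	fmt.Println(covered["2025-01-06-10-30-00"]) // true: a live log with the same base name is skipped
	fmt.Println(covered["2025-01-06-10-31-00"]) // false: any other live log file is still counted
}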
+
+// countLiveLogRowsExcludingParquetSources counts live log rows but excludes files that were converted to parquet
+func (e *SQLEngine) countLiveLogRowsExcludingParquetSources(partitionPath string, parquetSourceFiles map[string]bool) (int64, error) {
+	filerClient, err := e.catalog.brokerClient.GetFilerClient()
+	if err != nil {
+		return 0, err
+	}
+
+	// First, get the actual source files from parquet metadata
+	actualSourceFiles, err := e.getParquetSourceFilesFromMetadata(partitionPath)
+	if err != nil {
+		// If we can't read parquet metadata, use filename-based fallback
+		fmt.Printf("Warning: failed to read parquet metadata, using filename-based deduplication: %v\n", err)
+		actualSourceFiles = parquetSourceFiles
+	}
+
+	// Debug: Show deduplication status
+	if len(actualSourceFiles) > 0 {
+		fmt.Printf("Excluding %d converted log files from %s\n", len(actualSourceFiles), partitionPath)
+	}
+
+	totalRows := int64(0)
+	err = filer_pb.ReadDirAllEntries(context.Background(), filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
+		if entry.IsDirectory || strings.HasSuffix(entry.Name, ".parquet") {
+			return nil // Skip directories and parquet files
+		}
+
+		// Skip files that have been converted to parquet
+		if actualSourceFiles[entry.Name] {
+			fmt.Printf("Skipping %s (already converted to parquet)\n", entry.Name)
+			return nil
+		}
+
+		// Count rows in live log file
+		rowCount, err := e.countRowsInLogFile(filerClient, partitionPath, entry)
+		if err != nil {
+			fmt.Printf("Warning: failed to count rows in %s/%s: %v\n", partitionPath, entry.Name, err)
+			return nil // Continue with other files
+		}
+		totalRows += rowCount
+		return nil
+	})
+	return totalRows, err
+}
+
+// getParquetSourceFilesFromMetadata reads parquet file metadata to get actual source log files
+func (e *SQLEngine) getParquetSourceFilesFromMetadata(partitionPath string) (map[string]bool, error) {
+	filerClient, err := e.catalog.brokerClient.GetFilerClient()
+	if err != nil {
+		return nil, err
+	}
+
+	sourceFiles := make(map[string]bool)
+
+	err = filer_pb.ReadDirAllEntries(context.Background(), filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
+		if entry.IsDirectory || !strings.HasSuffix(entry.Name, ".parquet") {
+			return nil
+		}
+
+		// Read source files from Extended metadata
+		if entry.Extended != nil && entry.Extended["sources"] != nil {
+			var sources []string
+			if err := json.Unmarshal(entry.Extended["sources"], &sources); err == nil {
+				for _, source := range sources {
+					sourceFiles[source] = true
+				}
+			}
+		}
+
+		return nil
+	})
+
+	return sourceFiles, err
+}
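The lookup above expects each parquet entry to carry a JSON-encoded array of log file names under the Extended key "sources". A minimal, illustrative example of an entry it would pick up (names are placeholders):

// Illustration only: a parquet entry in the shape getParquetSourceFilesFromMetadata expects.
parquetEntry := &filer_pb.Entry{
	Name: "2025-01-06-10-31-00.parquet",
	Extended: map[string][]byte{
		"sources": []byte(`["2025-01-06-10-30-00","2025-01-06-10-31-00"]`),
	},
}
// Walking a partition that contains this entry yields
// map[string]bool{"2025-01-06-10-30-00": true, "2025-01-06-10-31-00": true},
// and those two live log files are then skipped when counting rows.
_ = parquetEntry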
+
+// countRowsInLogFile counts rows in a single log file using SeaweedFS patterns
+func (e *SQLEngine) countRowsInLogFile(filerClient filer_pb.FilerClient, partitionPath string, entry *filer_pb.Entry) (int64, error) {
+	lookupFileIdFn := filer.LookupFn(filerClient)