@@ -287,13 +287,21 @@ func writeLogFilesToParquet(filerClient filer_pb.FilerClient, partitionDir strin
 	// write to parquet file to partitionDir
 	parquetFileName := fmt.Sprintf("%s.parquet", time.Unix(0, startTsNs).UTC().Format("2006-01-02-15-04-05"))
 
-	// Collect source log file names for deduplication metadata
+	// Collect source log file names and buffer_start metadata for deduplication
 	var sourceLogFiles []string
+	var earliestBufferStart int64
 	for _, logFile := range logFileGroups {
 		sourceLogFiles = append(sourceLogFiles, logFile.Name)
+
+		// Extract buffer_start from log file metadata
+		if bufferStart := getBufferStartFromLogFile(logFile); bufferStart > 0 {
+			if earliestBufferStart == 0 || bufferStart < earliestBufferStart {
+				earliestBufferStart = bufferStart
+			}
+		}
 	}
 
-	if err := saveParquetFileToPartitionDir(filerClient, tempFile, partitionDir, parquetFileName, preference, startTsNs, stopTsNs, sourceLogFiles); err != nil {
+	if err := saveParquetFileToPartitionDir(filerClient, tempFile, partitionDir, parquetFileName, preference, startTsNs, stopTsNs, sourceLogFiles, earliestBufferStart); err != nil {
 		return fmt.Errorf("save parquet file %s: %v", parquetFileName, err)
 	}
 
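A note on the selection rule above: zero doubles as the sentinel for "no buffer_start metadata", so a log file for which getBufferStartFromLogFile returns 0 can never become the minimum. A minimal standalone sketch of that rule, with illustrative values and the standard testing package assumed (not part of this change):

// Illustrative test: the earliest-wins rule keeps the smallest
// positive buffer_start and skips files that report 0 (no metadata).
func TestEarliestBufferStartSelection(t *testing.T) {
	starts := []int64{0, 42, 7, 0, 19} // 0 = file carried no buffer_start
	var earliest int64
	for _, s := range starts {
		if s > 0 && (earliest == 0 || s < earliest) {
			earliest = s
		}
	}
	if earliest != 7 {
		t.Fatalf("expected earliest buffer_start 7, got %d", earliest)
	}
}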
@@ -301,7 +309,7 @@ func writeLogFilesToParquet(filerClient filer_pb.FilerClient, partitionDir strin
 }
 
-func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile *os.File, partitionDir, parquetFileName string, preference *operation.StoragePreference, startTsNs, stopTsNs int64, sourceLogFiles []string) error {
+func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile *os.File, partitionDir, parquetFileName string, preference *operation.StoragePreference, startTsNs, stopTsNs int64, sourceLogFiles []string, earliestBufferStart int64) error {
 	uploader, err := operation.NewUploader()
 	if err != nil {
 		return fmt.Errorf("new uploader: %w", err)
 	}
@@ -340,6 +348,13 @@ func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile
 		entry.Extended["sources"] = sourceLogFilesJson
 	}
 
+	// Store earliest buffer_start for precise broker deduplication
+	if earliestBufferStart > 0 {
+		bufferStartBytes := make([]byte, 8)
+		binary.BigEndian.PutUint64(bufferStartBytes, uint64(earliestBufferStart))
+		entry.Extended["buffer_start"] = bufferStartBytes
+	}
+
 	for i := int64(0); i < chunkCount; i++ {
 		fileId, uploadResult, err, _ := uploader.UploadWithRetry(
 			filerClient,
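One note on the encoding above: the parquet entry stores buffer_start as a fixed 8-byte big-endian value, unlike the JSON used on the source log files (see the helper in the last hunk). A reader, for example the broker-side deduplication the comment refers to, would reverse it along these lines. This is a minimal sketch, not part of the change: readParquetBufferStart is a hypothetical name, and the same encoding/binary and filer_pb imports as this file are assumed.

// readParquetBufferStart reverses the encoding written above: it reads
// the 8 big-endian bytes stored under "buffer_start" and returns 0
// when the attribute is missing or malformed.
func readParquetBufferStart(entry *filer_pb.Entry) int64 {
	if entry.Extended == nil {
		return 0
	}
	b, ok := entry.Extended["buffer_start"]
	if !ok || len(b) != 8 {
		return 0
	}
	return int64(binary.BigEndian.Uint64(b))
}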
@@ -472,3 +487,24 @@ func eachChunk(buf []byte, eachLogEntryFn log_buffer.EachLogEntryFuncType) (proc
 
 	return
 }
+
+// getBufferStartFromLogFile extracts the buffer_start index from log file extended metadata
+func getBufferStartFromLogFile(logFile *filer_pb.Entry) int64 {
+	if logFile.Extended == nil {
+		return 0
+	}
+
+	// Parse buffer_start format (same as used in query engine)
+	if startJson, exists := logFile.Extended["buffer_start"]; exists {
+		// LogBufferStart struct (JSON format)
+		type LogBufferStart struct {
+			StartIndex int64 `json:"start_index"`
+		}
+		var bufferStart LogBufferStart
+		if err := json.Unmarshal(startJson, &bufferStart); err == nil {
+			return bufferStart.StartIndex
+		}
+	}
+
+	return 0
+}
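For the JSON side, a quick usage sketch of the new helper. The entry literal is fabricated for illustration, but the {"start_index": N} payload matches the LogBufferStart struct above (fmt import assumed):

entry := &filer_pb.Entry{
	Extended: map[string][]byte{
		"buffer_start": []byte(`{"start_index": 1042}`),
	},
}
fmt.Println(getBufferStartFromLogFile(entry)) // prints 1042

Note the two encodings in play after this change: source log files carry buffer_start as JSON, while the merged parquet file stores it as 8 big-endian bytes, so a reader must pick the decoder that matches the entry type.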