package logstore

import (
	"context"
	"encoding/binary"
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/util"
	util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
	"google.golang.org/protobuf/proto"
)

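// GenLogOnDiskReadFunc returns a log_buffer.LogReadFromDiskFuncType that replays
// messages previously flushed to disk under the partition directory of topic t,
// partition p. It supports both timestamp-based and offset-based start positions,
// and keeps a small cache of recently fetched file chunks to reduce load on the
// Filer and volume servers.
//
// Minimal usage sketch (illustrative only; it assumes log_buffer.EachLogEntryFuncType
// has the shape func(*filer_pb.LogEntry) (isDone bool, err error), which matches how
// the callback is invoked below):
//
//	readFn := GenLogOnDiskReadFunc(filerClient, t, p)
//	lastPos, isDone, err := readFn(startPosition, 0, func(logEntry *filer_pb.LogEntry) (bool, error) {
//		// consume logEntry here
//		return false, nil
//	})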
func GenLogOnDiskReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic.Partition) log_buffer.LogReadFromDiskFuncType {
	partitionDir := topic.PartitionDir(t, p)

	// Create a small cache for recently-read file chunks (3 files, 60s TTL)
	// This significantly reduces Filer load when multiple consumers are catching up
	fileCache := log_buffer.NewDiskBufferCache(3, 60*time.Second)

	lookupFileIdFn := filer.LookupFn(filerClient)

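	// eachChunkFn scans the bytes of one flushed log file (or a cached copy of it).
	// Each record in the buffer is framed as a 4-byte size prefix (decoded with
	// util.BytesToUint32) followed by that many bytes of a marshaled filer_pb.LogEntry.
	// Entries are filtered by offset (offset-based subscriptions) or by timestamp,
	// then handed to eachLogEntryFn; processedTsNs is the timestamp of the last
	// delivered entry.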
	eachChunkFn := func(buf []byte, eachLogEntryFn log_buffer.EachLogEntryFuncType, starTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		entriesSkipped := 0
		entriesProcessed := 0
		for pos := 0; pos+4 < len(buf); {

			size := util.BytesToUint32(buf[pos : pos+4])
			if pos+4+int(size) > len(buf) {
				err = fmt.Errorf("GenLogOnDiskReadFunc: read [%d,%d) from [0,%d)", pos, pos+int(size)+4, len(buf))
				return
			}
			entryData := buf[pos+4 : pos+4+int(size)]

			logEntry := &filer_pb.LogEntry{}
			if err = proto.Unmarshal(entryData, logEntry); err != nil {
				pos += 4 + int(size)
				err = fmt.Errorf("unexpected unmarshal filer_pb.LogEntry: %w", err)
				return
			}

			// Filter by offset if this is an offset-based subscription
			if isOffsetBased {
				if logEntry.Offset < startOffset {
					entriesSkipped++
					pos += 4 + int(size)
					continue
				}
			} else {
				// Filter by timestamp for timestamp-based subscriptions
				if logEntry.TsNs <= starTsNs {
					pos += 4 + int(size)
					continue
				}
				if stopTsNs != 0 && logEntry.TsNs > stopTsNs {
					glog.V(2).Infof("reached stopTsNs %d at logEntry.TsNs %d", stopTsNs, logEntry.TsNs)
					return
				}
			}

			// fmt.Printf(" read logEntry: %v, ts %v\n", string(logEntry.Key), time.Unix(0, logEntry.TsNs).UTC())
			if _, err = eachLogEntryFn(logEntry); err != nil {
				err = fmt.Errorf("process log entry %v: %w", logEntry, err)
				return
			}

			processedTsNs = logEntry.TsNs
			entriesProcessed++

			pos += 4 + int(size)

		}

		return
	}

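	// eachFileFn replays one flushed log file. It resolves each chunk's volume server
	// URLs via the filer lookup, serves repeat reads from fileCache (including negative
	// entries for chunks whose URLs all failed), and otherwise fetches the chunk over
	// HTTP before handing the bytes to eachChunkFn.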
	eachFileFn := func(entry *filer_pb.Entry, eachLogEntryFn log_buffer.EachLogEntryFuncType, starTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		if len(entry.Content) > 0 {
			// skip .offset files
			return
		}
		var urlStrings []string
		for _, chunk := range entry.Chunks {
			if chunk.Size == 0 {
				continue
			}
			if chunk.IsChunkManifest {
				glog.Warningf("this should not happen. unexpected chunk manifest in %s/%s", partitionDir, entry.Name)
				return
			}
			urlStrings, err = lookupFileIdFn(context.Background(), chunk.FileId)
			if err != nil {
				glog.V(1).Infof("lookup %s failed: %v", chunk.FileId, err)
				err = fmt.Errorf("lookup %s: %v", chunk.FileId, err)
				return
			}
			if len(urlStrings) == 0 {
				glog.V(1).Infof("no url found for %s", chunk.FileId)
				err = fmt.Errorf("no url found for %s", chunk.FileId)
				return
			}
			glog.V(2).Infof("lookup %s returned %d URLs", chunk.FileId, len(urlStrings))

			// Try to get data from cache first
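			// The cache key combines topic name, partition string, partition range start,
			// and chunk file id, so cached chunks from different partitions or files
			// never collide.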
			cacheKey := fmt.Sprintf("%s/%s/%d/%s", t.Name, p.String(), p.RangeStart, chunk.FileId)
			if cachedData, _, found := fileCache.Get(cacheKey); found {
				if cachedData == nil {
					// Negative cache hit - data doesn't exist
					continue
				}
				// Positive cache hit - data exists
				if processedTsNs, err = eachChunkFn(cachedData, eachLogEntryFn, starTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
					glog.V(1).Infof("eachChunkFn failed on cached data: %v", err)
					return
				}
				continue
			}

			// Cache miss - try each urlString until util_http.Get(urlString) succeeds
			var processed bool
			for _, urlString := range urlStrings {
				// TODO optimization opportunity: reuse the buffer
				var data []byte
				glog.V(2).Infof("trying to fetch data from %s", urlString)
				if data, _, err = util_http.Get(urlString); err == nil {
					glog.V(2).Infof("successfully fetched %d bytes from %s", len(data), urlString)
					processed = true

					// Store in cache for future reads
					fileCache.Put(cacheKey, data, startOffset)

					if processedTsNs, err = eachChunkFn(data, eachLogEntryFn, starTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
						glog.V(1).Infof("eachChunkFn failed: %v", err)
						return
					}
					break
				} else {
					glog.V(2).Infof("failed to fetch from %s: %v", urlString, err)
				}
			}
			if !processed {
				// Store negative cache entry - data doesn't exist or all URLs failed
				fileCache.Put(cacheKey, nil, startOffset)
				glog.V(1).Infof("no data processed for %s %s - all URLs failed", entry.Name, chunk.FileId)
				err = fmt.Errorf("no data processed for %s %s", entry.Name, chunk.FileId)
				return
			}

		}
		return
	}

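	// The returned function is the actual disk-read implementation: it lists the flushed
	// log files under partitionDir, skips .parquet and .offset entries, narrows the
	// candidate set by offset metadata (offset-based reads) or by file-name timestamp
	// (timestamp-based reads), and then feeds every remaining file to eachFileFn.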
	return func(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastReadPosition log_buffer.MessagePosition, isDone bool, err error) {
		startFileName := startPosition.Time.UTC().Format(topic.TIME_FORMAT)
		startTsNs := startPosition.Time.UnixNano()
		stopTime := time.Unix(0, stopTsNs)
		var processedTsNs int64

		// Check if this is an offset-based subscription
		isOffsetBased := startPosition.IsOffsetBased
		var startOffset int64
		if isOffsetBased {
			startOffset = startPosition.Offset
			// CRITICAL FIX: For offset-based reads, ignore startFileName (which is based on Time)
			// and list all files from the beginning to find the right offset
			startFileName = ""
			glog.V(1).Infof("disk read start: topic=%s partition=%s startOffset=%d",
				t.Name, p, startOffset)
		}

		// OPTIMIZATION: For offset-based reads, collect all files with their offset ranges first
		// Then use binary search to find the right file, and skip files that don't contain the offset
		var candidateFiles []*filer_pb.Entry
		var foundStartFile bool

		err = filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
			// First pass: collect all relevant files with their metadata
			glog.V(2).Infof("listing directory %s for offset %d startFileName=%q", partitionDir, startOffset, startFileName)
			return filer_pb.SeaweedList(context.Background(), client, partitionDir, "", func(entry *filer_pb.Entry, isLast bool) error {

				if entry.IsDirectory {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".parquet") {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".offset") {
					return nil
				}
				if stopTsNs != 0 && entry.Name > stopTime.UTC().Format(topic.TIME_FORMAT) {
					return nil
				}

				// OPTIMIZATION: For offset-based reads, check if this file contains the requested offset
				if isOffsetBased {
					glog.V(3).Infof("found file %s", entry.Name)
					// Check if file has offset range metadata
					if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
						if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
							fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
							fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))

							// Skip files that don't contain our offset range
							if startOffset > fileMaxOffset {
								return nil
							}

							// If we haven't found the start file yet, check if this file contains it
							if !foundStartFile && startOffset >= fileMinOffset && startOffset <= fileMaxOffset {
								foundStartFile = true
							}
						}
					}
					// If file doesn't have offset metadata, include it (might be old format)
				} else {
					// Timestamp-based filtering
					topicName := t.Name
					if dotIndex := strings.LastIndex(topicName, "."); dotIndex != -1 {
						topicName = topicName[dotIndex+1:]
					}
					isSystemTopic := strings.HasPrefix(topicName, "_")
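					// Skip files whose names sort before the requested start time, but only
					// for non-system topics and only when the start time is a real wall-clock
					// time (more than 86400s past the Unix epoch); this appears to guard
					// against sentinel start positions with near-zero timestamps.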
					if !isSystemTopic && startPosition.Time.Unix() > 86400 && entry.Name < startPosition.Time.UTC().Format(topic.TIME_FORMAT) {
						return nil
					}
				}

				// Add file to candidates for processing
				candidateFiles = append(candidateFiles, entry)
				glog.V(3).Infof("added candidate file %s (total=%d)", entry.Name, len(candidateFiles))
				return nil

			}, startFileName, true, math.MaxInt32)
		})

		if err != nil {
			glog.Errorf("failed to list directory %s: %v", partitionDir, err)
			return
		}

		glog.V(2).Infof("found %d candidate files for topic=%s partition=%s offset=%d",
			len(candidateFiles), t.Name, p, startOffset)

		if len(candidateFiles) == 0 {
			glog.V(2).Infof("no files found in %s", partitionDir)
			return startPosition, isDone, nil
		}

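		// Each flushed file may carry "offset_min" and "offset_max" extended attributes,
		// each an 8-byte big-endian uint64 recording the first and last message offsets
		// stored in that file; they drive both the listing filter above and the binary
		// search below.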
		// OPTIMIZATION: For offset-based reads with many files, use binary search to find start file
		if isOffsetBased && len(candidateFiles) > 10 {
			// Binary search to find the first file that might contain our offset
			left, right := 0, len(candidateFiles)-1
			startIdx := 0

			for left <= right {
				mid := (left + right) / 2
				entry := candidateFiles[mid]

				if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
					if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
						fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
						fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))

						if startOffset < fileMinOffset {
							// Our offset is before this file, search left
							right = mid - 1
						} else if startOffset > fileMaxOffset {
							// Our offset is after this file, search right
							left = mid + 1
							startIdx = left
						} else {
							// Found the file containing our offset
							startIdx = mid
							break
						}
					} else {
						break
					}
				} else {
					break
				}
			}

			// Process files starting from the found index
			candidateFiles = candidateFiles[startIdx:]
		}

		// Second pass: process the filtered files
		// CRITICAL: For offset-based reads, process ALL candidate files in one call
		// This prevents multiple ReadFromDiskFn calls with 1.127s overhead each
		var filesProcessed int
		var lastProcessedOffset int64
		for _, entry := range candidateFiles {
			var fileTsNs int64
			if fileTsNs, err = eachFileFn(entry, eachLogEntryFn, startTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
				return lastReadPosition, isDone, err
			}
			if fileTsNs > 0 {
				processedTsNs = fileTsNs
				filesProcessed++
			}

			// For offset-based reads, track the last processed offset
			// We need to continue reading ALL files to avoid multiple disk read calls
			if isOffsetBased {
				// Extract the last offset from the file's extended attributes
				if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
					fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))
					if fileMaxOffset > lastProcessedOffset {
						lastProcessedOffset = fileMaxOffset
					}
				}
			}
		}

		if isOffsetBased && filesProcessed > 0 {
			// Return a position that indicates we've read all disk data up to lastProcessedOffset
			// This prevents the subscription from calling ReadFromDiskFn again for these offsets
			lastReadPosition = log_buffer.NewMessagePositionFromOffset(lastProcessedOffset + 1)
		} else if isOffsetBased {
			// CRITICAL FIX: If no files were processed (e.g., all data already consumed),
			// return the requested offset to prevent a busy loop. This signals
			// "I've checked, there's no data at this offset, move forward"
			lastReadPosition = log_buffer.NewMessagePositionFromOffset(startOffset)
		} else {
			// For timestamp-based reads, return error (-2)
			lastReadPosition = log_buffer.NewMessagePosition(processedTsNs, -2)
		}
		return
	}
}