package logstore

import (
	"context"
	"encoding/binary"
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/util"
	util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
	"google.golang.org/protobuf/proto"
)

// GenLogOnDiskReadFunc builds a log_buffer.LogReadFromDiskFuncType that replays a
// topic partition's log entries from files persisted in the Filer, supporting both
// offset-based and timestamp-based subscriptions.
func GenLogOnDiskReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic.Partition) log_buffer.LogReadFromDiskFuncType {
	partitionDir := topic.PartitionDir(t, p)

	// Create a small cache for recently-read file chunks (3 files, 60s TTL).
	// This significantly reduces Filer load when multiple consumers are catching up.
	fileCache := log_buffer.NewDiskBufferCache(3, 60*time.Second)

	lookupFileIdFn := filer.LookupFn(filerClient)

	eachChunkFn := func(buf []byte, eachLogEntryFn log_buffer.EachLogEntryFuncType, startTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		entriesSkipped := 0
		entriesProcessed := 0
		for pos := 0; pos+4 < len(buf); {
			size := util.BytesToUint32(buf[pos : pos+4])
			if pos+4+int(size) > len(buf) {
				err = fmt.Errorf("GenLogOnDiskReadFunc: read [%d,%d) from [0,%d)", pos, pos+int(size)+4, len(buf))
				return
			}
			entryData := buf[pos+4 : pos+4+int(size)]

			logEntry := &filer_pb.LogEntry{}
			if err = proto.Unmarshal(entryData, logEntry); err != nil {
				pos += 4 + int(size)
				err = fmt.Errorf("unexpected unmarshal mq_pb.Message: %w", err)
				return
			}

			// Filter by offset if this is an offset-based subscription
			if isOffsetBased {
				if logEntry.Offset < startOffset {
					entriesSkipped++
					pos += 4 + int(size)
					continue
				}
			} else {
				// Filter by timestamp for timestamp-based subscriptions
				if logEntry.TsNs <= startTsNs {
					pos += 4 + int(size)
					continue
				}
				if stopTsNs != 0 && logEntry.TsNs > stopTsNs {
					println("stopTsNs", stopTsNs, "logEntry.TsNs", logEntry.TsNs)
					return
				}
			}

			// fmt.Printf("  read logEntry: %v, ts %v\n", string(logEntry.Key), time.Unix(0, logEntry.TsNs).UTC())
			if _, err = eachLogEntryFn(logEntry); err != nil {
				err = fmt.Errorf("process log entry %v: %w", logEntry, err)
				return
			}

			processedTsNs = logEntry.TsNs
			entriesProcessed++

			pos += 4 + int(size)
		}
		return
	}
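	// eachFileFn reads one on-disk log file: it resolves each chunk's volume-server
	// URLs via lookupFileIdFn, consults fileCache first (a cached nil value is a
	// negative entry, meaning the chunk data was previously unavailable), and
	// otherwise fetches the chunk over HTTP, caches it, and feeds it through eachChunkFn.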
	eachFileFn := func(entry *filer_pb.Entry, eachLogEntryFn log_buffer.EachLogEntryFuncType, startTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		if len(entry.Content) > 0 {
			// skip .offset files
			return
		}
		var urlStrings []string
		for _, chunk := range entry.Chunks {
			if chunk.Size == 0 {
				continue
			}
			if chunk.IsChunkManifest {
				glog.Warningf("this should not happen. unexpected chunk manifest in %s/%s", partitionDir, entry.Name)
				return
			}
			urlStrings, err = lookupFileIdFn(context.Background(), chunk.FileId)
			if err != nil {
				glog.V(1).Infof("lookup %s failed: %v", chunk.FileId, err)
				err = fmt.Errorf("lookup %s: %v", chunk.FileId, err)
				return
			}
			if len(urlStrings) == 0 {
				glog.V(1).Infof("no url found for %s", chunk.FileId)
				err = fmt.Errorf("no url found for %s", chunk.FileId)
				return
			}
			glog.V(2).Infof("lookup %s returned %d URLs", chunk.FileId, len(urlStrings))

			// Try to get data from cache first
			cacheKey := fmt.Sprintf("%s/%s/%d/%s", t.Name, p.String(), p.RangeStart, chunk.FileId)
			if cachedData, _, found := fileCache.Get(cacheKey); found {
				if cachedData == nil {
					// Negative cache hit - data doesn't exist
					continue
				}
				// Positive cache hit - data exists
				if processedTsNs, err = eachChunkFn(cachedData, eachLogEntryFn, startTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
					glog.V(1).Infof("eachChunkFn failed on cached data: %v", err)
					return
				}
				continue
			}

			// Cache miss - try each urlString until util_http.Get succeeds
			var processed bool
			for _, urlString := range urlStrings {
				// TODO optimization opportunity: reuse the buffer
				var data []byte
				glog.V(2).Infof("trying to fetch data from %s", urlString)
				if data, _, err = util_http.Get(urlString); err == nil {
					glog.V(2).Infof("successfully fetched %d bytes from %s", len(data), urlString)
					processed = true
					// Store in cache for future reads
					fileCache.Put(cacheKey, data, startOffset)
					if processedTsNs, err = eachChunkFn(data, eachLogEntryFn, startTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
						glog.V(1).Infof("eachChunkFn failed: %v", err)
						return
					}
					break
				} else {
					glog.V(2).Infof("failed to fetch from %s: %v", urlString, err)
				}
			}
			if !processed {
				// Store negative cache entry - data doesn't exist or all URLs failed
				fileCache.Put(cacheKey, nil, startOffset)
				glog.V(1).Infof("no data processed for %s %s - all URLs failed", entry.Name, chunk.FileId)
				err = fmt.Errorf("no data processed for %s %s", entry.Name, chunk.FileId)
				return
			}
		}
		return
	}
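	// The returned function lists candidate log files under partitionDir, optionally
	// narrows them by offset range (binary search over the offset_min/offset_max
	// extended attributes), feeds each file through eachFileFn, and reports the next
	// position the subscriber should resume from.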
	return func(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastReadPosition log_buffer.MessagePosition, isDone bool, err error) {
		startFileName := startPosition.Time.UTC().Format(topic.TIME_FORMAT)
		startTsNs := startPosition.Time.UnixNano()
		stopTime := time.Unix(0, stopTsNs)
		var processedTsNs int64

		// Check if this is an offset-based subscription
		isOffsetBased := startPosition.IsOffsetBased
		var startOffset int64
		if isOffsetBased {
			startOffset = startPosition.Offset
			// CRITICAL FIX: For offset-based reads, ignore startFileName (which is based on Time)
			// and list all files from the beginning to find the right offset
			startFileName = ""
			glog.V(1).Infof("disk read start: topic=%s partition=%s startOffset=%d", t.Name, p, startOffset)
		}

		// OPTIMIZATION: For offset-based reads, collect all files with their offset ranges first.
		// Then use binary search to find the right file, and skip files that don't contain the offset.
		var candidateFiles []*filer_pb.Entry
		var foundStartFile bool

		err = filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
			// First pass: collect all relevant files with their metadata
			glog.V(2).Infof("listing directory %s for offset %d startFileName=%q", partitionDir, startOffset, startFileName)
			return filer_pb.SeaweedList(context.Background(), client, partitionDir, "", func(entry *filer_pb.Entry, isLast bool) error {
				if entry.IsDirectory {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".parquet") {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".offset") {
					return nil
				}
				if stopTsNs != 0 && entry.Name > stopTime.UTC().Format(topic.TIME_FORMAT) {
					return nil
				}

				// OPTIMIZATION: For offset-based reads, check if this file contains the requested offset
				if isOffsetBased {
					glog.V(3).Infof("found file %s", entry.Name)
					// Check if file has offset range metadata
					if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
						if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
							fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
							fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))

							// Skip files that don't contain our offset range
							if startOffset > fileMaxOffset {
								return nil
							}

							// If we haven't found the start file yet, check if this file contains it
							if !foundStartFile && startOffset >= fileMinOffset && startOffset <= fileMaxOffset {
								foundStartFile = true
							}
						}
					}
					// If file doesn't have offset metadata, include it (might be old format)
				} else {
					// Timestamp-based filtering
					topicName := t.Name
					if dotIndex := strings.LastIndex(topicName, "."); dotIndex != -1 {
						topicName = topicName[dotIndex+1:]
					}
					isSystemTopic := strings.HasPrefix(topicName, "_")
					if !isSystemTopic && startPosition.Time.Unix() > 86400 && entry.Name < startPosition.Time.UTC().Format(topic.TIME_FORMAT) {
						return nil
					}
				}

				// Add file to candidates for processing
				candidateFiles = append(candidateFiles, entry)
				glog.V(3).Infof("added candidate file %s (total=%d)", entry.Name, len(candidateFiles))
				return nil
			}, startFileName, true, math.MaxInt32)
		})
		if err != nil {
			glog.Errorf("failed to list directory %s: %v", partitionDir, err)
			return
		}

		glog.V(2).Infof("found %d candidate files for topic=%s partition=%s offset=%d", len(candidateFiles), t.Name, p, startOffset)

		if len(candidateFiles) == 0 {
			glog.V(2).Infof("no files found in %s", partitionDir)
			return startPosition, isDone, nil
		}

		// OPTIMIZATION: For offset-based reads with many files, use binary search to find start file
		if isOffsetBased && len(candidateFiles) > 10 {
			// Binary search to find the first file that might contain our offset
			left, right := 0, len(candidateFiles)-1
			startIdx := 0
			for left <= right {
				mid := (left + right) / 2
				entry := candidateFiles[mid]
				if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
					if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
						fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
						fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))
						if startOffset < fileMinOffset {
							// Our offset is before this file, search left
							right = mid - 1
						} else if startOffset > fileMaxOffset {
							// Our offset is after this file, search right
							left = mid + 1
							startIdx = left
						} else {
							// Found the file containing our offset
							startIdx = mid
							break
						}
					} else {
						break
					}
				} else {
					break
				}
			}
			// Process files starting from the found index
			candidateFiles = candidateFiles[startIdx:]
		}
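		// Note: the binary search above assumes candidateFiles come back from the
		// listing in ascending offset order (file names are timestamps, so listing
		// order follows write order); offset_min/offset_max are stored as 8-byte
		// big-endian values in each entry's extended attributes.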
		// Second pass: process the filtered files.
		// CRITICAL: For offset-based reads, process ALL candidate files in one call.
		// This prevents multiple ReadFromDiskFn calls with 1.127s overhead each.
		var filesProcessed int
		var lastProcessedOffset int64
		for _, entry := range candidateFiles {
			var fileTsNs int64
			if fileTsNs, err = eachFileFn(entry, eachLogEntryFn, startTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
				return lastReadPosition, isDone, err
			}
			if fileTsNs > 0 {
				processedTsNs = fileTsNs
				filesProcessed++
			}

			// For offset-based reads, track the last processed offset.
			// We need to continue reading ALL files to avoid multiple disk read calls.
			if isOffsetBased {
				// Extract the last offset from the file's extended attributes
				if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
					fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))
					if fileMaxOffset > lastProcessedOffset {
						lastProcessedOffset = fileMaxOffset
					}
				}
			}
		}

		if isOffsetBased && filesProcessed > 0 {
			// Return a position that indicates we've read all disk data up to lastProcessedOffset.
			// This prevents the subscription from calling ReadFromDiskFn again for these offsets.
			lastReadPosition = log_buffer.NewMessagePositionFromOffset(lastProcessedOffset + 1)
		} else {
			// CRITICAL FIX: If no files were processed (e.g., all data already consumed),
			// return the requested offset to prevent a busy loop.
			if isOffsetBased {
				// For offset-based reads with no data, return the requested offset.
				// This signals "I've checked, there's no data at this offset, move forward".
				lastReadPosition = log_buffer.NewMessagePositionFromOffset(startOffset)
			} else {
				// For timestamp-based reads, return the last processed timestamp with the -2 sentinel
				lastReadPosition = log_buffer.NewMessagePosition(processedTsNs, -2)
			}
		}

		return
	}
}
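
// exampleReplayFromDisk is an illustrative sketch, not called by production code: it
// shows how a caller might drive the read function returned by GenLogOnDiskReadFunc.
// It assumes log_buffer.EachLogEntryFuncType has the shape
// func(*filer_pb.LogEntry) (isDone bool, err error), matching how eachLogEntryFn is
// invoked above, and that NewMessagePositionFromOffset yields a position with
// IsOffsetBased set; the filer client, topic, and partition are supplied by the caller.
func exampleReplayFromDisk(filerClient filer_pb.FilerClient, t topic.Topic, p topic.Partition) error {
	readFromDisk := GenLogOnDiskReadFunc(filerClient, t, p)

	// Offset-based start position: replay everything on disk from offset 0.
	startPosition := log_buffer.NewMessagePositionFromOffset(0)

	// stopTsNs of 0 means "no stop time"; the callback prints each replayed entry.
	lastReadPosition, isDone, err := readFromDisk(startPosition, 0, func(logEntry *filer_pb.LogEntry) (bool, error) {
		fmt.Printf("offset=%d ts=%s key=%s\n", logEntry.Offset, time.Unix(0, logEntry.TsNs).UTC(), string(logEntry.Key))
		return false, nil // keep reading
	})
	if err != nil {
		return err
	}
	glog.V(1).Infof("disk replay done: lastReadPosition=%+v isDone=%v", lastReadPosition, isDone)
	return nil
}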