package logstore

import (
	"context"
	"encoding/binary"
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/util"
	util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
	"google.golang.org/protobuf/proto"
)

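// GenLogOnDiskReadFunc builds the log_buffer.LogReadFromDiskFuncType used to replay a
// partition's messages from the log files stored under its partition directory in the filer.
// The returned function supports both timestamp-based and offset-based subscriptions:
// it lists candidate log files, skips files whose offset range (taken from the
// "offset_min"/"offset_max" extended attributes) or name-encoded timestamp falls outside
// the requested range, and streams each remaining file through eachFileFn and eachChunkFn.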
func GenLogOnDiskReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic.Partition) log_buffer.LogReadFromDiskFuncType {
	partitionDir := topic.PartitionDir(t, p)

	// Create a small cache for recently-read file chunks (3 files, 60s TTL)
	// This significantly reduces Filer load when multiple consumers are catching up
	fileCache := log_buffer.NewDiskBufferCache(3, 60*time.Second)

	lookupFileIdFn := filer.LookupFn(filerClient)

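	// eachChunkFn decodes one fetched log file buffer: it walks the length-prefixed
	// filer_pb.LogEntry records, filters them by start offset or timestamp range,
	// and passes each remaining entry to eachLogEntryFn.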
	eachChunkFn := func(buf []byte, eachLogEntryFn log_buffer.EachLogEntryFuncType, starTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		entriesSkipped := 0
		entriesProcessed := 0
		for pos := 0; pos+4 < len(buf); {

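			// Each record is framed as a 4-byte length prefix (decoded with
			// util.BytesToUint32) followed by a marshaled filer_pb.LogEntry.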
			size := util.BytesToUint32(buf[pos : pos+4])
			if pos+4+int(size) > len(buf) {
				err = fmt.Errorf("GenLogOnDiskReadFunc: read [%d,%d) from [0,%d)", pos, pos+int(size)+4, len(buf))
				return
			}
			entryData := buf[pos+4 : pos+4+int(size)]

			logEntry := &filer_pb.LogEntry{}
			if err = proto.Unmarshal(entryData, logEntry); err != nil {
				err = fmt.Errorf("unexpected unmarshal filer_pb.LogEntry: %w", err)
				return
			}

			// Filter by offset if this is an offset-based subscription
			if isOffsetBased {
				if logEntry.Offset < startOffset {
					entriesSkipped++
					pos += 4 + int(size)
					continue
				}
			} else {
				// Filter by timestamp for timestamp-based subscriptions
				if logEntry.TsNs <= starTsNs {
					pos += 4 + int(size)
					continue
				}
				if stopTsNs != 0 && logEntry.TsNs > stopTsNs {
					glog.V(2).Infof("stop reading: stopTsNs %d reached, logEntry.TsNs %d", stopTsNs, logEntry.TsNs)
					return
				}
			}

			// fmt.Printf(" read logEntry: %v, ts %v\n", string(logEntry.Key), time.Unix(0, logEntry.TsNs).UTC())
			if _, err = eachLogEntryFn(logEntry); err != nil {
				err = fmt.Errorf("process log entry %v: %w", logEntry, err)
				return
			}

			processedTsNs = logEntry.TsNs
			entriesProcessed++

			pos += 4 + int(size)

		}

		return
	}

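	// eachFileFn reads one log file entry: it resolves each chunk's file id to volume
	// server URLs, serves the chunk from fileCache when possible (including negative
	// entries for chunks whose fetch previously failed), otherwise fetches it over HTTP
	// and caches it, then decodes the bytes with eachChunkFn.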
	eachFileFn := func(entry *filer_pb.Entry, eachLogEntryFn log_buffer.EachLogEntryFuncType, starTsNs, stopTsNs int64, startOffset int64, isOffsetBased bool) (processedTsNs int64, err error) {
		if len(entry.Content) > 0 {
			// skip .offset files
			return
		}
		var urlStrings []string
		for _, chunk := range entry.Chunks {
			if chunk.Size == 0 {
				continue
			}
			if chunk.IsChunkManifest {
				glog.Warningf("this should not happen. unexpected chunk manifest in %s/%s", partitionDir, entry.Name)
				return
			}
			urlStrings, err = lookupFileIdFn(context.Background(), chunk.FileId)
			if err != nil {
				glog.V(1).Infof("lookup %s failed: %v", chunk.FileId, err)
				err = fmt.Errorf("lookup %s: %w", chunk.FileId, err)
				return
			}
			if len(urlStrings) == 0 {
				glog.V(1).Infof("no url found for %s", chunk.FileId)
				err = fmt.Errorf("no url found for %s", chunk.FileId)
				return
			}
			glog.V(2).Infof("lookup %s returned %d URLs", chunk.FileId, len(urlStrings))

			// Try to get data from cache first
			cacheKey := fmt.Sprintf("%s/%s/%d/%s", t.Name, p.String(), p.RangeStart, chunk.FileId)
			if cachedData, _, found := fileCache.Get(cacheKey); found {
				if cachedData == nil {
					// Negative cache hit - data doesn't exist
					continue
				}
				// Positive cache hit - data exists
				if processedTsNs, err = eachChunkFn(cachedData, eachLogEntryFn, starTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
					glog.V(1).Infof("eachChunkFn failed on cached data: %v", err)
					return
				}
				continue
			}

			// Cache miss - try each urlString until util_http.Get(urlString) succeeds
			var processed bool
			for _, urlString := range urlStrings {
				// TODO optimization opportunity: reuse the buffer
				var data []byte
				glog.V(2).Infof("trying to fetch data from %s", urlString)
				if data, _, err = util_http.Get(urlString); err == nil {
					glog.V(2).Infof("successfully fetched %d bytes from %s", len(data), urlString)
					processed = true

					// Store in cache for future reads
					fileCache.Put(cacheKey, data, startOffset)

					if processedTsNs, err = eachChunkFn(data, eachLogEntryFn, starTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
						glog.V(1).Infof("eachChunkFn failed: %v", err)
						return
					}
					break
				} else {
					glog.V(2).Infof("failed to fetch from %s: %v", urlString, err)
				}
			}
			if !processed {
				// Store negative cache entry - data doesn't exist or all URLs failed
				fileCache.Put(cacheKey, nil, startOffset)
				glog.V(1).Infof("no data processed for %s %s - all URLs failed", entry.Name, chunk.FileId)
				err = fmt.Errorf("no data processed for %s %s", entry.Name, chunk.FileId)
				return
			}

		}
		return
	}

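	// The returned read function lists the partition directory, filters candidate log
	// files (skipping directories, .parquet, .offset, and out-of-range files), narrows
	// the set with a binary search for offset-based reads, then processes the files in
	// order and reports how far the read advanced.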
	return func(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastReadPosition log_buffer.MessagePosition, isDone bool, err error) {
		startFileName := startPosition.Time.UTC().Format(topic.TIME_FORMAT)
		startTsNs := startPosition.Time.UnixNano()
		stopTime := time.Unix(0, stopTsNs)
		var processedTsNs int64

		// Check if this is an offset-based subscription
		isOffsetBased := startPosition.IsOffsetBased
		var startOffset int64
		if isOffsetBased {
			startOffset = startPosition.Offset
			// CRITICAL FIX: For offset-based reads, ignore startFileName (which is based on Time)
			// and list all files from the beginning to find the right offset
			startFileName = ""
			glog.V(1).Infof("disk read start: topic=%s partition=%s startOffset=%d",
				t.Name, p, startOffset)
		}

		// OPTIMIZATION: For offset-based reads, collect all files with their offset ranges first
		// Then use binary search to find the right file, and skip files that don't contain the offset
		var candidateFiles []*filer_pb.Entry
		var foundStartFile bool

		err = filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
			// First pass: collect all relevant files with their metadata
			glog.V(2).Infof("listing directory %s for offset %d startFileName=%q", partitionDir, startOffset, startFileName)
			return filer_pb.SeaweedList(context.Background(), client, partitionDir, "", func(entry *filer_pb.Entry, isLast bool) error {

				if entry.IsDirectory {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".parquet") {
					return nil
				}
				if strings.HasSuffix(entry.Name, ".offset") {
					return nil
				}
				if stopTsNs != 0 && entry.Name > stopTime.UTC().Format(topic.TIME_FORMAT) {
					return nil
				}

				// OPTIMIZATION: For offset-based reads, check if this file contains the requested offset
				if isOffsetBased {
					glog.V(3).Infof("found file %s", entry.Name)
					// Check if file has offset range metadata
					if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
						if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
							fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
							fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))

							// Skip files that don't contain our offset range
							if startOffset > fileMaxOffset {
								return nil
							}

							// If we haven't found the start file yet, check if this file contains it
							if !foundStartFile && startOffset >= fileMinOffset && startOffset <= fileMaxOffset {
								foundStartFile = true
							}
						}
					}
					// If file doesn't have offset metadata, include it (might be old format)
				} else {
					// Timestamp-based filtering
					topicName := t.Name
					if dotIndex := strings.LastIndex(topicName, "."); dotIndex != -1 {
						topicName = topicName[dotIndex+1:]
					}
					isSystemTopic := strings.HasPrefix(topicName, "_")
					if !isSystemTopic && startPosition.Time.Unix() > 86400 && entry.Name < startPosition.Time.UTC().Format(topic.TIME_FORMAT) {
						return nil
					}
				}

				// Add file to candidates for processing
				candidateFiles = append(candidateFiles, entry)
				glog.V(3).Infof("added candidate file %s (total=%d)", entry.Name, len(candidateFiles))
				return nil

			}, startFileName, true, math.MaxInt32)
		})

		if err != nil {
			glog.Errorf("failed to list directory %s: %v", partitionDir, err)
			return
		}

		glog.V(2).Infof("found %d candidate files for topic=%s partition=%s offset=%d",
			len(candidateFiles), t.Name, p, startOffset)

		if len(candidateFiles) == 0 {
			glog.V(2).Infof("no files found in %s", partitionDir)
			return startPosition, isDone, nil
		}

		// OPTIMIZATION: For offset-based reads with many files, use binary search to find start file
		if isOffsetBased && len(candidateFiles) > 10 {
			// Binary search to find the first file that might contain our offset
			left, right := 0, len(candidateFiles)-1
			startIdx := 0

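			// The search narrows on the "offset_min"/"offset_max" extended attributes;
			// a file without this metadata stops the narrowing, and everything from
			// startIdx onward is processed.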
			for left <= right {
				mid := (left + right) / 2
				entry := candidateFiles[mid]

				if minOffsetBytes, hasMin := entry.Extended["offset_min"]; hasMin && len(minOffsetBytes) == 8 {
					if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
						fileMinOffset := int64(binary.BigEndian.Uint64(minOffsetBytes))
						fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))

						if startOffset < fileMinOffset {
							// Our offset is before this file, search left
							right = mid - 1
						} else if startOffset > fileMaxOffset {
							// Our offset is after this file, search right
							left = mid + 1
							startIdx = left
						} else {
							// Found the file containing our offset
							startIdx = mid
							break
						}
					} else {
						break
					}
				} else {
					break
				}
			}

			// Process files starting from the found index
			candidateFiles = candidateFiles[startIdx:]
		}

		// Second pass: process the filtered files
		// CRITICAL: For offset-based reads, process ALL candidate files in one call
		// This prevents multiple ReadFromDiskFn calls with 1.127s overhead each
		var filesProcessed int
		var lastProcessedOffset int64
		for _, entry := range candidateFiles {
			var fileTsNs int64
			if fileTsNs, err = eachFileFn(entry, eachLogEntryFn, startTsNs, stopTsNs, startOffset, isOffsetBased); err != nil {
				return lastReadPosition, isDone, err
			}
			if fileTsNs > 0 {
				processedTsNs = fileTsNs
				filesProcessed++
			}

			// For offset-based reads, track the last processed offset
			// We need to continue reading ALL files to avoid multiple disk read calls
			if isOffsetBased {
				// Extract the last offset from the file's extended attributes
				if maxOffsetBytes, hasMax := entry.Extended["offset_max"]; hasMax && len(maxOffsetBytes) == 8 {
					fileMaxOffset := int64(binary.BigEndian.Uint64(maxOffsetBytes))
					if fileMaxOffset > lastProcessedOffset {
						lastProcessedOffset = fileMaxOffset
					}
				}
			}
		}

		if isOffsetBased && filesProcessed > 0 {
			// Return a position that indicates we've read all disk data up to lastProcessedOffset
			// This prevents the subscription from calling ReadFromDiskFn again for these offsets
			lastReadPosition = log_buffer.NewMessagePositionFromOffset(lastProcessedOffset + 1)
		} else {
			// CRITICAL FIX: If no files were processed (e.g., all data already consumed),
			// return the requested offset to prevent busy loop
			if isOffsetBased {
				// For offset-based reads with no data, return the requested offset
				// This signals "I've checked, there's no data at this offset, move forward"
				lastReadPosition = log_buffer.NewMessagePositionFromOffset(startOffset)
			} else {
				// For timestamp-based reads, return error (-2)
				lastReadPosition = log_buffer.NewMessagePosition(processedTsNs, -2)
			}
		}
		return
	}
}