You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
377 lines
10 KiB
377 lines
10 KiB
package blockvol
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"net"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Sentinel errors reported while applying replicated WAL entries. Callers
// should match them with errors.Is, since they are returned wrapped.
var (
	// ErrStaleEpoch is returned when a WAL entry's epoch differs from the
	// volume's local epoch.
	ErrStaleEpoch = errors.New("blockvol: stale epoch")

	// ErrDuplicateLSN is returned when a WAL entry's LSN is not the next
	// expected one (for example, when there is a gap in the sequence).
	ErrDuplicateLSN = errors.New("blockvol: duplicate or out-of-order LSN")
)
|
|
|
|
// ReplicaReceiver listens for WAL entries from a primary and applies them
|
|
// to the local BlockVol. It runs two listeners: one for the data channel
|
|
// (WAL entries) and one for the control channel (barrier requests).
|
|
type ReplicaReceiver struct {
|
|
vol *BlockVol
|
|
barrierTimeout time.Duration
|
|
advertisedHost string // canonical IP for address reporting; empty = auto-detect
|
|
|
|
mu sync.Mutex
|
|
receivedLSN uint64
|
|
flushedLSN uint64 // highest LSN durably persisted (fd.Sync completed); updated only in handleBarrier
|
|
cond *sync.Cond
|
|
|
|
connMu sync.Mutex // protects activeConns
|
|
activeConns map[net.Conn]struct{}
|
|
|
|
dataListener net.Listener
|
|
ctrlListener net.Listener
|
|
stopCh chan struct{}
|
|
stopped bool
|
|
wg sync.WaitGroup
|
|
}
|
|
|
|
// defaultBarrierTimeout is the barrierTimeout assigned to every receiver
// created by NewReplicaReceiver.
const defaultBarrierTimeout = 5 * time.Second
|
|
|
|
// NewReplicaReceiver creates and starts listening on the data and control ports.
|
|
// advertisedHost is the canonical IP for this server (from VS listen addr or
|
|
// heartbeat identity). If empty, DataAddr()/CtrlAddr() will fall back to
|
|
// outbound-IP detection. On multi-NIC hosts, always provide advertisedHost.
|
|
func NewReplicaReceiver(vol *BlockVol, dataAddr, ctrlAddr string, advertisedHost ...string) (*ReplicaReceiver, error) {
|
|
dataLn, err := net.Listen("tcp", dataAddr)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("replica: listen data %s: %w", dataAddr, err)
|
|
}
|
|
ctrlLn, err := net.Listen("tcp", ctrlAddr)
|
|
if err != nil {
|
|
dataLn.Close()
|
|
return nil, fmt.Errorf("replica: listen ctrl %s: %w", ctrlAddr, err)
|
|
}
|
|
|
|
var advHost string
|
|
if len(advertisedHost) > 0 {
|
|
advHost = advertisedHost[0]
|
|
}
|
|
// Initialize from the volume's persisted state on receiver recreation.
|
|
// receivedLSN: highest LSN in the WAL (may include unflushed entries).
|
|
// flushedLSN: only checkpoint LSN is guaranteed durable (fd.Sync completed
|
|
// during flusher cycle). Using nextLSN would overstate durability because
|
|
// applyEntry advances nextLSN before barrier fd.Sync.
|
|
initReceived := uint64(0)
|
|
if vol.nextLSN.Load() > 1 {
|
|
initReceived = vol.nextLSN.Load() - 1
|
|
}
|
|
initFlushed := uint64(0)
|
|
if vol.flusher != nil {
|
|
initFlushed = vol.flusher.CheckpointLSN()
|
|
}
|
|
r := &ReplicaReceiver{
|
|
vol: vol,
|
|
receivedLSN: initReceived,
|
|
flushedLSN: initFlushed,
|
|
barrierTimeout: defaultBarrierTimeout,
|
|
advertisedHost: advHost,
|
|
dataListener: dataLn,
|
|
ctrlListener: ctrlLn,
|
|
stopCh: make(chan struct{}),
|
|
activeConns: make(map[net.Conn]struct{}),
|
|
}
|
|
r.cond = sync.NewCond(&r.mu)
|
|
return r, nil
|
|
}
|
|
|
|
// Serve starts accept loops for both listeners. Call Stop() to shut down.
|
|
func (r *ReplicaReceiver) Serve() {
|
|
r.wg.Add(2)
|
|
go r.acceptDataLoop()
|
|
go r.acceptCtrlLoop()
|
|
}
|
|
|
|
// Stop shuts down both listeners, closes active connections, and waits for goroutines.
|
|
func (r *ReplicaReceiver) Stop() {
|
|
r.mu.Lock()
|
|
if r.stopped {
|
|
r.mu.Unlock()
|
|
return
|
|
}
|
|
r.stopped = true
|
|
r.mu.Unlock()
|
|
|
|
close(r.stopCh)
|
|
r.dataListener.Close()
|
|
r.ctrlListener.Close()
|
|
|
|
// Close all active connections to unblock ReadFrame calls.
|
|
r.connMu.Lock()
|
|
for conn := range r.activeConns {
|
|
conn.Close()
|
|
}
|
|
r.connMu.Unlock()
|
|
|
|
// Wake any barrier waiters so they can exit (must hold mu for cond).
|
|
r.mu.Lock()
|
|
r.cond.Broadcast()
|
|
r.mu.Unlock()
|
|
r.wg.Wait()
|
|
}
|
|
|
|
func (r *ReplicaReceiver) trackConn(conn net.Conn) {
|
|
r.connMu.Lock()
|
|
r.activeConns[conn] = struct{}{}
|
|
r.connMu.Unlock()
|
|
}
|
|
|
|
func (r *ReplicaReceiver) untrackConn(conn net.Conn) {
|
|
r.connMu.Lock()
|
|
delete(r.activeConns, conn)
|
|
r.connMu.Unlock()
|
|
}
|
|
|
|
func (r *ReplicaReceiver) acceptDataLoop() {
|
|
defer r.wg.Done()
|
|
for {
|
|
conn, err := r.dataListener.Accept()
|
|
if err != nil {
|
|
select {
|
|
case <-r.stopCh:
|
|
return
|
|
default:
|
|
log.Printf("replica: data accept error: %v", err)
|
|
return
|
|
}
|
|
}
|
|
r.trackConn(conn)
|
|
r.wg.Add(1)
|
|
go func() {
|
|
defer r.wg.Done()
|
|
defer r.untrackConn(conn)
|
|
r.handleDataConn(conn)
|
|
}()
|
|
}
|
|
}
|
|
|
|
func (r *ReplicaReceiver) acceptCtrlLoop() {
|
|
defer r.wg.Done()
|
|
for {
|
|
conn, err := r.ctrlListener.Accept()
|
|
if err != nil {
|
|
select {
|
|
case <-r.stopCh:
|
|
return
|
|
default:
|
|
log.Printf("replica: ctrl accept error: %v", err)
|
|
return
|
|
}
|
|
}
|
|
r.trackConn(conn)
|
|
r.wg.Add(1)
|
|
go func() {
|
|
defer r.wg.Done()
|
|
defer r.untrackConn(conn)
|
|
r.handleControlConn(conn)
|
|
}()
|
|
}
|
|
}
|
|
|
|
// handleDataConn reads WAL entry frames and applies them to the local volume.
|
|
func (r *ReplicaReceiver) handleDataConn(conn net.Conn) {
|
|
defer conn.Close()
|
|
for {
|
|
select {
|
|
case <-r.stopCh:
|
|
return
|
|
default:
|
|
}
|
|
|
|
msgType, payload, err := ReadFrame(conn)
|
|
if err != nil {
|
|
select {
|
|
case <-r.stopCh:
|
|
default:
|
|
log.Printf("replica: data read error: %v", err)
|
|
}
|
|
return
|
|
}
|
|
|
|
switch msgType {
|
|
case MsgWALEntry:
|
|
if err := r.applyEntry(payload); err != nil {
|
|
log.Printf("replica: apply entry error: %v", err)
|
|
}
|
|
case MsgResumeShipReq:
|
|
r.handleResumeShipReq(conn, payload)
|
|
case MsgCatchupDone:
|
|
lsn, err := DecodeCatchupDone(payload)
|
|
if err != nil {
|
|
log.Printf("replica: decode catchup done: %v", err)
|
|
} else {
|
|
log.Printf("replica: catch-up done, snapshot LSN=%d", lsn)
|
|
}
|
|
default:
|
|
log.Printf("replica: unexpected data message type 0x%02x", msgType)
|
|
}
|
|
}
|
|
}
|
|
|
|
// handleResumeShipReq processes the reconnect handshake from the primary.
|
|
func (r *ReplicaReceiver) handleResumeShipReq(conn net.Conn, payload []byte) {
|
|
req, err := DecodeResumeShipReq(payload)
|
|
if err != nil {
|
|
log.Printf("replica: decode resume ship req: %v", err)
|
|
resp := EncodeResumeShipResp(ResumeShipResp{Status: ResumeNeedsRebuild})
|
|
WriteFrame(conn, MsgResumeShipResp, resp)
|
|
return
|
|
}
|
|
|
|
// Validate epoch.
|
|
localEpoch := r.vol.epoch.Load()
|
|
if req.Epoch != localEpoch {
|
|
log.Printf("replica: resume ship epoch mismatch: req=%d local=%d", req.Epoch, localEpoch)
|
|
resp := EncodeResumeShipResp(ResumeShipResp{
|
|
Status: ResumeEpochMismatch,
|
|
ReplicaFlushedLSN: r.FlushedLSN(),
|
|
})
|
|
WriteFrame(conn, MsgResumeShipResp, resp)
|
|
return
|
|
}
|
|
|
|
resp := EncodeResumeShipResp(ResumeShipResp{
|
|
Status: ResumeOK,
|
|
ReplicaFlushedLSN: r.FlushedLSN(),
|
|
})
|
|
WriteFrame(conn, MsgResumeShipResp, resp)
|
|
}
|
|
|
|
// applyEntry decodes and applies a single WAL entry to the local volume.
|
|
// The entire apply (LSN check -> WAL append -> dirty map -> receivedLSN update)
|
|
// is serialized under mu to prevent TOCTOU races between concurrent entries.
|
|
func (r *ReplicaReceiver) applyEntry(payload []byte) error {
|
|
entry, err := DecodeWALEntry(payload)
|
|
if err != nil {
|
|
return fmt.Errorf("decode WAL entry: %w", err)
|
|
}
|
|
|
|
// ioMu.RLock: protect WAL/dirtyMap mutation against exclusive restore/import.
|
|
r.vol.ioMu.RLock()
|
|
defer r.vol.ioMu.RUnlock()
|
|
|
|
// Validate epoch: replicas must NOT accept epoch bumps from WAL stream.
|
|
// Only the master can change epochs (via SetEpoch in CP3).
|
|
localEpoch := r.vol.epoch.Load()
|
|
if entry.Epoch != localEpoch {
|
|
return fmt.Errorf("%w: entry epoch %d != local %d", ErrStaleEpoch, entry.Epoch, localEpoch)
|
|
}
|
|
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
// Enforce contiguous LSN: only accept the next expected entry.
|
|
// This prevents gaps that would let a barrier pass incorrectly.
|
|
if entry.LSN <= r.receivedLSN {
|
|
log.Printf("replica: skipping duplicate/old LSN %d (received %d)", entry.LSN, r.receivedLSN)
|
|
return nil
|
|
}
|
|
if entry.LSN != r.receivedLSN+1 {
|
|
return fmt.Errorf("%w: expected LSN %d, got %d (gap)", ErrDuplicateLSN, r.receivedLSN+1, entry.LSN)
|
|
}
|
|
|
|
// Append to local WAL (with retry on WAL full).
|
|
walOff, err := r.replicaAppendWithRetry(&entry)
|
|
if err != nil {
|
|
return fmt.Errorf("WAL append: %w", err)
|
|
}
|
|
|
|
// Update dirty map.
|
|
switch entry.Type {
|
|
case EntryTypeWrite, EntryTypeTrim:
|
|
blocks := entry.Length / r.vol.super.BlockSize
|
|
for i := uint32(0); i < blocks; i++ {
|
|
r.vol.dirtyMap.Put(entry.LBA+uint64(i), walOff, entry.LSN, r.vol.super.BlockSize)
|
|
}
|
|
}
|
|
|
|
// Update receivedLSN and signal barrier waiters.
|
|
r.receivedLSN = entry.LSN
|
|
r.cond.Broadcast()
|
|
|
|
// Update vol.nextLSN so Status().WALHeadLSN reflects replicated state.
|
|
// CAS loop: only advance, never regress.
|
|
for {
|
|
cur := r.vol.nextLSN.Load()
|
|
next := entry.LSN + 1
|
|
if next <= cur {
|
|
break
|
|
}
|
|
if r.vol.nextLSN.CompareAndSwap(cur, next) {
|
|
break
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// replicaAppendWithRetry appends a WAL entry, retrying on WAL-full by
|
|
// triggering the flusher. Caller must hold r.mu.
|
|
func (r *ReplicaReceiver) replicaAppendWithRetry(entry *WALEntry) (uint64, error) {
|
|
walOff, err := r.vol.wal.Append(entry)
|
|
if !errors.Is(err, ErrWALFull) {
|
|
return walOff, err
|
|
}
|
|
|
|
deadline := time.After(r.vol.config.WALFullTimeout)
|
|
for errors.Is(err, ErrWALFull) {
|
|
select {
|
|
case <-r.stopCh:
|
|
return 0, fmt.Errorf("replica: stopped during WAL retry")
|
|
default:
|
|
}
|
|
if r.vol.flusher != nil {
|
|
r.vol.flusher.NotifyUrgent()
|
|
}
|
|
// Release mu briefly so barrier waiters can proceed and
|
|
// the flusher can make progress (it may need dirty map lock).
|
|
r.mu.Unlock()
|
|
select {
|
|
case <-deadline:
|
|
r.mu.Lock()
|
|
return 0, fmt.Errorf("replica: WAL full timeout: %w", ErrWALFull)
|
|
case <-time.After(1 * time.Millisecond):
|
|
}
|
|
r.mu.Lock()
|
|
walOff, err = r.vol.wal.Append(entry)
|
|
}
|
|
return walOff, err
|
|
}
|
|
|
|
// ReceivedLSN returns the highest LSN received and written to the local WAL.
|
|
func (r *ReplicaReceiver) ReceivedLSN() uint64 {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
return r.receivedLSN
|
|
}
|
|
|
|
// FlushedLSN returns the highest LSN durably persisted on this replica
|
|
// (after successful WAL fd.Sync). Updated only by handleBarrier.
|
|
func (r *ReplicaReceiver) FlushedLSN() uint64 {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
return r.flushedLSN
|
|
}
|
|
|
|
// DataAddr returns the data listener's canonical address (ip:port).
|
|
// Wildcard listener addresses are resolved using the advertised host
|
|
// or outbound-IP fallback.
|
|
func (r *ReplicaReceiver) DataAddr() string {
|
|
return canonicalizeListenerAddr(r.dataListener.Addr(), r.advertisedHost)
|
|
}
|
|
|
|
// CtrlAddr returns the control listener's canonical address (ip:port).
|
|
func (r *ReplicaReceiver) CtrlAddr() string {
|
|
return canonicalizeListenerAddr(r.ctrlListener.Addr(), r.advertisedHost)
|
|
}
|