You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
99 lines
2.8 KiB
99 lines
2.8 KiB
package blockvol
|
|
|
|
import (
|
|
"fmt"
|
|
"sync"
|
|
)
|
|
|
|
// MakeDistributedSync creates a sync function that runs local WAL fsync and
|
|
// replica barriers in parallel. Supports N replicas via ShipperGroup.
|
|
//
|
|
// Durability semantics depend on vol.DurabilityMode():
|
|
// - best_effort: local fsync = ACK; replica failures degrade shippers but never fail writes
|
|
// - sync_all: ALL replica barriers must succeed, else write returns ErrDurabilityBarrierFailed
|
|
// - sync_quorum: quorum (RF/2+1) of nodes must be durable, else ErrDurabilityQuorumLost
|
|
func MakeDistributedSync(walSync func() error, group *ShipperGroup, vol *BlockVol) func() error {
|
|
return func() error {
|
|
mode := vol.DurabilityMode()
|
|
|
|
if group == nil || group.Len() == 0 || group.AllDegraded() {
|
|
// No healthy replicas available.
|
|
switch mode {
|
|
case DurabilitySyncAll:
|
|
if group != nil && (group.Len() > 0 || group.AllDegraded()) {
|
|
if vol.Metrics != nil {
|
|
vol.Metrics.DurabilityBarrierFailedTotal.Add(1)
|
|
}
|
|
return ErrDurabilityBarrierFailed
|
|
}
|
|
case DurabilitySyncQuorum:
|
|
if group != nil && group.Len() > 0 {
|
|
// quorum = (Len+1)/2+1; with 0 healthy replicas, only primary is durable
|
|
rf := group.Len() + 1
|
|
quorum := rf/2 + 1
|
|
if 1 < quorum { // primary alone doesn't meet quorum
|
|
if vol.Metrics != nil {
|
|
vol.Metrics.DurabilityQuorumLostTotal.Add(1)
|
|
}
|
|
return ErrDurabilityQuorumLost
|
|
}
|
|
}
|
|
}
|
|
return walSync()
|
|
}
|
|
|
|
// The highest LSN that needs to be durable is nextLSN-1.
|
|
lsnMax := vol.nextLSN.Load() - 1
|
|
|
|
var localErr error
|
|
var barrierErrs []error
|
|
var wg sync.WaitGroup
|
|
wg.Add(2)
|
|
go func() {
|
|
defer wg.Done()
|
|
localErr = walSync()
|
|
}()
|
|
go func() {
|
|
defer wg.Done()
|
|
barrierErrs = group.BarrierAll(lsnMax)
|
|
}()
|
|
wg.Wait()
|
|
|
|
if localErr != nil {
|
|
return localErr
|
|
}
|
|
|
|
// Count barrier failures and degrade shippers.
|
|
failCount := 0
|
|
for _, err := range barrierErrs {
|
|
if err != nil {
|
|
failCount++
|
|
vol.degradeReplica(err)
|
|
}
|
|
}
|
|
|
|
switch mode {
|
|
case DurabilitySyncAll:
|
|
if failCount > 0 {
|
|
if vol.Metrics != nil {
|
|
vol.Metrics.DurabilityBarrierFailedTotal.Add(1)
|
|
}
|
|
return fmt.Errorf("%w: %d of %d barriers failed",
|
|
ErrDurabilityBarrierFailed, failCount, len(barrierErrs))
|
|
}
|
|
case DurabilitySyncQuorum:
|
|
rf := group.Len() + 1 // total nodes including primary
|
|
quorum := rf/2 + 1
|
|
durableNodes := 1 + (len(barrierErrs) - failCount) // primary + successful barriers
|
|
if durableNodes < quorum {
|
|
if vol.Metrics != nil {
|
|
vol.Metrics.DurabilityQuorumLostTotal.Add(1)
|
|
}
|
|
return fmt.Errorf("%w: %d durable of %d needed",
|
|
ErrDurabilityQuorumLost, durableNodes, quorum)
|
|
}
|
|
}
|
|
// best_effort: barrier failures already logged via degradeReplica, return nil.
|
|
return nil
|
|
}
|
|
}
|