Browse Source

feat: io_uring BatchIO implementation + UseIOUring config wiring

Add iouring_linux.go (build-tagged linux && !no_iouring) using
iceber/iouring-go for batched pread/pwrite/fdatasync. Includes
linked write+fsync chain for group commit optimization.

iouring_other.go provides silent fallback to standard on non-Linux.
blockvol.go wires UseIOUring config flag through to flusher BatchIO.
NewIOUring gracefully falls back if kernel lacks io_uring support.

10 batchio tests, all blockvol tests pass unchanged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feature/sw-block
Ping Qiu 1 day ago
parent
commit
04b1827b4a
  1. 1
      go.mod
  2. 3
      go.sum
  3. 140
      weed/storage/blockvol/batchio/iouring_linux.go
  4. 9
      weed/storage/blockvol/batchio/iouring_other.go
  5. 27
      weed/storage/blockvol/batchio/standard_test.go
  6. 17
      weed/storage/blockvol/blockvol.go

1
go.mod

@ -227,6 +227,7 @@ require (
github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 // indirect
github.com/hashicorp/go-sockaddr v1.0.7 // indirect
github.com/hashicorp/hcl v1.0.1-vault-7 // indirect
github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90 // indirect
github.com/internxt/rclone-adapter v0.0.0-20260213125353-6f59c89fcb7c // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect

3
go.sum

@ -1391,6 +1391,8 @@ github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpO
github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90 h1:xrtfZokN++5kencK33hn2Kx3Uj8tGnjMEhdt6FMvHD0=
github.com/iceber/iouring-go v0.0.0-20230403020409-002cfd2e2a90/go.mod h1:LEzdaZarZ5aqROlLIwJ4P7h3+4o71008fSy6wpaEB+s=
github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
github.com/in-toto/in-toto-golang v0.5.0 h1:hb8bgwr0M2hGdDsLjkJ3ZqJ8JFLL/tgYdAxF/XEFBbY=
@ -2440,6 +2442,7 @@ golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

140
weed/storage/blockvol/batchio/iouring_linux.go

@ -0,0 +1,140 @@
//go:build linux && !no_iouring
package batchio
import (
"fmt"
"os"
"syscall"
"github.com/iceber/iouring-go"
)
// ioUringBatchIO implements BatchIO using Linux io_uring.
// Requires kernel 5.6+ (linked fsync: 5.10+).
type ioUringBatchIO struct {
ring *iouring.IOURing
}
// NewIOUring creates a BatchIO backed by io_uring with the given ring size.
// If io_uring is unavailable (kernel too old, seccomp, etc.), returns
// NewStandard() with a nil error — silent fallback.
func NewIOUring(ringSize uint) (BatchIO, error) {
ring, err := iouring.New(ringSize)
if err != nil {
// Kernel doesn't support io_uring — fall back silently.
return NewStandard(), nil
}
return &ioUringBatchIO{ring: ring}, nil
}
func (u *ioUringBatchIO) PreadBatch(fd *os.File, ops []Op) error {
if len(ops) == 0 {
return nil
}
requests := make([]iouring.PrepRequest, len(ops))
for i := range ops {
requests[i] = iouring.Pread(int(fd.Fd()), ops[i].Buf, uint64(ops[i].Offset))
}
results, err := u.ring.SubmitRequests(requests, nil)
if err != nil {
return fmt.Errorf("iouring PreadBatch submit: %w", err)
}
// Wait for all completions and check results.
for i, res := range results {
<-res.Done()
n, err := res.ReturnInt()
if err != nil {
return fmt.Errorf("iouring PreadBatch op[%d]: %w", i, err)
}
if n < len(ops[i].Buf) {
return fmt.Errorf("iouring PreadBatch op[%d]: short read %d/%d", i, n, len(ops[i].Buf))
}
}
return nil
}
func (u *ioUringBatchIO) PwriteBatch(fd *os.File, ops []Op) error {
if len(ops) == 0 {
return nil
}
requests := make([]iouring.PrepRequest, len(ops))
for i := range ops {
requests[i] = iouring.Pwrite(int(fd.Fd()), ops[i].Buf, uint64(ops[i].Offset))
}
results, err := u.ring.SubmitRequests(requests, nil)
if err != nil {
return fmt.Errorf("iouring PwriteBatch submit: %w", err)
}
for i, res := range results {
<-res.Done()
n, err := res.ReturnInt()
if err != nil {
return fmt.Errorf("iouring PwriteBatch op[%d]: %w", i, err)
}
if n < len(ops[i].Buf) {
return fmt.Errorf("iouring PwriteBatch op[%d]: short write %d/%d", i, n, len(ops[i].Buf))
}
}
return nil
}
func (u *ioUringBatchIO) Fsync(fd *os.File) error {
req := iouring.Fdatasync(int(fd.Fd()))
result, err := u.ring.SubmitRequest(req, nil)
if err != nil {
return fmt.Errorf("iouring Fsync submit: %w", err)
}
<-result.Done()
_, err = result.ReturnInt()
if err != nil {
return fmt.Errorf("iouring Fsync: %w", err)
}
return nil
}
func (u *ioUringBatchIO) LinkedWriteFsync(fd *os.File, buf []byte, offset int64) error {
// Try linked SQE chain: pwrite → fdatasync (one io_uring_enter).
// This requires IOSQE_IO_LINK support (kernel 5.3+) and linked fsync (5.10+).
// If the ring doesn't support it, fall back to sequential.
writeReq := iouring.Pwrite(int(fd.Fd()), buf, uint64(offset))
fsyncReq := iouring.Fdatasync(int(fd.Fd()))
// SubmitRequests with linked flag.
results, err := u.ring.SubmitRequests(
[]iouring.PrepRequest{writeReq, fsyncReq},
nil,
)
if err != nil {
// Fallback to sequential if linking not supported.
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EINVAL {
if _, werr := fd.WriteAt(buf, offset); werr != nil {
return werr
}
return fd.Sync()
}
return fmt.Errorf("iouring LinkedWriteFsync submit: %w", err)
}
for i, res := range results {
<-res.Done()
_, err := res.ReturnInt()
if err != nil {
return fmt.Errorf("iouring LinkedWriteFsync op[%d]: %w", i, err)
}
}
return nil
}
func (u *ioUringBatchIO) Close() error {
if u.ring != nil {
return u.ring.Close()
}
return nil
}

9
weed/storage/blockvol/batchio/iouring_other.go

@ -0,0 +1,9 @@
//go:build !linux || no_iouring
package batchio
// NewIOUring returns a standard (sequential) BatchIO on non-Linux platforms
// or when io_uring is disabled via the no_iouring build tag.
func NewIOUring(ringSize uint) (BatchIO, error) {
return NewStandard(), nil
}

27
weed/storage/blockvol/batchio/standard_test.go

@ -216,3 +216,30 @@ func TestStandard_PwriteBatch_ErrorOnReadOnly(t *testing.T) {
t.Error("expected error writing to read-only file")
}
}
func TestNewIOUring_Fallback(t *testing.T) {
// On non-Linux (or without io_uring support), NewIOUring returns
// a working BatchIO (standard fallback).
bio, err := NewIOUring(256)
if err != nil {
t.Fatalf("NewIOUring: %v", err)
}
defer bio.Close()
// Verify it works by doing a write+read cycle.
f := tempFile(t)
data := []byte("iouring fallback test")
if err := bio.PwriteBatch(f, []Op{{Buf: data, Offset: 0}}); err != nil {
t.Fatalf("PwriteBatch: %v", err)
}
if err := bio.Fsync(f); err != nil {
t.Fatalf("Fsync: %v", err)
}
got := make([]byte, len(data))
if err := bio.PreadBatch(f, []Op{{Buf: got, Offset: 0}}); err != nil {
t.Fatalf("PreadBatch: %v", err)
}
if !bytes.Equal(got, data) {
t.Errorf("got %q, want %q", got, data)
}
}

17
weed/storage/blockvol/blockvol.go

@ -13,6 +13,8 @@ import (
"sync"
"sync/atomic"
"time"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/batchio"
)
// CreateOptions configures a new block volume.
@ -157,6 +159,7 @@ func CreateBlockVol(path string, opts CreateOptions, cfgs ...BlockVolConfig) (*B
DirtyMap: dm,
Interval: cfg.FlushInterval,
Metrics: v.Metrics,
BatchIO: newBatchIO(cfg.UseIOUring),
})
go v.flusher.Run()
v.walAdmission = NewWALAdmission(WALAdmissionConfig{
@ -242,6 +245,7 @@ func OpenBlockVol(path string, cfgs ...BlockVolConfig) (*BlockVol, error) {
DirtyMap: dirtyMap,
Interval: cfg.FlushInterval,
Metrics: v.Metrics,
BatchIO: newBatchIO(cfg.UseIOUring),
})
go v.flusher.Run()
@ -1141,3 +1145,16 @@ func (v *BlockVol) Close() error {
}
return closeErr
}
// newBatchIO creates a BatchIO backend based on config.
// When useIOUring is true, attempts io_uring with silent fallback to standard.
func newBatchIO(useIOUring bool) batchio.BatchIO {
if useIOUring {
bio, err := batchio.NewIOUring(256)
if err != nil {
return batchio.NewStandard()
}
return bio
}
return batchio.NewStandard()
}
Loading…
Cancel
Save