Browse Source

feat: Phase 10 CP10-1 -- NVMe/TCP target MVP, 109 tests

NVMe over Fabrics (TCP) target implementation sharing the same BlockVol
engine, fencing, replication, and failover as the existing iSCSI target.

New package: weed/storage/blockvol/nvme/ (11 files, 2,242 production LOC)
- protocol.go: PDU types, opcodes, status codes, marshal/unmarshal
- wire.go: TCP reader/writer with header bounds validation
- controller.go: IC handshake, per-queue state, command dispatch, KATO
- fabric.go: Connect (admin+IO), PropertyGet/Set, Disconnect
- identify.go: Controller/Namespace/NS list/NS descriptors (Linux 5.15)
- admin.go: SetFeatures, GetFeatures, GetLogPage (SMART/ANA), KeepAlive
- io.go: Read (C2HData), Write (inline), Flush, WriteZeros/Trim
- server.go: TCP listener, admin session registry, graceful shutdown
- adapter.go: BlockVol-to-NVMe bridge, error mapping, ANA state

Integration: NVMeConfig + CLI flags (-block.nvme.*), disabled by default.

Key design: inline-data writes only (no R2T), MaxH2CDataLength=32KB,
single ANA group coherent with BlockVol role, CNTLID session registry
for cross-connection IO queues, HostNQN continuity enforcement.

Tests: 65 dev + 44 QA adversarial = 109 total, all passing.
Bugs fixed during review: IO queue cross-connection (A), header bounds
validation (B), write payload size check (C), disconnect error (D),
stream desync prevention (E), HostNQN enforcement (F), capsule-before-IC
state guard (H), flowCtlOff SQHD timing (I).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feature/sw-block
Ping Qiu 1 day ago
parent
commit
0e234f5c80
  1. 20
      weed/command/volume.go
  2. 2
      weed/server/block_heartbeat_loop_test.go
  3. 2
      weed/server/volume_grpc_block_test.go
  4. 96
      weed/server/volume_server_block.go
  5. 4
      weed/server/volume_server_block_test.go
  6. 127
      weed/storage/blockvol/nvme/adapter.go
  7. 198
      weed/storage/blockvol/nvme/admin.go
  8. 354
      weed/storage/blockvol/nvme/controller.go
  9. 300
      weed/storage/blockvol/nvme/fabric.go
  10. 250
      weed/storage/blockvol/nvme/identify.go
  11. 157
      weed/storage/blockvol/nvme/io.go
  12. 1541
      weed/storage/blockvol/nvme/nvme_qa_test.go
  13. 2377
      weed/storage/blockvol/nvme/nvme_test.go
  14. 444
      weed/storage/blockvol/nvme/protocol.go
  15. 210
      weed/storage/blockvol/nvme/server.go
  16. 202
      weed/storage/blockvol/nvme/wire.go

20
weed/command/volume.go

@ -79,6 +79,12 @@ type VolumeServerOptions struct {
blockDir *string
blockIQNPrefix *string
blockPortal *string
// Block volume (NVMe/TCP) options
blockNvmeEnable *bool
blockNvmeListen *string
blockNvmePortal *string
blockNvmeNQNPrefix *string
blockNvmeMaxIOQueues *int
}
func init() {
@ -123,6 +129,11 @@ func init() {
v.blockDir = cmdVolume.Flag.String("block.dir", "", "directory containing .blk block volume files. Empty disables iSCSI block service.")
v.blockIQNPrefix = cmdVolume.Flag.String("block.iqn.prefix", "iqn.2024-01.com.seaweedfs:vol.", "IQN prefix for block volume iSCSI targets")
v.blockPortal = cmdVolume.Flag.String("block.portal", "", "public iSCSI portal address for SendTargets discovery (e.g. 192.168.1.100:3260,1). Required for Windows clients and Docker deployments.")
v.blockNvmeEnable = cmdVolume.Flag.Bool("block.nvme.enable", false, "enable NVMe/TCP target for block volumes (default off)")
v.blockNvmeListen = cmdVolume.Flag.String("block.nvme.listen", "0.0.0.0:4420", "NVMe/TCP target listen address")
v.blockNvmePortal = cmdVolume.Flag.String("block.nvme.portal", "", "public NVMe/TCP portal address (e.g. 192.168.1.100:4420)")
v.blockNvmeNQNPrefix = cmdVolume.Flag.String("block.nvme.nqnPrefix", "nqn.2024-01.com.seaweedfs:vol.", "NQN prefix for NVMe subsystems")
v.blockNvmeMaxIOQueues = cmdVolume.Flag.Int("block.nvme.maxIOQueues", 4, "maximum NVMe I/O queues per controller (1-128)")
}
var cmdVolume = &Command{
@ -323,7 +334,14 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v
blockPortal = fmt.Sprintf("%s:%s,1", *v.ip, port)
glog.V(0).Infof("block service: auto-derived portal address %s from -ip flag", blockPortal)
}
blockService := weed_server.StartBlockService(*v.blockListen, *v.blockDir, *v.blockIQNPrefix, blockPortal)
nvmeCfg := weed_server.NVMeConfig{
Enabled: *v.blockNvmeEnable,
ListenAddr: *v.blockNvmeListen,
Portal: *v.blockNvmePortal,
NQNPrefix: *v.blockNvmeNQNPrefix,
MaxIOQueues: *v.blockNvmeMaxIOQueues,
}
blockService := weed_server.StartBlockService(*v.blockListen, *v.blockDir, *v.blockIQNPrefix, blockPortal, nvmeCfg)
if blockService != nil {
volumeServer.SetBlockService(blockService)
}

2
weed/server/block_heartbeat_loop_test.go

@ -247,7 +247,7 @@ func newTestBlockService(t *testing.T) *BlockService {
t.Helper()
dir := t.TempDir()
createTestBlockVolFile(t, dir, "hb-test.blk")
bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1")
bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1", NVMeConfig{})
if bs == nil {
t.Fatal("expected non-nil BlockService")
}

2
weed/server/volume_grpc_block_test.go

@ -12,7 +12,7 @@ func newTestBlockServiceWithDir(t *testing.T) (*BlockService, string) {
dir := t.TempDir()
blockDir := filepath.Join(dir, "blocks")
os.MkdirAll(blockDir, 0755)
bs := StartBlockService("127.0.0.1:0", blockDir, "iqn.2024.test:", "127.0.0.1:3260,1")
bs := StartBlockService("127.0.0.1:0", blockDir, "iqn.2024.test:", "127.0.0.1:3260,1", NVMeConfig{})
if bs == nil {
t.Fatal("StartBlockService returned nil")
}

96
weed/server/volume_server_block.go

@ -13,6 +13,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/storage"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/iscsi"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/nvme"
)
// volReplState tracks active replication addresses per volume.
@ -21,11 +22,22 @@ type volReplState struct {
replicaCtrlAddr string
}
// BlockService manages block volumes and the iSCSI target server.
// NVMeConfig holds NVMe/TCP target configuration passed from CLI flags.
type NVMeConfig struct {
Enabled bool
ListenAddr string
Portal string // reserved for heartbeat/CSI integration (CP10-2)
NQNPrefix string
MaxIOQueues int
}
// BlockService manages block volumes and the iSCSI/NVMe target servers.
type BlockService struct {
blockStore *storage.BlockVolumeStore
targetServer *iscsi.TargetServer
nvmeServer *nvme.Server
iqnPrefix string
nqnPrefix string
blockDir string
listenAddr string
@ -35,9 +47,9 @@ type BlockService struct {
}
// StartBlockService scans blockDir for .blk files, opens them as block volumes,
// registers them with an iSCSI target server, and starts listening.
// registers them with iSCSI and optionally NVMe target servers, and starts listening.
// Returns nil if blockDir is empty (feature disabled).
func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *BlockService {
func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string, nvmeCfg NVMeConfig) *BlockService {
if blockDir == "" {
return nil
}
@ -45,14 +57,20 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc
if iqnPrefix == "" {
iqnPrefix = "iqn.2024-01.com.seaweedfs:vol."
}
nqnPrefix := nvmeCfg.NQNPrefix
if nqnPrefix == "" {
nqnPrefix = "nqn.2024-01.com.seaweedfs:vol."
}
bs := &BlockService{
blockStore: storage.NewBlockVolumeStore(),
iqnPrefix: iqnPrefix,
nqnPrefix: nqnPrefix,
blockDir: blockDir,
listenAddr: listenAddr,
}
// iSCSI target setup.
logger := log.New(os.Stderr, "iscsi: ", log.LstdFlags)
config := iscsi.DefaultTargetConfig()
@ -63,6 +81,20 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc
bs.targetServer.SetPortalAddr(portalAddr)
}
// NVMe/TCP target setup (optional).
if nvmeCfg.Enabled {
maxQ := uint16(4)
if nvmeCfg.MaxIOQueues >= 1 && nvmeCfg.MaxIOQueues <= 128 {
maxQ = uint16(nvmeCfg.MaxIOQueues)
}
bs.nvmeServer = nvme.NewServer(nvme.Config{
ListenAddr: nvmeCfg.ListenAddr,
NQNPrefix: nqnPrefix,
MaxIOQueues: maxQ,
Enabled: true,
})
}
// Scan blockDir for .blk files.
entries, err := os.ReadDir(blockDir)
if err != nil {
@ -101,12 +133,8 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc
}
}
// Derive IQN from filename: vol1.blk -> iqn.2024-01.com.seaweedfs:vol.vol1
name := strings.TrimSuffix(entry.Name(), ".blk")
iqn := iqnPrefix + blockvol.SanitizeIQN(name)
adapter := blockvol.NewBlockVolAdapter(vol)
bs.targetServer.AddVolume(iqn, adapter)
glog.V(0).Infof("block service: registered %s as %s", path, iqn)
bs.registerVolume(vol, name)
}
// Start iSCSI target in background.
@ -115,11 +143,36 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc
glog.Warningf("block service: iSCSI target stopped: %v", err)
}
}()
glog.V(0).Infof("block service: iSCSI target started on %s", listenAddr)
// Start NVMe/TCP target in background (if enabled).
if bs.nvmeServer != nil {
if err := bs.nvmeServer.ListenAndServe(); err != nil {
glog.Warningf("block service: NVMe/TCP target failed to start: %v (iSCSI continues)", err)
bs.nvmeServer = nil // disable NVMe, iSCSI continues
} else {
glog.V(0).Infof("block service: NVMe/TCP target started on %s", nvmeCfg.ListenAddr)
}
}
return bs
}
// registerVolume adds a volume to both iSCSI and NVMe targets.
func (bs *BlockService) registerVolume(vol *blockvol.BlockVol, name string) {
iqn := bs.iqnPrefix + blockvol.SanitizeIQN(name)
adapter := blockvol.NewBlockVolAdapter(vol)
bs.targetServer.AddVolume(iqn, adapter)
if bs.nvmeServer != nil {
nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name)
nvmeAdapter := nvme.NewNVMeAdapter(vol)
bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID())
}
glog.V(0).Infof("block service: registered %s", name)
}
// Store returns the underlying BlockVolumeStore.
func (bs *BlockService) Store() *storage.BlockVolumeStore {
return bs.blockStore
@ -151,10 +204,15 @@ func (bs *BlockService) CreateBlockVol(name string, sizeBytes uint64, diskType s
return "", "", "", fmt.Errorf("block volume %q exists with size %d (requested %d)",
name, info.VolumeSize, sizeBytes)
}
// Re-add to TargetServer in case it was cleared (crash recovery).
// Re-add to targets in case they were cleared (crash recovery).
// AddVolume is idempotent — no-op if already registered.
adapter := blockvol.NewBlockVolAdapter(vol)
bs.targetServer.AddVolume(iqn, adapter)
if bs.nvmeServer != nil {
nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name)
nvmeAdapter := nvme.NewNVMeAdapter(vol)
bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID())
}
return path, iqn, iscsiAddr, nil
}
@ -190,6 +248,13 @@ func (bs *BlockService) CreateBlockVol(name string, sizeBytes uint64, diskType s
adapter := blockvol.NewBlockVolAdapter(vol)
bs.targetServer.AddVolume(iqn, adapter)
if bs.nvmeServer != nil {
nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name)
nvmeAdapter := nvme.NewNVMeAdapter(vol)
bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID())
}
glog.V(0).Infof("block service: created %s as %s (%d bytes)", path, iqn, sizeBytes)
return path, iqn, iscsiAddr, nil
}
@ -206,6 +271,12 @@ func (bs *BlockService) DeleteBlockVol(name string) error {
bs.targetServer.DisconnectVolume(iqn)
}
// Remove from NVMe target.
if bs.nvmeServer != nil {
nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name)
bs.nvmeServer.RemoveVolume(nqn)
}
// Close and unregister.
if err := bs.blockStore.RemoveBlockVolume(path); err != nil {
// Not found is OK (idempotent).
@ -482,12 +553,15 @@ func (bs *BlockService) ReplicationPorts(volPath string) (dataPort, ctrlPort, re
return
}
// Shutdown gracefully stops the iSCSI target and closes all block volumes.
// Shutdown gracefully stops the iSCSI and NVMe targets and closes all block volumes.
func (bs *BlockService) Shutdown() {
if bs == nil {
return
}
glog.V(0).Infof("block service: shutting down...")
if bs.nvmeServer != nil {
bs.nvmeServer.Close()
}
if bs.targetServer != nil {
bs.targetServer.Close()
}

4
weed/server/volume_server_block_test.go

@ -26,7 +26,7 @@ func createTestBlockVolFile(t *testing.T, dir, name string) string {
func TestBlockServiceDisabledByDefault(t *testing.T) {
// Empty blockDir means feature is disabled.
bs := StartBlockService("0.0.0.0:3260", "", "", "")
bs := StartBlockService("0.0.0.0:3260", "", "", "", NVMeConfig{})
if bs != nil {
bs.Shutdown()
t.Fatal("expected nil BlockService when blockDir is empty")
@ -41,7 +41,7 @@ func TestBlockServiceStartAndShutdown(t *testing.T) {
dir := t.TempDir()
createTestBlockVolFile(t, dir, "testvol.blk")
bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1")
bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1", NVMeConfig{})
if bs == nil {
t.Fatal("expected non-nil BlockService")
}

127
weed/storage/blockvol/nvme/adapter.go

@ -0,0 +1,127 @@
package nvme
import (
"errors"
"strings"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockerr"
)
// NVMeAdapter wraps a *BlockVol to implement BlockDevice and ANAProvider
// for the NVMe/TCP target, bridging the BlockVol storage engine to NVMe
// command handling.
type NVMeAdapter struct {
Vol *blockvol.BlockVol
}
// NewNVMeAdapter creates a BlockDevice adapter for the given BlockVol.
func NewNVMeAdapter(vol *blockvol.BlockVol) *NVMeAdapter {
return &NVMeAdapter{Vol: vol}
}
func (a *NVMeAdapter) ReadAt(lba uint64, length uint32) ([]byte, error) {
return a.Vol.ReadLBA(lba, length)
}
func (a *NVMeAdapter) WriteAt(lba uint64, data []byte) error {
return a.Vol.WriteLBA(lba, data)
}
func (a *NVMeAdapter) Trim(lba uint64, length uint32) error {
return a.Vol.Trim(lba, length)
}
func (a *NVMeAdapter) SyncCache() error {
return a.Vol.SyncCache()
}
func (a *NVMeAdapter) BlockSize() uint32 {
return a.Vol.Info().BlockSize
}
func (a *NVMeAdapter) VolumeSize() uint64 {
return a.Vol.Info().VolumeSize
}
func (a *NVMeAdapter) IsHealthy() bool {
return a.Vol.Info().Healthy
}
// ANAState returns the ANA state based on the volume's role.
func (a *NVMeAdapter) ANAState() uint8 {
return RoleToANAState(a.Vol.Role())
}
// ANAGroupID returns the ANA group ID (always 1 for single-group MVP).
func (a *NVMeAdapter) ANAGroupID() uint16 { return 1 }
// DeviceNGUID returns a 16-byte NGUID derived from the volume UUID.
func (a *NVMeAdapter) DeviceNGUID() [16]byte {
return UUIDToNGUID(a.Vol.Info().UUID)
}
// Compile-time checks.
var _ BlockDevice = (*NVMeAdapter)(nil)
var _ ANAProvider = (*NVMeAdapter)(nil)
// RoleToANAState maps a BlockVol Role to an NVMe ANA state.
func RoleToANAState(r blockvol.Role) uint8 {
switch r {
case blockvol.RolePrimary, blockvol.RoleNone:
return anaOptimized
case blockvol.RoleReplica:
return anaInaccessible
case blockvol.RoleStale:
return anaPersistentLoss
case blockvol.RoleRebuilding, blockvol.RoleDraining:
return anaInaccessible
default:
return anaInaccessible
}
}
// UUIDToNGUID converts a 16-byte UUID to a 16-byte NGUID.
// Uses NAA-6 pattern for first 8 bytes (compatible with iSCSI UUIDToNAA),
// copies remaining bytes as-is.
func UUIDToNGUID(uuid [16]byte) [16]byte {
var nguid [16]byte
nguid[0] = 0x60 | (uuid[0] & 0x0F)
copy(nguid[1:8], uuid[1:8])
copy(nguid[8:16], uuid[8:16])
return nguid
}
// mapBlockError maps BlockVol errors to NVMe status words.
func mapBlockError(err error) StatusWord {
if err == nil {
return StatusSuccess
}
// Check known sentinel errors from blockvol and blockerr packages.
switch {
case errors.Is(err, blockvol.ErrLeaseExpired):
return StatusNSNotReadyDNR // DNR=1: fencing is permanent
case errors.Is(err, blockvol.ErrEpochRegression):
return StatusInternalErrorDNR // DNR=1: stale controller
case errors.Is(err, blockerr.ErrDurabilityBarrierFailed):
return StatusInternalError // DNR=0: replica may recover
case errors.Is(err, blockerr.ErrDurabilityQuorumLost):
return StatusInternalError // DNR=0: quorum may heal
case errors.Is(err, blockvol.ErrWALFull):
return StatusNSNotReady // DNR=0: transient pressure
case errors.Is(err, blockvol.ErrNotPrimary):
return StatusNSNotReady // DNR=0: may be transitioning
}
// Heuristic for I/O errors (no dedicated sentinels yet).
msg := err.Error()
if strings.Contains(msg, "write") || strings.Contains(msg, "Write") {
return StatusMediaWriteFault
}
if strings.Contains(msg, "read") || strings.Contains(msg, "Read") {
return StatusMediaReadError
}
return StatusInternalError
}

198
weed/storage/blockvol/nvme/admin.go

@ -0,0 +1,198 @@
package nvme
import (
"encoding/binary"
)
// handleSetFeatures processes SetFeatures admin commands.
func (c *Controller) handleSetFeatures(req *Request) error {
fid := uint8(req.capsule.D10 & 0xFF)
switch fid {
case fidNumberOfQueues:
// D11: NCQR[15:0] | NSQR[31:16]
ncqr := uint16(req.capsule.D11 & 0xFFFF)
nsqr := uint16(req.capsule.D11 >> 16)
// Grant min(requested, max)
if ncqr > c.maxIOQueues {
ncqr = c.maxIOQueues
}
if nsqr > c.maxIOQueues {
nsqr = c.maxIOQueues
}
if ncqr == 0 {
ncqr = 1
}
if nsqr == 0 {
nsqr = 1
}
c.grantedQueues = ncqr
// Response DW0: (NCQR-1) | ((NSQR-1) << 16)
req.resp.DW0 = uint32(ncqr-1) | (uint32(nsqr-1) << 16)
return c.sendResponse(req)
case fidKeepAliveTimer:
// D11 contains KATO in milliseconds
c.katoMs = req.capsule.D11
return c.sendResponse(req)
case fidAsyncEventConfig:
// Stub: accept but don't deliver events
return c.sendResponse(req)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
}
// handleGetFeatures returns stored feature values.
func (c *Controller) handleGetFeatures(req *Request) error {
fid := uint8(req.capsule.D10 & 0xFF)
switch fid {
case fidNumberOfQueues:
n := c.grantedQueues
if n == 0 {
n = c.maxIOQueues
}
req.resp.DW0 = uint32(n-1) | (uint32(n-1) << 16)
return c.sendResponse(req)
case fidKeepAliveTimer:
req.resp.DW0 = c.katoMs
return c.sendResponse(req)
case fidAsyncEventConfig:
req.resp.DW0 = 0
return c.sendResponse(req)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
}
// handleGetLogPage returns log page data.
func (c *Controller) handleGetLogPage(req *Request) error {
// D10 bits 7:0 = Log Page Identifier
// D10 bits 27:16 and D11 bits 15:0 = Number of Dwords (NUMD)
lid := uint8(req.capsule.D10 & 0xFF)
numdl := (req.capsule.D10 >> 16) & 0xFFF
numdu := req.capsule.D11 & 0xFFFF
numd := uint32(numdu)<<16 | uint32(numdl)
length := (numd + 1) * 4 // NUMD is 0-based, in dwords
switch lid {
case logPageError:
return c.logPageError(req, length)
case logPageSMART:
return c.logPageSMART(req, length)
case logPageANA:
return c.logPageANA(req, length)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
}
// logPageError returns an empty error log page.
func (c *Controller) logPageError(req *Request, length uint32) error {
if length > 64 {
length = 64
}
req.c2hData = make([]byte, length)
return c.sendC2HDataAndResponse(req)
}
// logPageSMART returns a 512-byte SMART/Health log.
func (c *Controller) logPageSMART(req *Request, length uint32) error {
if length > 512 {
length = 512
}
buf := make([]byte, 512)
// Critical Warning - offset 0: 0 = no warnings
buf[0] = 0
// Composite Temperature - offset 1-2: 0 (not implemented)
binary.LittleEndian.PutUint16(buf[1:], 0)
// Available Spare - offset 3: 100%
buf[3] = 100
// Available Spare Threshold - offset 4: 10%
buf[4] = 10
// Percentage Used - offset 5: 0%
buf[5] = 0
req.c2hData = buf[:length]
return c.sendC2HDataAndResponse(req)
}
// logPageANA returns the ANA log page with a single group.
func (c *Controller) logPageANA(req *Request, length uint32) error {
// ANA log page format (32 bytes for single group):
// [0:8] CHGCNT (uint64)
// [8:10] NGRPS = 1 (uint16)
// [10:16] reserved
// Group descriptor:
// [16:20] ANAGRPID = 1 (uint32)
// [20:24] NNSID = 1 (uint32)
// [24:32] Change Count (uint64)
// [32] ANA State
// [33:36] reserved
// [36:40] NSID = 1 (uint32)
const anaLogSize = 40
buf := make([]byte, anaLogSize)
// CHGCNT
binary.LittleEndian.PutUint64(buf[0:], c.anaChangeCount())
// NGRPS
binary.LittleEndian.PutUint16(buf[8:], 1)
// Group descriptor
binary.LittleEndian.PutUint32(buf[16:], 1) // ANAGRPID=1
binary.LittleEndian.PutUint32(buf[20:], 1) // NNSID=1
binary.LittleEndian.PutUint64(buf[24:], c.anaChangeCount()) // chgcnt
buf[32] = c.anaState() // ANA state
binary.LittleEndian.PutUint32(buf[36:], 1) // NSID=1
if length > anaLogSize {
length = anaLogSize
}
req.c2hData = buf[:length]
return c.sendC2HDataAndResponse(req)
}
// anaState returns the current ANA state based on the subsystem's device.
func (c *Controller) anaState() uint8 {
if c.subsystem == nil {
return anaInaccessible
}
if prov, ok := c.subsystem.Dev.(ANAProvider); ok {
return prov.ANAState()
}
// Default: if healthy → optimized
if c.subsystem.Dev.IsHealthy() {
return anaOptimized
}
return anaInaccessible
}
// anaChangeCount returns a monotonic ANA change counter.
// For MVP, we use 1 as a constant (no dynamic role changes tracked).
func (c *Controller) anaChangeCount() uint64 {
return 1
}
// handleKeepAlive resets the KATO timer and returns success.
func (c *Controller) handleKeepAlive(req *Request) error {
c.resetKATO()
return c.sendResponse(req)
}

354
weed/storage/blockvol/nvme/controller.go

@ -0,0 +1,354 @@
package nvme
import (
"fmt"
"io"
"log"
"net"
"sync"
"sync/atomic"
"time"
)
// controllerState tracks the lifecycle of an NVMe controller session.
type controllerState int
const (
stateConnected controllerState = iota // TCP connected, no IC yet
stateICComplete // IC exchange done
stateAdminReady // Admin queue connected
stateCtrlReady // CC.EN=1, CSTS.RDY=1
stateIOActive // IO queues active
stateClosed // Shut down
)
// Request represents an in-flight NVMe command being processed.
type Request struct {
capsule CapsuleCommand
payload []byte // inline data from host (Write commands)
resp CapsuleResponse
c2hData []byte // data to send to host (Read commands)
status StatusWord
}
// Controller handles one NVMe/TCP connection (one queue per connection).
type Controller struct {
mu sync.Mutex
// Session identity
conn net.Conn
in *Reader
out *Writer
state controllerState
closed atomic.Bool
// Queue state (one queue per TCP connection)
queueID uint16
queueSize uint16
sqhd uint16 // Submission Queue Head pointer
flowCtlOff bool // CATTR bit2: SQ flow control disabled
// Controller identity
cntlID uint16
subNQN string
// Controller registers
regCAP uint64 // Controller Capabilities
regCC uint32 // Controller Configuration (set by host via PropertySet)
regCSTS uint32 // Controller Status (RDY bit)
regVS uint32 // Version
// KeepAlive
katoMs uint32
katoTimer *time.Timer
katoMu sync.Mutex
// Async completion (IO queues)
waiting chan *Request // pre-allocated request pool
completions chan *Request // completed requests to send
// Backend
subsystem *Subsystem
server *Server
// Features
maxIOQueues uint16
grantedQueues uint16
isAdmin bool // true if this controller owns admin queue (QID=0)
// Lifecycle
wg sync.WaitGroup
closeOnce sync.Once
}
// newController creates a controller for the given connection.
func newController(conn net.Conn, server *Server) *Controller {
c := &Controller{
conn: conn,
in: NewReader(conn),
out: NewWriter(conn),
state: stateConnected,
server: server,
regVS: nvmeVersion14,
// CAP register: MQES=63 (bits 15:0), CQR=1 (bit 16), TO=30 (bits 31:24, *500ms=15s), CSS bit37=1 (NVM command set)
regCAP: uint64(63) | (1 << 16) | (uint64(30) << 24) | (1 << 37),
maxIOQueues: server.cfg.MaxIOQueues,
}
return c
}
// Serve is the main event loop for this controller connection.
func (c *Controller) Serve() error {
defer c.shutdown()
// IC handshake timeout
if err := c.conn.SetReadDeadline(time.Now().Add(10 * time.Second)); err != nil {
return err
}
for {
if c.closed.Load() {
return nil
}
hdr, err := c.in.Dequeue()
if err != nil {
if err == io.EOF || c.closed.Load() {
return nil
}
return fmt.Errorf("read header: %w", err)
}
switch hdr.Type {
case pduICReq:
if err := c.handleIC(); err != nil {
return fmt.Errorf("IC handshake: %w", err)
}
// Clear read deadline after successful IC
if err := c.conn.SetReadDeadline(time.Time{}); err != nil {
return err
}
case pduCapsuleCmd:
if err := c.handleCapsule(); err != nil {
return fmt.Errorf("capsule: %w", err)
}
case pduH2CTermReq:
return nil // host terminated
default:
return fmt.Errorf("unexpected PDU type: 0x%x", hdr.Type)
}
}
}
// handleIC processes the IC handshake.
func (c *Controller) handleIC() error {
var req ICRequest
if err := c.in.Receive(&req); err != nil {
return err
}
resp := ICResponse{
PDUFormatVersion: 0,
MaxH2CDataLength: maxH2CDataLen,
}
if err := c.out.SendHeaderOnly(pduICResp, &resp, icBodySize); err != nil {
return err
}
c.state = stateICComplete
return nil
}
// handleCapsule dispatches a CapsuleCmd PDU.
func (c *Controller) handleCapsule() error {
// Reject capsule commands before IC handshake is complete.
if c.state < stateICComplete {
return fmt.Errorf("capsule command before IC handshake")
}
var capsule CapsuleCommand
if err := c.in.Receive(&capsule); err != nil {
return err
}
// Read optional inline data
var payload []byte
if dataLen := c.in.Length(); dataLen > 0 {
payload = make([]byte, dataLen)
if err := c.in.ReceiveData(payload); err != nil {
return err
}
}
// Advance SQHD
c.sqhd++
if c.sqhd >= c.queueSize && c.queueSize > 0 {
c.sqhd = 0
}
req := &Request{
capsule: capsule,
payload: payload,
}
req.resp.CID = capsule.CID
req.resp.QueueID = c.queueID
// SQHD is set in sendResponse/sendC2HDataAndResponse using the
// latest c.flowCtlOff value, so Connect responses correctly get
// SQHD=0xFFFF when the host requests flowCtlOff via CATTR.
req.resp.Status = uint16(StatusSuccess)
if c.queueID == 0 {
return c.dispatchAdmin(req)
}
return c.dispatchIO(req)
}
// dispatchAdmin handles admin queue commands synchronously.
func (c *Controller) dispatchAdmin(req *Request) error {
capsule := &req.capsule
if capsule.OpCode == adminFabric {
return c.handleFabricCommand(req)
}
switch capsule.OpCode {
case adminIdentify:
return c.handleIdentify(req)
case adminSetFeatures:
return c.handleSetFeatures(req)
case adminGetFeatures:
return c.handleGetFeatures(req)
case adminGetLogPage:
return c.handleGetLogPage(req)
case adminKeepAlive:
return c.handleKeepAlive(req)
case adminAsyncEvent:
// Stub: just succeed (don't deliver events in CP10-1)
return c.sendResponse(req)
default:
req.resp.Status = uint16(StatusInvalidOpcode)
return c.sendResponse(req)
}
}
// dispatchIO handles IO queue commands.
func (c *Controller) dispatchIO(req *Request) error {
capsule := &req.capsule
switch capsule.OpCode {
case ioRead:
return c.handleRead(req)
case ioWrite:
return c.handleWrite(req)
case ioFlush:
return c.handleFlush(req)
case ioWriteZeros:
return c.handleWriteZeros(req)
default:
req.resp.Status = uint16(StatusInvalidOpcode)
return c.sendResponse(req)
}
}
// sendC2HDataAndResponse sends C2HData PDUs followed by a CapsuleResp.
func (c *Controller) sendC2HDataAndResponse(req *Request) error {
if len(req.c2hData) > 0 {
data := req.c2hData
offset := uint32(0)
chunkSize := uint32(maxH2CDataLen)
for offset < uint32(len(data)) {
end := offset + chunkSize
if end > uint32(len(data)) {
end = uint32(len(data))
}
chunk := data[offset:end]
hdr := C2HDataHeader{
CCCID: req.capsule.CID,
DATAO: offset,
DATAL: uint32(len(chunk)),
}
flags := uint8(0)
if end >= uint32(len(data)) {
flags = c2hFlagLast
}
if err := c.out.SendWithData(pduC2HData, flags, &hdr, c2hDataHdrSize, chunk); err != nil {
return err
}
offset = end
}
}
return c.sendResponse(req)
}
// sendResponse sends a CapsuleResp PDU.
// SQHD is set here (not in handleCapsule) so that flowCtlOff changes
// made during command dispatch (e.g. Fabric Connect) take effect
// on the same response.
func (c *Controller) sendResponse(req *Request) error {
if c.flowCtlOff {
req.resp.SQHD = 0xFFFF
} else {
req.resp.SQHD = c.sqhd
}
c.resetKATO()
return c.out.SendHeaderOnly(pduCapsuleResp, &req.resp, capsuleRespSize)
}
// ---------- KATO management ----------
func (c *Controller) startKATO() {
c.katoMu.Lock()
defer c.katoMu.Unlock()
if c.katoMs == 0 {
return
}
d := time.Duration(c.katoMs) * time.Millisecond
// Add 50% margin per spec recommendation
d = d + d/2
c.katoTimer = time.AfterFunc(d, func() {
log.Printf("nvme: KATO expired for cntlid=%d, closing connection", c.cntlID)
c.conn.Close()
})
}
func (c *Controller) resetKATO() {
c.katoMu.Lock()
defer c.katoMu.Unlock()
if c.katoTimer != nil {
c.katoTimer.Reset(time.Duration(c.katoMs)*time.Millisecond + time.Duration(c.katoMs)*time.Millisecond/2)
}
}
func (c *Controller) stopKATO() {
c.katoMu.Lock()
defer c.katoMu.Unlock()
if c.katoTimer != nil {
c.katoTimer.Stop()
c.katoTimer = nil
}
}
// ---------- Lifecycle ----------
func (c *Controller) shutdown() {
c.closeOnce.Do(func() {
c.closed.Store(true)
c.stopKATO()
c.state = stateClosed
c.conn.Close()
if c.server != nil {
if c.isAdmin && c.cntlID != 0 {
c.server.unregisterAdmin(c.cntlID)
}
c.server.removeSession(c)
}
})
}

300
weed/storage/blockvol/nvme/fabric.go

@ -0,0 +1,300 @@
package nvme
import (
"encoding/binary"
)
// handleFabricCommand dispatches Fabric-specific commands by FCType.
func (c *Controller) handleFabricCommand(req *Request) error {
switch req.capsule.FCType {
case fcConnect:
return c.handleConnect(req)
case fcPropertyGet:
return c.handlePropertyGet(req)
case fcPropertySet:
return c.handlePropertySet(req)
case fcDisconnect:
return c.handleDisconnect(req)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
}
// handleConnect processes a Fabric Connect command.
func (c *Controller) handleConnect(req *Request) error {
capsule := &req.capsule
// Parse QueueID, QueueSize, KATO, CATTR from capsule dwords.
// Connect command layout (CDW10-CDW12):
// CDW10[15:0]=RECFM, CDW10[31:16]=QID
// CDW11[15:0]=SQSIZE, CDW11[23:16]=CATTR
// CDW12=KATO
queueID := uint16(capsule.D10 >> 16)
queueSize := uint16(capsule.D11&0xFFFF) + 1 // SQSIZE is 0-based
cattr := uint8(capsule.D11 >> 16)
kato := capsule.D12
// Parse ConnectData from payload
if len(req.payload) < connectDataSize {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
var cd ConnectData
cd.Unmarshal(req.payload)
if queueID == 0 {
// Admin queue connect
sub := c.server.findSubsystem(cd.SubNQN)
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
c.subsystem = sub
c.subNQN = cd.SubNQN
c.queueID = 0
c.queueSize = queueSize
c.cntlID = c.server.allocCNTLID()
c.katoMs = kato
c.flowCtlOff = (cattr & 0x04) != 0
c.state = stateAdminReady
c.isAdmin = true
// Register admin session so IO queue connections can find us.
c.server.registerAdmin(&adminSession{
cntlID: c.cntlID,
subsystem: sub,
subNQN: cd.SubNQN,
hostNQN: cd.HostNQN,
regCAP: c.regCAP,
regCC: c.regCC,
regCSTS: c.regCSTS,
regVS: c.regVS,
katoMs: kato,
})
// Return CNTLID in DW0
req.resp.DW0 = uint32(c.cntlID)
return c.sendResponse(req)
}
// IO queue connect — look up admin session from server registry.
// IO queues arrive on separate TCP connections with fresh Controllers,
// so we must find the admin session by CNTLID from the server.
admin := c.server.lookupAdmin(cd.CNTLID)
if admin == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
// Validate SubNQN and HostNQN match the admin session.
if cd.SubNQN != admin.subNQN {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
if cd.HostNQN != admin.hostNQN {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
c.cntlID = cd.CNTLID
c.subsystem = admin.subsystem
c.subNQN = admin.subNQN
c.queueID = queueID
c.queueSize = queueSize
c.flowCtlOff = (cattr & 0x04) != 0
c.state = stateIOActive
req.resp.DW0 = uint32(c.cntlID)
return c.sendResponse(req)
}
// handlePropertyGet returns a controller register value.
func (c *Controller) handlePropertyGet(req *Request) error {
// Property offset in D10 (bits 31:0, but only lower bits used)
offset := req.capsule.D10
// Attrib in D11 bit 0: 0=4byte, 1=8byte
size8 := (req.capsule.D11 & 1) != 0
var val uint64
switch offset {
case propCAP:
val = c.regCAP
case propVS:
val = uint64(c.regVS)
case propCC:
val = uint64(c.regCC)
case propCSTS:
val = uint64(c.regCSTS)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
if size8 {
// 8-byte value in DW0+DW1
req.resp.DW0 = uint32(val)
req.resp.DW1 = uint32(val >> 32)
} else {
req.resp.DW0 = uint32(val)
}
return c.sendResponse(req)
}
// handlePropertySet handles controller register writes.
func (c *Controller) handlePropertySet(req *Request) error {
offset := req.capsule.D10
value := uint64(req.capsule.D14) | uint64(req.capsule.D15)<<32
switch offset {
case propCC:
c.regCC = uint32(value)
// Check CC.EN (bit 0)
if c.regCC&1 != 0 {
c.regCSTS |= 1 // Set CSTS.RDY
c.state = stateCtrlReady
if c.katoMs > 0 {
c.startKATO()
}
} else {
c.regCSTS &^= 1 // Clear CSTS.RDY
}
default:
// Ignore writes to other registers
}
return c.sendResponse(req)
}
// handleDisconnect processes a Fabric Disconnect.
func (c *Controller) handleDisconnect(req *Request) error {
if err := c.sendResponse(req); err != nil {
return err
}
c.shutdown()
return nil
}
// ---------- Subsystem ----------
// Subsystem represents an NVMe subsystem backed by a BlockDevice.
type Subsystem struct {
NQN string
Dev BlockDevice
NGUID [16]byte // Namespace GUID
}
// BlockDevice is the interface for the underlying storage.
// This is the same as iscsi.BlockDevice.
type BlockDevice interface {
ReadAt(lba uint64, length uint32) ([]byte, error)
WriteAt(lba uint64, data []byte) error
Trim(lba uint64, length uint32) error
SyncCache() error
BlockSize() uint32
VolumeSize() uint64
IsHealthy() bool
}
// ANAProvider extends BlockDevice with ANA state reporting.
type ANAProvider interface {
ANAState() uint8
ANAGroupID() uint16
DeviceNGUID() [16]byte
}
// allocCNTLID allocates a new controller ID from the server.
func (s *Server) allocCNTLID() uint16 {
return uint16(s.nextCNTLID.Add(1))
}
// findSubsystem looks up a subsystem by NQN.
func (s *Server) findSubsystem(nqn string) *Subsystem {
s.mu.RLock()
defer s.mu.RUnlock()
sub, ok := s.subsystems[nqn]
if !ok {
return nil
}
return sub
}
// ---------- ConnectData field access helpers ----------
// connectQueueID extracts the QueueID from a Connect capsule D10.
func connectQueueID(capsule *CapsuleCommand) uint16 {
return uint16(capsule.D10 >> 16)
}
// connectQueueSize extracts the QueueSize from a Connect capsule D11 (0-based → +1).
func connectQueueSize(capsule *CapsuleCommand) uint16 {
return uint16(capsule.D11&0xFFFF) + 1
}
// connectKATO extracts the KeepAlive timeout from a Connect capsule D12.
func connectKATO(capsule *CapsuleCommand) uint32 {
return capsule.D12
}
// PropertySet value extraction: the go-nvme reference puts value in D12/D13,
// but NVMe spec actually uses CDW14/CDW15 for PropertySet. We handle both.
func propertySetValue(capsule *CapsuleCommand) uint64 {
return uint64(capsule.D14) | uint64(capsule.D15)<<32
}
// propertyGetSize returns true if the PropertyGet requests an 8-byte value.
func propertyGetSize8(capsule *CapsuleCommand) bool {
return (capsule.D11 & 1) != 0
}
// propertyGetOffset returns the register offset for PropertyGet.
func propertyGetOffset(capsule *CapsuleCommand) uint32 {
return capsule.D10
}
// ---------- ConnectData marshal helpers for tests ----------
func marshalConnectData(cd *ConnectData) []byte {
buf := make([]byte, connectDataSize)
cd.Marshal(buf)
return buf
}
func makeConnectCapsule(queueID, queueSize uint16, kato uint32, fcType uint8) CapsuleCommand {
return CapsuleCommand{
OpCode: adminFabric,
FCType: fcType,
D10: uint32(queueID) << 16,
D11: uint32(queueSize - 1), // 0-based
D12: kato,
}
}
// makePropertyGetCapsule creates a PropertyGet capsule for the given register offset.
func makePropertyGetCapsule(offset uint32, size8 bool) CapsuleCommand {
c := CapsuleCommand{
OpCode: adminFabric,
FCType: fcPropertyGet,
D10: offset,
}
if size8 {
c.D11 = 1
}
return c
}
// makePropertySetCapsule creates a PropertySet capsule.
func makePropertySetCapsule(offset uint32, value uint64) CapsuleCommand {
return CapsuleCommand{
OpCode: adminFabric,
FCType: fcPropertySet,
D10: offset,
D14: uint32(value),
D15: uint32(value >> 32),
}
}
// putCNTLID stores the controller ID in ConnectData at offset 16.
func putCNTLID(buf []byte, cntlid uint16) {
binary.LittleEndian.PutUint16(buf[16:], cntlid)
}

250
weed/storage/blockvol/nvme/identify.go

@ -0,0 +1,250 @@
package nvme
import (
"encoding/binary"
"math/bits"
)
const identifySize = 4096
// handleIdentify dispatches Identify commands by CNS type.
func (c *Controller) handleIdentify(req *Request) error {
cns := uint8(req.capsule.D10 & 0xFF)
switch cns {
case cnsIdentifyController:
return c.identifyController(req)
case cnsIdentifyNamespace:
return c.identifyNamespace(req)
case cnsActiveNSList:
return c.identifyActiveNSList(req)
case cnsNSDescriptorList:
return c.identifyNSDescriptors(req)
default:
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
}
// identifyController returns the 4096-byte Identify Controller data structure.
func (c *Controller) identifyController(req *Request) error {
buf := make([]byte, identifySize)
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
// VID (PCI Vendor ID) - use 0 for software target
// SSVID - 0
// Serial Number (offset 4, 20 bytes, space-padded ASCII)
copyPadded(buf[4:24], "SWF00001")
// Model Number (offset 24, 40 bytes, space-padded ASCII)
copyPadded(buf[24:64], "SeaweedFS BlockVol")
// Firmware Revision (offset 64, 8 bytes, space-padded ASCII)
copyPadded(buf[64:72], "0001")
// RAB (Recommended Arbitration Burst) - offset 72
buf[72] = 6
// IEEE OUI - offset 73-75 (3 bytes, 0 for software)
// CMIC (Controller Multi-Path I/O Capabilities) - offset 76
// bit 3: ANA reporting supported
buf[76] = 0x08
// MDTS (Maximum Data Transfer Size) - offset 77
// 2^MDTS * 4096 = max transfer. MDTS=3 → 32KB
buf[77] = 3
// CNTLID (Controller ID) - offset 78-79
binary.LittleEndian.PutUint16(buf[78:], c.cntlID)
// Version - offset 80-83
binary.LittleEndian.PutUint32(buf[80:], nvmeVersion14)
// OACS (Optional Admin Command Support) - offset 256-257
// 0 = no optional admin commands
binary.LittleEndian.PutUint16(buf[256:], 0)
// ACRTD (Abort Command Limit) - offset 258
buf[258] = 3
// AERTL (Async Event Request Limit) - offset 259
buf[259] = 3
// FRMW (Firmware Updates) - offset 260
buf[260] = 0x02 // slot 1 read-only
// LPA (Log Page Attributes) - offset 261
buf[261] = 0
// ELPE (Error Log Page Entries) - offset 262
buf[262] = 0 // 1 entry (0-based)
// SQES (Submission Queue Entry Size) - offset 512
// min=6 (2^6=64 bytes), max=6
buf[512] = 0x66
// CQES (Completion Queue Entry Size) - offset 513
// min=4 (2^4=16 bytes), max=4
buf[513] = 0x44
// MAXCMD - offset 514-515
binary.LittleEndian.PutUint16(buf[514:], 64)
// NN (Number of Namespaces) - offset 516-519
binary.LittleEndian.PutUint32(buf[516:], 1)
// ONCS (Optional NVM Command Support) - offset 520-521
// bit 3: WriteZeros, bit 2: DatasetMgmt (Trim)
binary.LittleEndian.PutUint16(buf[520:], 0x0C)
// ANACAP (ANA Capabilities) - offset 522
// bit 3: reports Optimized state
buf[522] = 0x08
// ANAGRPMAX - offset 524-527
binary.LittleEndian.PutUint32(buf[524:], 1)
// NANAGRPID - offset 528-531
binary.LittleEndian.PutUint32(buf[528:], 1)
// VWC (Volatile Write Cache) - offset 525
// bit 0: volatile write cache present → Flush required
buf[525] = 0x01
// SGLS (SGL Support) - offset 536-539
// bit 0: SGLs supported (required for NVMe/TCP)
binary.LittleEndian.PutUint32(buf[536:], 0x01)
// SubNQN (Subsystem NQN) - offset 768, 256 bytes
copyPadded(buf[768:1024], sub.NQN)
// IOCCSZ (I/O Queue Command Capsule Supported Size) - offset 1792-1795
// In 16-byte units: 64/16 = 4
binary.LittleEndian.PutUint32(buf[1792:], 4)
// IORCSZ (I/O Queue Response Capsule Supported Size) - offset 1796-1799
// In 16-byte units: 16/16 = 1
binary.LittleEndian.PutUint32(buf[1796:], 1)
// ICDOFF (In Capsule Data Offset) - offset 1800-1801
// 0 means inline data immediately follows SQE in capsule
binary.LittleEndian.PutUint16(buf[1800:], 0)
// FCATT (Fabrics Controller Attributes) - offset 1802
// bit 0: 0 = I/O controller (not discovery)
buf[1802] = 0
// MSDBD (Maximum SGL Data Block Descriptors) - offset 1803
buf[1803] = 1
// OFCS (Optional Fabric Commands Supported) - offset 1804-1805
// bit 0: Disconnect command supported
binary.LittleEndian.PutUint16(buf[1804:], 0x01)
req.c2hData = buf
return c.sendC2HDataAndResponse(req)
}
// identifyNamespace returns the 4096-byte Identify Namespace data for NSID=1.
func (c *Controller) identifyNamespace(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
dev := sub.Dev
blockSize := dev.BlockSize()
nsze := dev.VolumeSize() / uint64(blockSize)
buf := make([]byte, identifySize)
// NSZE (Namespace Size in blocks) - offset 0-7
binary.LittleEndian.PutUint64(buf[0:], nsze)
// NCAP (Namespace Capacity) - offset 8-15
binary.LittleEndian.PutUint64(buf[8:], nsze)
// NUSE (Namespace Utilization) - offset 16-23
binary.LittleEndian.PutUint64(buf[16:], nsze)
// NSFEAT (Namespace Features) - offset 24
// bit 0: thin provisioning (supports Trim)
buf[24] = 0x01
// NLBAF (Number of LBA Formats minus 1) - offset 25
buf[25] = 0 // one format
// FLBAS (Formatted LBA Size) - offset 26
// bits 3:0 = LBA format index (0)
buf[26] = 0
// MC (Metadata Capabilities) - offset 27
buf[27] = 0
// DLFEAT (Deallocate Logical Block Features) - offset 28
// bit 2: Deallocated blocks return zeros on read
buf[28] = 0x04
// NGUID (Namespace Globally Unique Identifier) - offset 104-119 (16 bytes)
copy(buf[104:120], sub.NGUID[:])
// LBAF[0] (LBA Format 0) - offset 128-131
// bits 23:16 = LBADS (log2 of block size)
lbads := uint8(bits.TrailingZeros32(blockSize))
binary.LittleEndian.PutUint32(buf[128:], uint32(lbads)<<16)
// ANAGRPID (ANA Group Identifier) - offset 92-95
binary.LittleEndian.PutUint32(buf[92:], 1)
req.c2hData = buf
return c.sendC2HDataAndResponse(req)
}
// identifyActiveNSList returns the list of active namespace IDs (just NSID=1).
func (c *Controller) identifyActiveNSList(req *Request) error {
buf := make([]byte, identifySize)
// Single namespace: NSID=1
binary.LittleEndian.PutUint32(buf[0:], 1)
req.c2hData = buf
return c.sendC2HDataAndResponse(req)
}
// identifyNSDescriptors returns namespace descriptor list for NSID=1.
func (c *Controller) identifyNSDescriptors(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
buf := make([]byte, identifySize)
off := 0
// NGUID descriptor (type=0x02, length=16)
buf[off] = 0x02 // NIDT: NGUID
off++
buf[off] = 16 // NIDL: 16 bytes
off++
off += 2 // reserved
copy(buf[off:off+16], sub.NGUID[:])
req.c2hData = buf
return c.sendC2HDataAndResponse(req)
}
// copyPadded copies src into dst, padding remaining bytes with spaces.
func copyPadded(dst []byte, src string) {
n := copy(dst, src)
for i := n; i < len(dst); i++ {
dst[i] = ' '
}
}

157
weed/storage/blockvol/nvme/io.go

@ -0,0 +1,157 @@
package nvme
// handleRead processes an NVMe Read command.
func (c *Controller) handleRead(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
dev := sub.Dev
lba := req.capsule.Lba()
nlb := req.capsule.LbaLength()
blockSize := dev.BlockSize()
totalBytes := uint32(nlb) * blockSize
// Bounds check
nsze := dev.VolumeSize() / uint64(blockSize)
if lba+uint64(nlb) > nsze {
req.resp.Status = uint16(StatusLBAOutOfRange)
return c.sendResponse(req)
}
data, err := dev.ReadAt(lba, totalBytes)
if err != nil {
req.resp.Status = uint16(mapBlockError(err))
return c.sendResponse(req)
}
req.c2hData = data
return c.sendC2HDataAndResponse(req)
}
// handleWrite processes an NVMe Write command with inline data.
func (c *Controller) handleWrite(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
// Check ANA state (write-gating)
if !c.isWriteAllowed() {
req.resp.Status = uint16(StatusNSNotReady)
return c.sendResponse(req)
}
// Inline data must be present (DataOffset != 0 in the received PDU).
// If DataOffset == 0 for a Write, the host expects R2T flow — reject.
if len(req.payload) == 0 {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
dev := sub.Dev
lba := req.capsule.Lba()
nlb := req.capsule.LbaLength()
blockSize := dev.BlockSize()
// Bounds check
nsze := dev.VolumeSize() / uint64(blockSize)
if lba+uint64(nlb) > nsze {
req.resp.Status = uint16(StatusLBAOutOfRange)
return c.sendResponse(req)
}
// Validate payload size matches NLB*blockSize.
expectedBytes := uint32(nlb) * blockSize
if uint32(len(req.payload)) != expectedBytes {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
if err := dev.WriteAt(lba, req.payload); err != nil {
req.resp.Status = uint16(mapBlockError(err))
return c.sendResponse(req)
}
return c.sendResponse(req)
}
// handleFlush processes an NVMe Flush command.
func (c *Controller) handleFlush(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
if !c.isWriteAllowed() {
req.resp.Status = uint16(StatusNSNotReady)
return c.sendResponse(req)
}
if err := sub.Dev.SyncCache(); err != nil {
req.resp.Status = uint16(mapBlockError(err))
return c.sendResponse(req)
}
return c.sendResponse(req)
}
// handleWriteZeros processes an NVMe Write Zeroes command.
func (c *Controller) handleWriteZeros(req *Request) error {
sub := c.subsystem
if sub == nil {
req.resp.Status = uint16(StatusInvalidField)
return c.sendResponse(req)
}
if !c.isWriteAllowed() {
req.resp.Status = uint16(StatusNSNotReady)
return c.sendResponse(req)
}
dev := sub.Dev
lba := req.capsule.Lba()
nlb := req.capsule.LbaLength()
blockSize := dev.BlockSize()
totalBytes := uint32(nlb) * blockSize
// Bounds check
nsze := dev.VolumeSize() / uint64(blockSize)
if lba+uint64(nlb) > nsze {
req.resp.Status = uint16(StatusLBAOutOfRange)
return c.sendResponse(req)
}
// D12 bit 25: DEALLOC — if set, use Trim instead of writing zeros
if req.capsule.D12&commandBitDeallocate != 0 {
if err := dev.Trim(lba, totalBytes); err != nil {
req.resp.Status = uint16(mapBlockError(err))
return c.sendResponse(req)
}
} else {
zeroBuf := make([]byte, totalBytes)
if err := dev.WriteAt(lba, zeroBuf); err != nil {
req.resp.Status = uint16(mapBlockError(err))
return c.sendResponse(req)
}
}
return c.sendResponse(req)
}
// isWriteAllowed checks if the current ANA state allows writes.
func (c *Controller) isWriteAllowed() bool {
if c.subsystem == nil {
return false
}
if prov, ok := c.subsystem.Dev.(ANAProvider); ok {
state := prov.ANAState()
return state == anaOptimized || state == anaNonOptimized
}
// No ANA provider: allow if healthy
return c.subsystem.Dev.IsHealthy()
}

1541
weed/storage/blockvol/nvme/nvme_qa_test.go
File diff suppressed because it is too large
View File

2377
weed/storage/blockvol/nvme/nvme_test.go
File diff suppressed because it is too large
View File

444
weed/storage/blockvol/nvme/protocol.go

@ -0,0 +1,444 @@
// Package nvme implements an NVMe/TCP target for SeaweedFS BlockVol.
//
// This package provides a functionally correct NVMe-oF over TCP transport
// that shares the same BlockVol engine, fencing, replication, and failover
// as the iSCSI target.
package nvme
import (
"encoding/binary"
"fmt"
)
// ---------- PDU type codes ----------
const (
pduICReq uint8 = 0x0 // Initialization Connection Request
pduICResp uint8 = 0x1 // Initialization Connection Response
pduH2CTermReq uint8 = 0x2 // Host-to-Controller Termination Request
pduC2HTermReq uint8 = 0x3 // Controller-to-Host Termination Request
pduCapsuleCmd uint8 = 0x4 // NVMe Capsule Command
pduCapsuleResp uint8 = 0x5 // NVMe Capsule Response
pduC2HData uint8 = 0x7 // Controller-to-Host Data Transfer
pduR2T uint8 = 0x9 // Ready-to-Transfer
)
// ---------- Admin command opcodes ----------
const (
adminFlush uint8 = 0x00 // NVM Flush (admin context unused here)
adminGetLogPage uint8 = 0x02
adminIdentify uint8 = 0x06
adminAbort uint8 = 0x08
adminSetFeatures uint8 = 0x09
adminGetFeatures uint8 = 0x0A
adminAsyncEvent uint8 = 0x0C
adminKeepAlive uint8 = 0x18
adminFabric uint8 = 0x7F // Fabric-specific commands
)
// ---------- IO command opcodes ----------
const (
ioFlush uint8 = 0x00
ioWrite uint8 = 0x01
ioRead uint8 = 0x02
ioWriteZeros uint8 = 0x08
)
// ---------- Fabric command types (FCType) ----------
const (
fcPropertySet uint8 = 0x00
fcConnect uint8 = 0x01
fcPropertyGet uint8 = 0x04
fcDisconnect uint8 = 0x08
)
// ---------- Feature identifiers ----------
const (
fidNumberOfQueues uint8 = 0x07
fidAsyncEventConfig uint8 = 0x0B
fidKeepAliveTimer uint8 = 0x0F
)
// ---------- Identify CNS types ----------
const (
cnsIdentifyNamespace uint8 = 0x00
cnsIdentifyController uint8 = 0x01
cnsActiveNSList uint8 = 0x02
cnsNSDescriptorList uint8 = 0x03
)
// ---------- Log page identifiers ----------
const (
logPageError uint8 = 0x01
logPageSMART uint8 = 0x02
logPageANA uint8 = 0x0C
)
// ---------- Property register offsets ----------
const (
propCAP uint32 = 0x00 // Controller Capabilities
propVS uint32 = 0x08 // Version
propCC uint32 = 0x14 // Controller Configuration
propCSTS uint32 = 0x1C // Controller Status
)
// ---------- ANA states ----------
const (
anaOptimized uint8 = 0x01
anaNonOptimized uint8 = 0x02
anaInaccessible uint8 = 0x03
anaPersistentLoss uint8 = 0x04
anaChange uint8 = 0x0F
)
// ---------- Misc constants ----------
const (
commonHeaderSize = 8
maxHeaderSize = 128
maxH2CDataLen = 0x8000 // 32 KB
capsuleCmdSize = 64 // CapsuleCommand specific header size (after CommonHeader)
capsuleRespSize = 16 // CapsuleResponse specific header size
c2hDataHdrSize = 16 // C2HDataHeader specific header size
icBodySize = 120 // ICReq/ICResp body size (after CommonHeader)
connectDataSize = 1024
// Total header lengths including CommonHeader
capsuleCmdHdrLen = commonHeaderSize + capsuleCmdSize // 72
capsuleRespHdrLen = commonHeaderSize + capsuleRespSize // 24
c2hDataHdrLen = commonHeaderSize + c2hDataHdrSize // 24
icHdrLen = commonHeaderSize + icBodySize // 128
commandBitDeallocate = 1 << 25
nvmeVersion14 uint32 = 0x00010400 // NVMe 1.4
// C2HData flags
c2hFlagLast uint8 = 0x04
)
// ---------- CommonHeader (8 bytes) ----------
// CommonHeader is the 8-byte preamble of every NVMe/TCP PDU.
type CommonHeader struct {
Type uint8
Flags uint8
HeaderLength uint8
DataOffset uint8
DataLength uint32
}
func (h *CommonHeader) Marshal(buf []byte) {
buf[0] = h.Type
buf[1] = h.Flags
buf[2] = h.HeaderLength
buf[3] = h.DataOffset
binary.LittleEndian.PutUint32(buf[4:], h.DataLength)
}
func (h *CommonHeader) Unmarshal(buf []byte) {
h.Type = buf[0]
h.Flags = buf[1]
h.HeaderLength = buf[2]
h.DataOffset = buf[3]
h.DataLength = binary.LittleEndian.Uint32(buf[4:])
}
func (h *CommonHeader) String() string {
return fmt.Sprintf("PDU{type=0x%x hlen=%d doff=%d dlen=%d}",
h.Type, h.HeaderLength, h.DataOffset, h.DataLength)
}
// ---------- PDU interface ----------
// PDU is the interface for all NVMe/TCP PDU-specific headers.
type PDU interface {
Marshal([]byte)
Unmarshal([]byte)
}
// ---------- ICRequest (120-byte body) ----------
// ICRequest is the host-to-controller initialization request.
type ICRequest struct {
PDUFormatVersion uint16
PDUDataAlignment uint8
PDUDataDigest uint8
PDUMaxR2T uint32
// remaining 112 bytes reserved
}
func (r *ICRequest) Marshal(buf []byte) {
// zero out the full 120-byte body
for i := range buf[:icBodySize] {
buf[i] = 0
}
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion)
buf[2] = r.PDUDataAlignment
buf[3] = r.PDUDataDigest
binary.LittleEndian.PutUint32(buf[4:], r.PDUMaxR2T)
}
func (r *ICRequest) Unmarshal(buf []byte) {
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:])
r.PDUDataAlignment = buf[2]
r.PDUDataDigest = buf[3]
r.PDUMaxR2T = binary.LittleEndian.Uint32(buf[4:])
}
// ---------- ICResponse (120-byte body) ----------
// ICResponse is the controller-to-host initialization response.
type ICResponse struct {
PDUFormatVersion uint16
PDUDataAlignment uint8
PDUDataDigest uint8
MaxH2CDataLength uint32
// remaining 112 bytes reserved
}
func (r *ICResponse) Marshal(buf []byte) {
for i := range buf[:icBodySize] {
buf[i] = 0
}
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion)
buf[2] = r.PDUDataAlignment
buf[3] = r.PDUDataDigest
binary.LittleEndian.PutUint32(buf[4:], r.MaxH2CDataLength)
}
func (r *ICResponse) Unmarshal(buf []byte) {
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:])
r.PDUDataAlignment = buf[2]
r.PDUDataDigest = buf[3]
r.MaxH2CDataLength = binary.LittleEndian.Uint32(buf[4:])
}
// ---------- CapsuleCommand (64-byte specific header) ----------
// CapsuleCommand is the 64-byte NVMe command capsule.
type CapsuleCommand struct {
OpCode uint8
PRP uint8
CID uint16
FCType uint8 // Fabric command type (only for OpCode=0x7F)
NSID uint32 // Namespace ID (bytes 4-7 of NVMe SQE after opcode/flags/CID)
DPTR [16]byte // Data pointer
D10 uint32
D11 uint32
D12 uint32
D13 uint32
D14 uint32
D15 uint32
}
// Lba returns the starting LBA from D10:D11 (Read/Write commands).
func (c *CapsuleCommand) Lba() uint64 {
return uint64(c.D11)<<32 | uint64(c.D10)
}
// LbaLength returns the number of logical blocks (0-based in D12, actual = D12&0xFFFF + 1).
func (c *CapsuleCommand) LbaLength() uint32 {
return c.D12&0xFFFF + 1
}
func (c *CapsuleCommand) Marshal(buf []byte) {
for i := range buf[:capsuleCmdSize] {
buf[i] = 0
}
buf[0] = c.OpCode
buf[1] = c.PRP
binary.LittleEndian.PutUint16(buf[2:], c.CID)
// Bytes 4-7: NSID for normal commands, FCType at byte 4 for Fabric (0x7F).
// They share the same offset per NVMe spec.
if c.OpCode == adminFabric {
buf[4] = c.FCType
} else {
binary.LittleEndian.PutUint32(buf[4:], c.NSID)
}
copy(buf[24:40], c.DPTR[:])
binary.LittleEndian.PutUint32(buf[40:], c.D10)
binary.LittleEndian.PutUint32(buf[44:], c.D11)
binary.LittleEndian.PutUint32(buf[48:], c.D12)
binary.LittleEndian.PutUint32(buf[52:], c.D13)
binary.LittleEndian.PutUint32(buf[56:], c.D14)
binary.LittleEndian.PutUint32(buf[60:], c.D15)
}
func (c *CapsuleCommand) Unmarshal(buf []byte) {
c.OpCode = buf[0]
c.PRP = buf[1]
c.CID = binary.LittleEndian.Uint16(buf[2:])
c.FCType = buf[4]
c.NSID = binary.LittleEndian.Uint32(buf[4:])
copy(c.DPTR[:], buf[24:40])
c.D10 = binary.LittleEndian.Uint32(buf[40:])
c.D11 = binary.LittleEndian.Uint32(buf[44:])
c.D12 = binary.LittleEndian.Uint32(buf[48:])
c.D13 = binary.LittleEndian.Uint32(buf[52:])
c.D14 = binary.LittleEndian.Uint32(buf[56:])
c.D15 = binary.LittleEndian.Uint32(buf[60:])
}
func (c *CapsuleCommand) String() string {
return fmt.Sprintf("CapsuleCmd{op=0x%02x cid=%d nsid=%d}", c.OpCode, c.CID, c.NSID)
}
// ---------- CapsuleResponse (16-byte specific header) ----------
// CapsuleResponse is the NVMe completion queue entry (16 bytes).
type CapsuleResponse struct {
DW0 uint32 // Command-specific DWord 0 (also FabricResponse bytes 0-3)
DW1 uint32 // Command-specific DWord 1 (also FabricResponse bytes 4-7)
SQHD uint16 // Submission Queue Head Pointer
QueueID uint16
CID uint16
Status uint16 // Status field: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0)
}
func (r *CapsuleResponse) Marshal(buf []byte) {
binary.LittleEndian.PutUint32(buf[0:], r.DW0)
binary.LittleEndian.PutUint32(buf[4:], r.DW1)
binary.LittleEndian.PutUint16(buf[8:], r.SQHD)
binary.LittleEndian.PutUint16(buf[10:], r.QueueID)
binary.LittleEndian.PutUint16(buf[12:], r.CID)
binary.LittleEndian.PutUint16(buf[14:], r.Status)
}
func (r *CapsuleResponse) Unmarshal(buf []byte) {
r.DW0 = binary.LittleEndian.Uint32(buf[0:])
r.DW1 = binary.LittleEndian.Uint32(buf[4:])
r.SQHD = binary.LittleEndian.Uint16(buf[8:])
r.QueueID = binary.LittleEndian.Uint16(buf[10:])
r.CID = binary.LittleEndian.Uint16(buf[12:])
r.Status = binary.LittleEndian.Uint16(buf[14:])
}
func (r *CapsuleResponse) String() string {
return fmt.Sprintf("CapsuleResp{sqhd=%d qid=%d cid=%d status=0x%04x}",
r.SQHD, r.QueueID, r.CID, r.Status)
}
// ---------- C2HDataHeader (16-byte specific header) ----------
// C2HDataHeader is the controller-to-host data transfer header.
type C2HDataHeader struct {
CCCID uint16 // Command Capsule CID
_ uint16 // reserved
DATAO uint32 // Data offset within the total transfer
DATAL uint32 // Data length in this PDU
_pad uint32 // reserved
}
func (h *C2HDataHeader) Marshal(buf []byte) {
for i := range buf[:c2hDataHdrSize] {
buf[i] = 0
}
binary.LittleEndian.PutUint16(buf[0:], h.CCCID)
binary.LittleEndian.PutUint32(buf[4:], h.DATAO)
binary.LittleEndian.PutUint32(buf[8:], h.DATAL)
}
func (h *C2HDataHeader) Unmarshal(buf []byte) {
h.CCCID = binary.LittleEndian.Uint16(buf[0:])
h.DATAO = binary.LittleEndian.Uint32(buf[4:])
h.DATAL = binary.LittleEndian.Uint32(buf[8:])
}
// ---------- ConnectData (1024 bytes, payload of Fabric Connect) ----------
// ConnectData is the 1024-byte payload sent with a Fabric Connect command.
type ConnectData struct {
HostID [16]byte // Host UUID
CNTLID uint16 // Requested controller ID (0xFFFF = new)
SubNQN string // Subsystem NQN
HostNQN string // Host NQN
}
func (d *ConnectData) Marshal(buf []byte) {
for i := range buf[:connectDataSize] {
buf[i] = 0
}
copy(buf[0:16], d.HostID[:])
binary.LittleEndian.PutUint16(buf[16:], d.CNTLID)
copyNQN(buf[256:512], d.SubNQN)
copyNQN(buf[512:768], d.HostNQN)
}
func (d *ConnectData) Unmarshal(buf []byte) {
copy(d.HostID[:], buf[0:16])
d.CNTLID = binary.LittleEndian.Uint16(buf[16:])
d.SubNQN = extractNQN(buf[256:512])
d.HostNQN = extractNQN(buf[512:768])
}
// copyNQN writes a NUL-terminated string into a fixed-size buffer.
func copyNQN(dst []byte, s string) {
n := copy(dst, s)
if n < len(dst) {
dst[n] = 0
}
}
// extractNQN reads a NUL-terminated string from a fixed-size buffer.
func extractNQN(buf []byte) string {
for i, b := range buf {
if b == 0 {
return string(buf[:i])
}
}
return string(buf)
}
// ---------- Status word encoding ----------
// StatusWord encodes NVMe status: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0)
//
// StatusWord = (DNR << 15) | (SCT << 9) | (SC << 1)
type StatusWord uint16
// MakeStatus constructs a status word from SCT, SC, and DNR flag.
func MakeStatus(sct, sc uint8, dnr bool) StatusWord {
w := uint16(sct)<<9 | uint16(sc)<<1
if dnr {
w |= 1 << 15
}
return StatusWord(w)
}
// StatusSuccess is the zero-value success status.
const StatusSuccess StatusWord = 0
// Pre-defined status words used in the NVMe target.
var (
StatusInvalidOpcode = MakeStatus(0, 0x01, true) // Generic: Invalid Command Opcode
StatusInvalidField = MakeStatus(0, 0x02, true) // Generic: Invalid Field in Command
StatusInternalError = MakeStatus(0, 0x06, false) // Generic: Internal Error (retryable)
StatusInternalErrorDNR = MakeStatus(0, 0x06, true) // Generic: Internal Error (permanent)
StatusNSNotReady = MakeStatus(0, 0x82, false) // Generic: Namespace Not Ready (retryable)
StatusNSNotReadyDNR = MakeStatus(0, 0x82, true) // Generic: Namespace Not Ready (permanent)
StatusLBAOutOfRange = MakeStatus(0, 0x80, true) // Generic: LBA Out of Range
StatusMediaWriteFault = MakeStatus(2, 0x80, false) // Media: Write Fault
StatusMediaReadError = MakeStatus(2, 0x81, false) // Media: Uncorrectable Read Error
)
func (s StatusWord) SCT() uint8 { return uint8((s >> 9) & 0x07) }
func (s StatusWord) SC() uint8 { return uint8((s >> 1) & 0xFF) }
func (s StatusWord) DNR() bool { return s&(1<<15) != 0 }
func (s StatusWord) IsError() bool { return s != StatusSuccess }
func (s StatusWord) String() string {
if s == StatusSuccess {
return "Success"
}
return fmt.Sprintf("Status{sct=%d sc=0x%02x dnr=%v}", s.SCT(), s.SC(), s.DNR())
}

210
weed/storage/blockvol/nvme/server.go

@ -0,0 +1,210 @@
package nvme
import (
"fmt"
"log"
"net"
"sync"
"sync/atomic"
"time"
)
// Config holds NVMe/TCP target configuration.
type Config struct {
ListenAddr string
NQNPrefix string
MaxH2CDataLength uint32
MaxIOQueues uint16
Enabled bool
}
// DefaultConfig returns the default NVMe target configuration.
func DefaultConfig() Config {
return Config{
ListenAddr: "0.0.0.0:4420",
NQNPrefix: "nqn.2024-01.com.seaweedfs:vol.",
MaxH2CDataLength: maxH2CDataLen,
MaxIOQueues: 4,
Enabled: false,
}
}
// adminSession stores state from an admin queue connection that IO queue
// connections need to look up (they arrive on separate TCP connections).
type adminSession struct {
cntlID uint16
subsystem *Subsystem
subNQN string
hostNQN string
regCAP uint64
regCC uint32
regCSTS uint32
regVS uint32
katoMs uint32
}
// Server is the NVMe/TCP target server.
type Server struct {
cfg Config
listener net.Listener
mu sync.RWMutex
subsystems map[string]*Subsystem // NQN → Subsystem
sessions map[*Controller]struct{}
adminMu sync.RWMutex
admins map[uint16]*adminSession // CNTLID → admin session
nextCNTLID atomic.Uint32
closed atomic.Bool
wg sync.WaitGroup
}
// NewServer creates a new NVMe/TCP target server.
func NewServer(cfg Config) *Server {
return &Server{
cfg: cfg,
subsystems: make(map[string]*Subsystem),
sessions: make(map[*Controller]struct{}),
admins: make(map[uint16]*adminSession),
}
}
// AddVolume registers a block device as an NVMe subsystem.
func (s *Server) AddVolume(nqn string, dev BlockDevice, nguid [16]byte) {
s.mu.Lock()
defer s.mu.Unlock()
s.subsystems[nqn] = &Subsystem{
NQN: nqn,
Dev: dev,
NGUID: nguid,
}
}
// RemoveVolume unregisters an NVMe subsystem.
func (s *Server) RemoveVolume(nqn string) {
s.mu.Lock()
defer s.mu.Unlock()
delete(s.subsystems, nqn)
}
// ListenAndServe starts the NVMe/TCP listener.
// If not enabled, returns nil immediately.
func (s *Server) ListenAndServe() error {
if !s.cfg.Enabled {
return nil
}
ln, err := net.Listen("tcp", s.cfg.ListenAddr)
if err != nil {
return fmt.Errorf("nvme listen %s: %w", s.cfg.ListenAddr, err)
}
s.listener = ln
log.Printf("nvme: listening on %s", s.cfg.ListenAddr)
s.wg.Add(1)
go func() {
defer s.wg.Done()
s.acceptLoop()
}()
return nil
}
func (s *Server) acceptLoop() {
for {
conn, err := s.listener.Accept()
if err != nil {
if s.closed.Load() {
return
}
log.Printf("nvme: accept error: %v", err)
continue
}
ctrl := newController(conn, s)
s.addSession(ctrl)
s.wg.Add(1)
go func() {
defer s.wg.Done()
if err := ctrl.Serve(); err != nil {
if !s.closed.Load() {
log.Printf("nvme: session error: %v", err)
}
}
}()
}
}
func (s *Server) addSession(ctrl *Controller) {
s.mu.Lock()
defer s.mu.Unlock()
s.sessions[ctrl] = struct{}{}
}
func (s *Server) removeSession(ctrl *Controller) {
s.mu.Lock()
defer s.mu.Unlock()
delete(s.sessions, ctrl)
}
// registerAdmin stores admin queue state so IO queue connections can look it up.
func (s *Server) registerAdmin(sess *adminSession) {
s.adminMu.Lock()
defer s.adminMu.Unlock()
s.admins[sess.cntlID] = sess
}
// unregisterAdmin removes an admin session by CNTLID.
func (s *Server) unregisterAdmin(cntlID uint16) {
s.adminMu.Lock()
defer s.adminMu.Unlock()
delete(s.admins, cntlID)
}
// lookupAdmin returns the admin session for the given CNTLID.
func (s *Server) lookupAdmin(cntlID uint16) *adminSession {
s.adminMu.RLock()
defer s.adminMu.RUnlock()
return s.admins[cntlID]
}
// Close gracefully shuts down the server.
func (s *Server) Close() error {
if !s.cfg.Enabled {
return nil
}
s.closed.Store(true)
if s.listener != nil {
s.listener.Close()
}
// Close all active sessions
s.mu.RLock()
sessions := make([]*Controller, 0, len(s.sessions))
for ctrl := range s.sessions {
sessions = append(sessions, ctrl)
}
s.mu.RUnlock()
for _, ctrl := range sessions {
ctrl.conn.Close()
}
// Wait with timeout
done := make(chan struct{})
go func() {
s.wg.Wait()
close(done)
}()
select {
case <-done:
case <-time.After(5 * time.Second):
log.Printf("nvme: shutdown timed out after 5s")
}
return nil
}
// NQN returns the full NQN for a volume name.
func (s *Server) NQN(volName string) string {
return s.cfg.NQNPrefix + volName
}

202
weed/storage/blockvol/nvme/wire.go

@ -0,0 +1,202 @@
package nvme
import (
"bufio"
"encoding/binary"
"fmt"
"io"
)
// ---------- Reader ----------
// Reader decodes NVMe/TCP PDUs from a stream.
//
// Usage:
//
// hdr, _ := r.Dequeue() // read 8-byte CommonHeader
// r.Receive(&capsuleCmd) // read remaining specific header
// if r.Length() > 0 {
// data := make([]byte, r.Length())
// r.ReceiveData(data) // read payload
// }
type Reader struct {
rd io.Reader
CH CommonHeader
header [maxHeaderSize]byte
}
// NewReader wraps an io.Reader for NVMe/TCP PDU decoding.
func NewReader(r io.Reader) *Reader {
return &Reader{rd: r}
}
// Dequeue reads the 8-byte CommonHeader, validates bounds, and returns it.
func (r *Reader) Dequeue() (*CommonHeader, error) {
if _, err := io.ReadFull(r.rd, r.header[:commonHeaderSize]); err != nil {
return nil, err
}
r.CH.Unmarshal(r.header[:commonHeaderSize])
// Validate header bounds to prevent panics on malformed PDUs.
if r.CH.HeaderLength < commonHeaderSize {
return nil, fmt.Errorf("nvme: HeaderLength %d < minimum %d", r.CH.HeaderLength, commonHeaderSize)
}
if r.CH.HeaderLength > maxHeaderSize {
return nil, fmt.Errorf("nvme: HeaderLength %d > maximum %d", r.CH.HeaderLength, maxHeaderSize)
}
if r.CH.DataOffset != 0 && r.CH.DataOffset < r.CH.HeaderLength {
return nil, fmt.Errorf("nvme: DataOffset %d < HeaderLength %d", r.CH.DataOffset, r.CH.HeaderLength)
}
if r.CH.DataOffset != 0 && uint32(r.CH.DataOffset) > r.CH.DataLength {
return nil, fmt.Errorf("nvme: DataOffset %d > DataLength %d", r.CH.DataOffset, r.CH.DataLength)
}
if r.CH.DataLength < uint32(r.CH.HeaderLength) {
return nil, fmt.Errorf("nvme: DataLength %d < HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength)
}
// DataOffset==0 means no inline data — DataLength must equal HeaderLength,
// otherwise unconsumed bytes desynchronize the stream.
if r.CH.DataOffset == 0 && r.CH.DataLength != uint32(r.CH.HeaderLength) {
return nil, fmt.Errorf("nvme: DataOffset=0 but DataLength %d != HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength)
}
return &r.CH, nil
}
// Receive reads the remaining PDU-specific header (HeaderLength - 8 bytes)
// and unmarshals it into pdu. It also skips any padding between header and
// data (DataOffset - HeaderLength bytes).
func (r *Reader) Receive(pdu PDU) error {
remain := int(r.CH.HeaderLength) - commonHeaderSize
if remain <= 0 {
return nil
}
if _, err := io.ReadFull(r.rd, r.header[commonHeaderSize:r.CH.HeaderLength]); err != nil {
return err
}
pdu.Unmarshal(r.header[commonHeaderSize:r.CH.HeaderLength])
// Skip padding between header and data.
pad := int(r.CH.DataOffset) - int(r.CH.HeaderLength)
if pad > 0 {
if _, err := io.ReadFull(r.rd, make([]byte, pad)); err != nil {
return err
}
}
return nil
}
// Length returns the payload size: DataLength - DataOffset (when DataOffset != 0).
func (r *Reader) Length() uint32 {
if r.CH.DataOffset != 0 {
return r.CH.DataLength - uint32(r.CH.DataOffset)
}
return 0
}
// ReceiveData reads exactly len(buf) bytes of payload data.
func (r *Reader) ReceiveData(buf []byte) error {
_, err := io.ReadFull(r.rd, buf)
return err
}
// ---------- Writer ----------
// Writer encodes NVMe/TCP PDUs to a stream.
type Writer struct {
wr *bufio.Writer
CH CommonHeader
header [maxHeaderSize]byte
}
// NewWriter wraps an io.Writer for NVMe/TCP PDU encoding.
func NewWriter(w io.Writer) *Writer {
return &Writer{wr: bufio.NewWriter(w)}
}
// PrepareHeaderOnly sets up a header-only PDU (no payload).
// Call Flush() to write it to the wire.
func (w *Writer) PrepareHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) {
w.CH.Type = pduType
w.CH.Flags = 0
w.CH.HeaderLength = commonHeaderSize + specificLen
w.CH.DataOffset = 0
w.CH.DataLength = uint32(w.CH.HeaderLength)
pdu.Marshal(w.header[commonHeaderSize:])
}
// PrepareWithData sets up a PDU with payload data.
// Call Flush() to write it to the wire.
func (w *Writer) PrepareWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) {
w.CH.Type = pduType
w.CH.Flags = flags
w.CH.HeaderLength = commonHeaderSize + specificLen
if data != nil {
w.CH.DataOffset = w.CH.HeaderLength
w.CH.DataLength = uint32(w.CH.HeaderLength) + uint32(len(data))
} else {
w.CH.DataOffset = 0
w.CH.DataLength = uint32(w.CH.HeaderLength)
}
pdu.Marshal(w.header[commonHeaderSize:])
}
// Flush writes the prepared CommonHeader + specific header to the wire.
// If there was payload data (from PrepareWithData), call FlushData after.
func (w *Writer) Flush() error {
w.CH.Marshal(w.header[:commonHeaderSize])
if _, err := w.wr.Write(w.header[:w.CH.HeaderLength]); err != nil {
return err
}
return nil
}
// FlushData writes payload data and flushes the underlying buffered writer.
func (w *Writer) FlushData(data []byte) error {
if len(data) > 0 {
if _, err := w.wr.Write(data); err != nil {
return err
}
}
return w.wr.Flush()
}
// SendHeaderOnly writes a complete header-only PDU (prepare + flush).
func (w *Writer) SendHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) error {
w.PrepareHeaderOnly(pduType, pdu, specificLen)
if err := w.Flush(); err != nil {
return err
}
return w.wr.Flush()
}
// SendWithData writes a complete PDU with payload data.
func (w *Writer) SendWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) error {
w.PrepareWithData(pduType, flags, pdu, specificLen, data)
if err := w.Flush(); err != nil {
return err
}
return w.FlushData(data)
}
// writeRaw writes raw bytes directly (used for ConnectData inline in capsule).
func (w *Writer) writeRaw(data []byte) error {
_, err := w.wr.Write(data)
return err
}
// flushBuf flushes the underlying buffered writer.
func (w *Writer) flushBuf() error {
return w.wr.Flush()
}
// ---------- Helpers ----------
// putLE32 writes a uint32 in little-endian.
func putLE32(buf []byte, v uint32) {
binary.LittleEndian.PutUint32(buf, v)
}
// putLE64 writes a uint64 in little-endian.
func putLE64(buf []byte, v uint64) {
binary.LittleEndian.PutUint64(buf, v)
}
Loading…
Cancel
Save