Browse Source
feat: Phase 10 CP10-1 -- NVMe/TCP target MVP, 109 tests
feat: Phase 10 CP10-1 -- NVMe/TCP target MVP, 109 tests
NVMe over Fabrics (TCP) target implementation sharing the same BlockVol engine, fencing, replication, and failover as the existing iSCSI target.

New package: weed/storage/blockvol/nvme/ (11 files, 2,242 production LOC)
- protocol.go: PDU types, opcodes, status codes, marshal/unmarshal
- wire.go: TCP reader/writer with header bounds validation
- controller.go: IC handshake, per-queue state, command dispatch, KATO
- fabric.go: Connect (admin+IO), PropertyGet/Set, Disconnect
- identify.go: Controller/Namespace/NS list/NS descriptors (Linux 5.15)
- admin.go: SetFeatures, GetFeatures, GetLogPage (SMART/ANA), KeepAlive
- io.go: Read (C2HData), Write (inline), Flush, WriteZeros/Trim
- server.go: TCP listener, admin session registry, graceful shutdown
- adapter.go: BlockVol-to-NVMe bridge, error mapping, ANA state

Integration: NVMeConfig + CLI flags (-block.nvme.*), disabled by default.

Key design: inline-data writes only (no R2T), MaxH2CDataLength=32KB, single ANA group coherent with BlockVol role, CNTLID session registry for cross-connection IO queues, HostNQN continuity enforcement.

Tests: 65 dev + 44 QA adversarial = 109 total, all passing.

Bugs fixed during review: IO queue cross-connection (A), header bounds validation (B), write payload size check (C), disconnect error (D), stream desync prevention (E), HostNQN enforcement (F), capsule-before-IC state guard (H), flowCtlOff SQHD timing (I).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Branch: feature/sw-block
16 changed files with 6268 additions and 16 deletions
-
20weed/command/volume.go
-
2weed/server/block_heartbeat_loop_test.go
-
2weed/server/volume_grpc_block_test.go
-
96weed/server/volume_server_block.go
-
4weed/server/volume_server_block_test.go
-
127weed/storage/blockvol/nvme/adapter.go
-
198weed/storage/blockvol/nvme/admin.go
-
354weed/storage/blockvol/nvme/controller.go
-
300weed/storage/blockvol/nvme/fabric.go
-
250weed/storage/blockvol/nvme/identify.go
-
157weed/storage/blockvol/nvme/io.go
-
1541weed/storage/blockvol/nvme/nvme_qa_test.go
-
2377weed/storage/blockvol/nvme/nvme_test.go
-
444weed/storage/blockvol/nvme/protocol.go
-
210weed/storage/blockvol/nvme/server.go
-
202weed/storage/blockvol/nvme/wire.go
@ -0,0 +1,127 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"errors" |
|||
"strings" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol" |
|||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockerr" |
|||
) |
|||
|
|||
// NVMeAdapter wraps a *BlockVol to implement BlockDevice and ANAProvider
|
|||
// for the NVMe/TCP target, bridging the BlockVol storage engine to NVMe
|
|||
// command handling.
|
|||
type NVMeAdapter struct { |
|||
Vol *blockvol.BlockVol |
|||
} |
|||
|
|||
// NewNVMeAdapter creates a BlockDevice adapter for the given BlockVol.
|
|||
func NewNVMeAdapter(vol *blockvol.BlockVol) *NVMeAdapter { |
|||
return &NVMeAdapter{Vol: vol} |
|||
} |
|||
|
|||
func (a *NVMeAdapter) ReadAt(lba uint64, length uint32) ([]byte, error) { |
|||
return a.Vol.ReadLBA(lba, length) |
|||
} |
|||
|
|||
func (a *NVMeAdapter) WriteAt(lba uint64, data []byte) error { |
|||
return a.Vol.WriteLBA(lba, data) |
|||
} |
|||
|
|||
func (a *NVMeAdapter) Trim(lba uint64, length uint32) error { |
|||
return a.Vol.Trim(lba, length) |
|||
} |
|||
|
|||
func (a *NVMeAdapter) SyncCache() error { |
|||
return a.Vol.SyncCache() |
|||
} |
|||
|
|||
func (a *NVMeAdapter) BlockSize() uint32 { |
|||
return a.Vol.Info().BlockSize |
|||
} |
|||
|
|||
func (a *NVMeAdapter) VolumeSize() uint64 { |
|||
return a.Vol.Info().VolumeSize |
|||
} |
|||
|
|||
func (a *NVMeAdapter) IsHealthy() bool { |
|||
return a.Vol.Info().Healthy |
|||
} |
|||
|
|||
// ANAState returns the ANA state based on the volume's role.
|
|||
func (a *NVMeAdapter) ANAState() uint8 { |
|||
return RoleToANAState(a.Vol.Role()) |
|||
} |
|||
|
|||
// ANAGroupID returns the ANA group ID (always 1 for single-group MVP).
|
|||
func (a *NVMeAdapter) ANAGroupID() uint16 { return 1 } |
|||
|
|||
// DeviceNGUID returns a 16-byte NGUID derived from the volume UUID.
|
|||
func (a *NVMeAdapter) DeviceNGUID() [16]byte { |
|||
return UUIDToNGUID(a.Vol.Info().UUID) |
|||
} |
|||
|
|||
// Compile-time checks.
|
|||
var _ BlockDevice = (*NVMeAdapter)(nil) |
|||
var _ ANAProvider = (*NVMeAdapter)(nil) |
|||
|
|||
// RoleToANAState maps a BlockVol Role to an NVMe ANA state.
|
|||
func RoleToANAState(r blockvol.Role) uint8 { |
|||
switch r { |
|||
case blockvol.RolePrimary, blockvol.RoleNone: |
|||
return anaOptimized |
|||
case blockvol.RoleReplica: |
|||
return anaInaccessible |
|||
case blockvol.RoleStale: |
|||
return anaPersistentLoss |
|||
case blockvol.RoleRebuilding, blockvol.RoleDraining: |
|||
return anaInaccessible |
|||
default: |
|||
return anaInaccessible |
|||
} |
|||
} |
|||
|
|||
// UUIDToNGUID derives a 16-byte NGUID from a 16-byte UUID. The result is a
// copy of uuid whose first byte has its high nibble forced to 0x6 (the NAA-6
// style prefix; the original note says this matches the iSCSI UUIDToNAA
// helper). The low nibble of byte 0 and bytes 1..15 are preserved.
func UUIDToNGUID(uuid [16]byte) [16]byte {
	// Arrays are values in Go, so this assignment copies all 16 bytes.
	nguid := uuid
	nguid[0] = 0x60 | (uuid[0] & 0x0F)
	return nguid
}
|||
|
|||
// mapBlockError maps BlockVol errors to NVMe status words.
|
|||
func mapBlockError(err error) StatusWord { |
|||
if err == nil { |
|||
return StatusSuccess |
|||
} |
|||
|
|||
// Check known sentinel errors from blockvol and blockerr packages.
|
|||
switch { |
|||
case errors.Is(err, blockvol.ErrLeaseExpired): |
|||
return StatusNSNotReadyDNR // DNR=1: fencing is permanent
|
|||
case errors.Is(err, blockvol.ErrEpochRegression): |
|||
return StatusInternalErrorDNR // DNR=1: stale controller
|
|||
case errors.Is(err, blockerr.ErrDurabilityBarrierFailed): |
|||
return StatusInternalError // DNR=0: replica may recover
|
|||
case errors.Is(err, blockerr.ErrDurabilityQuorumLost): |
|||
return StatusInternalError // DNR=0: quorum may heal
|
|||
case errors.Is(err, blockvol.ErrWALFull): |
|||
return StatusNSNotReady // DNR=0: transient pressure
|
|||
case errors.Is(err, blockvol.ErrNotPrimary): |
|||
return StatusNSNotReady // DNR=0: may be transitioning
|
|||
} |
|||
|
|||
// Heuristic for I/O errors (no dedicated sentinels yet).
|
|||
msg := err.Error() |
|||
if strings.Contains(msg, "write") || strings.Contains(msg, "Write") { |
|||
return StatusMediaWriteFault |
|||
} |
|||
if strings.Contains(msg, "read") || strings.Contains(msg, "Read") { |
|||
return StatusMediaReadError |
|||
} |
|||
|
|||
return StatusInternalError |
|||
} |
|||
@ -0,0 +1,198 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"encoding/binary" |
|||
) |
|||
|
|||
// handleSetFeatures processes SetFeatures admin commands.
|
|||
func (c *Controller) handleSetFeatures(req *Request) error { |
|||
fid := uint8(req.capsule.D10 & 0xFF) |
|||
|
|||
switch fid { |
|||
case fidNumberOfQueues: |
|||
// D11: NCQR[15:0] | NSQR[31:16]
|
|||
ncqr := uint16(req.capsule.D11 & 0xFFFF) |
|||
nsqr := uint16(req.capsule.D11 >> 16) |
|||
|
|||
// Grant min(requested, max)
|
|||
if ncqr > c.maxIOQueues { |
|||
ncqr = c.maxIOQueues |
|||
} |
|||
if nsqr > c.maxIOQueues { |
|||
nsqr = c.maxIOQueues |
|||
} |
|||
if ncqr == 0 { |
|||
ncqr = 1 |
|||
} |
|||
if nsqr == 0 { |
|||
nsqr = 1 |
|||
} |
|||
c.grantedQueues = ncqr |
|||
|
|||
// Response DW0: (NCQR-1) | ((NSQR-1) << 16)
|
|||
req.resp.DW0 = uint32(ncqr-1) | (uint32(nsqr-1) << 16) |
|||
return c.sendResponse(req) |
|||
|
|||
case fidKeepAliveTimer: |
|||
// D11 contains KATO in milliseconds
|
|||
c.katoMs = req.capsule.D11 |
|||
return c.sendResponse(req) |
|||
|
|||
case fidAsyncEventConfig: |
|||
// Stub: accept but don't deliver events
|
|||
return c.sendResponse(req) |
|||
|
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// handleGetFeatures returns stored feature values.
|
|||
func (c *Controller) handleGetFeatures(req *Request) error { |
|||
fid := uint8(req.capsule.D10 & 0xFF) |
|||
|
|||
switch fid { |
|||
case fidNumberOfQueues: |
|||
n := c.grantedQueues |
|||
if n == 0 { |
|||
n = c.maxIOQueues |
|||
} |
|||
req.resp.DW0 = uint32(n-1) | (uint32(n-1) << 16) |
|||
return c.sendResponse(req) |
|||
|
|||
case fidKeepAliveTimer: |
|||
req.resp.DW0 = c.katoMs |
|||
return c.sendResponse(req) |
|||
|
|||
case fidAsyncEventConfig: |
|||
req.resp.DW0 = 0 |
|||
return c.sendResponse(req) |
|||
|
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// handleGetLogPage returns log page data.
|
|||
func (c *Controller) handleGetLogPage(req *Request) error { |
|||
// D10 bits 7:0 = Log Page Identifier
|
|||
// D10 bits 27:16 and D11 bits 15:0 = Number of Dwords (NUMD)
|
|||
lid := uint8(req.capsule.D10 & 0xFF) |
|||
numdl := (req.capsule.D10 >> 16) & 0xFFF |
|||
numdu := req.capsule.D11 & 0xFFFF |
|||
numd := uint32(numdu)<<16 | uint32(numdl) |
|||
length := (numd + 1) * 4 // NUMD is 0-based, in dwords
|
|||
|
|||
switch lid { |
|||
case logPageError: |
|||
return c.logPageError(req, length) |
|||
case logPageSMART: |
|||
return c.logPageSMART(req, length) |
|||
case logPageANA: |
|||
return c.logPageANA(req, length) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// logPageError returns an empty error log page.
|
|||
func (c *Controller) logPageError(req *Request, length uint32) error { |
|||
if length > 64 { |
|||
length = 64 |
|||
} |
|||
req.c2hData = make([]byte, length) |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// logPageSMART returns a 512-byte SMART/Health log.
|
|||
func (c *Controller) logPageSMART(req *Request, length uint32) error { |
|||
if length > 512 { |
|||
length = 512 |
|||
} |
|||
buf := make([]byte, 512) |
|||
|
|||
// Critical Warning - offset 0: 0 = no warnings
|
|||
buf[0] = 0 |
|||
|
|||
// Composite Temperature - offset 1-2: 0 (not implemented)
|
|||
binary.LittleEndian.PutUint16(buf[1:], 0) |
|||
|
|||
// Available Spare - offset 3: 100%
|
|||
buf[3] = 100 |
|||
|
|||
// Available Spare Threshold - offset 4: 10%
|
|||
buf[4] = 10 |
|||
|
|||
// Percentage Used - offset 5: 0%
|
|||
buf[5] = 0 |
|||
|
|||
req.c2hData = buf[:length] |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// logPageANA returns the ANA log page with a single group.
|
|||
func (c *Controller) logPageANA(req *Request, length uint32) error { |
|||
// ANA log page format (32 bytes for single group):
|
|||
// [0:8] CHGCNT (uint64)
|
|||
// [8:10] NGRPS = 1 (uint16)
|
|||
// [10:16] reserved
|
|||
// Group descriptor:
|
|||
// [16:20] ANAGRPID = 1 (uint32)
|
|||
// [20:24] NNSID = 1 (uint32)
|
|||
// [24:32] Change Count (uint64)
|
|||
// [32] ANA State
|
|||
// [33:36] reserved
|
|||
// [36:40] NSID = 1 (uint32)
|
|||
const anaLogSize = 40 |
|||
|
|||
buf := make([]byte, anaLogSize) |
|||
|
|||
// CHGCNT
|
|||
binary.LittleEndian.PutUint64(buf[0:], c.anaChangeCount()) |
|||
|
|||
// NGRPS
|
|||
binary.LittleEndian.PutUint16(buf[8:], 1) |
|||
|
|||
// Group descriptor
|
|||
binary.LittleEndian.PutUint32(buf[16:], 1) // ANAGRPID=1
|
|||
binary.LittleEndian.PutUint32(buf[20:], 1) // NNSID=1
|
|||
binary.LittleEndian.PutUint64(buf[24:], c.anaChangeCount()) // chgcnt
|
|||
buf[32] = c.anaState() // ANA state
|
|||
binary.LittleEndian.PutUint32(buf[36:], 1) // NSID=1
|
|||
|
|||
if length > anaLogSize { |
|||
length = anaLogSize |
|||
} |
|||
req.c2hData = buf[:length] |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// anaState returns the current ANA state based on the subsystem's device.
|
|||
func (c *Controller) anaState() uint8 { |
|||
if c.subsystem == nil { |
|||
return anaInaccessible |
|||
} |
|||
if prov, ok := c.subsystem.Dev.(ANAProvider); ok { |
|||
return prov.ANAState() |
|||
} |
|||
// Default: if healthy → optimized
|
|||
if c.subsystem.Dev.IsHealthy() { |
|||
return anaOptimized |
|||
} |
|||
return anaInaccessible |
|||
} |
|||
|
|||
// anaChangeCount returns a monotonic ANA change counter.
|
|||
// For MVP, we use 1 as a constant (no dynamic role changes tracked).
|
|||
func (c *Controller) anaChangeCount() uint64 { |
|||
return 1 |
|||
} |
|||
|
|||
// handleKeepAlive resets the KATO timer and returns success.
|
|||
func (c *Controller) handleKeepAlive(req *Request) error { |
|||
c.resetKATO() |
|||
return c.sendResponse(req) |
|||
} |
|||
@ -0,0 +1,354 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"fmt" |
|||
"io" |
|||
"log" |
|||
"net" |
|||
"sync" |
|||
"sync/atomic" |
|||
"time" |
|||
) |
|||
|
|||
// controllerState enumerates the lifecycle stages of one controller
// connection. The values are ordered, and code in this package compares them
// with relational operators (for example `c.state < stateICComplete`).
type controllerState int

const (
	stateConnected  controllerState = iota // TCP connected, IC handshake not done yet
	stateICComplete                        // ICReq/ICResp exchange finished
	stateAdminReady                        // admin queue connected
	stateCtrlReady                         // CC.EN=1 and CSTS.RDY=1
	stateIOActive                          // IO queue connected
	stateClosed                            // connection shut down
)
|||
|
|||
// Request represents an in-flight NVMe command being processed.
|
|||
type Request struct { |
|||
capsule CapsuleCommand |
|||
payload []byte // inline data from host (Write commands)
|
|||
resp CapsuleResponse |
|||
c2hData []byte // data to send to host (Read commands)
|
|||
status StatusWord |
|||
} |
|||
|
|||
// Controller handles one NVMe/TCP connection (one queue per connection).
|
|||
type Controller struct { |
|||
mu sync.Mutex |
|||
|
|||
// Session identity
|
|||
conn net.Conn |
|||
in *Reader |
|||
out *Writer |
|||
state controllerState |
|||
closed atomic.Bool |
|||
|
|||
// Queue state (one queue per TCP connection)
|
|||
queueID uint16 |
|||
queueSize uint16 |
|||
sqhd uint16 // Submission Queue Head pointer
|
|||
flowCtlOff bool // CATTR bit2: SQ flow control disabled
|
|||
|
|||
// Controller identity
|
|||
cntlID uint16 |
|||
subNQN string |
|||
|
|||
// Controller registers
|
|||
regCAP uint64 // Controller Capabilities
|
|||
regCC uint32 // Controller Configuration (set by host via PropertySet)
|
|||
regCSTS uint32 // Controller Status (RDY bit)
|
|||
regVS uint32 // Version
|
|||
|
|||
// KeepAlive
|
|||
katoMs uint32 |
|||
katoTimer *time.Timer |
|||
katoMu sync.Mutex |
|||
|
|||
// Async completion (IO queues)
|
|||
waiting chan *Request // pre-allocated request pool
|
|||
completions chan *Request // completed requests to send
|
|||
|
|||
// Backend
|
|||
subsystem *Subsystem |
|||
server *Server |
|||
|
|||
// Features
|
|||
maxIOQueues uint16 |
|||
grantedQueues uint16 |
|||
isAdmin bool // true if this controller owns admin queue (QID=0)
|
|||
|
|||
// Lifecycle
|
|||
wg sync.WaitGroup |
|||
closeOnce sync.Once |
|||
} |
|||
|
|||
// newController creates a controller for the given connection.
|
|||
func newController(conn net.Conn, server *Server) *Controller { |
|||
c := &Controller{ |
|||
conn: conn, |
|||
in: NewReader(conn), |
|||
out: NewWriter(conn), |
|||
state: stateConnected, |
|||
server: server, |
|||
regVS: nvmeVersion14, |
|||
// CAP register: MQES=63 (bits 15:0), CQR=1 (bit 16), TO=30 (bits 31:24, *500ms=15s), CSS bit37=1 (NVM command set)
|
|||
regCAP: uint64(63) | (1 << 16) | (uint64(30) << 24) | (1 << 37), |
|||
maxIOQueues: server.cfg.MaxIOQueues, |
|||
} |
|||
return c |
|||
} |
|||
|
|||
// Serve is the main event loop for this controller connection.
|
|||
func (c *Controller) Serve() error { |
|||
defer c.shutdown() |
|||
|
|||
// IC handshake timeout
|
|||
if err := c.conn.SetReadDeadline(time.Now().Add(10 * time.Second)); err != nil { |
|||
return err |
|||
} |
|||
|
|||
for { |
|||
if c.closed.Load() { |
|||
return nil |
|||
} |
|||
|
|||
hdr, err := c.in.Dequeue() |
|||
if err != nil { |
|||
if err == io.EOF || c.closed.Load() { |
|||
return nil |
|||
} |
|||
return fmt.Errorf("read header: %w", err) |
|||
} |
|||
|
|||
switch hdr.Type { |
|||
case pduICReq: |
|||
if err := c.handleIC(); err != nil { |
|||
return fmt.Errorf("IC handshake: %w", err) |
|||
} |
|||
// Clear read deadline after successful IC
|
|||
if err := c.conn.SetReadDeadline(time.Time{}); err != nil { |
|||
return err |
|||
} |
|||
|
|||
case pduCapsuleCmd: |
|||
if err := c.handleCapsule(); err != nil { |
|||
return fmt.Errorf("capsule: %w", err) |
|||
} |
|||
|
|||
case pduH2CTermReq: |
|||
return nil // host terminated
|
|||
|
|||
default: |
|||
return fmt.Errorf("unexpected PDU type: 0x%x", hdr.Type) |
|||
} |
|||
} |
|||
} |
|||
|
|||
// handleIC processes the IC handshake.
|
|||
func (c *Controller) handleIC() error { |
|||
var req ICRequest |
|||
if err := c.in.Receive(&req); err != nil { |
|||
return err |
|||
} |
|||
|
|||
resp := ICResponse{ |
|||
PDUFormatVersion: 0, |
|||
MaxH2CDataLength: maxH2CDataLen, |
|||
} |
|||
if err := c.out.SendHeaderOnly(pduICResp, &resp, icBodySize); err != nil { |
|||
return err |
|||
} |
|||
|
|||
c.state = stateICComplete |
|||
return nil |
|||
} |
|||
|
|||
// handleCapsule dispatches a CapsuleCmd PDU.
|
|||
func (c *Controller) handleCapsule() error { |
|||
// Reject capsule commands before IC handshake is complete.
|
|||
if c.state < stateICComplete { |
|||
return fmt.Errorf("capsule command before IC handshake") |
|||
} |
|||
|
|||
var capsule CapsuleCommand |
|||
if err := c.in.Receive(&capsule); err != nil { |
|||
return err |
|||
} |
|||
|
|||
// Read optional inline data
|
|||
var payload []byte |
|||
if dataLen := c.in.Length(); dataLen > 0 { |
|||
payload = make([]byte, dataLen) |
|||
if err := c.in.ReceiveData(payload); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
// Advance SQHD
|
|||
c.sqhd++ |
|||
if c.sqhd >= c.queueSize && c.queueSize > 0 { |
|||
c.sqhd = 0 |
|||
} |
|||
|
|||
req := &Request{ |
|||
capsule: capsule, |
|||
payload: payload, |
|||
} |
|||
req.resp.CID = capsule.CID |
|||
req.resp.QueueID = c.queueID |
|||
// SQHD is set in sendResponse/sendC2HDataAndResponse using the
|
|||
// latest c.flowCtlOff value, so Connect responses correctly get
|
|||
// SQHD=0xFFFF when the host requests flowCtlOff via CATTR.
|
|||
req.resp.Status = uint16(StatusSuccess) |
|||
|
|||
if c.queueID == 0 { |
|||
return c.dispatchAdmin(req) |
|||
} |
|||
return c.dispatchIO(req) |
|||
} |
|||
|
|||
// dispatchAdmin handles admin queue commands synchronously.
|
|||
func (c *Controller) dispatchAdmin(req *Request) error { |
|||
capsule := &req.capsule |
|||
|
|||
if capsule.OpCode == adminFabric { |
|||
return c.handleFabricCommand(req) |
|||
} |
|||
|
|||
switch capsule.OpCode { |
|||
case adminIdentify: |
|||
return c.handleIdentify(req) |
|||
case adminSetFeatures: |
|||
return c.handleSetFeatures(req) |
|||
case adminGetFeatures: |
|||
return c.handleGetFeatures(req) |
|||
case adminGetLogPage: |
|||
return c.handleGetLogPage(req) |
|||
case adminKeepAlive: |
|||
return c.handleKeepAlive(req) |
|||
case adminAsyncEvent: |
|||
// Stub: just succeed (don't deliver events in CP10-1)
|
|||
return c.sendResponse(req) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidOpcode) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// dispatchIO handles IO queue commands.
|
|||
func (c *Controller) dispatchIO(req *Request) error { |
|||
capsule := &req.capsule |
|||
|
|||
switch capsule.OpCode { |
|||
case ioRead: |
|||
return c.handleRead(req) |
|||
case ioWrite: |
|||
return c.handleWrite(req) |
|||
case ioFlush: |
|||
return c.handleFlush(req) |
|||
case ioWriteZeros: |
|||
return c.handleWriteZeros(req) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidOpcode) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// sendC2HDataAndResponse sends C2HData PDUs followed by a CapsuleResp.
|
|||
func (c *Controller) sendC2HDataAndResponse(req *Request) error { |
|||
if len(req.c2hData) > 0 { |
|||
data := req.c2hData |
|||
offset := uint32(0) |
|||
chunkSize := uint32(maxH2CDataLen) |
|||
|
|||
for offset < uint32(len(data)) { |
|||
end := offset + chunkSize |
|||
if end > uint32(len(data)) { |
|||
end = uint32(len(data)) |
|||
} |
|||
chunk := data[offset:end] |
|||
|
|||
hdr := C2HDataHeader{ |
|||
CCCID: req.capsule.CID, |
|||
DATAO: offset, |
|||
DATAL: uint32(len(chunk)), |
|||
} |
|||
|
|||
flags := uint8(0) |
|||
if end >= uint32(len(data)) { |
|||
flags = c2hFlagLast |
|||
} |
|||
|
|||
if err := c.out.SendWithData(pduC2HData, flags, &hdr, c2hDataHdrSize, chunk); err != nil { |
|||
return err |
|||
} |
|||
offset = end |
|||
} |
|||
} |
|||
|
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// sendResponse sends a CapsuleResp PDU.
|
|||
// SQHD is set here (not in handleCapsule) so that flowCtlOff changes
|
|||
// made during command dispatch (e.g. Fabric Connect) take effect
|
|||
// on the same response.
|
|||
func (c *Controller) sendResponse(req *Request) error { |
|||
if c.flowCtlOff { |
|||
req.resp.SQHD = 0xFFFF |
|||
} else { |
|||
req.resp.SQHD = c.sqhd |
|||
} |
|||
c.resetKATO() |
|||
return c.out.SendHeaderOnly(pduCapsuleResp, &req.resp, capsuleRespSize) |
|||
} |
|||
|
|||
// ---------- KATO management ----------
|
|||
|
|||
func (c *Controller) startKATO() { |
|||
c.katoMu.Lock() |
|||
defer c.katoMu.Unlock() |
|||
if c.katoMs == 0 { |
|||
return |
|||
} |
|||
d := time.Duration(c.katoMs) * time.Millisecond |
|||
// Add 50% margin per spec recommendation
|
|||
d = d + d/2 |
|||
c.katoTimer = time.AfterFunc(d, func() { |
|||
log.Printf("nvme: KATO expired for cntlid=%d, closing connection", c.cntlID) |
|||
c.conn.Close() |
|||
}) |
|||
} |
|||
|
|||
func (c *Controller) resetKATO() { |
|||
c.katoMu.Lock() |
|||
defer c.katoMu.Unlock() |
|||
if c.katoTimer != nil { |
|||
c.katoTimer.Reset(time.Duration(c.katoMs)*time.Millisecond + time.Duration(c.katoMs)*time.Millisecond/2) |
|||
} |
|||
} |
|||
|
|||
func (c *Controller) stopKATO() { |
|||
c.katoMu.Lock() |
|||
defer c.katoMu.Unlock() |
|||
if c.katoTimer != nil { |
|||
c.katoTimer.Stop() |
|||
c.katoTimer = nil |
|||
} |
|||
} |
|||
|
|||
// ---------- Lifecycle ----------
|
|||
|
|||
func (c *Controller) shutdown() { |
|||
c.closeOnce.Do(func() { |
|||
c.closed.Store(true) |
|||
c.stopKATO() |
|||
c.state = stateClosed |
|||
c.conn.Close() |
|||
if c.server != nil { |
|||
if c.isAdmin && c.cntlID != 0 { |
|||
c.server.unregisterAdmin(c.cntlID) |
|||
} |
|||
c.server.removeSession(c) |
|||
} |
|||
}) |
|||
} |
|||
@ -0,0 +1,300 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"encoding/binary" |
|||
) |
|||
|
|||
// handleFabricCommand dispatches Fabric-specific commands by FCType.
|
|||
func (c *Controller) handleFabricCommand(req *Request) error { |
|||
switch req.capsule.FCType { |
|||
case fcConnect: |
|||
return c.handleConnect(req) |
|||
case fcPropertyGet: |
|||
return c.handlePropertyGet(req) |
|||
case fcPropertySet: |
|||
return c.handlePropertySet(req) |
|||
case fcDisconnect: |
|||
return c.handleDisconnect(req) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// handleConnect processes a Fabric Connect command.
|
|||
func (c *Controller) handleConnect(req *Request) error { |
|||
capsule := &req.capsule |
|||
|
|||
// Parse QueueID, QueueSize, KATO, CATTR from capsule dwords.
|
|||
// Connect command layout (CDW10-CDW12):
|
|||
// CDW10[15:0]=RECFM, CDW10[31:16]=QID
|
|||
// CDW11[15:0]=SQSIZE, CDW11[23:16]=CATTR
|
|||
// CDW12=KATO
|
|||
queueID := uint16(capsule.D10 >> 16) |
|||
queueSize := uint16(capsule.D11&0xFFFF) + 1 // SQSIZE is 0-based
|
|||
cattr := uint8(capsule.D11 >> 16) |
|||
kato := capsule.D12 |
|||
|
|||
// Parse ConnectData from payload
|
|||
if len(req.payload) < connectDataSize { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
var cd ConnectData |
|||
cd.Unmarshal(req.payload) |
|||
|
|||
if queueID == 0 { |
|||
// Admin queue connect
|
|||
sub := c.server.findSubsystem(cd.SubNQN) |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
c.subsystem = sub |
|||
c.subNQN = cd.SubNQN |
|||
c.queueID = 0 |
|||
c.queueSize = queueSize |
|||
c.cntlID = c.server.allocCNTLID() |
|||
c.katoMs = kato |
|||
c.flowCtlOff = (cattr & 0x04) != 0 |
|||
c.state = stateAdminReady |
|||
c.isAdmin = true |
|||
|
|||
// Register admin session so IO queue connections can find us.
|
|||
c.server.registerAdmin(&adminSession{ |
|||
cntlID: c.cntlID, |
|||
subsystem: sub, |
|||
subNQN: cd.SubNQN, |
|||
hostNQN: cd.HostNQN, |
|||
regCAP: c.regCAP, |
|||
regCC: c.regCC, |
|||
regCSTS: c.regCSTS, |
|||
regVS: c.regVS, |
|||
katoMs: kato, |
|||
}) |
|||
|
|||
// Return CNTLID in DW0
|
|||
req.resp.DW0 = uint32(c.cntlID) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// IO queue connect — look up admin session from server registry.
|
|||
// IO queues arrive on separate TCP connections with fresh Controllers,
|
|||
// so we must find the admin session by CNTLID from the server.
|
|||
admin := c.server.lookupAdmin(cd.CNTLID) |
|||
if admin == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// Validate SubNQN and HostNQN match the admin session.
|
|||
if cd.SubNQN != admin.subNQN { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
if cd.HostNQN != admin.hostNQN { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
c.cntlID = cd.CNTLID |
|||
c.subsystem = admin.subsystem |
|||
c.subNQN = admin.subNQN |
|||
c.queueID = queueID |
|||
c.queueSize = queueSize |
|||
c.flowCtlOff = (cattr & 0x04) != 0 |
|||
c.state = stateIOActive |
|||
|
|||
req.resp.DW0 = uint32(c.cntlID) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// handlePropertyGet returns a controller register value.
|
|||
func (c *Controller) handlePropertyGet(req *Request) error { |
|||
// Property offset in D10 (bits 31:0, but only lower bits used)
|
|||
offset := req.capsule.D10 |
|||
// Attrib in D11 bit 0: 0=4byte, 1=8byte
|
|||
size8 := (req.capsule.D11 & 1) != 0 |
|||
|
|||
var val uint64 |
|||
switch offset { |
|||
case propCAP: |
|||
val = c.regCAP |
|||
case propVS: |
|||
val = uint64(c.regVS) |
|||
case propCC: |
|||
val = uint64(c.regCC) |
|||
case propCSTS: |
|||
val = uint64(c.regCSTS) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
if size8 { |
|||
// 8-byte value in DW0+DW1
|
|||
req.resp.DW0 = uint32(val) |
|||
req.resp.DW1 = uint32(val >> 32) |
|||
} else { |
|||
req.resp.DW0 = uint32(val) |
|||
} |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// handlePropertySet handles controller register writes.
|
|||
func (c *Controller) handlePropertySet(req *Request) error { |
|||
offset := req.capsule.D10 |
|||
value := uint64(req.capsule.D14) | uint64(req.capsule.D15)<<32 |
|||
|
|||
switch offset { |
|||
case propCC: |
|||
c.regCC = uint32(value) |
|||
// Check CC.EN (bit 0)
|
|||
if c.regCC&1 != 0 { |
|||
c.regCSTS |= 1 // Set CSTS.RDY
|
|||
c.state = stateCtrlReady |
|||
if c.katoMs > 0 { |
|||
c.startKATO() |
|||
} |
|||
} else { |
|||
c.regCSTS &^= 1 // Clear CSTS.RDY
|
|||
} |
|||
default: |
|||
// Ignore writes to other registers
|
|||
} |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// handleDisconnect processes a Fabric Disconnect.
|
|||
func (c *Controller) handleDisconnect(req *Request) error { |
|||
if err := c.sendResponse(req); err != nil { |
|||
return err |
|||
} |
|||
c.shutdown() |
|||
return nil |
|||
} |
|||
|
|||
// ---------- Subsystem ----------
|
|||
|
|||
// Subsystem represents an NVMe subsystem backed by a BlockDevice.
|
|||
type Subsystem struct { |
|||
NQN string |
|||
Dev BlockDevice |
|||
NGUID [16]byte // Namespace GUID
|
|||
} |
|||
|
|||
// BlockDevice is the storage abstraction the NVMe target is written against.
// Per the original author's note it has the same shape as iscsi.BlockDevice.
type BlockDevice interface {
	// ReadAt reads `length` starting at logical block address lba.
	ReadAt(lba uint64, length uint32) ([]byte, error)
	// WriteAt writes data starting at logical block address lba.
	WriteAt(lba uint64, data []byte) error
	// Trim discards the range described by lba and length.
	Trim(lba uint64, length uint32) error
	// SyncCache flushes the device's cache.
	SyncCache() error
	// BlockSize reports the device's block size.
	BlockSize() uint32
	// VolumeSize reports the device's total size.
	VolumeSize() uint64
	// IsHealthy reports whether the device is currently healthy.
	IsHealthy() bool
}
|||
|
|||
// ANAProvider is an optional companion to BlockDevice for devices that can
// report Asymmetric Namespace Access information. It does not embed
// BlockDevice; callers discover it with a type assertion on the device.
type ANAProvider interface {
	// ANAState reports the current ANA state.
	ANAState() uint8
	// ANAGroupID reports the ANA group the namespace belongs to.
	ANAGroupID() uint16
	// DeviceNGUID reports the 16-byte namespace GUID.
	DeviceNGUID() [16]byte
}
|||
|
|||
// allocCNTLID allocates a new controller ID from the server.
|
|||
func (s *Server) allocCNTLID() uint16 { |
|||
return uint16(s.nextCNTLID.Add(1)) |
|||
} |
|||
|
|||
// findSubsystem looks up a subsystem by NQN.
|
|||
func (s *Server) findSubsystem(nqn string) *Subsystem { |
|||
s.mu.RLock() |
|||
defer s.mu.RUnlock() |
|||
sub, ok := s.subsystems[nqn] |
|||
if !ok { |
|||
return nil |
|||
} |
|||
return sub |
|||
} |
|||
|
|||
// ---------- ConnectData field access helpers ----------
|
|||
|
|||
// connectQueueID extracts the QueueID from a Connect capsule D10.
|
|||
func connectQueueID(capsule *CapsuleCommand) uint16 { |
|||
return uint16(capsule.D10 >> 16) |
|||
} |
|||
|
|||
// connectQueueSize extracts the QueueSize from a Connect capsule D11 (0-based → +1).
|
|||
func connectQueueSize(capsule *CapsuleCommand) uint16 { |
|||
return uint16(capsule.D11&0xFFFF) + 1 |
|||
} |
|||
|
|||
// connectKATO extracts the KeepAlive timeout from a Connect capsule D12.
|
|||
func connectKATO(capsule *CapsuleCommand) uint32 { |
|||
return capsule.D12 |
|||
} |
|||
|
|||
// PropertySet value extraction: the go-nvme reference puts value in D12/D13,
|
|||
// but NVMe spec actually uses CDW14/CDW15 for PropertySet. We handle both.
|
|||
func propertySetValue(capsule *CapsuleCommand) uint64 { |
|||
return uint64(capsule.D14) | uint64(capsule.D15)<<32 |
|||
} |
|||
|
|||
// propertyGetSize returns true if the PropertyGet requests an 8-byte value.
|
|||
func propertyGetSize8(capsule *CapsuleCommand) bool { |
|||
return (capsule.D11 & 1) != 0 |
|||
} |
|||
|
|||
// propertyGetOffset returns the register offset for PropertyGet.
|
|||
func propertyGetOffset(capsule *CapsuleCommand) uint32 { |
|||
return capsule.D10 |
|||
} |
|||
|
|||
// ---------- ConnectData marshal helpers for tests ----------
|
|||
|
|||
func marshalConnectData(cd *ConnectData) []byte { |
|||
buf := make([]byte, connectDataSize) |
|||
cd.Marshal(buf) |
|||
return buf |
|||
} |
|||
|
|||
func makeConnectCapsule(queueID, queueSize uint16, kato uint32, fcType uint8) CapsuleCommand { |
|||
return CapsuleCommand{ |
|||
OpCode: adminFabric, |
|||
FCType: fcType, |
|||
D10: uint32(queueID) << 16, |
|||
D11: uint32(queueSize - 1), // 0-based
|
|||
D12: kato, |
|||
} |
|||
} |
|||
|
|||
// makePropertyGetCapsule creates a PropertyGet capsule for the given register offset.
|
|||
func makePropertyGetCapsule(offset uint32, size8 bool) CapsuleCommand { |
|||
c := CapsuleCommand{ |
|||
OpCode: adminFabric, |
|||
FCType: fcPropertyGet, |
|||
D10: offset, |
|||
} |
|||
if size8 { |
|||
c.D11 = 1 |
|||
} |
|||
return c |
|||
} |
|||
|
|||
// makePropertySetCapsule creates a PropertySet capsule.
|
|||
func makePropertySetCapsule(offset uint32, value uint64) CapsuleCommand { |
|||
return CapsuleCommand{ |
|||
OpCode: adminFabric, |
|||
FCType: fcPropertySet, |
|||
D10: offset, |
|||
D14: uint32(value), |
|||
D15: uint32(value >> 32), |
|||
} |
|||
} |
|||
|
|||
// putCNTLID writes cntlid as a little-endian uint16 at byte offset 16 of
// buf, the position of the controller ID within ConnectData. buf must be at
// least 18 bytes long; a shorter buffer causes a panic.
func putCNTLID(buf []byte, cntlid uint16) {
	const cntlIDOffset = 16
	field := buf[cntlIDOffset:]
	binary.LittleEndian.PutUint16(field, cntlid)
}
|||
@ -0,0 +1,250 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"encoding/binary" |
|||
"math/bits" |
|||
) |
|||
|
|||
// identifySize is the size in bytes of the buffer that identifyController
// allocates for its Identify data structure.
const identifySize = 4096
|||
|
|||
// handleIdentify dispatches Identify commands by CNS type.
|
|||
func (c *Controller) handleIdentify(req *Request) error { |
|||
cns := uint8(req.capsule.D10 & 0xFF) |
|||
|
|||
switch cns { |
|||
case cnsIdentifyController: |
|||
return c.identifyController(req) |
|||
case cnsIdentifyNamespace: |
|||
return c.identifyNamespace(req) |
|||
case cnsActiveNSList: |
|||
return c.identifyActiveNSList(req) |
|||
case cnsNSDescriptorList: |
|||
return c.identifyNSDescriptors(req) |
|||
default: |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
// identifyController returns the 4096-byte Identify Controller data structure.
|
|||
func (c *Controller) identifyController(req *Request) error { |
|||
buf := make([]byte, identifySize) |
|||
|
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// VID (PCI Vendor ID) - use 0 for software target
|
|||
// SSVID - 0
|
|||
|
|||
// Serial Number (offset 4, 20 bytes, space-padded ASCII)
|
|||
copyPadded(buf[4:24], "SWF00001") |
|||
|
|||
// Model Number (offset 24, 40 bytes, space-padded ASCII)
|
|||
copyPadded(buf[24:64], "SeaweedFS BlockVol") |
|||
|
|||
// Firmware Revision (offset 64, 8 bytes, space-padded ASCII)
|
|||
copyPadded(buf[64:72], "0001") |
|||
|
|||
// RAB (Recommended Arbitration Burst) - offset 72
|
|||
buf[72] = 6 |
|||
|
|||
// IEEE OUI - offset 73-75 (3 bytes, 0 for software)
|
|||
|
|||
// CMIC (Controller Multi-Path I/O Capabilities) - offset 76
|
|||
// bit 3: ANA reporting supported
|
|||
buf[76] = 0x08 |
|||
|
|||
// MDTS (Maximum Data Transfer Size) - offset 77
|
|||
// 2^MDTS * 4096 = max transfer. MDTS=3 → 32KB
|
|||
buf[77] = 3 |
|||
|
|||
// CNTLID (Controller ID) - offset 78-79
|
|||
binary.LittleEndian.PutUint16(buf[78:], c.cntlID) |
|||
|
|||
// Version - offset 80-83
|
|||
binary.LittleEndian.PutUint32(buf[80:], nvmeVersion14) |
|||
|
|||
// OACS (Optional Admin Command Support) - offset 256-257
|
|||
// 0 = no optional admin commands
|
|||
binary.LittleEndian.PutUint16(buf[256:], 0) |
|||
|
|||
// ACRTD (Abort Command Limit) - offset 258
|
|||
buf[258] = 3 |
|||
|
|||
// AERTL (Async Event Request Limit) - offset 259
|
|||
buf[259] = 3 |
|||
|
|||
// FRMW (Firmware Updates) - offset 260
|
|||
buf[260] = 0x02 // slot 1 read-only
|
|||
|
|||
// LPA (Log Page Attributes) - offset 261
|
|||
buf[261] = 0 |
|||
|
|||
// ELPE (Error Log Page Entries) - offset 262
|
|||
buf[262] = 0 // 1 entry (0-based)
|
|||
|
|||
// SQES (Submission Queue Entry Size) - offset 512
|
|||
// min=6 (2^6=64 bytes), max=6
|
|||
buf[512] = 0x66 |
|||
|
|||
// CQES (Completion Queue Entry Size) - offset 513
|
|||
// min=4 (2^4=16 bytes), max=4
|
|||
buf[513] = 0x44 |
|||
|
|||
// MAXCMD - offset 514-515
|
|||
binary.LittleEndian.PutUint16(buf[514:], 64) |
|||
|
|||
// NN (Number of Namespaces) - offset 516-519
|
|||
binary.LittleEndian.PutUint32(buf[516:], 1) |
|||
|
|||
// ONCS (Optional NVM Command Support) - offset 520-521
|
|||
// bit 3: WriteZeros, bit 2: DatasetMgmt (Trim)
|
|||
binary.LittleEndian.PutUint16(buf[520:], 0x0C) |
|||
|
|||
// ANACAP (ANA Capabilities) - offset 522
|
|||
// bit 3: reports Optimized state
|
|||
buf[522] = 0x08 |
|||
|
|||
// ANAGRPMAX - offset 524-527
|
|||
binary.LittleEndian.PutUint32(buf[524:], 1) |
|||
|
|||
// NANAGRPID - offset 528-531
|
|||
binary.LittleEndian.PutUint32(buf[528:], 1) |
|||
|
|||
// VWC (Volatile Write Cache) - offset 525
|
|||
// bit 0: volatile write cache present → Flush required
|
|||
buf[525] = 0x01 |
|||
|
|||
// SGLS (SGL Support) - offset 536-539
|
|||
// bit 0: SGLs supported (required for NVMe/TCP)
|
|||
binary.LittleEndian.PutUint32(buf[536:], 0x01) |
|||
|
|||
// SubNQN (Subsystem NQN) - offset 768, 256 bytes
|
|||
copyPadded(buf[768:1024], sub.NQN) |
|||
|
|||
// IOCCSZ (I/O Queue Command Capsule Supported Size) - offset 1792-1795
|
|||
// In 16-byte units: 64/16 = 4
|
|||
binary.LittleEndian.PutUint32(buf[1792:], 4) |
|||
|
|||
// IORCSZ (I/O Queue Response Capsule Supported Size) - offset 1796-1799
|
|||
// In 16-byte units: 16/16 = 1
|
|||
binary.LittleEndian.PutUint32(buf[1796:], 1) |
|||
|
|||
// ICDOFF (In Capsule Data Offset) - offset 1800-1801
|
|||
// 0 means inline data immediately follows SQE in capsule
|
|||
binary.LittleEndian.PutUint16(buf[1800:], 0) |
|||
|
|||
// FCATT (Fabrics Controller Attributes) - offset 1802
|
|||
// bit 0: 0 = I/O controller (not discovery)
|
|||
buf[1802] = 0 |
|||
|
|||
// MSDBD (Maximum SGL Data Block Descriptors) - offset 1803
|
|||
buf[1803] = 1 |
|||
|
|||
// OFCS (Optional Fabric Commands Supported) - offset 1804-1805
|
|||
// bit 0: Disconnect command supported
|
|||
binary.LittleEndian.PutUint16(buf[1804:], 0x01) |
|||
|
|||
req.c2hData = buf |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// identifyNamespace returns the 4096-byte Identify Namespace data for NSID=1.
|
|||
func (c *Controller) identifyNamespace(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
dev := sub.Dev |
|||
blockSize := dev.BlockSize() |
|||
nsze := dev.VolumeSize() / uint64(blockSize) |
|||
|
|||
buf := make([]byte, identifySize) |
|||
|
|||
// NSZE (Namespace Size in blocks) - offset 0-7
|
|||
binary.LittleEndian.PutUint64(buf[0:], nsze) |
|||
|
|||
// NCAP (Namespace Capacity) - offset 8-15
|
|||
binary.LittleEndian.PutUint64(buf[8:], nsze) |
|||
|
|||
// NUSE (Namespace Utilization) - offset 16-23
|
|||
binary.LittleEndian.PutUint64(buf[16:], nsze) |
|||
|
|||
// NSFEAT (Namespace Features) - offset 24
|
|||
// bit 0: thin provisioning (supports Trim)
|
|||
buf[24] = 0x01 |
|||
|
|||
// NLBAF (Number of LBA Formats minus 1) - offset 25
|
|||
buf[25] = 0 // one format
|
|||
|
|||
// FLBAS (Formatted LBA Size) - offset 26
|
|||
// bits 3:0 = LBA format index (0)
|
|||
buf[26] = 0 |
|||
|
|||
// MC (Metadata Capabilities) - offset 27
|
|||
buf[27] = 0 |
|||
|
|||
// DLFEAT (Deallocate Logical Block Features) - offset 28
|
|||
// bit 2: Deallocated blocks return zeros on read
|
|||
buf[28] = 0x04 |
|||
|
|||
// NGUID (Namespace Globally Unique Identifier) - offset 104-119 (16 bytes)
|
|||
copy(buf[104:120], sub.NGUID[:]) |
|||
|
|||
// LBAF[0] (LBA Format 0) - offset 128-131
|
|||
// bits 23:16 = LBADS (log2 of block size)
|
|||
lbads := uint8(bits.TrailingZeros32(blockSize)) |
|||
binary.LittleEndian.PutUint32(buf[128:], uint32(lbads)<<16) |
|||
|
|||
// ANAGRPID (ANA Group Identifier) - offset 92-95
|
|||
binary.LittleEndian.PutUint32(buf[92:], 1) |
|||
|
|||
req.c2hData = buf |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// identifyActiveNSList returns the list of active namespace IDs (just NSID=1).
|
|||
func (c *Controller) identifyActiveNSList(req *Request) error { |
|||
buf := make([]byte, identifySize) |
|||
// Single namespace: NSID=1
|
|||
binary.LittleEndian.PutUint32(buf[0:], 1) |
|||
|
|||
req.c2hData = buf |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// identifyNSDescriptors returns namespace descriptor list for NSID=1.
|
|||
func (c *Controller) identifyNSDescriptors(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
buf := make([]byte, identifySize) |
|||
off := 0 |
|||
|
|||
// NGUID descriptor (type=0x02, length=16)
|
|||
buf[off] = 0x02 // NIDT: NGUID
|
|||
off++ |
|||
buf[off] = 16 // NIDL: 16 bytes
|
|||
off++ |
|||
off += 2 // reserved
|
|||
copy(buf[off:off+16], sub.NGUID[:]) |
|||
|
|||
req.c2hData = buf |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// copyPadded fills dst with src, truncating src if it is longer than dst
// and filling whatever space is left over with ASCII spaces.
func copyPadded(dst []byte, src string) {
	rest := dst[copy(dst, src):]
	for i := range rest {
		rest[i] = ' '
	}
}
|||
@ -0,0 +1,157 @@ |
|||
package nvme |
|||
|
|||
// handleRead processes an NVMe Read command.
|
|||
func (c *Controller) handleRead(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
dev := sub.Dev |
|||
lba := req.capsule.Lba() |
|||
nlb := req.capsule.LbaLength() |
|||
blockSize := dev.BlockSize() |
|||
totalBytes := uint32(nlb) * blockSize |
|||
|
|||
// Bounds check
|
|||
nsze := dev.VolumeSize() / uint64(blockSize) |
|||
if lba+uint64(nlb) > nsze { |
|||
req.resp.Status = uint16(StatusLBAOutOfRange) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
data, err := dev.ReadAt(lba, totalBytes) |
|||
if err != nil { |
|||
req.resp.Status = uint16(mapBlockError(err)) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
req.c2hData = data |
|||
return c.sendC2HDataAndResponse(req) |
|||
} |
|||
|
|||
// handleWrite processes an NVMe Write command with inline data.
|
|||
func (c *Controller) handleWrite(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// Check ANA state (write-gating)
|
|||
if !c.isWriteAllowed() { |
|||
req.resp.Status = uint16(StatusNSNotReady) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// Inline data must be present (DataOffset != 0 in the received PDU).
|
|||
// If DataOffset == 0 for a Write, the host expects R2T flow — reject.
|
|||
if len(req.payload) == 0 { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
dev := sub.Dev |
|||
lba := req.capsule.Lba() |
|||
nlb := req.capsule.LbaLength() |
|||
blockSize := dev.BlockSize() |
|||
|
|||
// Bounds check
|
|||
nsze := dev.VolumeSize() / uint64(blockSize) |
|||
if lba+uint64(nlb) > nsze { |
|||
req.resp.Status = uint16(StatusLBAOutOfRange) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// Validate payload size matches NLB*blockSize.
|
|||
expectedBytes := uint32(nlb) * blockSize |
|||
if uint32(len(req.payload)) != expectedBytes { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
if err := dev.WriteAt(lba, req.payload); err != nil { |
|||
req.resp.Status = uint16(mapBlockError(err)) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// handleFlush processes an NVMe Flush command.
|
|||
func (c *Controller) handleFlush(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
if !c.isWriteAllowed() { |
|||
req.resp.Status = uint16(StatusNSNotReady) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
if err := sub.Dev.SyncCache(); err != nil { |
|||
req.resp.Status = uint16(mapBlockError(err)) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// handleWriteZeros processes an NVMe Write Zeroes command.
|
|||
func (c *Controller) handleWriteZeros(req *Request) error { |
|||
sub := c.subsystem |
|||
if sub == nil { |
|||
req.resp.Status = uint16(StatusInvalidField) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
if !c.isWriteAllowed() { |
|||
req.resp.Status = uint16(StatusNSNotReady) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
dev := sub.Dev |
|||
lba := req.capsule.Lba() |
|||
nlb := req.capsule.LbaLength() |
|||
blockSize := dev.BlockSize() |
|||
totalBytes := uint32(nlb) * blockSize |
|||
|
|||
// Bounds check
|
|||
nsze := dev.VolumeSize() / uint64(blockSize) |
|||
if lba+uint64(nlb) > nsze { |
|||
req.resp.Status = uint16(StatusLBAOutOfRange) |
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// D12 bit 25: DEALLOC — if set, use Trim instead of writing zeros
|
|||
if req.capsule.D12&commandBitDeallocate != 0 { |
|||
if err := dev.Trim(lba, totalBytes); err != nil { |
|||
req.resp.Status = uint16(mapBlockError(err)) |
|||
return c.sendResponse(req) |
|||
} |
|||
} else { |
|||
zeroBuf := make([]byte, totalBytes) |
|||
if err := dev.WriteAt(lba, zeroBuf); err != nil { |
|||
req.resp.Status = uint16(mapBlockError(err)) |
|||
return c.sendResponse(req) |
|||
} |
|||
} |
|||
|
|||
return c.sendResponse(req) |
|||
} |
|||
|
|||
// isWriteAllowed checks if the current ANA state allows writes.
|
|||
func (c *Controller) isWriteAllowed() bool { |
|||
if c.subsystem == nil { |
|||
return false |
|||
} |
|||
if prov, ok := c.subsystem.Dev.(ANAProvider); ok { |
|||
state := prov.ANAState() |
|||
return state == anaOptimized || state == anaNonOptimized |
|||
} |
|||
// No ANA provider: allow if healthy
|
|||
return c.subsystem.Dev.IsHealthy() |
|||
} |
|||
1541
weed/storage/blockvol/nvme/nvme_qa_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
2377
weed/storage/blockvol/nvme/nvme_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,444 @@ |
|||
// Package nvme implements an NVMe/TCP target for SeaweedFS BlockVol.
|
|||
//
|
|||
// This package provides a functionally correct NVMe-oF over TCP transport
|
|||
// that shares the same BlockVol engine, fencing, replication, and failover
|
|||
// as the iSCSI target.
|
|||
package nvme |
|||
|
|||
import ( |
|||
"encoding/binary" |
|||
"fmt" |
|||
) |
|||
|
|||
// ---------- PDU type codes ----------
|
|||
|
|||
// NVMe/TCP PDU type codes.
const (
	pduICReq       uint8 = 0x0 // Initialization Connection Request
	pduICResp      uint8 = 0x1 // Initialization Connection Response
	pduH2CTermReq  uint8 = 0x2 // Host-to-Controller Termination Request
	pduC2HTermReq  uint8 = 0x3 // Controller-to-Host Termination Request
	pduCapsuleCmd  uint8 = 0x4 // NVMe Capsule Command
	pduCapsuleResp uint8 = 0x5 // NVMe Capsule Response
	pduC2HData     uint8 = 0x7 // Controller-to-Host Data Transfer
	pduR2T         uint8 = 0x9 // Ready-to-Transfer
)
|||
|
|||
// ---------- Admin command opcodes ----------
|
|||
|
|||
// Admin command opcodes. adminFabric marks a Fabric command whose concrete
// type is carried in CapsuleCommand.FCType.
const (
	adminFlush       uint8 = 0x00 // NVM Flush (admin context unused here)
	adminGetLogPage  uint8 = 0x02
	adminIdentify    uint8 = 0x06
	adminAbort       uint8 = 0x08
	adminSetFeatures uint8 = 0x09
	adminGetFeatures uint8 = 0x0A
	adminAsyncEvent  uint8 = 0x0C
	adminKeepAlive   uint8 = 0x18
	adminFabric      uint8 = 0x7F // Fabric-specific commands
)
|||
|
|||
// ---------- IO command opcodes ----------
|
|||
|
|||
// IO command opcodes.
const (
	ioFlush      uint8 = 0x00
	ioWrite      uint8 = 0x01
	ioRead       uint8 = 0x02
	ioWriteZeros uint8 = 0x08
)
|||
|
|||
// ---------- Fabric command types (FCType) ----------
|
|||
|
|||
// Fabric command types, stored in CapsuleCommand.FCType when the opcode is
// adminFabric.
const (
	fcPropertySet uint8 = 0x00
	fcConnect     uint8 = 0x01
	fcPropertyGet uint8 = 0x04
	fcDisconnect  uint8 = 0x08
)
|||
|
|||
// ---------- Feature identifiers ----------
|
|||
|
|||
// Feature identifiers.
const (
	fidNumberOfQueues   uint8 = 0x07
	fidAsyncEventConfig uint8 = 0x0B
	fidKeepAliveTimer   uint8 = 0x0F
)
|||
|
|||
// ---------- Identify CNS types ----------
|
|||
|
|||
// Identify CNS values; handleIdentify switches on these using the low byte
// of D10.
const (
	cnsIdentifyNamespace  uint8 = 0x00
	cnsIdentifyController uint8 = 0x01
	cnsActiveNSList       uint8 = 0x02
	cnsNSDescriptorList   uint8 = 0x03
)
|||
|
|||
// ---------- Log page identifiers ----------
|
|||
|
|||
// Log page identifiers.
const (
	logPageError uint8 = 0x01
	logPageSMART uint8 = 0x02
	logPageANA   uint8 = 0x0C
)
|||
|
|||
// ---------- Property register offsets ----------
|
|||
|
|||
// Property register offsets.
const (
	propCAP  uint32 = 0x00 // Controller Capabilities
	propVS   uint32 = 0x08 // Version
	propCC   uint32 = 0x14 // Controller Configuration
	propCSTS uint32 = 0x1C // Controller Status
)
|||
|
|||
// ---------- ANA states ----------
|
|||
|
|||
// ANA states. isWriteAllowed permits writes only in anaOptimized and
// anaNonOptimized.
const (
	anaOptimized      uint8 = 0x01
	anaNonOptimized   uint8 = 0x02
	anaInaccessible   uint8 = 0x03
	anaPersistentLoss uint8 = 0x04
	anaChange         uint8 = 0x0F
)
|||
|
|||
// ---------- Misc constants ----------
|
|||
|
|||
// Sizes, derived header lengths and miscellaneous protocol constants.
const (
	commonHeaderSize = 8
	maxHeaderSize    = 128    // same value as icHdrLen, the largest header length defined below
	maxH2CDataLen    = 0x8000 // 32 KB

	capsuleCmdSize  = 64  // CapsuleCommand specific header size (after CommonHeader)
	capsuleRespSize = 16  // CapsuleResponse specific header size
	c2hDataHdrSize  = 16  // C2HDataHeader specific header size
	icBodySize      = 120 // ICReq/ICResp body size (after CommonHeader)
	connectDataSize = 1024

	// Total header lengths including CommonHeader
	capsuleCmdHdrLen  = commonHeaderSize + capsuleCmdSize  // 72
	capsuleRespHdrLen = commonHeaderSize + capsuleRespSize // 24
	c2hDataHdrLen     = commonHeaderSize + c2hDataHdrSize  // 24
	icHdrLen          = commonHeaderSize + icBodySize      // 128

	// commandBitDeallocate is bit 25 of D12; handleWriteZeros tests it to
	// choose Trim over writing zeros.
	commandBitDeallocate = 1 << 25

	nvmeVersion14 uint32 = 0x00010400 // NVMe 1.4

	// C2HData flags
	c2hFlagLast uint8 = 0x04
)
|||
|
|||
// ---------- CommonHeader (8 bytes) ----------
|
|||
|
|||
// CommonHeader is the 8-byte preamble that starts every NVMe/TCP PDU:
// four single-byte fields followed by a little-endian 32-bit DataLength.
type CommonHeader struct {
	Type         uint8
	Flags        uint8
	HeaderLength uint8
	DataOffset   uint8
	DataLength   uint32
}

// Marshal encodes the header into the first 8 bytes of buf.
func (h *CommonHeader) Marshal(buf []byte) {
	buf[0], buf[1], buf[2], buf[3] = h.Type, h.Flags, h.HeaderLength, h.DataOffset
	binary.LittleEndian.PutUint32(buf[4:], h.DataLength)
}

// Unmarshal decodes the header from the first 8 bytes of buf.
func (h *CommonHeader) Unmarshal(buf []byte) {
	h.Type, h.Flags, h.HeaderLength, h.DataOffset = buf[0], buf[1], buf[2], buf[3]
	h.DataLength = binary.LittleEndian.Uint32(buf[4:])
}

// String renders the type, header length, data offset and data length for
// logging; Flags is not included.
func (h *CommonHeader) String() string {
	const layout = "PDU{type=0x%x hlen=%d doff=%d dlen=%d}"
	return fmt.Sprintf(layout, h.Type, h.HeaderLength, h.DataOffset, h.DataLength)
}
|||
|
|||
// ---------- PDU interface ----------
|
|||
|
|||
// PDU is the interface for all NVMe/TCP PDU-specific headers.
//
// Marshal encodes the value into the given buffer and Unmarshal decodes it
// from the given buffer. Neither method returns an error, so the caller is
// responsible for passing a buffer that is large enough.
type PDU interface {
	Marshal([]byte)
	Unmarshal([]byte)
}
|||
|
|||
// ---------- ICRequest (120-byte body) ----------
|
|||
|
|||
// ICRequest is the host-to-controller initialization request.
|
|||
type ICRequest struct { |
|||
PDUFormatVersion uint16 |
|||
PDUDataAlignment uint8 |
|||
PDUDataDigest uint8 |
|||
PDUMaxR2T uint32 |
|||
// remaining 112 bytes reserved
|
|||
} |
|||
|
|||
func (r *ICRequest) Marshal(buf []byte) { |
|||
// zero out the full 120-byte body
|
|||
for i := range buf[:icBodySize] { |
|||
buf[i] = 0 |
|||
} |
|||
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) |
|||
buf[2] = r.PDUDataAlignment |
|||
buf[3] = r.PDUDataDigest |
|||
binary.LittleEndian.PutUint32(buf[4:], r.PDUMaxR2T) |
|||
} |
|||
|
|||
func (r *ICRequest) Unmarshal(buf []byte) { |
|||
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) |
|||
r.PDUDataAlignment = buf[2] |
|||
r.PDUDataDigest = buf[3] |
|||
r.PDUMaxR2T = binary.LittleEndian.Uint32(buf[4:]) |
|||
} |
|||
|
|||
// ---------- ICResponse (120-byte body) ----------
|
|||
|
|||
// ICResponse is the controller-to-host initialization response.
|
|||
type ICResponse struct { |
|||
PDUFormatVersion uint16 |
|||
PDUDataAlignment uint8 |
|||
PDUDataDigest uint8 |
|||
MaxH2CDataLength uint32 |
|||
// remaining 112 bytes reserved
|
|||
} |
|||
|
|||
func (r *ICResponse) Marshal(buf []byte) { |
|||
for i := range buf[:icBodySize] { |
|||
buf[i] = 0 |
|||
} |
|||
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) |
|||
buf[2] = r.PDUDataAlignment |
|||
buf[3] = r.PDUDataDigest |
|||
binary.LittleEndian.PutUint32(buf[4:], r.MaxH2CDataLength) |
|||
} |
|||
|
|||
func (r *ICResponse) Unmarshal(buf []byte) { |
|||
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) |
|||
r.PDUDataAlignment = buf[2] |
|||
r.PDUDataDigest = buf[3] |
|||
r.MaxH2CDataLength = binary.LittleEndian.Uint32(buf[4:]) |
|||
} |
|||
|
|||
// ---------- CapsuleCommand (64-byte specific header) ----------
|
|||
|
|||
// CapsuleCommand is the 64-byte NVMe command capsule. FCType and NSID both
// decode from the capsule bytes starting at offset 4: FCType is byte 4
// alone and NSID is the 32-bit word at bytes 4-7.
type CapsuleCommand struct {
	OpCode uint8
	PRP    uint8
	CID    uint16
	FCType uint8    // Fabric command type (only for OpCode=0x7F)
	NSID   uint32   // Namespace ID (bytes 4-7 of NVMe SQE after opcode/flags/CID)
	DPTR   [16]byte // Data pointer
	D10    uint32
	D11    uint32
	D12    uint32
	D13    uint32
	D14    uint32
	D15    uint32
}

// Lba returns the 64-bit starting LBA of a Read/Write command, with D10 as
// the low word and D11 as the high word.
func (c *CapsuleCommand) Lba() uint64 {
	low, high := uint64(c.D10), uint64(c.D11)
	return high<<32 | low
}

// LbaLength returns the number of logical blocks. The low 16 bits of D12
// hold a 0-based count, so the result ranges from 1 to 65536.
func (c *CapsuleCommand) LbaLength() uint32 {
	zeroBased := c.D12 & 0xFFFF
	return zeroBased + 1
}
|||
|
|||
func (c *CapsuleCommand) Marshal(buf []byte) { |
|||
for i := range buf[:capsuleCmdSize] { |
|||
buf[i] = 0 |
|||
} |
|||
buf[0] = c.OpCode |
|||
buf[1] = c.PRP |
|||
binary.LittleEndian.PutUint16(buf[2:], c.CID) |
|||
// Bytes 4-7: NSID for normal commands, FCType at byte 4 for Fabric (0x7F).
|
|||
// They share the same offset per NVMe spec.
|
|||
if c.OpCode == adminFabric { |
|||
buf[4] = c.FCType |
|||
} else { |
|||
binary.LittleEndian.PutUint32(buf[4:], c.NSID) |
|||
} |
|||
copy(buf[24:40], c.DPTR[:]) |
|||
binary.LittleEndian.PutUint32(buf[40:], c.D10) |
|||
binary.LittleEndian.PutUint32(buf[44:], c.D11) |
|||
binary.LittleEndian.PutUint32(buf[48:], c.D12) |
|||
binary.LittleEndian.PutUint32(buf[52:], c.D13) |
|||
binary.LittleEndian.PutUint32(buf[56:], c.D14) |
|||
binary.LittleEndian.PutUint32(buf[60:], c.D15) |
|||
} |
|||
|
|||
func (c *CapsuleCommand) Unmarshal(buf []byte) { |
|||
c.OpCode = buf[0] |
|||
c.PRP = buf[1] |
|||
c.CID = binary.LittleEndian.Uint16(buf[2:]) |
|||
c.FCType = buf[4] |
|||
c.NSID = binary.LittleEndian.Uint32(buf[4:]) |
|||
copy(c.DPTR[:], buf[24:40]) |
|||
c.D10 = binary.LittleEndian.Uint32(buf[40:]) |
|||
c.D11 = binary.LittleEndian.Uint32(buf[44:]) |
|||
c.D12 = binary.LittleEndian.Uint32(buf[48:]) |
|||
c.D13 = binary.LittleEndian.Uint32(buf[52:]) |
|||
c.D14 = binary.LittleEndian.Uint32(buf[56:]) |
|||
c.D15 = binary.LittleEndian.Uint32(buf[60:]) |
|||
} |
|||
|
|||
func (c *CapsuleCommand) String() string { |
|||
return fmt.Sprintf("CapsuleCmd{op=0x%02x cid=%d nsid=%d}", c.OpCode, c.CID, c.NSID) |
|||
} |
|||
|
|||
// ---------- CapsuleResponse (16-byte specific header) ----------
|
|||
|
|||
// CapsuleResponse is the 16-byte NVMe completion queue entry carried in a
// Capsule Response PDU. All fields are little-endian.
type CapsuleResponse struct {
	DW0     uint32 // Command-specific DWord 0 (also FabricResponse bytes 0-3)
	DW1     uint32 // Command-specific DWord 1 (also FabricResponse bytes 4-7)
	SQHD    uint16 // Submission Queue Head Pointer
	QueueID uint16
	CID     uint16
	Status  uint16 // Status field: DNR(15) | More(14) | SCT(11:9) | SC(8:1) | P(0)
}

// Marshal encodes the completion entry into the first 16 bytes of buf.
func (r *CapsuleResponse) Marshal(buf []byte) {
	le := binary.LittleEndian
	le.PutUint32(buf[0:], r.DW0)
	le.PutUint32(buf[4:], r.DW1)
	le.PutUint16(buf[8:], r.SQHD)
	le.PutUint16(buf[10:], r.QueueID)
	le.PutUint16(buf[12:], r.CID)
	le.PutUint16(buf[14:], r.Status)
}

// Unmarshal decodes the completion entry from the first 16 bytes of buf.
func (r *CapsuleResponse) Unmarshal(buf []byte) {
	le := binary.LittleEndian
	r.DW0 = le.Uint32(buf[0:])
	r.DW1 = le.Uint32(buf[4:])
	r.SQHD = le.Uint16(buf[8:])
	r.QueueID = le.Uint16(buf[10:])
	r.CID = le.Uint16(buf[12:])
	r.Status = le.Uint16(buf[14:])
}

// String renders the queue head, queue ID, command ID and raw status for
// logging.
func (r *CapsuleResponse) String() string {
	const layout = "CapsuleResp{sqhd=%d qid=%d cid=%d status=0x%04x}"
	return fmt.Sprintf(layout, r.SQHD, r.QueueID, r.CID, r.Status)
}
|||
|
|||
// ---------- C2HDataHeader (16-byte specific header) ----------
|
|||
|
|||
// C2HDataHeader is the controller-to-host data transfer header.
|
|||
type C2HDataHeader struct { |
|||
CCCID uint16 // Command Capsule CID
|
|||
_ uint16 // reserved
|
|||
DATAO uint32 // Data offset within the total transfer
|
|||
DATAL uint32 // Data length in this PDU
|
|||
_pad uint32 // reserved
|
|||
} |
|||
|
|||
func (h *C2HDataHeader) Marshal(buf []byte) { |
|||
for i := range buf[:c2hDataHdrSize] { |
|||
buf[i] = 0 |
|||
} |
|||
binary.LittleEndian.PutUint16(buf[0:], h.CCCID) |
|||
binary.LittleEndian.PutUint32(buf[4:], h.DATAO) |
|||
binary.LittleEndian.PutUint32(buf[8:], h.DATAL) |
|||
} |
|||
|
|||
func (h *C2HDataHeader) Unmarshal(buf []byte) { |
|||
h.CCCID = binary.LittleEndian.Uint16(buf[0:]) |
|||
h.DATAO = binary.LittleEndian.Uint32(buf[4:]) |
|||
h.DATAL = binary.LittleEndian.Uint32(buf[8:]) |
|||
} |
|||
|
|||
// ---------- ConnectData (1024 bytes, payload of Fabric Connect) ----------
|
|||
|
|||
// ConnectData is the 1024-byte payload sent with a Fabric Connect command.
|
|||
type ConnectData struct { |
|||
HostID [16]byte // Host UUID
|
|||
CNTLID uint16 // Requested controller ID (0xFFFF = new)
|
|||
SubNQN string // Subsystem NQN
|
|||
HostNQN string // Host NQN
|
|||
} |
|||
|
|||
func (d *ConnectData) Marshal(buf []byte) { |
|||
for i := range buf[:connectDataSize] { |
|||
buf[i] = 0 |
|||
} |
|||
copy(buf[0:16], d.HostID[:]) |
|||
binary.LittleEndian.PutUint16(buf[16:], d.CNTLID) |
|||
copyNQN(buf[256:512], d.SubNQN) |
|||
copyNQN(buf[512:768], d.HostNQN) |
|||
} |
|||
|
|||
func (d *ConnectData) Unmarshal(buf []byte) { |
|||
copy(d.HostID[:], buf[0:16]) |
|||
d.CNTLID = binary.LittleEndian.Uint16(buf[16:]) |
|||
d.SubNQN = extractNQN(buf[256:512]) |
|||
d.HostNQN = extractNQN(buf[512:768]) |
|||
} |
|||
|
|||
// copyNQN writes s into the fixed-size field dst as a NUL-terminated string.
//
// At most len(dst)-1 bytes of s are copied, so the final byte always holds a
// terminator even when s is too long for the field. Every byte after the
// copied text is zeroed, so stale content in a reused buffer cannot follow
// the string. An empty dst is left untouched.
func copyNQN(dst []byte, s string) {
	if len(dst) == 0 {
		return
	}

	n := copy(dst[:len(dst)-1], s) // reserve the last byte for the NUL
	for i := n; i < len(dst); i++ {
		dst[i] = 0
	}
}
|||
|
|||
// extractNQN returns the bytes of buf up to, but not including, the first
// NUL byte. If buf contains no NUL the whole buffer is returned.
func extractNQN(buf []byte) string {
	end := 0
	for end < len(buf) && buf[end] != 0 {
		end++
	}
	return string(buf[:end])
}
|||
|
|||
// ---------- Status word encoding ----------
|
|||
|
|||
// StatusWord encodes the 16-bit NVMe completion status:
// DNR(15) | More(14) | SCT(11:9) | SC(8:1) | P(0)
//
// StatusWord = (DNR << 15) | (SCT << 9) | (SC << 1)
type StatusWord uint16

// MakeStatus constructs a status word from SCT, SC, and DNR flag.
//
// SCT is a 3-bit field, so only the low three bits of sct are used. Without
// the mask an out-of-range value would spill into the upper bits of the
// word and could set More or DNR by accident.
func MakeStatus(sct, sc uint8, dnr bool) StatusWord {
	const sctMask = 0x07

	w := uint16(sct&sctMask)<<9 | uint16(sc)<<1
	if dnr {
		w |= 1 << 15
	}
	return StatusWord(w)
}
|||
|
|||
// StatusSuccess is the zero-value success status.
const StatusSuccess StatusWord = 0

// Pre-defined status words used in the NVMe target. Each is built with
// MakeStatus(sct, sc, dnr); the variants suffixed DNR differ from their
// retryable counterparts only in having the Do-Not-Retry bit set.
var (
	StatusInvalidOpcode    = MakeStatus(0, 0x01, true)  // Generic: Invalid Command Opcode
	StatusInvalidField     = MakeStatus(0, 0x02, true)  // Generic: Invalid Field in Command
	StatusInternalError    = MakeStatus(0, 0x06, false) // Generic: Internal Error (retryable)
	StatusInternalErrorDNR = MakeStatus(0, 0x06, true)  // Generic: Internal Error (permanent)
	StatusNSNotReady       = MakeStatus(0, 0x82, false) // Generic: Namespace Not Ready (retryable)
	StatusNSNotReadyDNR    = MakeStatus(0, 0x82, true)  // Generic: Namespace Not Ready (permanent)
	StatusLBAOutOfRange    = MakeStatus(0, 0x80, true)  // Generic: LBA Out of Range
	StatusMediaWriteFault  = MakeStatus(2, 0x80, false) // Media: Write Fault
	StatusMediaReadError   = MakeStatus(2, 0x81, false) // Media: Uncorrectable Read Error
)
|||
|
|||
func (s StatusWord) SCT() uint8 { return uint8((s >> 9) & 0x07) } |
|||
func (s StatusWord) SC() uint8 { return uint8((s >> 1) & 0xFF) } |
|||
func (s StatusWord) DNR() bool { return s&(1<<15) != 0 } |
|||
func (s StatusWord) IsError() bool { return s != StatusSuccess } |
|||
|
|||
func (s StatusWord) String() string { |
|||
if s == StatusSuccess { |
|||
return "Success" |
|||
} |
|||
return fmt.Sprintf("Status{sct=%d sc=0x%02x dnr=%v}", s.SCT(), s.SC(), s.DNR()) |
|||
} |
|||
@ -0,0 +1,210 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"fmt" |
|||
"log" |
|||
"net" |
|||
"sync" |
|||
"sync/atomic" |
|||
"time" |
|||
) |
|||
|
|||
// Config holds NVMe/TCP target configuration.
type Config struct {
	ListenAddr       string // TCP address passed to net.Listen by ListenAndServe
	NQNPrefix        string // prefix that Server.NQN prepends to a volume name
	MaxH2CDataLength uint32 // NOTE(review): presumably the limit advertised in ICResponse — confirm in controller.go
	MaxIOQueues      uint16 // NOTE(review): consumer not visible in this file — confirm where it is enforced
	Enabled          bool   // when false, ListenAndServe and Close return nil without doing anything
}
|||
|
|||
// DefaultConfig returns the default NVMe target configuration.
|
|||
func DefaultConfig() Config { |
|||
return Config{ |
|||
ListenAddr: "0.0.0.0:4420", |
|||
NQNPrefix: "nqn.2024-01.com.seaweedfs:vol.", |
|||
MaxH2CDataLength: maxH2CDataLen, |
|||
MaxIOQueues: 4, |
|||
Enabled: false, |
|||
} |
|||
} |
|||
|
|||
// adminSession stores state from an admin queue connection that IO queue
// connections need to look up (they arrive on separate TCP connections).
// Sessions are kept in Server.admins, keyed by cntlID.
type adminSession struct {
	cntlID    uint16     // controller ID; the registry key
	subsystem *Subsystem // subsystem of the admin connection
	subNQN    string
	hostNQN   string
	// NOTE(review): the reg* fields look like snapshots of the CAP/CC/CSTS/VS
	// property registers — confirm against fabric.go.
	regCAP  uint64
	regCC   uint32
	regCSTS uint32
	regVS   uint32
	katoMs  uint32 // NOTE(review): presumably the keep-alive timeout in milliseconds — confirm
}
|||
|
|||
// Server is the NVMe/TCP target server.
//
// mu guards subsystems and sessions; adminMu guards admins. wg tracks the
// accept loop and one goroutine per accepted connection, and Close waits on
// it with a timeout.
type Server struct {
	cfg        Config
	listener   net.Listener // set by ListenAndServe, closed by Close
	mu         sync.RWMutex
	subsystems map[string]*Subsystem // NQN → Subsystem
	sessions   map[*Controller]struct{}
	adminMu    sync.RWMutex
	admins     map[uint16]*adminSession // CNTLID → admin session
	nextCNTLID atomic.Uint32
	closed     atomic.Bool // set by Close; silences accept/session error logging
	wg         sync.WaitGroup
}
|||
|
|||
// NewServer creates a new NVMe/TCP target server.
|
|||
func NewServer(cfg Config) *Server { |
|||
return &Server{ |
|||
cfg: cfg, |
|||
subsystems: make(map[string]*Subsystem), |
|||
sessions: make(map[*Controller]struct{}), |
|||
admins: make(map[uint16]*adminSession), |
|||
} |
|||
} |
|||
|
|||
// AddVolume registers a block device as an NVMe subsystem.
|
|||
func (s *Server) AddVolume(nqn string, dev BlockDevice, nguid [16]byte) { |
|||
s.mu.Lock() |
|||
defer s.mu.Unlock() |
|||
s.subsystems[nqn] = &Subsystem{ |
|||
NQN: nqn, |
|||
Dev: dev, |
|||
NGUID: nguid, |
|||
} |
|||
} |
|||
|
|||
// RemoveVolume unregisters an NVMe subsystem.
|
|||
func (s *Server) RemoveVolume(nqn string) { |
|||
s.mu.Lock() |
|||
defer s.mu.Unlock() |
|||
delete(s.subsystems, nqn) |
|||
} |
|||
|
|||
// ListenAndServe starts the NVMe/TCP listener.
|
|||
// If not enabled, returns nil immediately.
|
|||
func (s *Server) ListenAndServe() error { |
|||
if !s.cfg.Enabled { |
|||
return nil |
|||
} |
|||
|
|||
ln, err := net.Listen("tcp", s.cfg.ListenAddr) |
|||
if err != nil { |
|||
return fmt.Errorf("nvme listen %s: %w", s.cfg.ListenAddr, err) |
|||
} |
|||
s.listener = ln |
|||
log.Printf("nvme: listening on %s", s.cfg.ListenAddr) |
|||
|
|||
s.wg.Add(1) |
|||
go func() { |
|||
defer s.wg.Done() |
|||
s.acceptLoop() |
|||
}() |
|||
return nil |
|||
} |
|||
|
|||
func (s *Server) acceptLoop() { |
|||
for { |
|||
conn, err := s.listener.Accept() |
|||
if err != nil { |
|||
if s.closed.Load() { |
|||
return |
|||
} |
|||
log.Printf("nvme: accept error: %v", err) |
|||
continue |
|||
} |
|||
|
|||
ctrl := newController(conn, s) |
|||
s.addSession(ctrl) |
|||
|
|||
s.wg.Add(1) |
|||
go func() { |
|||
defer s.wg.Done() |
|||
if err := ctrl.Serve(); err != nil { |
|||
if !s.closed.Load() { |
|||
log.Printf("nvme: session error: %v", err) |
|||
} |
|||
} |
|||
}() |
|||
} |
|||
} |
|||
|
|||
func (s *Server) addSession(ctrl *Controller) { |
|||
s.mu.Lock() |
|||
defer s.mu.Unlock() |
|||
s.sessions[ctrl] = struct{}{} |
|||
} |
|||
|
|||
func (s *Server) removeSession(ctrl *Controller) { |
|||
s.mu.Lock() |
|||
defer s.mu.Unlock() |
|||
delete(s.sessions, ctrl) |
|||
} |
|||
|
|||
// registerAdmin stores admin queue state so IO queue connections can look it up.
|
|||
func (s *Server) registerAdmin(sess *adminSession) { |
|||
s.adminMu.Lock() |
|||
defer s.adminMu.Unlock() |
|||
s.admins[sess.cntlID] = sess |
|||
} |
|||
|
|||
// unregisterAdmin removes an admin session by CNTLID.
|
|||
func (s *Server) unregisterAdmin(cntlID uint16) { |
|||
s.adminMu.Lock() |
|||
defer s.adminMu.Unlock() |
|||
delete(s.admins, cntlID) |
|||
} |
|||
|
|||
// lookupAdmin returns the admin session for the given CNTLID.
|
|||
func (s *Server) lookupAdmin(cntlID uint16) *adminSession { |
|||
s.adminMu.RLock() |
|||
defer s.adminMu.RUnlock() |
|||
return s.admins[cntlID] |
|||
} |
|||
|
|||
// Close gracefully shuts down the server.
|
|||
func (s *Server) Close() error { |
|||
if !s.cfg.Enabled { |
|||
return nil |
|||
} |
|||
s.closed.Store(true) |
|||
|
|||
if s.listener != nil { |
|||
s.listener.Close() |
|||
} |
|||
|
|||
// Close all active sessions
|
|||
s.mu.RLock() |
|||
sessions := make([]*Controller, 0, len(s.sessions)) |
|||
for ctrl := range s.sessions { |
|||
sessions = append(sessions, ctrl) |
|||
} |
|||
s.mu.RUnlock() |
|||
|
|||
for _, ctrl := range sessions { |
|||
ctrl.conn.Close() |
|||
} |
|||
|
|||
// Wait with timeout
|
|||
done := make(chan struct{}) |
|||
go func() { |
|||
s.wg.Wait() |
|||
close(done) |
|||
}() |
|||
|
|||
select { |
|||
case <-done: |
|||
case <-time.After(5 * time.Second): |
|||
log.Printf("nvme: shutdown timed out after 5s") |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// NQN returns the full NQN for a volume name.
|
|||
func (s *Server) NQN(volName string) string { |
|||
return s.cfg.NQNPrefix + volName |
|||
} |
|||
@ -0,0 +1,202 @@ |
|||
package nvme |
|||
|
|||
import ( |
|||
"bufio" |
|||
"encoding/binary" |
|||
"fmt" |
|||
"io" |
|||
) |
|||
|
|||
// ---------- Reader ----------
|
|||
|
|||
// Reader decodes NVMe/TCP PDUs from a stream.
|
|||
//
|
|||
// Usage:
|
|||
//
|
|||
// hdr, _ := r.Dequeue() // read 8-byte CommonHeader
|
|||
// r.Receive(&capsuleCmd) // read remaining specific header
|
|||
// if r.Length() > 0 {
|
|||
// data := make([]byte, r.Length())
|
|||
// r.ReceiveData(data) // read payload
|
|||
// }
|
|||
type Reader struct { |
|||
rd io.Reader |
|||
CH CommonHeader |
|||
header [maxHeaderSize]byte |
|||
} |
|||
|
|||
// NewReader wraps an io.Reader for NVMe/TCP PDU decoding.
|
|||
func NewReader(r io.Reader) *Reader { |
|||
return &Reader{rd: r} |
|||
} |
|||
|
|||
// Dequeue reads the 8-byte CommonHeader, validates bounds, and returns it.
|
|||
func (r *Reader) Dequeue() (*CommonHeader, error) { |
|||
if _, err := io.ReadFull(r.rd, r.header[:commonHeaderSize]); err != nil { |
|||
return nil, err |
|||
} |
|||
r.CH.Unmarshal(r.header[:commonHeaderSize]) |
|||
|
|||
// Validate header bounds to prevent panics on malformed PDUs.
|
|||
if r.CH.HeaderLength < commonHeaderSize { |
|||
return nil, fmt.Errorf("nvme: HeaderLength %d < minimum %d", r.CH.HeaderLength, commonHeaderSize) |
|||
} |
|||
if r.CH.HeaderLength > maxHeaderSize { |
|||
return nil, fmt.Errorf("nvme: HeaderLength %d > maximum %d", r.CH.HeaderLength, maxHeaderSize) |
|||
} |
|||
if r.CH.DataOffset != 0 && r.CH.DataOffset < r.CH.HeaderLength { |
|||
return nil, fmt.Errorf("nvme: DataOffset %d < HeaderLength %d", r.CH.DataOffset, r.CH.HeaderLength) |
|||
} |
|||
if r.CH.DataOffset != 0 && uint32(r.CH.DataOffset) > r.CH.DataLength { |
|||
return nil, fmt.Errorf("nvme: DataOffset %d > DataLength %d", r.CH.DataOffset, r.CH.DataLength) |
|||
} |
|||
if r.CH.DataLength < uint32(r.CH.HeaderLength) { |
|||
return nil, fmt.Errorf("nvme: DataLength %d < HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) |
|||
} |
|||
// DataOffset==0 means no inline data — DataLength must equal HeaderLength,
|
|||
// otherwise unconsumed bytes desynchronize the stream.
|
|||
if r.CH.DataOffset == 0 && r.CH.DataLength != uint32(r.CH.HeaderLength) { |
|||
return nil, fmt.Errorf("nvme: DataOffset=0 but DataLength %d != HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) |
|||
} |
|||
|
|||
return &r.CH, nil |
|||
} |
|||
|
|||
// Receive reads the remaining PDU-specific header (HeaderLength - 8 bytes)
|
|||
// and unmarshals it into pdu. It also skips any padding between header and
|
|||
// data (DataOffset - HeaderLength bytes).
|
|||
func (r *Reader) Receive(pdu PDU) error { |
|||
remain := int(r.CH.HeaderLength) - commonHeaderSize |
|||
if remain <= 0 { |
|||
return nil |
|||
} |
|||
if _, err := io.ReadFull(r.rd, r.header[commonHeaderSize:r.CH.HeaderLength]); err != nil { |
|||
return err |
|||
} |
|||
pdu.Unmarshal(r.header[commonHeaderSize:r.CH.HeaderLength]) |
|||
|
|||
// Skip padding between header and data.
|
|||
pad := int(r.CH.DataOffset) - int(r.CH.HeaderLength) |
|||
if pad > 0 { |
|||
if _, err := io.ReadFull(r.rd, make([]byte, pad)); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// Length returns the payload size: DataLength - DataOffset (when DataOffset != 0).
|
|||
func (r *Reader) Length() uint32 { |
|||
if r.CH.DataOffset != 0 { |
|||
return r.CH.DataLength - uint32(r.CH.DataOffset) |
|||
} |
|||
return 0 |
|||
} |
|||
|
|||
// ReceiveData reads exactly len(buf) bytes of payload data.
|
|||
func (r *Reader) ReceiveData(buf []byte) error { |
|||
_, err := io.ReadFull(r.rd, buf) |
|||
return err |
|||
} |
|||
|
|||
// ---------- Writer ----------
|
|||
|
|||
// Writer encodes NVMe/TCP PDUs to a stream.
|
|||
type Writer struct { |
|||
wr *bufio.Writer |
|||
CH CommonHeader |
|||
header [maxHeaderSize]byte |
|||
} |
|||
|
|||
// NewWriter wraps an io.Writer for NVMe/TCP PDU encoding.
|
|||
func NewWriter(w io.Writer) *Writer { |
|||
return &Writer{wr: bufio.NewWriter(w)} |
|||
} |
|||
|
|||
// PrepareHeaderOnly sets up a header-only PDU (no payload).
|
|||
// Call Flush() to write it to the wire.
|
|||
func (w *Writer) PrepareHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) { |
|||
w.CH.Type = pduType |
|||
w.CH.Flags = 0 |
|||
w.CH.HeaderLength = commonHeaderSize + specificLen |
|||
w.CH.DataOffset = 0 |
|||
w.CH.DataLength = uint32(w.CH.HeaderLength) |
|||
pdu.Marshal(w.header[commonHeaderSize:]) |
|||
} |
|||
|
|||
// PrepareWithData sets up a PDU with payload data.
|
|||
// Call Flush() to write it to the wire.
|
|||
func (w *Writer) PrepareWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) { |
|||
w.CH.Type = pduType |
|||
w.CH.Flags = flags |
|||
w.CH.HeaderLength = commonHeaderSize + specificLen |
|||
if data != nil { |
|||
w.CH.DataOffset = w.CH.HeaderLength |
|||
w.CH.DataLength = uint32(w.CH.HeaderLength) + uint32(len(data)) |
|||
} else { |
|||
w.CH.DataOffset = 0 |
|||
w.CH.DataLength = uint32(w.CH.HeaderLength) |
|||
} |
|||
pdu.Marshal(w.header[commonHeaderSize:]) |
|||
} |
|||
|
|||
// Flush writes the prepared CommonHeader + specific header to the wire.
|
|||
// If there was payload data (from PrepareWithData), call FlushData after.
|
|||
func (w *Writer) Flush() error { |
|||
w.CH.Marshal(w.header[:commonHeaderSize]) |
|||
if _, err := w.wr.Write(w.header[:w.CH.HeaderLength]); err != nil { |
|||
return err |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// FlushData writes payload data and flushes the underlying buffered writer.
|
|||
func (w *Writer) FlushData(data []byte) error { |
|||
if len(data) > 0 { |
|||
if _, err := w.wr.Write(data); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return w.wr.Flush() |
|||
} |
|||
|
|||
// SendHeaderOnly writes a complete header-only PDU (prepare + flush).
|
|||
func (w *Writer) SendHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) error { |
|||
w.PrepareHeaderOnly(pduType, pdu, specificLen) |
|||
if err := w.Flush(); err != nil { |
|||
return err |
|||
} |
|||
return w.wr.Flush() |
|||
} |
|||
|
|||
// SendWithData writes a complete PDU with payload data.
|
|||
func (w *Writer) SendWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) error { |
|||
w.PrepareWithData(pduType, flags, pdu, specificLen, data) |
|||
if err := w.Flush(); err != nil { |
|||
return err |
|||
} |
|||
return w.FlushData(data) |
|||
} |
|||
|
|||
// writeRaw writes raw bytes directly (used for ConnectData inline in capsule).
|
|||
func (w *Writer) writeRaw(data []byte) error { |
|||
_, err := w.wr.Write(data) |
|||
return err |
|||
} |
|||
|
|||
// flushBuf flushes the underlying buffered writer.
|
|||
func (w *Writer) flushBuf() error { |
|||
return w.wr.Flush() |
|||
} |
|||
|
|||
// ---------- Helpers ----------
|
|||
|
|||
// putLE32 stores v into the first four bytes of buf, least-significant byte
// first. It panics if buf is shorter than four bytes.
func putLE32(buf []byte, v uint32) {
	order := binary.LittleEndian
	order.PutUint32(buf, v)
}
|||
|
|||
// putLE64 stores v into the first eight bytes of buf, least-significant byte
// first. It panics if buf is shorter than eight bytes.
func putLE64(buf []byte, v uint64) {
	order := binary.LittleEndian
	order.PutUint64(buf, v)
}
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue