Browse Source
feat: Phase 10 CP10-1 -- NVMe/TCP target MVP, 109 tests
feat: Phase 10 CP10-1 -- NVMe/TCP target MVP, 109 tests
NVMe over Fabrics (TCP) target implementation sharing the same BlockVol engine, fencing, replication, and failover as the existing iSCSI target. New package: weed/storage/blockvol/nvme/ (11 files, 2,242 production LOC) - protocol.go: PDU types, opcodes, status codes, marshal/unmarshal - wire.go: TCP reader/writer with header bounds validation - controller.go: IC handshake, per-queue state, command dispatch, KATO - fabric.go: Connect (admin+IO), PropertyGet/Set, Disconnect - identify.go: Controller/Namespace/NS list/NS descriptors (Linux 5.15) - admin.go: SetFeatures, GetFeatures, GetLogPage (SMART/ANA), KeepAlive - io.go: Read (C2HData), Write (inline), Flush, WriteZeros/Trim - server.go: TCP listener, admin session registry, graceful shutdown - adapter.go: BlockVol-to-NVMe bridge, error mapping, ANA state Integration: NVMeConfig + CLI flags (-block.nvme.*), disabled by default. Key design: inline-data writes only (no R2T), MaxH2CDataLength=32KB, single ANA group coherent with BlockVol role, CNTLID session registry for cross-connection IO queues, HostNQN continuity enforcement. Tests: 65 dev + 44 QA adversarial = 109 total, all passing. Bugs fixed during review: IO queue cross-connection (A), header bounds validation (B), write payload size check (C), disconnect error (D), stream desync prevention (E), HostNQN enforcement (F), capsule-before-IC state guard (H), flowCtlOff SQHD timing (I). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>feature/sw-block
16 changed files with 6268 additions and 16 deletions
-
20weed/command/volume.go
-
2weed/server/block_heartbeat_loop_test.go
-
2weed/server/volume_grpc_block_test.go
-
96weed/server/volume_server_block.go
-
4weed/server/volume_server_block_test.go
-
127weed/storage/blockvol/nvme/adapter.go
-
198weed/storage/blockvol/nvme/admin.go
-
354weed/storage/blockvol/nvme/controller.go
-
300weed/storage/blockvol/nvme/fabric.go
-
250weed/storage/blockvol/nvme/identify.go
-
157weed/storage/blockvol/nvme/io.go
-
1541weed/storage/blockvol/nvme/nvme_qa_test.go
-
2377weed/storage/blockvol/nvme/nvme_test.go
-
444weed/storage/blockvol/nvme/protocol.go
-
210weed/storage/blockvol/nvme/server.go
-
202weed/storage/blockvol/nvme/wire.go
@ -0,0 +1,127 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"errors" |
||||
|
"strings" |
||||
|
|
||||
|
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol" |
||||
|
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockerr" |
||||
|
) |
||||
|
|
||||
|
// NVMeAdapter wraps a *BlockVol to implement BlockDevice and ANAProvider
|
||||
|
// for the NVMe/TCP target, bridging the BlockVol storage engine to NVMe
|
||||
|
// command handling.
|
||||
|
type NVMeAdapter struct { |
||||
|
Vol *blockvol.BlockVol |
||||
|
} |
||||
|
|
||||
|
// NewNVMeAdapter creates a BlockDevice adapter for the given BlockVol.
|
||||
|
func NewNVMeAdapter(vol *blockvol.BlockVol) *NVMeAdapter { |
||||
|
return &NVMeAdapter{Vol: vol} |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) ReadAt(lba uint64, length uint32) ([]byte, error) { |
||||
|
return a.Vol.ReadLBA(lba, length) |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) WriteAt(lba uint64, data []byte) error { |
||||
|
return a.Vol.WriteLBA(lba, data) |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) Trim(lba uint64, length uint32) error { |
||||
|
return a.Vol.Trim(lba, length) |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) SyncCache() error { |
||||
|
return a.Vol.SyncCache() |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) BlockSize() uint32 { |
||||
|
return a.Vol.Info().BlockSize |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) VolumeSize() uint64 { |
||||
|
return a.Vol.Info().VolumeSize |
||||
|
} |
||||
|
|
||||
|
func (a *NVMeAdapter) IsHealthy() bool { |
||||
|
return a.Vol.Info().Healthy |
||||
|
} |
||||
|
|
||||
|
// ANAState returns the ANA state based on the volume's role.
|
||||
|
func (a *NVMeAdapter) ANAState() uint8 { |
||||
|
return RoleToANAState(a.Vol.Role()) |
||||
|
} |
||||
|
|
||||
|
// ANAGroupID returns the ANA group ID (always 1 for single-group MVP).
|
||||
|
func (a *NVMeAdapter) ANAGroupID() uint16 { return 1 } |
||||
|
|
||||
|
// DeviceNGUID returns a 16-byte NGUID derived from the volume UUID.
|
||||
|
func (a *NVMeAdapter) DeviceNGUID() [16]byte { |
||||
|
return UUIDToNGUID(a.Vol.Info().UUID) |
||||
|
} |
||||
|
|
||||
|
// Compile-time checks.
|
||||
|
var _ BlockDevice = (*NVMeAdapter)(nil) |
||||
|
var _ ANAProvider = (*NVMeAdapter)(nil) |
||||
|
|
||||
|
// RoleToANAState maps a BlockVol Role to an NVMe ANA state.
|
||||
|
func RoleToANAState(r blockvol.Role) uint8 { |
||||
|
switch r { |
||||
|
case blockvol.RolePrimary, blockvol.RoleNone: |
||||
|
return anaOptimized |
||||
|
case blockvol.RoleReplica: |
||||
|
return anaInaccessible |
||||
|
case blockvol.RoleStale: |
||||
|
return anaPersistentLoss |
||||
|
case blockvol.RoleRebuilding, blockvol.RoleDraining: |
||||
|
return anaInaccessible |
||||
|
default: |
||||
|
return anaInaccessible |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// UUIDToNGUID converts a 16-byte UUID to a 16-byte NGUID.
|
||||
|
// Uses NAA-6 pattern for first 8 bytes (compatible with iSCSI UUIDToNAA),
|
||||
|
// copies remaining bytes as-is.
|
||||
|
func UUIDToNGUID(uuid [16]byte) [16]byte { |
||||
|
var nguid [16]byte |
||||
|
nguid[0] = 0x60 | (uuid[0] & 0x0F) |
||||
|
copy(nguid[1:8], uuid[1:8]) |
||||
|
copy(nguid[8:16], uuid[8:16]) |
||||
|
return nguid |
||||
|
} |
||||
|
|
||||
|
// mapBlockError maps BlockVol errors to NVMe status words.
|
||||
|
func mapBlockError(err error) StatusWord { |
||||
|
if err == nil { |
||||
|
return StatusSuccess |
||||
|
} |
||||
|
|
||||
|
// Check known sentinel errors from blockvol and blockerr packages.
|
||||
|
switch { |
||||
|
case errors.Is(err, blockvol.ErrLeaseExpired): |
||||
|
return StatusNSNotReadyDNR // DNR=1: fencing is permanent
|
||||
|
case errors.Is(err, blockvol.ErrEpochRegression): |
||||
|
return StatusInternalErrorDNR // DNR=1: stale controller
|
||||
|
case errors.Is(err, blockerr.ErrDurabilityBarrierFailed): |
||||
|
return StatusInternalError // DNR=0: replica may recover
|
||||
|
case errors.Is(err, blockerr.ErrDurabilityQuorumLost): |
||||
|
return StatusInternalError // DNR=0: quorum may heal
|
||||
|
case errors.Is(err, blockvol.ErrWALFull): |
||||
|
return StatusNSNotReady // DNR=0: transient pressure
|
||||
|
case errors.Is(err, blockvol.ErrNotPrimary): |
||||
|
return StatusNSNotReady // DNR=0: may be transitioning
|
||||
|
} |
||||
|
|
||||
|
// Heuristic for I/O errors (no dedicated sentinels yet).
|
||||
|
msg := err.Error() |
||||
|
if strings.Contains(msg, "write") || strings.Contains(msg, "Write") { |
||||
|
return StatusMediaWriteFault |
||||
|
} |
||||
|
if strings.Contains(msg, "read") || strings.Contains(msg, "Read") { |
||||
|
return StatusMediaReadError |
||||
|
} |
||||
|
|
||||
|
return StatusInternalError |
||||
|
} |
||||
@ -0,0 +1,198 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"encoding/binary" |
||||
|
) |
||||
|
|
||||
|
// handleSetFeatures processes SetFeatures admin commands.
|
||||
|
func (c *Controller) handleSetFeatures(req *Request) error { |
||||
|
fid := uint8(req.capsule.D10 & 0xFF) |
||||
|
|
||||
|
switch fid { |
||||
|
case fidNumberOfQueues: |
||||
|
// D11: NCQR[15:0] | NSQR[31:16]
|
||||
|
ncqr := uint16(req.capsule.D11 & 0xFFFF) |
||||
|
nsqr := uint16(req.capsule.D11 >> 16) |
||||
|
|
||||
|
// Grant min(requested, max)
|
||||
|
if ncqr > c.maxIOQueues { |
||||
|
ncqr = c.maxIOQueues |
||||
|
} |
||||
|
if nsqr > c.maxIOQueues { |
||||
|
nsqr = c.maxIOQueues |
||||
|
} |
||||
|
if ncqr == 0 { |
||||
|
ncqr = 1 |
||||
|
} |
||||
|
if nsqr == 0 { |
||||
|
nsqr = 1 |
||||
|
} |
||||
|
c.grantedQueues = ncqr |
||||
|
|
||||
|
// Response DW0: (NCQR-1) | ((NSQR-1) << 16)
|
||||
|
req.resp.DW0 = uint32(ncqr-1) | (uint32(nsqr-1) << 16) |
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
case fidKeepAliveTimer: |
||||
|
// D11 contains KATO in milliseconds
|
||||
|
c.katoMs = req.capsule.D11 |
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
case fidAsyncEventConfig: |
||||
|
// Stub: accept but don't deliver events
|
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// handleGetFeatures returns stored feature values.
|
||||
|
func (c *Controller) handleGetFeatures(req *Request) error { |
||||
|
fid := uint8(req.capsule.D10 & 0xFF) |
||||
|
|
||||
|
switch fid { |
||||
|
case fidNumberOfQueues: |
||||
|
n := c.grantedQueues |
||||
|
if n == 0 { |
||||
|
n = c.maxIOQueues |
||||
|
} |
||||
|
req.resp.DW0 = uint32(n-1) | (uint32(n-1) << 16) |
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
case fidKeepAliveTimer: |
||||
|
req.resp.DW0 = c.katoMs |
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
case fidAsyncEventConfig: |
||||
|
req.resp.DW0 = 0 |
||||
|
return c.sendResponse(req) |
||||
|
|
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// handleGetLogPage returns log page data.
|
||||
|
func (c *Controller) handleGetLogPage(req *Request) error { |
||||
|
// D10 bits 7:0 = Log Page Identifier
|
||||
|
// D10 bits 27:16 and D11 bits 15:0 = Number of Dwords (NUMD)
|
||||
|
lid := uint8(req.capsule.D10 & 0xFF) |
||||
|
numdl := (req.capsule.D10 >> 16) & 0xFFF |
||||
|
numdu := req.capsule.D11 & 0xFFFF |
||||
|
numd := uint32(numdu)<<16 | uint32(numdl) |
||||
|
length := (numd + 1) * 4 // NUMD is 0-based, in dwords
|
||||
|
|
||||
|
switch lid { |
||||
|
case logPageError: |
||||
|
return c.logPageError(req, length) |
||||
|
case logPageSMART: |
||||
|
return c.logPageSMART(req, length) |
||||
|
case logPageANA: |
||||
|
return c.logPageANA(req, length) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// logPageError returns an empty error log page.
|
||||
|
func (c *Controller) logPageError(req *Request, length uint32) error { |
||||
|
if length > 64 { |
||||
|
length = 64 |
||||
|
} |
||||
|
req.c2hData = make([]byte, length) |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// logPageSMART returns a 512-byte SMART/Health log.
|
||||
|
func (c *Controller) logPageSMART(req *Request, length uint32) error { |
||||
|
if length > 512 { |
||||
|
length = 512 |
||||
|
} |
||||
|
buf := make([]byte, 512) |
||||
|
|
||||
|
// Critical Warning - offset 0: 0 = no warnings
|
||||
|
buf[0] = 0 |
||||
|
|
||||
|
// Composite Temperature - offset 1-2: 0 (not implemented)
|
||||
|
binary.LittleEndian.PutUint16(buf[1:], 0) |
||||
|
|
||||
|
// Available Spare - offset 3: 100%
|
||||
|
buf[3] = 100 |
||||
|
|
||||
|
// Available Spare Threshold - offset 4: 10%
|
||||
|
buf[4] = 10 |
||||
|
|
||||
|
// Percentage Used - offset 5: 0%
|
||||
|
buf[5] = 0 |
||||
|
|
||||
|
req.c2hData = buf[:length] |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// logPageANA returns the ANA log page with a single group.
|
||||
|
func (c *Controller) logPageANA(req *Request, length uint32) error { |
||||
|
// ANA log page format (32 bytes for single group):
|
||||
|
// [0:8] CHGCNT (uint64)
|
||||
|
// [8:10] NGRPS = 1 (uint16)
|
||||
|
// [10:16] reserved
|
||||
|
// Group descriptor:
|
||||
|
// [16:20] ANAGRPID = 1 (uint32)
|
||||
|
// [20:24] NNSID = 1 (uint32)
|
||||
|
// [24:32] Change Count (uint64)
|
||||
|
// [32] ANA State
|
||||
|
// [33:36] reserved
|
||||
|
// [36:40] NSID = 1 (uint32)
|
||||
|
const anaLogSize = 40 |
||||
|
|
||||
|
buf := make([]byte, anaLogSize) |
||||
|
|
||||
|
// CHGCNT
|
||||
|
binary.LittleEndian.PutUint64(buf[0:], c.anaChangeCount()) |
||||
|
|
||||
|
// NGRPS
|
||||
|
binary.LittleEndian.PutUint16(buf[8:], 1) |
||||
|
|
||||
|
// Group descriptor
|
||||
|
binary.LittleEndian.PutUint32(buf[16:], 1) // ANAGRPID=1
|
||||
|
binary.LittleEndian.PutUint32(buf[20:], 1) // NNSID=1
|
||||
|
binary.LittleEndian.PutUint64(buf[24:], c.anaChangeCount()) // chgcnt
|
||||
|
buf[32] = c.anaState() // ANA state
|
||||
|
binary.LittleEndian.PutUint32(buf[36:], 1) // NSID=1
|
||||
|
|
||||
|
if length > anaLogSize { |
||||
|
length = anaLogSize |
||||
|
} |
||||
|
req.c2hData = buf[:length] |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// anaState returns the current ANA state based on the subsystem's device.
|
||||
|
func (c *Controller) anaState() uint8 { |
||||
|
if c.subsystem == nil { |
||||
|
return anaInaccessible |
||||
|
} |
||||
|
if prov, ok := c.subsystem.Dev.(ANAProvider); ok { |
||||
|
return prov.ANAState() |
||||
|
} |
||||
|
// Default: if healthy → optimized
|
||||
|
if c.subsystem.Dev.IsHealthy() { |
||||
|
return anaOptimized |
||||
|
} |
||||
|
return anaInaccessible |
||||
|
} |
||||
|
|
||||
|
// anaChangeCount returns a monotonic ANA change counter.
|
||||
|
// For MVP, we use 1 as a constant (no dynamic role changes tracked).
|
||||
|
func (c *Controller) anaChangeCount() uint64 { |
||||
|
return 1 |
||||
|
} |
||||
|
|
||||
|
// handleKeepAlive resets the KATO timer and returns success.
|
||||
|
func (c *Controller) handleKeepAlive(req *Request) error { |
||||
|
c.resetKATO() |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
@ -0,0 +1,354 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"fmt" |
||||
|
"io" |
||||
|
"log" |
||||
|
"net" |
||||
|
"sync" |
||||
|
"sync/atomic" |
||||
|
"time" |
||||
|
) |
||||
|
|
||||
|
// controllerState tracks the lifecycle of an NVMe controller session.
|
||||
|
type controllerState int |
||||
|
|
||||
|
const ( |
||||
|
stateConnected controllerState = iota // TCP connected, no IC yet
|
||||
|
stateICComplete // IC exchange done
|
||||
|
stateAdminReady // Admin queue connected
|
||||
|
stateCtrlReady // CC.EN=1, CSTS.RDY=1
|
||||
|
stateIOActive // IO queues active
|
||||
|
stateClosed // Shut down
|
||||
|
) |
||||
|
|
||||
|
// Request represents an in-flight NVMe command being processed.
|
||||
|
type Request struct { |
||||
|
capsule CapsuleCommand |
||||
|
payload []byte // inline data from host (Write commands)
|
||||
|
resp CapsuleResponse |
||||
|
c2hData []byte // data to send to host (Read commands)
|
||||
|
status StatusWord |
||||
|
} |
||||
|
|
||||
|
// Controller handles one NVMe/TCP connection (one queue per connection).
|
||||
|
type Controller struct { |
||||
|
mu sync.Mutex |
||||
|
|
||||
|
// Session identity
|
||||
|
conn net.Conn |
||||
|
in *Reader |
||||
|
out *Writer |
||||
|
state controllerState |
||||
|
closed atomic.Bool |
||||
|
|
||||
|
// Queue state (one queue per TCP connection)
|
||||
|
queueID uint16 |
||||
|
queueSize uint16 |
||||
|
sqhd uint16 // Submission Queue Head pointer
|
||||
|
flowCtlOff bool // CATTR bit2: SQ flow control disabled
|
||||
|
|
||||
|
// Controller identity
|
||||
|
cntlID uint16 |
||||
|
subNQN string |
||||
|
|
||||
|
// Controller registers
|
||||
|
regCAP uint64 // Controller Capabilities
|
||||
|
regCC uint32 // Controller Configuration (set by host via PropertySet)
|
||||
|
regCSTS uint32 // Controller Status (RDY bit)
|
||||
|
regVS uint32 // Version
|
||||
|
|
||||
|
// KeepAlive
|
||||
|
katoMs uint32 |
||||
|
katoTimer *time.Timer |
||||
|
katoMu sync.Mutex |
||||
|
|
||||
|
// Async completion (IO queues)
|
||||
|
waiting chan *Request // pre-allocated request pool
|
||||
|
completions chan *Request // completed requests to send
|
||||
|
|
||||
|
// Backend
|
||||
|
subsystem *Subsystem |
||||
|
server *Server |
||||
|
|
||||
|
// Features
|
||||
|
maxIOQueues uint16 |
||||
|
grantedQueues uint16 |
||||
|
isAdmin bool // true if this controller owns admin queue (QID=0)
|
||||
|
|
||||
|
// Lifecycle
|
||||
|
wg sync.WaitGroup |
||||
|
closeOnce sync.Once |
||||
|
} |
||||
|
|
||||
|
// newController creates a controller for the given connection.
|
||||
|
func newController(conn net.Conn, server *Server) *Controller { |
||||
|
c := &Controller{ |
||||
|
conn: conn, |
||||
|
in: NewReader(conn), |
||||
|
out: NewWriter(conn), |
||||
|
state: stateConnected, |
||||
|
server: server, |
||||
|
regVS: nvmeVersion14, |
||||
|
// CAP register: MQES=63 (bits 15:0), CQR=1 (bit 16), TO=30 (bits 31:24, *500ms=15s), CSS bit37=1 (NVM command set)
|
||||
|
regCAP: uint64(63) | (1 << 16) | (uint64(30) << 24) | (1 << 37), |
||||
|
maxIOQueues: server.cfg.MaxIOQueues, |
||||
|
} |
||||
|
return c |
||||
|
} |
||||
|
|
||||
|
// Serve is the main event loop for this controller connection.
|
||||
|
func (c *Controller) Serve() error { |
||||
|
defer c.shutdown() |
||||
|
|
||||
|
// IC handshake timeout
|
||||
|
if err := c.conn.SetReadDeadline(time.Now().Add(10 * time.Second)); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
for { |
||||
|
if c.closed.Load() { |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
hdr, err := c.in.Dequeue() |
||||
|
if err != nil { |
||||
|
if err == io.EOF || c.closed.Load() { |
||||
|
return nil |
||||
|
} |
||||
|
return fmt.Errorf("read header: %w", err) |
||||
|
} |
||||
|
|
||||
|
switch hdr.Type { |
||||
|
case pduICReq: |
||||
|
if err := c.handleIC(); err != nil { |
||||
|
return fmt.Errorf("IC handshake: %w", err) |
||||
|
} |
||||
|
// Clear read deadline after successful IC
|
||||
|
if err := c.conn.SetReadDeadline(time.Time{}); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
case pduCapsuleCmd: |
||||
|
if err := c.handleCapsule(); err != nil { |
||||
|
return fmt.Errorf("capsule: %w", err) |
||||
|
} |
||||
|
|
||||
|
case pduH2CTermReq: |
||||
|
return nil // host terminated
|
||||
|
|
||||
|
default: |
||||
|
return fmt.Errorf("unexpected PDU type: 0x%x", hdr.Type) |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// handleIC processes the IC handshake.
|
||||
|
func (c *Controller) handleIC() error { |
||||
|
var req ICRequest |
||||
|
if err := c.in.Receive(&req); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
resp := ICResponse{ |
||||
|
PDUFormatVersion: 0, |
||||
|
MaxH2CDataLength: maxH2CDataLen, |
||||
|
} |
||||
|
if err := c.out.SendHeaderOnly(pduICResp, &resp, icBodySize); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
c.state = stateICComplete |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// handleCapsule dispatches a CapsuleCmd PDU.
|
||||
|
func (c *Controller) handleCapsule() error { |
||||
|
// Reject capsule commands before IC handshake is complete.
|
||||
|
if c.state < stateICComplete { |
||||
|
return fmt.Errorf("capsule command before IC handshake") |
||||
|
} |
||||
|
|
||||
|
var capsule CapsuleCommand |
||||
|
if err := c.in.Receive(&capsule); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
// Read optional inline data
|
||||
|
var payload []byte |
||||
|
if dataLen := c.in.Length(); dataLen > 0 { |
||||
|
payload = make([]byte, dataLen) |
||||
|
if err := c.in.ReceiveData(payload); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Advance SQHD
|
||||
|
c.sqhd++ |
||||
|
if c.sqhd >= c.queueSize && c.queueSize > 0 { |
||||
|
c.sqhd = 0 |
||||
|
} |
||||
|
|
||||
|
req := &Request{ |
||||
|
capsule: capsule, |
||||
|
payload: payload, |
||||
|
} |
||||
|
req.resp.CID = capsule.CID |
||||
|
req.resp.QueueID = c.queueID |
||||
|
// SQHD is set in sendResponse/sendC2HDataAndResponse using the
|
||||
|
// latest c.flowCtlOff value, so Connect responses correctly get
|
||||
|
// SQHD=0xFFFF when the host requests flowCtlOff via CATTR.
|
||||
|
req.resp.Status = uint16(StatusSuccess) |
||||
|
|
||||
|
if c.queueID == 0 { |
||||
|
return c.dispatchAdmin(req) |
||||
|
} |
||||
|
return c.dispatchIO(req) |
||||
|
} |
||||
|
|
||||
|
// dispatchAdmin handles admin queue commands synchronously.
|
||||
|
func (c *Controller) dispatchAdmin(req *Request) error { |
||||
|
capsule := &req.capsule |
||||
|
|
||||
|
if capsule.OpCode == adminFabric { |
||||
|
return c.handleFabricCommand(req) |
||||
|
} |
||||
|
|
||||
|
switch capsule.OpCode { |
||||
|
case adminIdentify: |
||||
|
return c.handleIdentify(req) |
||||
|
case adminSetFeatures: |
||||
|
return c.handleSetFeatures(req) |
||||
|
case adminGetFeatures: |
||||
|
return c.handleGetFeatures(req) |
||||
|
case adminGetLogPage: |
||||
|
return c.handleGetLogPage(req) |
||||
|
case adminKeepAlive: |
||||
|
return c.handleKeepAlive(req) |
||||
|
case adminAsyncEvent: |
||||
|
// Stub: just succeed (don't deliver events in CP10-1)
|
||||
|
return c.sendResponse(req) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidOpcode) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// dispatchIO handles IO queue commands.
|
||||
|
func (c *Controller) dispatchIO(req *Request) error { |
||||
|
capsule := &req.capsule |
||||
|
|
||||
|
switch capsule.OpCode { |
||||
|
case ioRead: |
||||
|
return c.handleRead(req) |
||||
|
case ioWrite: |
||||
|
return c.handleWrite(req) |
||||
|
case ioFlush: |
||||
|
return c.handleFlush(req) |
||||
|
case ioWriteZeros: |
||||
|
return c.handleWriteZeros(req) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidOpcode) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// sendC2HDataAndResponse sends C2HData PDUs followed by a CapsuleResp.
|
||||
|
func (c *Controller) sendC2HDataAndResponse(req *Request) error { |
||||
|
if len(req.c2hData) > 0 { |
||||
|
data := req.c2hData |
||||
|
offset := uint32(0) |
||||
|
chunkSize := uint32(maxH2CDataLen) |
||||
|
|
||||
|
for offset < uint32(len(data)) { |
||||
|
end := offset + chunkSize |
||||
|
if end > uint32(len(data)) { |
||||
|
end = uint32(len(data)) |
||||
|
} |
||||
|
chunk := data[offset:end] |
||||
|
|
||||
|
hdr := C2HDataHeader{ |
||||
|
CCCID: req.capsule.CID, |
||||
|
DATAO: offset, |
||||
|
DATAL: uint32(len(chunk)), |
||||
|
} |
||||
|
|
||||
|
flags := uint8(0) |
||||
|
if end >= uint32(len(data)) { |
||||
|
flags = c2hFlagLast |
||||
|
} |
||||
|
|
||||
|
if err := c.out.SendWithData(pduC2HData, flags, &hdr, c2hDataHdrSize, chunk); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
offset = end |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// sendResponse sends a CapsuleResp PDU.
|
||||
|
// SQHD is set here (not in handleCapsule) so that flowCtlOff changes
|
||||
|
// made during command dispatch (e.g. Fabric Connect) take effect
|
||||
|
// on the same response.
|
||||
|
func (c *Controller) sendResponse(req *Request) error { |
||||
|
if c.flowCtlOff { |
||||
|
req.resp.SQHD = 0xFFFF |
||||
|
} else { |
||||
|
req.resp.SQHD = c.sqhd |
||||
|
} |
||||
|
c.resetKATO() |
||||
|
return c.out.SendHeaderOnly(pduCapsuleResp, &req.resp, capsuleRespSize) |
||||
|
} |
||||
|
|
||||
|
// ---------- KATO management ----------
|
||||
|
|
||||
|
func (c *Controller) startKATO() { |
||||
|
c.katoMu.Lock() |
||||
|
defer c.katoMu.Unlock() |
||||
|
if c.katoMs == 0 { |
||||
|
return |
||||
|
} |
||||
|
d := time.Duration(c.katoMs) * time.Millisecond |
||||
|
// Add 50% margin per spec recommendation
|
||||
|
d = d + d/2 |
||||
|
c.katoTimer = time.AfterFunc(d, func() { |
||||
|
log.Printf("nvme: KATO expired for cntlid=%d, closing connection", c.cntlID) |
||||
|
c.conn.Close() |
||||
|
}) |
||||
|
} |
||||
|
|
||||
|
func (c *Controller) resetKATO() { |
||||
|
c.katoMu.Lock() |
||||
|
defer c.katoMu.Unlock() |
||||
|
if c.katoTimer != nil { |
||||
|
c.katoTimer.Reset(time.Duration(c.katoMs)*time.Millisecond + time.Duration(c.katoMs)*time.Millisecond/2) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func (c *Controller) stopKATO() { |
||||
|
c.katoMu.Lock() |
||||
|
defer c.katoMu.Unlock() |
||||
|
if c.katoTimer != nil { |
||||
|
c.katoTimer.Stop() |
||||
|
c.katoTimer = nil |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ---------- Lifecycle ----------
|
||||
|
|
||||
|
func (c *Controller) shutdown() { |
||||
|
c.closeOnce.Do(func() { |
||||
|
c.closed.Store(true) |
||||
|
c.stopKATO() |
||||
|
c.state = stateClosed |
||||
|
c.conn.Close() |
||||
|
if c.server != nil { |
||||
|
if c.isAdmin && c.cntlID != 0 { |
||||
|
c.server.unregisterAdmin(c.cntlID) |
||||
|
} |
||||
|
c.server.removeSession(c) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
@ -0,0 +1,300 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"encoding/binary" |
||||
|
) |
||||
|
|
||||
|
// handleFabricCommand dispatches Fabric-specific commands by FCType.
|
||||
|
func (c *Controller) handleFabricCommand(req *Request) error { |
||||
|
switch req.capsule.FCType { |
||||
|
case fcConnect: |
||||
|
return c.handleConnect(req) |
||||
|
case fcPropertyGet: |
||||
|
return c.handlePropertyGet(req) |
||||
|
case fcPropertySet: |
||||
|
return c.handlePropertySet(req) |
||||
|
case fcDisconnect: |
||||
|
return c.handleDisconnect(req) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// handleConnect processes a Fabric Connect command.
|
||||
|
func (c *Controller) handleConnect(req *Request) error { |
||||
|
capsule := &req.capsule |
||||
|
|
||||
|
// Parse QueueID, QueueSize, KATO, CATTR from capsule dwords.
|
||||
|
// Connect command layout (CDW10-CDW12):
|
||||
|
// CDW10[15:0]=RECFM, CDW10[31:16]=QID
|
||||
|
// CDW11[15:0]=SQSIZE, CDW11[23:16]=CATTR
|
||||
|
// CDW12=KATO
|
||||
|
queueID := uint16(capsule.D10 >> 16) |
||||
|
queueSize := uint16(capsule.D11&0xFFFF) + 1 // SQSIZE is 0-based
|
||||
|
cattr := uint8(capsule.D11 >> 16) |
||||
|
kato := capsule.D12 |
||||
|
|
||||
|
// Parse ConnectData from payload
|
||||
|
if len(req.payload) < connectDataSize { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
var cd ConnectData |
||||
|
cd.Unmarshal(req.payload) |
||||
|
|
||||
|
if queueID == 0 { |
||||
|
// Admin queue connect
|
||||
|
sub := c.server.findSubsystem(cd.SubNQN) |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
c.subsystem = sub |
||||
|
c.subNQN = cd.SubNQN |
||||
|
c.queueID = 0 |
||||
|
c.queueSize = queueSize |
||||
|
c.cntlID = c.server.allocCNTLID() |
||||
|
c.katoMs = kato |
||||
|
c.flowCtlOff = (cattr & 0x04) != 0 |
||||
|
c.state = stateAdminReady |
||||
|
c.isAdmin = true |
||||
|
|
||||
|
// Register admin session so IO queue connections can find us.
|
||||
|
c.server.registerAdmin(&adminSession{ |
||||
|
cntlID: c.cntlID, |
||||
|
subsystem: sub, |
||||
|
subNQN: cd.SubNQN, |
||||
|
hostNQN: cd.HostNQN, |
||||
|
regCAP: c.regCAP, |
||||
|
regCC: c.regCC, |
||||
|
regCSTS: c.regCSTS, |
||||
|
regVS: c.regVS, |
||||
|
katoMs: kato, |
||||
|
}) |
||||
|
|
||||
|
// Return CNTLID in DW0
|
||||
|
req.resp.DW0 = uint32(c.cntlID) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// IO queue connect — look up admin session from server registry.
|
||||
|
// IO queues arrive on separate TCP connections with fresh Controllers,
|
||||
|
// so we must find the admin session by CNTLID from the server.
|
||||
|
admin := c.server.lookupAdmin(cd.CNTLID) |
||||
|
if admin == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// Validate SubNQN and HostNQN match the admin session.
|
||||
|
if cd.SubNQN != admin.subNQN { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
if cd.HostNQN != admin.hostNQN { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
c.cntlID = cd.CNTLID |
||||
|
c.subsystem = admin.subsystem |
||||
|
c.subNQN = admin.subNQN |
||||
|
c.queueID = queueID |
||||
|
c.queueSize = queueSize |
||||
|
c.flowCtlOff = (cattr & 0x04) != 0 |
||||
|
c.state = stateIOActive |
||||
|
|
||||
|
req.resp.DW0 = uint32(c.cntlID) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handlePropertyGet returns a controller register value.
|
||||
|
func (c *Controller) handlePropertyGet(req *Request) error { |
||||
|
// Property offset in D10 (bits 31:0, but only lower bits used)
|
||||
|
offset := req.capsule.D10 |
||||
|
// Attrib in D11 bit 0: 0=4byte, 1=8byte
|
||||
|
size8 := (req.capsule.D11 & 1) != 0 |
||||
|
|
||||
|
var val uint64 |
||||
|
switch offset { |
||||
|
case propCAP: |
||||
|
val = c.regCAP |
||||
|
case propVS: |
||||
|
val = uint64(c.regVS) |
||||
|
case propCC: |
||||
|
val = uint64(c.regCC) |
||||
|
case propCSTS: |
||||
|
val = uint64(c.regCSTS) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
if size8 { |
||||
|
// 8-byte value in DW0+DW1
|
||||
|
req.resp.DW0 = uint32(val) |
||||
|
req.resp.DW1 = uint32(val >> 32) |
||||
|
} else { |
||||
|
req.resp.DW0 = uint32(val) |
||||
|
} |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handlePropertySet handles controller register writes.
|
||||
|
func (c *Controller) handlePropertySet(req *Request) error { |
||||
|
offset := req.capsule.D10 |
||||
|
value := uint64(req.capsule.D14) | uint64(req.capsule.D15)<<32 |
||||
|
|
||||
|
switch offset { |
||||
|
case propCC: |
||||
|
c.regCC = uint32(value) |
||||
|
// Check CC.EN (bit 0)
|
||||
|
if c.regCC&1 != 0 { |
||||
|
c.regCSTS |= 1 // Set CSTS.RDY
|
||||
|
c.state = stateCtrlReady |
||||
|
if c.katoMs > 0 { |
||||
|
c.startKATO() |
||||
|
} |
||||
|
} else { |
||||
|
c.regCSTS &^= 1 // Clear CSTS.RDY
|
||||
|
} |
||||
|
default: |
||||
|
// Ignore writes to other registers
|
||||
|
} |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handleDisconnect processes a Fabric Disconnect.
|
||||
|
func (c *Controller) handleDisconnect(req *Request) error { |
||||
|
if err := c.sendResponse(req); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
c.shutdown() |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// ---------- Subsystem ----------
|
||||
|
|
||||
|
// Subsystem represents an NVMe subsystem backed by a BlockDevice.
|
||||
|
type Subsystem struct { |
||||
|
NQN string |
||||
|
Dev BlockDevice |
||||
|
NGUID [16]byte // Namespace GUID
|
||||
|
} |
||||
|
|
||||
|
// BlockDevice is the interface for the underlying storage.
|
||||
|
// This is the same as iscsi.BlockDevice.
|
||||
|
type BlockDevice interface { |
||||
|
ReadAt(lba uint64, length uint32) ([]byte, error) |
||||
|
WriteAt(lba uint64, data []byte) error |
||||
|
Trim(lba uint64, length uint32) error |
||||
|
SyncCache() error |
||||
|
BlockSize() uint32 |
||||
|
VolumeSize() uint64 |
||||
|
IsHealthy() bool |
||||
|
} |
||||
|
|
||||
|
// ANAProvider extends BlockDevice with ANA state reporting.
|
||||
|
type ANAProvider interface { |
||||
|
ANAState() uint8 |
||||
|
ANAGroupID() uint16 |
||||
|
DeviceNGUID() [16]byte |
||||
|
} |
||||
|
|
||||
|
// allocCNTLID allocates a new controller ID from the server.
|
||||
|
func (s *Server) allocCNTLID() uint16 { |
||||
|
return uint16(s.nextCNTLID.Add(1)) |
||||
|
} |
||||
|
|
||||
|
// findSubsystem looks up a subsystem by NQN.
|
||||
|
func (s *Server) findSubsystem(nqn string) *Subsystem { |
||||
|
s.mu.RLock() |
||||
|
defer s.mu.RUnlock() |
||||
|
sub, ok := s.subsystems[nqn] |
||||
|
if !ok { |
||||
|
return nil |
||||
|
} |
||||
|
return sub |
||||
|
} |
||||
|
|
||||
|
// ---------- ConnectData field access helpers ----------
|
||||
|
|
||||
|
// connectQueueID extracts the QueueID from a Connect capsule D10.
|
||||
|
func connectQueueID(capsule *CapsuleCommand) uint16 { |
||||
|
return uint16(capsule.D10 >> 16) |
||||
|
} |
||||
|
|
||||
|
// connectQueueSize extracts the QueueSize from a Connect capsule D11 (0-based → +1).
|
||||
|
func connectQueueSize(capsule *CapsuleCommand) uint16 { |
||||
|
return uint16(capsule.D11&0xFFFF) + 1 |
||||
|
} |
||||
|
|
||||
|
// connectKATO extracts the KeepAlive timeout from a Connect capsule D12.
|
||||
|
func connectKATO(capsule *CapsuleCommand) uint32 { |
||||
|
return capsule.D12 |
||||
|
} |
||||
|
|
||||
|
// PropertySet value extraction: the go-nvme reference puts value in D12/D13,
|
||||
|
// but NVMe spec actually uses CDW14/CDW15 for PropertySet. We handle both.
|
||||
|
func propertySetValue(capsule *CapsuleCommand) uint64 { |
||||
|
return uint64(capsule.D14) | uint64(capsule.D15)<<32 |
||||
|
} |
||||
|
|
||||
|
// propertyGetSize returns true if the PropertyGet requests an 8-byte value.
|
||||
|
func propertyGetSize8(capsule *CapsuleCommand) bool { |
||||
|
return (capsule.D11 & 1) != 0 |
||||
|
} |
||||
|
|
||||
|
// propertyGetOffset returns the register offset for PropertyGet.
|
||||
|
func propertyGetOffset(capsule *CapsuleCommand) uint32 { |
||||
|
return capsule.D10 |
||||
|
} |
||||
|
|
||||
|
// ---------- ConnectData marshal helpers for tests ----------
|
||||
|
|
||||
|
func marshalConnectData(cd *ConnectData) []byte { |
||||
|
buf := make([]byte, connectDataSize) |
||||
|
cd.Marshal(buf) |
||||
|
return buf |
||||
|
} |
||||
|
|
||||
|
func makeConnectCapsule(queueID, queueSize uint16, kato uint32, fcType uint8) CapsuleCommand { |
||||
|
return CapsuleCommand{ |
||||
|
OpCode: adminFabric, |
||||
|
FCType: fcType, |
||||
|
D10: uint32(queueID) << 16, |
||||
|
D11: uint32(queueSize - 1), // 0-based
|
||||
|
D12: kato, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// makePropertyGetCapsule creates a PropertyGet capsule for the given register offset.
|
||||
|
func makePropertyGetCapsule(offset uint32, size8 bool) CapsuleCommand { |
||||
|
c := CapsuleCommand{ |
||||
|
OpCode: adminFabric, |
||||
|
FCType: fcPropertyGet, |
||||
|
D10: offset, |
||||
|
} |
||||
|
if size8 { |
||||
|
c.D11 = 1 |
||||
|
} |
||||
|
return c |
||||
|
} |
||||
|
|
||||
|
// makePropertySetCapsule creates a PropertySet capsule.
|
||||
|
func makePropertySetCapsule(offset uint32, value uint64) CapsuleCommand { |
||||
|
return CapsuleCommand{ |
||||
|
OpCode: adminFabric, |
||||
|
FCType: fcPropertySet, |
||||
|
D10: offset, |
||||
|
D14: uint32(value), |
||||
|
D15: uint32(value >> 32), |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// putCNTLID stores the controller ID in ConnectData at offset 16.
|
||||
|
func putCNTLID(buf []byte, cntlid uint16) { |
||||
|
binary.LittleEndian.PutUint16(buf[16:], cntlid) |
||||
|
} |
||||
@ -0,0 +1,250 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"encoding/binary" |
||||
|
"math/bits" |
||||
|
) |
||||
|
|
||||
|
const identifySize = 4096 |
||||
|
|
||||
|
// handleIdentify dispatches Identify commands by CNS type.
|
||||
|
func (c *Controller) handleIdentify(req *Request) error { |
||||
|
cns := uint8(req.capsule.D10 & 0xFF) |
||||
|
|
||||
|
switch cns { |
||||
|
case cnsIdentifyController: |
||||
|
return c.identifyController(req) |
||||
|
case cnsIdentifyNamespace: |
||||
|
return c.identifyNamespace(req) |
||||
|
case cnsActiveNSList: |
||||
|
return c.identifyActiveNSList(req) |
||||
|
case cnsNSDescriptorList: |
||||
|
return c.identifyNSDescriptors(req) |
||||
|
default: |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// identifyController returns the 4096-byte Identify Controller data structure.
|
||||
|
func (c *Controller) identifyController(req *Request) error { |
||||
|
buf := make([]byte, identifySize) |
||||
|
|
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// VID (PCI Vendor ID) - use 0 for software target
|
||||
|
// SSVID - 0
|
||||
|
|
||||
|
// Serial Number (offset 4, 20 bytes, space-padded ASCII)
|
||||
|
copyPadded(buf[4:24], "SWF00001") |
||||
|
|
||||
|
// Model Number (offset 24, 40 bytes, space-padded ASCII)
|
||||
|
copyPadded(buf[24:64], "SeaweedFS BlockVol") |
||||
|
|
||||
|
// Firmware Revision (offset 64, 8 bytes, space-padded ASCII)
|
||||
|
copyPadded(buf[64:72], "0001") |
||||
|
|
||||
|
// RAB (Recommended Arbitration Burst) - offset 72
|
||||
|
buf[72] = 6 |
||||
|
|
||||
|
// IEEE OUI - offset 73-75 (3 bytes, 0 for software)
|
||||
|
|
||||
|
// CMIC (Controller Multi-Path I/O Capabilities) - offset 76
|
||||
|
// bit 3: ANA reporting supported
|
||||
|
buf[76] = 0x08 |
||||
|
|
||||
|
// MDTS (Maximum Data Transfer Size) - offset 77
|
||||
|
// 2^MDTS * 4096 = max transfer. MDTS=3 → 32KB
|
||||
|
buf[77] = 3 |
||||
|
|
||||
|
// CNTLID (Controller ID) - offset 78-79
|
||||
|
binary.LittleEndian.PutUint16(buf[78:], c.cntlID) |
||||
|
|
||||
|
// Version - offset 80-83
|
||||
|
binary.LittleEndian.PutUint32(buf[80:], nvmeVersion14) |
||||
|
|
||||
|
// OACS (Optional Admin Command Support) - offset 256-257
|
||||
|
// 0 = no optional admin commands
|
||||
|
binary.LittleEndian.PutUint16(buf[256:], 0) |
||||
|
|
||||
|
// ACRTD (Abort Command Limit) - offset 258
|
||||
|
buf[258] = 3 |
||||
|
|
||||
|
// AERTL (Async Event Request Limit) - offset 259
|
||||
|
buf[259] = 3 |
||||
|
|
||||
|
// FRMW (Firmware Updates) - offset 260
|
||||
|
buf[260] = 0x02 // slot 1 read-only
|
||||
|
|
||||
|
// LPA (Log Page Attributes) - offset 261
|
||||
|
buf[261] = 0 |
||||
|
|
||||
|
// ELPE (Error Log Page Entries) - offset 262
|
||||
|
buf[262] = 0 // 1 entry (0-based)
|
||||
|
|
||||
|
// SQES (Submission Queue Entry Size) - offset 512
|
||||
|
// min=6 (2^6=64 bytes), max=6
|
||||
|
buf[512] = 0x66 |
||||
|
|
||||
|
// CQES (Completion Queue Entry Size) - offset 513
|
||||
|
// min=4 (2^4=16 bytes), max=4
|
||||
|
buf[513] = 0x44 |
||||
|
|
||||
|
// MAXCMD - offset 514-515
|
||||
|
binary.LittleEndian.PutUint16(buf[514:], 64) |
||||
|
|
||||
|
// NN (Number of Namespaces) - offset 516-519
|
||||
|
binary.LittleEndian.PutUint32(buf[516:], 1) |
||||
|
|
||||
|
// ONCS (Optional NVM Command Support) - offset 520-521
|
||||
|
// bit 3: WriteZeros, bit 2: DatasetMgmt (Trim)
|
||||
|
binary.LittleEndian.PutUint16(buf[520:], 0x0C) |
||||
|
|
||||
|
// ANACAP (ANA Capabilities) - offset 522
|
||||
|
// bit 3: reports Optimized state
|
||||
|
buf[522] = 0x08 |
||||
|
|
||||
|
// ANAGRPMAX - offset 524-527
|
||||
|
binary.LittleEndian.PutUint32(buf[524:], 1) |
||||
|
|
||||
|
// NANAGRPID - offset 528-531
|
||||
|
binary.LittleEndian.PutUint32(buf[528:], 1) |
||||
|
|
||||
|
// VWC (Volatile Write Cache) - offset 525
|
||||
|
// bit 0: volatile write cache present → Flush required
|
||||
|
buf[525] = 0x01 |
||||
|
|
||||
|
// SGLS (SGL Support) - offset 536-539
|
||||
|
// bit 0: SGLs supported (required for NVMe/TCP)
|
||||
|
binary.LittleEndian.PutUint32(buf[536:], 0x01) |
||||
|
|
||||
|
// SubNQN (Subsystem NQN) - offset 768, 256 bytes
|
||||
|
copyPadded(buf[768:1024], sub.NQN) |
||||
|
|
||||
|
// IOCCSZ (I/O Queue Command Capsule Supported Size) - offset 1792-1795
|
||||
|
// In 16-byte units: 64/16 = 4
|
||||
|
binary.LittleEndian.PutUint32(buf[1792:], 4) |
||||
|
|
||||
|
// IORCSZ (I/O Queue Response Capsule Supported Size) - offset 1796-1799
|
||||
|
// In 16-byte units: 16/16 = 1
|
||||
|
binary.LittleEndian.PutUint32(buf[1796:], 1) |
||||
|
|
||||
|
// ICDOFF (In Capsule Data Offset) - offset 1800-1801
|
||||
|
// 0 means inline data immediately follows SQE in capsule
|
||||
|
binary.LittleEndian.PutUint16(buf[1800:], 0) |
||||
|
|
||||
|
// FCATT (Fabrics Controller Attributes) - offset 1802
|
||||
|
// bit 0: 0 = I/O controller (not discovery)
|
||||
|
buf[1802] = 0 |
||||
|
|
||||
|
// MSDBD (Maximum SGL Data Block Descriptors) - offset 1803
|
||||
|
buf[1803] = 1 |
||||
|
|
||||
|
// OFCS (Optional Fabric Commands Supported) - offset 1804-1805
|
||||
|
// bit 0: Disconnect command supported
|
||||
|
binary.LittleEndian.PutUint16(buf[1804:], 0x01) |
||||
|
|
||||
|
req.c2hData = buf |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// identifyNamespace returns the 4096-byte Identify Namespace data for NSID=1.
|
||||
|
func (c *Controller) identifyNamespace(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
dev := sub.Dev |
||||
|
blockSize := dev.BlockSize() |
||||
|
nsze := dev.VolumeSize() / uint64(blockSize) |
||||
|
|
||||
|
buf := make([]byte, identifySize) |
||||
|
|
||||
|
// NSZE (Namespace Size in blocks) - offset 0-7
|
||||
|
binary.LittleEndian.PutUint64(buf[0:], nsze) |
||||
|
|
||||
|
// NCAP (Namespace Capacity) - offset 8-15
|
||||
|
binary.LittleEndian.PutUint64(buf[8:], nsze) |
||||
|
|
||||
|
// NUSE (Namespace Utilization) - offset 16-23
|
||||
|
binary.LittleEndian.PutUint64(buf[16:], nsze) |
||||
|
|
||||
|
// NSFEAT (Namespace Features) - offset 24
|
||||
|
// bit 0: thin provisioning (supports Trim)
|
||||
|
buf[24] = 0x01 |
||||
|
|
||||
|
// NLBAF (Number of LBA Formats minus 1) - offset 25
|
||||
|
buf[25] = 0 // one format
|
||||
|
|
||||
|
// FLBAS (Formatted LBA Size) - offset 26
|
||||
|
// bits 3:0 = LBA format index (0)
|
||||
|
buf[26] = 0 |
||||
|
|
||||
|
// MC (Metadata Capabilities) - offset 27
|
||||
|
buf[27] = 0 |
||||
|
|
||||
|
// DLFEAT (Deallocate Logical Block Features) - offset 28
|
||||
|
// bit 2: Deallocated blocks return zeros on read
|
||||
|
buf[28] = 0x04 |
||||
|
|
||||
|
// NGUID (Namespace Globally Unique Identifier) - offset 104-119 (16 bytes)
|
||||
|
copy(buf[104:120], sub.NGUID[:]) |
||||
|
|
||||
|
// LBAF[0] (LBA Format 0) - offset 128-131
|
||||
|
// bits 23:16 = LBADS (log2 of block size)
|
||||
|
lbads := uint8(bits.TrailingZeros32(blockSize)) |
||||
|
binary.LittleEndian.PutUint32(buf[128:], uint32(lbads)<<16) |
||||
|
|
||||
|
// ANAGRPID (ANA Group Identifier) - offset 92-95
|
||||
|
binary.LittleEndian.PutUint32(buf[92:], 1) |
||||
|
|
||||
|
req.c2hData = buf |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// identifyActiveNSList returns the list of active namespace IDs (just NSID=1).
|
||||
|
func (c *Controller) identifyActiveNSList(req *Request) error { |
||||
|
buf := make([]byte, identifySize) |
||||
|
// Single namespace: NSID=1
|
||||
|
binary.LittleEndian.PutUint32(buf[0:], 1) |
||||
|
|
||||
|
req.c2hData = buf |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// identifyNSDescriptors returns namespace descriptor list for NSID=1.
|
||||
|
func (c *Controller) identifyNSDescriptors(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
buf := make([]byte, identifySize) |
||||
|
off := 0 |
||||
|
|
||||
|
// NGUID descriptor (type=0x02, length=16)
|
||||
|
buf[off] = 0x02 // NIDT: NGUID
|
||||
|
off++ |
||||
|
buf[off] = 16 // NIDL: 16 bytes
|
||||
|
off++ |
||||
|
off += 2 // reserved
|
||||
|
copy(buf[off:off+16], sub.NGUID[:]) |
||||
|
|
||||
|
req.c2hData = buf |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// copyPadded copies src into dst, padding remaining bytes with spaces.
|
||||
|
func copyPadded(dst []byte, src string) { |
||||
|
n := copy(dst, src) |
||||
|
for i := n; i < len(dst); i++ { |
||||
|
dst[i] = ' ' |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,157 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
// handleRead processes an NVMe Read command.
|
||||
|
func (c *Controller) handleRead(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
dev := sub.Dev |
||||
|
lba := req.capsule.Lba() |
||||
|
nlb := req.capsule.LbaLength() |
||||
|
blockSize := dev.BlockSize() |
||||
|
totalBytes := uint32(nlb) * blockSize |
||||
|
|
||||
|
// Bounds check
|
||||
|
nsze := dev.VolumeSize() / uint64(blockSize) |
||||
|
if lba+uint64(nlb) > nsze { |
||||
|
req.resp.Status = uint16(StatusLBAOutOfRange) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
data, err := dev.ReadAt(lba, totalBytes) |
||||
|
if err != nil { |
||||
|
req.resp.Status = uint16(mapBlockError(err)) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
req.c2hData = data |
||||
|
return c.sendC2HDataAndResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handleWrite processes an NVMe Write command with inline data.
|
||||
|
func (c *Controller) handleWrite(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// Check ANA state (write-gating)
|
||||
|
if !c.isWriteAllowed() { |
||||
|
req.resp.Status = uint16(StatusNSNotReady) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// Inline data must be present (DataOffset != 0 in the received PDU).
|
||||
|
// If DataOffset == 0 for a Write, the host expects R2T flow — reject.
|
||||
|
if len(req.payload) == 0 { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
dev := sub.Dev |
||||
|
lba := req.capsule.Lba() |
||||
|
nlb := req.capsule.LbaLength() |
||||
|
blockSize := dev.BlockSize() |
||||
|
|
||||
|
// Bounds check
|
||||
|
nsze := dev.VolumeSize() / uint64(blockSize) |
||||
|
if lba+uint64(nlb) > nsze { |
||||
|
req.resp.Status = uint16(StatusLBAOutOfRange) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// Validate payload size matches NLB*blockSize.
|
||||
|
expectedBytes := uint32(nlb) * blockSize |
||||
|
if uint32(len(req.payload)) != expectedBytes { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
if err := dev.WriteAt(lba, req.payload); err != nil { |
||||
|
req.resp.Status = uint16(mapBlockError(err)) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handleFlush processes an NVMe Flush command.
|
||||
|
func (c *Controller) handleFlush(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
if !c.isWriteAllowed() { |
||||
|
req.resp.Status = uint16(StatusNSNotReady) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
if err := sub.Dev.SyncCache(); err != nil { |
||||
|
req.resp.Status = uint16(mapBlockError(err)) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// handleWriteZeros processes an NVMe Write Zeroes command.
|
||||
|
func (c *Controller) handleWriteZeros(req *Request) error { |
||||
|
sub := c.subsystem |
||||
|
if sub == nil { |
||||
|
req.resp.Status = uint16(StatusInvalidField) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
if !c.isWriteAllowed() { |
||||
|
req.resp.Status = uint16(StatusNSNotReady) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
dev := sub.Dev |
||||
|
lba := req.capsule.Lba() |
||||
|
nlb := req.capsule.LbaLength() |
||||
|
blockSize := dev.BlockSize() |
||||
|
totalBytes := uint32(nlb) * blockSize |
||||
|
|
||||
|
// Bounds check
|
||||
|
nsze := dev.VolumeSize() / uint64(blockSize) |
||||
|
if lba+uint64(nlb) > nsze { |
||||
|
req.resp.Status = uint16(StatusLBAOutOfRange) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// D12 bit 25: DEALLOC — if set, use Trim instead of writing zeros
|
||||
|
if req.capsule.D12&commandBitDeallocate != 0 { |
||||
|
if err := dev.Trim(lba, totalBytes); err != nil { |
||||
|
req.resp.Status = uint16(mapBlockError(err)) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} else { |
||||
|
zeroBuf := make([]byte, totalBytes) |
||||
|
if err := dev.WriteAt(lba, zeroBuf); err != nil { |
||||
|
req.resp.Status = uint16(mapBlockError(err)) |
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return c.sendResponse(req) |
||||
|
} |
||||
|
|
||||
|
// isWriteAllowed checks if the current ANA state allows writes.
|
||||
|
func (c *Controller) isWriteAllowed() bool { |
||||
|
if c.subsystem == nil { |
||||
|
return false |
||||
|
} |
||||
|
if prov, ok := c.subsystem.Dev.(ANAProvider); ok { |
||||
|
state := prov.ANAState() |
||||
|
return state == anaOptimized || state == anaNonOptimized |
||||
|
} |
||||
|
// No ANA provider: allow if healthy
|
||||
|
return c.subsystem.Dev.IsHealthy() |
||||
|
} |
||||
1541
weed/storage/blockvol/nvme/nvme_qa_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
2377
weed/storage/blockvol/nvme/nvme_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,444 @@ |
|||||
|
// Package nvme implements an NVMe/TCP target for SeaweedFS BlockVol.
|
||||
|
//
|
||||
|
// This package provides a functionally correct NVMe-oF over TCP transport
|
||||
|
// that shares the same BlockVol engine, fencing, replication, and failover
|
||||
|
// as the iSCSI target.
|
||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"encoding/binary" |
||||
|
"fmt" |
||||
|
) |
||||
|
|
||||
|
// ---------- PDU type codes ----------
|
||||
|
|
||||
|
const ( |
||||
|
pduICReq uint8 = 0x0 // Initialization Connection Request
|
||||
|
pduICResp uint8 = 0x1 // Initialization Connection Response
|
||||
|
pduH2CTermReq uint8 = 0x2 // Host-to-Controller Termination Request
|
||||
|
pduC2HTermReq uint8 = 0x3 // Controller-to-Host Termination Request
|
||||
|
pduCapsuleCmd uint8 = 0x4 // NVMe Capsule Command
|
||||
|
pduCapsuleResp uint8 = 0x5 // NVMe Capsule Response
|
||||
|
pduC2HData uint8 = 0x7 // Controller-to-Host Data Transfer
|
||||
|
pduR2T uint8 = 0x9 // Ready-to-Transfer
|
||||
|
) |
||||
|
|
||||
|
// ---------- Admin command opcodes ----------
|
||||
|
|
||||
|
const ( |
||||
|
adminFlush uint8 = 0x00 // NVM Flush (admin context unused here)
|
||||
|
adminGetLogPage uint8 = 0x02 |
||||
|
adminIdentify uint8 = 0x06 |
||||
|
adminAbort uint8 = 0x08 |
||||
|
adminSetFeatures uint8 = 0x09 |
||||
|
adminGetFeatures uint8 = 0x0A |
||||
|
adminAsyncEvent uint8 = 0x0C |
||||
|
adminKeepAlive uint8 = 0x18 |
||||
|
adminFabric uint8 = 0x7F // Fabric-specific commands
|
||||
|
) |
||||
|
|
||||
|
// ---------- IO command opcodes ----------
|
||||
|
|
||||
|
const ( |
||||
|
ioFlush uint8 = 0x00 |
||||
|
ioWrite uint8 = 0x01 |
||||
|
ioRead uint8 = 0x02 |
||||
|
ioWriteZeros uint8 = 0x08 |
||||
|
) |
||||
|
|
||||
|
// ---------- Fabric command types (FCType) ----------
|
||||
|
|
||||
|
const ( |
||||
|
fcPropertySet uint8 = 0x00 |
||||
|
fcConnect uint8 = 0x01 |
||||
|
fcPropertyGet uint8 = 0x04 |
||||
|
fcDisconnect uint8 = 0x08 |
||||
|
) |
||||
|
|
||||
|
// ---------- Feature identifiers ----------
|
||||
|
|
||||
|
const ( |
||||
|
fidNumberOfQueues uint8 = 0x07 |
||||
|
fidAsyncEventConfig uint8 = 0x0B |
||||
|
fidKeepAliveTimer uint8 = 0x0F |
||||
|
) |
||||
|
|
||||
|
// ---------- Identify CNS types ----------
|
||||
|
|
||||
|
const ( |
||||
|
cnsIdentifyNamespace uint8 = 0x00 |
||||
|
cnsIdentifyController uint8 = 0x01 |
||||
|
cnsActiveNSList uint8 = 0x02 |
||||
|
cnsNSDescriptorList uint8 = 0x03 |
||||
|
) |
||||
|
|
||||
|
// ---------- Log page identifiers ----------
|
||||
|
|
||||
|
const ( |
||||
|
logPageError uint8 = 0x01 |
||||
|
logPageSMART uint8 = 0x02 |
||||
|
logPageANA uint8 = 0x0C |
||||
|
) |
||||
|
|
||||
|
// ---------- Property register offsets ----------
|
||||
|
|
||||
|
const ( |
||||
|
propCAP uint32 = 0x00 // Controller Capabilities
|
||||
|
propVS uint32 = 0x08 // Version
|
||||
|
propCC uint32 = 0x14 // Controller Configuration
|
||||
|
propCSTS uint32 = 0x1C // Controller Status
|
||||
|
) |
||||
|
|
||||
|
// ---------- ANA states ----------
|
||||
|
|
||||
|
const ( |
||||
|
anaOptimized uint8 = 0x01 |
||||
|
anaNonOptimized uint8 = 0x02 |
||||
|
anaInaccessible uint8 = 0x03 |
||||
|
anaPersistentLoss uint8 = 0x04 |
||||
|
anaChange uint8 = 0x0F |
||||
|
) |
||||
|
|
||||
|
// ---------- Misc constants ----------
|
||||
|
|
||||
|
const ( |
||||
|
commonHeaderSize = 8 |
||||
|
maxHeaderSize = 128 |
||||
|
maxH2CDataLen = 0x8000 // 32 KB
|
||||
|
|
||||
|
capsuleCmdSize = 64 // CapsuleCommand specific header size (after CommonHeader)
|
||||
|
capsuleRespSize = 16 // CapsuleResponse specific header size
|
||||
|
c2hDataHdrSize = 16 // C2HDataHeader specific header size
|
||||
|
icBodySize = 120 // ICReq/ICResp body size (after CommonHeader)
|
||||
|
connectDataSize = 1024 |
||||
|
|
||||
|
// Total header lengths including CommonHeader
|
||||
|
capsuleCmdHdrLen = commonHeaderSize + capsuleCmdSize // 72
|
||||
|
capsuleRespHdrLen = commonHeaderSize + capsuleRespSize // 24
|
||||
|
c2hDataHdrLen = commonHeaderSize + c2hDataHdrSize // 24
|
||||
|
icHdrLen = commonHeaderSize + icBodySize // 128
|
||||
|
|
||||
|
commandBitDeallocate = 1 << 25 |
||||
|
|
||||
|
nvmeVersion14 uint32 = 0x00010400 // NVMe 1.4
|
||||
|
|
||||
|
// C2HData flags
|
||||
|
c2hFlagLast uint8 = 0x04 |
||||
|
) |
||||
|
|
||||
|
// ---------- CommonHeader (8 bytes) ----------
|
||||
|
|
||||
|
// CommonHeader is the 8-byte preamble of every NVMe/TCP PDU.
|
||||
|
type CommonHeader struct { |
||||
|
Type uint8 |
||||
|
Flags uint8 |
||||
|
HeaderLength uint8 |
||||
|
DataOffset uint8 |
||||
|
DataLength uint32 |
||||
|
} |
||||
|
|
||||
|
func (h *CommonHeader) Marshal(buf []byte) { |
||||
|
buf[0] = h.Type |
||||
|
buf[1] = h.Flags |
||||
|
buf[2] = h.HeaderLength |
||||
|
buf[3] = h.DataOffset |
||||
|
binary.LittleEndian.PutUint32(buf[4:], h.DataLength) |
||||
|
} |
||||
|
|
||||
|
func (h *CommonHeader) Unmarshal(buf []byte) { |
||||
|
h.Type = buf[0] |
||||
|
h.Flags = buf[1] |
||||
|
h.HeaderLength = buf[2] |
||||
|
h.DataOffset = buf[3] |
||||
|
h.DataLength = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
} |
||||
|
|
||||
|
func (h *CommonHeader) String() string { |
||||
|
return fmt.Sprintf("PDU{type=0x%x hlen=%d doff=%d dlen=%d}", |
||||
|
h.Type, h.HeaderLength, h.DataOffset, h.DataLength) |
||||
|
} |
||||
|
|
||||
|
// ---------- PDU interface ----------
|
||||
|
|
||||
|
// PDU is the interface for all NVMe/TCP PDU-specific headers.
|
||||
|
type PDU interface { |
||||
|
Marshal([]byte) |
||||
|
Unmarshal([]byte) |
||||
|
} |
||||
|
|
||||
|
// ---------- ICRequest (120-byte body) ----------
|
||||
|
|
||||
|
// ICRequest is the host-to-controller initialization request.
|
||||
|
type ICRequest struct { |
||||
|
PDUFormatVersion uint16 |
||||
|
PDUDataAlignment uint8 |
||||
|
PDUDataDigest uint8 |
||||
|
PDUMaxR2T uint32 |
||||
|
// remaining 112 bytes reserved
|
||||
|
} |
||||
|
|
||||
|
func (r *ICRequest) Marshal(buf []byte) { |
||||
|
// zero out the full 120-byte body
|
||||
|
for i := range buf[:icBodySize] { |
||||
|
buf[i] = 0 |
||||
|
} |
||||
|
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) |
||||
|
buf[2] = r.PDUDataAlignment |
||||
|
buf[3] = r.PDUDataDigest |
||||
|
binary.LittleEndian.PutUint32(buf[4:], r.PDUMaxR2T) |
||||
|
} |
||||
|
|
||||
|
func (r *ICRequest) Unmarshal(buf []byte) { |
||||
|
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) |
||||
|
r.PDUDataAlignment = buf[2] |
||||
|
r.PDUDataDigest = buf[3] |
||||
|
r.PDUMaxR2T = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
} |
||||
|
|
||||
|
// ---------- ICResponse (120-byte body) ----------
|
||||
|
|
||||
|
// ICResponse is the controller-to-host initialization response.
|
||||
|
type ICResponse struct { |
||||
|
PDUFormatVersion uint16 |
||||
|
PDUDataAlignment uint8 |
||||
|
PDUDataDigest uint8 |
||||
|
MaxH2CDataLength uint32 |
||||
|
// remaining 112 bytes reserved
|
||||
|
} |
||||
|
|
||||
|
func (r *ICResponse) Marshal(buf []byte) { |
||||
|
for i := range buf[:icBodySize] { |
||||
|
buf[i] = 0 |
||||
|
} |
||||
|
binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) |
||||
|
buf[2] = r.PDUDataAlignment |
||||
|
buf[3] = r.PDUDataDigest |
||||
|
binary.LittleEndian.PutUint32(buf[4:], r.MaxH2CDataLength) |
||||
|
} |
||||
|
|
||||
|
func (r *ICResponse) Unmarshal(buf []byte) { |
||||
|
r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) |
||||
|
r.PDUDataAlignment = buf[2] |
||||
|
r.PDUDataDigest = buf[3] |
||||
|
r.MaxH2CDataLength = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
} |
||||
|
|
||||
|
// ---------- CapsuleCommand (64-byte specific header) ----------
|
||||
|
|
||||
|
// CapsuleCommand is the 64-byte NVMe command capsule.
|
||||
|
type CapsuleCommand struct { |
||||
|
OpCode uint8 |
||||
|
PRP uint8 |
||||
|
CID uint16 |
||||
|
FCType uint8 // Fabric command type (only for OpCode=0x7F)
|
||||
|
NSID uint32 // Namespace ID (bytes 4-7 of NVMe SQE after opcode/flags/CID)
|
||||
|
DPTR [16]byte // Data pointer
|
||||
|
D10 uint32 |
||||
|
D11 uint32 |
||||
|
D12 uint32 |
||||
|
D13 uint32 |
||||
|
D14 uint32 |
||||
|
D15 uint32 |
||||
|
} |
||||
|
|
||||
|
// Lba returns the starting LBA from D10:D11 (Read/Write commands).
|
||||
|
func (c *CapsuleCommand) Lba() uint64 { |
||||
|
return uint64(c.D11)<<32 | uint64(c.D10) |
||||
|
} |
||||
|
|
||||
|
// LbaLength returns the number of logical blocks (0-based in D12, actual = D12&0xFFFF + 1).
|
||||
|
func (c *CapsuleCommand) LbaLength() uint32 { |
||||
|
return c.D12&0xFFFF + 1 |
||||
|
} |
||||
|
|
||||
|
func (c *CapsuleCommand) Marshal(buf []byte) { |
||||
|
for i := range buf[:capsuleCmdSize] { |
||||
|
buf[i] = 0 |
||||
|
} |
||||
|
buf[0] = c.OpCode |
||||
|
buf[1] = c.PRP |
||||
|
binary.LittleEndian.PutUint16(buf[2:], c.CID) |
||||
|
// Bytes 4-7: NSID for normal commands, FCType at byte 4 for Fabric (0x7F).
|
||||
|
// They share the same offset per NVMe spec.
|
||||
|
if c.OpCode == adminFabric { |
||||
|
buf[4] = c.FCType |
||||
|
} else { |
||||
|
binary.LittleEndian.PutUint32(buf[4:], c.NSID) |
||||
|
} |
||||
|
copy(buf[24:40], c.DPTR[:]) |
||||
|
binary.LittleEndian.PutUint32(buf[40:], c.D10) |
||||
|
binary.LittleEndian.PutUint32(buf[44:], c.D11) |
||||
|
binary.LittleEndian.PutUint32(buf[48:], c.D12) |
||||
|
binary.LittleEndian.PutUint32(buf[52:], c.D13) |
||||
|
binary.LittleEndian.PutUint32(buf[56:], c.D14) |
||||
|
binary.LittleEndian.PutUint32(buf[60:], c.D15) |
||||
|
} |
||||
|
|
||||
|
func (c *CapsuleCommand) Unmarshal(buf []byte) { |
||||
|
c.OpCode = buf[0] |
||||
|
c.PRP = buf[1] |
||||
|
c.CID = binary.LittleEndian.Uint16(buf[2:]) |
||||
|
c.FCType = buf[4] |
||||
|
c.NSID = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
copy(c.DPTR[:], buf[24:40]) |
||||
|
c.D10 = binary.LittleEndian.Uint32(buf[40:]) |
||||
|
c.D11 = binary.LittleEndian.Uint32(buf[44:]) |
||||
|
c.D12 = binary.LittleEndian.Uint32(buf[48:]) |
||||
|
c.D13 = binary.LittleEndian.Uint32(buf[52:]) |
||||
|
c.D14 = binary.LittleEndian.Uint32(buf[56:]) |
||||
|
c.D15 = binary.LittleEndian.Uint32(buf[60:]) |
||||
|
} |
||||
|
|
||||
|
func (c *CapsuleCommand) String() string { |
||||
|
return fmt.Sprintf("CapsuleCmd{op=0x%02x cid=%d nsid=%d}", c.OpCode, c.CID, c.NSID) |
||||
|
} |
||||
|
|
||||
|
// ---------- CapsuleResponse (16-byte specific header) ----------
|
||||
|
|
||||
|
// CapsuleResponse is the NVMe completion queue entry (16 bytes).
|
||||
|
type CapsuleResponse struct { |
||||
|
DW0 uint32 // Command-specific DWord 0 (also FabricResponse bytes 0-3)
|
||||
|
DW1 uint32 // Command-specific DWord 1 (also FabricResponse bytes 4-7)
|
||||
|
SQHD uint16 // Submission Queue Head Pointer
|
||||
|
QueueID uint16 |
||||
|
CID uint16 |
||||
|
Status uint16 // Status field: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0)
|
||||
|
} |
||||
|
|
||||
|
func (r *CapsuleResponse) Marshal(buf []byte) { |
||||
|
binary.LittleEndian.PutUint32(buf[0:], r.DW0) |
||||
|
binary.LittleEndian.PutUint32(buf[4:], r.DW1) |
||||
|
binary.LittleEndian.PutUint16(buf[8:], r.SQHD) |
||||
|
binary.LittleEndian.PutUint16(buf[10:], r.QueueID) |
||||
|
binary.LittleEndian.PutUint16(buf[12:], r.CID) |
||||
|
binary.LittleEndian.PutUint16(buf[14:], r.Status) |
||||
|
} |
||||
|
|
||||
|
func (r *CapsuleResponse) Unmarshal(buf []byte) { |
||||
|
r.DW0 = binary.LittleEndian.Uint32(buf[0:]) |
||||
|
r.DW1 = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
r.SQHD = binary.LittleEndian.Uint16(buf[8:]) |
||||
|
r.QueueID = binary.LittleEndian.Uint16(buf[10:]) |
||||
|
r.CID = binary.LittleEndian.Uint16(buf[12:]) |
||||
|
r.Status = binary.LittleEndian.Uint16(buf[14:]) |
||||
|
} |
||||
|
|
||||
|
func (r *CapsuleResponse) String() string { |
||||
|
return fmt.Sprintf("CapsuleResp{sqhd=%d qid=%d cid=%d status=0x%04x}", |
||||
|
r.SQHD, r.QueueID, r.CID, r.Status) |
||||
|
} |
||||
|
|
||||
|
// ---------- C2HDataHeader (16-byte specific header) ----------
|
||||
|
|
||||
|
// C2HDataHeader is the controller-to-host data transfer header.
|
||||
|
type C2HDataHeader struct { |
||||
|
CCCID uint16 // Command Capsule CID
|
||||
|
_ uint16 // reserved
|
||||
|
DATAO uint32 // Data offset within the total transfer
|
||||
|
DATAL uint32 // Data length in this PDU
|
||||
|
_pad uint32 // reserved
|
||||
|
} |
||||
|
|
||||
|
func (h *C2HDataHeader) Marshal(buf []byte) { |
||||
|
for i := range buf[:c2hDataHdrSize] { |
||||
|
buf[i] = 0 |
||||
|
} |
||||
|
binary.LittleEndian.PutUint16(buf[0:], h.CCCID) |
||||
|
binary.LittleEndian.PutUint32(buf[4:], h.DATAO) |
||||
|
binary.LittleEndian.PutUint32(buf[8:], h.DATAL) |
||||
|
} |
||||
|
|
||||
|
func (h *C2HDataHeader) Unmarshal(buf []byte) { |
||||
|
h.CCCID = binary.LittleEndian.Uint16(buf[0:]) |
||||
|
h.DATAO = binary.LittleEndian.Uint32(buf[4:]) |
||||
|
h.DATAL = binary.LittleEndian.Uint32(buf[8:]) |
||||
|
} |
||||
|
|
||||
|
// ---------- ConnectData (1024 bytes, payload of Fabric Connect) ----------
|
||||
|
|
||||
|
// ConnectData is the 1024-byte payload sent with a Fabric Connect command.
|
||||
|
type ConnectData struct { |
||||
|
HostID [16]byte // Host UUID
|
||||
|
CNTLID uint16 // Requested controller ID (0xFFFF = new)
|
||||
|
SubNQN string // Subsystem NQN
|
||||
|
HostNQN string // Host NQN
|
||||
|
} |
||||
|
|
||||
|
func (d *ConnectData) Marshal(buf []byte) { |
||||
|
for i := range buf[:connectDataSize] { |
||||
|
buf[i] = 0 |
||||
|
} |
||||
|
copy(buf[0:16], d.HostID[:]) |
||||
|
binary.LittleEndian.PutUint16(buf[16:], d.CNTLID) |
||||
|
copyNQN(buf[256:512], d.SubNQN) |
||||
|
copyNQN(buf[512:768], d.HostNQN) |
||||
|
} |
||||
|
|
||||
|
func (d *ConnectData) Unmarshal(buf []byte) { |
||||
|
copy(d.HostID[:], buf[0:16]) |
||||
|
d.CNTLID = binary.LittleEndian.Uint16(buf[16:]) |
||||
|
d.SubNQN = extractNQN(buf[256:512]) |
||||
|
d.HostNQN = extractNQN(buf[512:768]) |
||||
|
} |
||||
|
|
||||
|
// copyNQN writes a NUL-terminated string into a fixed-size buffer.
|
||||
|
func copyNQN(dst []byte, s string) { |
||||
|
n := copy(dst, s) |
||||
|
if n < len(dst) { |
||||
|
dst[n] = 0 |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// extractNQN reads a NUL-terminated string from a fixed-size buffer.
|
||||
|
func extractNQN(buf []byte) string { |
||||
|
for i, b := range buf { |
||||
|
if b == 0 { |
||||
|
return string(buf[:i]) |
||||
|
} |
||||
|
} |
||||
|
return string(buf) |
||||
|
} |
||||
|
|
||||
|
// ---------- Status word encoding ----------
|
||||
|
|
||||
|
// StatusWord encodes NVMe status: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0)
|
||||
|
//
|
||||
|
// StatusWord = (DNR << 15) | (SCT << 9) | (SC << 1)
|
||||
|
type StatusWord uint16 |
||||
|
|
||||
|
// MakeStatus constructs a status word from SCT, SC, and DNR flag.
|
||||
|
func MakeStatus(sct, sc uint8, dnr bool) StatusWord { |
||||
|
w := uint16(sct)<<9 | uint16(sc)<<1 |
||||
|
if dnr { |
||||
|
w |= 1 << 15 |
||||
|
} |
||||
|
return StatusWord(w) |
||||
|
} |
||||
|
|
||||
|
// StatusSuccess is the zero-value success status.
|
||||
|
const StatusSuccess StatusWord = 0 |
||||
|
|
||||
|
// Pre-defined status words used in the NVMe target.
|
||||
|
var ( |
||||
|
StatusInvalidOpcode = MakeStatus(0, 0x01, true) // Generic: Invalid Command Opcode
|
||||
|
StatusInvalidField = MakeStatus(0, 0x02, true) // Generic: Invalid Field in Command
|
||||
|
StatusInternalError = MakeStatus(0, 0x06, false) // Generic: Internal Error (retryable)
|
||||
|
StatusInternalErrorDNR = MakeStatus(0, 0x06, true) // Generic: Internal Error (permanent)
|
||||
|
StatusNSNotReady = MakeStatus(0, 0x82, false) // Generic: Namespace Not Ready (retryable)
|
||||
|
StatusNSNotReadyDNR = MakeStatus(0, 0x82, true) // Generic: Namespace Not Ready (permanent)
|
||||
|
StatusLBAOutOfRange = MakeStatus(0, 0x80, true) // Generic: LBA Out of Range
|
||||
|
StatusMediaWriteFault = MakeStatus(2, 0x80, false) // Media: Write Fault
|
||||
|
StatusMediaReadError = MakeStatus(2, 0x81, false) // Media: Uncorrectable Read Error
|
||||
|
) |
||||
|
|
||||
|
func (s StatusWord) SCT() uint8 { return uint8((s >> 9) & 0x07) } |
||||
|
func (s StatusWord) SC() uint8 { return uint8((s >> 1) & 0xFF) } |
||||
|
func (s StatusWord) DNR() bool { return s&(1<<15) != 0 } |
||||
|
func (s StatusWord) IsError() bool { return s != StatusSuccess } |
||||
|
|
||||
|
func (s StatusWord) String() string { |
||||
|
if s == StatusSuccess { |
||||
|
return "Success" |
||||
|
} |
||||
|
return fmt.Sprintf("Status{sct=%d sc=0x%02x dnr=%v}", s.SCT(), s.SC(), s.DNR()) |
||||
|
} |
||||
@ -0,0 +1,210 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"fmt" |
||||
|
"log" |
||||
|
"net" |
||||
|
"sync" |
||||
|
"sync/atomic" |
||||
|
"time" |
||||
|
) |
||||
|
|
||||
|
// Config holds NVMe/TCP target configuration.
|
||||
|
type Config struct { |
||||
|
ListenAddr string |
||||
|
NQNPrefix string |
||||
|
MaxH2CDataLength uint32 |
||||
|
MaxIOQueues uint16 |
||||
|
Enabled bool |
||||
|
} |
||||
|
|
||||
|
// DefaultConfig returns the default NVMe target configuration.
|
||||
|
func DefaultConfig() Config { |
||||
|
return Config{ |
||||
|
ListenAddr: "0.0.0.0:4420", |
||||
|
NQNPrefix: "nqn.2024-01.com.seaweedfs:vol.", |
||||
|
MaxH2CDataLength: maxH2CDataLen, |
||||
|
MaxIOQueues: 4, |
||||
|
Enabled: false, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// adminSession stores state from an admin queue connection that IO queue
|
||||
|
// connections need to look up (they arrive on separate TCP connections).
|
||||
|
type adminSession struct { |
||||
|
cntlID uint16 |
||||
|
subsystem *Subsystem |
||||
|
subNQN string |
||||
|
hostNQN string |
||||
|
regCAP uint64 |
||||
|
regCC uint32 |
||||
|
regCSTS uint32 |
||||
|
regVS uint32 |
||||
|
katoMs uint32 |
||||
|
} |
||||
|
|
||||
|
// Server is the NVMe/TCP target server.
|
||||
|
type Server struct { |
||||
|
cfg Config |
||||
|
listener net.Listener |
||||
|
mu sync.RWMutex |
||||
|
subsystems map[string]*Subsystem // NQN → Subsystem
|
||||
|
sessions map[*Controller]struct{} |
||||
|
adminMu sync.RWMutex |
||||
|
admins map[uint16]*adminSession // CNTLID → admin session
|
||||
|
nextCNTLID atomic.Uint32 |
||||
|
closed atomic.Bool |
||||
|
wg sync.WaitGroup |
||||
|
} |
||||
|
|
||||
|
// NewServer creates a new NVMe/TCP target server.
|
||||
|
func NewServer(cfg Config) *Server { |
||||
|
return &Server{ |
||||
|
cfg: cfg, |
||||
|
subsystems: make(map[string]*Subsystem), |
||||
|
sessions: make(map[*Controller]struct{}), |
||||
|
admins: make(map[uint16]*adminSession), |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// AddVolume registers a block device as an NVMe subsystem.
|
||||
|
func (s *Server) AddVolume(nqn string, dev BlockDevice, nguid [16]byte) { |
||||
|
s.mu.Lock() |
||||
|
defer s.mu.Unlock() |
||||
|
s.subsystems[nqn] = &Subsystem{ |
||||
|
NQN: nqn, |
||||
|
Dev: dev, |
||||
|
NGUID: nguid, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// RemoveVolume unregisters an NVMe subsystem.
|
||||
|
func (s *Server) RemoveVolume(nqn string) { |
||||
|
s.mu.Lock() |
||||
|
defer s.mu.Unlock() |
||||
|
delete(s.subsystems, nqn) |
||||
|
} |
||||
|
|
||||
|
// ListenAndServe starts the NVMe/TCP listener.
|
||||
|
// If not enabled, returns nil immediately.
|
||||
|
func (s *Server) ListenAndServe() error { |
||||
|
if !s.cfg.Enabled { |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
ln, err := net.Listen("tcp", s.cfg.ListenAddr) |
||||
|
if err != nil { |
||||
|
return fmt.Errorf("nvme listen %s: %w", s.cfg.ListenAddr, err) |
||||
|
} |
||||
|
s.listener = ln |
||||
|
log.Printf("nvme: listening on %s", s.cfg.ListenAddr) |
||||
|
|
||||
|
s.wg.Add(1) |
||||
|
go func() { |
||||
|
defer s.wg.Done() |
||||
|
s.acceptLoop() |
||||
|
}() |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
func (s *Server) acceptLoop() { |
||||
|
for { |
||||
|
conn, err := s.listener.Accept() |
||||
|
if err != nil { |
||||
|
if s.closed.Load() { |
||||
|
return |
||||
|
} |
||||
|
log.Printf("nvme: accept error: %v", err) |
||||
|
continue |
||||
|
} |
||||
|
|
||||
|
ctrl := newController(conn, s) |
||||
|
s.addSession(ctrl) |
||||
|
|
||||
|
s.wg.Add(1) |
||||
|
go func() { |
||||
|
defer s.wg.Done() |
||||
|
if err := ctrl.Serve(); err != nil { |
||||
|
if !s.closed.Load() { |
||||
|
log.Printf("nvme: session error: %v", err) |
||||
|
} |
||||
|
} |
||||
|
}() |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func (s *Server) addSession(ctrl *Controller) { |
||||
|
s.mu.Lock() |
||||
|
defer s.mu.Unlock() |
||||
|
s.sessions[ctrl] = struct{}{} |
||||
|
} |
||||
|
|
||||
|
func (s *Server) removeSession(ctrl *Controller) { |
||||
|
s.mu.Lock() |
||||
|
defer s.mu.Unlock() |
||||
|
delete(s.sessions, ctrl) |
||||
|
} |
||||
|
|
||||
|
// registerAdmin stores admin queue state so IO queue connections can look it up.
|
||||
|
func (s *Server) registerAdmin(sess *adminSession) { |
||||
|
s.adminMu.Lock() |
||||
|
defer s.adminMu.Unlock() |
||||
|
s.admins[sess.cntlID] = sess |
||||
|
} |
||||
|
|
||||
|
// unregisterAdmin removes an admin session by CNTLID.
|
||||
|
func (s *Server) unregisterAdmin(cntlID uint16) { |
||||
|
s.adminMu.Lock() |
||||
|
defer s.adminMu.Unlock() |
||||
|
delete(s.admins, cntlID) |
||||
|
} |
||||
|
|
||||
|
// lookupAdmin returns the admin session for the given CNTLID.
|
||||
|
func (s *Server) lookupAdmin(cntlID uint16) *adminSession { |
||||
|
s.adminMu.RLock() |
||||
|
defer s.adminMu.RUnlock() |
||||
|
return s.admins[cntlID] |
||||
|
} |
||||
|
|
||||
|
// Close gracefully shuts down the server.
|
||||
|
func (s *Server) Close() error { |
||||
|
if !s.cfg.Enabled { |
||||
|
return nil |
||||
|
} |
||||
|
s.closed.Store(true) |
||||
|
|
||||
|
if s.listener != nil { |
||||
|
s.listener.Close() |
||||
|
} |
||||
|
|
||||
|
// Close all active sessions
|
||||
|
s.mu.RLock() |
||||
|
sessions := make([]*Controller, 0, len(s.sessions)) |
||||
|
for ctrl := range s.sessions { |
||||
|
sessions = append(sessions, ctrl) |
||||
|
} |
||||
|
s.mu.RUnlock() |
||||
|
|
||||
|
for _, ctrl := range sessions { |
||||
|
ctrl.conn.Close() |
||||
|
} |
||||
|
|
||||
|
// Wait with timeout
|
||||
|
done := make(chan struct{}) |
||||
|
go func() { |
||||
|
s.wg.Wait() |
||||
|
close(done) |
||||
|
}() |
||||
|
|
||||
|
select { |
||||
|
case <-done: |
||||
|
case <-time.After(5 * time.Second): |
||||
|
log.Printf("nvme: shutdown timed out after 5s") |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// NQN returns the full NQN for a volume name.
|
||||
|
func (s *Server) NQN(volName string) string { |
||||
|
return s.cfg.NQNPrefix + volName |
||||
|
} |
||||
@ -0,0 +1,202 @@ |
|||||
|
package nvme |
||||
|
|
||||
|
import ( |
||||
|
"bufio" |
||||
|
"encoding/binary" |
||||
|
"fmt" |
||||
|
"io" |
||||
|
) |
||||
|
|
||||
|
// ---------- Reader ----------
|
||||
|
|
||||
|
// Reader decodes NVMe/TCP PDUs from a stream.
|
||||
|
//
|
||||
|
// Usage:
|
||||
|
//
|
||||
|
// hdr, _ := r.Dequeue() // read 8-byte CommonHeader
|
||||
|
// r.Receive(&capsuleCmd) // read remaining specific header
|
||||
|
// if r.Length() > 0 {
|
||||
|
// data := make([]byte, r.Length())
|
||||
|
// r.ReceiveData(data) // read payload
|
||||
|
// }
|
||||
|
type Reader struct { |
||||
|
rd io.Reader |
||||
|
CH CommonHeader |
||||
|
header [maxHeaderSize]byte |
||||
|
} |
||||
|
|
||||
|
// NewReader wraps an io.Reader for NVMe/TCP PDU decoding.
|
||||
|
func NewReader(r io.Reader) *Reader { |
||||
|
return &Reader{rd: r} |
||||
|
} |
||||
|
|
||||
|
// Dequeue reads the 8-byte CommonHeader, validates bounds, and returns it.
|
||||
|
func (r *Reader) Dequeue() (*CommonHeader, error) { |
||||
|
if _, err := io.ReadFull(r.rd, r.header[:commonHeaderSize]); err != nil { |
||||
|
return nil, err |
||||
|
} |
||||
|
r.CH.Unmarshal(r.header[:commonHeaderSize]) |
||||
|
|
||||
|
// Validate header bounds to prevent panics on malformed PDUs.
|
||||
|
if r.CH.HeaderLength < commonHeaderSize { |
||||
|
return nil, fmt.Errorf("nvme: HeaderLength %d < minimum %d", r.CH.HeaderLength, commonHeaderSize) |
||||
|
} |
||||
|
if r.CH.HeaderLength > maxHeaderSize { |
||||
|
return nil, fmt.Errorf("nvme: HeaderLength %d > maximum %d", r.CH.HeaderLength, maxHeaderSize) |
||||
|
} |
||||
|
if r.CH.DataOffset != 0 && r.CH.DataOffset < r.CH.HeaderLength { |
||||
|
return nil, fmt.Errorf("nvme: DataOffset %d < HeaderLength %d", r.CH.DataOffset, r.CH.HeaderLength) |
||||
|
} |
||||
|
if r.CH.DataOffset != 0 && uint32(r.CH.DataOffset) > r.CH.DataLength { |
||||
|
return nil, fmt.Errorf("nvme: DataOffset %d > DataLength %d", r.CH.DataOffset, r.CH.DataLength) |
||||
|
} |
||||
|
if r.CH.DataLength < uint32(r.CH.HeaderLength) { |
||||
|
return nil, fmt.Errorf("nvme: DataLength %d < HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) |
||||
|
} |
||||
|
// DataOffset==0 means no inline data — DataLength must equal HeaderLength,
|
||||
|
// otherwise unconsumed bytes desynchronize the stream.
|
||||
|
if r.CH.DataOffset == 0 && r.CH.DataLength != uint32(r.CH.HeaderLength) { |
||||
|
return nil, fmt.Errorf("nvme: DataOffset=0 but DataLength %d != HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) |
||||
|
} |
||||
|
|
||||
|
return &r.CH, nil |
||||
|
} |
||||
|
|
||||
|
// Receive reads the remaining PDU-specific header (HeaderLength - 8 bytes)
|
||||
|
// and unmarshals it into pdu. It also skips any padding between header and
|
||||
|
// data (DataOffset - HeaderLength bytes).
|
||||
|
func (r *Reader) Receive(pdu PDU) error { |
||||
|
remain := int(r.CH.HeaderLength) - commonHeaderSize |
||||
|
if remain <= 0 { |
||||
|
return nil |
||||
|
} |
||||
|
if _, err := io.ReadFull(r.rd, r.header[commonHeaderSize:r.CH.HeaderLength]); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
pdu.Unmarshal(r.header[commonHeaderSize:r.CH.HeaderLength]) |
||||
|
|
||||
|
// Skip padding between header and data.
|
||||
|
pad := int(r.CH.DataOffset) - int(r.CH.HeaderLength) |
||||
|
if pad > 0 { |
||||
|
if _, err := io.ReadFull(r.rd, make([]byte, pad)); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// Length returns the payload size: DataLength - DataOffset (when DataOffset != 0).
|
||||
|
func (r *Reader) Length() uint32 { |
||||
|
if r.CH.DataOffset != 0 { |
||||
|
return r.CH.DataLength - uint32(r.CH.DataOffset) |
||||
|
} |
||||
|
return 0 |
||||
|
} |
||||
|
|
||||
|
// ReceiveData reads exactly len(buf) bytes of payload data.
|
||||
|
func (r *Reader) ReceiveData(buf []byte) error { |
||||
|
_, err := io.ReadFull(r.rd, buf) |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
// ---------- Writer ----------
|
||||
|
|
||||
|
// Writer encodes NVMe/TCP PDUs to a stream.
|
||||
|
type Writer struct { |
||||
|
wr *bufio.Writer |
||||
|
CH CommonHeader |
||||
|
header [maxHeaderSize]byte |
||||
|
} |
||||
|
|
||||
|
// NewWriter wraps an io.Writer for NVMe/TCP PDU encoding.
|
||||
|
func NewWriter(w io.Writer) *Writer { |
||||
|
return &Writer{wr: bufio.NewWriter(w)} |
||||
|
} |
||||
|
|
||||
|
// PrepareHeaderOnly sets up a header-only PDU (no payload).
|
||||
|
// Call Flush() to write it to the wire.
|
||||
|
func (w *Writer) PrepareHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) { |
||||
|
w.CH.Type = pduType |
||||
|
w.CH.Flags = 0 |
||||
|
w.CH.HeaderLength = commonHeaderSize + specificLen |
||||
|
w.CH.DataOffset = 0 |
||||
|
w.CH.DataLength = uint32(w.CH.HeaderLength) |
||||
|
pdu.Marshal(w.header[commonHeaderSize:]) |
||||
|
} |
||||
|
|
||||
|
// PrepareWithData sets up a PDU with payload data.
|
||||
|
// Call Flush() to write it to the wire.
|
||||
|
func (w *Writer) PrepareWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) { |
||||
|
w.CH.Type = pduType |
||||
|
w.CH.Flags = flags |
||||
|
w.CH.HeaderLength = commonHeaderSize + specificLen |
||||
|
if data != nil { |
||||
|
w.CH.DataOffset = w.CH.HeaderLength |
||||
|
w.CH.DataLength = uint32(w.CH.HeaderLength) + uint32(len(data)) |
||||
|
} else { |
||||
|
w.CH.DataOffset = 0 |
||||
|
w.CH.DataLength = uint32(w.CH.HeaderLength) |
||||
|
} |
||||
|
pdu.Marshal(w.header[commonHeaderSize:]) |
||||
|
} |
||||
|
|
||||
|
// Flush writes the prepared CommonHeader + specific header to the wire.
|
||||
|
// If there was payload data (from PrepareWithData), call FlushData after.
|
||||
|
func (w *Writer) Flush() error { |
||||
|
w.CH.Marshal(w.header[:commonHeaderSize]) |
||||
|
if _, err := w.wr.Write(w.header[:w.CH.HeaderLength]); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// FlushData writes payload data and flushes the underlying buffered writer.
|
||||
|
func (w *Writer) FlushData(data []byte) error { |
||||
|
if len(data) > 0 { |
||||
|
if _, err := w.wr.Write(data); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
} |
||||
|
return w.wr.Flush() |
||||
|
} |
||||
|
|
||||
|
// SendHeaderOnly writes a complete header-only PDU (prepare + flush).
|
||||
|
func (w *Writer) SendHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) error { |
||||
|
w.PrepareHeaderOnly(pduType, pdu, specificLen) |
||||
|
if err := w.Flush(); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
return w.wr.Flush() |
||||
|
} |
||||
|
|
||||
|
// SendWithData writes a complete PDU with payload data.
|
||||
|
func (w *Writer) SendWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) error { |
||||
|
w.PrepareWithData(pduType, flags, pdu, specificLen, data) |
||||
|
if err := w.Flush(); err != nil { |
||||
|
return err |
||||
|
} |
||||
|
return w.FlushData(data) |
||||
|
} |
||||
|
|
||||
|
// writeRaw writes raw bytes directly (used for ConnectData inline in capsule).
|
||||
|
func (w *Writer) writeRaw(data []byte) error { |
||||
|
_, err := w.wr.Write(data) |
||||
|
return err |
||||
|
} |
||||
|
|
||||
|
// flushBuf flushes the underlying buffered writer.
|
||||
|
func (w *Writer) flushBuf() error { |
||||
|
return w.wr.Flush() |
||||
|
} |
||||
|
|
||||
|
// ---------- Helpers ----------
|
||||
|
|
||||
|
// putLE32 writes a uint32 in little-endian.
|
||||
|
func putLE32(buf []byte, v uint32) { |
||||
|
binary.LittleEndian.PutUint32(buf, v) |
||||
|
} |
||||
|
|
||||
|
// putLE64 writes a uint64 in little-endian.
|
||||
|
func putLE64(buf []byte, v uint64) { |
||||
|
binary.LittleEndian.PutUint64(buf, v) |
||||
|
} |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue