diff --git a/weed/command/volume.go b/weed/command/volume.go index 302333651..a5bf8b3e7 100644 --- a/weed/command/volume.go +++ b/weed/command/volume.go @@ -79,6 +79,12 @@ type VolumeServerOptions struct { blockDir *string blockIQNPrefix *string blockPortal *string + // Block volume (NVMe/TCP) options + blockNvmeEnable *bool + blockNvmeListen *string + blockNvmePortal *string + blockNvmeNQNPrefix *string + blockNvmeMaxIOQueues *int } func init() { @@ -123,6 +129,11 @@ func init() { v.blockDir = cmdVolume.Flag.String("block.dir", "", "directory containing .blk block volume files. Empty disables iSCSI block service.") v.blockIQNPrefix = cmdVolume.Flag.String("block.iqn.prefix", "iqn.2024-01.com.seaweedfs:vol.", "IQN prefix for block volume iSCSI targets") v.blockPortal = cmdVolume.Flag.String("block.portal", "", "public iSCSI portal address for SendTargets discovery (e.g. 192.168.1.100:3260,1). Required for Windows clients and Docker deployments.") + v.blockNvmeEnable = cmdVolume.Flag.Bool("block.nvme.enable", false, "enable NVMe/TCP target for block volumes (default off)") + v.blockNvmeListen = cmdVolume.Flag.String("block.nvme.listen", "0.0.0.0:4420", "NVMe/TCP target listen address") + v.blockNvmePortal = cmdVolume.Flag.String("block.nvme.portal", "", "public NVMe/TCP portal address (e.g. 192.168.1.100:4420)") + v.blockNvmeNQNPrefix = cmdVolume.Flag.String("block.nvme.nqnPrefix", "nqn.2024-01.com.seaweedfs:vol.", "NQN prefix for NVMe subsystems") + v.blockNvmeMaxIOQueues = cmdVolume.Flag.Int("block.nvme.maxIOQueues", 4, "maximum NVMe I/O queues per controller (1-128)") } var cmdVolume = &Command{ @@ -323,7 +334,14 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v blockPortal = fmt.Sprintf("%s:%s,1", *v.ip, port) glog.V(0).Infof("block service: auto-derived portal address %s from -ip flag", blockPortal) } - blockService := weed_server.StartBlockService(*v.blockListen, *v.blockDir, *v.blockIQNPrefix, blockPortal) + nvmeCfg := weed_server.NVMeConfig{ + Enabled: *v.blockNvmeEnable, + ListenAddr: *v.blockNvmeListen, + Portal: *v.blockNvmePortal, + NQNPrefix: *v.blockNvmeNQNPrefix, + MaxIOQueues: *v.blockNvmeMaxIOQueues, + } + blockService := weed_server.StartBlockService(*v.blockListen, *v.blockDir, *v.blockIQNPrefix, blockPortal, nvmeCfg) if blockService != nil { volumeServer.SetBlockService(blockService) } diff --git a/weed/server/block_heartbeat_loop_test.go b/weed/server/block_heartbeat_loop_test.go index 93544286b..7af33c6f6 100644 --- a/weed/server/block_heartbeat_loop_test.go +++ b/weed/server/block_heartbeat_loop_test.go @@ -247,7 +247,7 @@ func newTestBlockService(t *testing.T) *BlockService { t.Helper() dir := t.TempDir() createTestBlockVolFile(t, dir, "hb-test.blk") - bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1") + bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1", NVMeConfig{}) if bs == nil { t.Fatal("expected non-nil BlockService") } diff --git a/weed/server/volume_grpc_block_test.go b/weed/server/volume_grpc_block_test.go index d5a7aefa9..055c1f22b 100644 --- a/weed/server/volume_grpc_block_test.go +++ b/weed/server/volume_grpc_block_test.go @@ -12,7 +12,7 @@ func newTestBlockServiceWithDir(t *testing.T) (*BlockService, string) { dir := t.TempDir() blockDir := filepath.Join(dir, "blocks") os.MkdirAll(blockDir, 0755) - bs := StartBlockService("127.0.0.1:0", blockDir, "iqn.2024.test:", "127.0.0.1:3260,1") + bs := StartBlockService("127.0.0.1:0", blockDir, "iqn.2024.test:", "127.0.0.1:3260,1", NVMeConfig{}) if bs == nil { t.Fatal("StartBlockService returned nil") } diff --git a/weed/server/volume_server_block.go b/weed/server/volume_server_block.go index f029f93b4..af1d02a06 100644 --- a/weed/server/volume_server_block.go +++ b/weed/server/volume_server_block.go @@ -13,6 +13,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/iscsi" + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/nvme" ) // volReplState tracks active replication addresses per volume. @@ -21,11 +22,22 @@ type volReplState struct { replicaCtrlAddr string } -// BlockService manages block volumes and the iSCSI target server. +// NVMeConfig holds NVMe/TCP target configuration passed from CLI flags. +type NVMeConfig struct { + Enabled bool + ListenAddr string + Portal string // reserved for heartbeat/CSI integration (CP10-2) + NQNPrefix string + MaxIOQueues int +} + +// BlockService manages block volumes and the iSCSI/NVMe target servers. type BlockService struct { blockStore *storage.BlockVolumeStore targetServer *iscsi.TargetServer + nvmeServer *nvme.Server iqnPrefix string + nqnPrefix string blockDir string listenAddr string @@ -35,9 +47,9 @@ type BlockService struct { } // StartBlockService scans blockDir for .blk files, opens them as block volumes, -// registers them with an iSCSI target server, and starts listening. +// registers them with iSCSI and optionally NVMe target servers, and starts listening. // Returns nil if blockDir is empty (feature disabled). -func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *BlockService { +func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string, nvmeCfg NVMeConfig) *BlockService { if blockDir == "" { return nil } @@ -45,14 +57,20 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc if iqnPrefix == "" { iqnPrefix = "iqn.2024-01.com.seaweedfs:vol." } + nqnPrefix := nvmeCfg.NQNPrefix + if nqnPrefix == "" { + nqnPrefix = "nqn.2024-01.com.seaweedfs:vol." + } bs := &BlockService{ blockStore: storage.NewBlockVolumeStore(), iqnPrefix: iqnPrefix, + nqnPrefix: nqnPrefix, blockDir: blockDir, listenAddr: listenAddr, } + // iSCSI target setup. logger := log.New(os.Stderr, "iscsi: ", log.LstdFlags) config := iscsi.DefaultTargetConfig() @@ -63,6 +81,20 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc bs.targetServer.SetPortalAddr(portalAddr) } + // NVMe/TCP target setup (optional). + if nvmeCfg.Enabled { + maxQ := uint16(4) + if nvmeCfg.MaxIOQueues >= 1 && nvmeCfg.MaxIOQueues <= 128 { + maxQ = uint16(nvmeCfg.MaxIOQueues) + } + bs.nvmeServer = nvme.NewServer(nvme.Config{ + ListenAddr: nvmeCfg.ListenAddr, + NQNPrefix: nqnPrefix, + MaxIOQueues: maxQ, + Enabled: true, + }) + } + // Scan blockDir for .blk files. entries, err := os.ReadDir(blockDir) if err != nil { @@ -101,12 +133,8 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc } } - // Derive IQN from filename: vol1.blk -> iqn.2024-01.com.seaweedfs:vol.vol1 name := strings.TrimSuffix(entry.Name(), ".blk") - iqn := iqnPrefix + blockvol.SanitizeIQN(name) - adapter := blockvol.NewBlockVolAdapter(vol) - bs.targetServer.AddVolume(iqn, adapter) - glog.V(0).Infof("block service: registered %s as %s", path, iqn) + bs.registerVolume(vol, name) } // Start iSCSI target in background. @@ -115,11 +143,36 @@ func StartBlockService(listenAddr, blockDir, iqnPrefix, portalAddr string) *Bloc glog.Warningf("block service: iSCSI target stopped: %v", err) } }() - glog.V(0).Infof("block service: iSCSI target started on %s", listenAddr) + + // Start NVMe/TCP target in background (if enabled). + if bs.nvmeServer != nil { + if err := bs.nvmeServer.ListenAndServe(); err != nil { + glog.Warningf("block service: NVMe/TCP target failed to start: %v (iSCSI continues)", err) + bs.nvmeServer = nil // disable NVMe, iSCSI continues + } else { + glog.V(0).Infof("block service: NVMe/TCP target started on %s", nvmeCfg.ListenAddr) + } + } + return bs } +// registerVolume adds a volume to both iSCSI and NVMe targets. +func (bs *BlockService) registerVolume(vol *blockvol.BlockVol, name string) { + iqn := bs.iqnPrefix + blockvol.SanitizeIQN(name) + adapter := blockvol.NewBlockVolAdapter(vol) + bs.targetServer.AddVolume(iqn, adapter) + + if bs.nvmeServer != nil { + nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name) + nvmeAdapter := nvme.NewNVMeAdapter(vol) + bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID()) + } + + glog.V(0).Infof("block service: registered %s", name) +} + // Store returns the underlying BlockVolumeStore. func (bs *BlockService) Store() *storage.BlockVolumeStore { return bs.blockStore @@ -151,10 +204,15 @@ func (bs *BlockService) CreateBlockVol(name string, sizeBytes uint64, diskType s return "", "", "", fmt.Errorf("block volume %q exists with size %d (requested %d)", name, info.VolumeSize, sizeBytes) } - // Re-add to TargetServer in case it was cleared (crash recovery). + // Re-add to targets in case they were cleared (crash recovery). // AddVolume is idempotent — no-op if already registered. adapter := blockvol.NewBlockVolAdapter(vol) bs.targetServer.AddVolume(iqn, adapter) + if bs.nvmeServer != nil { + nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name) + nvmeAdapter := nvme.NewNVMeAdapter(vol) + bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID()) + } return path, iqn, iscsiAddr, nil } @@ -190,6 +248,13 @@ func (bs *BlockService) CreateBlockVol(name string, sizeBytes uint64, diskType s adapter := blockvol.NewBlockVolAdapter(vol) bs.targetServer.AddVolume(iqn, adapter) + + if bs.nvmeServer != nil { + nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name) + nvmeAdapter := nvme.NewNVMeAdapter(vol) + bs.nvmeServer.AddVolume(nqn, nvmeAdapter, nvmeAdapter.DeviceNGUID()) + } + glog.V(0).Infof("block service: created %s as %s (%d bytes)", path, iqn, sizeBytes) return path, iqn, iscsiAddr, nil } @@ -206,6 +271,12 @@ func (bs *BlockService) DeleteBlockVol(name string) error { bs.targetServer.DisconnectVolume(iqn) } + // Remove from NVMe target. + if bs.nvmeServer != nil { + nqn := bs.nqnPrefix + blockvol.SanitizeIQN(name) + bs.nvmeServer.RemoveVolume(nqn) + } + // Close and unregister. if err := bs.blockStore.RemoveBlockVolume(path); err != nil { // Not found is OK (idempotent). @@ -482,12 +553,15 @@ func (bs *BlockService) ReplicationPorts(volPath string) (dataPort, ctrlPort, re return } -// Shutdown gracefully stops the iSCSI target and closes all block volumes. +// Shutdown gracefully stops the iSCSI and NVMe targets and closes all block volumes. func (bs *BlockService) Shutdown() { if bs == nil { return } glog.V(0).Infof("block service: shutting down...") + if bs.nvmeServer != nil { + bs.nvmeServer.Close() + } if bs.targetServer != nil { bs.targetServer.Close() } diff --git a/weed/server/volume_server_block_test.go b/weed/server/volume_server_block_test.go index ec97784ee..44841cd73 100644 --- a/weed/server/volume_server_block_test.go +++ b/weed/server/volume_server_block_test.go @@ -26,7 +26,7 @@ func createTestBlockVolFile(t *testing.T, dir, name string) string { func TestBlockServiceDisabledByDefault(t *testing.T) { // Empty blockDir means feature is disabled. - bs := StartBlockService("0.0.0.0:3260", "", "", "") + bs := StartBlockService("0.0.0.0:3260", "", "", "", NVMeConfig{}) if bs != nil { bs.Shutdown() t.Fatal("expected nil BlockService when blockDir is empty") @@ -41,7 +41,7 @@ func TestBlockServiceStartAndShutdown(t *testing.T) { dir := t.TempDir() createTestBlockVolFile(t, dir, "testvol.blk") - bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1") + bs := StartBlockService("127.0.0.1:0", dir, "iqn.2024-01.com.test:vol.", "127.0.0.1:3260,1", NVMeConfig{}) if bs == nil { t.Fatal("expected non-nil BlockService") } diff --git a/weed/storage/blockvol/nvme/adapter.go b/weed/storage/blockvol/nvme/adapter.go new file mode 100644 index 000000000..8edabbfd3 --- /dev/null +++ b/weed/storage/blockvol/nvme/adapter.go @@ -0,0 +1,127 @@ +package nvme + +import ( + "errors" + "strings" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockerr" +) + +// NVMeAdapter wraps a *BlockVol to implement BlockDevice and ANAProvider +// for the NVMe/TCP target, bridging the BlockVol storage engine to NVMe +// command handling. +type NVMeAdapter struct { + Vol *blockvol.BlockVol +} + +// NewNVMeAdapter creates a BlockDevice adapter for the given BlockVol. +func NewNVMeAdapter(vol *blockvol.BlockVol) *NVMeAdapter { + return &NVMeAdapter{Vol: vol} +} + +func (a *NVMeAdapter) ReadAt(lba uint64, length uint32) ([]byte, error) { + return a.Vol.ReadLBA(lba, length) +} + +func (a *NVMeAdapter) WriteAt(lba uint64, data []byte) error { + return a.Vol.WriteLBA(lba, data) +} + +func (a *NVMeAdapter) Trim(lba uint64, length uint32) error { + return a.Vol.Trim(lba, length) +} + +func (a *NVMeAdapter) SyncCache() error { + return a.Vol.SyncCache() +} + +func (a *NVMeAdapter) BlockSize() uint32 { + return a.Vol.Info().BlockSize +} + +func (a *NVMeAdapter) VolumeSize() uint64 { + return a.Vol.Info().VolumeSize +} + +func (a *NVMeAdapter) IsHealthy() bool { + return a.Vol.Info().Healthy +} + +// ANAState returns the ANA state based on the volume's role. +func (a *NVMeAdapter) ANAState() uint8 { + return RoleToANAState(a.Vol.Role()) +} + +// ANAGroupID returns the ANA group ID (always 1 for single-group MVP). +func (a *NVMeAdapter) ANAGroupID() uint16 { return 1 } + +// DeviceNGUID returns a 16-byte NGUID derived from the volume UUID. +func (a *NVMeAdapter) DeviceNGUID() [16]byte { + return UUIDToNGUID(a.Vol.Info().UUID) +} + +// Compile-time checks. +var _ BlockDevice = (*NVMeAdapter)(nil) +var _ ANAProvider = (*NVMeAdapter)(nil) + +// RoleToANAState maps a BlockVol Role to an NVMe ANA state. +func RoleToANAState(r blockvol.Role) uint8 { + switch r { + case blockvol.RolePrimary, blockvol.RoleNone: + return anaOptimized + case blockvol.RoleReplica: + return anaInaccessible + case blockvol.RoleStale: + return anaPersistentLoss + case blockvol.RoleRebuilding, blockvol.RoleDraining: + return anaInaccessible + default: + return anaInaccessible + } +} + +// UUIDToNGUID converts a 16-byte UUID to a 16-byte NGUID. +// Uses NAA-6 pattern for first 8 bytes (compatible with iSCSI UUIDToNAA), +// copies remaining bytes as-is. +func UUIDToNGUID(uuid [16]byte) [16]byte { + var nguid [16]byte + nguid[0] = 0x60 | (uuid[0] & 0x0F) + copy(nguid[1:8], uuid[1:8]) + copy(nguid[8:16], uuid[8:16]) + return nguid +} + +// mapBlockError maps BlockVol errors to NVMe status words. +func mapBlockError(err error) StatusWord { + if err == nil { + return StatusSuccess + } + + // Check known sentinel errors from blockvol and blockerr packages. + switch { + case errors.Is(err, blockvol.ErrLeaseExpired): + return StatusNSNotReadyDNR // DNR=1: fencing is permanent + case errors.Is(err, blockvol.ErrEpochRegression): + return StatusInternalErrorDNR // DNR=1: stale controller + case errors.Is(err, blockerr.ErrDurabilityBarrierFailed): + return StatusInternalError // DNR=0: replica may recover + case errors.Is(err, blockerr.ErrDurabilityQuorumLost): + return StatusInternalError // DNR=0: quorum may heal + case errors.Is(err, blockvol.ErrWALFull): + return StatusNSNotReady // DNR=0: transient pressure + case errors.Is(err, blockvol.ErrNotPrimary): + return StatusNSNotReady // DNR=0: may be transitioning + } + + // Heuristic for I/O errors (no dedicated sentinels yet). + msg := err.Error() + if strings.Contains(msg, "write") || strings.Contains(msg, "Write") { + return StatusMediaWriteFault + } + if strings.Contains(msg, "read") || strings.Contains(msg, "Read") { + return StatusMediaReadError + } + + return StatusInternalError +} diff --git a/weed/storage/blockvol/nvme/admin.go b/weed/storage/blockvol/nvme/admin.go new file mode 100644 index 000000000..82505e31a --- /dev/null +++ b/weed/storage/blockvol/nvme/admin.go @@ -0,0 +1,198 @@ +package nvme + +import ( + "encoding/binary" +) + +// handleSetFeatures processes SetFeatures admin commands. +func (c *Controller) handleSetFeatures(req *Request) error { + fid := uint8(req.capsule.D10 & 0xFF) + + switch fid { + case fidNumberOfQueues: + // D11: NCQR[15:0] | NSQR[31:16] + ncqr := uint16(req.capsule.D11 & 0xFFFF) + nsqr := uint16(req.capsule.D11 >> 16) + + // Grant min(requested, max) + if ncqr > c.maxIOQueues { + ncqr = c.maxIOQueues + } + if nsqr > c.maxIOQueues { + nsqr = c.maxIOQueues + } + if ncqr == 0 { + ncqr = 1 + } + if nsqr == 0 { + nsqr = 1 + } + c.grantedQueues = ncqr + + // Response DW0: (NCQR-1) | ((NSQR-1) << 16) + req.resp.DW0 = uint32(ncqr-1) | (uint32(nsqr-1) << 16) + return c.sendResponse(req) + + case fidKeepAliveTimer: + // D11 contains KATO in milliseconds + c.katoMs = req.capsule.D11 + return c.sendResponse(req) + + case fidAsyncEventConfig: + // Stub: accept but don't deliver events + return c.sendResponse(req) + + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } +} + +// handleGetFeatures returns stored feature values. +func (c *Controller) handleGetFeatures(req *Request) error { + fid := uint8(req.capsule.D10 & 0xFF) + + switch fid { + case fidNumberOfQueues: + n := c.grantedQueues + if n == 0 { + n = c.maxIOQueues + } + req.resp.DW0 = uint32(n-1) | (uint32(n-1) << 16) + return c.sendResponse(req) + + case fidKeepAliveTimer: + req.resp.DW0 = c.katoMs + return c.sendResponse(req) + + case fidAsyncEventConfig: + req.resp.DW0 = 0 + return c.sendResponse(req) + + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } +} + +// handleGetLogPage returns log page data. +func (c *Controller) handleGetLogPage(req *Request) error { + // D10 bits 7:0 = Log Page Identifier + // D10 bits 27:16 and D11 bits 15:0 = Number of Dwords (NUMD) + lid := uint8(req.capsule.D10 & 0xFF) + numdl := (req.capsule.D10 >> 16) & 0xFFF + numdu := req.capsule.D11 & 0xFFFF + numd := uint32(numdu)<<16 | uint32(numdl) + length := (numd + 1) * 4 // NUMD is 0-based, in dwords + + switch lid { + case logPageError: + return c.logPageError(req, length) + case logPageSMART: + return c.logPageSMART(req, length) + case logPageANA: + return c.logPageANA(req, length) + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } +} + +// logPageError returns an empty error log page. +func (c *Controller) logPageError(req *Request, length uint32) error { + if length > 64 { + length = 64 + } + req.c2hData = make([]byte, length) + return c.sendC2HDataAndResponse(req) +} + +// logPageSMART returns a 512-byte SMART/Health log. +func (c *Controller) logPageSMART(req *Request, length uint32) error { + if length > 512 { + length = 512 + } + buf := make([]byte, 512) + + // Critical Warning - offset 0: 0 = no warnings + buf[0] = 0 + + // Composite Temperature - offset 1-2: 0 (not implemented) + binary.LittleEndian.PutUint16(buf[1:], 0) + + // Available Spare - offset 3: 100% + buf[3] = 100 + + // Available Spare Threshold - offset 4: 10% + buf[4] = 10 + + // Percentage Used - offset 5: 0% + buf[5] = 0 + + req.c2hData = buf[:length] + return c.sendC2HDataAndResponse(req) +} + +// logPageANA returns the ANA log page with a single group. +func (c *Controller) logPageANA(req *Request, length uint32) error { + // ANA log page format (32 bytes for single group): + // [0:8] CHGCNT (uint64) + // [8:10] NGRPS = 1 (uint16) + // [10:16] reserved + // Group descriptor: + // [16:20] ANAGRPID = 1 (uint32) + // [20:24] NNSID = 1 (uint32) + // [24:32] Change Count (uint64) + // [32] ANA State + // [33:36] reserved + // [36:40] NSID = 1 (uint32) + const anaLogSize = 40 + + buf := make([]byte, anaLogSize) + + // CHGCNT + binary.LittleEndian.PutUint64(buf[0:], c.anaChangeCount()) + + // NGRPS + binary.LittleEndian.PutUint16(buf[8:], 1) + + // Group descriptor + binary.LittleEndian.PutUint32(buf[16:], 1) // ANAGRPID=1 + binary.LittleEndian.PutUint32(buf[20:], 1) // NNSID=1 + binary.LittleEndian.PutUint64(buf[24:], c.anaChangeCount()) // chgcnt + buf[32] = c.anaState() // ANA state + binary.LittleEndian.PutUint32(buf[36:], 1) // NSID=1 + + if length > anaLogSize { + length = anaLogSize + } + req.c2hData = buf[:length] + return c.sendC2HDataAndResponse(req) +} + +// anaState returns the current ANA state based on the subsystem's device. +func (c *Controller) anaState() uint8 { + if c.subsystem == nil { + return anaInaccessible + } + if prov, ok := c.subsystem.Dev.(ANAProvider); ok { + return prov.ANAState() + } + // Default: if healthy → optimized + if c.subsystem.Dev.IsHealthy() { + return anaOptimized + } + return anaInaccessible +} + +// anaChangeCount returns a monotonic ANA change counter. +// For MVP, we use 1 as a constant (no dynamic role changes tracked). +func (c *Controller) anaChangeCount() uint64 { + return 1 +} + +// handleKeepAlive resets the KATO timer and returns success. +func (c *Controller) handleKeepAlive(req *Request) error { + c.resetKATO() + return c.sendResponse(req) +} diff --git a/weed/storage/blockvol/nvme/controller.go b/weed/storage/blockvol/nvme/controller.go new file mode 100644 index 000000000..1e4d4ae4f --- /dev/null +++ b/weed/storage/blockvol/nvme/controller.go @@ -0,0 +1,354 @@ +package nvme + +import ( + "fmt" + "io" + "log" + "net" + "sync" + "sync/atomic" + "time" +) + +// controllerState tracks the lifecycle of an NVMe controller session. +type controllerState int + +const ( + stateConnected controllerState = iota // TCP connected, no IC yet + stateICComplete // IC exchange done + stateAdminReady // Admin queue connected + stateCtrlReady // CC.EN=1, CSTS.RDY=1 + stateIOActive // IO queues active + stateClosed // Shut down +) + +// Request represents an in-flight NVMe command being processed. +type Request struct { + capsule CapsuleCommand + payload []byte // inline data from host (Write commands) + resp CapsuleResponse + c2hData []byte // data to send to host (Read commands) + status StatusWord +} + +// Controller handles one NVMe/TCP connection (one queue per connection). +type Controller struct { + mu sync.Mutex + + // Session identity + conn net.Conn + in *Reader + out *Writer + state controllerState + closed atomic.Bool + + // Queue state (one queue per TCP connection) + queueID uint16 + queueSize uint16 + sqhd uint16 // Submission Queue Head pointer + flowCtlOff bool // CATTR bit2: SQ flow control disabled + + // Controller identity + cntlID uint16 + subNQN string + + // Controller registers + regCAP uint64 // Controller Capabilities + regCC uint32 // Controller Configuration (set by host via PropertySet) + regCSTS uint32 // Controller Status (RDY bit) + regVS uint32 // Version + + // KeepAlive + katoMs uint32 + katoTimer *time.Timer + katoMu sync.Mutex + + // Async completion (IO queues) + waiting chan *Request // pre-allocated request pool + completions chan *Request // completed requests to send + + // Backend + subsystem *Subsystem + server *Server + + // Features + maxIOQueues uint16 + grantedQueues uint16 + isAdmin bool // true if this controller owns admin queue (QID=0) + + // Lifecycle + wg sync.WaitGroup + closeOnce sync.Once +} + +// newController creates a controller for the given connection. +func newController(conn net.Conn, server *Server) *Controller { + c := &Controller{ + conn: conn, + in: NewReader(conn), + out: NewWriter(conn), + state: stateConnected, + server: server, + regVS: nvmeVersion14, + // CAP register: MQES=63 (bits 15:0), CQR=1 (bit 16), TO=30 (bits 31:24, *500ms=15s), CSS bit37=1 (NVM command set) + regCAP: uint64(63) | (1 << 16) | (uint64(30) << 24) | (1 << 37), + maxIOQueues: server.cfg.MaxIOQueues, + } + return c +} + +// Serve is the main event loop for this controller connection. +func (c *Controller) Serve() error { + defer c.shutdown() + + // IC handshake timeout + if err := c.conn.SetReadDeadline(time.Now().Add(10 * time.Second)); err != nil { + return err + } + + for { + if c.closed.Load() { + return nil + } + + hdr, err := c.in.Dequeue() + if err != nil { + if err == io.EOF || c.closed.Load() { + return nil + } + return fmt.Errorf("read header: %w", err) + } + + switch hdr.Type { + case pduICReq: + if err := c.handleIC(); err != nil { + return fmt.Errorf("IC handshake: %w", err) + } + // Clear read deadline after successful IC + if err := c.conn.SetReadDeadline(time.Time{}); err != nil { + return err + } + + case pduCapsuleCmd: + if err := c.handleCapsule(); err != nil { + return fmt.Errorf("capsule: %w", err) + } + + case pduH2CTermReq: + return nil // host terminated + + default: + return fmt.Errorf("unexpected PDU type: 0x%x", hdr.Type) + } + } +} + +// handleIC processes the IC handshake. +func (c *Controller) handleIC() error { + var req ICRequest + if err := c.in.Receive(&req); err != nil { + return err + } + + resp := ICResponse{ + PDUFormatVersion: 0, + MaxH2CDataLength: maxH2CDataLen, + } + if err := c.out.SendHeaderOnly(pduICResp, &resp, icBodySize); err != nil { + return err + } + + c.state = stateICComplete + return nil +} + +// handleCapsule dispatches a CapsuleCmd PDU. +func (c *Controller) handleCapsule() error { + // Reject capsule commands before IC handshake is complete. + if c.state < stateICComplete { + return fmt.Errorf("capsule command before IC handshake") + } + + var capsule CapsuleCommand + if err := c.in.Receive(&capsule); err != nil { + return err + } + + // Read optional inline data + var payload []byte + if dataLen := c.in.Length(); dataLen > 0 { + payload = make([]byte, dataLen) + if err := c.in.ReceiveData(payload); err != nil { + return err + } + } + + // Advance SQHD + c.sqhd++ + if c.sqhd >= c.queueSize && c.queueSize > 0 { + c.sqhd = 0 + } + + req := &Request{ + capsule: capsule, + payload: payload, + } + req.resp.CID = capsule.CID + req.resp.QueueID = c.queueID + // SQHD is set in sendResponse/sendC2HDataAndResponse using the + // latest c.flowCtlOff value, so Connect responses correctly get + // SQHD=0xFFFF when the host requests flowCtlOff via CATTR. + req.resp.Status = uint16(StatusSuccess) + + if c.queueID == 0 { + return c.dispatchAdmin(req) + } + return c.dispatchIO(req) +} + +// dispatchAdmin handles admin queue commands synchronously. +func (c *Controller) dispatchAdmin(req *Request) error { + capsule := &req.capsule + + if capsule.OpCode == adminFabric { + return c.handleFabricCommand(req) + } + + switch capsule.OpCode { + case adminIdentify: + return c.handleIdentify(req) + case adminSetFeatures: + return c.handleSetFeatures(req) + case adminGetFeatures: + return c.handleGetFeatures(req) + case adminGetLogPage: + return c.handleGetLogPage(req) + case adminKeepAlive: + return c.handleKeepAlive(req) + case adminAsyncEvent: + // Stub: just succeed (don't deliver events in CP10-1) + return c.sendResponse(req) + default: + req.resp.Status = uint16(StatusInvalidOpcode) + return c.sendResponse(req) + } +} + +// dispatchIO handles IO queue commands. +func (c *Controller) dispatchIO(req *Request) error { + capsule := &req.capsule + + switch capsule.OpCode { + case ioRead: + return c.handleRead(req) + case ioWrite: + return c.handleWrite(req) + case ioFlush: + return c.handleFlush(req) + case ioWriteZeros: + return c.handleWriteZeros(req) + default: + req.resp.Status = uint16(StatusInvalidOpcode) + return c.sendResponse(req) + } +} + +// sendC2HDataAndResponse sends C2HData PDUs followed by a CapsuleResp. +func (c *Controller) sendC2HDataAndResponse(req *Request) error { + if len(req.c2hData) > 0 { + data := req.c2hData + offset := uint32(0) + chunkSize := uint32(maxH2CDataLen) + + for offset < uint32(len(data)) { + end := offset + chunkSize + if end > uint32(len(data)) { + end = uint32(len(data)) + } + chunk := data[offset:end] + + hdr := C2HDataHeader{ + CCCID: req.capsule.CID, + DATAO: offset, + DATAL: uint32(len(chunk)), + } + + flags := uint8(0) + if end >= uint32(len(data)) { + flags = c2hFlagLast + } + + if err := c.out.SendWithData(pduC2HData, flags, &hdr, c2hDataHdrSize, chunk); err != nil { + return err + } + offset = end + } + } + + return c.sendResponse(req) +} + +// sendResponse sends a CapsuleResp PDU. +// SQHD is set here (not in handleCapsule) so that flowCtlOff changes +// made during command dispatch (e.g. Fabric Connect) take effect +// on the same response. +func (c *Controller) sendResponse(req *Request) error { + if c.flowCtlOff { + req.resp.SQHD = 0xFFFF + } else { + req.resp.SQHD = c.sqhd + } + c.resetKATO() + return c.out.SendHeaderOnly(pduCapsuleResp, &req.resp, capsuleRespSize) +} + +// ---------- KATO management ---------- + +func (c *Controller) startKATO() { + c.katoMu.Lock() + defer c.katoMu.Unlock() + if c.katoMs == 0 { + return + } + d := time.Duration(c.katoMs) * time.Millisecond + // Add 50% margin per spec recommendation + d = d + d/2 + c.katoTimer = time.AfterFunc(d, func() { + log.Printf("nvme: KATO expired for cntlid=%d, closing connection", c.cntlID) + c.conn.Close() + }) +} + +func (c *Controller) resetKATO() { + c.katoMu.Lock() + defer c.katoMu.Unlock() + if c.katoTimer != nil { + c.katoTimer.Reset(time.Duration(c.katoMs)*time.Millisecond + time.Duration(c.katoMs)*time.Millisecond/2) + } +} + +func (c *Controller) stopKATO() { + c.katoMu.Lock() + defer c.katoMu.Unlock() + if c.katoTimer != nil { + c.katoTimer.Stop() + c.katoTimer = nil + } +} + +// ---------- Lifecycle ---------- + +func (c *Controller) shutdown() { + c.closeOnce.Do(func() { + c.closed.Store(true) + c.stopKATO() + c.state = stateClosed + c.conn.Close() + if c.server != nil { + if c.isAdmin && c.cntlID != 0 { + c.server.unregisterAdmin(c.cntlID) + } + c.server.removeSession(c) + } + }) +} diff --git a/weed/storage/blockvol/nvme/fabric.go b/weed/storage/blockvol/nvme/fabric.go new file mode 100644 index 000000000..ef6f36110 --- /dev/null +++ b/weed/storage/blockvol/nvme/fabric.go @@ -0,0 +1,300 @@ +package nvme + +import ( + "encoding/binary" +) + +// handleFabricCommand dispatches Fabric-specific commands by FCType. +func (c *Controller) handleFabricCommand(req *Request) error { + switch req.capsule.FCType { + case fcConnect: + return c.handleConnect(req) + case fcPropertyGet: + return c.handlePropertyGet(req) + case fcPropertySet: + return c.handlePropertySet(req) + case fcDisconnect: + return c.handleDisconnect(req) + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } +} + +// handleConnect processes a Fabric Connect command. +func (c *Controller) handleConnect(req *Request) error { + capsule := &req.capsule + + // Parse QueueID, QueueSize, KATO, CATTR from capsule dwords. + // Connect command layout (CDW10-CDW12): + // CDW10[15:0]=RECFM, CDW10[31:16]=QID + // CDW11[15:0]=SQSIZE, CDW11[23:16]=CATTR + // CDW12=KATO + queueID := uint16(capsule.D10 >> 16) + queueSize := uint16(capsule.D11&0xFFFF) + 1 // SQSIZE is 0-based + cattr := uint8(capsule.D11 >> 16) + kato := capsule.D12 + + // Parse ConnectData from payload + if len(req.payload) < connectDataSize { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + var cd ConnectData + cd.Unmarshal(req.payload) + + if queueID == 0 { + // Admin queue connect + sub := c.server.findSubsystem(cd.SubNQN) + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + c.subsystem = sub + c.subNQN = cd.SubNQN + c.queueID = 0 + c.queueSize = queueSize + c.cntlID = c.server.allocCNTLID() + c.katoMs = kato + c.flowCtlOff = (cattr & 0x04) != 0 + c.state = stateAdminReady + c.isAdmin = true + + // Register admin session so IO queue connections can find us. + c.server.registerAdmin(&adminSession{ + cntlID: c.cntlID, + subsystem: sub, + subNQN: cd.SubNQN, + hostNQN: cd.HostNQN, + regCAP: c.regCAP, + regCC: c.regCC, + regCSTS: c.regCSTS, + regVS: c.regVS, + katoMs: kato, + }) + + // Return CNTLID in DW0 + req.resp.DW0 = uint32(c.cntlID) + return c.sendResponse(req) + } + + // IO queue connect — look up admin session from server registry. + // IO queues arrive on separate TCP connections with fresh Controllers, + // so we must find the admin session by CNTLID from the server. + admin := c.server.lookupAdmin(cd.CNTLID) + if admin == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + // Validate SubNQN and HostNQN match the admin session. + if cd.SubNQN != admin.subNQN { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + if cd.HostNQN != admin.hostNQN { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + c.cntlID = cd.CNTLID + c.subsystem = admin.subsystem + c.subNQN = admin.subNQN + c.queueID = queueID + c.queueSize = queueSize + c.flowCtlOff = (cattr & 0x04) != 0 + c.state = stateIOActive + + req.resp.DW0 = uint32(c.cntlID) + return c.sendResponse(req) +} + +// handlePropertyGet returns a controller register value. +func (c *Controller) handlePropertyGet(req *Request) error { + // Property offset in D10 (bits 31:0, but only lower bits used) + offset := req.capsule.D10 + // Attrib in D11 bit 0: 0=4byte, 1=8byte + size8 := (req.capsule.D11 & 1) != 0 + + var val uint64 + switch offset { + case propCAP: + val = c.regCAP + case propVS: + val = uint64(c.regVS) + case propCC: + val = uint64(c.regCC) + case propCSTS: + val = uint64(c.regCSTS) + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + if size8 { + // 8-byte value in DW0+DW1 + req.resp.DW0 = uint32(val) + req.resp.DW1 = uint32(val >> 32) + } else { + req.resp.DW0 = uint32(val) + } + return c.sendResponse(req) +} + +// handlePropertySet handles controller register writes. +func (c *Controller) handlePropertySet(req *Request) error { + offset := req.capsule.D10 + value := uint64(req.capsule.D14) | uint64(req.capsule.D15)<<32 + + switch offset { + case propCC: + c.regCC = uint32(value) + // Check CC.EN (bit 0) + if c.regCC&1 != 0 { + c.regCSTS |= 1 // Set CSTS.RDY + c.state = stateCtrlReady + if c.katoMs > 0 { + c.startKATO() + } + } else { + c.regCSTS &^= 1 // Clear CSTS.RDY + } + default: + // Ignore writes to other registers + } + return c.sendResponse(req) +} + +// handleDisconnect processes a Fabric Disconnect. +func (c *Controller) handleDisconnect(req *Request) error { + if err := c.sendResponse(req); err != nil { + return err + } + c.shutdown() + return nil +} + +// ---------- Subsystem ---------- + +// Subsystem represents an NVMe subsystem backed by a BlockDevice. +type Subsystem struct { + NQN string + Dev BlockDevice + NGUID [16]byte // Namespace GUID +} + +// BlockDevice is the interface for the underlying storage. +// This is the same as iscsi.BlockDevice. +type BlockDevice interface { + ReadAt(lba uint64, length uint32) ([]byte, error) + WriteAt(lba uint64, data []byte) error + Trim(lba uint64, length uint32) error + SyncCache() error + BlockSize() uint32 + VolumeSize() uint64 + IsHealthy() bool +} + +// ANAProvider extends BlockDevice with ANA state reporting. +type ANAProvider interface { + ANAState() uint8 + ANAGroupID() uint16 + DeviceNGUID() [16]byte +} + +// allocCNTLID allocates a new controller ID from the server. +func (s *Server) allocCNTLID() uint16 { + return uint16(s.nextCNTLID.Add(1)) +} + +// findSubsystem looks up a subsystem by NQN. +func (s *Server) findSubsystem(nqn string) *Subsystem { + s.mu.RLock() + defer s.mu.RUnlock() + sub, ok := s.subsystems[nqn] + if !ok { + return nil + } + return sub +} + +// ---------- ConnectData field access helpers ---------- + +// connectQueueID extracts the QueueID from a Connect capsule D10. +func connectQueueID(capsule *CapsuleCommand) uint16 { + return uint16(capsule.D10 >> 16) +} + +// connectQueueSize extracts the QueueSize from a Connect capsule D11 (0-based → +1). +func connectQueueSize(capsule *CapsuleCommand) uint16 { + return uint16(capsule.D11&0xFFFF) + 1 +} + +// connectKATO extracts the KeepAlive timeout from a Connect capsule D12. +func connectKATO(capsule *CapsuleCommand) uint32 { + return capsule.D12 +} + +// PropertySet value extraction: the go-nvme reference puts value in D12/D13, +// but NVMe spec actually uses CDW14/CDW15 for PropertySet. We handle both. +func propertySetValue(capsule *CapsuleCommand) uint64 { + return uint64(capsule.D14) | uint64(capsule.D15)<<32 +} + +// propertyGetSize returns true if the PropertyGet requests an 8-byte value. +func propertyGetSize8(capsule *CapsuleCommand) bool { + return (capsule.D11 & 1) != 0 +} + +// propertyGetOffset returns the register offset for PropertyGet. +func propertyGetOffset(capsule *CapsuleCommand) uint32 { + return capsule.D10 +} + +// ---------- ConnectData marshal helpers for tests ---------- + +func marshalConnectData(cd *ConnectData) []byte { + buf := make([]byte, connectDataSize) + cd.Marshal(buf) + return buf +} + +func makeConnectCapsule(queueID, queueSize uint16, kato uint32, fcType uint8) CapsuleCommand { + return CapsuleCommand{ + OpCode: adminFabric, + FCType: fcType, + D10: uint32(queueID) << 16, + D11: uint32(queueSize - 1), // 0-based + D12: kato, + } +} + +// makePropertyGetCapsule creates a PropertyGet capsule for the given register offset. +func makePropertyGetCapsule(offset uint32, size8 bool) CapsuleCommand { + c := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertyGet, + D10: offset, + } + if size8 { + c.D11 = 1 + } + return c +} + +// makePropertySetCapsule creates a PropertySet capsule. +func makePropertySetCapsule(offset uint32, value uint64) CapsuleCommand { + return CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertySet, + D10: offset, + D14: uint32(value), + D15: uint32(value >> 32), + } +} + +// putCNTLID stores the controller ID in ConnectData at offset 16. +func putCNTLID(buf []byte, cntlid uint16) { + binary.LittleEndian.PutUint16(buf[16:], cntlid) +} diff --git a/weed/storage/blockvol/nvme/identify.go b/weed/storage/blockvol/nvme/identify.go new file mode 100644 index 000000000..d245ea9c1 --- /dev/null +++ b/weed/storage/blockvol/nvme/identify.go @@ -0,0 +1,250 @@ +package nvme + +import ( + "encoding/binary" + "math/bits" +) + +const identifySize = 4096 + +// handleIdentify dispatches Identify commands by CNS type. +func (c *Controller) handleIdentify(req *Request) error { + cns := uint8(req.capsule.D10 & 0xFF) + + switch cns { + case cnsIdentifyController: + return c.identifyController(req) + case cnsIdentifyNamespace: + return c.identifyNamespace(req) + case cnsActiveNSList: + return c.identifyActiveNSList(req) + case cnsNSDescriptorList: + return c.identifyNSDescriptors(req) + default: + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } +} + +// identifyController returns the 4096-byte Identify Controller data structure. +func (c *Controller) identifyController(req *Request) error { + buf := make([]byte, identifySize) + + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + // VID (PCI Vendor ID) - use 0 for software target + // SSVID - 0 + + // Serial Number (offset 4, 20 bytes, space-padded ASCII) + copyPadded(buf[4:24], "SWF00001") + + // Model Number (offset 24, 40 bytes, space-padded ASCII) + copyPadded(buf[24:64], "SeaweedFS BlockVol") + + // Firmware Revision (offset 64, 8 bytes, space-padded ASCII) + copyPadded(buf[64:72], "0001") + + // RAB (Recommended Arbitration Burst) - offset 72 + buf[72] = 6 + + // IEEE OUI - offset 73-75 (3 bytes, 0 for software) + + // CMIC (Controller Multi-Path I/O Capabilities) - offset 76 + // bit 3: ANA reporting supported + buf[76] = 0x08 + + // MDTS (Maximum Data Transfer Size) - offset 77 + // 2^MDTS * 4096 = max transfer. MDTS=3 → 32KB + buf[77] = 3 + + // CNTLID (Controller ID) - offset 78-79 + binary.LittleEndian.PutUint16(buf[78:], c.cntlID) + + // Version - offset 80-83 + binary.LittleEndian.PutUint32(buf[80:], nvmeVersion14) + + // OACS (Optional Admin Command Support) - offset 256-257 + // 0 = no optional admin commands + binary.LittleEndian.PutUint16(buf[256:], 0) + + // ACRTD (Abort Command Limit) - offset 258 + buf[258] = 3 + + // AERTL (Async Event Request Limit) - offset 259 + buf[259] = 3 + + // FRMW (Firmware Updates) - offset 260 + buf[260] = 0x02 // slot 1 read-only + + // LPA (Log Page Attributes) - offset 261 + buf[261] = 0 + + // ELPE (Error Log Page Entries) - offset 262 + buf[262] = 0 // 1 entry (0-based) + + // SQES (Submission Queue Entry Size) - offset 512 + // min=6 (2^6=64 bytes), max=6 + buf[512] = 0x66 + + // CQES (Completion Queue Entry Size) - offset 513 + // min=4 (2^4=16 bytes), max=4 + buf[513] = 0x44 + + // MAXCMD - offset 514-515 + binary.LittleEndian.PutUint16(buf[514:], 64) + + // NN (Number of Namespaces) - offset 516-519 + binary.LittleEndian.PutUint32(buf[516:], 1) + + // ONCS (Optional NVM Command Support) - offset 520-521 + // bit 3: WriteZeros, bit 2: DatasetMgmt (Trim) + binary.LittleEndian.PutUint16(buf[520:], 0x0C) + + // ANACAP (ANA Capabilities) - offset 522 + // bit 3: reports Optimized state + buf[522] = 0x08 + + // ANAGRPMAX - offset 524-527 + binary.LittleEndian.PutUint32(buf[524:], 1) + + // NANAGRPID - offset 528-531 + binary.LittleEndian.PutUint32(buf[528:], 1) + + // VWC (Volatile Write Cache) - offset 525 + // bit 0: volatile write cache present → Flush required + buf[525] = 0x01 + + // SGLS (SGL Support) - offset 536-539 + // bit 0: SGLs supported (required for NVMe/TCP) + binary.LittleEndian.PutUint32(buf[536:], 0x01) + + // SubNQN (Subsystem NQN) - offset 768, 256 bytes + copyPadded(buf[768:1024], sub.NQN) + + // IOCCSZ (I/O Queue Command Capsule Supported Size) - offset 1792-1795 + // In 16-byte units: 64/16 = 4 + binary.LittleEndian.PutUint32(buf[1792:], 4) + + // IORCSZ (I/O Queue Response Capsule Supported Size) - offset 1796-1799 + // In 16-byte units: 16/16 = 1 + binary.LittleEndian.PutUint32(buf[1796:], 1) + + // ICDOFF (In Capsule Data Offset) - offset 1800-1801 + // 0 means inline data immediately follows SQE in capsule + binary.LittleEndian.PutUint16(buf[1800:], 0) + + // FCATT (Fabrics Controller Attributes) - offset 1802 + // bit 0: 0 = I/O controller (not discovery) + buf[1802] = 0 + + // MSDBD (Maximum SGL Data Block Descriptors) - offset 1803 + buf[1803] = 1 + + // OFCS (Optional Fabric Commands Supported) - offset 1804-1805 + // bit 0: Disconnect command supported + binary.LittleEndian.PutUint16(buf[1804:], 0x01) + + req.c2hData = buf + return c.sendC2HDataAndResponse(req) +} + +// identifyNamespace returns the 4096-byte Identify Namespace data for NSID=1. +func (c *Controller) identifyNamespace(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + dev := sub.Dev + blockSize := dev.BlockSize() + nsze := dev.VolumeSize() / uint64(blockSize) + + buf := make([]byte, identifySize) + + // NSZE (Namespace Size in blocks) - offset 0-7 + binary.LittleEndian.PutUint64(buf[0:], nsze) + + // NCAP (Namespace Capacity) - offset 8-15 + binary.LittleEndian.PutUint64(buf[8:], nsze) + + // NUSE (Namespace Utilization) - offset 16-23 + binary.LittleEndian.PutUint64(buf[16:], nsze) + + // NSFEAT (Namespace Features) - offset 24 + // bit 0: thin provisioning (supports Trim) + buf[24] = 0x01 + + // NLBAF (Number of LBA Formats minus 1) - offset 25 + buf[25] = 0 // one format + + // FLBAS (Formatted LBA Size) - offset 26 + // bits 3:0 = LBA format index (0) + buf[26] = 0 + + // MC (Metadata Capabilities) - offset 27 + buf[27] = 0 + + // DLFEAT (Deallocate Logical Block Features) - offset 28 + // bit 2: Deallocated blocks return zeros on read + buf[28] = 0x04 + + // NGUID (Namespace Globally Unique Identifier) - offset 104-119 (16 bytes) + copy(buf[104:120], sub.NGUID[:]) + + // LBAF[0] (LBA Format 0) - offset 128-131 + // bits 23:16 = LBADS (log2 of block size) + lbads := uint8(bits.TrailingZeros32(blockSize)) + binary.LittleEndian.PutUint32(buf[128:], uint32(lbads)<<16) + + // ANAGRPID (ANA Group Identifier) - offset 92-95 + binary.LittleEndian.PutUint32(buf[92:], 1) + + req.c2hData = buf + return c.sendC2HDataAndResponse(req) +} + +// identifyActiveNSList returns the list of active namespace IDs (just NSID=1). +func (c *Controller) identifyActiveNSList(req *Request) error { + buf := make([]byte, identifySize) + // Single namespace: NSID=1 + binary.LittleEndian.PutUint32(buf[0:], 1) + + req.c2hData = buf + return c.sendC2HDataAndResponse(req) +} + +// identifyNSDescriptors returns namespace descriptor list for NSID=1. +func (c *Controller) identifyNSDescriptors(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + buf := make([]byte, identifySize) + off := 0 + + // NGUID descriptor (type=0x02, length=16) + buf[off] = 0x02 // NIDT: NGUID + off++ + buf[off] = 16 // NIDL: 16 bytes + off++ + off += 2 // reserved + copy(buf[off:off+16], sub.NGUID[:]) + + req.c2hData = buf + return c.sendC2HDataAndResponse(req) +} + +// copyPadded copies src into dst, padding remaining bytes with spaces. +func copyPadded(dst []byte, src string) { + n := copy(dst, src) + for i := n; i < len(dst); i++ { + dst[i] = ' ' + } +} diff --git a/weed/storage/blockvol/nvme/io.go b/weed/storage/blockvol/nvme/io.go new file mode 100644 index 000000000..32b7b8988 --- /dev/null +++ b/weed/storage/blockvol/nvme/io.go @@ -0,0 +1,157 @@ +package nvme + +// handleRead processes an NVMe Read command. +func (c *Controller) handleRead(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + dev := sub.Dev + lba := req.capsule.Lba() + nlb := req.capsule.LbaLength() + blockSize := dev.BlockSize() + totalBytes := uint32(nlb) * blockSize + + // Bounds check + nsze := dev.VolumeSize() / uint64(blockSize) + if lba+uint64(nlb) > nsze { + req.resp.Status = uint16(StatusLBAOutOfRange) + return c.sendResponse(req) + } + + data, err := dev.ReadAt(lba, totalBytes) + if err != nil { + req.resp.Status = uint16(mapBlockError(err)) + return c.sendResponse(req) + } + + req.c2hData = data + return c.sendC2HDataAndResponse(req) +} + +// handleWrite processes an NVMe Write command with inline data. +func (c *Controller) handleWrite(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + // Check ANA state (write-gating) + if !c.isWriteAllowed() { + req.resp.Status = uint16(StatusNSNotReady) + return c.sendResponse(req) + } + + // Inline data must be present (DataOffset != 0 in the received PDU). + // If DataOffset == 0 for a Write, the host expects R2T flow — reject. + if len(req.payload) == 0 { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + dev := sub.Dev + lba := req.capsule.Lba() + nlb := req.capsule.LbaLength() + blockSize := dev.BlockSize() + + // Bounds check + nsze := dev.VolumeSize() / uint64(blockSize) + if lba+uint64(nlb) > nsze { + req.resp.Status = uint16(StatusLBAOutOfRange) + return c.sendResponse(req) + } + + // Validate payload size matches NLB*blockSize. + expectedBytes := uint32(nlb) * blockSize + if uint32(len(req.payload)) != expectedBytes { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + if err := dev.WriteAt(lba, req.payload); err != nil { + req.resp.Status = uint16(mapBlockError(err)) + return c.sendResponse(req) + } + + return c.sendResponse(req) +} + +// handleFlush processes an NVMe Flush command. +func (c *Controller) handleFlush(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + if !c.isWriteAllowed() { + req.resp.Status = uint16(StatusNSNotReady) + return c.sendResponse(req) + } + + if err := sub.Dev.SyncCache(); err != nil { + req.resp.Status = uint16(mapBlockError(err)) + return c.sendResponse(req) + } + + return c.sendResponse(req) +} + +// handleWriteZeros processes an NVMe Write Zeroes command. +func (c *Controller) handleWriteZeros(req *Request) error { + sub := c.subsystem + if sub == nil { + req.resp.Status = uint16(StatusInvalidField) + return c.sendResponse(req) + } + + if !c.isWriteAllowed() { + req.resp.Status = uint16(StatusNSNotReady) + return c.sendResponse(req) + } + + dev := sub.Dev + lba := req.capsule.Lba() + nlb := req.capsule.LbaLength() + blockSize := dev.BlockSize() + totalBytes := uint32(nlb) * blockSize + + // Bounds check + nsze := dev.VolumeSize() / uint64(blockSize) + if lba+uint64(nlb) > nsze { + req.resp.Status = uint16(StatusLBAOutOfRange) + return c.sendResponse(req) + } + + // D12 bit 25: DEALLOC — if set, use Trim instead of writing zeros + if req.capsule.D12&commandBitDeallocate != 0 { + if err := dev.Trim(lba, totalBytes); err != nil { + req.resp.Status = uint16(mapBlockError(err)) + return c.sendResponse(req) + } + } else { + zeroBuf := make([]byte, totalBytes) + if err := dev.WriteAt(lba, zeroBuf); err != nil { + req.resp.Status = uint16(mapBlockError(err)) + return c.sendResponse(req) + } + } + + return c.sendResponse(req) +} + +// isWriteAllowed checks if the current ANA state allows writes. +func (c *Controller) isWriteAllowed() bool { + if c.subsystem == nil { + return false + } + if prov, ok := c.subsystem.Dev.(ANAProvider); ok { + state := prov.ANAState() + return state == anaOptimized || state == anaNonOptimized + } + // No ANA provider: allow if healthy + return c.subsystem.Dev.IsHealthy() +} diff --git a/weed/storage/blockvol/nvme/nvme_qa_test.go b/weed/storage/blockvol/nvme/nvme_qa_test.go new file mode 100644 index 000000000..b034f4c3e --- /dev/null +++ b/weed/storage/blockvol/nvme/nvme_qa_test.go @@ -0,0 +1,1541 @@ +package nvme + +// Adversarial / QA tests for NVMe/TCP target. +// Covers: malformed wire, protocol state violations, IO boundary attacks, +// ANA/fencing transitions, admin command edge cases, server lifecycle races, +// multi-block chunking, SQHD wraparound, concurrent stress. + +import ( + "bytes" + "encoding/binary" + "errors" + "io" + "net" + "sync" + "testing" + "time" +) + +// ============================================================ +// QA-1: Wire / Protocol Attacks +// ============================================================ + +// TestQA_Wire_TruncatedHeaderEOF: stream ends mid-header (< 8 bytes). +func TestQA_Wire_TruncatedHeaderEOF(t *testing.T) { + // Only 4 of 8 header bytes → io.ErrUnexpectedEOF + buf := make([]byte, 4) + buf[0] = pduCapsuleCmd + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for truncated header") + } +} + +// TestQA_Wire_ZeroLengthStream: completely empty reader. +func TestQA_Wire_ZeroLengthStream(t *testing.T) { + r := NewReader(bytes.NewReader(nil)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for empty stream") + } + if err != io.EOF && err != io.ErrUnexpectedEOF { + t.Fatalf("expected EOF-type error, got: %v", err) + } +} + +// TestQA_Wire_HeaderLength_Exactly8: minimum valid HeaderLength with no specific header. +func TestQA_Wire_HeaderLength_Exactly8(t *testing.T) { + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleResp, + HeaderLength: commonHeaderSize, // exactly 8 + DataOffset: 0, + DataLength: uint32(commonHeaderSize), + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + got, err := r.Dequeue() + if err != nil { + t.Fatalf("valid minimum header should parse: %v", err) + } + if got.HeaderLength != commonHeaderSize { + t.Fatalf("HeaderLength = %d", got.HeaderLength) + } +} + +// TestQA_Wire_AllZeroHeader: all-zero 8-byte header has HeaderLength=0 < 8 → rejected. +func TestQA_Wire_AllZeroHeader(t *testing.T) { + buf := make([]byte, commonHeaderSize) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("all-zero header should be rejected (HeaderLength=0 < 8)") + } +} + +// TestQA_Wire_GarbageAfterValidPDU: garbage bytes after a valid PDU +// should not cause the valid PDU to fail. +func TestQA_Wire_GarbageAfterValidPDU(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + + resp := CapsuleResponse{CID: 99, Status: 0} + if err := w.SendHeaderOnly(pduCapsuleResp, &resp, capsuleRespSize); err != nil { + t.Fatal(err) + } + // Append garbage + buf.Write([]byte{0xFF, 0xFE, 0xFD}) + + r := NewReader(&buf) + hdr, err := r.Dequeue() + if err != nil { + t.Fatalf("first valid PDU should parse: %v", err) + } + if hdr.Type != pduCapsuleResp { + t.Fatalf("type = 0x%x", hdr.Type) + } + var got CapsuleResponse + r.Receive(&got) + if got.CID != 99 { + t.Fatalf("CID = %d", got.CID) + } + + // Second Dequeue should fail on garbage (HeaderLength too small) + _, err = r.Dequeue() + if err == nil { + t.Fatal("expected error parsing garbage as next PDU") + } +} + +// ============================================================ +// QA-2: Controller State Machine Violations +// ============================================================ + +// TestQA_UnexpectedPDUType: send an unknown PDU type after IC. +func TestQA_UnexpectedPDUType(t *testing.T) { + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + + done := make(chan error, 1) + go func() { done <- ctrl.Serve() }() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Send R2T PDU type (0x06) which is not handled by the controller. + fakePDU := CapsuleCommand{} // payload doesn't matter + w.SendHeaderOnly(0x06, &fakePDU, capsuleCmdSize) + + // Controller should return error or close connection + select { + case err := <-done: + if err == nil { + // EOF from pipe close is also acceptable + } + case <-time.After(2 * time.Second): + t.Fatal("controller did not exit on unknown PDU type") + } + + clientConn.Close() +} + +// TestQA_CapsuleBeforeIC: send a capsule command before IC handshake. +// The controller must reject it — capsules require state >= stateICComplete. +func TestQA_CapsuleBeforeIC(t *testing.T) { + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + + done := make(chan error, 1) + go func() { done <- ctrl.Serve() }() + + w := NewWriter(clientConn) + + // Send capsule directly without IC — controller should reject and close. + cmd := CapsuleCommand{OpCode: adminKeepAlive, CID: 1} + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + select { + case err := <-done: + if err == nil { + t.Fatal("expected error from Serve for capsule before IC") + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for controller to reject capsule before IC") + } +} + +// TestQA_IOWrite_OnAdminQueue: ioWrite (0x01) has no admin counterpart → InvalidOpcode. +// Note: other IO opcodes (0x00, 0x02, 0x08) map to valid admin opcodes by NVMe spec. +func TestQA_IOWrite_OnAdminQueue(t *testing.T) { + nqn := "nqn.test:qa-admin-io" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioWrite, // 0x01: no admin equivalent + CID: 500, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, make([]byte, 512)) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidOpcode { + t.Fatalf("ioWrite(0x01) on admin queue: got 0x%04x, want InvalidOpcode", resp.Status) + } +} + +// TestQA_UnknownAdminOpcode: bogus admin opcode → InvalidOpcode. +func TestQA_UnknownAdminOpcode(t *testing.T) { + nqn := "nqn.test:qa-bad-admin" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: 0xFF, // unknown + CID: 600, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidOpcode { + t.Fatalf("unknown admin opcode: got 0x%04x, want InvalidOpcode", resp.Status) + } +} + +// TestQA_UnknownIOOpcode: bogus IO opcode → InvalidOpcode. +func TestQA_UnknownIOOpcode(t *testing.T) { + nqn := "nqn.test:qa-bad-io" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + cmd := CapsuleCommand{ + OpCode: 0xFE, // unknown IO opcode + CID: 700, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidOpcode { + t.Fatalf("unknown IO opcode: got 0x%04x, want InvalidOpcode", resp.Status) + } +} + +// ============================================================ +// QA-3: Fabric Command Edge Cases +// ============================================================ + +// TestQA_ConnectEmptyPayload: Connect with payload < 1024 bytes → InvalidField. +func TestQA_ConnectEmptyPayload(t *testing.T) { + nqn := "nqn.test:qa-empty-connect" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Send Connect with only 16 bytes of payload (need 1024) + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcConnect, + D10: 0, // QID=0 + D11: 63, + } + shortPayload := make([]byte, 16) // way too short + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, shortPayload) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("short ConnectData: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_ConnectNoPayload: Connect with no inline data at all → InvalidField. +func TestQA_ConnectNoPayload(t *testing.T) { + nqn := "nqn.test:qa-no-connect-data" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Send Connect capsule with zero payload + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcConnect, + D10: 0, + D11: 63, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("no ConnectData: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_UnknownFabricFCType: unknown FCType → InvalidField. +func TestQA_UnknownFabricFCType(t *testing.T) { + nqn := "nqn.test:qa-bad-fc" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: 0xFF, // unknown + CID: 800, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("unknown FCType: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_PropertyGetUnknownOffset: PropertyGet with bad register → InvalidField. +func TestQA_PropertyGetUnknownOffset(t *testing.T) { + nqn := "nqn.test:qa-bad-prop" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertyGet, + CID: 801, + D10: 0xDEAD, // invalid register offset + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("unknown register offset: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_Disconnect_CleanShutdown: Disconnect should send response and close. +func TestQA_Disconnect_CleanShutdown(t *testing.T) { + nqn := "nqn.test:qa-disconnect" + client, r, w, ctrl, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcDisconnect, + CID: 802, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + // Should get a success response + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Disconnect failed: 0x%04x", resp.Status) + } + + // Controller should be closed after disconnect + time.Sleep(50 * time.Millisecond) + if ctrl.state != stateClosed { + t.Fatalf("state = %d, want stateClosed(%d)", ctrl.state, stateClosed) + } +} + +// ============================================================ +// QA-4: IO Boundary Attacks +// ============================================================ + +// setupQAIOQueue creates a controller with IO queue set up for testing. +func setupQAIOQueue(t *testing.T, nqn string, dev *mockBlockDevice) (net.Conn, *Reader, *Writer) { + t.Helper() + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + sendICReq(w) + recvICResp(t, r) + return clientConn, r, w +} + +// TestQA_IO_WriteOversizedPayload: NLB=1 (512B) but payload=1024B → InvalidField. +func TestQA_IO_WriteOversizedPayload(t *testing.T) { + dev := newMockDevice(256, 512) + client, r, w := setupQAIOQueue(t, "nqn.test:qa-oversize", dev) + defer client.Close() + + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 900, + D10: 0, + D12: 0, // 1 block = 512 bytes + } + oversized := make([]byte, 1024) // too large + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, oversized) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("oversized payload: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_IO_WriteExactBoundary: write at last valid LBA should succeed. +func TestQA_IO_WriteExactBoundary(t *testing.T) { + dev := newMockDevice(256, 512) // 256 blocks + client, r, w := setupQAIOQueue(t, "nqn.test:qa-boundary", dev) + defer client.Close() + + // Write 1 block at LBA 255 (last valid) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 901, + D10: 255, + D11: 0, + D12: 0, // 1 block + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, make([]byte, 512)) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("write at last LBA should succeed: 0x%04x", resp.Status) + } + + // Write 1 block at LBA 256 → out of bounds + writeCmd2 := CapsuleCommand{ + OpCode: ioWrite, + CID: 902, + D10: 256, // past end + D11: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd2, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusLBAOutOfRange { + t.Fatalf("write past end: got 0x%04x, want LBAOutOfRange", resp.Status) + } +} + +// TestQA_IO_ReadExactBoundary: read at last valid LBA succeeds, LBA+1 fails. +func TestQA_IO_ReadExactBoundary(t *testing.T) { + dev := newMockDevice(256, 512) + client, r, w := setupQAIOQueue(t, "nqn.test:qa-readbound", dev) + defer client.Close() + + // Read last block + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 903, + D10: 255, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("read last LBA failed: 0x%04x", resp.Status) + } + + // Read at LBA 256 → out of bounds + readCmd2 := CapsuleCommand{ + OpCode: ioRead, + CID: 904, + D10: 256, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd2, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusLBAOutOfRange { + t.Fatalf("read past end: got 0x%04x, want LBAOutOfRange", resp.Status) + } +} + +// TestQA_IO_MultiBlockWrite: write 8 blocks at once, read back and verify. +func TestQA_IO_MultiBlockWrite(t *testing.T) { + dev := newMockDevice(256, 512) + client, r, w := setupQAIOQueue(t, "nqn.test:qa-multi", dev) + defer client.Close() + + // Write 8 blocks (4096 bytes) at LBA 10 + writeData := make([]byte, 4096) + for i := range writeData { + writeData[i] = byte(i & 0xFF) + } + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 910, + D10: 10, + D12: 7, // 8 blocks (0-based) + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("multi-block write failed: 0x%04x", resp.Status) + } + + // Read back + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 911, + D10: 10, + D12: 7, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + // May come as multiple C2HData chunks (4096 > maxH2CDataLen if small) + var readBuf bytes.Buffer + for { + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type == pduCapsuleResp { + var resp2 CapsuleResponse + r.Receive(&resp2) + if StatusWord(resp2.Status).IsError() { + t.Fatalf("multi-block read resp error: 0x%04x", resp2.Status) + } + break + } + if hdr.Type == pduC2HData { + var c2h C2HDataHeader + r.Receive(&c2h) + chunk := make([]byte, r.Length()) + r.ReceiveData(chunk) + readBuf.Write(chunk) + } + } + + if !bytes.Equal(readBuf.Bytes(), writeData) { + t.Fatal("multi-block read data mismatch") + } +} + +// TestQA_IO_WriteZerosOutOfBounds: WriteZeros past volume end → LBAOutOfRange. +func TestQA_IO_WriteZerosOutOfBounds(t *testing.T) { + dev := newMockDevice(256, 512) + client, r, w := setupQAIOQueue(t, "nqn.test:qa-wz-oob", dev) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioWriteZeros, + CID: 920, + D10: 255, + D12: 1, // 2 blocks from LBA 255 → exceeds 256 blocks + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusLBAOutOfRange { + t.Fatalf("WriteZeros OOB: got 0x%04x, want LBAOutOfRange", resp.Status) + } +} + +// TestQA_IO_FlushOnReplica: flush gated by ANA → NSNotReady. +func TestQA_IO_FlushOnReplica(t *testing.T) { + dev := newMockDevice(256, 512) + dev.anaState = anaInaccessible // replica mode + client, r, w := setupQAIOQueue(t, "nqn.test:qa-flush-replica", dev) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioFlush, + CID: 921, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusNSNotReady { + t.Fatalf("flush on replica: got 0x%04x, want NSNotReady", resp.Status) + } +} + +// TestQA_IO_WriteZerosOnReplica: WriteZeros gated by ANA → NSNotReady. +func TestQA_IO_WriteZerosOnReplica(t *testing.T) { + dev := newMockDevice(256, 512) + dev.anaState = anaPersistentLoss // stale + client, r, w := setupQAIOQueue(t, "nqn.test:qa-wz-replica", dev) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioWriteZeros, + CID: 922, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusNSNotReady { + t.Fatalf("WriteZeros on stale: got 0x%04x, want NSNotReady", resp.Status) + } +} + +// TestQA_IO_ReadOnReplicaSucceeds: reads should work even on replica ANA state. +func TestQA_IO_ReadOnReplicaSucceeds(t *testing.T) { + dev := newMockDevice(256, 512) + dev.anaState = anaInaccessible // replica + client, r, w := setupQAIOQueue(t, "nqn.test:qa-read-replica", dev) + defer client.Close() + + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 923, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("read on replica should succeed: 0x%04x", resp.Status) + } +} + +// TestQA_IO_SyncCacheError: SyncCache returns error → mapped NVMe status. +func TestQA_IO_SyncCacheError(t *testing.T) { + dev := newMockDevice(256, 512) + dev.syncErr = errors.New("sync failed") + client, r, w := setupQAIOQueue(t, "nqn.test:qa-sync-err", dev) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioFlush, + CID: 924, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("flush with sync error should fail") + } + if StatusWord(resp.Status) != StatusInternalError { + t.Fatalf("sync error: got 0x%04x, want InternalError", resp.Status) + } +} + +// TestQA_IO_TrimError: Trim returns error → mapped NVMe status. +func TestQA_IO_TrimError(t *testing.T) { + dev := newMockDevice(256, 512) + dev.trimErr = errors.New("trim failed") + client, r, w := setupQAIOQueue(t, "nqn.test:qa-trim-err", dev) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: ioWriteZeros, + CID: 925, + D10: 0, + D12: 0 | commandBitDeallocate, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("trim error should fail") + } +} + +// ============================================================ +// QA-5: Admin Command Edge Cases +// ============================================================ + +// TestQA_Admin_UnknownFeatureID: SetFeatures/GetFeatures with unknown FID. +func TestQA_Admin_UnknownFeatureID(t *testing.T) { + nqn := "nqn.test:qa-bad-fid" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // SetFeatures unknown + cmd := CapsuleCommand{ + OpCode: adminSetFeatures, + CID: 1000, + D10: 0xBB, // unknown FID + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("SetFeatures unknown FID: got 0x%04x, want InvalidField", resp.Status) + } + + // GetFeatures unknown + cmd2 := CapsuleCommand{ + OpCode: adminGetFeatures, + CID: 1001, + D10: 0xBB, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd2, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("GetFeatures unknown FID: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_Admin_UnknownIdentifyCNS: Identify with unknown CNS → InvalidField. +func TestQA_Admin_UnknownIdentifyCNS(t *testing.T) { + nqn := "nqn.test:qa-bad-cns" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 1002, + D10: 0xFF, // unknown CNS + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("Identify unknown CNS: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_Admin_UnknownLogPageLID: GetLogPage with unknown LID → InvalidField. +func TestQA_Admin_UnknownLogPageLID(t *testing.T) { + nqn := "nqn.test:qa-bad-lid" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminGetLogPage, + CID: 1003, + D10: 0xFE | (3 << 16), // unknown LID, NUMD=3 + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("GetLogPage unknown LID: got 0x%04x, want InvalidField", resp.Status) + } +} + +// TestQA_Admin_SetFeaturesZeroQueues: request 0 queues → clamped to 1. +func TestQA_Admin_SetFeaturesZeroQueues(t *testing.T) { + nqn := "nqn.test:qa-zero-q" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminSetFeatures, + CID: 1004, + D10: uint32(fidNumberOfQueues), + D11: 0, // 0|0 → request 0+1=1? No: D11 is NCQR|NSQR with 0-based, so 0 means 1. + // Actually the code reads ncqr = D11 & 0xFFFF = 0, then clamps to 1 + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("SetFeatures 0 queues: got 0x%04x", resp.Status) + } + // DW0 should be (1-1) | ((1-1)<<16) = 0 + if resp.DW0 != 0 { + t.Fatalf("expected DW0=0 (1 queue each, 0-based), got %d", resp.DW0) + } +} + +// TestQA_Admin_GetLogPage_ErrorLog: error log should return empty data. +func TestQA_Admin_GetLogPage_ErrorLog(t *testing.T) { + nqn := "nqn.test:qa-errlog" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminGetLogPage, + CID: 1005, + D10: uint32(logPageError) | (15 << 16), // NUMD=15 → 64 bytes + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData for error log") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + // Error log should be all zeros (empty) + for i, b := range data { + if b != 0 { + t.Fatalf("error log byte[%d] = 0x%02x, want 0", i, b) + } + } + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("error log failed: 0x%04x", resp.Status) + } +} + +// ============================================================ +// QA-6: ANA State Transitions Under IO +// ============================================================ + +// TestQA_ANA_TransitionMidIO: change ANA state from optimized to inaccessible +// mid-flight. First write succeeds, then transition, second write fails. +func TestQA_ANA_TransitionMidIO(t *testing.T) { + dev := newMockDevice(256, 512) + dev.anaState = anaOptimized + client, r, w := setupQAIOQueue(t, "nqn.test:qa-ana-flip", dev) + defer client.Close() + + // Write should succeed (optimized) + writeCmd := CapsuleCommand{OpCode: ioWrite, CID: 1100, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, make([]byte, 512)) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("write while optimized failed: 0x%04x", resp.Status) + } + + // Flip to inaccessible + dev.mu.Lock() + dev.anaState = anaInaccessible + dev.mu.Unlock() + + // Write should be rejected + writeCmd2 := CapsuleCommand{OpCode: ioWrite, CID: 1101, D10: 1, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd2, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusNSNotReady { + t.Fatalf("write after ANA flip: got 0x%04x, want NSNotReady", resp.Status) + } + + // Flip back to optimized + dev.mu.Lock() + dev.anaState = anaOptimized + dev.mu.Unlock() + + // Write should succeed again + writeCmd3 := CapsuleCommand{OpCode: ioWrite, CID: 1102, D10: 2, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd3, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("write after ANA restore: 0x%04x", resp.Status) + } +} + +// TestQA_ANA_NonOptimizedAllowsWrite: anaNonOptimized should allow writes. +func TestQA_ANA_NonOptimizedAllowsWrite(t *testing.T) { + dev := newMockDevice(256, 512) + dev.anaState = anaNonOptimized + client, r, w := setupQAIOQueue(t, "nqn.test:qa-ana-nonopt", dev) + defer client.Close() + + writeCmd := CapsuleCommand{OpCode: ioWrite, CID: 1110, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, make([]byte, 512)) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("write with NonOptimized ANA should succeed: 0x%04x", resp.Status) + } +} + +// TestQA_ANA_LogReflectsState: ANA log page reports correct state after transition. +func TestQA_ANA_LogReflectsState(t *testing.T) { + nqn := "nqn.test:qa-ana-log" + dev := newMockDevice(256, 512) + dev.anaState = anaPersistentLoss + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + sendICReq(w) + recvICResp(t, r) + sendConnect(w, 0, 64, 0, nqn, "host", 0xFFFF) + recvCapsuleResp(t, r) + + cmd := CapsuleCommand{ + OpCode: adminGetLogPage, + CID: 1120, + D10: uint32(logPageANA) | (9 << 16), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + if data[32] != anaPersistentLoss { + t.Fatalf("ANA state = 0x%02x, want 0x%02x (PersistentLoss)", data[32], anaPersistentLoss) + } + + recvCapsuleResp(t, r) +} + +// ============================================================ +// QA-7: Server Lifecycle +// ============================================================ + +// TestQA_Server_ConnectAfterVolumeRemoved: connect to NQN after RemoveVolume → error. +func TestQA_Server_ConnectAfterVolumeRemoved(t *testing.T) { + nqn := "nqn.test:qa-removed" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + srv.RemoveVolume(nqn) // Remove before connect + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + sendConnect(w, 0, 64, 0, nqn, "host", 0xFFFF) + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("connect to removed volume should fail") + } +} + +// TestQA_Server_RapidConnectDisconnect: 20 rapid admin connect/disconnect cycles. +func TestQA_Server_RapidConnectDisconnect(t *testing.T) { + nqn := "nqn.test:qa-rapid" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + for i := 0; i < 20; i++ { + clientConn, serverConn := pipeConn() + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + if err := sendICReq(w); err != nil { + t.Fatalf("iter %d IC: %v", i, err) + } + recvICResp(t, r) + + if err := sendConnect(w, 0, 64, 0, nqn, "host", 0xFFFF); err != nil { + t.Fatalf("iter %d connect: %v", i, err) + } + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("iter %d connect failed: 0x%04x", i, resp.Status) + } + + clientConn.Close() + } + + // Verify all admin sessions are cleaned up + time.Sleep(100 * time.Millisecond) + srv.adminMu.RLock() + count := len(srv.admins) + srv.adminMu.RUnlock() + if count != 0 { + t.Fatalf("leaked %d admin sessions after rapid connect/disconnect", count) + } +} + +// TestQA_Server_ConcurrentIO: 4 concurrent IO queue connections read/write simultaneously. +func TestQA_Server_ConcurrentIO(t *testing.T) { + nqn := "nqn.test:qa-concurrent-io" + dev := newMockDevice(1024, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 8}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + go func(idx int) { + defer wg.Done() + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = uint16(idx + 1) + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + sendICReq(w) + recvICResp(t, r) + + // Each goroutine writes to a different LBA range + base := uint32(idx * 64) + for j := 0; j < 10; j++ { + lba := base + uint32(j) + writeData := bytes.Repeat([]byte{byte(idx*10 + j)}, 512) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: uint16(idx*100 + j), + D10: lba, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Errorf("goroutine %d write %d failed: 0x%04x", idx, j, resp.Status) + return + } + } + }(i) + } + wg.Wait() +} + +// ============================================================ +// QA-8: SQHD Wraparound +// ============================================================ + +// TestQA_SQHDWraparound: send queueSize+1 commands to trigger SQHD wrap. +func TestQA_SQHDWraparound(t *testing.T) { + dev := newMockDevice(256, 512) + nqn := "nqn.test:qa-sqhd" + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 4 // very small queue for quick wrap + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + sendICReq(w) + recvICResp(t, r) + + // Send 10 commands (wraps around queueSize=4 multiple times) + var lastSQHD uint16 + for i := 0; i < 10; i++ { + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: uint16(2000 + i), + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + // Read C2HData + CapsuleResp + hdr, err := r.Dequeue() + if err != nil { + t.Fatalf("cmd %d: dequeue: %v", i, err) + } + if hdr.Type == pduC2HData { + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + // Read the CapsuleResp + resp := recvCapsuleResp(t, r) + lastSQHD = resp.SQHD + } else if hdr.Type == pduCapsuleResp { + var resp CapsuleResponse + r.Receive(&resp) + lastSQHD = resp.SQHD + } + } + + // SQHD should have wrapped (not grown unbounded) + // With queueSize=4, after 10 commands: sqhd cycles 1,2,3,0,1,2,3,0,1,2 + // Expected: SQHD=2 + if lastSQHD >= 4 { + t.Fatalf("SQHD=%d should be < queueSize=4", lastSQHD) + } +} + +// ============================================================ +// QA-9: Large Read Chunking (C2HData) +// ============================================================ + +// TestQA_LargeReadChunking: read > maxH2CDataLen triggers multiple C2HData PDUs. +func TestQA_LargeReadChunking(t *testing.T) { + // maxH2CDataLen = 32KB. Create a device with 512B blocks, + // read 128 blocks = 64KB → should produce 2 C2HData chunks. + dev := newMockDevice(256, 512) + // Fill with pattern + for i := range dev.data { + dev.data[i] = byte(i & 0xFF) + } + + client, r, w := setupQAIOQueue(t, "nqn.test:qa-chunk", dev) + defer client.Close() + + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 3000, + D10: 0, + D12: 127, // 128 blocks = 65536 bytes (0-based count) + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + // Collect all C2HData chunks + var readBuf bytes.Buffer + chunkCount := 0 + for { + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type == pduCapsuleResp { + var resp CapsuleResponse + r.Receive(&resp) + if StatusWord(resp.Status).IsError() { + t.Fatalf("large read error: 0x%04x", resp.Status) + } + break + } + if hdr.Type == pduC2HData { + chunkCount++ + var c2h C2HDataHeader + r.Receive(&c2h) + chunk := make([]byte, r.Length()) + r.ReceiveData(chunk) + readBuf.Write(chunk) + } + } + + if readBuf.Len() != 65536 { + t.Fatalf("total read = %d bytes, want 65536", readBuf.Len()) + } + if chunkCount < 2 { + t.Fatalf("expected >= 2 C2HData chunks, got %d", chunkCount) + } + + // Verify data matches + if !bytes.Equal(readBuf.Bytes(), dev.data[:65536]) { + t.Fatal("large read data mismatch") + } +} + +// ============================================================ +// QA-10: Error Injection Under Load +// ============================================================ + +// TestQA_ErrorInjectionMidStream: inject errors after successful IO. +func TestQA_ErrorInjectionMidStream(t *testing.T) { + dev := newMockDevice(256, 512) + client, r, w := setupQAIOQueue(t, "nqn.test:qa-inject", dev) + defer client.Close() + + // First write succeeds + writeCmd := CapsuleCommand{OpCode: ioWrite, CID: 4000, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, make([]byte, 512)) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("first write failed: 0x%04x", resp.Status) + } + + // Inject write error + dev.mu.Lock() + dev.writeErr = errors.New("injected Write fault") + dev.mu.Unlock() + + // Second write should fail with MediaWriteFault + writeCmd2 := CapsuleCommand{OpCode: ioWrite, CID: 4001, D10: 1, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd2, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusMediaWriteFault { + t.Fatalf("injected write error: got 0x%04x, want MediaWriteFault", resp.Status) + } + + // Clear error → writes should work again + dev.mu.Lock() + dev.writeErr = nil + dev.mu.Unlock() + + writeCmd3 := CapsuleCommand{OpCode: ioWrite, CID: 4002, D10: 2, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd3, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("write after error clear: 0x%04x", resp.Status) + } + + // Inject read error + dev.mu.Lock() + dev.readErr = errors.New("injected Read failure") + dev.mu.Unlock() + + readCmd := CapsuleCommand{OpCode: ioRead, CID: 4003, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusMediaReadError { + t.Fatalf("injected read error: got 0x%04x, want MediaReadError", resp.Status) + } +} + +// ============================================================ +// QA-11: PropertySet CC.EN=0 clears CSTS.RDY +// ============================================================ + +func TestQA_PropertySet_DisableController(t *testing.T) { + nqn := "nqn.test:qa-cc-disable" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // Enable: CC.EN=1 + cmd := makePropertySetCapsule(propCC, 1) + cmd.CID = 5000 + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + recvCapsuleResp(t, r) + + // Verify RDY=1 + cmd2 := makePropertyGetCapsule(propCSTS, false) + cmd2.CID = 5001 + w.SendWithData(pduCapsuleCmd, 0, &cmd2, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if resp.DW0&1 != 1 { + t.Fatal("CSTS.RDY should be 1 after CC.EN=1") + } + + // Disable: CC.EN=0 + cmd3 := makePropertySetCapsule(propCC, 0) + cmd3.CID = 5002 + w.SendWithData(pduCapsuleCmd, 0, &cmd3, capsuleCmdSize, nil) + recvCapsuleResp(t, r) + + // Verify RDY=0 + cmd4 := makePropertyGetCapsule(propCSTS, false) + cmd4.CID = 5003 + w.SendWithData(pduCapsuleCmd, 0, &cmd4, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if resp.DW0&1 != 0 { + t.Fatal("CSTS.RDY should be 0 after CC.EN=0") + } +} + +// ============================================================ +// QA-12: Identify before subsystem set → InvalidField +// ============================================================ + +func TestQA_IdentifyWithoutSubsystem(t *testing.T) { + // Controller with no subsystem set (no Connect) + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Send Identify without admin Connect first + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 6000, + D10: uint32(cnsIdentifyController), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("Identify without subsystem: got 0x%04x, want InvalidField", resp.Status) + } +} + +// ============================================================ +// QA-13: IO commands without subsystem → InvalidField +// ============================================================ + +func TestQA_IO_WithoutSubsystem(t *testing.T) { + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + ctrl.queueID = 1 + ctrl.queueSize = 64 + // No subsystem set + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + sendICReq(w) + recvICResp(t, r) + + // Read without subsystem + readCmd := CapsuleCommand{OpCode: ioRead, CID: 6100, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("Read without subsystem: got 0x%04x, want InvalidField", resp.Status) + } + + // Write without subsystem + writeCmd := CapsuleCommand{OpCode: ioWrite, CID: 6101, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("Write without subsystem: got 0x%04x, want InvalidField", resp.Status) + } + + // Flush without subsystem + flushCmd := CapsuleCommand{OpCode: ioFlush, CID: 6102} + w.SendWithData(pduCapsuleCmd, 0, &flushCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("Flush without subsystem: got 0x%04x, want InvalidField", resp.Status) + } + + // WriteZeros without subsystem + wzCmd := CapsuleCommand{OpCode: ioWriteZeros, CID: 6103, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &wzCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("WriteZeros without subsystem: got 0x%04x, want InvalidField", resp.Status) + } +} + +// ============================================================ +// QA-14: flowCtlOff (SQHD disabled) +// ============================================================ + +func TestQA_FlowCtlOff_SQHD(t *testing.T) { + nqn := "nqn.test:qa-flowctl" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Connect with CATTR bit 2 set (flowCtlOff) + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcConnect, + CID: 0, + D10: 0, // QID=0 + D11: 63 | (0x04 << 16), // SQSIZE=64, CATTR=0x04 (flowCtlOff) + D12: 0, + } + cd := ConnectData{CNTLID: 0xFFFF, SubNQN: nqn, HostNQN: "host"} + payload := make([]byte, connectDataSize) + cd.Marshal(payload) + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, payload) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("connect with flowCtlOff failed: 0x%04x", resp.Status) + } + + // Connect response itself must have SQHD=0xFFFF because flowCtlOff + // is set during handleConnect and sendResponse reads it at send time. + if resp.SQHD != 0xFFFF { + t.Fatalf("Connect SQHD = 0x%04x, want 0xFFFF (flowCtlOff)", resp.SQHD) + } + + // KeepAlive: SQHD still 0xFFFF + kaCmd := CapsuleCommand{OpCode: adminKeepAlive, CID: 7001} + w.SendWithData(pduCapsuleCmd, 0, &kaCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if resp.SQHD != 0xFFFF { + t.Fatalf("SQHD = 0x%04x after KeepAlive, want 0xFFFF", resp.SQHD) + } + + // Second command: SQHD still 0xFFFF + kaCmd2 := CapsuleCommand{OpCode: adminKeepAlive, CID: 7002} + w.SendWithData(pduCapsuleCmd, 0, &kaCmd2, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if resp.SQHD != 0xFFFF { + t.Fatalf("SQHD = 0x%04x after second KeepAlive, want 0xFFFF", resp.SQHD) + } +} + +// ============================================================ +// QA-15: 4K block size IO operations +// ============================================================ + +func TestQA_IO_4KBlockSize(t *testing.T) { + dev := newMockDevice(64, 4096) // 64 blocks * 4096 = 256KB + client, r, w := setupQAIOQueue(t, "nqn.test:qa-4k", dev) + defer client.Close() + + // Write 1 block at LBA 0 (4096 bytes) + writeData := bytes.Repeat([]byte{0xAA}, 4096) + writeCmd := CapsuleCommand{OpCode: ioWrite, CID: 8000, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("4K write failed: 0x%04x", resp.Status) + } + + // Read back + readCmd := CapsuleCommand{OpCode: ioRead, CID: 8001, D10: 0, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + var readBuf bytes.Buffer + for { + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type == pduCapsuleResp { + var resp2 CapsuleResponse + r.Receive(&resp2) + break + } + if hdr.Type == pduC2HData { + var c2h C2HDataHeader + r.Receive(&c2h) + chunk := make([]byte, r.Length()) + r.ReceiveData(chunk) + readBuf.Write(chunk) + } + } + + if !bytes.Equal(readBuf.Bytes(), writeData) { + t.Fatal("4K block read/write mismatch") + } + + // Write with wrong payload size (512 instead of 4096) + writeCmd2 := CapsuleCommand{OpCode: ioWrite, CID: 8002, D10: 1, D12: 0} + w.SendWithData(pduCapsuleCmd, 0, &writeCmd2, capsuleCmdSize, make([]byte, 512)) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("512B payload on 4K device: got 0x%04x, want InvalidField", resp.Status) + } +} + +// ============================================================ +// QA-16: Identify Controller field checks +// ============================================================ + +func TestQA_Identify_ControllerModelSerial(t *testing.T) { + nqn := "nqn.test:qa-id-fields" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 9000, + D10: uint32(cnsIdentifyController), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatal("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + // Serial Number at offset 4, 20 bytes + serial := string(bytes.TrimRight(data[4:24], " \x00")) + if serial != "SWF00001" { + t.Fatalf("Serial = %q, want SWF00001", serial) + } + + // Model Number at offset 24, 40 bytes + model := string(bytes.TrimRight(data[24:64], " \x00")) + if model != "SeaweedFS BlockVol" { + t.Fatalf("Model = %q, want SeaweedFS BlockVol", model) + } + + // Firmware at offset 64, 8 bytes + fw := string(bytes.TrimRight(data[64:72], " \x00")) + if fw != "0001" { + t.Fatalf("Firmware = %q, want 0001", fw) + } + + // NVMe version at offset 80 (uint32 LE) + ver := binary.LittleEndian.Uint32(data[80:]) + if ver != nvmeVersion14 { + t.Fatalf("Version = 0x%08x, want 0x%08x", ver, nvmeVersion14) + } + + // ONCS at offset 520: bits 2+3 (WriteZeros + DatasetManagement/Trim) + oncs := binary.LittleEndian.Uint16(data[520:]) + if oncs&0x0C != 0x0C { + t.Fatalf("ONCS = 0x%04x, expected bits 2+3 set", oncs) + } + + recvCapsuleResp(t, r) +} diff --git a/weed/storage/blockvol/nvme/nvme_test.go b/weed/storage/blockvol/nvme/nvme_test.go new file mode 100644 index 000000000..4e1c8f16b --- /dev/null +++ b/weed/storage/blockvol/nvme/nvme_test.go @@ -0,0 +1,2377 @@ +package nvme + +import ( + "bytes" + "encoding/binary" + "errors" + "io" + "net" + "sync" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol" + "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockerr" +) + +// ============================================================ +// Mock BlockDevice +// ============================================================ + +type mockBlockDevice struct { + mu sync.Mutex + data []byte + blockSize uint32 + healthy bool + anaState uint8 + readErr error + writeErr error + syncErr error + trimErr error +} + +func newMockDevice(blocks int, blockSize uint32) *mockBlockDevice { + return &mockBlockDevice{ + data: make([]byte, int(blockSize)*blocks), + blockSize: blockSize, + healthy: true, + anaState: anaOptimized, + } +} + +func (m *mockBlockDevice) ReadAt(lba uint64, length uint32) ([]byte, error) { + m.mu.Lock() + defer m.mu.Unlock() + if m.readErr != nil { + return nil, m.readErr + } + off := lba * uint64(m.blockSize) + if off+uint64(length) > uint64(len(m.data)) { + return nil, errors.New("read out of range") + } + buf := make([]byte, length) + copy(buf, m.data[off:off+uint64(length)]) + return buf, nil +} + +func (m *mockBlockDevice) WriteAt(lba uint64, data []byte) error { + m.mu.Lock() + defer m.mu.Unlock() + if m.writeErr != nil { + return m.writeErr + } + off := lba * uint64(m.blockSize) + if off+uint64(len(data)) > uint64(len(m.data)) { + return errors.New("write out of range") + } + copy(m.data[off:], data) + return nil +} + +func (m *mockBlockDevice) Trim(lba uint64, length uint32) error { + m.mu.Lock() + defer m.mu.Unlock() + if m.trimErr != nil { + return m.trimErr + } + off := lba * uint64(m.blockSize) + if off+uint64(length) > uint64(len(m.data)) { + return errors.New("trim out of range") + } + for i := uint64(0); i < uint64(length); i++ { + m.data[off+i] = 0 + } + return nil +} + +func (m *mockBlockDevice) SyncCache() error { + m.mu.Lock() + defer m.mu.Unlock() + return m.syncErr +} + +func (m *mockBlockDevice) BlockSize() uint32 { return m.blockSize } +func (m *mockBlockDevice) VolumeSize() uint64 { return uint64(len(m.data)) } +func (m *mockBlockDevice) IsHealthy() bool { return m.healthy } +func (m *mockBlockDevice) ANAState() uint8 { return m.anaState } +func (m *mockBlockDevice) ANAGroupID() uint16 { return 1 } +func (m *mockBlockDevice) DeviceNGUID() [16]byte { return [16]byte{0x60, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} } + +// ============================================================ +// Protocol Marshal/Unmarshal Tests +// ============================================================ + +func TestCommonHeader_MarshalRoundTrip(t *testing.T) { + hdr := CommonHeader{ + Type: pduCapsuleCmd, + Flags: 0x04, + HeaderLength: 72, + DataOffset: 72, + DataLength: 200, + } + buf := make([]byte, commonHeaderSize) + hdr.Marshal(buf) + + var got CommonHeader + got.Unmarshal(buf) + if got != hdr { + t.Fatalf("got %+v, want %+v", got, hdr) + } +} + +func TestCapsuleCommand_MarshalRoundTrip(t *testing.T) { + cmd := CapsuleCommand{ + OpCode: ioRead, + CID: 42, + NSID: 1, + D10: 0x100, + D11: 0, + D12: 7, // 8 blocks (0-based) + } + buf := make([]byte, capsuleCmdSize) + cmd.Marshal(buf) + + var got CapsuleCommand + got.Unmarshal(buf) + if got.OpCode != cmd.OpCode || got.CID != cmd.CID || got.D10 != cmd.D10 || got.D12 != cmd.D12 { + t.Fatalf("got %+v", got) + } + if got.Lba() != 0x100 { + t.Fatalf("Lba() = %d, want 256", got.Lba()) + } + if got.LbaLength() != 8 { + t.Fatalf("LbaLength() = %d, want 8", got.LbaLength()) + } +} + +func TestCapsuleResponse_MarshalRoundTrip(t *testing.T) { + resp := CapsuleResponse{ + DW0: 0x12345678, + DW1: 0xABCD, + SQHD: 5, + QueueID: 0, + CID: 42, + Status: uint16(StatusSuccess), + } + buf := make([]byte, capsuleRespSize) + resp.Marshal(buf) + + var got CapsuleResponse + got.Unmarshal(buf) + if got.DW0 != resp.DW0 || got.SQHD != resp.SQHD || got.CID != resp.CID || got.Status != resp.Status { + t.Fatalf("got %+v, want %+v", got, resp) + } +} + +func TestICRequest_MarshalRoundTrip(t *testing.T) { + req := ICRequest{ + PDUFormatVersion: 0x0100, + PDUDataAlignment: 2, + PDUMaxR2T: 4, + } + buf := make([]byte, icBodySize) + req.Marshal(buf) + + var got ICRequest + got.Unmarshal(buf) + if got.PDUFormatVersion != req.PDUFormatVersion || got.PDUMaxR2T != req.PDUMaxR2T { + t.Fatalf("got %+v, want %+v", got, req) + } +} + +func TestICResponse_MarshalRoundTrip(t *testing.T) { + resp := ICResponse{MaxH2CDataLength: maxH2CDataLen} + buf := make([]byte, icBodySize) + resp.Marshal(buf) + + var got ICResponse + got.Unmarshal(buf) + if got.MaxH2CDataLength != maxH2CDataLen { + t.Fatalf("MaxH2CDataLength = %d, want %d", got.MaxH2CDataLength, maxH2CDataLen) + } +} + +func TestC2HDataHeader_MarshalRoundTrip(t *testing.T) { + hdr := C2HDataHeader{ + CCCID: 7, + DATAO: 1024, + DATAL: 4096, + } + buf := make([]byte, c2hDataHdrSize) + hdr.Marshal(buf) + + var got C2HDataHeader + got.Unmarshal(buf) + if got.CCCID != 7 || got.DATAO != 1024 || got.DATAL != 4096 { + t.Fatalf("got %+v", got) + } +} + +func TestConnectData_MarshalRoundTrip(t *testing.T) { + cd := ConnectData{ + HostID: [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + CNTLID: 0xFFFF, + SubNQN: "nqn.2024-01.com.seaweedfs:vol.test", + HostNQN: "nqn.2024-01.com.seaweedfs:host", + } + buf := make([]byte, connectDataSize) + cd.Marshal(buf) + + var got ConnectData + got.Unmarshal(buf) + if got.SubNQN != cd.SubNQN || got.HostNQN != cd.HostNQN || got.CNTLID != cd.CNTLID { + t.Fatalf("got SubNQN=%q HostNQN=%q CNTLID=%d", got.SubNQN, got.HostNQN, got.CNTLID) + } + if got.HostID != cd.HostID { + t.Fatalf("HostID mismatch") + } +} + +func TestStatusWord_Encoding(t *testing.T) { + tests := []struct { + name string + sct uint8 + sc uint8 + dnr bool + want StatusWord + }{ + {"Success", 0, 0, false, StatusSuccess}, + {"InvalidOpcode_DNR", 0, 0x01, true, StatusInvalidOpcode}, + {"InvalidField_DNR", 0, 0x02, true, StatusInvalidField}, + {"InternalError", 0, 0x06, false, StatusInternalError}, + {"InternalError_DNR", 0, 0x06, true, StatusInternalErrorDNR}, + {"NSNotReady", 0, 0x82, false, StatusNSNotReady}, + {"NSNotReady_DNR", 0, 0x82, true, StatusNSNotReadyDNR}, + {"LBAOutOfRange", 0, 0x80, true, StatusLBAOutOfRange}, + {"MediaWriteFault", 2, 0x80, false, StatusMediaWriteFault}, + {"MediaReadError", 2, 0x81, false, StatusMediaReadError}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := MakeStatus(tt.sct, tt.sc, tt.dnr) + if got != tt.want { + t.Fatalf("MakeStatus(%d,%d,%v) = 0x%04x, want 0x%04x", tt.sct, tt.sc, tt.dnr, got, tt.want) + } + if got.SCT() != tt.sct { + t.Fatalf("SCT() = %d, want %d", got.SCT(), tt.sct) + } + if got.SC() != tt.sc { + t.Fatalf("SC() = 0x%02x, want 0x%02x", got.SC(), tt.sc) + } + if got.DNR() != tt.dnr { + t.Fatalf("DNR() = %v, want %v", got.DNR(), tt.dnr) + } + }) + } +} + +func TestStatusWord_IsError(t *testing.T) { + if StatusSuccess.IsError() { + t.Fatal("Success should not be error") + } + if !StatusInvalidOpcode.IsError() { + t.Fatal("InvalidOpcode should be error") + } +} + +// ============================================================ +// Wire Reader/Writer Tests +// ============================================================ + +func TestWire_WriteReadRoundTrip_HeaderOnly(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + r := NewReader(&buf) + + resp := ICResponse{MaxH2CDataLength: maxH2CDataLen} + if err := w.SendHeaderOnly(pduICResp, &resp, icBodySize); err != nil { + t.Fatal(err) + } + + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type != pduICResp { + t.Fatalf("type = 0x%x, want 0x%x", hdr.Type, pduICResp) + } + if hdr.HeaderLength != icHdrLen { + t.Fatalf("HeaderLength = %d, want %d", hdr.HeaderLength, icHdrLen) + } + if hdr.DataOffset != 0 { + t.Fatalf("DataOffset = %d, want 0", hdr.DataOffset) + } + + var got ICResponse + if err := r.Receive(&got); err != nil { + t.Fatal(err) + } + if got.MaxH2CDataLength != maxH2CDataLen { + t.Fatalf("MaxH2CDataLength = %d", got.MaxH2CDataLength) + } +} + +func TestWire_WriteReadRoundTrip_WithData(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + r := NewReader(&buf) + + c2h := C2HDataHeader{CCCID: 5, DATAO: 0, DATAL: 4096} + payload := make([]byte, 4096) + for i := range payload { + payload[i] = byte(i & 0xFF) + } + + if err := w.SendWithData(pduC2HData, c2hFlagLast, &c2h, c2hDataHdrSize, payload); err != nil { + t.Fatal(err) + } + + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type != pduC2HData { + t.Fatalf("type = 0x%x", hdr.Type) + } + if hdr.Flags != c2hFlagLast { + t.Fatalf("flags = 0x%x", hdr.Flags) + } + if hdr.DataOffset != c2hDataHdrLen { + t.Fatalf("DataOffset = %d, want %d", hdr.DataOffset, c2hDataHdrLen) + } + + var gotHdr C2HDataHeader + if err := r.Receive(&gotHdr); err != nil { + t.Fatal(err) + } + if gotHdr.CCCID != 5 || gotHdr.DATAL != 4096 { + t.Fatalf("got %+v", gotHdr) + } + + dataLen := r.Length() + if dataLen != 4096 { + t.Fatalf("Length() = %d", dataLen) + } + + gotData := make([]byte, dataLen) + if err := r.ReceiveData(gotData); err != nil { + t.Fatal(err) + } + if !bytes.Equal(gotData, payload) { + t.Fatal("payload mismatch") + } +} + +func TestWire_MultiPDU(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + r := NewReader(&buf) + + // Write 3 header-only PDUs + for i := 0; i < 3; i++ { + resp := CapsuleResponse{CID: uint16(i), Status: 0} + if err := w.SendHeaderOnly(pduCapsuleResp, &resp, capsuleRespSize); err != nil { + t.Fatal(err) + } + } + + // Read all 3 + for i := 0; i < 3; i++ { + hdr, err := r.Dequeue() + if err != nil { + t.Fatalf("pdu %d: %v", i, err) + } + if hdr.Type != pduCapsuleResp { + t.Fatalf("pdu %d: type 0x%x", i, hdr.Type) + } + var resp CapsuleResponse + if err := r.Receive(&resp); err != nil { + t.Fatalf("pdu %d: %v", i, err) + } + if resp.CID != uint16(i) { + t.Fatalf("pdu %d: CID=%d", i, resp.CID) + } + } +} + +func TestWire_PayloadSize(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + r := NewReader(&buf) + + // Header-only: Length() should be 0 + resp := CapsuleResponse{} + w.SendHeaderOnly(pduCapsuleResp, &resp, capsuleRespSize) + r.Dequeue() + r.Receive(&resp) + if r.Length() != 0 { + t.Fatalf("expected 0 length for header-only, got %d", r.Length()) + } +} + +func TestWire_CapsuleCmdWithData(t *testing.T) { + var buf bytes.Buffer + w := NewWriter(&buf) + r := NewReader(&buf) + + cmd := CapsuleCommand{OpCode: ioWrite, CID: 10, D10: 0, D12: 0} + data := []byte("hello world block data 123") + + if err := w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, data); err != nil { + t.Fatal(err) + } + + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.DataOffset != capsuleCmdHdrLen { + t.Fatalf("DataOffset = %d", hdr.DataOffset) + } + + var gotCmd CapsuleCommand + r.Receive(&gotCmd) + if gotCmd.CID != 10 { + t.Fatalf("CID = %d", gotCmd.CID) + } + + payloadLen := r.Length() + if payloadLen != uint32(len(data)) { + t.Fatalf("Length() = %d, want %d", payloadLen, len(data)) + } + + gotData := make([]byte, payloadLen) + r.ReceiveData(gotData) + if !bytes.Equal(gotData, data) { + t.Fatal("data mismatch") + } +} + +// ============================================================ +// Controller + Fabric Tests (using pipe connections) +// ============================================================ + +func pipeConn() (client, server net.Conn) { + s, c := net.Pipe() + return c, s +} + +// sendICReq writes an ICRequest PDU to the writer. +func sendICReq(w *Writer) error { + req := ICRequest{} + return w.SendHeaderOnly(pduICReq, &req, icBodySize) +} + +// recvICResp reads and validates an ICResponse PDU. +func recvICResp(t *testing.T, r *Reader) { + t.Helper() + hdr, err := r.Dequeue() + if err != nil { + t.Fatalf("ICResp dequeue: %v", err) + } + if hdr.Type != pduICResp { + t.Fatalf("expected ICResp, got 0x%x", hdr.Type) + } + var resp ICResponse + if err := r.Receive(&resp); err != nil { + t.Fatal(err) + } + if resp.MaxH2CDataLength != maxH2CDataLen { + t.Fatalf("MaxH2CDataLength = %d", resp.MaxH2CDataLength) + } +} + +// sendConnect sends a Fabric Connect capsule with inline ConnectData. +func sendConnect(w *Writer, queueID, queueSize uint16, kato uint32, subNQN, hostNQN string, cntlID uint16) error { + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcConnect, + CID: 0, + D10: uint32(queueID) << 16, + D11: uint32(queueSize - 1), + D12: kato, + } + cd := ConnectData{ + CNTLID: cntlID, + SubNQN: subNQN, + HostNQN: hostNQN, + } + payload := make([]byte, connectDataSize) + cd.Marshal(payload) + return w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, payload) +} + +// recvCapsuleResp reads a CapsuleResponse and returns it. +func recvCapsuleResp(t *testing.T, r *Reader) CapsuleResponse { + t.Helper() + hdr, err := r.Dequeue() + if err != nil { + t.Fatalf("CapsuleResp dequeue: %v", err) + } + if hdr.Type != pduCapsuleResp { + t.Fatalf("expected CapsuleResp (0x5), got 0x%x", hdr.Type) + } + var resp CapsuleResponse + if err := r.Receive(&resp); err != nil { + t.Fatal(err) + } + return resp +} + +// setupAdminSession performs IC + admin Connect on a pipe. +func setupAdminSession(t *testing.T, subNQN string) (client net.Conn, clientR *Reader, clientW *Writer, ctrl *Controller, cntlID uint16) { + t.Helper() + + clientConn, serverConn := pipeConn() + dev := newMockDevice(1024, 512) + srv := NewServer(Config{ + Enabled: true, + ListenAddr: "127.0.0.1:0", + MaxIOQueues: 4, + }) + srv.AddVolume(subNQN, dev, dev.DeviceNGUID()) + + ctrl = newController(serverConn, srv) + + // Run controller in background + go ctrl.Serve() + + clientR = NewReader(clientConn) + clientW = NewWriter(clientConn) + + // IC handshake + if err := sendICReq(clientW); err != nil { + t.Fatal(err) + } + recvICResp(t, clientR) + + // Admin Connect + if err := sendConnect(clientW, 0, 64, 60000, subNQN, "host-nqn", 0xFFFF); err != nil { + t.Fatal(err) + } + resp := recvCapsuleResp(t, clientR) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Connect failed: 0x%04x", resp.Status) + } + cntlID = uint16(resp.DW0) + + return clientConn, clientR, clientW, ctrl, cntlID +} + +func TestController_ICHandshake(t *testing.T) { + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + if err := sendICReq(w); err != nil { + t.Fatal(err) + } + recvICResp(t, r) + + clientConn.Close() +} + +func TestController_AdminConnect(t *testing.T) { + nqn := "nqn.test:vol.t1" + client, _, _, _, cntlID := setupAdminSession(t, nqn) + defer client.Close() + + if cntlID == 0 { + t.Fatal("expected non-zero CNTLID") + } +} + +func TestController_ConnectUnknownNQN(t *testing.T) { + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + sendConnect(w, 0, 64, 0, "nqn.unknown", "host", 0xFFFF) + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for unknown NQN") + } + + clientConn.Close() +} + +func TestController_PropertyGetCAP(t *testing.T) { + nqn := "nqn.test:propget" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // PropertyGet CAP (8 bytes) + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertyGet, + CID: 1, + D10: propCAP, + D11: 1, // 8-byte + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("PropertyGet CAP failed: 0x%04x", resp.Status) + } + cap := uint64(resp.DW0) | uint64(resp.DW1)<<32 + // MQES should be 63 + if cap&0xFFFF != 63 { + t.Fatalf("MQES = %d, want 63", cap&0xFFFF) + } + + client.Close() +} + +func TestController_PropertySetCC_EN(t *testing.T) { + nqn := "nqn.test:propset" + client, r, w, ctrl, _ := setupAdminSession(t, nqn) + defer client.Close() + + // PropertySet CC.EN=1 + cmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertySet, + CID: 2, + D10: propCC, + D14: 1, // CC.EN=1 + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("PropertySet CC failed: 0x%04x", resp.Status) + } + + // Verify CSTS.RDY via PropertyGet + cmd2 := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertyGet, + CID: 3, + D10: propCSTS, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd2, capsuleCmdSize, nil) + resp2 := recvCapsuleResp(t, r) + if resp2.DW0&1 != 1 { + t.Fatal("CSTS.RDY not set after CC.EN=1") + } + + _ = ctrl + client.Close() +} + +// ============================================================ +// Identify Tests +// ============================================================ + +func TestIdentify_Controller(t *testing.T) { + nqn := "nqn.test:id-ctrl" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 10, + D10: uint32(cnsIdentifyController), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + // Expect C2HData + CapsuleResp + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + if len(data) != identifySize { + t.Fatalf("identify data size = %d, want %d", len(data), identifySize) + } + + // SQES + if data[512] != 0x66 { + t.Fatalf("SQES = 0x%02x, want 0x66", data[512]) + } + // CQES + if data[513] != 0x44 { + t.Fatalf("CQES = 0x%02x, want 0x44", data[513]) + } + // VWC + if data[525] != 0x01 { + t.Fatalf("VWC = 0x%02x, want 0x01", data[525]) + } + // MDTS + if data[77] != 3 { + t.Fatalf("MDTS = %d, want 3", data[77]) + } + // SubNQN check + subNQN := string(bytes.TrimRight(data[768:1024], " ")) + if subNQN != nqn { + t.Fatalf("SubNQN = %q, want %q", subNQN, nqn) + } + // IOCCSZ + ioccsz := binary.LittleEndian.Uint32(data[1792:]) + if ioccsz != 4 { + t.Fatalf("IOCCSZ = %d, want 4", ioccsz) + } + // IORCSZ + iorcsz := binary.LittleEndian.Uint32(data[1796:]) + if iorcsz != 1 { + t.Fatalf("IORCSZ = %d, want 1", iorcsz) + } + // NN + nn := binary.LittleEndian.Uint32(data[516:]) + if nn != 1 { + t.Fatalf("NN = %d, want 1", nn) + } + + // Read trailing CapsuleResp + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Identify Ctrl response error: 0x%04x", resp.Status) + } + + client.Close() +} + +func TestIdentify_Namespace_512B(t *testing.T) { + testIdentifyNS(t, 512) +} + +func TestIdentify_Namespace_4K(t *testing.T) { + testIdentifyNS(t, 4096) +} + +func testIdentifyNS(t *testing.T, blockSize uint32) { + t.Helper() + nqn := "nqn.test:id-ns" + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + dev := newMockDevice(1024, blockSize) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + sendConnect(w, 0, 64, 0, nqn, "host", 0xFFFF) + recvCapsuleResp(t, r) + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 11, + D10: uint32(cnsIdentifyNamespace), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + expectedNSZE := dev.VolumeSize() / uint64(blockSize) + nsze := binary.LittleEndian.Uint64(data[0:]) + if nsze != expectedNSZE { + t.Fatalf("NSZE = %d, want %d", nsze, expectedNSZE) + } + + // LBAF[0]: bits 23:16 = log2(blockSize) + lbaf0 := binary.LittleEndian.Uint32(data[128:]) + var expectedLBADS uint8 + switch blockSize { + case 512: + expectedLBADS = 9 + case 4096: + expectedLBADS = 12 + } + gotLBADS := uint8((lbaf0 >> 16) & 0xFF) + if gotLBADS != expectedLBADS { + t.Fatalf("LBADS = %d, want %d", gotLBADS, expectedLBADS) + } + + recvCapsuleResp(t, r) + clientConn.Close() +} + +func TestIdentify_ActiveNSList(t *testing.T) { + nqn := "nqn.test:nslist" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 12, + D10: uint32(cnsActiveNSList), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + nsid := binary.LittleEndian.Uint32(data[0:]) + if nsid != 1 { + t.Fatalf("NSID = %d, want 1", nsid) + } + + recvCapsuleResp(t, r) + client.Close() +} + +func TestIdentify_NSDescriptors(t *testing.T) { + nqn := "nqn.test:nsdesc" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 13, + D10: uint32(cnsNSDescriptorList), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + // Type = 0x02 (NGUID), Length = 16 + if data[0] != 0x02 || data[1] != 16 { + t.Fatalf("NS descriptor: type=0x%02x len=%d", data[0], data[1]) + } + + recvCapsuleResp(t, r) + client.Close() +} + +// ============================================================ +// Admin Command Tests +// ============================================================ + +func TestAdmin_SetFeatures_NumQueues(t *testing.T) { + nqn := "nqn.test:numq" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // Request 8 queues + cmd := CapsuleCommand{ + OpCode: adminSetFeatures, + CID: 20, + D10: uint32(fidNumberOfQueues), + D11: 7 | (7 << 16), // NCQR=7, NSQR=7 (both request 8, 0-based) + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("SetFeatures failed: 0x%04x", resp.Status) + } + + // MaxIOQueues=4, so granted should be 4 + ncqr := resp.DW0 & 0xFFFF + nsqr := resp.DW0 >> 16 + if ncqr != 3 || nsqr != 3 { // 0-based: 3 means 4 queues + t.Fatalf("NCQR=%d NSQR=%d, want 3,3", ncqr, nsqr) + } + + client.Close() +} + +func TestAdmin_GetLogPage_SMART(t *testing.T) { + nqn := "nqn.test:smart" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // Request 512 bytes of SMART log (NUMD = 512/4 - 1 = 127) + cmd := CapsuleCommand{ + OpCode: adminGetLogPage, + CID: 21, + D10: uint32(logPageSMART) | (127 << 16), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + // Available Spare = 100% + if data[3] != 100 { + t.Fatalf("Available Spare = %d, want 100", data[3]) + } + + recvCapsuleResp(t, r) + client.Close() +} + +func TestAdmin_GetLogPage_ANA(t *testing.T) { + nqn := "nqn.test:ana" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + // Request ANA log + cmd := CapsuleCommand{ + OpCode: adminGetLogPage, + CID: 22, + D10: uint32(logPageANA) | (9 << 16), // NUMD=9 → 40 bytes + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData") + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + // ANA state at offset 32 should be Optimized (0x01) + if data[32] != anaOptimized { + t.Fatalf("ANA state = 0x%02x, want 0x%02x", data[32], anaOptimized) + } + + // NSID at offset 36 + nsid := binary.LittleEndian.Uint32(data[36:]) + if nsid != 1 { + t.Fatalf("NSID = %d", nsid) + } + + recvCapsuleResp(t, r) + client.Close() +} + +func TestAdmin_KeepAlive(t *testing.T) { + nqn := "nqn.test:ka" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminKeepAlive, + CID: 23, + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("KeepAlive failed: 0x%04x", resp.Status) + } + + client.Close() +} + +func TestAdmin_GetFeatures(t *testing.T) { + nqn := "nqn.test:getfeat" + client, r, w, _, _ := setupAdminSession(t, nqn) + defer client.Close() + + cmd := CapsuleCommand{ + OpCode: adminGetFeatures, + CID: 24, + D10: uint32(fidNumberOfQueues), + } + w.SendWithData(pduCapsuleCmd, 0, &cmd, capsuleCmdSize, nil) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("GetFeatures failed: 0x%04x", resp.Status) + } + // Default granted = maxIOQueues=4, so response DW0 = 3 | (3<<16) + ncqr := resp.DW0 & 0xFFFF + nsqr := resp.DW0 >> 16 + if ncqr != 3 || nsqr != 3 { + t.Fatalf("NCQR=%d NSQR=%d", ncqr, nsqr) + } + + client.Close() +} + +// ============================================================ +// IO Command Tests +// ============================================================ + +// setupIOSession creates an admin session, sets up IO queue, and returns IO conn. +func setupIOSession(t *testing.T) (client net.Conn, r *Reader, w *Writer, dev *mockBlockDevice, cleanup func()) { + t.Helper() + nqn := "nqn.test:io" + + clientConn, serverConn := pipeConn() + dev = newMockDevice(256, 512) // 256 blocks * 512 bytes = 128 KB + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r = NewReader(clientConn) + w = NewWriter(clientConn) + + // IC + sendICReq(w) + recvICResp(t, r) + + // Admin Connect + sendConnect(w, 0, 64, 0, nqn, "host", 0xFFFF) + recvCapsuleResp(t, r) + + cleanup = func() { clientConn.Close() } + return clientConn, r, w, dev, cleanup +} + +func TestIO_ReadWrite(t *testing.T) { + _, r, w, dev, cleanup := setupIOSession(t) + defer cleanup() + + // Switch to IO queue by changing the controller's queueID. + // In a real scenario, this would be a separate TCP connection. + // For unit testing, we directly set the state after admin setup. + + // Write 1 block at LBA 0 + writeData := make([]byte, 512) + for i := range writeData { + writeData[i] = 0xAB + } + writeCapsule := CapsuleCommand{ + OpCode: ioWrite, + CID: 100, + NSID: 1, + D10: 0, // LBA 0 + D12: 0, // 1 block (0-based) + } + w.SendWithData(pduCapsuleCmd, 0, &writeCapsule, capsuleCmdSize, writeData) + + // The controller dispatches to IO if queueID > 0. + // Since we're on admin queue (QID=0), this will go through dispatchAdmin + // and get "invalid opcode". For a proper IO test, we need to test the + // IO functions directly. + + // Let's test the IO handlers directly instead: + resp := recvCapsuleResp(t, r) + // On admin queue, IO opcodes are invalid + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for IO opcode on admin queue") + } + + // Direct handler tests below... + _ = dev +} + +// TestIO_HandleRead tests the Read handler directly. +func TestIO_HandleRead(t *testing.T) { + nqn := "nqn.test:io-read" + dev := newMockDevice(256, 512) + // Write known data + for i := 0; i < 512; i++ { + dev.data[i] = byte(i & 0xFF) + } + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 // IO queue + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + // IC + sendICReq(w) + recvICResp(t, r) + + // Read 1 block at LBA 0 + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 200, + D10: 0, + D12: 0, // 1 block + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + // Expect C2HData + CapsuleResp + hdr, err := r.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + r.Receive(&c2h) + data := make([]byte, r.Length()) + r.ReceiveData(data) + + if len(data) != 512 { + t.Fatalf("read data len = %d", len(data)) + } + for i := 0; i < 512; i++ { + if data[i] != byte(i&0xFF) { + t.Fatalf("data[%d] = 0x%02x, want 0x%02x", i, data[i], byte(i&0xFF)) + } + } + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Read response error: 0x%04x", resp.Status) + } + + clientConn.Close() +} + +func TestIO_HandleWrite(t *testing.T) { + nqn := "nqn.test:io-write" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Write 1 block at LBA 5 + writeData := bytes.Repeat([]byte{0xCD}, 512) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 201, + D10: 5, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Write failed: 0x%04x", resp.Status) + } + + // Verify data was written + for i := 0; i < 512; i++ { + if dev.data[5*512+i] != 0xCD { + t.Fatalf("data at LBA 5 offset %d = 0x%02x", i, dev.data[5*512+i]) + } + } + + clientConn.Close() +} + +func TestIO_HandleFlush(t *testing.T) { + nqn := "nqn.test:io-flush" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + flushCmd := CapsuleCommand{ + OpCode: ioFlush, + CID: 202, + } + w.SendWithData(pduCapsuleCmd, 0, &flushCmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Flush failed: 0x%04x", resp.Status) + } + + clientConn.Close() +} + +func TestIO_HandleWriteZeros_Trim(t *testing.T) { + nqn := "nqn.test:io-wz" + dev := newMockDevice(256, 512) + // Fill LBA 10 with data + for i := 0; i < 512; i++ { + dev.data[10*512+i] = 0xFF + } + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // WriteZeros with DEALLOC bit at LBA 10, 1 block + wzCmd := CapsuleCommand{ + OpCode: ioWriteZeros, + CID: 203, + D10: 10, + D12: 0 | commandBitDeallocate, // 1 block + DEALLOC + } + w.SendWithData(pduCapsuleCmd, 0, &wzCmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("WriteZeros failed: 0x%04x", resp.Status) + } + + // Verify data was zeroed + for i := 0; i < 512; i++ { + if dev.data[10*512+i] != 0 { + t.Fatalf("data at LBA 10 offset %d = 0x%02x, expected 0", i, dev.data[10*512+i]) + } + } + + clientConn.Close() +} + +func TestIO_ReadOutOfBounds(t *testing.T) { + nqn := "nqn.test:io-oob" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Read past end: LBA 255, 2 blocks (only 256 blocks total) + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 204, + D10: 255, + D12: 1, // 2 blocks (0-based) + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + status := StatusWord(resp.Status) + if status != StatusLBAOutOfRange { + t.Fatalf("expected LBAOutOfRange, got 0x%04x", resp.Status) + } + if !status.DNR() { + t.Fatal("LBAOutOfRange should have DNR=1") + } + + clientConn.Close() +} + +func TestIO_WriteNoInlineData(t *testing.T) { + nqn := "nqn.test:io-noinline" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Write with no inline data (DataOffset=0) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 205, + D10: 0, + D12: 0, + } + // Send header-only (no data) + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + status := StatusWord(resp.Status) + if status != StatusInvalidField { + t.Fatalf("expected InvalidField for R2T write, got 0x%04x", resp.Status) + } + if !status.DNR() { + t.Fatal("InvalidField should have DNR=1") + } + + clientConn.Close() +} + +func TestIO_WriteUnhealthy(t *testing.T) { + nqn := "nqn.test:io-unhealthy" + dev := newMockDevice(256, 512) + dev.anaState = anaInaccessible // not writable + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + writeData := make([]byte, 512) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 206, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + + resp := recvCapsuleResp(t, r) + status := StatusWord(resp.Status) + if status != StatusNSNotReady { + t.Fatalf("expected NSNotReady for unhealthy, got 0x%04x", resp.Status) + } + + clientConn.Close() +} + +func TestIO_ReadError(t *testing.T) { + nqn := "nqn.test:io-readerr" + dev := newMockDevice(256, 512) + dev.readErr = errors.New("simulated Read error") + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 207, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + resp := recvCapsuleResp(t, r) + status := StatusWord(resp.Status) + if status != StatusMediaReadError { + t.Fatalf("expected MediaReadError, got 0x%04x (SCT=%d SC=0x%02x)", resp.Status, status.SCT(), status.SC()) + } + + clientConn.Close() +} + +func TestIO_WriteError(t *testing.T) { + nqn := "nqn.test:io-writeerr" + dev := newMockDevice(256, 512) + dev.writeErr = errors.New("simulated Write fault") + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + writeData := make([]byte, 512) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 208, + D10: 0, + D12: 0, + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + + resp := recvCapsuleResp(t, r) + status := StatusWord(resp.Status) + if status != StatusMediaWriteFault { + t.Fatalf("expected MediaWriteFault, got 0x%04x", resp.Status) + } + + clientConn.Close() +} + +// ============================================================ +// Error Mapping Tests +// ============================================================ + +func TestErrorMapping_AllSentinels(t *testing.T) { + tests := []struct { + name string + err error + want StatusWord + }{ + {"nil", nil, StatusSuccess}, + {"LeaseExpired", blockvol.ErrLeaseExpired, StatusNSNotReadyDNR}, + {"EpochRegression", blockvol.ErrEpochRegression, StatusInternalErrorDNR}, + {"DurabilityBarrier", blockerr.ErrDurabilityBarrierFailed, StatusInternalError}, + {"DurabilityQuorum", blockerr.ErrDurabilityQuorumLost, StatusInternalError}, + {"WALFull", blockvol.ErrWALFull, StatusNSNotReady}, + {"NotPrimary", blockvol.ErrNotPrimary, StatusNSNotReady}, + {"GenericError", errors.New("something else"), StatusInternalError}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := mapBlockError(tt.err) + if got != tt.want { + t.Fatalf("mapBlockError(%v) = 0x%04x, want 0x%04x", tt.err, got, tt.want) + } + }) + } +} + +func TestErrorMapping_DNR(t *testing.T) { + // LeaseExpired: DNR=1 + s := mapBlockError(blockvol.ErrLeaseExpired) + if !s.DNR() { + t.Fatal("LeaseExpired should have DNR=1") + } + // WALFull: DNR=0 (retryable) + s = mapBlockError(blockvol.ErrWALFull) + if s.DNR() { + t.Fatal("WALFull should have DNR=0") + } +} + +// ============================================================ +// ANA State Tests +// ============================================================ + +func TestANAState_AllRoles(t *testing.T) { + tests := []struct { + role blockvol.Role + want uint8 + }{ + {blockvol.RolePrimary, anaOptimized}, + {blockvol.RoleNone, anaOptimized}, + {blockvol.RoleReplica, anaInaccessible}, + {blockvol.RoleStale, anaPersistentLoss}, + {blockvol.RoleRebuilding, anaInaccessible}, + {blockvol.RoleDraining, anaInaccessible}, + } + for _, tt := range tests { + got := RoleToANAState(tt.role) + if got != tt.want { + t.Fatalf("RoleToANAState(%v) = 0x%02x, want 0x%02x", tt.role, got, tt.want) + } + } +} + +// ============================================================ +// Adapter Tests +// ============================================================ + +func TestNGUID_Generation(t *testing.T) { + uuid := [16]byte{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 1, 2, 3, 4, 5, 6, 7, 8} + nguid := UUIDToNGUID(uuid) + + // NAA-6 prefix: first nibble = 6 + if (nguid[0] >> 4) != 0x06 { + t.Fatalf("NAA prefix = 0x%x, want 0x06", nguid[0]>>4) + } + // Lower nibble from uuid[0] + if (nguid[0] & 0x0F) != (uuid[0] & 0x0F) { + t.Fatalf("lower nibble mismatch") + } + // Bytes 1-7 from uuid + for i := 1; i < 8; i++ { + if nguid[i] != uuid[i] { + t.Fatalf("nguid[%d] = 0x%02x, want 0x%02x", i, nguid[i], uuid[i]) + } + } + // Bytes 8-15 from uuid + for i := 8; i < 16; i++ { + if nguid[i] != uuid[i] { + t.Fatalf("nguid[%d] = 0x%02x, want 0x%02x", i, nguid[i], uuid[i]) + } + } +} + +// ============================================================ +// Server Lifecycle Tests +// ============================================================ + +func TestServer_StartStop(t *testing.T) { + srv := NewServer(Config{ + Enabled: true, + ListenAddr: "127.0.0.1:0", + MaxIOQueues: 4, + }) + + dev := newMockDevice(256, 512) + srv.AddVolume("nqn.test:srv", dev, dev.DeviceNGUID()) + + if err := srv.ListenAndServe(); err != nil { + t.Fatal(err) + } + time.Sleep(10 * time.Millisecond) + + if err := srv.Close(); err != nil { + t.Fatal(err) + } +} + +func TestServer_DisabledNoOp(t *testing.T) { + srv := NewServer(Config{Enabled: false}) + if err := srv.ListenAndServe(); err != nil { + t.Fatal("disabled server should return nil") + } + if err := srv.Close(); err != nil { + t.Fatal("disabled close should return nil") + } +} + +func TestServer_AddRemoveVolume(t *testing.T) { + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + dev := newMockDevice(256, 512) + + nqn := "nqn.test:vol1" + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + if sub := srv.findSubsystem(nqn); sub == nil { + t.Fatal("subsystem not found after add") + } + + srv.RemoveVolume(nqn) + if sub := srv.findSubsystem(nqn); sub != nil { + t.Fatal("subsystem still found after remove") + } +} + +func TestServer_ConcurrentAccept(t *testing.T) { + srv := NewServer(Config{ + Enabled: true, + ListenAddr: "127.0.0.1:0", + MaxIOQueues: 4, + }) + dev := newMockDevice(256, 512) + srv.AddVolume("nqn.test:concurrent", dev, dev.DeviceNGUID()) + + if err := srv.ListenAndServe(); err != nil { + t.Fatal(err) + } + + addr := srv.listener.Addr().String() + + // Connect 3 clients concurrently + var wg sync.WaitGroup + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + conn, err := net.DialTimeout("tcp", addr, time.Second) + if err != nil { + return + } + defer conn.Close() + + w := NewWriter(conn) + r := NewReader(conn) + sendICReq(w) + hdr, err := r.Dequeue() + if err != nil { + return + } + if hdr.Type != pduICResp { + t.Errorf("expected ICResp, got 0x%x", hdr.Type) + } + var resp ICResponse + r.Receive(&resp) + }() + } + wg.Wait() + + srv.Close() +} + +// ============================================================ +// KATO Timeout Test +// ============================================================ + +func TestController_KATOTimeout(t *testing.T) { + nqn := "nqn.test:kato" + + clientConn, serverConn := pipeConn() + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + ctrl := newController(serverConn, srv) + + done := make(chan error, 1) + go func() { + done <- ctrl.Serve() + }() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + // IC + sendICReq(w) + recvICResp(t, r) + + // Connect with very short KATO (100ms) + sendConnect(w, 0, 64, 100, nqn, "host", 0xFFFF) + recvCapsuleResp(t, r) + + // Enable controller (which starts KATO timer) + propSet := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertySet, + CID: 1, + D10: propCC, + D14: 1, // CC.EN=1 + } + w.SendWithData(pduCapsuleCmd, 0, &propSet, capsuleCmdSize, nil) + recvCapsuleResp(t, r) + + // Wait for KATO to expire (100ms + 50% margin = 150ms, wait 300ms) + time.Sleep(300 * time.Millisecond) + + // Connection should be closed by KATO + _, err := r.Dequeue() + if err == nil || err == io.EOF { + // EOF is expected when connection is closed + } + + clientConn.Close() +} + +// ============================================================ +// Full Protocol Sequence Test +// ============================================================ + +func TestFullSequence_ICConnectIdentifyReadWrite(t *testing.T) { + nqn := "nqn.test:fullseq" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + // 1. IC Handshake + sendICReq(w) + recvICResp(t, r) + + // 2. Admin Connect + sendConnect(w, 0, 64, 60000, nqn, "host-nqn", 0xFFFF) + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Connect failed: 0x%04x", resp.Status) + } + + // 3. SetFeatures NumQueues + sfCmd := CapsuleCommand{ + OpCode: adminSetFeatures, + CID: 5, + D10: uint32(fidNumberOfQueues), + D11: 3 | (3 << 16), // 4 queues each + } + w.SendWithData(pduCapsuleCmd, 0, &sfCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("SetFeatures NumQueues failed: 0x%04x", resp.Status) + } + + // 4. PropertySet CC.EN=1 + propCmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcPropertySet, + CID: 6, + D10: propCC, + D14: 1, + } + w.SendWithData(pduCapsuleCmd, 0, &propCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("PropertySet CC.EN failed: 0x%04x", resp.Status) + } + + // 5. Identify Controller + idCmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 7, + D10: uint32(cnsIdentifyController), + } + w.SendWithData(pduCapsuleCmd, 0, &idCmd, capsuleCmdSize, nil) + + hdr, _ := r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData for identify, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + r.Receive(&c2h) + identData := make([]byte, r.Length()) + r.ReceiveData(identData) + + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Identify failed: 0x%04x", resp.Status) + } + + // 6. Identify Namespace + idNsCmd := CapsuleCommand{ + OpCode: adminIdentify, + CID: 8, + D10: uint32(cnsIdentifyNamespace), + } + w.SendWithData(pduCapsuleCmd, 0, &idNsCmd, capsuleCmdSize, nil) + + hdr, _ = r.Dequeue() + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData for identify ns") + } + r.Receive(&c2h) + nsData := make([]byte, r.Length()) + r.ReceiveData(nsData) + nsze := binary.LittleEndian.Uint64(nsData[0:]) + if nsze != 256 { + t.Fatalf("NSZE = %d, want 256", nsze) + } + recvCapsuleResp(t, r) + + // 7. KeepAlive + kaCmd := CapsuleCommand{ + OpCode: adminKeepAlive, + CID: 9, + } + w.SendWithData(pduCapsuleCmd, 0, &kaCmd, capsuleCmdSize, nil) + resp = recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("KeepAlive failed: 0x%04x", resp.Status) + } + + clientConn.Close() +} + +func TestServer_NQN(t *testing.T) { + srv := NewServer(Config{ + NQNPrefix: "nqn.2024-01.com.seaweedfs:vol.", + }) + got := srv.NQN("test-vol") + want := "nqn.2024-01.com.seaweedfs:vol.test-vol" + if got != want { + t.Fatalf("NQN() = %q, want %q", got, want) + } +} + +// ============================================================ +// Cross-Connection IO Queue Tests (Finding #1) +// ============================================================ + +// TestIOQueue_CrossConnection verifies that IO queues on separate TCP +// connections can validate CNTLID against the admin session registry. +func TestIOQueue_CrossConnection(t *testing.T) { + nqn := "nqn.test:cross-conn" + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + // --- Admin queue connection (QID=0) --- + adminClient, adminServer := pipeConn() + defer adminClient.Close() + + adminCtrl := newController(adminServer, srv) + go adminCtrl.Serve() + + ar := NewReader(adminClient) + aw := NewWriter(adminClient) + + sendICReq(aw) + recvICResp(t, ar) + + sendConnect(aw, 0, 64, 60000, nqn, "host-nqn", 0xFFFF) + resp := recvCapsuleResp(t, ar) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Admin Connect failed: 0x%04x", resp.Status) + } + cntlID := uint16(resp.DW0) + if cntlID == 0 { + t.Fatal("expected non-zero CNTLID") + } + + // --- IO queue connection (QID=1, separate TCP conn) --- + ioClient, ioServer := pipeConn() + defer ioClient.Close() + + ioCtrl := newController(ioServer, srv) + go ioCtrl.Serve() + + ir := NewReader(ioClient) + iw := NewWriter(ioClient) + + sendICReq(iw) + recvICResp(t, ir) + + // IO Connect with CNTLID from admin session + sendConnect(iw, 1, 64, 0, nqn, "host-nqn", cntlID) + resp = recvCapsuleResp(t, ir) + if StatusWord(resp.Status).IsError() { + t.Fatalf("IO Connect failed: 0x%04x", resp.Status) + } + if uint16(resp.DW0) != cntlID { + t.Fatalf("IO Connect returned CNTLID=%d, want %d", resp.DW0, cntlID) + } + + // Verify IO commands work on the IO queue + writeData := bytes.Repeat([]byte{0xEE}, 512) + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 300, + D10: 0, + D12: 0, // 1 block + } + iw.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + resp = recvCapsuleResp(t, ir) + if StatusWord(resp.Status).IsError() { + t.Fatalf("IO Write failed: 0x%04x", resp.Status) + } + + // Read back + readCmd := CapsuleCommand{ + OpCode: ioRead, + CID: 301, + D10: 0, + D12: 0, + } + iw.SendWithData(pduCapsuleCmd, 0, &readCmd, capsuleCmdSize, nil) + + hdr, err := ir.Dequeue() + if err != nil { + t.Fatal(err) + } + if hdr.Type != pduC2HData { + t.Fatalf("expected C2HData, got 0x%x", hdr.Type) + } + var c2h C2HDataHeader + ir.Receive(&c2h) + data := make([]byte, ir.Length()) + ir.ReceiveData(data) + + if !bytes.Equal(data, writeData) { + t.Fatal("read data doesn't match written data") + } + + resp = recvCapsuleResp(t, ir) + if StatusWord(resp.Status).IsError() { + t.Fatalf("IO Read failed: 0x%04x", resp.Status) + } + + adminClient.Close() + ioClient.Close() +} + +// TestIOQueue_InvalidCNTLID verifies that IO queue connect with wrong CNTLID fails. +func TestIOQueue_InvalidCNTLID(t *testing.T) { + nqn := "nqn.test:bad-cntlid" + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + // IO queue connection with CNTLID that doesn't exist (no admin session) + ioClient, ioServer := pipeConn() + defer ioClient.Close() + + ioCtrl := newController(ioServer, srv) + go ioCtrl.Serve() + + ir := NewReader(ioClient) + iw := NewWriter(ioClient) + + sendICReq(iw) + recvICResp(t, ir) + + // Try IO Connect with bogus CNTLID + sendConnect(iw, 1, 64, 0, nqn, "host-nqn", 9999) + resp := recvCapsuleResp(t, ir) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for invalid CNTLID") + } + + ioClient.Close() +} + +// TestIOQueue_NQNMismatch verifies IO queue connect fails when SubNQN +// doesn't match the admin session's SubNQN. +func TestIOQueue_NQNMismatch(t *testing.T) { + nqn1 := "nqn.test:nqn-match" + nqn2 := "nqn.test:nqn-other" + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn1, dev, dev.DeviceNGUID()) + srv.AddVolume(nqn2, dev, dev.DeviceNGUID()) + + // Admin queue connect to nqn1 + adminClient, adminServer := pipeConn() + defer adminClient.Close() + + adminCtrl := newController(adminServer, srv) + go adminCtrl.Serve() + + ar := NewReader(adminClient) + aw := NewWriter(adminClient) + + sendICReq(aw) + recvICResp(t, ar) + + sendConnect(aw, 0, 64, 0, nqn1, "host", 0xFFFF) + resp := recvCapsuleResp(t, ar) + cntlID := uint16(resp.DW0) + + // IO queue connect with same CNTLID but different NQN + ioClient, ioServer := pipeConn() + defer ioClient.Close() + + ioCtrl := newController(ioServer, srv) + go ioCtrl.Serve() + + ir := NewReader(ioClient) + iw := NewWriter(ioClient) + + sendICReq(iw) + recvICResp(t, ir) + + sendConnect(iw, 1, 64, 0, nqn2, "host", cntlID) + resp = recvCapsuleResp(t, ir) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for NQN mismatch on IO queue connect") + } + + adminClient.Close() + ioClient.Close() +} + +// TestAdminSession_UnregisteredOnShutdown verifies admin sessions are +// cleaned up when the admin controller shuts down. +func TestAdminSession_UnregisteredOnShutdown(t *testing.T) { + nqn := "nqn.test:unreg" + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + adminClient, adminServer := pipeConn() + + adminCtrl := newController(adminServer, srv) + go adminCtrl.Serve() + + ar := NewReader(adminClient) + aw := NewWriter(adminClient) + + sendICReq(aw) + recvICResp(t, ar) + + sendConnect(aw, 0, 64, 0, nqn, "host", 0xFFFF) + resp := recvCapsuleResp(t, ar) + cntlID := uint16(resp.DW0) + + // Admin session should be registered + if srv.lookupAdmin(cntlID) == nil { + t.Fatal("admin session not registered") + } + + // Close admin connection → triggers shutdown → unregister + adminClient.Close() + time.Sleep(50 * time.Millisecond) // give goroutine time to cleanup + + if srv.lookupAdmin(cntlID) != nil { + t.Fatal("admin session should be unregistered after shutdown") + } +} + +// ============================================================ +// Header Bounds Validation Tests (Finding #2) +// ============================================================ + +func TestReader_MalformedHeader_TooSmall(t *testing.T) { + // HeaderLength < 8 (common header size) + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleCmd, + HeaderLength: 4, // invalid: < 8 + DataLength: 4, + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for HeaderLength < 8") + } +} + +func TestReader_MalformedHeader_TooLarge(t *testing.T) { + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleCmd, + HeaderLength: 255, // larger than maxHeaderSize + DataLength: 255, + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for HeaderLength > maxHeaderSize") + } +} + +func TestReader_MalformedHeader_DataOffsetLessThanHeaderLength(t *testing.T) { + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleCmd, + HeaderLength: 72, + DataOffset: 32, // invalid: < HeaderLength + DataLength: 100, + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for DataOffset < HeaderLength") + } +} + +func TestReader_MalformedHeader_DataOffsetGtDataLength(t *testing.T) { + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleCmd, + HeaderLength: 72, + DataOffset: 72, + DataLength: 50, // invalid: DataOffset > DataLength + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for DataOffset > DataLength") + } +} + +func TestReader_MalformedHeader_DataLengthLtHeaderLength(t *testing.T) { + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleCmd, + HeaderLength: 72, + DataOffset: 0, + DataLength: 40, // invalid: DataLength < HeaderLength + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for DataLength < HeaderLength") + } +} + +func TestReader_MalformedHeader_DataOffsetZero_ExtraDataLength(t *testing.T) { + // DataOffset==0 but DataLength > HeaderLength → unconsumed bytes would desync stream. + buf := make([]byte, commonHeaderSize) + hdr := CommonHeader{ + Type: pduCapsuleResp, + HeaderLength: 24, + DataOffset: 0, + DataLength: 100, // invalid: no data expected but DataLength > HeaderLength + } + hdr.Marshal(buf) + r := NewReader(bytes.NewReader(buf)) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected error for DataOffset=0 with DataLength > HeaderLength") + } +} + +// ============================================================ +// IO Queue Host Identity Tests (Finding: HostNQN continuity) +// ============================================================ + +func TestIOQueue_HostNQNMismatch(t *testing.T) { + nqn := "nqn.test:hostnqn" + dev := newMockDevice(256, 512) + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + // Admin connect with HostNQN "host-A" + adminClient, adminServer := pipeConn() + defer adminClient.Close() + + adminCtrl := newController(adminServer, srv) + go adminCtrl.Serve() + + ar := NewReader(adminClient) + aw := NewWriter(adminClient) + + sendICReq(aw) + recvICResp(t, ar) + + sendConnect(aw, 0, 64, 0, nqn, "host-A", 0xFFFF) + resp := recvCapsuleResp(t, ar) + cntlID := uint16(resp.DW0) + + // IO connect with same CNTLID + SubNQN but different HostNQN "host-B" + ioClient, ioServer := pipeConn() + defer ioClient.Close() + + ioCtrl := newController(ioServer, srv) + go ioCtrl.Serve() + + ir := NewReader(ioClient) + iw := NewWriter(ioClient) + + sendICReq(iw) + recvICResp(t, ir) + + sendConnect(iw, 1, 64, 0, nqn, "host-B", cntlID) + resp = recvCapsuleResp(t, ir) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for HostNQN mismatch on IO queue connect") + } + + adminClient.Close() + ioClient.Close() +} + +// ============================================================ +// Write Payload Size Validation Tests (Finding #3) +// ============================================================ + +func TestIO_WritePayloadSizeMismatch(t *testing.T) { + nqn := "nqn.test:io-paysize" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Write with NLB=1 (1 block = 512 bytes) but payload = 256 bytes + writeData := make([]byte, 256) // wrong size + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 400, + D10: 0, + D12: 0, // 1 block + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for payload size mismatch") + } + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("expected InvalidField, got 0x%04x", resp.Status) + } + + clientConn.Close() +} + +func TestIO_WritePayloadTooLarge(t *testing.T) { + nqn := "nqn.test:io-paysize2" + dev := newMockDevice(256, 512) + + srv := NewServer(Config{Enabled: true, ListenAddr: "127.0.0.1:0", MaxIOQueues: 4}) + srv.AddVolume(nqn, dev, dev.DeviceNGUID()) + + clientConn, serverConn := pipeConn() + defer clientConn.Close() + + ctrl := newController(serverConn, srv) + ctrl.subsystem = srv.findSubsystem(nqn) + ctrl.queueID = 1 + ctrl.queueSize = 64 + go ctrl.Serve() + + r := NewReader(clientConn) + w := NewWriter(clientConn) + + sendICReq(w) + recvICResp(t, r) + + // Write with NLB=1 (1 block = 512 bytes) but payload = 1024 bytes + writeData := make([]byte, 1024) // too large + writeCmd := CapsuleCommand{ + OpCode: ioWrite, + CID: 401, + D10: 0, + D12: 0, // 1 block + } + w.SendWithData(pduCapsuleCmd, 0, &writeCmd, capsuleCmdSize, writeData) + + resp := recvCapsuleResp(t, r) + if !StatusWord(resp.Status).IsError() { + t.Fatal("expected error for oversized payload") + } + if StatusWord(resp.Status) != StatusInvalidField { + t.Fatalf("expected InvalidField, got 0x%04x", resp.Status) + } + + clientConn.Close() +} + +// ============================================================ +// Disconnect Non-Error Tests (Finding #5) +// ============================================================ + +func TestDisconnect_NoError(t *testing.T) { + nqn := "nqn.test:disconnect" + client, r, w, _, cntlID := setupAdminSession(t, nqn) + defer client.Close() + + _ = cntlID + + // Send Disconnect + disconnectCmd := CapsuleCommand{ + OpCode: adminFabric, + FCType: fcDisconnect, + CID: 50, + } + w.SendWithData(pduCapsuleCmd, 0, &disconnectCmd, capsuleCmdSize, nil) + + // Should get a success response + resp := recvCapsuleResp(t, r) + if StatusWord(resp.Status).IsError() { + t.Fatalf("Disconnect response should be success, got 0x%04x", resp.Status) + } + + // Connection should be closed after disconnect + time.Sleep(50 * time.Millisecond) + _, err := r.Dequeue() + if err == nil { + t.Fatal("expected read error after disconnect") + } + + client.Close() +} diff --git a/weed/storage/blockvol/nvme/protocol.go b/weed/storage/blockvol/nvme/protocol.go new file mode 100644 index 000000000..a5eb803e9 --- /dev/null +++ b/weed/storage/blockvol/nvme/protocol.go @@ -0,0 +1,444 @@ +// Package nvme implements an NVMe/TCP target for SeaweedFS BlockVol. +// +// This package provides a functionally correct NVMe-oF over TCP transport +// that shares the same BlockVol engine, fencing, replication, and failover +// as the iSCSI target. +package nvme + +import ( + "encoding/binary" + "fmt" +) + +// ---------- PDU type codes ---------- + +const ( + pduICReq uint8 = 0x0 // Initialization Connection Request + pduICResp uint8 = 0x1 // Initialization Connection Response + pduH2CTermReq uint8 = 0x2 // Host-to-Controller Termination Request + pduC2HTermReq uint8 = 0x3 // Controller-to-Host Termination Request + pduCapsuleCmd uint8 = 0x4 // NVMe Capsule Command + pduCapsuleResp uint8 = 0x5 // NVMe Capsule Response + pduC2HData uint8 = 0x7 // Controller-to-Host Data Transfer + pduR2T uint8 = 0x9 // Ready-to-Transfer +) + +// ---------- Admin command opcodes ---------- + +const ( + adminFlush uint8 = 0x00 // NVM Flush (admin context unused here) + adminGetLogPage uint8 = 0x02 + adminIdentify uint8 = 0x06 + adminAbort uint8 = 0x08 + adminSetFeatures uint8 = 0x09 + adminGetFeatures uint8 = 0x0A + adminAsyncEvent uint8 = 0x0C + adminKeepAlive uint8 = 0x18 + adminFabric uint8 = 0x7F // Fabric-specific commands +) + +// ---------- IO command opcodes ---------- + +const ( + ioFlush uint8 = 0x00 + ioWrite uint8 = 0x01 + ioRead uint8 = 0x02 + ioWriteZeros uint8 = 0x08 +) + +// ---------- Fabric command types (FCType) ---------- + +const ( + fcPropertySet uint8 = 0x00 + fcConnect uint8 = 0x01 + fcPropertyGet uint8 = 0x04 + fcDisconnect uint8 = 0x08 +) + +// ---------- Feature identifiers ---------- + +const ( + fidNumberOfQueues uint8 = 0x07 + fidAsyncEventConfig uint8 = 0x0B + fidKeepAliveTimer uint8 = 0x0F +) + +// ---------- Identify CNS types ---------- + +const ( + cnsIdentifyNamespace uint8 = 0x00 + cnsIdentifyController uint8 = 0x01 + cnsActiveNSList uint8 = 0x02 + cnsNSDescriptorList uint8 = 0x03 +) + +// ---------- Log page identifiers ---------- + +const ( + logPageError uint8 = 0x01 + logPageSMART uint8 = 0x02 + logPageANA uint8 = 0x0C +) + +// ---------- Property register offsets ---------- + +const ( + propCAP uint32 = 0x00 // Controller Capabilities + propVS uint32 = 0x08 // Version + propCC uint32 = 0x14 // Controller Configuration + propCSTS uint32 = 0x1C // Controller Status +) + +// ---------- ANA states ---------- + +const ( + anaOptimized uint8 = 0x01 + anaNonOptimized uint8 = 0x02 + anaInaccessible uint8 = 0x03 + anaPersistentLoss uint8 = 0x04 + anaChange uint8 = 0x0F +) + +// ---------- Misc constants ---------- + +const ( + commonHeaderSize = 8 + maxHeaderSize = 128 + maxH2CDataLen = 0x8000 // 32 KB + + capsuleCmdSize = 64 // CapsuleCommand specific header size (after CommonHeader) + capsuleRespSize = 16 // CapsuleResponse specific header size + c2hDataHdrSize = 16 // C2HDataHeader specific header size + icBodySize = 120 // ICReq/ICResp body size (after CommonHeader) + connectDataSize = 1024 + + // Total header lengths including CommonHeader + capsuleCmdHdrLen = commonHeaderSize + capsuleCmdSize // 72 + capsuleRespHdrLen = commonHeaderSize + capsuleRespSize // 24 + c2hDataHdrLen = commonHeaderSize + c2hDataHdrSize // 24 + icHdrLen = commonHeaderSize + icBodySize // 128 + + commandBitDeallocate = 1 << 25 + + nvmeVersion14 uint32 = 0x00010400 // NVMe 1.4 + + // C2HData flags + c2hFlagLast uint8 = 0x04 +) + +// ---------- CommonHeader (8 bytes) ---------- + +// CommonHeader is the 8-byte preamble of every NVMe/TCP PDU. +type CommonHeader struct { + Type uint8 + Flags uint8 + HeaderLength uint8 + DataOffset uint8 + DataLength uint32 +} + +func (h *CommonHeader) Marshal(buf []byte) { + buf[0] = h.Type + buf[1] = h.Flags + buf[2] = h.HeaderLength + buf[3] = h.DataOffset + binary.LittleEndian.PutUint32(buf[4:], h.DataLength) +} + +func (h *CommonHeader) Unmarshal(buf []byte) { + h.Type = buf[0] + h.Flags = buf[1] + h.HeaderLength = buf[2] + h.DataOffset = buf[3] + h.DataLength = binary.LittleEndian.Uint32(buf[4:]) +} + +func (h *CommonHeader) String() string { + return fmt.Sprintf("PDU{type=0x%x hlen=%d doff=%d dlen=%d}", + h.Type, h.HeaderLength, h.DataOffset, h.DataLength) +} + +// ---------- PDU interface ---------- + +// PDU is the interface for all NVMe/TCP PDU-specific headers. +type PDU interface { + Marshal([]byte) + Unmarshal([]byte) +} + +// ---------- ICRequest (120-byte body) ---------- + +// ICRequest is the host-to-controller initialization request. +type ICRequest struct { + PDUFormatVersion uint16 + PDUDataAlignment uint8 + PDUDataDigest uint8 + PDUMaxR2T uint32 + // remaining 112 bytes reserved +} + +func (r *ICRequest) Marshal(buf []byte) { + // zero out the full 120-byte body + for i := range buf[:icBodySize] { + buf[i] = 0 + } + binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) + buf[2] = r.PDUDataAlignment + buf[3] = r.PDUDataDigest + binary.LittleEndian.PutUint32(buf[4:], r.PDUMaxR2T) +} + +func (r *ICRequest) Unmarshal(buf []byte) { + r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) + r.PDUDataAlignment = buf[2] + r.PDUDataDigest = buf[3] + r.PDUMaxR2T = binary.LittleEndian.Uint32(buf[4:]) +} + +// ---------- ICResponse (120-byte body) ---------- + +// ICResponse is the controller-to-host initialization response. +type ICResponse struct { + PDUFormatVersion uint16 + PDUDataAlignment uint8 + PDUDataDigest uint8 + MaxH2CDataLength uint32 + // remaining 112 bytes reserved +} + +func (r *ICResponse) Marshal(buf []byte) { + for i := range buf[:icBodySize] { + buf[i] = 0 + } + binary.LittleEndian.PutUint16(buf[0:], r.PDUFormatVersion) + buf[2] = r.PDUDataAlignment + buf[3] = r.PDUDataDigest + binary.LittleEndian.PutUint32(buf[4:], r.MaxH2CDataLength) +} + +func (r *ICResponse) Unmarshal(buf []byte) { + r.PDUFormatVersion = binary.LittleEndian.Uint16(buf[0:]) + r.PDUDataAlignment = buf[2] + r.PDUDataDigest = buf[3] + r.MaxH2CDataLength = binary.LittleEndian.Uint32(buf[4:]) +} + +// ---------- CapsuleCommand (64-byte specific header) ---------- + +// CapsuleCommand is the 64-byte NVMe command capsule. +type CapsuleCommand struct { + OpCode uint8 + PRP uint8 + CID uint16 + FCType uint8 // Fabric command type (only for OpCode=0x7F) + NSID uint32 // Namespace ID (bytes 4-7 of NVMe SQE after opcode/flags/CID) + DPTR [16]byte // Data pointer + D10 uint32 + D11 uint32 + D12 uint32 + D13 uint32 + D14 uint32 + D15 uint32 +} + +// Lba returns the starting LBA from D10:D11 (Read/Write commands). +func (c *CapsuleCommand) Lba() uint64 { + return uint64(c.D11)<<32 | uint64(c.D10) +} + +// LbaLength returns the number of logical blocks (0-based in D12, actual = D12&0xFFFF + 1). +func (c *CapsuleCommand) LbaLength() uint32 { + return c.D12&0xFFFF + 1 +} + +func (c *CapsuleCommand) Marshal(buf []byte) { + for i := range buf[:capsuleCmdSize] { + buf[i] = 0 + } + buf[0] = c.OpCode + buf[1] = c.PRP + binary.LittleEndian.PutUint16(buf[2:], c.CID) + // Bytes 4-7: NSID for normal commands, FCType at byte 4 for Fabric (0x7F). + // They share the same offset per NVMe spec. + if c.OpCode == adminFabric { + buf[4] = c.FCType + } else { + binary.LittleEndian.PutUint32(buf[4:], c.NSID) + } + copy(buf[24:40], c.DPTR[:]) + binary.LittleEndian.PutUint32(buf[40:], c.D10) + binary.LittleEndian.PutUint32(buf[44:], c.D11) + binary.LittleEndian.PutUint32(buf[48:], c.D12) + binary.LittleEndian.PutUint32(buf[52:], c.D13) + binary.LittleEndian.PutUint32(buf[56:], c.D14) + binary.LittleEndian.PutUint32(buf[60:], c.D15) +} + +func (c *CapsuleCommand) Unmarshal(buf []byte) { + c.OpCode = buf[0] + c.PRP = buf[1] + c.CID = binary.LittleEndian.Uint16(buf[2:]) + c.FCType = buf[4] + c.NSID = binary.LittleEndian.Uint32(buf[4:]) + copy(c.DPTR[:], buf[24:40]) + c.D10 = binary.LittleEndian.Uint32(buf[40:]) + c.D11 = binary.LittleEndian.Uint32(buf[44:]) + c.D12 = binary.LittleEndian.Uint32(buf[48:]) + c.D13 = binary.LittleEndian.Uint32(buf[52:]) + c.D14 = binary.LittleEndian.Uint32(buf[56:]) + c.D15 = binary.LittleEndian.Uint32(buf[60:]) +} + +func (c *CapsuleCommand) String() string { + return fmt.Sprintf("CapsuleCmd{op=0x%02x cid=%d nsid=%d}", c.OpCode, c.CID, c.NSID) +} + +// ---------- CapsuleResponse (16-byte specific header) ---------- + +// CapsuleResponse is the NVMe completion queue entry (16 bytes). +type CapsuleResponse struct { + DW0 uint32 // Command-specific DWord 0 (also FabricResponse bytes 0-3) + DW1 uint32 // Command-specific DWord 1 (also FabricResponse bytes 4-7) + SQHD uint16 // Submission Queue Head Pointer + QueueID uint16 + CID uint16 + Status uint16 // Status field: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0) +} + +func (r *CapsuleResponse) Marshal(buf []byte) { + binary.LittleEndian.PutUint32(buf[0:], r.DW0) + binary.LittleEndian.PutUint32(buf[4:], r.DW1) + binary.LittleEndian.PutUint16(buf[8:], r.SQHD) + binary.LittleEndian.PutUint16(buf[10:], r.QueueID) + binary.LittleEndian.PutUint16(buf[12:], r.CID) + binary.LittleEndian.PutUint16(buf[14:], r.Status) +} + +func (r *CapsuleResponse) Unmarshal(buf []byte) { + r.DW0 = binary.LittleEndian.Uint32(buf[0:]) + r.DW1 = binary.LittleEndian.Uint32(buf[4:]) + r.SQHD = binary.LittleEndian.Uint16(buf[8:]) + r.QueueID = binary.LittleEndian.Uint16(buf[10:]) + r.CID = binary.LittleEndian.Uint16(buf[12:]) + r.Status = binary.LittleEndian.Uint16(buf[14:]) +} + +func (r *CapsuleResponse) String() string { + return fmt.Sprintf("CapsuleResp{sqhd=%d qid=%d cid=%d status=0x%04x}", + r.SQHD, r.QueueID, r.CID, r.Status) +} + +// ---------- C2HDataHeader (16-byte specific header) ---------- + +// C2HDataHeader is the controller-to-host data transfer header. +type C2HDataHeader struct { + CCCID uint16 // Command Capsule CID + _ uint16 // reserved + DATAO uint32 // Data offset within the total transfer + DATAL uint32 // Data length in this PDU + _pad uint32 // reserved +} + +func (h *C2HDataHeader) Marshal(buf []byte) { + for i := range buf[:c2hDataHdrSize] { + buf[i] = 0 + } + binary.LittleEndian.PutUint16(buf[0:], h.CCCID) + binary.LittleEndian.PutUint32(buf[4:], h.DATAO) + binary.LittleEndian.PutUint32(buf[8:], h.DATAL) +} + +func (h *C2HDataHeader) Unmarshal(buf []byte) { + h.CCCID = binary.LittleEndian.Uint16(buf[0:]) + h.DATAO = binary.LittleEndian.Uint32(buf[4:]) + h.DATAL = binary.LittleEndian.Uint32(buf[8:]) +} + +// ---------- ConnectData (1024 bytes, payload of Fabric Connect) ---------- + +// ConnectData is the 1024-byte payload sent with a Fabric Connect command. +type ConnectData struct { + HostID [16]byte // Host UUID + CNTLID uint16 // Requested controller ID (0xFFFF = new) + SubNQN string // Subsystem NQN + HostNQN string // Host NQN +} + +func (d *ConnectData) Marshal(buf []byte) { + for i := range buf[:connectDataSize] { + buf[i] = 0 + } + copy(buf[0:16], d.HostID[:]) + binary.LittleEndian.PutUint16(buf[16:], d.CNTLID) + copyNQN(buf[256:512], d.SubNQN) + copyNQN(buf[512:768], d.HostNQN) +} + +func (d *ConnectData) Unmarshal(buf []byte) { + copy(d.HostID[:], buf[0:16]) + d.CNTLID = binary.LittleEndian.Uint16(buf[16:]) + d.SubNQN = extractNQN(buf[256:512]) + d.HostNQN = extractNQN(buf[512:768]) +} + +// copyNQN writes a NUL-terminated string into a fixed-size buffer. +func copyNQN(dst []byte, s string) { + n := copy(dst, s) + if n < len(dst) { + dst[n] = 0 + } +} + +// extractNQN reads a NUL-terminated string from a fixed-size buffer. +func extractNQN(buf []byte) string { + for i, b := range buf { + if b == 0 { + return string(buf[:i]) + } + } + return string(buf) +} + +// ---------- Status word encoding ---------- + +// StatusWord encodes NVMe status: DNR(15) | More(14) | SCT(13:9) | SC(8:1) | P(0) +// +// StatusWord = (DNR << 15) | (SCT << 9) | (SC << 1) +type StatusWord uint16 + +// MakeStatus constructs a status word from SCT, SC, and DNR flag. +func MakeStatus(sct, sc uint8, dnr bool) StatusWord { + w := uint16(sct)<<9 | uint16(sc)<<1 + if dnr { + w |= 1 << 15 + } + return StatusWord(w) +} + +// StatusSuccess is the zero-value success status. +const StatusSuccess StatusWord = 0 + +// Pre-defined status words used in the NVMe target. +var ( + StatusInvalidOpcode = MakeStatus(0, 0x01, true) // Generic: Invalid Command Opcode + StatusInvalidField = MakeStatus(0, 0x02, true) // Generic: Invalid Field in Command + StatusInternalError = MakeStatus(0, 0x06, false) // Generic: Internal Error (retryable) + StatusInternalErrorDNR = MakeStatus(0, 0x06, true) // Generic: Internal Error (permanent) + StatusNSNotReady = MakeStatus(0, 0x82, false) // Generic: Namespace Not Ready (retryable) + StatusNSNotReadyDNR = MakeStatus(0, 0x82, true) // Generic: Namespace Not Ready (permanent) + StatusLBAOutOfRange = MakeStatus(0, 0x80, true) // Generic: LBA Out of Range + StatusMediaWriteFault = MakeStatus(2, 0x80, false) // Media: Write Fault + StatusMediaReadError = MakeStatus(2, 0x81, false) // Media: Uncorrectable Read Error +) + +func (s StatusWord) SCT() uint8 { return uint8((s >> 9) & 0x07) } +func (s StatusWord) SC() uint8 { return uint8((s >> 1) & 0xFF) } +func (s StatusWord) DNR() bool { return s&(1<<15) != 0 } +func (s StatusWord) IsError() bool { return s != StatusSuccess } + +func (s StatusWord) String() string { + if s == StatusSuccess { + return "Success" + } + return fmt.Sprintf("Status{sct=%d sc=0x%02x dnr=%v}", s.SCT(), s.SC(), s.DNR()) +} diff --git a/weed/storage/blockvol/nvme/server.go b/weed/storage/blockvol/nvme/server.go new file mode 100644 index 000000000..a60626d27 --- /dev/null +++ b/weed/storage/blockvol/nvme/server.go @@ -0,0 +1,210 @@ +package nvme + +import ( + "fmt" + "log" + "net" + "sync" + "sync/atomic" + "time" +) + +// Config holds NVMe/TCP target configuration. +type Config struct { + ListenAddr string + NQNPrefix string + MaxH2CDataLength uint32 + MaxIOQueues uint16 + Enabled bool +} + +// DefaultConfig returns the default NVMe target configuration. +func DefaultConfig() Config { + return Config{ + ListenAddr: "0.0.0.0:4420", + NQNPrefix: "nqn.2024-01.com.seaweedfs:vol.", + MaxH2CDataLength: maxH2CDataLen, + MaxIOQueues: 4, + Enabled: false, + } +} + +// adminSession stores state from an admin queue connection that IO queue +// connections need to look up (they arrive on separate TCP connections). +type adminSession struct { + cntlID uint16 + subsystem *Subsystem + subNQN string + hostNQN string + regCAP uint64 + regCC uint32 + regCSTS uint32 + regVS uint32 + katoMs uint32 +} + +// Server is the NVMe/TCP target server. +type Server struct { + cfg Config + listener net.Listener + mu sync.RWMutex + subsystems map[string]*Subsystem // NQN → Subsystem + sessions map[*Controller]struct{} + adminMu sync.RWMutex + admins map[uint16]*adminSession // CNTLID → admin session + nextCNTLID atomic.Uint32 + closed atomic.Bool + wg sync.WaitGroup +} + +// NewServer creates a new NVMe/TCP target server. +func NewServer(cfg Config) *Server { + return &Server{ + cfg: cfg, + subsystems: make(map[string]*Subsystem), + sessions: make(map[*Controller]struct{}), + admins: make(map[uint16]*adminSession), + } +} + +// AddVolume registers a block device as an NVMe subsystem. +func (s *Server) AddVolume(nqn string, dev BlockDevice, nguid [16]byte) { + s.mu.Lock() + defer s.mu.Unlock() + s.subsystems[nqn] = &Subsystem{ + NQN: nqn, + Dev: dev, + NGUID: nguid, + } +} + +// RemoveVolume unregisters an NVMe subsystem. +func (s *Server) RemoveVolume(nqn string) { + s.mu.Lock() + defer s.mu.Unlock() + delete(s.subsystems, nqn) +} + +// ListenAndServe starts the NVMe/TCP listener. +// If not enabled, returns nil immediately. +func (s *Server) ListenAndServe() error { + if !s.cfg.Enabled { + return nil + } + + ln, err := net.Listen("tcp", s.cfg.ListenAddr) + if err != nil { + return fmt.Errorf("nvme listen %s: %w", s.cfg.ListenAddr, err) + } + s.listener = ln + log.Printf("nvme: listening on %s", s.cfg.ListenAddr) + + s.wg.Add(1) + go func() { + defer s.wg.Done() + s.acceptLoop() + }() + return nil +} + +func (s *Server) acceptLoop() { + for { + conn, err := s.listener.Accept() + if err != nil { + if s.closed.Load() { + return + } + log.Printf("nvme: accept error: %v", err) + continue + } + + ctrl := newController(conn, s) + s.addSession(ctrl) + + s.wg.Add(1) + go func() { + defer s.wg.Done() + if err := ctrl.Serve(); err != nil { + if !s.closed.Load() { + log.Printf("nvme: session error: %v", err) + } + } + }() + } +} + +func (s *Server) addSession(ctrl *Controller) { + s.mu.Lock() + defer s.mu.Unlock() + s.sessions[ctrl] = struct{}{} +} + +func (s *Server) removeSession(ctrl *Controller) { + s.mu.Lock() + defer s.mu.Unlock() + delete(s.sessions, ctrl) +} + +// registerAdmin stores admin queue state so IO queue connections can look it up. +func (s *Server) registerAdmin(sess *adminSession) { + s.adminMu.Lock() + defer s.adminMu.Unlock() + s.admins[sess.cntlID] = sess +} + +// unregisterAdmin removes an admin session by CNTLID. +func (s *Server) unregisterAdmin(cntlID uint16) { + s.adminMu.Lock() + defer s.adminMu.Unlock() + delete(s.admins, cntlID) +} + +// lookupAdmin returns the admin session for the given CNTLID. +func (s *Server) lookupAdmin(cntlID uint16) *adminSession { + s.adminMu.RLock() + defer s.adminMu.RUnlock() + return s.admins[cntlID] +} + +// Close gracefully shuts down the server. +func (s *Server) Close() error { + if !s.cfg.Enabled { + return nil + } + s.closed.Store(true) + + if s.listener != nil { + s.listener.Close() + } + + // Close all active sessions + s.mu.RLock() + sessions := make([]*Controller, 0, len(s.sessions)) + for ctrl := range s.sessions { + sessions = append(sessions, ctrl) + } + s.mu.RUnlock() + + for _, ctrl := range sessions { + ctrl.conn.Close() + } + + // Wait with timeout + done := make(chan struct{}) + go func() { + s.wg.Wait() + close(done) + }() + + select { + case <-done: + case <-time.After(5 * time.Second): + log.Printf("nvme: shutdown timed out after 5s") + } + return nil +} + +// NQN returns the full NQN for a volume name. +func (s *Server) NQN(volName string) string { + return s.cfg.NQNPrefix + volName +} diff --git a/weed/storage/blockvol/nvme/wire.go b/weed/storage/blockvol/nvme/wire.go new file mode 100644 index 000000000..b8ac979b6 --- /dev/null +++ b/weed/storage/blockvol/nvme/wire.go @@ -0,0 +1,202 @@ +package nvme + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" +) + +// ---------- Reader ---------- + +// Reader decodes NVMe/TCP PDUs from a stream. +// +// Usage: +// +// hdr, _ := r.Dequeue() // read 8-byte CommonHeader +// r.Receive(&capsuleCmd) // read remaining specific header +// if r.Length() > 0 { +// data := make([]byte, r.Length()) +// r.ReceiveData(data) // read payload +// } +type Reader struct { + rd io.Reader + CH CommonHeader + header [maxHeaderSize]byte +} + +// NewReader wraps an io.Reader for NVMe/TCP PDU decoding. +func NewReader(r io.Reader) *Reader { + return &Reader{rd: r} +} + +// Dequeue reads the 8-byte CommonHeader, validates bounds, and returns it. +func (r *Reader) Dequeue() (*CommonHeader, error) { + if _, err := io.ReadFull(r.rd, r.header[:commonHeaderSize]); err != nil { + return nil, err + } + r.CH.Unmarshal(r.header[:commonHeaderSize]) + + // Validate header bounds to prevent panics on malformed PDUs. + if r.CH.HeaderLength < commonHeaderSize { + return nil, fmt.Errorf("nvme: HeaderLength %d < minimum %d", r.CH.HeaderLength, commonHeaderSize) + } + if r.CH.HeaderLength > maxHeaderSize { + return nil, fmt.Errorf("nvme: HeaderLength %d > maximum %d", r.CH.HeaderLength, maxHeaderSize) + } + if r.CH.DataOffset != 0 && r.CH.DataOffset < r.CH.HeaderLength { + return nil, fmt.Errorf("nvme: DataOffset %d < HeaderLength %d", r.CH.DataOffset, r.CH.HeaderLength) + } + if r.CH.DataOffset != 0 && uint32(r.CH.DataOffset) > r.CH.DataLength { + return nil, fmt.Errorf("nvme: DataOffset %d > DataLength %d", r.CH.DataOffset, r.CH.DataLength) + } + if r.CH.DataLength < uint32(r.CH.HeaderLength) { + return nil, fmt.Errorf("nvme: DataLength %d < HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) + } + // DataOffset==0 means no inline data — DataLength must equal HeaderLength, + // otherwise unconsumed bytes desynchronize the stream. + if r.CH.DataOffset == 0 && r.CH.DataLength != uint32(r.CH.HeaderLength) { + return nil, fmt.Errorf("nvme: DataOffset=0 but DataLength %d != HeaderLength %d", r.CH.DataLength, r.CH.HeaderLength) + } + + return &r.CH, nil +} + +// Receive reads the remaining PDU-specific header (HeaderLength - 8 bytes) +// and unmarshals it into pdu. It also skips any padding between header and +// data (DataOffset - HeaderLength bytes). +func (r *Reader) Receive(pdu PDU) error { + remain := int(r.CH.HeaderLength) - commonHeaderSize + if remain <= 0 { + return nil + } + if _, err := io.ReadFull(r.rd, r.header[commonHeaderSize:r.CH.HeaderLength]); err != nil { + return err + } + pdu.Unmarshal(r.header[commonHeaderSize:r.CH.HeaderLength]) + + // Skip padding between header and data. + pad := int(r.CH.DataOffset) - int(r.CH.HeaderLength) + if pad > 0 { + if _, err := io.ReadFull(r.rd, make([]byte, pad)); err != nil { + return err + } + } + return nil +} + +// Length returns the payload size: DataLength - DataOffset (when DataOffset != 0). +func (r *Reader) Length() uint32 { + if r.CH.DataOffset != 0 { + return r.CH.DataLength - uint32(r.CH.DataOffset) + } + return 0 +} + +// ReceiveData reads exactly len(buf) bytes of payload data. +func (r *Reader) ReceiveData(buf []byte) error { + _, err := io.ReadFull(r.rd, buf) + return err +} + +// ---------- Writer ---------- + +// Writer encodes NVMe/TCP PDUs to a stream. +type Writer struct { + wr *bufio.Writer + CH CommonHeader + header [maxHeaderSize]byte +} + +// NewWriter wraps an io.Writer for NVMe/TCP PDU encoding. +func NewWriter(w io.Writer) *Writer { + return &Writer{wr: bufio.NewWriter(w)} +} + +// PrepareHeaderOnly sets up a header-only PDU (no payload). +// Call Flush() to write it to the wire. +func (w *Writer) PrepareHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) { + w.CH.Type = pduType + w.CH.Flags = 0 + w.CH.HeaderLength = commonHeaderSize + specificLen + w.CH.DataOffset = 0 + w.CH.DataLength = uint32(w.CH.HeaderLength) + pdu.Marshal(w.header[commonHeaderSize:]) +} + +// PrepareWithData sets up a PDU with payload data. +// Call Flush() to write it to the wire. +func (w *Writer) PrepareWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) { + w.CH.Type = pduType + w.CH.Flags = flags + w.CH.HeaderLength = commonHeaderSize + specificLen + if data != nil { + w.CH.DataOffset = w.CH.HeaderLength + w.CH.DataLength = uint32(w.CH.HeaderLength) + uint32(len(data)) + } else { + w.CH.DataOffset = 0 + w.CH.DataLength = uint32(w.CH.HeaderLength) + } + pdu.Marshal(w.header[commonHeaderSize:]) +} + +// Flush writes the prepared CommonHeader + specific header to the wire. +// If there was payload data (from PrepareWithData), call FlushData after. +func (w *Writer) Flush() error { + w.CH.Marshal(w.header[:commonHeaderSize]) + if _, err := w.wr.Write(w.header[:w.CH.HeaderLength]); err != nil { + return err + } + return nil +} + +// FlushData writes payload data and flushes the underlying buffered writer. +func (w *Writer) FlushData(data []byte) error { + if len(data) > 0 { + if _, err := w.wr.Write(data); err != nil { + return err + } + } + return w.wr.Flush() +} + +// SendHeaderOnly writes a complete header-only PDU (prepare + flush). +func (w *Writer) SendHeaderOnly(pduType uint8, pdu PDU, specificLen uint8) error { + w.PrepareHeaderOnly(pduType, pdu, specificLen) + if err := w.Flush(); err != nil { + return err + } + return w.wr.Flush() +} + +// SendWithData writes a complete PDU with payload data. +func (w *Writer) SendWithData(pduType, flags uint8, pdu PDU, specificLen uint8, data []byte) error { + w.PrepareWithData(pduType, flags, pdu, specificLen, data) + if err := w.Flush(); err != nil { + return err + } + return w.FlushData(data) +} + +// writeRaw writes raw bytes directly (used for ConnectData inline in capsule). +func (w *Writer) writeRaw(data []byte) error { + _, err := w.wr.Write(data) + return err +} + +// flushBuf flushes the underlying buffered writer. +func (w *Writer) flushBuf() error { + return w.wr.Flush() +} + +// ---------- Helpers ---------- + +// putLE32 writes a uint32 in little-endian. +func putLE32(buf []byte, v uint32) { + binary.LittleEndian.PutUint32(buf, v) +} + +// putLE64 writes a uint64 in little-endian. +func putLE64(buf []byte, v uint64) { + binary.LittleEndian.PutUint64(buf, v) +}