Browse Source
feat: Phase 10 CP10-2 -- CSI NVMe/TCP node plugin, 210 tests
feat: Phase 10 CP10-2 -- CSI NVMe/TCP node plugin, 210 tests
NVMe/TCP transport support in the CSI driver so Kubernetes pods can mount block volumes via NVMe alongside (or instead of) iSCSI.

Transport selection: NVMe is preferred when the nvme_tcp module is loaded, NVMe metadata is present, and nvmeUtil is available. Fail-fast on NVMe errors (no silent iSCSI fallback). The .transport file persists across CSI restarts.

Key changes:
- BuildNQN() single source of truth for NQN construction (naming.go)
- NVMeUtil interface + realNVMeUtil wrapping nvme-cli (nvme_util.go)
- NodeStageVolume/Unstage/Expand dual-transport paths (node.go)
- NvmeAddr/NQN fields in VolumeInfo, Controller contexts
- VolumeManager NvmeAddr()/VolumeNQN() getters
- BlockService NvmeListenAddr()/NQN() accessors
- 27 unit tests + 26 QA adversarial tests (nvme_node_test.go, qa_cp102)
- Fix: flaky TestQA_Node_ConcurrentStageUnstage (pre-alloc temp dirs)

Review fixes applied: F1 (NQN format mismatch), F2 (CreateVolume drops NVMe context), F3 (IsConnected error classification), F4 (findSubsys path validation), F5 (MasterVolumeClient NVMe gap documented).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Branch: feature/sw-block
11 changed files with 2881 additions and 88 deletions
-
47weed/server/volume_server_block.go
-
30weed/storage/blockvol/csi/controller.go
-
258weed/storage/blockvol/csi/node.go
-
1222weed/storage/blockvol/csi/nvme_node_test.go
-
247weed/storage/blockvol/csi/nvme_util.go
-
1088weed/storage/blockvol/csi/qa_cp102_nvme_node_test.go
-
13weed/storage/blockvol/csi/qa_cp62_test.go
-
7weed/storage/blockvol/csi/server.go
-
11weed/storage/blockvol/csi/volume_backend.go
-
38weed/storage/blockvol/csi/volume_manager.go
-
8weed/storage/blockvol/naming.go
1222
weed/storage/blockvol/csi/nvme_node_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,247 @@ |
|||||
|
package csi |
||||
|
|
||||
|
import ( |
||||
|
"context" |
||||
|
"encoding/json" |
||||
|
"errors" |
||||
|
"fmt" |
||||
|
"net" |
||||
|
"os" |
||||
|
"os/exec" |
||||
|
"strings" |
||||
|
"time" |
||||
|
) |
||||
|
|
||||
|
// NVMeUtil abstracts the NVMe/TCP initiator operations used by this package,
// so the nvme-cli backed implementation can be replaced by a test double.
type NVMeUtil interface {
	// Connect attaches the subsystem nqn reachable at addr ("host:port").
	Connect(ctx context.Context, nqn, addr string) error

	// Disconnect detaches the subsystem identified by nqn.
	Disconnect(ctx context.Context, nqn string) error

	// IsConnected reports whether the subsystem nqn is currently connected.
	IsConnected(ctx context.Context, nqn string) (bool, error)

	// GetDeviceByNQN returns the namespace device path for nqn.
	GetDeviceByNQN(ctx context.Context, nqn string) (string, error)

	// GetControllerByNQN returns the controller device path for nqn.
	GetControllerByNQN(ctx context.Context, nqn string) (string, error)

	// Rescan triggers a namespace rescan on the controller serving nqn.
	Rescan(ctx context.Context, nqn string) error

	// IsNVMeTCPAvailable reports whether NVMe/TCP can be used on this host.
	IsNVMeTCPAvailable() bool
}
||||
|
|
||||
|
// realNVMeUtil is the implementation that shells out to the nvme-cli binary
// ("nvme"). It carries no state of its own.
type realNVMeUtil struct{}
||||
|
|
||||
|
func (r *realNVMeUtil) Connect(ctx context.Context, nqn, addr string) error { |
||||
|
host, port, err := net.SplitHostPort(addr) |
||||
|
if err != nil { |
||||
|
return fmt.Errorf("nvme connect: invalid addr %q: %w", addr, err) |
||||
|
} |
||||
|
cmd := exec.CommandContext(ctx, "nvme", "connect", "-t", "tcp", "-n", nqn, "-a", host, "-s", port) |
||||
|
out, err := cmd.CombinedOutput() |
||||
|
if err != nil { |
||||
|
// Treat "already connected" as success (idempotent).
|
||||
|
if strings.Contains(string(out), "already connected") { |
||||
|
return nil |
||||
|
} |
||||
|
return fmt.Errorf("nvme connect: %s: %w", string(out), err) |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
func (r *realNVMeUtil) Disconnect(ctx context.Context, nqn string) error { |
||||
|
cmd := exec.CommandContext(ctx, "nvme", "disconnect", "-n", nqn) |
||||
|
out, err := cmd.CombinedOutput() |
||||
|
if err != nil { |
||||
|
// Treat "not connected" / "no subsystem" as success (idempotent).
|
||||
|
outStr := string(out) |
||||
|
if strings.Contains(outStr, "not connected") || strings.Contains(outStr, "No subsystemtype") || strings.Contains(outStr, "Invalid argument") { |
||||
|
return nil |
||||
|
} |
||||
|
return fmt.Errorf("nvme disconnect: %s: %w", outStr, err) |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
func (r *realNVMeUtil) IsConnected(ctx context.Context, nqn string) (bool, error) { |
||||
|
_, _, err := r.findSubsys(ctx, nqn) |
||||
|
if err != nil { |
||||
|
if errors.Is(err, errNQNNotFound) { |
||||
|
return false, nil // NQN not present = not connected
|
||||
|
} |
||||
|
return false, err // command/parse failure — propagate
|
||||
|
} |
||||
|
return true, nil |
||||
|
} |
||||
|
|
||||
|
// errNQNNotFound is the sentinel findSubsys returns when the requested NQN
// does not appear in the subsystem list. Match it with errors.Is to tell
// "not found" apart from command execution or JSON parse failures.
var errNQNNotFound = errors.New("nvme: NQN not found")
||||
|
|
||||
|
func (r *realNVMeUtil) GetDeviceByNQN(ctx context.Context, nqn string) (string, error) { |
||||
|
// Poll for device to appear (NVMe connect + device enumeration is async).
|
||||
|
deadline := time.After(10 * time.Second) |
||||
|
ticker := time.NewTicker(200 * time.Millisecond) |
||||
|
defer ticker.Stop() |
||||
|
|
||||
|
for { |
||||
|
select { |
||||
|
case <-ctx.Done(): |
||||
|
return "", ctx.Err() |
||||
|
case <-deadline: |
||||
|
return "", fmt.Errorf("timeout waiting for NVMe device for NQN %s", nqn) |
||||
|
case <-ticker.C: |
||||
|
_, dev, err := r.findSubsys(ctx, nqn) |
||||
|
if err != nil { |
||||
|
continue |
||||
|
} |
||||
|
if dev != "" { |
||||
|
return dev, nil |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func (r *realNVMeUtil) GetControllerByNQN(ctx context.Context, nqn string) (string, error) { |
||||
|
ctrl, _, err := r.findSubsys(ctx, nqn) |
||||
|
if err != nil { |
||||
|
return "", err |
||||
|
} |
||||
|
if ctrl == "" { |
||||
|
return "", fmt.Errorf("no controller found for NQN %s", nqn) |
||||
|
} |
||||
|
return ctrl, nil |
||||
|
} |
||||
|
|
||||
|
func (r *realNVMeUtil) Rescan(ctx context.Context, nqn string) error { |
||||
|
ctrl, err := r.GetControllerByNQN(ctx, nqn) |
||||
|
if err != nil { |
||||
|
return fmt.Errorf("nvme rescan: find controller: %w", err) |
||||
|
} |
||||
|
cmd := exec.CommandContext(ctx, "nvme", "ns-rescan", ctrl) |
||||
|
out, errCmd := cmd.CombinedOutput() |
||||
|
if errCmd != nil { |
||||
|
return fmt.Errorf("nvme ns-rescan %s: %s: %w", ctrl, string(out), errCmd) |
||||
|
} |
||||
|
return nil |
||||
|
} |
||||
|
|
||||
|
// IsNVMeTCPAvailable checks if the nvme_tcp kernel module is loaded (read-only).
|
||||
|
func (r *realNVMeUtil) IsNVMeTCPAvailable() bool { |
||||
|
_, err := os.Stat("/sys/module/nvme_tcp") |
||||
|
return err == nil |
||||
|
} |
||||
|
|
||||
|
// JSON shapes decoded from `nvme list-subsys -o json`. Only the fields read
// by this package are declared.
type (
	// nvmeListSubsysOutput is the object holding the subsystem list.
	nvmeListSubsysOutput struct {
		Subsystems []nvmeSubsys `json:"Subsystems"`
	}

	// nvmeSubsys is one subsystem entry together with its controller paths.
	// Note: some nvme-cli versions use "Namespaces" instead; that key is not
	// decoded here.
	nvmeSubsys struct {
		NQN   string     `json:"NQN"`
		Paths []nvmePath `json:"Paths"`
	}

	// nvmePath is one controller path of a subsystem.
	nvmePath struct {
		Name      string `json:"Name"` // controller name, e.g. "nvme0"
		Transport string `json:"Transport"`
		State     string `json:"State"`
	}
)
||||
|
|
||||
|
// findSubsys parses `nvme list-subsys -o json` to find controller and namespace device
|
||||
|
// for a given NQN. Returns (controller path, namespace device path, error).
|
||||
|
// Returns errNQNNotFound (sentinel) when the NQN is absent from the subsystem list.
|
||||
|
// Returns a non-sentinel error for command execution or JSON parse failures.
|
||||
|
func (r *realNVMeUtil) findSubsys(ctx context.Context, nqn string) (string, string, error) { |
||||
|
cmd := exec.CommandContext(ctx, "nvme", "list-subsys", "-o", "json") |
||||
|
out, err := cmd.CombinedOutput() |
||||
|
if err != nil { |
||||
|
return "", "", fmt.Errorf("nvme list-subsys: %s: %w", string(out), err) |
||||
|
} |
||||
|
|
||||
|
var parsed nvmeListSubsysOutput |
||||
|
if err := json.Unmarshal(out, &parsed); err != nil { |
||||
|
return "", "", fmt.Errorf("nvme list-subsys: parse json: %w", err) |
||||
|
} |
||||
|
|
||||
|
for _, ss := range parsed.Subsystems { |
||||
|
if ss.NQN != nqn { |
||||
|
continue |
||||
|
} |
||||
|
// Prefer a live TCP path. Fall back to any path with a name.
|
||||
|
var fallbackCtrl string |
||||
|
for _, p := range ss.Paths { |
||||
|
if p.Name == "" { |
||||
|
continue |
||||
|
} |
||||
|
ctrl := "/dev/" + p.Name |
||||
|
dev := ctrl + "n1" |
||||
|
// Prefer Transport=tcp + State=live.
|
||||
|
if strings.EqualFold(p.Transport, "tcp") && strings.EqualFold(p.State, "live") { |
||||
|
return ctrl, dev, nil |
||||
|
} |
||||
|
if fallbackCtrl == "" { |
||||
|
fallbackCtrl = ctrl |
||||
|
} |
||||
|
} |
||||
|
if fallbackCtrl != "" { |
||||
|
return fallbackCtrl, fallbackCtrl + "n1", nil |
||||
|
} |
||||
|
return "", "", fmt.Errorf("NQN %s found but no controller paths", nqn) |
||||
|
} |
||||
|
return "", "", errNQNNotFound |
||||
|
} |
||||
|
|
||||
|
// mockNVMeUtil is an in-memory test double for NVMeUtil. Results are canned
// through its fields, connection state is tracked per NQN, and most methods
// append an entry to calls so tests can assert on the call sequence
// (IsConnected and IsNVMeTCPAvailable are not recorded). It does no locking
// of its own.
type mockNVMeUtil struct {
	// Canned outcomes returned by the corresponding methods.
	connectErr          error
	disconnectErr       error
	getDeviceResult     string
	getDeviceErr        error
	getControllerResult string
	getControllerErr    error
	rescanErr           error
	nvmeTCPAvailable    bool

	// Recorded state.
	connected map[string]bool // NQN -> connected
	calls     []string        // "<op>:<args>" in call order
}

// newMockNVMeUtil returns a mock with an initialised, empty connection map.
func newMockNVMeUtil() *mockNVMeUtil {
	return &mockNVMeUtil{connected: map[string]bool{}}
}

// Connect records the call and, unless connectErr is set, marks nqn as
// connected. It returns connectErr.
func (m *mockNVMeUtil) Connect(_ context.Context, nqn, addr string) error {
	m.calls = append(m.calls, "connect:"+nqn+":"+addr)
	if m.connectErr == nil {
		m.connected[nqn] = true
	}
	return m.connectErr
}

// Disconnect records the call and, unless disconnectErr is set, forgets nqn.
// It returns disconnectErr.
func (m *mockNVMeUtil) Disconnect(_ context.Context, nqn string) error {
	m.calls = append(m.calls, "disconnect:"+nqn)
	if m.disconnectErr == nil {
		delete(m.connected, nqn)
	}
	return m.disconnectErr
}

// IsConnected reports the tracked connection state for nqn; it never fails.
func (m *mockNVMeUtil) IsConnected(_ context.Context, nqn string) (bool, error) {
	state := m.connected[nqn]
	return state, nil
}

// GetDeviceByNQN records the call and returns the canned device result.
func (m *mockNVMeUtil) GetDeviceByNQN(_ context.Context, nqn string) (string, error) {
	m.calls = append(m.calls, "getdevice:"+nqn)
	return m.getDeviceResult, m.getDeviceErr
}

// GetControllerByNQN records the call and returns the canned controller result.
func (m *mockNVMeUtil) GetControllerByNQN(_ context.Context, nqn string) (string, error) {
	m.calls = append(m.calls, "getcontroller:"+nqn)
	return m.getControllerResult, m.getControllerErr
}

// Rescan records the call and returns rescanErr.
func (m *mockNVMeUtil) Rescan(_ context.Context, nqn string) error {
	m.calls = append(m.calls, "rescan:"+nqn)
	return m.rescanErr
}

// IsNVMeTCPAvailable returns the canned availability flag.
func (m *mockNVMeUtil) IsNVMeTCPAvailable() bool {
	return m.nvmeTCPAvailable
}
||||
1088
weed/storage/blockvol/csi/qa_cp102_nvme_node_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue