Browse Source
feat: Phase 10 CP10-2 -- CSI NVMe/TCP node plugin, 210 tests
feat: Phase 10 CP10-2 -- CSI NVMe/TCP node plugin, 210 tests
NVMe/TCP transport support in the CSI driver so Kubernetes pods can mount block volumes via NVMe alongside (or instead of) iSCSI.

Transport selection: NVMe is preferred when the nvme_tcp module is loaded, metadata is present, and nvmeUtil is available. Fail-fast on NVMe errors (no silent iSCSI fallback). The .transport file persists across CSI restarts.

Key changes:
- BuildNQN() single source of truth for NQN construction (naming.go)
- NVMeUtil interface + realNVMeUtil wrapping nvme-cli (nvme_util.go)
- NodeStageVolume/Unstage/Expand dual-transport paths (node.go)
- NvmeAddr/NQN fields in VolumeInfo, Controller contexts
- VolumeManager NvmeAddr()/VolumeNQN() getters
- BlockService NvmeListenAddr()/NQN() accessors
- 27 unit tests + 26 QA adversarial tests (nvme_node_test.go, qa_cp102)
- Fix: flaky TestQA_Node_ConcurrentStageUnstage (pre-alloc temp dirs)

Review fixes applied: F1 (NQN format mismatch), F2 (CreateVolume drops NVMe context), F3 (IsConnected error classification), F4 (findSubsys path validation), F5 (MasterVolumeClient NVMe gap documented).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feature/sw-block
11 changed files with 2881 additions and 88 deletions
-
47weed/server/volume_server_block.go
-
30weed/storage/blockvol/csi/controller.go
-
258weed/storage/blockvol/csi/node.go
-
1222weed/storage/blockvol/csi/nvme_node_test.go
-
247weed/storage/blockvol/csi/nvme_util.go
-
1088weed/storage/blockvol/csi/qa_cp102_nvme_node_test.go
-
13weed/storage/blockvol/csi/qa_cp62_test.go
-
7weed/storage/blockvol/csi/server.go
-
11weed/storage/blockvol/csi/volume_backend.go
-
38weed/storage/blockvol/csi/volume_manager.go
-
8weed/storage/blockvol/naming.go
1222
weed/storage/blockvol/csi/nvme_node_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,247 @@ |
|||
package csi |
|||
|
|||
import ( |
|||
"context" |
|||
"encoding/json" |
|||
"errors" |
|||
"fmt" |
|||
"net" |
|||
"os" |
|||
"os/exec" |
|||
"strings" |
|||
"time" |
|||
) |
|||
|
|||
// NVMeUtil is the set of NVMe/TCP initiator operations used by the CSI
// driver. Keeping it an interface lets the nvme-cli backed implementation be
// replaced by a test double.
type NVMeUtil interface {
	// Connect attaches the host to the subsystem identified by nqn at addr.
	Connect(ctx context.Context, nqn string, addr string) error
	// Disconnect detaches the host from the subsystem identified by nqn.
	Disconnect(ctx context.Context, nqn string) error
	// IsConnected reports whether the subsystem identified by nqn is
	// currently connected.
	IsConnected(ctx context.Context, nqn string) (bool, error)
	// GetDeviceByNQN returns the device belonging to nqn.
	GetDeviceByNQN(ctx context.Context, nqn string) (string, error)
	// GetControllerByNQN returns the controller belonging to nqn.
	GetControllerByNQN(ctx context.Context, nqn string) (string, error)
	// Rescan triggers a rescan for the subsystem identified by nqn.
	Rescan(ctx context.Context, nqn string) error
	// IsNVMeTCPAvailable reports whether NVMe/TCP can be used on this host.
	IsNVMeTCPAvailable() bool
}
|||
|
|||
// realNVMeUtil is the production NVMe helper. It carries no state; its
// methods shell out to the `nvme` command-line tool (nvme-cli).
type realNVMeUtil struct{}
|||
|
|||
func (r *realNVMeUtil) Connect(ctx context.Context, nqn, addr string) error { |
|||
host, port, err := net.SplitHostPort(addr) |
|||
if err != nil { |
|||
return fmt.Errorf("nvme connect: invalid addr %q: %w", addr, err) |
|||
} |
|||
cmd := exec.CommandContext(ctx, "nvme", "connect", "-t", "tcp", "-n", nqn, "-a", host, "-s", port) |
|||
out, err := cmd.CombinedOutput() |
|||
if err != nil { |
|||
// Treat "already connected" as success (idempotent).
|
|||
if strings.Contains(string(out), "already connected") { |
|||
return nil |
|||
} |
|||
return fmt.Errorf("nvme connect: %s: %w", string(out), err) |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
func (r *realNVMeUtil) Disconnect(ctx context.Context, nqn string) error { |
|||
cmd := exec.CommandContext(ctx, "nvme", "disconnect", "-n", nqn) |
|||
out, err := cmd.CombinedOutput() |
|||
if err != nil { |
|||
// Treat "not connected" / "no subsystem" as success (idempotent).
|
|||
outStr := string(out) |
|||
if strings.Contains(outStr, "not connected") || strings.Contains(outStr, "No subsystemtype") || strings.Contains(outStr, "Invalid argument") { |
|||
return nil |
|||
} |
|||
return fmt.Errorf("nvme disconnect: %s: %w", outStr, err) |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
func (r *realNVMeUtil) IsConnected(ctx context.Context, nqn string) (bool, error) { |
|||
_, _, err := r.findSubsys(ctx, nqn) |
|||
if err != nil { |
|||
if errors.Is(err, errNQNNotFound) { |
|||
return false, nil // NQN not present = not connected
|
|||
} |
|||
return false, err // command/parse failure — propagate
|
|||
} |
|||
return true, nil |
|||
} |
|||
|
|||
// errNQNNotFound is the sentinel findSubsys returns when the requested NQN
// does not appear in the subsystem list. Match it with errors.Is to tell
// "not found" apart from command execution or parse failures.
var errNQNNotFound error = errors.New("nvme: NQN not found")
|||
|
|||
func (r *realNVMeUtil) GetDeviceByNQN(ctx context.Context, nqn string) (string, error) { |
|||
// Poll for device to appear (NVMe connect + device enumeration is async).
|
|||
deadline := time.After(10 * time.Second) |
|||
ticker := time.NewTicker(200 * time.Millisecond) |
|||
defer ticker.Stop() |
|||
|
|||
for { |
|||
select { |
|||
case <-ctx.Done(): |
|||
return "", ctx.Err() |
|||
case <-deadline: |
|||
return "", fmt.Errorf("timeout waiting for NVMe device for NQN %s", nqn) |
|||
case <-ticker.C: |
|||
_, dev, err := r.findSubsys(ctx, nqn) |
|||
if err != nil { |
|||
continue |
|||
} |
|||
if dev != "" { |
|||
return dev, nil |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
func (r *realNVMeUtil) GetControllerByNQN(ctx context.Context, nqn string) (string, error) { |
|||
ctrl, _, err := r.findSubsys(ctx, nqn) |
|||
if err != nil { |
|||
return "", err |
|||
} |
|||
if ctrl == "" { |
|||
return "", fmt.Errorf("no controller found for NQN %s", nqn) |
|||
} |
|||
return ctrl, nil |
|||
} |
|||
|
|||
func (r *realNVMeUtil) Rescan(ctx context.Context, nqn string) error { |
|||
ctrl, err := r.GetControllerByNQN(ctx, nqn) |
|||
if err != nil { |
|||
return fmt.Errorf("nvme rescan: find controller: %w", err) |
|||
} |
|||
cmd := exec.CommandContext(ctx, "nvme", "ns-rescan", ctrl) |
|||
out, errCmd := cmd.CombinedOutput() |
|||
if errCmd != nil { |
|||
return fmt.Errorf("nvme ns-rescan %s: %s: %w", ctrl, string(out), errCmd) |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// IsNVMeTCPAvailable checks if the nvme_tcp kernel module is loaded (read-only).
|
|||
func (r *realNVMeUtil) IsNVMeTCPAvailable() bool { |
|||
_, err := os.Stat("/sys/module/nvme_tcp") |
|||
return err == nil |
|||
} |
|||
|
|||
// Types mirroring the parts of the `nvme list-subsys -o json` output that
// this package reads.
type (
	// nvmeListSubsysOutput is the top-level JSON object.
	nvmeListSubsysOutput struct {
		Subsystems []nvmeSubsys `json:"Subsystems"`
	}

	// nvmeSubsys is one subsystem entry, identified by its NQN.
	//
	// NOTE: some nvme-cli versions reportedly use "Namespaces" instead of
	// "Paths"; that variant is not decoded here.
	nvmeSubsys struct {
		NQN   string     `json:"NQN"`
		Paths []nvmePath `json:"Paths"`
	}

	// nvmePath is one controller path of a subsystem.
	nvmePath struct {
		Name      string `json:"Name"` // controller name, e.g. "nvme0"
		Transport string `json:"Transport"`
		State     string `json:"State"`
	}
)
|||
|
|||
// findSubsys parses `nvme list-subsys -o json` to find controller and namespace device
|
|||
// for a given NQN. Returns (controller path, namespace device path, error).
|
|||
// Returns errNQNNotFound (sentinel) when the NQN is absent from the subsystem list.
|
|||
// Returns a non-sentinel error for command execution or JSON parse failures.
|
|||
func (r *realNVMeUtil) findSubsys(ctx context.Context, nqn string) (string, string, error) { |
|||
cmd := exec.CommandContext(ctx, "nvme", "list-subsys", "-o", "json") |
|||
out, err := cmd.CombinedOutput() |
|||
if err != nil { |
|||
return "", "", fmt.Errorf("nvme list-subsys: %s: %w", string(out), err) |
|||
} |
|||
|
|||
var parsed nvmeListSubsysOutput |
|||
if err := json.Unmarshal(out, &parsed); err != nil { |
|||
return "", "", fmt.Errorf("nvme list-subsys: parse json: %w", err) |
|||
} |
|||
|
|||
for _, ss := range parsed.Subsystems { |
|||
if ss.NQN != nqn { |
|||
continue |
|||
} |
|||
// Prefer a live TCP path. Fall back to any path with a name.
|
|||
var fallbackCtrl string |
|||
for _, p := range ss.Paths { |
|||
if p.Name == "" { |
|||
continue |
|||
} |
|||
ctrl := "/dev/" + p.Name |
|||
dev := ctrl + "n1" |
|||
// Prefer Transport=tcp + State=live.
|
|||
if strings.EqualFold(p.Transport, "tcp") && strings.EqualFold(p.State, "live") { |
|||
return ctrl, dev, nil |
|||
} |
|||
if fallbackCtrl == "" { |
|||
fallbackCtrl = ctrl |
|||
} |
|||
} |
|||
if fallbackCtrl != "" { |
|||
return fallbackCtrl, fallbackCtrl + "n1", nil |
|||
} |
|||
return "", "", fmt.Errorf("NQN %s found but no controller paths", nqn) |
|||
} |
|||
return "", "", errNQNNotFound |
|||
} |
|||
|
|||
// mockNVMeUtil is an in-memory NVMeUtil test double. Results and errors are
// injected through its fields; most calls are appended to calls so tests can
// assert on what was invoked and in which order.
type mockNVMeUtil struct {
	connectErr          error           // returned by Connect when non-nil
	disconnectErr       error           // returned by Disconnect when non-nil
	getDeviceResult     string          // returned by GetDeviceByNQN
	getDeviceErr        error           // returned by GetDeviceByNQN
	getControllerResult string          // returned by GetControllerByNQN
	getControllerErr    error           // returned by GetControllerByNQN
	rescanErr           error           // returned by Rescan
	nvmeTCPAvailable    bool            // returned by IsNVMeTCPAvailable
	connected           map[string]bool // NQNs currently considered connected
	calls               []string        // recorded calls, "op:arg[:arg]"
}

// newMockNVMeUtil returns a mock with an initialised connection table.
func newMockNVMeUtil() *mockNVMeUtil {
	mock := new(mockNVMeUtil)
	mock.connected = map[string]bool{}
	return mock
}

// record appends one call entry, joining its parts with ":".
func (m *mockNVMeUtil) record(parts ...string) {
	m.calls = append(m.calls, strings.Join(parts, ":"))
}

// Connect records the call and, unless connectErr is set, marks nqn connected.
func (m *mockNVMeUtil) Connect(_ context.Context, nqn, addr string) error {
	m.record("connect", nqn, addr)
	if m.connectErr == nil {
		m.connected[nqn] = true
	}
	return m.connectErr
}

// Disconnect records the call and, unless disconnectErr is set, forgets nqn.
func (m *mockNVMeUtil) Disconnect(_ context.Context, nqn string) error {
	m.record("disconnect", nqn)
	if m.disconnectErr == nil {
		delete(m.connected, nqn)
	}
	return m.disconnectErr
}

// IsConnected reports the tracked connection state of nqn. It never fails and
// is not recorded in calls.
func (m *mockNVMeUtil) IsConnected(_ context.Context, nqn string) (bool, error) {
	isUp := m.connected[nqn]
	return isUp, nil
}

// GetDeviceByNQN records the call and returns the configured device result.
func (m *mockNVMeUtil) GetDeviceByNQN(_ context.Context, nqn string) (string, error) {
	m.record("getdevice", nqn)
	return m.getDeviceResult, m.getDeviceErr
}

// GetControllerByNQN records the call and returns the configured controller
// result.
func (m *mockNVMeUtil) GetControllerByNQN(_ context.Context, nqn string) (string, error) {
	m.record("getcontroller", nqn)
	return m.getControllerResult, m.getControllerErr
}

// Rescan records the call and returns rescanErr.
func (m *mockNVMeUtil) Rescan(_ context.Context, nqn string) error {
	m.record("rescan", nqn)
	return m.rescanErr
}

// IsNVMeTCPAvailable returns the configured availability flag.
func (m *mockNVMeUtil) IsNVMeTCPAvailable() bool {
	return m.nvmeTCPAvailable
}
|||
1088
weed/storage/blockvol/csi/qa_cp102_nvme_node_test.go
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue