//! Error types and handling for the RDMA engine

// use std::fmt; // Unused for now
|
|
use thiserror::Error;
|
|
|
|
/// Result type alias for RDMA operations
|
|
pub type RdmaResult<T> = Result<T, RdmaError>;
|
|
|
|
/// Comprehensive error types for RDMA operations.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute. Use [`RdmaError::category`] for a coarse label and
/// [`RdmaError::is_recoverable`] to decide whether a retry makes sense.
#[derive(Error, Debug)]
pub enum RdmaError {
    /// RDMA device not found or unavailable
    #[error("RDMA device '{device}' not found or unavailable")]
    DeviceNotFound { device: String },

    /// Failed to initialize RDMA context
    #[error("Failed to initialize RDMA context: {reason}")]
    ContextInitFailed { reason: String },

    /// Failed to allocate protection domain
    #[error("Failed to allocate protection domain: {reason}")]
    PdAllocFailed { reason: String },

    /// Failed to create completion queue
    #[error("Failed to create completion queue: {reason}")]
    CqCreationFailed { reason: String },

    /// Failed to create queue pair
    #[error("Failed to create queue pair: {reason}")]
    QpCreationFailed { reason: String },

    /// Memory registration failed
    #[error("Memory registration failed: {reason}")]
    MemoryRegFailed { reason: String },

    /// RDMA operation failed; `status` is the numeric status shown in the message
    #[error("RDMA operation failed: {operation}, status: {status}")]
    OperationFailed { operation: String, status: i32 },

    /// Session not found
    #[error("Session '{session_id}' not found")]
    SessionNotFound { session_id: String },

    /// Session expired
    #[error("Session '{session_id}' has expired")]
    SessionExpired { session_id: String },

    /// Too many active sessions
    #[error("Maximum number of sessions ({max_sessions}) exceeded")]
    TooManySessions { max_sessions: usize },

    /// IPC communication error
    #[error("IPC communication error: {reason}")]
    IpcError { reason: String },

    /// Serialization/deserialization error
    #[error("Serialization error: {reason}")]
    SerializationError { reason: String },

    /// Invalid request parameters
    #[error("Invalid request: {reason}")]
    InvalidRequest { reason: String },

    /// Insufficient buffer space
    #[error("Insufficient buffer space: requested {requested}, available {available}")]
    InsufficientBuffer { requested: usize, available: usize },

    /// Hardware not supported
    #[error("Hardware not supported: {reason}")]
    UnsupportedHardware { reason: String },

    /// System resource exhausted
    #[error("System resource exhausted: {resource}")]
    ResourceExhausted { resource: String },

    /// Permission denied
    #[error("Permission denied: {operation}")]
    PermissionDenied { operation: String },

    /// Network timeout
    #[error("Network timeout after {timeout_ms}ms")]
    NetworkTimeout { timeout_ms: u64 },

    /// I/O error; `#[from]` lets `?` convert a `std::io::Error` into this variant
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Generic error for unexpected conditions
    #[error("Internal error: {reason}")]
    Internal { reason: String },
}

impl RdmaError {
|
|
/// Create a new DeviceNotFound error
|
|
pub fn device_not_found(device: impl Into<String>) -> Self {
|
|
Self::DeviceNotFound { device: device.into() }
|
|
}
|
|
|
|
/// Create a new ContextInitFailed error
|
|
pub fn context_init_failed(reason: impl Into<String>) -> Self {
|
|
Self::ContextInitFailed { reason: reason.into() }
|
|
}
|
|
|
|
/// Create a new MemoryRegFailed error
|
|
pub fn memory_reg_failed(reason: impl Into<String>) -> Self {
|
|
Self::MemoryRegFailed { reason: reason.into() }
|
|
}
|
|
|
|
/// Create a new OperationFailed error
|
|
pub fn operation_failed(operation: impl Into<String>, status: i32) -> Self {
|
|
Self::OperationFailed {
|
|
operation: operation.into(),
|
|
status
|
|
}
|
|
}
|
|
|
|
/// Create a new SessionNotFound error
|
|
pub fn session_not_found(session_id: impl Into<String>) -> Self {
|
|
Self::SessionNotFound { session_id: session_id.into() }
|
|
}
|
|
|
|
/// Create a new IpcError
|
|
pub fn ipc_error(reason: impl Into<String>) -> Self {
|
|
Self::IpcError { reason: reason.into() }
|
|
}
|
|
|
|
/// Create a new InvalidRequest error
|
|
pub fn invalid_request(reason: impl Into<String>) -> Self {
|
|
Self::InvalidRequest { reason: reason.into() }
|
|
}
|
|
|
|
/// Create a new Internal error
|
|
pub fn internal(reason: impl Into<String>) -> Self {
|
|
Self::Internal { reason: reason.into() }
|
|
}
|
|
|
|
/// Check if this error is recoverable
|
|
pub fn is_recoverable(&self) -> bool {
|
|
match self {
|
|
// Network and temporary errors are recoverable
|
|
Self::NetworkTimeout { .. } |
|
|
Self::ResourceExhausted { .. } |
|
|
Self::TooManySessions { .. } |
|
|
Self::InsufficientBuffer { .. } => true,
|
|
|
|
// Session errors are recoverable (can retry with new session)
|
|
Self::SessionNotFound { .. } |
|
|
Self::SessionExpired { .. } => true,
|
|
|
|
// Hardware and system errors are generally not recoverable
|
|
Self::DeviceNotFound { .. } |
|
|
Self::ContextInitFailed { .. } |
|
|
Self::UnsupportedHardware { .. } |
|
|
Self::PermissionDenied { .. } => false,
|
|
|
|
// IPC errors might be recoverable
|
|
Self::IpcError { .. } |
|
|
Self::SerializationError { .. } => true,
|
|
|
|
// Invalid requests are not recoverable without fixing the request
|
|
Self::InvalidRequest { .. } => false,
|
|
|
|
// RDMA operation failures might be recoverable
|
|
Self::OperationFailed { .. } => true,
|
|
|
|
// Memory and resource allocation failures depend on the cause
|
|
Self::PdAllocFailed { .. } |
|
|
Self::CqCreationFailed { .. } |
|
|
Self::QpCreationFailed { .. } |
|
|
Self::MemoryRegFailed { .. } => false,
|
|
|
|
// I/O errors might be recoverable
|
|
Self::Io(_) => true,
|
|
|
|
// Internal errors are generally not recoverable
|
|
Self::Internal { .. } => false,
|
|
}
|
|
}
|
|
|
|
/// Get error category for metrics and logging
|
|
pub fn category(&self) -> &'static str {
|
|
match self {
|
|
Self::DeviceNotFound { .. } |
|
|
Self::ContextInitFailed { .. } |
|
|
Self::UnsupportedHardware { .. } => "hardware",
|
|
|
|
Self::PdAllocFailed { .. } |
|
|
Self::CqCreationFailed { .. } |
|
|
Self::QpCreationFailed { .. } |
|
|
Self::MemoryRegFailed { .. } => "resource",
|
|
|
|
Self::OperationFailed { .. } => "rdma",
|
|
|
|
Self::SessionNotFound { .. } |
|
|
Self::SessionExpired { .. } |
|
|
Self::TooManySessions { .. } => "session",
|
|
|
|
Self::IpcError { .. } |
|
|
Self::SerializationError { .. } => "ipc",
|
|
|
|
Self::InvalidRequest { .. } => "request",
|
|
|
|
Self::InsufficientBuffer { .. } |
|
|
Self::ResourceExhausted { .. } => "capacity",
|
|
|
|
Self::PermissionDenied { .. } => "security",
|
|
|
|
Self::NetworkTimeout { .. } => "network",
|
|
|
|
Self::Io(_) => "io",
|
|
|
|
Self::Internal { .. } => "internal",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Convert from various RDMA library error codes
|
|
impl From<i32> for RdmaError {
|
|
fn from(errno: i32) -> Self {
|
|
match errno {
|
|
libc::ENODEV => Self::DeviceNotFound {
|
|
device: "unknown".to_string()
|
|
},
|
|
libc::ENOMEM => Self::ResourceExhausted {
|
|
resource: "memory".to_string()
|
|
},
|
|
libc::EPERM | libc::EACCES => Self::PermissionDenied {
|
|
operation: "RDMA operation".to_string()
|
|
},
|
|
libc::ETIMEDOUT => Self::NetworkTimeout {
|
|
timeout_ms: 5000
|
|
},
|
|
libc::ENOSPC => Self::InsufficientBuffer {
|
|
requested: 0,
|
|
available: 0
|
|
},
|
|
_ => Self::Internal {
|
|
reason: format!("System error: {}", errno)
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructor helpers build the expected variant, and that variant
    /// reports the expected category and recoverability.
    #[test]
    fn test_error_creation() {
        let err = RdmaError::device_not_found("mlx5_0");
        assert!(matches!(err, RdmaError::DeviceNotFound { .. }));
        assert_eq!(err.category(), "hardware");
        assert!(!err.is_recoverable());
    }

    /// Spot-check one recoverable and one non-recoverable family.
    #[test]
    fn test_error_recoverability() {
        assert!(RdmaError::NetworkTimeout { timeout_ms: 1000 }.is_recoverable());
        assert!(!RdmaError::DeviceNotFound {
            device: "test".to_string()
        }
        .is_recoverable());
        assert!(RdmaError::SessionExpired {
            session_id: "test".to_string()
        }
        .is_recoverable());
    }

    /// The `Display` output contains both the fixed prefix and the payload.
    #[test]
    fn test_error_display() {
        let err = RdmaError::InvalidRequest {
            reason: "missing field".to_string(),
        };
        assert!(err.to_string().contains("Invalid request"));
        assert!(err.to_string().contains("missing field"));
    }

    /// `From` conversions: an I/O error is wrapped in `Io`, and an unmapped
    /// numeric code falls through to `Internal` with the raw number kept.
    #[test]
    fn test_error_conversions() {
        let io = std::io::Error::new(std::io::ErrorKind::Other, "boom");
        let err = RdmaError::from(io);
        assert!(matches!(err, RdmaError::Io(_)));
        assert_eq!(err.category(), "io");
        assert!(err.is_recoverable());
        assert!(err.to_string().contains("boom"));

        let err = RdmaError::from(-12345_i32);
        assert!(matches!(err, RdmaError::Internal { .. }));
        assert_eq!(err.to_string(), "Internal error: System error: -12345");
    }
}