You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
542 lines
19 KiB
542 lines
19 KiB
//! IPC (Inter-Process Communication) module for communicating with Go sidecar
|
|
//!
|
|
//! This module handles high-performance IPC between the Rust RDMA engine and
|
|
//! the Go control plane sidecar using Unix domain sockets and MessagePack serialization.
|
|
|
|
use crate::{RdmaError, RdmaResult, rdma::RdmaContext, session::SessionManager};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::sync::Arc;
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
use tokio::net::{UnixListener, UnixStream};
|
|
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader, BufWriter};
|
|
use tracing::{info, debug, error};
|
|
use uuid::Uuid;
|
|
use std::path::Path;
|
|
|
|
/// Atomic counter for generating unique work request IDs.
/// This ensures no hash collisions that could cause incorrect completion handling.
// NOTE(review): starts at 1 — presumably 0 is reserved as an invalid/sentinel
// work-request id; confirm against the completion-handling code.
static NEXT_WR_ID: AtomicU64 = AtomicU64::new(1);
|
|
|
|
/// IPC message types between Go sidecar and Rust RDMA engine.
///
/// Serialized with MessagePack using serde's adjacently-tagged layout:
/// `{"type": "<variant>", "data": {...}}`. Variant names are part of the
/// wire protocol shared with the Go side — do not rename them.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", content = "data")]
pub enum IpcMessage {
    /// Request to start an RDMA read operation
    StartRead(StartReadRequest),
    /// Response with RDMA session information
    StartReadResponse(StartReadResponse),

    /// Request to complete an RDMA operation
    CompleteRead(CompleteReadRequest),
    /// Response confirming completion
    CompleteReadResponse(CompleteReadResponse),

    /// Request for engine capabilities
    GetCapabilities(GetCapabilitiesRequest),
    /// Response with engine capabilities
    GetCapabilitiesResponse(GetCapabilitiesResponse),

    /// Health check ping
    Ping(PingRequest),
    /// Ping response
    Pong(PongResponse),

    /// Error response
    Error(ErrorResponse),
}
|
|
|
|
/// Request to start RDMA read operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StartReadRequest {
    /// Volume ID in SeaweedFS
    pub volume_id: u32,
    /// Needle ID in SeaweedFS
    pub needle_id: u64,
    /// Needle cookie for validation
    pub cookie: u32,
    /// File offset within the needle data
    pub offset: u64,
    /// Size to read in bytes (0 = entire needle)
    pub size: u64,
    /// Remote memory address from Go sidecar
    pub remote_addr: u64,
    /// Remote key for RDMA access
    pub remote_key: u32,
    /// Session timeout in seconds
    pub timeout_secs: u64,
    /// Authentication token (optional)
    pub auth_token: Option<String>,
}
|
|
|
|
/// Response with RDMA session details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StartReadResponse {
    /// Unique session identifier (UUID string)
    pub session_id: String,
    /// Local buffer address for RDMA
    pub local_addr: u64,
    /// Local key for RDMA operations
    pub local_key: u32,
    /// Actual size that will be transferred
    pub transfer_size: u64,
    /// Expected CRC checksum
    pub expected_crc: u32,
    /// Session expiration timestamp (Unix nanoseconds)
    pub expires_at_ns: u64,
}
|
|
|
|
/// Request to complete RDMA operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompleteReadRequest {
    /// Session ID to complete
    pub session_id: String,
    /// Whether the operation was successful
    pub success: bool,
    /// Actual bytes transferred
    pub bytes_transferred: u64,
    /// Client-computed CRC (for verification)
    pub client_crc: Option<u32>,
    /// Error message if failed
    pub error_message: Option<String>,
}
|
|
|
|
/// Response confirming completion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompleteReadResponse {
    /// Whether completion was successful
    pub success: bool,
    /// Server-computed CRC for verification
    pub server_crc: Option<u32>,
    /// Any cleanup messages
    pub message: Option<String>,
}
|
|
|
|
/// Request for engine capabilities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GetCapabilitiesRequest {
    /// Client identifier (optional; used for logging/diagnostics)
    pub client_id: Option<String>,
}
|
|
|
|
/// Response with engine capabilities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GetCapabilitiesResponse {
    /// RDMA device name
    pub device_name: String,
    /// RDMA device vendor ID
    pub vendor_id: u32,
    /// Maximum transfer size in bytes
    pub max_transfer_size: u64,
    /// Maximum concurrent sessions
    pub max_sessions: usize,
    /// Current active sessions
    pub active_sessions: usize,
    /// Device port GID
    pub port_gid: String,
    /// Device port LID
    pub port_lid: u16,
    /// Supported authentication methods
    pub supported_auth: Vec<String>,
    /// Engine version (from Cargo package version)
    pub version: String,
    /// Whether real RDMA hardware is available
    pub real_rdma: bool,
}
|
|
|
|
/// Health check ping request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PingRequest {
    /// Client timestamp (Unix nanoseconds)
    pub timestamp_ns: u64,
    /// Client identifier
    pub client_id: Option<String>,
}
|
|
|
|
/// Ping response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PongResponse {
    /// Original client timestamp
    pub client_timestamp_ns: u64,
    /// Server timestamp (Unix nanoseconds)
    pub server_timestamp_ns: u64,
    /// Round-trip time in nanoseconds (server perspective).
    // NOTE(review): computed as server_ts - client_ts, so it mixes two
    // clocks; treat as approximate latency, not a true RTT.
    pub server_rtt_ns: u64,
}
|
|
|
|
/// Error response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
    /// Error code (Debug rendering of the originating `RdmaError` variant)
    pub code: String,
    /// Human-readable error message
    pub message: String,
    /// Error category
    pub category: String,
    /// Whether the error is recoverable
    pub recoverable: bool,
}
|
|
|
|
impl From<&RdmaError> for ErrorResponse {
|
|
fn from(error: &RdmaError) -> Self {
|
|
Self {
|
|
code: format!("{:?}", error),
|
|
message: error.to_string(),
|
|
category: error.category().to_string(),
|
|
recoverable: error.is_recoverable(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// IPC server handling communication with Go sidecar.
pub struct IpcServer {
    /// Filesystem path of the Unix domain socket.
    socket_path: String,
    /// Bound listener; `None` until `run()` is called.
    listener: Option<UnixListener>,
    /// Shared RDMA device context used to register memory and post reads.
    rdma_context: Arc<RdmaContext>,
    /// Tracks active RDMA sessions and their buffers.
    session_manager: Arc<SessionManager>,
    /// Cooperative shutdown signal checked by the accept and read loops.
    shutdown_flag: Arc<parking_lot::RwLock<bool>>,
}
|
|
|
|
impl IpcServer {
|
|
/// Create new IPC server
|
|
pub async fn new(
|
|
socket_path: &str,
|
|
rdma_context: Arc<RdmaContext>,
|
|
session_manager: Arc<SessionManager>,
|
|
) -> RdmaResult<Self> {
|
|
// Remove existing socket if it exists
|
|
if Path::new(socket_path).exists() {
|
|
std::fs::remove_file(socket_path)
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to remove existing socket: {}", e)))?;
|
|
}
|
|
|
|
Ok(Self {
|
|
socket_path: socket_path.to_string(),
|
|
listener: None,
|
|
rdma_context,
|
|
session_manager,
|
|
shutdown_flag: Arc::new(parking_lot::RwLock::new(false)),
|
|
})
|
|
}
|
|
|
|
/// Start the IPC server
|
|
pub async fn run(&mut self) -> RdmaResult<()> {
|
|
let listener = UnixListener::bind(&self.socket_path)
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to bind Unix socket: {}", e)))?;
|
|
|
|
info!("🎯 IPC server listening on: {}", self.socket_path);
|
|
self.listener = Some(listener);
|
|
|
|
if let Some(ref listener) = self.listener {
|
|
loop {
|
|
// Check shutdown flag
|
|
if *self.shutdown_flag.read() {
|
|
info!("IPC server shutting down");
|
|
break;
|
|
}
|
|
|
|
// Accept connection with timeout
|
|
let accept_result = tokio::time::timeout(
|
|
tokio::time::Duration::from_millis(100),
|
|
listener.accept()
|
|
).await;
|
|
|
|
match accept_result {
|
|
Ok(Ok((stream, addr))) => {
|
|
debug!("New IPC connection from: {:?}", addr);
|
|
|
|
// Spawn handler for this connection
|
|
let rdma_context = self.rdma_context.clone();
|
|
let session_manager = self.session_manager.clone();
|
|
let shutdown_flag = self.shutdown_flag.clone();
|
|
|
|
tokio::spawn(async move {
|
|
if let Err(e) = Self::handle_connection(stream, rdma_context, session_manager, shutdown_flag).await {
|
|
error!("IPC connection error: {}", e);
|
|
}
|
|
});
|
|
}
|
|
Ok(Err(e)) => {
|
|
error!("Failed to accept IPC connection: {}", e);
|
|
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
|
}
|
|
Err(_) => {
|
|
// Timeout - continue loop to check shutdown flag
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Handle a single IPC connection
|
|
async fn handle_connection(
|
|
stream: UnixStream,
|
|
rdma_context: Arc<RdmaContext>,
|
|
session_manager: Arc<SessionManager>,
|
|
shutdown_flag: Arc<parking_lot::RwLock<bool>>,
|
|
) -> RdmaResult<()> {
|
|
let (reader_half, writer_half) = stream.into_split();
|
|
let mut reader = BufReader::new(reader_half);
|
|
let mut writer = BufWriter::new(writer_half);
|
|
|
|
let mut buffer = Vec::with_capacity(4096);
|
|
|
|
loop {
|
|
// Check shutdown
|
|
if *shutdown_flag.read() {
|
|
break;
|
|
}
|
|
|
|
// Read message length (4 bytes)
|
|
let mut len_bytes = [0u8; 4];
|
|
match tokio::time::timeout(
|
|
tokio::time::Duration::from_millis(100),
|
|
reader.read_exact(&mut len_bytes)
|
|
).await {
|
|
Ok(Ok(_)) => {},
|
|
Ok(Err(e)) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
|
|
debug!("IPC connection closed by peer");
|
|
break;
|
|
}
|
|
Ok(Err(e)) => return Err(RdmaError::ipc_error(format!("Read error: {}", e))),
|
|
Err(_) => continue, // Timeout, check shutdown flag
|
|
}
|
|
|
|
let msg_len = u32::from_le_bytes(len_bytes) as usize;
|
|
if msg_len > 1024 * 1024 { // 1MB max message size
|
|
return Err(RdmaError::ipc_error("Message too large"));
|
|
}
|
|
|
|
// Read message data
|
|
buffer.clear();
|
|
buffer.resize(msg_len, 0);
|
|
reader.read_exact(&mut buffer).await
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to read message: {}", e)))?;
|
|
|
|
// Deserialize message
|
|
let request: IpcMessage = rmp_serde::from_slice(&buffer)
|
|
.map_err(|e| RdmaError::SerializationError { reason: e.to_string() })?;
|
|
|
|
debug!("Received IPC message: {:?}", request);
|
|
|
|
// Process message
|
|
let response = Self::process_message(
|
|
request,
|
|
&rdma_context,
|
|
&session_manager,
|
|
).await;
|
|
|
|
// Serialize response
|
|
let response_data = rmp_serde::to_vec(&response)
|
|
.map_err(|e| RdmaError::SerializationError { reason: e.to_string() })?;
|
|
|
|
// Send response
|
|
let response_len = (response_data.len() as u32).to_le_bytes();
|
|
writer.write_all(&response_len).await
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to write response length: {}", e)))?;
|
|
writer.write_all(&response_data).await
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to write response: {}", e)))?;
|
|
writer.flush().await
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to flush response: {}", e)))?;
|
|
|
|
debug!("Sent IPC response");
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Process IPC message and generate response
|
|
async fn process_message(
|
|
message: IpcMessage,
|
|
rdma_context: &Arc<RdmaContext>,
|
|
session_manager: &Arc<SessionManager>,
|
|
) -> IpcMessage {
|
|
match message {
|
|
IpcMessage::Ping(req) => {
|
|
let server_timestamp = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0) as u64;
|
|
IpcMessage::Pong(PongResponse {
|
|
client_timestamp_ns: req.timestamp_ns,
|
|
server_timestamp_ns: server_timestamp,
|
|
server_rtt_ns: server_timestamp.saturating_sub(req.timestamp_ns),
|
|
})
|
|
}
|
|
|
|
IpcMessage::GetCapabilities(_req) => {
|
|
let device_info = rdma_context.device_info();
|
|
let active_sessions = session_manager.active_session_count().await;
|
|
|
|
IpcMessage::GetCapabilitiesResponse(GetCapabilitiesResponse {
|
|
device_name: device_info.name.clone(),
|
|
vendor_id: device_info.vendor_id,
|
|
max_transfer_size: device_info.max_mr_size,
|
|
max_sessions: session_manager.max_sessions(),
|
|
active_sessions,
|
|
port_gid: device_info.port_gid.clone(),
|
|
port_lid: device_info.port_lid,
|
|
supported_auth: vec!["none".to_string()],
|
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
real_rdma: cfg!(feature = "real-ucx"),
|
|
})
|
|
}
|
|
|
|
IpcMessage::StartRead(req) => {
|
|
match Self::handle_start_read(req, rdma_context, session_manager).await {
|
|
Ok(response) => IpcMessage::StartReadResponse(response),
|
|
Err(error) => IpcMessage::Error(ErrorResponse::from(&error)),
|
|
}
|
|
}
|
|
|
|
IpcMessage::CompleteRead(req) => {
|
|
match Self::handle_complete_read(req, session_manager).await {
|
|
Ok(response) => IpcMessage::CompleteReadResponse(response),
|
|
Err(error) => IpcMessage::Error(ErrorResponse::from(&error)),
|
|
}
|
|
}
|
|
|
|
_ => IpcMessage::Error(ErrorResponse {
|
|
code: "UNSUPPORTED_MESSAGE".to_string(),
|
|
message: "Unsupported message type".to_string(),
|
|
category: "request".to_string(),
|
|
recoverable: true,
|
|
}),
|
|
}
|
|
}
|
|
|
|
/// Handle StartRead request
|
|
async fn handle_start_read(
|
|
req: StartReadRequest,
|
|
rdma_context: &Arc<RdmaContext>,
|
|
session_manager: &Arc<SessionManager>,
|
|
) -> RdmaResult<StartReadResponse> {
|
|
info!("🚀 Starting RDMA read: volume={}, needle={}, size={}",
|
|
req.volume_id, req.needle_id, req.size);
|
|
|
|
// Create session
|
|
let session_id = Uuid::new_v4().to_string();
|
|
let transfer_size = if req.size == 0 { 65536 } else { req.size }; // Default 64KB
|
|
|
|
// Allocate local buffer
|
|
let buffer = vec![0u8; transfer_size as usize];
|
|
let local_addr = buffer.as_ptr() as u64;
|
|
|
|
// Register memory for RDMA
|
|
let memory_region = rdma_context.register_memory(local_addr, transfer_size as usize).await?;
|
|
|
|
// Create and store session
|
|
session_manager.create_session(
|
|
session_id.clone(),
|
|
req.volume_id,
|
|
req.needle_id,
|
|
req.remote_addr,
|
|
req.remote_key,
|
|
transfer_size,
|
|
buffer,
|
|
memory_region.clone(),
|
|
chrono::Duration::seconds(req.timeout_secs as i64),
|
|
).await?;
|
|
|
|
// Perform RDMA read with unique work request ID
|
|
// Use atomic counter to avoid hash collisions that could cause incorrect completion handling
|
|
let wr_id = NEXT_WR_ID.fetch_add(1, Ordering::Relaxed);
|
|
rdma_context.post_read(
|
|
local_addr,
|
|
req.remote_addr,
|
|
req.remote_key,
|
|
transfer_size as usize,
|
|
wr_id,
|
|
).await?;
|
|
|
|
// Poll for completion
|
|
let completions = rdma_context.poll_completion(1).await?;
|
|
if completions.is_empty() {
|
|
return Err(RdmaError::operation_failed("RDMA read", -1));
|
|
}
|
|
|
|
let completion = &completions[0];
|
|
if completion.status != crate::rdma::CompletionStatus::Success {
|
|
return Err(RdmaError::operation_failed("RDMA read", completion.status as i32));
|
|
}
|
|
|
|
info!("✅ RDMA read completed: {} bytes", completion.byte_len);
|
|
|
|
let expires_at = chrono::Utc::now() + chrono::Duration::seconds(req.timeout_secs as i64);
|
|
|
|
Ok(StartReadResponse {
|
|
session_id,
|
|
local_addr,
|
|
local_key: memory_region.lkey,
|
|
transfer_size,
|
|
expected_crc: 0x12345678, // Mock CRC
|
|
expires_at_ns: expires_at.timestamp_nanos_opt().unwrap_or(0) as u64,
|
|
})
|
|
}
|
|
|
|
/// Handle CompleteRead request
|
|
async fn handle_complete_read(
|
|
req: CompleteReadRequest,
|
|
session_manager: &Arc<SessionManager>,
|
|
) -> RdmaResult<CompleteReadResponse> {
|
|
info!("🏁 Completing RDMA read session: {}", req.session_id);
|
|
|
|
// Clean up session
|
|
session_manager.remove_session(&req.session_id).await?;
|
|
|
|
Ok(CompleteReadResponse {
|
|
success: req.success,
|
|
server_crc: Some(0x12345678), // Mock CRC
|
|
message: Some("Session completed successfully".to_string()),
|
|
})
|
|
}
|
|
|
|
/// Shutdown the IPC server
|
|
pub async fn shutdown(&mut self) -> RdmaResult<()> {
|
|
info!("Shutting down IPC server");
|
|
*self.shutdown_flag.write() = true;
|
|
|
|
// Remove socket file
|
|
if Path::new(&self.socket_path).exists() {
|
|
std::fs::remove_file(&self.socket_path)
|
|
.map_err(|e| RdmaError::ipc_error(format!("Failed to remove socket file: {}", e)))?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// An engine error must map onto the wire ErrorResponse with its
    /// message, category, and recoverability preserved.
    #[test]
    fn test_error_response_conversion() {
        let err = RdmaError::device_not_found("mlx5_0");
        let resp: ErrorResponse = (&err).into();

        assert!(resp.message.contains("mlx5_0"));
        assert_eq!(resp.category, "hardware");
        assert!(!resp.recoverable);
    }

    /// A Ping message must survive a MessagePack round-trip intact.
    #[test]
    fn test_message_serialization() {
        let ping = PingRequest {
            timestamp_ns: 12345,
            client_id: Some("test".to_string()),
        };

        let bytes = rmp_serde::to_vec(&IpcMessage::Ping(ping)).unwrap();
        let round_tripped: IpcMessage = rmp_serde::from_slice(&bytes).unwrap();

        if let IpcMessage::Ping(p) = round_tripped {
            assert_eq!(p.timestamp_ns, 12345);
            assert_eq!(p.client_id.as_deref(), Some("test"));
        } else {
            panic!("Wrong message type");
        }
    }
}
|