You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							379 lines
						
					
					
						
							12 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							379 lines
						
					
					
						
							12 KiB
						
					
					
package mount

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"sync/atomic"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/wdclient"
)

| // RDMAMountClient provides RDMA acceleration for SeaweedFS mount operations | |
| type RDMAMountClient struct { | |
| 	sidecarAddr   string | |
| 	httpClient    *http.Client | |
| 	maxConcurrent int | |
| 	timeout       time.Duration | |
| 	semaphore     chan struct{} | |
| 
 | |
| 	// Volume lookup | |
| 	lookupFileIdFn wdclient.LookupFileIdFunctionType | |
| 
 | |
| 	// Statistics | |
| 	totalRequests   atomic.Int64 | |
| 	successfulReads atomic.Int64 | |
| 	failedReads     atomic.Int64 | |
| 	totalBytesRead  atomic.Int64 | |
| 	totalLatencyNs  atomic.Int64 | |
| } | |
| 
 | |
// RDMAReadRequest represents a request to read data via RDMA.
// The JSON tags define its wire representation.
type RDMAReadRequest struct {
	VolumeID uint32 `json:"volume_id"`
	NeedleID uint64 `json:"needle_id"`
	Cookie   uint32 `json:"cookie"`
	Offset   uint64 `json:"offset"`
	Size     uint64 `json:"size"`
}

// RDMAReadResponse represents the response from an RDMA read operation.
// SessionID and ErrorMsg are omitted from the JSON encoding when empty.
type RDMAReadResponse struct {
	Success   bool   `json:"success"`
	IsRDMA    bool   `json:"is_rdma"`
	Source    string `json:"source"`
	Duration  string `json:"duration"`
	DataSize  int    `json:"data_size"`
	SessionID string `json:"session_id,omitempty"`
	ErrorMsg  string `json:"error,omitempty"`

	// Zero-copy optimization fields
	UseTempFile bool   `json:"use_temp_file"`
	TempFile    string `json:"temp_file"`
}

// RDMAHealthResponse represents the health status of the RDMA sidecar, as
// decoded from the JSON body of its health endpoint.
type RDMAHealthResponse struct {
	Status string `json:"status"`
	RDMA   struct {
		Enabled   bool `json:"enabled"`
		Connected bool `json:"connected"`
	} `json:"rdma"`
	Timestamp string `json:"timestamp"`
}

| // NewRDMAMountClient creates a new RDMA client for mount operations | |
| func NewRDMAMountClient(sidecarAddr string, lookupFileIdFn wdclient.LookupFileIdFunctionType, maxConcurrent int, timeoutMs int) (*RDMAMountClient, error) { | |
| 	client := &RDMAMountClient{ | |
| 		sidecarAddr:   sidecarAddr, | |
| 		maxConcurrent: maxConcurrent, | |
| 		timeout:       time.Duration(timeoutMs) * time.Millisecond, | |
| 		httpClient: &http.Client{ | |
| 			Timeout: time.Duration(timeoutMs) * time.Millisecond, | |
| 		}, | |
| 		semaphore:      make(chan struct{}, maxConcurrent), | |
| 		lookupFileIdFn: lookupFileIdFn, | |
| 	} | |
| 
 | |
| 	// Test connectivity and RDMA availability | |
| 	if err := client.healthCheck(); err != nil { | |
| 		return nil, fmt.Errorf("RDMA sidecar health check failed: %w", err) | |
| 	} | |
| 
 | |
| 	glog.Infof("RDMA mount client initialized: sidecar=%s, maxConcurrent=%d, timeout=%v", | |
| 		sidecarAddr, maxConcurrent, client.timeout) | |
| 
 | |
| 	return client, nil | |
| } | |
| 
 | |
| // lookupVolumeLocationByFileID finds the best volume server for a given file ID | |
| func (c *RDMAMountClient) lookupVolumeLocationByFileID(ctx context.Context, fileID string) (string, error) { | |
| 	glog.V(4).Infof("Looking up volume location for file ID %s", fileID) | |
| 
 | |
| 	targetUrls, err := c.lookupFileIdFn(ctx, fileID) | |
| 	if err != nil { | |
| 		return "", fmt.Errorf("failed to lookup volume for file %s: %w", fileID, err) | |
| 	} | |
| 
 | |
| 	if len(targetUrls) == 0 { | |
| 		return "", fmt.Errorf("no locations found for file %s", fileID) | |
| 	} | |
| 
 | |
| 	// Choose the first URL and extract the server address | |
| 	targetUrl := targetUrls[0] | |
| 	// Extract server address from URL like "http://server:port/fileId" | |
| 	parts := strings.Split(targetUrl, "/") | |
| 	if len(parts) < 3 { | |
| 		return "", fmt.Errorf("invalid target URL format: %s", targetUrl) | |
| 	} | |
| 	bestAddress := fmt.Sprintf("http://%s", parts[2]) | |
| 
 | |
| 	glog.V(4).Infof("File %s located at %s", fileID, bestAddress) | |
| 	return bestAddress, nil | |
| } | |
| 
 | |
| // lookupVolumeLocation finds the best volume server for a given volume ID (legacy method) | |
| func (c *RDMAMountClient) lookupVolumeLocation(ctx context.Context, volumeID uint32, needleID uint64, cookie uint32) (string, error) { | |
| 	// Create a file ID for lookup (format: volumeId,needleId,cookie) | |
| 	fileID := fmt.Sprintf("%d,%x,%d", volumeID, needleID, cookie) | |
| 	return c.lookupVolumeLocationByFileID(ctx, fileID) | |
| } | |
| 
 | |
| // healthCheck verifies that the RDMA sidecar is available and functioning | |
| func (c *RDMAMountClient) healthCheck() error { | |
| 	ctx, cancel := context.WithTimeout(context.Background(), c.timeout) | |
| 	defer cancel() | |
| 
 | |
| 	req, err := http.NewRequestWithContext(ctx, "GET", | |
| 		fmt.Sprintf("http://%s/health", c.sidecarAddr), nil) | |
| 	if err != nil { | |
| 		return fmt.Errorf("failed to create health check request: %w", err) | |
| 	} | |
| 
 | |
| 	resp, err := c.httpClient.Do(req) | |
| 	if err != nil { | |
| 		return fmt.Errorf("health check request failed: %w", err) | |
| 	} | |
| 	defer resp.Body.Close() | |
| 
 | |
| 	if resp.StatusCode != http.StatusOK { | |
| 		return fmt.Errorf("health check failed with status: %s", resp.Status) | |
| 	} | |
| 
 | |
| 	// Parse health response | |
| 	var health RDMAHealthResponse | |
| 	if err := json.NewDecoder(resp.Body).Decode(&health); err != nil { | |
| 		return fmt.Errorf("failed to parse health response: %w", err) | |
| 	} | |
| 
 | |
| 	if health.Status != "healthy" { | |
| 		return fmt.Errorf("sidecar reports unhealthy status: %s", health.Status) | |
| 	} | |
| 
 | |
| 	if !health.RDMA.Enabled { | |
| 		return fmt.Errorf("RDMA is not enabled on sidecar") | |
| 	} | |
| 
 | |
| 	if !health.RDMA.Connected { | |
| 		glog.Warningf("RDMA sidecar is healthy but not connected to RDMA engine") | |
| 	} | |
| 
 | |
| 	return nil | |
| } | |
| 
 | |
| // ReadNeedle reads data from a specific needle using RDMA acceleration | |
| func (c *RDMAMountClient) ReadNeedle(ctx context.Context, fileID string, offset, size uint64) ([]byte, bool, error) { | |
| 	// Acquire semaphore for concurrency control | |
| 	select { | |
| 	case c.semaphore <- struct{}{}: | |
| 		defer func() { <-c.semaphore }() | |
| 	case <-ctx.Done(): | |
| 		return nil, false, ctx.Err() | |
| 	} | |
| 
 | |
| 	c.totalRequests.Add(1) | |
| 	startTime := time.Now() | |
| 
 | |
| 	// Lookup volume location using file ID directly | |
| 	volumeServer, err := c.lookupVolumeLocationByFileID(ctx, fileID) | |
| 	if err != nil { | |
| 		c.failedReads.Add(1) | |
| 		return nil, false, fmt.Errorf("failed to lookup volume for file %s: %w", fileID, err) | |
| 	} | |
| 
 | |
| 	// Prepare request URL with file_id parameter (simpler than individual components) | |
| 	reqURL := fmt.Sprintf("http://%s/read?file_id=%s&offset=%d&size=%d&volume_server=%s", | |
| 		c.sidecarAddr, fileID, offset, size, volumeServer) | |
| 
 | |
| 	req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil) | |
| 	if err != nil { | |
| 		c.failedReads.Add(1) | |
| 		return nil, false, fmt.Errorf("failed to create RDMA request: %w", err) | |
| 	} | |
| 
 | |
| 	// Execute request | |
| 	resp, err := c.httpClient.Do(req) | |
| 	if err != nil { | |
| 		c.failedReads.Add(1) | |
| 		return nil, false, fmt.Errorf("RDMA request failed: %w", err) | |
| 	} | |
| 	defer resp.Body.Close() | |
| 
 | |
| 	duration := time.Since(startTime) | |
| 	c.totalLatencyNs.Add(duration.Nanoseconds()) | |
| 
 | |
| 	if resp.StatusCode != http.StatusOK { | |
| 		c.failedReads.Add(1) | |
| 		body, _ := io.ReadAll(resp.Body) | |
| 		return nil, false, fmt.Errorf("RDMA read failed with status %s: %s", resp.Status, string(body)) | |
| 	} | |
| 
 | |
| 	// Check if response indicates RDMA was used | |
| 	contentType := resp.Header.Get("Content-Type") | |
| 	isRDMA := strings.Contains(resp.Header.Get("X-Source"), "rdma") || | |
| 		resp.Header.Get("X-RDMA-Used") == "true" | |
| 
 | |
| 	// Check for zero-copy temp file optimization | |
| 	tempFilePath := resp.Header.Get("X-Temp-File") | |
| 	useTempFile := resp.Header.Get("X-Use-Temp-File") == "true" | |
| 
 | |
| 	var data []byte | |
| 
 | |
| 	if useTempFile && tempFilePath != "" { | |
| 		// Zero-copy path: read from temp file (page cache) | |
| 		glog.V(4).Infof("🔥 Using zero-copy temp file: %s", tempFilePath) | |
| 
 | |
| 		// Allocate buffer for temp file read | |
| 		var bufferSize uint64 = 1024 * 1024 // Default 1MB | |
| 		if size > 0 { | |
| 			bufferSize = size | |
| 		} | |
| 		buffer := make([]byte, bufferSize) | |
| 
 | |
| 		n, err := c.readFromTempFile(tempFilePath, buffer) | |
| 		if err != nil { | |
| 			glog.V(2).Infof("Zero-copy failed, falling back to HTTP body: %v", err) | |
| 			// Fall back to reading HTTP body | |
| 			data, err = io.ReadAll(resp.Body) | |
| 		} else { | |
| 			data = buffer[:n] | |
| 			glog.V(4).Infof("🔥 Zero-copy successful: %d bytes from page cache", n) | |
| 		} | |
| 
 | |
| 		// Important: Cleanup temp file after reading (consumer responsibility) | |
| 		// This prevents accumulation of temp files in /tmp/rdma-cache | |
| 		go c.cleanupTempFile(tempFilePath) | |
| 	} else { | |
| 		// Regular path: read from HTTP response body | |
| 		data, err = io.ReadAll(resp.Body) | |
| 	} | |
| 
 | |
| 	if err != nil { | |
| 		c.failedReads.Add(1) | |
| 		return nil, false, fmt.Errorf("failed to read RDMA response: %w", err) | |
| 	} | |
| 
 | |
| 	c.successfulReads.Add(1) | |
| 	c.totalBytesRead.Add(int64(len(data))) | |
| 
 | |
| 	// Log successful operation | |
| 	glog.V(4).Infof("RDMA read completed: fileID=%s, size=%d, duration=%v, rdma=%v, contentType=%s", | |
| 		fileID, size, duration, isRDMA, contentType) | |
| 
 | |
| 	return data, isRDMA, nil | |
| } | |
| 
 | |
| // cleanupTempFile requests cleanup of a temp file from the sidecar | |
| func (c *RDMAMountClient) cleanupTempFile(tempFilePath string) { | |
| 	if tempFilePath == "" { | |
| 		return | |
| 	} | |
| 
 | |
| 	// Give the page cache a brief moment to be utilized before cleanup | |
| 	// This preserves the zero-copy performance window | |
| 	time.Sleep(100 * time.Millisecond) | |
| 
 | |
| 	// Call sidecar cleanup endpoint | |
| 	cleanupURL := fmt.Sprintf("http://%s/cleanup?temp_file=%s", c.sidecarAddr, url.QueryEscape(tempFilePath)) | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) | |
| 	defer cancel() | |
| 
 | |
| 	req, err := http.NewRequestWithContext(ctx, "DELETE", cleanupURL, nil) | |
| 	if err != nil { | |
| 		glog.V(2).Infof("Failed to create cleanup request for %s: %v", tempFilePath, err) | |
| 		return | |
| 	} | |
| 
 | |
| 	resp, err := c.httpClient.Do(req) | |
| 	if err != nil { | |
| 		glog.V(2).Infof("Failed to cleanup temp file %s: %v", tempFilePath, err) | |
| 		return | |
| 	} | |
| 	defer resp.Body.Close() | |
| 
 | |
| 	if resp.StatusCode == http.StatusOK { | |
| 		glog.V(4).Infof("🧹 Temp file cleaned up: %s", tempFilePath) | |
| 	} else { | |
| 		glog.V(2).Infof("Cleanup failed for %s: status %s", tempFilePath, resp.Status) | |
| 	} | |
| } | |
| 
 | |
| // GetStats returns current RDMA client statistics | |
| func (c *RDMAMountClient) GetStats() map[string]interface{} { | |
| 	totalRequests := c.totalRequests.Load() | |
| 	successfulReads := c.successfulReads.Load() | |
| 	failedReads := c.failedReads.Load() | |
| 	totalBytesRead := c.totalBytesRead.Load() | |
| 	totalLatencyNs := c.totalLatencyNs.Load() | |
| 
 | |
| 	successRate := float64(0) | |
| 	avgLatencyNs := int64(0) | |
| 
 | |
| 	if totalRequests > 0 { | |
| 		successRate = float64(successfulReads) / float64(totalRequests) * 100 | |
| 		avgLatencyNs = totalLatencyNs / totalRequests | |
| 	} | |
| 
 | |
| 	return map[string]interface{}{ | |
| 		"sidecar_addr":     c.sidecarAddr, | |
| 		"max_concurrent":   c.maxConcurrent, | |
| 		"timeout_ms":       int(c.timeout / time.Millisecond), | |
| 		"total_requests":   totalRequests, | |
| 		"successful_reads": successfulReads, | |
| 		"failed_reads":     failedReads, | |
| 		"success_rate_pct": fmt.Sprintf("%.1f", successRate), | |
| 		"total_bytes_read": totalBytesRead, | |
| 		"avg_latency_ns":   avgLatencyNs, | |
| 		"avg_latency_ms":   fmt.Sprintf("%.3f", float64(avgLatencyNs)/1000000), | |
| 	} | |
| } | |
| 
 | |
| // Close shuts down the RDMA client and releases resources | |
| func (c *RDMAMountClient) Close() error { | |
| 	// No need to close semaphore channel; closing it may cause panics if goroutines are still using it. | |
| 	// The semaphore will be garbage collected when the client is no longer referenced. | |
|  | |
| 	// Log final statistics | |
| 	stats := c.GetStats() | |
| 	glog.Infof("RDMA mount client closing: %+v", stats) | |
| 
 | |
| 	return nil | |
| } | |
| 
 | |
| // IsHealthy checks if the RDMA sidecar is currently healthy | |
| func (c *RDMAMountClient) IsHealthy() bool { | |
| 	err := c.healthCheck() | |
| 	return err == nil | |
| } | |
| 
 | |
| // readFromTempFile performs zero-copy read from temp file using page cache | |
| func (c *RDMAMountClient) readFromTempFile(tempFilePath string, buffer []byte) (int, error) { | |
| 	if tempFilePath == "" { | |
| 		return 0, fmt.Errorf("empty temp file path") | |
| 	} | |
| 
 | |
| 	// Open temp file for reading | |
| 	file, err := os.Open(tempFilePath) | |
| 	if err != nil { | |
| 		return 0, fmt.Errorf("failed to open temp file %s: %w", tempFilePath, err) | |
| 	} | |
| 	defer file.Close() | |
| 
 | |
| 	// Read from temp file (this should be served from page cache) | |
| 	n, err := file.Read(buffer) | |
| 	if err != nil && err != io.EOF { | |
| 		return n, fmt.Errorf("failed to read from temp file: %w", err) | |
| 	} | |
| 
 | |
| 	glog.V(4).Infof("🔥 Zero-copy read: %d bytes from temp file %s", n, tempFilePath) | |
| 
 | |
| 	return n, nil | |
| }
 |