You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							379 lines
						
					
					
						
							12 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							379 lines
						
					
					
						
							12 KiB
						
					
					
				
								package mount
							 | 
						|
								
							 | 
						|
								import (
							 | 
						|
									"context"
							 | 
						|
									"encoding/json"
							 | 
						|
									"fmt"
							 | 
						|
									"io"
							 | 
						|
									"net/http"
							 | 
						|
									"net/url"
							 | 
						|
									"os"
							 | 
						|
									"strings"
							 | 
						|
									"sync/atomic"
							 | 
						|
									"time"
							 | 
						|
								
							 | 
						|
									"github.com/seaweedfs/seaweedfs/weed/glog"
							 | 
						|
									"github.com/seaweedfs/seaweedfs/weed/wdclient"
							 | 
						|
								)
							 | 
						|
								
							 | 
						|
								// RDMAMountClient provides RDMA acceleration for SeaweedFS mount operations
							 | 
						|
								type RDMAMountClient struct {
							 | 
						|
									sidecarAddr   string
							 | 
						|
									httpClient    *http.Client
							 | 
						|
									maxConcurrent int
							 | 
						|
									timeout       time.Duration
							 | 
						|
									semaphore     chan struct{}
							 | 
						|
								
							 | 
						|
									// Volume lookup
							 | 
						|
									lookupFileIdFn wdclient.LookupFileIdFunctionType
							 | 
						|
								
							 | 
						|
									// Statistics
							 | 
						|
									totalRequests   atomic.Int64
							 | 
						|
									successfulReads atomic.Int64
							 | 
						|
									failedReads     atomic.Int64
							 | 
						|
									totalBytesRead  atomic.Int64
							 | 
						|
									totalLatencyNs  atomic.Int64
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// RDMAReadRequest is the JSON shape of a read request: the needle is
								// identified by volume ID, needle ID and cookie, and the byte range by
								// offset and size.
								type RDMAReadRequest struct {
									// Needle identity.
									VolumeID uint32 `json:"volume_id"`
									NeedleID uint64 `json:"needle_id"`
									Cookie   uint32 `json:"cookie"`

									// Requested byte range.
									Offset uint64 `json:"offset"`
									Size   uint64 `json:"size"`
								}
							 | 
						|
								
							 | 
						|
								// RDMAReadResponse is the JSON shape of a read result. SessionID and
								// ErrorMsg are omitted from the encoding when empty; every other field is
								// always present.
								type RDMAReadResponse struct {
									// Outcome of the read.
									Success   bool   `json:"success"`
									IsRDMA    bool   `json:"is_rdma"`
									Source    string `json:"source"`
									Duration  string `json:"duration"`
									DataSize  int    `json:"data_size"`
									SessionID string `json:"session_id,omitempty"`
									ErrorMsg  string `json:"error,omitempty"`

									// Temp-file hand-off used by the zero-copy optimization.
									UseTempFile bool   `json:"use_temp_file"`
									TempFile    string `json:"temp_file"`
								}
							 | 
						|
								
							 | 
						|
								// RDMAHealthResponse is the JSON document decoded from the sidecar's
								// /health endpoint.
								type RDMAHealthResponse struct {
									// Status is compared against "healthy" by healthCheck.
									Status string `json:"status"`
									// RDMA carries the enabled/connected flags of the RDMA layer.
									RDMA struct {
										Enabled   bool `json:"enabled"`
										Connected bool `json:"connected"`
									} `json:"rdma"`
									// Timestamp is decoded as an opaque string.
									Timestamp string `json:"timestamp"`
								}
							 | 
						|
								
							 | 
						|
								// NewRDMAMountClient creates a new RDMA client for mount operations
							 | 
						|
								func NewRDMAMountClient(sidecarAddr string, lookupFileIdFn wdclient.LookupFileIdFunctionType, maxConcurrent int, timeoutMs int) (*RDMAMountClient, error) {
							 | 
						|
									client := &RDMAMountClient{
							 | 
						|
										sidecarAddr:   sidecarAddr,
							 | 
						|
										maxConcurrent: maxConcurrent,
							 | 
						|
										timeout:       time.Duration(timeoutMs) * time.Millisecond,
							 | 
						|
										httpClient: &http.Client{
							 | 
						|
											Timeout: time.Duration(timeoutMs) * time.Millisecond,
							 | 
						|
										},
							 | 
						|
										semaphore:      make(chan struct{}, maxConcurrent),
							 | 
						|
										lookupFileIdFn: lookupFileIdFn,
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Test connectivity and RDMA availability
							 | 
						|
									if err := client.healthCheck(); err != nil {
							 | 
						|
										return nil, fmt.Errorf("RDMA sidecar health check failed: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									glog.Infof("RDMA mount client initialized: sidecar=%s, maxConcurrent=%d, timeout=%v",
							 | 
						|
										sidecarAddr, maxConcurrent, client.timeout)
							 | 
						|
								
							 | 
						|
									return client, nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// lookupVolumeLocationByFileID finds the best volume server for a given file ID
							 | 
						|
								func (c *RDMAMountClient) lookupVolumeLocationByFileID(ctx context.Context, fileID string) (string, error) {
							 | 
						|
									glog.V(4).Infof("Looking up volume location for file ID %s", fileID)
							 | 
						|
								
							 | 
						|
									targetUrls, err := c.lookupFileIdFn(ctx, fileID)
							 | 
						|
									if err != nil {
							 | 
						|
										return "", fmt.Errorf("failed to lookup volume for file %s: %w", fileID, err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									if len(targetUrls) == 0 {
							 | 
						|
										return "", fmt.Errorf("no locations found for file %s", fileID)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Choose the first URL and extract the server address
							 | 
						|
									targetUrl := targetUrls[0]
							 | 
						|
									// Extract server address from URL like "http://server:port/fileId"
							 | 
						|
									parts := strings.Split(targetUrl, "/")
							 | 
						|
									if len(parts) < 3 {
							 | 
						|
										return "", fmt.Errorf("invalid target URL format: %s", targetUrl)
							 | 
						|
									}
							 | 
						|
									bestAddress := fmt.Sprintf("http://%s", parts[2])
							 | 
						|
								
							 | 
						|
									glog.V(4).Infof("File %s located at %s", fileID, bestAddress)
							 | 
						|
									return bestAddress, nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// lookupVolumeLocation finds the best volume server for a given volume ID (legacy method)
							 | 
						|
								func (c *RDMAMountClient) lookupVolumeLocation(ctx context.Context, volumeID uint32, needleID uint64, cookie uint32) (string, error) {
							 | 
						|
									// Create a file ID for lookup (format: volumeId,needleId,cookie)
							 | 
						|
									fileID := fmt.Sprintf("%d,%x,%d", volumeID, needleID, cookie)
							 | 
						|
									return c.lookupVolumeLocationByFileID(ctx, fileID)
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// healthCheck verifies that the RDMA sidecar is available and functioning
							 | 
						|
								func (c *RDMAMountClient) healthCheck() error {
							 | 
						|
									ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
							 | 
						|
									defer cancel()
							 | 
						|
								
							 | 
						|
									req, err := http.NewRequestWithContext(ctx, "GET",
							 | 
						|
										fmt.Sprintf("http://%s/health", c.sidecarAddr), nil)
							 | 
						|
									if err != nil {
							 | 
						|
										return fmt.Errorf("failed to create health check request: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									resp, err := c.httpClient.Do(req)
							 | 
						|
									if err != nil {
							 | 
						|
										return fmt.Errorf("health check request failed: %w", err)
							 | 
						|
									}
							 | 
						|
									defer resp.Body.Close()
							 | 
						|
								
							 | 
						|
									if resp.StatusCode != http.StatusOK {
							 | 
						|
										return fmt.Errorf("health check failed with status: %s", resp.Status)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Parse health response
							 | 
						|
									var health RDMAHealthResponse
							 | 
						|
									if err := json.NewDecoder(resp.Body).Decode(&health); err != nil {
							 | 
						|
										return fmt.Errorf("failed to parse health response: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									if health.Status != "healthy" {
							 | 
						|
										return fmt.Errorf("sidecar reports unhealthy status: %s", health.Status)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									if !health.RDMA.Enabled {
							 | 
						|
										return fmt.Errorf("RDMA is not enabled on sidecar")
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									if !health.RDMA.Connected {
							 | 
						|
										glog.Warningf("RDMA sidecar is healthy but not connected to RDMA engine")
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									return nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// ReadNeedle reads data from a specific needle using RDMA acceleration
							 | 
						|
								func (c *RDMAMountClient) ReadNeedle(ctx context.Context, fileID string, offset, size uint64) ([]byte, bool, error) {
							 | 
						|
									// Acquire semaphore for concurrency control
							 | 
						|
									select {
							 | 
						|
									case c.semaphore <- struct{}{}:
							 | 
						|
										defer func() { <-c.semaphore }()
							 | 
						|
									case <-ctx.Done():
							 | 
						|
										return nil, false, ctx.Err()
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									c.totalRequests.Add(1)
							 | 
						|
									startTime := time.Now()
							 | 
						|
								
							 | 
						|
									// Lookup volume location using file ID directly
							 | 
						|
									volumeServer, err := c.lookupVolumeLocationByFileID(ctx, fileID)
							 | 
						|
									if err != nil {
							 | 
						|
										c.failedReads.Add(1)
							 | 
						|
										return nil, false, fmt.Errorf("failed to lookup volume for file %s: %w", fileID, err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Prepare request URL with file_id parameter (simpler than individual components)
							 | 
						|
									reqURL := fmt.Sprintf("http://%s/read?file_id=%s&offset=%d&size=%d&volume_server=%s",
							 | 
						|
										c.sidecarAddr, fileID, offset, size, volumeServer)
							 | 
						|
								
							 | 
						|
									req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
							 | 
						|
									if err != nil {
							 | 
						|
										c.failedReads.Add(1)
							 | 
						|
										return nil, false, fmt.Errorf("failed to create RDMA request: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Execute request
							 | 
						|
									resp, err := c.httpClient.Do(req)
							 | 
						|
									if err != nil {
							 | 
						|
										c.failedReads.Add(1)
							 | 
						|
										return nil, false, fmt.Errorf("RDMA request failed: %w", err)
							 | 
						|
									}
							 | 
						|
									defer resp.Body.Close()
							 | 
						|
								
							 | 
						|
									duration := time.Since(startTime)
							 | 
						|
									c.totalLatencyNs.Add(duration.Nanoseconds())
							 | 
						|
								
							 | 
						|
									if resp.StatusCode != http.StatusOK {
							 | 
						|
										c.failedReads.Add(1)
							 | 
						|
										body, _ := io.ReadAll(resp.Body)
							 | 
						|
										return nil, false, fmt.Errorf("RDMA read failed with status %s: %s", resp.Status, string(body))
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Check if response indicates RDMA was used
							 | 
						|
									contentType := resp.Header.Get("Content-Type")
							 | 
						|
									isRDMA := strings.Contains(resp.Header.Get("X-Source"), "rdma") ||
							 | 
						|
										resp.Header.Get("X-RDMA-Used") == "true"
							 | 
						|
								
							 | 
						|
									// Check for zero-copy temp file optimization
							 | 
						|
									tempFilePath := resp.Header.Get("X-Temp-File")
							 | 
						|
									useTempFile := resp.Header.Get("X-Use-Temp-File") == "true"
							 | 
						|
								
							 | 
						|
									var data []byte
							 | 
						|
								
							 | 
						|
									if useTempFile && tempFilePath != "" {
							 | 
						|
										// Zero-copy path: read from temp file (page cache)
							 | 
						|
										glog.V(4).Infof("🔥 Using zero-copy temp file: %s", tempFilePath)
							 | 
						|
								
							 | 
						|
										// Allocate buffer for temp file read
							 | 
						|
										var bufferSize uint64 = 1024 * 1024 // Default 1MB
							 | 
						|
										if size > 0 {
							 | 
						|
											bufferSize = size
							 | 
						|
										}
							 | 
						|
										buffer := make([]byte, bufferSize)
							 | 
						|
								
							 | 
						|
										n, err := c.readFromTempFile(tempFilePath, buffer)
							 | 
						|
										if err != nil {
							 | 
						|
											glog.V(2).Infof("Zero-copy failed, falling back to HTTP body: %v", err)
							 | 
						|
											// Fall back to reading HTTP body
							 | 
						|
											data, err = io.ReadAll(resp.Body)
							 | 
						|
										} else {
							 | 
						|
											data = buffer[:n]
							 | 
						|
											glog.V(4).Infof("🔥 Zero-copy successful: %d bytes from page cache", n)
							 | 
						|
										}
							 | 
						|
								
							 | 
						|
										// Important: Cleanup temp file after reading (consumer responsibility)
							 | 
						|
										// This prevents accumulation of temp files in /tmp/rdma-cache
							 | 
						|
										go c.cleanupTempFile(tempFilePath)
							 | 
						|
									} else {
							 | 
						|
										// Regular path: read from HTTP response body
							 | 
						|
										data, err = io.ReadAll(resp.Body)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									if err != nil {
							 | 
						|
										c.failedReads.Add(1)
							 | 
						|
										return nil, false, fmt.Errorf("failed to read RDMA response: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									c.successfulReads.Add(1)
							 | 
						|
									c.totalBytesRead.Add(int64(len(data)))
							 | 
						|
								
							 | 
						|
									// Log successful operation
							 | 
						|
									glog.V(4).Infof("RDMA read completed: fileID=%s, size=%d, duration=%v, rdma=%v, contentType=%s",
							 | 
						|
										fileID, size, duration, isRDMA, contentType)
							 | 
						|
								
							 | 
						|
									return data, isRDMA, nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// cleanupTempFile requests cleanup of a temp file from the sidecar
							 | 
						|
								func (c *RDMAMountClient) cleanupTempFile(tempFilePath string) {
							 | 
						|
									if tempFilePath == "" {
							 | 
						|
										return
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Give the page cache a brief moment to be utilized before cleanup
							 | 
						|
									// This preserves the zero-copy performance window
							 | 
						|
									time.Sleep(100 * time.Millisecond)
							 | 
						|
								
							 | 
						|
									// Call sidecar cleanup endpoint
							 | 
						|
									cleanupURL := fmt.Sprintf("http://%s/cleanup?temp_file=%s", c.sidecarAddr, url.QueryEscape(tempFilePath))
							 | 
						|
								
							 | 
						|
									ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
							 | 
						|
									defer cancel()
							 | 
						|
								
							 | 
						|
									req, err := http.NewRequestWithContext(ctx, "DELETE", cleanupURL, nil)
							 | 
						|
									if err != nil {
							 | 
						|
										glog.V(2).Infof("Failed to create cleanup request for %s: %v", tempFilePath, err)
							 | 
						|
										return
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									resp, err := c.httpClient.Do(req)
							 | 
						|
									if err != nil {
							 | 
						|
										glog.V(2).Infof("Failed to cleanup temp file %s: %v", tempFilePath, err)
							 | 
						|
										return
							 | 
						|
									}
							 | 
						|
									defer resp.Body.Close()
							 | 
						|
								
							 | 
						|
									if resp.StatusCode == http.StatusOK {
							 | 
						|
										glog.V(4).Infof("🧹 Temp file cleaned up: %s", tempFilePath)
							 | 
						|
									} else {
							 | 
						|
										glog.V(2).Infof("Cleanup failed for %s: status %s", tempFilePath, resp.Status)
							 | 
						|
									}
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// GetStats returns current RDMA client statistics
							 | 
						|
								func (c *RDMAMountClient) GetStats() map[string]interface{} {
							 | 
						|
									totalRequests := c.totalRequests.Load()
							 | 
						|
									successfulReads := c.successfulReads.Load()
							 | 
						|
									failedReads := c.failedReads.Load()
							 | 
						|
									totalBytesRead := c.totalBytesRead.Load()
							 | 
						|
									totalLatencyNs := c.totalLatencyNs.Load()
							 | 
						|
								
							 | 
						|
									successRate := float64(0)
							 | 
						|
									avgLatencyNs := int64(0)
							 | 
						|
								
							 | 
						|
									if totalRequests > 0 {
							 | 
						|
										successRate = float64(successfulReads) / float64(totalRequests) * 100
							 | 
						|
										avgLatencyNs = totalLatencyNs / totalRequests
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									return map[string]interface{}{
							 | 
						|
										"sidecar_addr":     c.sidecarAddr,
							 | 
						|
										"max_concurrent":   c.maxConcurrent,
							 | 
						|
										"timeout_ms":       int(c.timeout / time.Millisecond),
							 | 
						|
										"total_requests":   totalRequests,
							 | 
						|
										"successful_reads": successfulReads,
							 | 
						|
										"failed_reads":     failedReads,
							 | 
						|
										"success_rate_pct": fmt.Sprintf("%.1f", successRate),
							 | 
						|
										"total_bytes_read": totalBytesRead,
							 | 
						|
										"avg_latency_ns":   avgLatencyNs,
							 | 
						|
										"avg_latency_ms":   fmt.Sprintf("%.3f", float64(avgLatencyNs)/1000000),
							 | 
						|
									}
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// Close shuts down the RDMA client and releases resources
							 | 
						|
								func (c *RDMAMountClient) Close() error {
							 | 
						|
									// No need to close semaphore channel; closing it may cause panics if goroutines are still using it.
							 | 
						|
									// The semaphore will be garbage collected when the client is no longer referenced.
							 | 
						|
								
							 | 
						|
									// Log final statistics
							 | 
						|
									stats := c.GetStats()
							 | 
						|
									glog.Infof("RDMA mount client closing: %+v", stats)
							 | 
						|
								
							 | 
						|
									return nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// IsHealthy checks if the RDMA sidecar is currently healthy
							 | 
						|
								func (c *RDMAMountClient) IsHealthy() bool {
							 | 
						|
									err := c.healthCheck()
							 | 
						|
									return err == nil
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// readFromTempFile performs zero-copy read from temp file using page cache
							 | 
						|
								func (c *RDMAMountClient) readFromTempFile(tempFilePath string, buffer []byte) (int, error) {
							 | 
						|
									if tempFilePath == "" {
							 | 
						|
										return 0, fmt.Errorf("empty temp file path")
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									// Open temp file for reading
							 | 
						|
									file, err := os.Open(tempFilePath)
							 | 
						|
									if err != nil {
							 | 
						|
										return 0, fmt.Errorf("failed to open temp file %s: %w", tempFilePath, err)
							 | 
						|
									}
							 | 
						|
									defer file.Close()
							 | 
						|
								
							 | 
						|
									// Read from temp file (this should be served from page cache)
							 | 
						|
									n, err := file.Read(buffer)
							 | 
						|
									if err != nil && err != io.EOF {
							 | 
						|
										return n, fmt.Errorf("failed to read from temp file: %w", err)
							 | 
						|
									}
							 | 
						|
								
							 | 
						|
									glog.V(4).Infof("🔥 Zero-copy read: %d bytes from temp file %s", n, tempFilePath)
							 | 
						|
								
							 | 
						|
									return n, nil
							 | 
						|
								}
							 |