From 9a4f32fc495d8d83756c4e6e985bce06cc4f7517 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sun, 21 Dec 2025 23:25:30 -0800 Subject: [PATCH] feat: add automatic port detection and fallback for mini command (#7836) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add automatic port detection and fallback for mini command - Added port availability detection using TCP binding tests - Implemented port fallback mechanism searching for available ports - Support for both HTTP and gRPC port handling - IP-aware port checking using actual service bind address - Dual-interface verification (specific IP and wildcard 0.0.0.0) - All services (Master, Volume, Filer, S3, WebDAV, Admin) auto-reallocate to available ports - Enables multiple mini instances to run simultaneously without conflicts * fix: use actual bind IP for service health checks - Previously health checks were hardcoded to localhost (127.0.0.1) - This caused failures when services bind to actual IP (e.g., 10.21.153.8) - Now health checks use the same IP that services are binding to - Fixes Volume and other service health check failures on non-localhost IPs * refactor: improve port detection logic and remove gRPC handling duplication - findAvailablePortOnIP now returns 0 on failure instead of unavailable port Allows callers to detect when port finding fails and handle appropriately - Remove duplicate gRPC port handling from ensureAllPortsAvailableOnIP All gRPC port logic is now centralized in initializeGrpcPortsOnIP - Log final port configuration only after all ports are finalized Both HTTP and gRPC ports are now correctly initialized before logging - Add error logging when port allocation fails Makes debugging easier when ports can't be found * refactor: fix race condition and clean up port detection code - Convert parallel HTTP port checks to sequential to prevent race conditions where multiple goroutines could allocate the same available port - Remove unused 'sync' 
import since WaitGroup is no longer used - Add documentation to localhost wrapper functions explaining they are kept for backwards compatibility and future use - All gRPC port logic is now exclusively handled in initializeGrpcPortsOnIP eliminating any duplication in ensureAllPortsAvailableOnIP * refactor: address code review comments - constants, helper function, and cleanup - Define GrpcPortOffset constant (10000) to replace magic numbers throughout the code for better maintainability and consistency - Extract bindIp determination logic into getBindIp() helper function to eliminate code duplication between runMini and startMiniServices - Remove redundant 'calculatedPort = calculatedPort' assignment that had no effect - Update all gRPC port calculations to use GrpcPortOffset constant (lines 489, 886 and the error logging at line 501) * refactor: remove unused wrapper functions and update documentation - Remove unused localhost wrapper functions that were never called: - isPortOpen() - wrapper around isPortOpenOnIP with hardcoded 127.0.0.1 - findAvailablePort() - wrapper around findAvailablePortOnIP with hardcoded 127.0.0.1 - ensurePortAvailable() - wrapper around ensurePortAvailableOnIP with hardcoded 127.0.0.1 - ensureAllPortsAvailable() - wrapper around ensureAllPortsAvailableOnIP with hardcoded 127.0.0.1 Since this is new functionality with no backwards compatibility concerns, these wrapper functions were not needed. The comments claiming they were 'kept for future use or backwards compatibility' are no longer valid. 
- Update documentation to reference GrpcPortOffset constant instead of hardcoded 10000: - Update comment in ensureAllPortsAvailableOnIP to use GrpcPortOffset - Update admin.port.grpc flag help text to reference GrpcPortOffset Note: getBindIp() is actually being used and should be retained (contrary to the review comment suggesting it was unused - it's called in both runMini and startMiniServices functions) * refactor: prevent HTTP/gRPC port collisions and improve error handling - Add upfront reservation of all calculated gRPC ports before allocating HTTP ports to prevent collisions where an HTTP port allocation could use a port that will later be needed for a gRPC port calculation. Example scenario that is now prevented: - Master HTTP reallocated from 9333 to 9334 (original in use) - Filer HTTP search finds 19334 available and assigns it - Master gRPC calculated as 9334 + GrpcPortOffset = 19334 → collision! Now: reserved gRPC ports are tracked upfront and HTTP port search skips them. - Improve admin server gRPC port fallback error handling: - Change from silent V(1) verbose log to Warningf to make the error visible - Update comment to clarify this indicates a problem in the port initialization sequence - Add explanation that the fallback calculation may cause bind failure - Update ensureAllPortsAvailableOnIP comment to clarify it avoids reserved ports * fix: enforce reserved ports in HTTP allocation and improve admin gRPC fallback Critical fixes for port allocation safety: 1. Make findAvailablePortOnIP and ensurePortAvailableOnIP aware of reservedPorts: - Add reservedPorts map parameter to both functions - findAvailablePortOnIP now skips reserved ports when searching for alternatives - ensurePortAvailableOnIP passes reservedPorts through to findAvailablePortOnIP - This prevents HTTP ports from being allocated to ports reserved for gRPC 2. 
Update ensureAllPortsAvailableOnIP to pass reservedPorts: - Pass the reservedPorts map to ensurePortAvailableOnIP calls - Maintains the map updates (delete/add) for accuracy as ports change 3. Replace blind admin gRPC port fallback with proper availability checks: - Previous code just calculated *miniAdminOptions.port + GrpcPortOffset - New code checks both the calculated port and finds alternatives if needed - Uses the same availability checking logic as initializeGrpcPortsOnIP - Properly logs the fallback process and any port changes - Will fail gracefully if no available ports found (consistent with other services) These changes eliminate two critical vulnerabilities: - HTTP port allocation can no longer accidentally claim gRPC ports - Admin gRPC port fallback no longer blindly uses an unchecked port * fix: prevent gRPC port collisions during multi-service fallback allocation Critical fix for gRPC port allocation safety across multiple services: Problem: When multiple services need gRPC port fallback allocation in sequence (e.g., Master gRPC unavailable → finds alternative, then Filer gRPC unavailable → searches from calculated port), there was no tracking of previously allocated gRPC ports. This could allow two services to claim the same port. Scenario that is now prevented: - Master gRPC: calculated 19333 unavailable → finds 19334 → assigns 19334 - Filer gRPC: calculated 18888 unavailable → searches from 18889, might land on 19334 if consecutive ports in range are unavailable (especially with custom port configurations or in high-port-contention environments) Solution: - Add allocatedGrpcPorts map to track gRPC ports allocated within the function - Check allocatedGrpcPorts before using calculated port for each service - Pass allocatedGrpcPorts to findAvailablePortOnIP when finding fallback ports - Add allocatedGrpcPorts[port] = true after each successful allocation - This ensures no two services can allocate the same gRPC port The fix handles both: 1. 
Calculated gRPC ports (when grpcPort == 0) 2. Explicitly set gRPC ports (when user provides -service.port.grpc value) While default port spacing makes collision unlikely, this fix is essential for: - Custom port configurations - High-contention environments - Edge cases with many unavailable consecutive ports - Correctness and safety guarantees * feat: enforce hard-fail behavior for explicitly specified ports When users explicitly specify a port via command-line flags (e.g., -s3.port=8333), the server should fail immediately if the port is unavailable, rather than silently falling back to an alternative port. This prevents user confusion and makes misconfiguration failures obvious. Changes: - Modified ensurePortAvailableOnIP() to check if a port was explicitly passed via isFlagPassed() - If an explicit port is unavailable, return error instead of silently allocating alternative - Updated ensureAllPortsAvailableOnIP() to handle the returned error and fail startup - Modified runMini() to check error from ensureAllPortsAvailableOnIP() and return false on failure - Default ports (not explicitly specified) continue to fall back to available alternatives This ensures: - Explicit ports: fail if unavailable (e.g., -s3.port=8333 fails if 8333 is taken) - Default ports: fall back to alternatives (e.g., s3.port without flag falls back to 8334 if 8333 is taken) * fix: accurate error messages for explicitly specified unavailable ports When a port is explicitly specified via CLI flags but is unavailable, the error message now correctly reports the originally requested port instead of reporting a fallback port that was calculated internally. The issue was that the config file applied after CLI flag parsing caused isFlagPassed() to return true for ports loaded from the config file (since flag.Visit() was called during config file application), incorrectly marking them as explicitly specified. 
Solution: Capture which port flags were explicitly passed on the CLI BEFORE the config file is applied, storing them in the explicitPortFlags map. This preserves the accurate distinction between user-specified ports and defaults/config-file ports. Example: - User runs: weed mini -dir=. -s3.port=22 - Now correctly shows: 'port 22 for S3 (specified by flag s3.port) is not available' - Previously incorrectly showed: 'port 8334 for S3...' (some calculated fallback) * fix: respect explicitly specified ports and prevent config file override When a port is explicitly specified via CLI flags (e.g., -s3.port=8333), the config file options should NOT override it. Previously, config file options would be applied if the flag value differed from default, but this check wasn't sufficient to prevent override in all cases. Solution: Check the explicitPortFlags map before applying any config file port options. If a port was explicitly passed on the CLI, skip applying the config file option for that port. This ensures: - Explicit ports take absolute precedence over config file ports - Config file ports are only used if port wasn't specified on CLI - Example: 'weed mini -s3.port=8333' will use 8333, never the config file value * fix: don't print usage on port allocation error When a port allocation fails (e.g., explicit port is unavailable), exit immediately without showing the usage example. This provides cleaner error output when the error is expected (port conflict). * fix: increase worker registration timeout for reconnections Increase the worker registration timeout from 10 seconds to 30 seconds. The 10-second timeout was too aggressive for reconnections when the admin server might be busy processing other operations. Reconnecting workers need more time to: 1. Re-establish the gRPC connection 2. Send the registration message 3. 
Wait for the admin server to process and respond This prevents spurious "registration timeout" errors during long-running mini instances when brief network hiccups or admin server load cause delays. * refactor: clean up code quality issues Remove no-op assignment (calculatedPort = calculatedPort) that had no effect. The variable already holds the correct value when no alternative port is found. Improve documentation for the defensive gRPC port initialization fallback in startAdminServer. While this code shouldn't execute in normal flow because ensureAllPortsAvailableOnIP is called earlier in runMini, the fallback handles edge cases where port initialization may have been skipped or failed silently due to configuration changes or error handling paths. --- weed/command/mini.go | 291 ++++++++++++++++++++++++++++++++++++++++-- weed/worker/client.go | 3 +- 2 files changed, 284 insertions(+), 10 deletions(-) diff --git a/weed/command/mini.go b/weed/command/mini.go index 2602a4a23..fc359f904 100644 --- a/weed/command/mini.go +++ b/weed/command/mini.go @@ -44,6 +44,7 @@ const ( minVolumeSizeMB = 64 // Minimum volume size in MB defaultMiniVolumeSizeMB = 128 // Default volume size for mini mode maxVolumeSizeMB = 1024 // Maximum volume size in MB (1GB) + GrpcPortOffset = 10000 // Offset used to calculate gRPC port from HTTP port ) var ( @@ -54,6 +55,8 @@ var ( miniWebDavOptions WebDavOption miniAdminOptions AdminOptions createdInitialIAM bool // Track if initial IAM config was created from env vars + // Track which port flags were explicitly passed on CLI before config file is applied + explicitPortFlags map[string]bool ) func init() { @@ -117,6 +120,15 @@ var ( miniS3AllowDeleteBucketNotEmpty = cmdMini.Flag.Bool("s3.allowDeleteBucketNotEmpty", true, "allow recursive deleting all entries along with bucket") ) +// getBindIp determines the bind IP address based on miniIp and miniBindIp flags +// Returns miniBindIp if set (non-empty), otherwise returns miniIp +func getBindIp() 
string { + if *miniBindIp != "" { + return *miniBindIp + } + return *miniIp +} + // initMiniCommonFlags initializes common mini flags func initMiniCommonFlags() { miniOptions.cpuprofile = cmdMini.Flag.String("cpuprofile", "", "cpu profile output file") @@ -242,7 +254,7 @@ func initMiniWebDAVFlags() { // initMiniAdminFlags initializes Admin server flag options func initMiniAdminFlags() { miniAdminOptions.port = cmdMini.Flag.Int("admin.port", 23646, "admin server http listen port") - miniAdminOptions.grpcPort = cmdMini.Flag.Int("admin.port.grpc", 0, "admin server grpc listen port (default: admin http port + 10000)") + miniAdminOptions.grpcPort = cmdMini.Flag.Int("admin.port.grpc", 0, "admin server grpc listen port (default: admin http port + GrpcPortOffset)") miniAdminOptions.master = cmdMini.Flag.String("admin.master", "", "master server address (automatically set)") miniAdminOptions.dataDir = cmdMini.Flag.String("admin.dataDir", "", "directory to store admin configuration and data files") miniAdminOptions.adminUser = cmdMini.Flag.String("admin.user", "admin", "admin interface username") @@ -326,6 +338,221 @@ func isFlagPassed(name string) bool { return found } +// isPortOpenOnIP checks if a port is available for binding on a specific IP address +func isPortOpenOnIP(ip string, port int) bool { + listener, err := net.Listen("tcp", fmt.Sprintf("%s:%d", ip, port)) + if err != nil { + return false + } + listener.Close() + return true +} + +// isPortAvailable checks if a port is available on any interface +// This is more comprehensive than checking a single IP +func isPortAvailable(port int) bool { + // Try to listen on all interfaces (0.0.0.0) + listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) + if err != nil { + return false + } + listener.Close() + return true +} + +// findAvailablePortOnIP finds the next available port on a specific IP starting from the given port +// It skips any ports that are in the reservedPorts map (for gRPC port collision 
avoidance) +// It returns the first available port found within maxAttempts, or 0 if none found +func findAvailablePortOnIP(ip string, startPort int, maxAttempts int, reservedPorts map[int]bool) int { + for i := 0; i < maxAttempts; i++ { + port := startPort + i + // Skip ports reserved for gRPC calculation + if reservedPorts[port] { + continue + } + // Check on both the specific IP and on all interfaces for maximum reliability + if isPortOpenOnIP(ip, port) && isPortAvailable(port) { + return port + } + } + // If no port found, return 0 to indicate failure + return 0 +} + +// ensurePortAvailableOnIP ensures a port pointer points to an available port on a specific IP +// If the port is not available, it finds the next available port and updates the pointer +// The reservedPorts map contains ports that should not be allocated (for gRPC collision avoidance) +func ensurePortAvailableOnIP(portPtr *int, serviceName string, ip string, reservedPorts map[int]bool, flagName string) error { + if portPtr == nil { + return nil + } + + original := *portPtr + + // Check if this port was explicitly specified by the user (from CLI, before config file was applied) + isExplicitPort := explicitPortFlags[flagName] + + // Skip if this port is reserved for gRPC calculation + if reservedPorts[original] { + if isExplicitPort { + return fmt.Errorf("port %d for %s (specified by flag %s) is reserved for gRPC calculation and cannot be used", original, serviceName, flagName) + } + glog.Warningf("Port %d for %s is reserved for gRPC calculation, finding alternative...", original, serviceName) + newPort := findAvailablePortOnIP(ip, original+1, 100, reservedPorts) + if newPort == 0 { + glog.Errorf("Could not find available port for %s starting from %d, will use original %d and fail on binding", serviceName, original+1, original) + } else { + glog.Infof("Port %d for %s is available, using it instead of %d", newPort, serviceName, original) + *portPtr = newPort + } + return nil + } + + // Check on both 
the specific IP and on all interfaces (0.0.0.0) for maximum reliability + if !isPortOpenOnIP(ip, original) || !isPortAvailable(original) { + // If explicitly specified, fail immediately with the originally requested port + if isExplicitPort { + return fmt.Errorf("port %d for %s (specified by flag %s) is not available on %s and cannot be used", original, serviceName, flagName, ip) + } + // For default ports, try to find an alternative + glog.Warningf("Port %d for %s is not available on %s, finding alternative port...", original, serviceName, ip) + newPort := findAvailablePortOnIP(ip, original+1, 100, reservedPorts) + if newPort == 0 { + glog.Errorf("Could not find available port for %s starting from %d, will use original %d and fail on binding", serviceName, original+1, original) + } else { + glog.Infof("Port %d for %s is available, using it instead of %d", newPort, serviceName, original) + *portPtr = newPort + } + } else { + glog.V(1).Infof("Port %d for %s is available on %s", original, serviceName, ip) + } + return nil +} + +// ensureAllPortsAvailableOnIP ensures all mini service ports are available on a specific IP +// Returns an error if an explicitly specified port is unavailable. 
+// This should be called before starting any services +func ensureAllPortsAvailableOnIP(bindIp string) error { + portConfigs := []struct { + port *int + name string + flagName string + grpcPtr *int + }{ + {miniMasterOptions.port, "Master", "master.port", miniMasterOptions.portGrpc}, + {miniFilerOptions.port, "Filer", "filer.port", miniFilerOptions.portGrpc}, + {miniOptions.v.port, "Volume", "volume.port", miniOptions.v.portGrpc}, + {miniS3Options.port, "S3", "s3.port", miniS3Options.portGrpc}, + {miniWebDavOptions.port, "WebDAV", "webdav.port", nil}, + {miniAdminOptions.port, "Admin", "admin.port", miniAdminOptions.grpcPort}, + } + + // First, reserve all gRPC ports that will be calculated to prevent HTTP port allocation from using them + // This prevents collisions like: HTTP port moves to X, then gRPC port is calculated as Y where Y == X + reservedPorts := make(map[int]bool) + for _, config := range portConfigs { + if config.grpcPtr != nil && *config.grpcPtr == 0 { + // This gRPC port will be calculated as httpPort + GrpcPortOffset + calculatedGrpcPort := *config.port + GrpcPortOffset + reservedPorts[calculatedGrpcPort] = true + } + } + + // Check all HTTP ports sequentially to avoid race conditions + // Each port check and allocation must complete before the next one starts + // to prevent multiple goroutines from claiming the same available port + // Also avoid allocating ports that are reserved for gRPC calculation + for _, config := range portConfigs { + original := *config.port + if err := ensurePortAvailableOnIP(config.port, config.name, bindIp, reservedPorts, config.flagName); err != nil { + return err + } + // If port was changed, update the reserved gRPC ports mapping + if *config.port != original && config.grpcPtr != nil && *config.grpcPtr == 0 { + delete(reservedPorts, original+GrpcPortOffset) + reservedPorts[*config.port+GrpcPortOffset] = true + } + } + + // Initialize all gRPC ports before services start + // This ensures they won't be recalculated 
and cause conflicts + // All gRPC port handling (calculation, validation, and assignment) is performed exclusively in initializeGrpcPortsOnIP + initializeGrpcPortsOnIP(bindIp) + + // Log the final port configuration + glog.Infof("Final port configuration - Master: %d, Filer: %d, Volume: %d, S3: %d, WebDAV: %d, Admin: %d", + *miniMasterOptions.port, *miniFilerOptions.port, *miniOptions.v.port, + *miniS3Options.port, *miniWebDavOptions.port, *miniAdminOptions.port) + + // Log gRPC ports too (now finalized) + glog.Infof("gRPC port configuration - Master: %d, Filer: %d, Volume: %d, S3: %d, Admin: %d", + *miniMasterOptions.portGrpc, *miniFilerOptions.portGrpc, *miniOptions.v.portGrpc, + *miniS3Options.portGrpc, *miniAdminOptions.grpcPort) + + return nil +} + +// initializeGrpcPortsOnIP initializes all gRPC ports based on their HTTP ports on a specific IP +// If a gRPC port is 0, it will be set to httpPort + GrpcPortOffset +// This must be called after HTTP ports are finalized and before services start +func initializeGrpcPortsOnIP(bindIp string) { + // Track gRPC ports allocated during this function to prevent collisions between services + // when multiple services need fallback port allocation + allocatedGrpcPorts := make(map[int]bool) + + grpcConfigs := []struct { + httpPort *int + grpcPort *int + name string + }{ + {miniMasterOptions.port, miniMasterOptions.portGrpc, "Master"}, + {miniFilerOptions.port, miniFilerOptions.portGrpc, "Filer"}, + {miniOptions.v.port, miniOptions.v.portGrpc, "Volume"}, + {miniS3Options.port, miniS3Options.portGrpc, "S3"}, + {miniAdminOptions.port, miniAdminOptions.grpcPort, "Admin"}, + } + + for _, config := range grpcConfigs { + if config.grpcPort == nil { + continue + } + + // If gRPC port is 0, calculate it + if *config.grpcPort == 0 { + calculatedPort := *config.httpPort + GrpcPortOffset + // Check if calculated port is available (on both specific IP and all interfaces) + // Also check if it was already allocated to another service in 
this function + if !isPortOpenOnIP(bindIp, calculatedPort) || !isPortAvailable(calculatedPort) || allocatedGrpcPorts[calculatedPort] { + glog.Warningf("Calculated gRPC port %d for %s is not available, finding alternative...", calculatedPort, config.name) + newPort := findAvailablePortOnIP(bindIp, calculatedPort+1, 100, allocatedGrpcPorts) + if newPort == 0 { + glog.Errorf("Could not find available gRPC port for %s starting from %d, will use calculated %d and fail on binding", config.name, calculatedPort+1, calculatedPort) + } else { + calculatedPort = newPort + glog.Infof("gRPC port %d for %s is available, using it instead of calculated %d", newPort, config.name, *config.httpPort+GrpcPortOffset) + } + } + *config.grpcPort = calculatedPort + allocatedGrpcPorts[calculatedPort] = true + glog.V(1).Infof("%s gRPC port initialized to %d", config.name, calculatedPort) + } else { + // gRPC port was explicitly set, verify it's still available (check on both specific IP and all interfaces) + // Also check if it was already allocated to another service in this function + if !isPortOpenOnIP(bindIp, *config.grpcPort) || !isPortAvailable(*config.grpcPort) || allocatedGrpcPorts[*config.grpcPort] { + glog.Warningf("Explicitly set gRPC port %d for %s is not available, finding alternative...", *config.grpcPort, config.name) + newPort := findAvailablePortOnIP(bindIp, *config.grpcPort+1, 100, allocatedGrpcPorts) + if newPort == 0 { + glog.Errorf("Could not find available gRPC port for %s starting from %d, will use original %d and fail on binding", config.name, *config.grpcPort+1, *config.grpcPort) + } else { + glog.Infof("gRPC port %d for %s is available, using it instead of %d", newPort, config.name, *config.grpcPort) + *config.grpcPort = newPort + } + } + allocatedGrpcPorts[*config.grpcPort] = true + } + } +} + // loadMiniConfigurationFile reads the mini.options file and returns parsed options // File format: one option per line, without leading dash (e.g., "ip=127.0.0.1") func 
loadMiniConfigurationFile(dataFolder string) (map[string]string, error) { @@ -380,6 +607,11 @@ func loadMiniConfigurationFile(dataFolder string) (map[string]string, error) { // applyConfigFileOptions sets command-line flags from loaded configuration file func applyConfigFileOptions(options map[string]string) { for key, value := range options { + // Skip port flags that were explicitly passed on CLI + if explicitPortFlags[key] { + glog.V(2).Infof("Skipping config file option %s=%s (explicitly specified on command line)", key, value) + continue + } // Set the flag value if it hasn't been explicitly set on command line flag := cmdMini.Flag.Lookup(key) if flag != nil { @@ -442,6 +674,14 @@ func saveMiniConfiguration(dataFolder string) error { func runMini(cmd *Command, args []string) bool { + // Capture which port flags were explicitly passed on CLI BEFORE config file is applied + // This is necessary to distinguish user-specified ports from defaults or config file options + explicitPortFlags = make(map[string]bool) + portFlagNames := []string{"master.port", "filer.port", "volume.port", "s3.port", "webdav.port", "admin.port"} + for _, flagName := range portFlagNames { + explicitPortFlags[flagName] = isFlagPassed(flagName) + } + // Load configuration from file if it exists configOptions, err := loadMiniConfigurationFile(*miniDataFolders) if err != nil { @@ -459,6 +699,15 @@ func runMini(cmd *Command, args []string) bool { grace.SetupProfiling(*miniOptions.cpuprofile, *miniOptions.memprofile) + // Determine bind IP + bindIp := getBindIp() + + // Ensure all ports are available, find alternatives if needed + if err := ensureAllPortsAvailableOnIP(bindIp); err != nil { + glog.Errorf("Port allocation failed: %v", err) + os.Exit(1) + } + // Set master.peers to "none" if not specified (single master mode) if *miniMasterOptions.peers == "" { *miniMasterOptions.peers = "none" @@ -552,13 +801,16 @@ func runMini(cmd *Command, args []string) bool { // startMiniServices starts all 
mini services with proper dependency coordination func startMiniServices(miniWhiteList []string, allServicesReady chan struct{}) { + // Determine bind IP for health checks + bindIp := getBindIp() + // Start Master server (no dependencies) go startMiniService("Master", func() { startMaster(miniMasterOptions, miniWhiteList) }, *miniMasterOptions.port) // Wait for master to be ready - waitForServiceReady("Master", *miniMasterOptions.port) + waitForServiceReady("Master", *miniMasterOptions.port, bindIp) // Start Volume server (depends on master) go startMiniService("Volume", func() { @@ -567,7 +819,7 @@ func startMiniServices(miniWhiteList []string, allServicesReady chan struct{}) { }, *miniOptions.v.port) // Wait for volume to be ready - waitForServiceReady("Volume", *miniOptions.v.port) + waitForServiceReady("Volume", *miniOptions.v.port, bindIp) // Start Filer (depends on master and volume) go startMiniService("Filer", func() { @@ -575,7 +827,7 @@ func startMiniServices(miniWhiteList []string, allServicesReady chan struct{}) { }, *miniFilerOptions.port) // Wait for filer to be ready - waitForServiceReady("Filer", *miniFilerOptions.port) + waitForServiceReady("Filer", *miniFilerOptions.port, bindIp) // Start S3 and WebDAV in parallel (both depend on filer) go startMiniService("S3", func() { @@ -587,8 +839,8 @@ func startMiniServices(miniWhiteList []string, allServicesReady chan struct{}) { }, *miniWebDavOptions.port) // Wait for both S3 and WebDAV to be ready - waitForServiceReady("S3", *miniS3Options.port) - waitForServiceReady("WebDAV", *miniWebDavOptions.port) + waitForServiceReady("S3", *miniS3Options.port, bindIp) + waitForServiceReady("WebDAV", *miniWebDavOptions.port, bindIp) // Start Admin with worker (depends on master, filer, S3, WebDAV) go startMiniAdminWithWorker(allServicesReady) @@ -601,8 +853,8 @@ func startMiniService(name string, fn func(), port int) { } // waitForServiceReady pings the service HTTP endpoint to check if it's ready to accept 
connections -func waitForServiceReady(name string, port int) { - address := fmt.Sprintf("http://127.0.0.1:%d", port) +func waitForServiceReady(name string, port int, bindIp string) { + address := fmt.Sprintf("http://%s:%d", bindIp, port) maxAttempts := 30 // 30 * 200ms = 6 seconds max wait attempt := 0 client := &http.Client{ @@ -679,8 +931,29 @@ func startMiniAdminWithWorker(allServicesReady chan struct{}) { // Set admin options *miniAdminOptions.master = masterAddr + + // gRPC port should have been initialized by ensureAllPortsAvailableOnIP in runMini + // If it's still 0, that indicates a problem with the port initialization sequence + // This defensive fallback handles edge cases where port initialization may have been skipped + // or failed silently (e.g., due to configuration changes or error handling paths) if *miniAdminOptions.grpcPort == 0 { - *miniAdminOptions.grpcPort = *miniAdminOptions.port + 10000 + glog.Warningf("Admin gRPC port was not initialized before startAdminServer, attempting fallback initialization...") + // Use the same availability checking logic as initializeGrpcPortsOnIP + calculatedPort := *miniAdminOptions.port + GrpcPortOffset + if !isPortOpenOnIP(getBindIp(), calculatedPort) || !isPortAvailable(calculatedPort) { + glog.Warningf("Calculated fallback gRPC port %d is not available, finding alternative...", calculatedPort) + newPort := findAvailablePortOnIP(getBindIp(), calculatedPort+1, 100, make(map[int]bool)) + if newPort == 0 { + glog.Errorf("Could not find available gRPC port for Admin starting from %d, will use calculated %d and fail on binding", calculatedPort+1, calculatedPort) + *miniAdminOptions.grpcPort = calculatedPort + } else { + glog.Infof("Fallback: using gRPC port %d for Admin", newPort) + *miniAdminOptions.grpcPort = newPort + } + } else { + *miniAdminOptions.grpcPort = calculatedPort + glog.Infof("Fallback: Admin gRPC port initialized to %d", calculatedPort) + } } // Create data directory if specified diff --git 
a/weed/worker/client.go b/weed/worker/client.go index a080e58cf..d562b8703 100644 --- a/weed/worker/client.go +++ b/weed/worker/client.go @@ -522,7 +522,8 @@ func (c *GrpcAdminClient) sendRegistration(worker *types.WorkerData) error { } // Wait for registration response - timeout := time.NewTimer(10 * time.Second) + // Use longer timeout for reconnections since admin server might be busy + timeout := time.NewTimer(30 * time.Second) defer timeout.Stop() for {