You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							633 lines
						
					
					
						
							18 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							633 lines
						
					
					
						
							18 KiB
						
					
					
				| package dash | |
| 
 | |
| import ( | |
| 	"context" | |
| 	"fmt" | |
| 	"io" | |
| 	"net" | |
| 	"sync" | |
| 	"time" | |
| 
 | |
| 	"github.com/seaweedfs/seaweedfs/weed/glog" | |
| 	"github.com/seaweedfs/seaweedfs/weed/pb" | |
| 	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" | |
| 	"github.com/seaweedfs/seaweedfs/weed/security" | |
| 	"github.com/seaweedfs/seaweedfs/weed/util" | |
| 	"google.golang.org/grpc" | |
| 	"google.golang.org/grpc/peer" | |
| ) | |
| 
 | |
| // WorkerGrpcServer implements the WorkerService gRPC interface | |
| type WorkerGrpcServer struct { | |
| 	worker_pb.UnimplementedWorkerServiceServer | |
| 	adminServer *AdminServer | |
| 
 | |
| 	// Worker connection management | |
| 	connections map[string]*WorkerConnection | |
| 	connMutex   sync.RWMutex | |
| 
 | |
| 	// Log request correlation | |
| 	pendingLogRequests map[string]*LogRequestContext | |
| 	logRequestsMutex   sync.RWMutex | |
| 
 | |
| 	// gRPC server | |
| 	grpcServer *grpc.Server | |
| 	listener   net.Listener | |
| 	running    bool | |
| 	stopChan   chan struct{} | |
| } | |
| 
 | |
| // LogRequestContext tracks pending log requests | |
| type LogRequestContext struct { | |
| 	TaskID     string | |
| 	WorkerID   string | |
| 	ResponseCh chan *worker_pb.TaskLogResponse | |
| 	Timeout    time.Time | |
| } | |
| 
 | |
| // WorkerConnection represents an active worker connection | |
| type WorkerConnection struct { | |
| 	workerID      string | |
| 	stream        worker_pb.WorkerService_WorkerStreamServer | |
| 	lastSeen      time.Time | |
| 	capabilities  []MaintenanceTaskType | |
| 	address       string | |
| 	maxConcurrent int32 | |
| 	outgoing      chan *worker_pb.AdminMessage | |
| 	ctx           context.Context | |
| 	cancel        context.CancelFunc | |
| } | |
| 
 | |
| // NewWorkerGrpcServer creates a new gRPC server for worker connections | |
| func NewWorkerGrpcServer(adminServer *AdminServer) *WorkerGrpcServer { | |
| 	return &WorkerGrpcServer{ | |
| 		adminServer:        adminServer, | |
| 		connections:        make(map[string]*WorkerConnection), | |
| 		pendingLogRequests: make(map[string]*LogRequestContext), | |
| 		stopChan:           make(chan struct{}), | |
| 	} | |
| } | |
| 
 | |
| // StartWithTLS starts the gRPC server on the specified port with optional TLS | |
| func (s *WorkerGrpcServer) StartWithTLS(port int) error { | |
| 	if s.running { | |
| 		return fmt.Errorf("worker gRPC server is already running") | |
| 	} | |
| 
 | |
| 	// Create listener | |
| 	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) | |
| 	if err != nil { | |
| 		return fmt.Errorf("failed to listen on port %d: %v", port, err) | |
| 	} | |
| 
 | |
| 	// Create gRPC server with optional TLS | |
| 	grpcServer := pb.NewGrpcServer(security.LoadServerTLS(util.GetViper(), "grpc.admin")) | |
| 
 | |
| 	worker_pb.RegisterWorkerServiceServer(grpcServer, s) | |
| 
 | |
| 	s.grpcServer = grpcServer | |
| 	s.listener = listener | |
| 	s.running = true | |
| 
 | |
| 	// Start cleanup routine | |
| 	go s.cleanupRoutine() | |
| 
 | |
| 	// Start serving in a goroutine | |
| 	go func() { | |
| 		if err := s.grpcServer.Serve(listener); err != nil { | |
| 			if s.running { | |
| 				glog.Errorf("Worker gRPC server error: %v", err) | |
| 			} | |
| 		} | |
| 	}() | |
| 
 | |
| 	return nil | |
| } | |
| 
 | |
| // Stop stops the gRPC server | |
| func (s *WorkerGrpcServer) Stop() error { | |
| 	if !s.running { | |
| 		return nil | |
| 	} | |
| 
 | |
| 	s.running = false | |
| 	close(s.stopChan) | |
| 
 | |
| 	// Close all worker connections | |
| 	s.connMutex.Lock() | |
| 	for _, conn := range s.connections { | |
| 		conn.cancel() | |
| 		close(conn.outgoing) | |
| 	} | |
| 	s.connections = make(map[string]*WorkerConnection) | |
| 	s.connMutex.Unlock() | |
| 
 | |
| 	// Stop gRPC server | |
| 	if s.grpcServer != nil { | |
| 		s.grpcServer.GracefulStop() | |
| 	} | |
| 
 | |
| 	// Close listener | |
| 	if s.listener != nil { | |
| 		s.listener.Close() | |
| 	} | |
| 
 | |
| 	glog.Infof("Worker gRPC server stopped") | |
| 	return nil | |
| } | |
| 
 | |
| // WorkerStream handles bidirectional communication with workers | |
| func (s *WorkerGrpcServer) WorkerStream(stream worker_pb.WorkerService_WorkerStreamServer) error { | |
| 	ctx := stream.Context() | |
| 
 | |
| 	// get client address | |
| 	address := findClientAddress(ctx) | |
| 
 | |
| 	// Wait for initial registration message | |
| 	msg, err := stream.Recv() | |
| 	if err != nil { | |
| 		return fmt.Errorf("failed to receive registration message: %w", err) | |
| 	} | |
| 
 | |
| 	registration := msg.GetRegistration() | |
| 	if registration == nil { | |
| 		return fmt.Errorf("first message must be registration") | |
| 	} | |
| 	registration.Address = address | |
| 
 | |
| 	workerID := registration.WorkerId | |
| 	if workerID == "" { | |
| 		return fmt.Errorf("worker ID cannot be empty") | |
| 	} | |
| 
 | |
| 	glog.Infof("Worker %s connecting from %s", workerID, registration.Address) | |
| 
 | |
| 	// Create worker connection | |
| 	connCtx, connCancel := context.WithCancel(ctx) | |
| 	conn := &WorkerConnection{ | |
| 		workerID:      workerID, | |
| 		stream:        stream, | |
| 		lastSeen:      time.Now(), | |
| 		address:       registration.Address, | |
| 		maxConcurrent: registration.MaxConcurrent, | |
| 		outgoing:      make(chan *worker_pb.AdminMessage, 100), | |
| 		ctx:           connCtx, | |
| 		cancel:        connCancel, | |
| 	} | |
| 
 | |
| 	// Convert capabilities | |
| 	capabilities := make([]MaintenanceTaskType, len(registration.Capabilities)) | |
| 	for i, cap := range registration.Capabilities { | |
| 		capabilities[i] = MaintenanceTaskType(cap) | |
| 	} | |
| 	conn.capabilities = capabilities | |
| 
 | |
| 	// Register connection | |
| 	s.connMutex.Lock() | |
| 	s.connections[workerID] = conn | |
| 	s.connMutex.Unlock() | |
| 
 | |
| 	// Register worker with maintenance manager | |
| 	s.registerWorkerWithManager(conn) | |
| 
 | |
| 	// Send registration response | |
| 	regResponse := &worker_pb.AdminMessage{ | |
| 		Timestamp: time.Now().Unix(), | |
| 		Message: &worker_pb.AdminMessage_RegistrationResponse{ | |
| 			RegistrationResponse: &worker_pb.RegistrationResponse{ | |
| 				Success: true, | |
| 				Message: "Worker registered successfully", | |
| 			}, | |
| 		}, | |
| 	} | |
| 
 | |
| 	select { | |
| 	case conn.outgoing <- regResponse: | |
| 	case <-time.After(5 * time.Second): | |
| 		glog.Errorf("Failed to send registration response to worker %s", workerID) | |
| 	} | |
| 
 | |
| 	// Start outgoing message handler | |
| 	go s.handleOutgoingMessages(conn) | |
| 
 | |
| 	// Handle incoming messages | |
| 	for { | |
| 		select { | |
| 		case <-ctx.Done(): | |
| 			glog.Infof("Worker %s connection closed: %v", workerID, ctx.Err()) | |
| 			s.unregisterWorker(workerID) | |
| 			return nil | |
| 		case <-connCtx.Done(): | |
| 			glog.Infof("Worker %s connection cancelled", workerID) | |
| 			s.unregisterWorker(workerID) | |
| 			return nil | |
| 		default: | |
| 		} | |
| 
 | |
| 		msg, err := stream.Recv() | |
| 		if err != nil { | |
| 			if err == io.EOF { | |
| 				glog.Infof("Worker %s disconnected", workerID) | |
| 			} else { | |
| 				glog.Errorf("Error receiving from worker %s: %v", workerID, err) | |
| 			} | |
| 			s.unregisterWorker(workerID) | |
| 			return err | |
| 		} | |
| 
 | |
| 		conn.lastSeen = time.Now() | |
| 		s.handleWorkerMessage(conn, msg) | |
| 	} | |
| } | |
| 
 | |
| // handleOutgoingMessages sends messages to worker | |
| func (s *WorkerGrpcServer) handleOutgoingMessages(conn *WorkerConnection) { | |
| 	for { | |
| 		select { | |
| 		case <-conn.ctx.Done(): | |
| 			return | |
| 		case msg, ok := <-conn.outgoing: | |
| 			if !ok { | |
| 				return | |
| 			} | |
| 
 | |
| 			if err := conn.stream.Send(msg); err != nil { | |
| 				glog.Errorf("Failed to send message to worker %s: %v", conn.workerID, err) | |
| 				conn.cancel() | |
| 				return | |
| 			} | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // handleWorkerMessage processes incoming messages from workers | |
| func (s *WorkerGrpcServer) handleWorkerMessage(conn *WorkerConnection, msg *worker_pb.WorkerMessage) { | |
| 	workerID := conn.workerID | |
| 
 | |
| 	switch m := msg.Message.(type) { | |
| 	case *worker_pb.WorkerMessage_Heartbeat: | |
| 		s.handleHeartbeat(conn, m.Heartbeat) | |
| 
 | |
| 	case *worker_pb.WorkerMessage_TaskRequest: | |
| 		s.handleTaskRequest(conn, m.TaskRequest) | |
| 
 | |
| 	case *worker_pb.WorkerMessage_TaskUpdate: | |
| 		s.handleTaskUpdate(conn, m.TaskUpdate) | |
| 
 | |
| 	case *worker_pb.WorkerMessage_TaskComplete: | |
| 		s.handleTaskCompletion(conn, m.TaskComplete) | |
| 
 | |
| 	case *worker_pb.WorkerMessage_TaskLogResponse: | |
| 		s.handleTaskLogResponse(conn, m.TaskLogResponse) | |
| 
 | |
| 	case *worker_pb.WorkerMessage_Shutdown: | |
| 		glog.Infof("Worker %s shutting down: %s", workerID, m.Shutdown.Reason) | |
| 		s.unregisterWorker(workerID) | |
| 
 | |
| 	default: | |
| 		glog.Warningf("Unknown message type from worker %s", workerID) | |
| 	} | |
| } | |
| 
 | |
| // registerWorkerWithManager registers the worker with the maintenance manager | |
| func (s *WorkerGrpcServer) registerWorkerWithManager(conn *WorkerConnection) { | |
| 	if s.adminServer.maintenanceManager == nil { | |
| 		return | |
| 	} | |
| 
 | |
| 	worker := &MaintenanceWorker{ | |
| 		ID:            conn.workerID, | |
| 		Address:       conn.address, | |
| 		LastHeartbeat: time.Now(), | |
| 		Status:        "active", | |
| 		Capabilities:  conn.capabilities, | |
| 		MaxConcurrent: int(conn.maxConcurrent), | |
| 		CurrentLoad:   0, | |
| 	} | |
| 
 | |
| 	s.adminServer.maintenanceManager.RegisterWorker(worker) | |
| 	glog.V(1).Infof("Registered worker %s with maintenance manager", conn.workerID) | |
| } | |
| 
 | |
| // handleHeartbeat processes heartbeat messages | |
| func (s *WorkerGrpcServer) handleHeartbeat(conn *WorkerConnection, heartbeat *worker_pb.WorkerHeartbeat) { | |
| 	if s.adminServer.maintenanceManager != nil { | |
| 		s.adminServer.maintenanceManager.UpdateWorkerHeartbeat(conn.workerID) | |
| 	} | |
| 
 | |
| 	// Send heartbeat response | |
| 	response := &worker_pb.AdminMessage{ | |
| 		Timestamp: time.Now().Unix(), | |
| 		Message: &worker_pb.AdminMessage_HeartbeatResponse{ | |
| 			HeartbeatResponse: &worker_pb.HeartbeatResponse{ | |
| 				Success: true, | |
| 				Message: "Heartbeat acknowledged", | |
| 			}, | |
| 		}, | |
| 	} | |
| 
 | |
| 	select { | |
| 	case conn.outgoing <- response: | |
| 	case <-time.After(time.Second): | |
| 		glog.Warningf("Failed to send heartbeat response to worker %s", conn.workerID) | |
| 	} | |
| } | |
| 
 | |
| // handleTaskRequest processes task requests from workers | |
| func (s *WorkerGrpcServer) handleTaskRequest(conn *WorkerConnection, request *worker_pb.TaskRequest) { | |
| 	// glog.Infof("DEBUG handleTaskRequest: Worker %s requesting tasks with capabilities %v", conn.workerID, conn.capabilities) | |
|  | |
| 	if s.adminServer.maintenanceManager == nil { | |
| 		glog.Infof("DEBUG handleTaskRequest: maintenance manager is nil") | |
| 		return | |
| 	} | |
| 
 | |
| 	// Get next task from maintenance manager | |
| 	task := s.adminServer.maintenanceManager.GetNextTask(conn.workerID, conn.capabilities) | |
| 	// glog.Infof("DEBUG handleTaskRequest: GetNextTask returned task: %v", task != nil) | |
|  | |
| 	if task != nil { | |
| 		glog.Infof("DEBUG handleTaskRequest: Assigning task %s (type: %s) to worker %s", task.ID, task.Type, conn.workerID) | |
| 
 | |
| 		// Use typed params directly - master client should already be configured in the params | |
| 		var taskParams *worker_pb.TaskParams | |
| 		if task.TypedParams != nil { | |
| 			taskParams = task.TypedParams | |
| 		} else { | |
| 			// Create basic params if none exist | |
| 			taskParams = &worker_pb.TaskParams{ | |
| 				VolumeId:   task.VolumeID, | |
| 				Collection: task.Collection, | |
| 				Sources: []*worker_pb.TaskSource{ | |
| 					{ | |
| 						Node:     task.Server, | |
| 						VolumeId: task.VolumeID, | |
| 					}, | |
| 				}, | |
| 			} | |
| 		} | |
| 
 | |
| 		// Send task assignment | |
| 		assignment := &worker_pb.AdminMessage{ | |
| 			Timestamp: time.Now().Unix(), | |
| 			Message: &worker_pb.AdminMessage_TaskAssignment{ | |
| 				TaskAssignment: &worker_pb.TaskAssignment{ | |
| 					TaskId:      task.ID, | |
| 					TaskType:    string(task.Type), | |
| 					Params:      taskParams, | |
| 					Priority:    int32(task.Priority), | |
| 					CreatedTime: time.Now().Unix(), | |
| 				}, | |
| 			}, | |
| 		} | |
| 
 | |
| 		select { | |
| 		case conn.outgoing <- assignment: | |
| 			glog.Infof("DEBUG handleTaskRequest: Successfully assigned task %s to worker %s", task.ID, conn.workerID) | |
| 		case <-time.After(time.Second): | |
| 			glog.Warningf("Failed to send task assignment to worker %s", conn.workerID) | |
| 		} | |
| 	} else { | |
| 		// glog.Infof("DEBUG handleTaskRequest: No tasks available for worker %s", conn.workerID) | |
| 	} | |
| } | |
| 
 | |
| // handleTaskUpdate processes task progress updates | |
| func (s *WorkerGrpcServer) handleTaskUpdate(conn *WorkerConnection, update *worker_pb.TaskUpdate) { | |
| 	if s.adminServer.maintenanceManager != nil { | |
| 		s.adminServer.maintenanceManager.UpdateTaskProgress(update.TaskId, float64(update.Progress)) | |
| 		glog.V(3).Infof("Updated task %s progress: %.1f%%", update.TaskId, update.Progress) | |
| 	} | |
| } | |
| 
 | |
| // handleTaskCompletion processes task completion notifications | |
| func (s *WorkerGrpcServer) handleTaskCompletion(conn *WorkerConnection, completion *worker_pb.TaskComplete) { | |
| 	if s.adminServer.maintenanceManager != nil { | |
| 		errorMsg := "" | |
| 		if !completion.Success { | |
| 			errorMsg = completion.ErrorMessage | |
| 		} | |
| 		s.adminServer.maintenanceManager.CompleteTask(completion.TaskId, errorMsg) | |
| 
 | |
| 		if completion.Success { | |
| 			glog.V(1).Infof("Worker %s completed task %s successfully", conn.workerID, completion.TaskId) | |
| 		} else { | |
| 			glog.Errorf("Worker %s failed task %s: %s", conn.workerID, completion.TaskId, completion.ErrorMessage) | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // handleTaskLogResponse processes task log responses from workers | |
| func (s *WorkerGrpcServer) handleTaskLogResponse(conn *WorkerConnection, response *worker_pb.TaskLogResponse) { | |
| 	requestKey := fmt.Sprintf("%s:%s", response.WorkerId, response.TaskId) | |
| 
 | |
| 	s.logRequestsMutex.RLock() | |
| 	requestContext, exists := s.pendingLogRequests[requestKey] | |
| 	s.logRequestsMutex.RUnlock() | |
| 
 | |
| 	if !exists { | |
| 		glog.Warningf("Received unexpected log response for task %s from worker %s", response.TaskId, response.WorkerId) | |
| 		return | |
| 	} | |
| 
 | |
| 	glog.V(1).Infof("Received log response for task %s from worker %s: %d entries", response.TaskId, response.WorkerId, len(response.LogEntries)) | |
| 
 | |
| 	// Send response to waiting channel | |
| 	select { | |
| 	case requestContext.ResponseCh <- response: | |
| 		// Response delivered successfully | |
| 	case <-time.After(time.Second): | |
| 		glog.Warningf("Failed to deliver log response for task %s from worker %s: timeout", response.TaskId, response.WorkerId) | |
| 	} | |
| 
 | |
| 	// Clean up the pending request | |
| 	s.logRequestsMutex.Lock() | |
| 	delete(s.pendingLogRequests, requestKey) | |
| 	s.logRequestsMutex.Unlock() | |
| } | |
| 
 | |
| // unregisterWorker removes a worker connection | |
| func (s *WorkerGrpcServer) unregisterWorker(workerID string) { | |
| 	s.connMutex.Lock() | |
| 	if conn, exists := s.connections[workerID]; exists { | |
| 		conn.cancel() | |
| 		close(conn.outgoing) | |
| 		delete(s.connections, workerID) | |
| 	} | |
| 	s.connMutex.Unlock() | |
| 
 | |
| 	glog.V(1).Infof("Unregistered worker %s", workerID) | |
| } | |
| 
 | |
| // cleanupRoutine periodically cleans up stale connections | |
| func (s *WorkerGrpcServer) cleanupRoutine() { | |
| 	ticker := time.NewTicker(30 * time.Second) | |
| 	defer ticker.Stop() | |
| 
 | |
| 	for { | |
| 		select { | |
| 		case <-s.stopChan: | |
| 			return | |
| 		case <-ticker.C: | |
| 			s.cleanupStaleConnections() | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // cleanupStaleConnections removes connections that haven't been seen recently | |
| func (s *WorkerGrpcServer) cleanupStaleConnections() { | |
| 	cutoff := time.Now().Add(-2 * time.Minute) | |
| 
 | |
| 	s.connMutex.Lock() | |
| 	defer s.connMutex.Unlock() | |
| 
 | |
| 	for workerID, conn := range s.connections { | |
| 		if conn.lastSeen.Before(cutoff) { | |
| 			glog.Warningf("Cleaning up stale worker connection: %s", workerID) | |
| 			conn.cancel() | |
| 			close(conn.outgoing) | |
| 			delete(s.connections, workerID) | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // GetConnectedWorkers returns a list of currently connected workers | |
| func (s *WorkerGrpcServer) GetConnectedWorkers() []string { | |
| 	s.connMutex.RLock() | |
| 	defer s.connMutex.RUnlock() | |
| 
 | |
| 	workers := make([]string, 0, len(s.connections)) | |
| 	for workerID := range s.connections { | |
| 		workers = append(workers, workerID) | |
| 	} | |
| 	return workers | |
| } | |
| 
 | |
| // RequestTaskLogs requests execution logs from a worker for a specific task | |
| func (s *WorkerGrpcServer) RequestTaskLogs(workerID, taskID string, maxEntries int32, logLevel string) ([]*worker_pb.TaskLogEntry, error) { | |
| 	s.connMutex.RLock() | |
| 	conn, exists := s.connections[workerID] | |
| 	s.connMutex.RUnlock() | |
| 
 | |
| 	if !exists { | |
| 		return nil, fmt.Errorf("worker %s is not connected", workerID) | |
| 	} | |
| 
 | |
| 	// Create response channel for this request | |
| 	responseCh := make(chan *worker_pb.TaskLogResponse, 1) | |
| 	requestKey := fmt.Sprintf("%s:%s", workerID, taskID) | |
| 
 | |
| 	// Register pending request | |
| 	requestContext := &LogRequestContext{ | |
| 		TaskID:     taskID, | |
| 		WorkerID:   workerID, | |
| 		ResponseCh: responseCh, | |
| 		Timeout:    time.Now().Add(10 * time.Second), | |
| 	} | |
| 
 | |
| 	s.logRequestsMutex.Lock() | |
| 	s.pendingLogRequests[requestKey] = requestContext | |
| 	s.logRequestsMutex.Unlock() | |
| 
 | |
| 	// Create log request message | |
| 	logRequest := &worker_pb.AdminMessage{ | |
| 		AdminId:   "admin-server", | |
| 		Timestamp: time.Now().Unix(), | |
| 		Message: &worker_pb.AdminMessage_TaskLogRequest{ | |
| 			TaskLogRequest: &worker_pb.TaskLogRequest{ | |
| 				TaskId:          taskID, | |
| 				WorkerId:        workerID, | |
| 				IncludeMetadata: true, | |
| 				MaxEntries:      maxEntries, | |
| 				LogLevel:        logLevel, | |
| 			}, | |
| 		}, | |
| 	} | |
| 
 | |
| 	// Send the request through the worker's outgoing channel | |
| 	select { | |
| 	case conn.outgoing <- logRequest: | |
| 		glog.V(1).Infof("Log request sent to worker %s for task %s", workerID, taskID) | |
| 	case <-time.After(5 * time.Second): | |
| 		// Clean up pending request on timeout | |
| 		s.logRequestsMutex.Lock() | |
| 		delete(s.pendingLogRequests, requestKey) | |
| 		s.logRequestsMutex.Unlock() | |
| 		return nil, fmt.Errorf("timeout sending log request to worker %s", workerID) | |
| 	} | |
| 
 | |
| 	// Wait for response | |
| 	select { | |
| 	case response := <-responseCh: | |
| 		if !response.Success { | |
| 			return nil, fmt.Errorf("worker log request failed: %s", response.ErrorMessage) | |
| 		} | |
| 		glog.V(1).Infof("Received %d log entries for task %s from worker %s", len(response.LogEntries), taskID, workerID) | |
| 		return response.LogEntries, nil | |
| 	case <-time.After(10 * time.Second): | |
| 		// Clean up pending request on timeout | |
| 		s.logRequestsMutex.Lock() | |
| 		delete(s.pendingLogRequests, requestKey) | |
| 		s.logRequestsMutex.Unlock() | |
| 		return nil, fmt.Errorf("timeout waiting for log response from worker %s", workerID) | |
| 	} | |
| } | |
| 
 | |
| // RequestTaskLogsFromAllWorkers requests logs for a task from all connected workers | |
| func (s *WorkerGrpcServer) RequestTaskLogsFromAllWorkers(taskID string, maxEntries int32, logLevel string) (map[string][]*worker_pb.TaskLogEntry, error) { | |
| 	s.connMutex.RLock() | |
| 	workerIDs := make([]string, 0, len(s.connections)) | |
| 	for workerID := range s.connections { | |
| 		workerIDs = append(workerIDs, workerID) | |
| 	} | |
| 	s.connMutex.RUnlock() | |
| 
 | |
| 	results := make(map[string][]*worker_pb.TaskLogEntry) | |
| 
 | |
| 	for _, workerID := range workerIDs { | |
| 		logs, err := s.RequestTaskLogs(workerID, taskID, maxEntries, logLevel) | |
| 		if err != nil { | |
| 			glog.V(1).Infof("Failed to get logs from worker %s for task %s: %v", workerID, taskID, err) | |
| 			// Store empty result with error information for debugging | |
| 			results[workerID+"_error"] = []*worker_pb.TaskLogEntry{ | |
| 				{ | |
| 					Timestamp: time.Now().Unix(), | |
| 					Level:     "ERROR", | |
| 					Message:   fmt.Sprintf("Failed to retrieve logs from worker %s: %v", workerID, err), | |
| 					Fields:    map[string]string{"source": "admin"}, | |
| 				}, | |
| 			} | |
| 			continue | |
| 		} | |
| 		if len(logs) > 0 { | |
| 			results[workerID] = logs | |
| 		} else { | |
| 			glog.V(2).Infof("No logs found for task %s on worker %s", taskID, workerID) | |
| 		} | |
| 	} | |
| 
 | |
| 	return results, nil | |
| } | |
| 
 | |
// convertTaskParameters returns a new string-valued map holding every key of
// params with its value rendered in fmt's default (%v) format. A nil or empty
// input yields an empty, non-nil map.
func convertTaskParameters(params map[string]interface{}) map[string]string {
	converted := make(map[string]string, len(params))
	for name := range params {
		converted[name] = fmt.Sprint(params[name])
	}
	return converted
}
| 
 | |
| func findClientAddress(ctx context.Context) string { | |
| 	// fmt.Printf("FromContext %+v\n", ctx) | |
| 	pr, ok := peer.FromContext(ctx) | |
| 	if !ok { | |
| 		glog.Error("failed to get peer from ctx") | |
| 		return "" | |
| 	} | |
| 	if pr.Addr == net.Addr(nil) { | |
| 		glog.Error("failed to get peer address") | |
| 		return "" | |
| 	} | |
| 	return pr.Addr.String() | |
| }
 |