You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
488 lines
13 KiB
488 lines
13 KiB
package task
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/worker/types"
|
|
"google.golang.org/grpc"
|
|
"google.golang.org/grpc/credentials/insecure"
|
|
)
|
|
|
|
// WorkerConnection manages the gRPC connection to a single worker
|
|
type WorkerConnection struct {
|
|
workerID string
|
|
address string
|
|
conn *grpc.ClientConn
|
|
client worker_pb.WorkerServiceClient
|
|
stream worker_pb.WorkerService_WorkerStreamClient
|
|
lastSeen time.Time
|
|
mutex sync.RWMutex
|
|
adminServer *AdminServer
|
|
stopCh chan struct{}
|
|
active bool
|
|
}
|
|
|
|
// WorkerCommunicationManager manages all worker connections
|
|
type WorkerCommunicationManager struct {
|
|
adminServer *AdminServer
|
|
connections map[string]*WorkerConnection
|
|
mutex sync.RWMutex
|
|
stopCh chan struct{}
|
|
}
|
|
|
|
// NewWorkerCommunicationManager creates a new worker communication manager
|
|
func NewWorkerCommunicationManager(adminServer *AdminServer) *WorkerCommunicationManager {
|
|
return &WorkerCommunicationManager{
|
|
adminServer: adminServer,
|
|
connections: make(map[string]*WorkerConnection),
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// Start starts the worker communication manager
|
|
func (wcm *WorkerCommunicationManager) Start() {
|
|
glog.Infof("Starting worker communication manager")
|
|
|
|
go wcm.connectionMonitorLoop()
|
|
}
|
|
|
|
// Stop stops the worker communication manager
|
|
func (wcm *WorkerCommunicationManager) Stop() {
|
|
glog.Infof("Stopping worker communication manager")
|
|
|
|
close(wcm.stopCh)
|
|
|
|
wcm.mutex.Lock()
|
|
defer wcm.mutex.Unlock()
|
|
|
|
for _, conn := range wcm.connections {
|
|
conn.Close()
|
|
}
|
|
}
|
|
|
|
// EstablishWorkerConnection establishes a connection to a worker
|
|
func (wcm *WorkerCommunicationManager) EstablishWorkerConnection(workerID, address string) error {
|
|
wcm.mutex.Lock()
|
|
defer wcm.mutex.Unlock()
|
|
|
|
// Check if already connected
|
|
if conn, exists := wcm.connections[workerID]; exists {
|
|
if conn.active {
|
|
return nil // Already connected
|
|
}
|
|
conn.Close() // Close inactive connection
|
|
}
|
|
|
|
// Create new connection
|
|
conn, err := NewWorkerConnection(workerID, address, wcm.adminServer)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create worker connection: %v", err)
|
|
}
|
|
|
|
wcm.connections[workerID] = conn
|
|
|
|
// Start connection
|
|
go conn.Start()
|
|
|
|
glog.Infof("Established connection to worker %s at %s", workerID, address)
|
|
return nil
|
|
}
|
|
|
|
// SendTaskAssignment sends a task assignment to a worker
|
|
func (wcm *WorkerCommunicationManager) SendTaskAssignment(workerID string, task *Task) error {
|
|
wcm.mutex.RLock()
|
|
conn, exists := wcm.connections[workerID]
|
|
wcm.mutex.RUnlock()
|
|
|
|
if !exists || !conn.active {
|
|
return fmt.Errorf("no active connection to worker %s", workerID)
|
|
}
|
|
|
|
return conn.SendTaskAssignment(task)
|
|
}
|
|
|
|
// CancelTask sends a task cancellation to a worker
|
|
func (wcm *WorkerCommunicationManager) CancelTask(workerID, taskID string, reason string) error {
|
|
wcm.mutex.RLock()
|
|
conn, exists := wcm.connections[workerID]
|
|
wcm.mutex.RUnlock()
|
|
|
|
if !exists || !conn.active {
|
|
return fmt.Errorf("no active connection to worker %s", workerID)
|
|
}
|
|
|
|
return conn.CancelTask(taskID, reason)
|
|
}
|
|
|
|
// GetActiveConnections returns the list of active worker connections
|
|
func (wcm *WorkerCommunicationManager) GetActiveConnections() []string {
|
|
wcm.mutex.RLock()
|
|
defer wcm.mutex.RUnlock()
|
|
|
|
var active []string
|
|
for workerID, conn := range wcm.connections {
|
|
if conn.active {
|
|
active = append(active, workerID)
|
|
}
|
|
}
|
|
|
|
return active
|
|
}
|
|
|
|
// connectionMonitorLoop monitors worker connections and cleans up inactive ones
|
|
func (wcm *WorkerCommunicationManager) connectionMonitorLoop() {
|
|
ticker := time.NewTicker(30 * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
wcm.cleanupInactiveConnections()
|
|
case <-wcm.stopCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanupInactiveConnections removes inactive worker connections
|
|
func (wcm *WorkerCommunicationManager) cleanupInactiveConnections() {
|
|
wcm.mutex.Lock()
|
|
defer wcm.mutex.Unlock()
|
|
|
|
now := time.Now()
|
|
timeout := 2 * time.Minute
|
|
|
|
for workerID, conn := range wcm.connections {
|
|
if !conn.active || now.Sub(conn.lastSeen) > timeout {
|
|
glog.Infof("Cleaning up inactive connection to worker %s", workerID)
|
|
conn.Close()
|
|
delete(wcm.connections, workerID)
|
|
|
|
// Mark worker as inactive in registry
|
|
wcm.adminServer.workerRegistry.MarkWorkerInactive(workerID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// NewWorkerConnection creates a new worker connection
|
|
func NewWorkerConnection(workerID, address string, adminServer *AdminServer) (*WorkerConnection, error) {
|
|
// Convert address to gRPC address
|
|
grpcAddress := pb.ServerToGrpcAddress(address)
|
|
conn, err := grpc.NewClient(grpcAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to connect to worker at %s: %v", address, err)
|
|
}
|
|
|
|
client := worker_pb.NewWorkerServiceClient(conn)
|
|
|
|
return &WorkerConnection{
|
|
workerID: workerID,
|
|
address: address,
|
|
conn: conn,
|
|
client: client,
|
|
lastSeen: time.Now(),
|
|
adminServer: adminServer,
|
|
stopCh: make(chan struct{}),
|
|
active: false,
|
|
}, nil
|
|
}
|
|
|
|
// Start starts the worker connection and message handling
|
|
func (wc *WorkerConnection) Start() {
|
|
defer wc.Close()
|
|
|
|
ctx := context.Background()
|
|
stream, err := wc.client.WorkerStream(ctx)
|
|
if err != nil {
|
|
glog.Errorf("Failed to create worker stream for %s: %v", wc.workerID, err)
|
|
return
|
|
}
|
|
|
|
wc.stream = stream
|
|
wc.active = true
|
|
|
|
glog.Infof("Worker connection %s started", wc.workerID)
|
|
|
|
// Start message handling goroutines
|
|
go wc.receiveMessages()
|
|
|
|
// Keep connection alive until stopped
|
|
<-wc.stopCh
|
|
}
|
|
|
|
// Close closes the worker connection
|
|
func (wc *WorkerConnection) Close() {
|
|
wc.mutex.Lock()
|
|
defer wc.mutex.Unlock()
|
|
|
|
if !wc.active {
|
|
return
|
|
}
|
|
|
|
wc.active = false
|
|
close(wc.stopCh)
|
|
|
|
if wc.stream != nil {
|
|
wc.stream.CloseSend()
|
|
}
|
|
|
|
if wc.conn != nil {
|
|
wc.conn.Close()
|
|
}
|
|
|
|
glog.Infof("Worker connection %s closed", wc.workerID)
|
|
}
|
|
|
|
// receiveMessages handles incoming messages from the worker
|
|
func (wc *WorkerConnection) receiveMessages() {
|
|
for {
|
|
select {
|
|
case <-wc.stopCh:
|
|
return
|
|
default:
|
|
}
|
|
|
|
msg, err := wc.stream.Recv()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
glog.Infof("Worker %s closed connection", wc.workerID)
|
|
} else {
|
|
glog.Errorf("Error receiving from worker %s: %v", wc.workerID, err)
|
|
}
|
|
wc.Close()
|
|
return
|
|
}
|
|
|
|
wc.updateLastSeen()
|
|
// Convert AdminMessage to WorkerMessage for processing
|
|
if workerMsg := convertToWorkerMessage(msg); workerMsg != nil {
|
|
wc.handleMessage(workerMsg)
|
|
}
|
|
}
|
|
}
|
|
|
|
// updateLastSeen updates the last seen timestamp
|
|
func (wc *WorkerConnection) updateLastSeen() {
|
|
wc.mutex.Lock()
|
|
defer wc.mutex.Unlock()
|
|
wc.lastSeen = time.Now()
|
|
}
|
|
|
|
// handleMessage processes a message from the worker
|
|
func (wc *WorkerConnection) handleMessage(msg *worker_pb.WorkerMessage) {
|
|
switch message := msg.Message.(type) {
|
|
case *worker_pb.WorkerMessage_Registration:
|
|
registration := message.Registration
|
|
worker := &Worker{
|
|
ID: registration.WorkerId,
|
|
Address: registration.Address,
|
|
Capabilities: registration.Capabilities,
|
|
}
|
|
wc.workerID = worker.ID
|
|
// UpdateWorkerStatus stub
|
|
if wc.adminServer.workerRegistry != nil {
|
|
// wc.adminServer.workerRegistry.UpdateWorkerStatus(worker) // Commented out - method doesn't exist
|
|
}
|
|
glog.Infof("Worker %s registered", worker.ID)
|
|
|
|
case *worker_pb.WorkerMessage_Heartbeat:
|
|
glog.V(3).Infof("Heartbeat from worker %s", wc.workerID)
|
|
|
|
case *worker_pb.WorkerMessage_TaskRequest:
|
|
glog.V(2).Infof("Task request from worker %s", wc.workerID)
|
|
// AssignTaskToWorker stub
|
|
// task := wc.adminServer.AssignTaskToWorker(wc.workerID) // Commented out - method doesn't exist
|
|
|
|
case *worker_pb.WorkerMessage_TaskUpdate:
|
|
update := message.TaskUpdate
|
|
// UpdateTaskProgress stub - fix signature
|
|
wc.adminServer.UpdateTaskProgress(update.TaskId, float64(update.Progress))
|
|
|
|
case *worker_pb.WorkerMessage_TaskComplete:
|
|
complete := message.TaskComplete
|
|
// CompleteTask stub - fix signature
|
|
wc.adminServer.CompleteTask(complete.TaskId, complete.Success, complete.ErrorMessage)
|
|
|
|
case *worker_pb.WorkerMessage_Shutdown:
|
|
glog.Infof("Worker %s shutting down", wc.workerID)
|
|
wc.Close()
|
|
}
|
|
}
|
|
|
|
// SendTaskAssignment sends a task assignment to the worker
|
|
func (wc *WorkerConnection) SendTaskAssignment(task *Task) error {
|
|
return wc.sendTaskAssignment(task)
|
|
}
|
|
|
|
// sendTaskAssignment sends a task assignment message
|
|
func (wc *WorkerConnection) sendTaskAssignment(task *types.Task) error {
|
|
// Fix type assertions for parameters
|
|
server, _ := task.Parameters["server"].(string)
|
|
collection, _ := task.Parameters["collection"].(string)
|
|
|
|
// Convert map[string]interface{} to map[string]string
|
|
parameters := make(map[string]string)
|
|
for k, v := range task.Parameters {
|
|
if str, ok := v.(string); ok {
|
|
parameters[k] = str
|
|
} else {
|
|
parameters[k] = fmt.Sprintf("%v", v)
|
|
}
|
|
}
|
|
|
|
// Add master_client parameter for tasks that need it (especially EC tasks)
|
|
if wc.adminServer.masterClient != nil {
|
|
if currentMaster := wc.adminServer.masterClient.GetMaster(context.Background()); currentMaster != "" {
|
|
parameters["master_client"] = string(currentMaster)
|
|
glog.V(2).Infof("Added master_client parameter to task %s: %s", task.ID, currentMaster)
|
|
} else {
|
|
glog.Warningf("No master address available for task %s", task.ID)
|
|
}
|
|
}
|
|
|
|
assignment := &worker_pb.TaskAssignment{
|
|
TaskId: task.ID,
|
|
TaskType: string(task.Type),
|
|
Priority: int32(task.Priority),
|
|
CreatedTime: task.CreatedAt.Unix(),
|
|
Params: &worker_pb.TaskParams{
|
|
VolumeId: task.VolumeID,
|
|
Server: server,
|
|
Collection: collection,
|
|
Parameters: parameters,
|
|
},
|
|
Metadata: map[string]string{
|
|
"assigned_at": time.Now().Format(time.RFC3339),
|
|
},
|
|
}
|
|
|
|
response := &worker_pb.AdminMessage{
|
|
AdminId: wc.adminServer.ID,
|
|
Timestamp: time.Now().Unix(),
|
|
Message: &worker_pb.AdminMessage_TaskAssignment{
|
|
TaskAssignment: assignment,
|
|
},
|
|
}
|
|
|
|
return wc.sendMessage(response)
|
|
}
|
|
|
|
// CancelTask sends a task cancellation to the worker
|
|
func (wc *WorkerConnection) CancelTask(taskID, reason string) error {
|
|
cancellation := &worker_pb.TaskCancellation{
|
|
TaskId: taskID,
|
|
Reason: reason,
|
|
Force: false,
|
|
}
|
|
|
|
response := &worker_pb.AdminMessage{
|
|
AdminId: wc.adminServer.ID,
|
|
Timestamp: time.Now().Unix(),
|
|
Message: &worker_pb.AdminMessage_TaskCancellation{
|
|
TaskCancellation: cancellation,
|
|
},
|
|
}
|
|
|
|
return wc.sendMessage(response)
|
|
}
|
|
|
|
// sendMessage sends a message to the worker
|
|
func (wc *WorkerConnection) sendMessage(msg *worker_pb.AdminMessage) error {
|
|
wc.mutex.RLock()
|
|
defer wc.mutex.RUnlock()
|
|
|
|
if !wc.active || wc.stream == nil {
|
|
return fmt.Errorf("connection to worker %s is not active", wc.workerID)
|
|
}
|
|
|
|
// The stream expects WorkerMessage from client (admin) to server (worker)
|
|
// Convert AdminMessage to appropriate WorkerMessage format
|
|
workerMsg := &worker_pb.WorkerMessage{
|
|
WorkerId: wc.workerID,
|
|
Timestamp: msg.Timestamp,
|
|
}
|
|
|
|
// Convert AdminMessage content to WorkerMessage based on message type
|
|
switch adminMsg := msg.Message.(type) {
|
|
case *worker_pb.AdminMessage_TaskAssignment:
|
|
// Task assignments should be sent as notifications to worker
|
|
// Since there's no direct equivalent, we'll create a generic message
|
|
// In a full implementation, this would need proper message type mapping
|
|
_ = adminMsg // Use the variable to avoid unused warning
|
|
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
|
|
Heartbeat: &worker_pb.WorkerHeartbeat{
|
|
WorkerId: wc.workerID,
|
|
Status: "task_assigned",
|
|
},
|
|
}
|
|
case *worker_pb.AdminMessage_TaskCancellation:
|
|
// Similar conversion for task cancellation
|
|
_ = adminMsg // Use the variable to avoid unused warning
|
|
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
|
|
Heartbeat: &worker_pb.WorkerHeartbeat{
|
|
WorkerId: wc.workerID,
|
|
Status: "task_cancelled",
|
|
},
|
|
}
|
|
default:
|
|
// For other message types, send a generic heartbeat
|
|
workerMsg.Message = &worker_pb.WorkerMessage_Heartbeat{
|
|
Heartbeat: &worker_pb.WorkerHeartbeat{
|
|
WorkerId: wc.workerID,
|
|
Status: "admin_message",
|
|
},
|
|
}
|
|
}
|
|
|
|
return wc.stream.Send(workerMsg)
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// convertCapabilities converts string capabilities to TaskType slice
|
|
func convertCapabilities(capabilities []string) []TaskType {
|
|
var result []TaskType
|
|
for _, cap := range capabilities {
|
|
result = append(result, TaskType(cap))
|
|
}
|
|
return result
|
|
}
|
|
|
|
// WorkerStatus represents worker status information
|
|
type WorkerStatus struct {
|
|
Status string
|
|
CurrentLoad int
|
|
MaxConcurrent int
|
|
CurrentTasks []string
|
|
TasksCompleted int
|
|
TasksFailed int
|
|
UptimeSeconds int64
|
|
LastSeen time.Time
|
|
}
|
|
|
|
// TaskProgress represents task progress information
|
|
type TaskProgress struct {
|
|
Progress float64
|
|
Message string
|
|
}
|
|
|
|
// TaskResult represents task completion result
|
|
type TaskResult struct {
|
|
Success bool
|
|
Error string
|
|
Message string
|
|
}
|
|
|
|
// convertToWorkerMessage converts AdminMessage to WorkerMessage (stub implementation)
|
|
func convertToWorkerMessage(msg *worker_pb.AdminMessage) *worker_pb.WorkerMessage {
|
|
// This is a stub - in real implementation would need proper conversion
|
|
// For now, return nil to avoid processing
|
|
return nil
|
|
}
|