
Merge 293e9a8168 into 099a351f3b

pull/7394/merge
Mariano Ntrougkas, 2 days ago (committed by GitHub)
commit 467a680414
  1. weed/command/worker.go (83 changed lines)
  2. weed/worker/client.go (637 changed lines)
  3. weed/worker/worker.go (633 changed lines)

weed/command/worker.go (83 changed lines)

@@ -1,9 +1,9 @@
package command
import (
"fmt"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
@@ -69,43 +69,6 @@ func runWorker(cmd *Command, args []string) bool {
glog.Fatalf("No valid capabilities specified")
return false
}
// Set working directory and create task-specific subdirectories
var baseWorkingDir string
if *workerWorkingDir != "" {
glog.Infof("Setting working directory to: %s", *workerWorkingDir)
if err := os.Chdir(*workerWorkingDir); err != nil {
glog.Fatalf("Failed to change working directory: %v", err)
return false
}
wd, err := os.Getwd()
if err != nil {
glog.Fatalf("Failed to get working directory: %v", err)
return false
}
baseWorkingDir = wd
glog.Infof("Current working directory: %s", baseWorkingDir)
} else {
// Use default working directory when not specified
wd, err := os.Getwd()
if err != nil {
glog.Fatalf("Failed to get current working directory: %v", err)
return false
}
baseWorkingDir = wd
glog.Infof("Using current working directory: %s", baseWorkingDir)
}
// Create task-specific subdirectories
for _, capability := range capabilities {
taskDir := filepath.Join(baseWorkingDir, string(capability))
if err := os.MkdirAll(taskDir, 0755); err != nil {
glog.Fatalf("Failed to create task directory %s: %v", taskDir, err)
return false
}
glog.Infof("Created task directory: %s", taskDir)
}
// Create gRPC dial option using TLS configuration
grpcDialOption := security.LoadClientTLS(util.GetViper(), "grpc.worker")
@@ -116,45 +79,30 @@ func runWorker(cmd *Command, args []string) bool {
MaxConcurrent: *workerMaxConcurrent,
HeartbeatInterval: *workerHeartbeatInterval,
TaskRequestInterval: *workerTaskRequestInterval,
BaseWorkingDir: baseWorkingDir,
BaseWorkingDir: *workerWorkingDir,
GrpcDialOption: grpcDialOption,
}
// Create worker instance
workerInstance, err := worker.NewWorker(config)
if err != nil {
glog.Fatalf("Failed to create worker: %v", err)
return false
}
adminClient, err := worker.CreateAdminClient(*workerAdminServer, workerInstance.ID(), grpcDialOption)
if err != nil {
glog.Fatalf("Failed to create admin client: %v", err)
if err := RunWorkerFromConfig(config); err != nil {
glog.Fatalf("Worker failed to run: %v", err)
return false
}
// Set admin client
workerInstance.SetAdminClient(adminClient)
glog.Infof("Worker stopped gracefully.")
return true
}
// Set working directory
if *workerWorkingDir != "" {
glog.Infof("Setting working directory to: %s", *workerWorkingDir)
if err := os.Chdir(*workerWorkingDir); err != nil {
glog.Fatalf("Failed to change working directory: %v", err)
return false
}
wd, err := os.Getwd()
if err != nil {
glog.Fatalf("Failed to get working directory: %v", err)
return false
}
glog.Infof("Current working directory: %s", wd)
func RunWorkerFromConfig(config *types.WorkerConfig) error {
// Create worker instance
workerInstance, err := worker.NewWorkerWithDefaults(config)
if err != nil {
return fmt.Errorf("Failed to create worker: %v", err)
}
// Start the worker
err = workerInstance.Start()
if err != nil {
glog.Errorf("Failed to start worker: %v", err)
return false
return fmt.Errorf("Failed to start worker: %v", err)
}
// Set up signal handling
@@ -171,11 +119,10 @@ func runWorker(cmd *Command, args []string) bool {
// Gracefully stop the worker
err = workerInstance.Stop()
if err != nil {
glog.Errorf("Error stopping worker: %v", err)
return fmt.Errorf("Error stopping worker: %v", err)
}
glog.Infof("Worker stopped")
return true
return nil
}
// parseCapabilities converts comma-separated capability string to task types
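
With this change the command no longer wires the worker and the admin client together itself; it builds a types.WorkerConfig and hands it to the exported RunWorkerFromConfig, which creates the worker, starts it, and blocks until the process receives a termination signal. Below is a minimal sketch of driving that entry point from other Go code, assuming only the signature and config fields visible in the diff above; the admin address, capability, and working directory are placeholders, and the insecure gRPC credentials are for illustration only.

package main

import (
	"log"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/command"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// Start from the package defaults and override only what is needed.
	config := types.DefaultWorkerConfig()
	config.AdminServer = "localhost:23646"           // placeholder admin address
	config.Capabilities = []types.TaskType{"vacuum"} // placeholder capability
	config.MaxConcurrent = 2
	config.HeartbeatInterval = 10 * time.Second
	config.TaskRequestInterval = 5 * time.Second
	config.BaseWorkingDir = "/tmp/seaweedfs-worker" // placeholder working directory
	config.GrpcDialOption = grpc.WithTransportCredentials(insecure.NewCredentials())

	// Blocks until SIGINT/SIGTERM, per the signal handling shown in the diff.
	if err := command.RunWorkerFromConfig(config); err != nil {
		log.Fatalf("worker failed: %v", err)
	}
}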

weed/worker/client.go (637 changed lines)

@@ -2,7 +2,6 @@ package worker
import (
"context"
"errors"
"fmt"
"io"
"time"
@@ -14,17 +13,12 @@ import (
"google.golang.org/grpc"
)
var (
ErrAlreadyConnected = errors.New("already connected")
)
// GrpcAdminClient implements AdminClient using gRPC bidirectional streaming
type GrpcAdminClient struct {
adminAddress string
workerID string
dialOption grpc.DialOption
cmds chan grpcCommand
comms clientChannels
// Reconnection parameters
maxReconnectAttempts int
@@ -33,47 +27,25 @@ type GrpcAdminClient struct {
reconnectMultiplier float64
// Channels for communication
outgoing chan *worker_pb.WorkerMessage
incoming chan *worker_pb.AdminMessage
responseChans map[string]chan *worker_pb.AdminMessage
}
type grpcAction string
const (
ActionConnect grpcAction = "connect"
ActionDisconnect grpcAction = "disconnect"
ActionReconnect grpcAction = "reconnect"
ActionStreamError grpcAction = "stream_error"
ActionRegisterWorker grpcAction = "register_worker"
ActionQueryReconnecting grpcAction = "query_reconnecting"
ActionQueryConnected grpcAction = "query_connected"
ActionQueryShouldReconnect grpcAction = "query_shouldreconnect"
)
outgoing chan *worker_pb.WorkerMessage
incoming chan *worker_pb.AdminMessage
type registrationRequest struct {
Worker *types.WorkerData
Resp chan error // Used to send the registration result back
state grpcState
}
type connectionEvent struct {
connected bool
err error
}
type grpcCommand struct {
action grpcAction
data any
resp chan error // for reporting success/failure
type clientChannels struct {
stop chan struct{}
connectionEvents chan connectionEvent
streamErrors chan error
}
type grpcState struct {
connected bool
reconnecting bool
shouldReconnect bool
conn *grpc.ClientConn
client worker_pb.WorkerServiceClient
stream worker_pb.WorkerService_WorkerStreamClient
streamCtx context.Context
streamCancel context.CancelFunc
lastWorkerInfo *types.WorkerData
reconnectStop chan struct{}
streamExit chan struct{}
started bool
lastWorkerInfo *types.WorkerData
}
// NewGrpcAdminClient creates a new gRPC admin client
@@ -91,88 +63,89 @@ func NewGrpcAdminClient(adminAddress string, workerID string, dialOption grpc.Di
reconnectMultiplier: 1.5,
outgoing: make(chan *worker_pb.WorkerMessage, 100),
incoming: make(chan *worker_pb.AdminMessage, 100),
responseChans: make(map[string]chan *worker_pb.AdminMessage),
cmds: make(chan grpcCommand),
state: grpcState{started: false},
}
go c.managerLoop()
return c
}
func (c *GrpcAdminClient) managerLoop() {
state := &grpcState{shouldReconnect: true}
out:
for cmd := range c.cmds {
switch cmd.action {
case ActionConnect:
c.handleConnect(cmd, state)
case ActionDisconnect:
c.handleDisconnect(cmd, state)
break out
case ActionReconnect:
if state.connected || state.reconnecting || !state.shouldReconnect {
cmd.resp <- ErrAlreadyConnected
continue
}
state.reconnecting = true // Manager acknowledges the attempt
err := c.reconnect(state)
state.reconnecting = false
cmd.resp <- err
case ActionStreamError:
state.connected = false
case ActionRegisterWorker:
req := cmd.data.(registrationRequest)
state.lastWorkerInfo = req.Worker
if !state.connected {
glog.V(1).Infof("Not connected yet, worker info stored for registration upon connection")
// Respond immediately with success (registration will happen later)
req.Resp <- nil
continue
}
err := c.sendRegistration(req.Worker)
req.Resp <- err
case ActionQueryConnected:
respCh := cmd.data.(chan bool)
respCh <- state.connected
case ActionQueryReconnecting:
respCh := cmd.data.(chan bool)
respCh <- state.reconnecting
case ActionQueryShouldReconnect:
respCh := cmd.data.(chan bool)
respCh <- state.shouldReconnect
}
// Connect establishes gRPC connection to admin server with TLS detection
func (c *GrpcAdminClient) Connect(workerInfo *types.WorkerData) error {
if c.state.started {
return fmt.Errorf("already started")
}
// Register worker info with client first (this stores it for use during connection)
if err := c.RegisterWorker(workerInfo); err != nil {
glog.V(1).Infof("Worker info stored for registration: %v", err)
// This is expected if not connected yet
}
}
// Connect establishes gRPC connection to admin server with TLS detection
func (c *GrpcAdminClient) Connect() error {
resp := make(chan error)
c.cmds <- grpcCommand{
action: ActionConnect,
resp: resp,
c.state.started = true
c.comms.stop = make(chan struct{})
c.comms.connectionEvents = make(chan connectionEvent)
c.comms.streamErrors = make(chan error, 2)
go c.connectionProcess()
// Attempt the initial connection
event := <-c.comms.connectionEvents
if event.err != nil {
glog.V(1).Infof("Initial connection failed, will retry: %v", event.err)
return event.err
} else {
return nil
}
return <-resp
}
func (c *GrpcAdminClient) handleConnect(cmd grpcCommand, s *grpcState) {
if s.connected {
cmd.resp <- fmt.Errorf("already connected")
return
}
func (c *GrpcAdminClient) GetEvents() chan connectionEvent {
return c.comms.connectionEvents
}
// Start reconnection loop immediately (async)
stop := make(chan struct{})
s.reconnectStop = stop
go c.reconnectionLoop(stop)
func (c *GrpcAdminClient) connectionProcess() {
var (
conn *grpc.ClientConn
)
// Attempt the initial connection
err := c.attemptConnection(s)
// Initial connection attempt
conn, err := c.tryConnect()
if err != nil {
glog.V(1).Infof("Initial connection failed, reconnection loop will retry: %v", err)
cmd.resp <- err
return
glog.Warningf("Initial connection failed: %v", err)
c.comms.connectionEvents <- connectionEvent{connected: false, err: err}
c.comms.streamErrors <- err
c.comms.streamErrors <- err
} else {
c.comms.connectionEvents <- connectionEvent{connected: true}
}
cmd.resp <- nil
for {
select {
case <-c.comms.stop:
c.comms.connectionEvents <- connectionEvent{connected: false}
if conn != nil {
<-c.comms.streamErrors
<-c.comms.streamErrors
conn.Close()
}
return
case err := <-c.comms.streamErrors:
<-c.comms.streamErrors // now both incomingProcess and outgoingProcess
// have been cleaned up
glog.Warningf("Stream error: %v, reconnecting...", err)
if conn != nil {
conn.Close()
conn = nil
}
c.comms.connectionEvents <- connectionEvent{connected: false, err: err}
conn, err = c.tryConnectWithBackoff()
if err != nil {
glog.Errorf("Reconnection failed: %v", err)
} else {
c.comms.connectionEvents <- connectionEvent{connected: true}
}
}
}
}
// createConnection attempts to connect using the provided dial option
@@ -185,297 +158,160 @@ func (c *GrpcAdminClient) createConnection() (*grpc.ClientConn, error) {
return nil, fmt.Errorf("failed to connect to admin server: %w", err)
}
glog.Infof("Connected to admin server at %s", c.adminAddress)
return conn, nil
}
// attemptConnection tries to establish the connection without managing the reconnection loop
func (c *GrpcAdminClient) attemptConnection(s *grpcState) error {
// Detect TLS support and create appropriate connection
func (c *GrpcAdminClient) tryConnect() (*grpc.ClientConn, error) {
glog.Infof("Connecting to admin server at %s", c.adminAddress)
conn, err := c.createConnection()
if err != nil {
return fmt.Errorf("failed to connect to admin server: %w", err)
return nil, fmt.Errorf("connection failed: %w", err)
}
s.conn = conn
s.client = worker_pb.NewWorkerServiceClient(conn)
// Create bidirectional stream
s.streamCtx, s.streamCancel = context.WithCancel(context.Background())
stream, err := s.client.WorkerStream(s.streamCtx)
glog.Infof("Worker stream created")
stream, err := worker_pb.NewWorkerServiceClient(conn).WorkerStream(context.Background())
if err != nil {
s.conn.Close()
return fmt.Errorf("failed to create worker stream: %w", err)
}
s.connected = true
s.stream = stream
// Always check for worker info and send registration immediately as the very first message
if s.lastWorkerInfo != nil {
// Send registration synchronously as the very first message
if err := c.sendRegistrationSync(s.lastWorkerInfo, s.stream); err != nil {
s.conn.Close()
s.connected = false
return fmt.Errorf("failed to register worker: %w", err)
conn.Close()
return nil, fmt.Errorf("stream creation failed: %w", err)
}
if c.state.lastWorkerInfo != nil {
if err := c.sendRegistrationSync(c.state.lastWorkerInfo, stream); err != nil {
conn.Close()
return nil, fmt.Errorf("registration failed: %w", err)
}
glog.Infof("Worker registered successfully with admin server")
} else {
// No worker info yet - stream will wait for registration
glog.V(1).Infof("Connected to admin server, waiting for worker registration info")
glog.Infof("Worker registered successfully")
}
// Start stream handlers
s.streamExit = make(chan struct{})
go handleOutgoing(s.stream, s.streamExit, c.outgoing, c.cmds)
go handleIncoming(c.workerID, s.stream, s.streamExit, c.incoming, c.cmds)
go c.outgoingProcess(stream)
go c.incomingProcess(stream)
glog.Infof("Connected to admin server at %s", c.adminAddress)
return nil
}
// reconnect attempts to re-establish the connection
func (c *GrpcAdminClient) reconnect(s *grpcState) error {
// Clean up existing connection completely
if s.streamCancel != nil {
s.streamCancel()
}
if s.conn != nil {
s.conn.Close()
}
s.connected = false
// Attempt to re-establish connection using the same logic as initial connection
if err := c.attemptConnection(s); err != nil {
return fmt.Errorf("failed to reconnect: %w", err)
}
// Registration is now handled in attemptConnection if worker info is available
return nil
return conn, nil
}
// reconnectionLoop handles automatic reconnection with exponential backoff
func (c *GrpcAdminClient) reconnectionLoop(reconnectStop chan struct{}) {
backoff := c.reconnectBackoff
func (c *GrpcAdminClient) tryConnectWithBackoff() (*grpc.ClientConn, error) {
backoff := time.Second
attempts := 0
for {
waitDuration := backoff
if attempts == 0 {
waitDuration = time.Second
if conn, err := c.tryConnect(); err == nil {
return conn, nil
}
select {
case <-reconnectStop:
return
case <-time.After(waitDuration):
attempts++
if c.maxReconnectAttempts > 0 && attempts >= c.maxReconnectAttempts {
return nil, fmt.Errorf("max reconnection attempts reached")
}
resp := make(chan error, 1)
c.cmds <- grpcCommand{
action: ActionReconnect,
resp: resp,
// Exponential backoff
backoff = time.Duration(float64(backoff) * c.reconnectMultiplier)
if backoff > c.maxReconnectBackoff {
backoff = c.maxReconnectBackoff
}
err := <-resp
if err == nil {
// Successful reconnection
attempts = 0
backoff = c.reconnectBackoff
glog.Infof("Successfully reconnected to admin server")
} else if errors.Is(err, ErrAlreadyConnected) {
attempts = 0
backoff = c.reconnectBackoff
} else {
attempts++
glog.Errorf("Reconnection attempt %d failed: %v", attempts, err)
// Check if we should give up
if c.maxReconnectAttempts > 0 && attempts >= c.maxReconnectAttempts {
glog.Errorf("Max reconnection attempts (%d) reached, giving up", c.maxReconnectAttempts)
return
}
// Increase backoff
backoff = time.Duration(float64(backoff) * c.reconnectMultiplier)
if backoff > c.maxReconnectBackoff {
backoff = c.maxReconnectBackoff
}
glog.Infof("Waiting %v before next reconnection attempt", backoff)
glog.Infof("Reconnection failed, retrying in %v", backoff)
select {
case <-c.comms.stop:
return nil, fmt.Errorf("cancelled")
case <-time.After(backoff):
}
}
}
// handleOutgoing processes outgoing messages to admin
func handleOutgoing(
stream worker_pb.WorkerService_WorkerStreamClient,
streamExit <-chan struct{},
outgoing <-chan *worker_pb.WorkerMessage,
cmds chan<- grpcCommand) {
msgCh := make(chan *worker_pb.WorkerMessage)
errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy
// Goroutine to handle blocking stream.Recv() and simultaneously handle exit
// signals
go func() {
for msg := range msgCh {
if err := stream.Send(msg); err != nil {
errCh <- err
return // Exit the receiver goroutine on error/EOF
}
}
close(errCh)
}()
func (c *GrpcAdminClient) outgoingProcess(stream worker_pb.WorkerService_WorkerStreamClient) {
for msg := range outgoing {
for {
select {
case msgCh <- msg:
case err := <-errCh:
glog.Errorf("Failed to send message to admin: %v", err)
cmds <- grpcCommand{action: ActionStreamError, data: err}
return
case <-streamExit:
close(msgCh)
<-errCh
case <-c.comms.stop:
// Send shutdown message
shutdownMsg := &worker_pb.WorkerMessage{
WorkerId: c.workerID,
Timestamp: time.Now().Unix(),
Message: &worker_pb.WorkerMessage_Shutdown{
Shutdown: &worker_pb.WorkerShutdown{
WorkerId: c.workerID,
Reason: "normal shutdown",
},
},
}
stream.Send(shutdownMsg)
close(c.outgoing)
c.comms.streamErrors <- nil
return
case msg := <-c.outgoing:
if err := stream.Send(msg); err != nil {
glog.Errorf("Failed to send message: %v", err)
c.comms.streamErrors <- err
return
}
}
}
}
// handleIncoming processes incoming messages from admin
func handleIncoming(
workerID string,
stream worker_pb.WorkerService_WorkerStreamClient,
streamExit <-chan struct{},
incoming chan<- *worker_pb.AdminMessage,
cmds chan<- grpcCommand) {
func (c *GrpcAdminClient) incomingProcess(stream worker_pb.WorkerService_WorkerStreamClient) {
workerID := c.state.lastWorkerInfo.ID
glog.V(1).Infof("INCOMING HANDLER STARTED: Worker %s incoming message handler started", workerID)
msgCh := make(chan *worker_pb.AdminMessage)
errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy
// Goroutine to handle blocking stream.Recv() and simultaneously handle exit
// signals
go func() {
for {
msg, err := stream.Recv()
if err != nil {
errCh <- err
return // Exit the receiver goroutine on error/EOF
}
msgCh <- msg
}
}()
for {
glog.V(4).Infof("LISTENING: Worker %s waiting for message from admin server", workerID)
select {
case msg := <-msgCh:
// Message successfully received from the stream
case <-c.comms.stop:
close(c.incoming)
glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler - received exit signal", workerID)
c.comms.streamErrors <- nil
return
default:
msg, err := stream.Recv()
if err != nil {
if err == io.EOF {
glog.Infof("STREAM CLOSED: Worker %s admin server closed the stream", workerID)
} else {
glog.Errorf("RECEIVE ERROR: Worker %s failed to receive message from admin: %v", workerID, err)
}
c.comms.streamErrors <- err
return // Exit the receiver goroutine on error/EOF
}
glog.V(4).Infof("MESSAGE RECEIVED: Worker %s received message from admin server: %T", workerID, msg.Message)
// Route message to waiting goroutines or general handler (original select logic)
select {
case incoming <- msg:
case c.incoming <- msg:
glog.V(3).Infof("MESSAGE ROUTED: Worker %s successfully routed message to handler", workerID)
case <-time.After(time.Second):
glog.Warningf("MESSAGE DROPPED: Worker %s incoming message buffer full, dropping message: %T", workerID, msg.Message)
}
case err := <-errCh:
// Stream Receiver goroutine reported an error (EOF or network error)
if err == io.EOF {
glog.Infof("STREAM CLOSED: Worker %s admin server closed the stream", workerID)
} else {
glog.Errorf("RECEIVE ERROR: Worker %s failed to receive message from admin: %v", workerID, err)
}
// Report the failure as a command to the managerLoop (blocking)
cmds <- grpcCommand{action: ActionStreamError, data: err}
// Exit the main handler loop
glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler due to stream error", workerID)
return
case <-streamExit:
// Manager closed this channel, signaling a controlled disconnection.
glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler - received exit signal", workerID)
return
}
}
}
// Connect establishes gRPC connection to admin server with TLS detection
func (c *GrpcAdminClient) Disconnect() error {
resp := make(chan error)
c.cmds <- grpcCommand{
action: ActionDisconnect,
resp: resp,
}
err := <-resp
return err
}
func (c *GrpcAdminClient) handleDisconnect(cmd grpcCommand, s *grpcState) {
if !s.connected {
cmd.resp <- fmt.Errorf("already disconnected")
return
}
// Send shutdown signal to stop reconnection loop
close(s.reconnectStop)
s.connected = false
s.shouldReconnect = false
// Send shutdown message
shutdownMsg := &worker_pb.WorkerMessage{
WorkerId: c.workerID,
Timestamp: time.Now().Unix(),
Message: &worker_pb.WorkerMessage_Shutdown{
Shutdown: &worker_pb.WorkerShutdown{
WorkerId: c.workerID,
Reason: "normal shutdown",
},
},
}
// Close outgoing/incoming
select {
case c.outgoing <- shutdownMsg:
case <-time.After(time.Second):
glog.Warningf("Failed to send shutdown message")
}
// Send shutdown signal to stop handlers loop
close(s.streamExit)
// Cancel stream context
if s.streamCancel != nil {
s.streamCancel()
}
// Close connection
if s.conn != nil {
s.conn.Close()
if !c.state.started {
glog.Errorf("already disconnected")
return nil
}
c.state.started = false
// Close channels
close(c.outgoing)
close(c.incoming)
// Send shutdown signal to stop connection Process
close(c.comms.stop)
glog.Infof("Disconnected from admin server")
cmd.resp <- nil
return nil
}
// RegisterWorker registers the worker with the admin server
func (c *GrpcAdminClient) RegisterWorker(worker *types.WorkerData) error {
respCh := make(chan error, 1)
request := registrationRequest{
Worker: worker,
Resp: respCh,
}
c.cmds <- grpcCommand{
action: ActionRegisterWorker,
data: request,
c.state.lastWorkerInfo = worker
if !c.state.started {
glog.V(1).Infof("Not started yet, worker info stored for registration upon connection")
// Respond immediately with success (registration will happen later)
return nil
}
return <-respCh
err := c.sendRegistration(worker)
return err
}
// sendRegistration sends the registration message and waits for response
@@ -595,56 +431,8 @@ func (c *GrpcAdminClient) sendRegistrationSync(worker *types.WorkerData, stream
}
}
func (c *GrpcAdminClient) IsConnected() bool {
respCh := make(chan bool, 1)
c.cmds <- grpcCommand{
action: ActionQueryConnected,
data: respCh,
}
return <-respCh
}
func (c *GrpcAdminClient) IsReconnecting() bool {
respCh := make(chan bool, 1)
c.cmds <- grpcCommand{
action: ActionQueryReconnecting,
data: respCh,
}
return <-respCh
}
func (c *GrpcAdminClient) ShouldReconnect() bool {
respCh := make(chan bool, 1)
c.cmds <- grpcCommand{
action: ActionQueryShouldReconnect,
data: respCh,
}
return <-respCh
}
// SendHeartbeat sends heartbeat to admin server
func (c *GrpcAdminClient) SendHeartbeat(workerID string, status *types.WorkerStatus) error {
if !c.IsConnected() {
// If we're currently reconnecting, don't wait - just skip the heartbeat
reconnecting := c.IsReconnecting()
if reconnecting {
// Don't treat as an error - reconnection is in progress
glog.V(2).Infof("Skipping heartbeat during reconnection")
return nil
}
// Wait for reconnection for a short time
if err := c.waitForConnection(10 * time.Second); err != nil {
return fmt.Errorf("not connected to admin server: %w", err)
}
}
taskIds := make([]string, len(status.CurrentTasks))
for i, task := range status.CurrentTasks {
@@ -678,21 +466,6 @@ func (c *GrpcAdminClient) SendHeartbeat(workerID string, status *types.WorkerSta
// RequestTask requests a new task from admin server
func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.TaskType) (*types.TaskInput, error) {
if !c.IsConnected() {
// If we're currently reconnecting, don't wait - just return no task
reconnecting := c.IsReconnecting()
if reconnecting {
// Don't treat as an error - reconnection is in progress
glog.V(2).Infof("RECONNECTING: Worker %s skipping task request during reconnection", workerID)
return nil, nil
}
// Wait for reconnection for a short time
if err := c.waitForConnection(5 * time.Second); err != nil {
return nil, fmt.Errorf("not connected to admin server: %w", err)
}
}
caps := make([]string, len(capabilities))
for i, cap := range capabilities {
@@ -766,22 +539,6 @@ func (c *GrpcAdminClient) CompleteTask(taskID string, success bool, errorMsg str
// CompleteTaskWithMetadata reports task completion with additional metadata
func (c *GrpcAdminClient) CompleteTaskWithMetadata(taskID string, success bool, errorMsg string, metadata map[string]string) error {
if !c.IsConnected() {
// If we're currently reconnecting, don't wait - just skip the completion report
reconnecting := c.IsReconnecting()
if reconnecting {
// Don't treat as an error - reconnection is in progress
glog.V(2).Infof("Skipping task completion report during reconnection for task %s", taskID)
return nil
}
// Wait for reconnection for a short time
if err := c.waitForConnection(5 * time.Second); err != nil {
return fmt.Errorf("not connected to admin server: %w", err)
}
}
taskComplete := &worker_pb.TaskComplete{
TaskId: taskID,
WorkerId: c.workerID,
@@ -813,22 +570,6 @@ func (c *GrpcAdminClient) CompleteTaskWithMetadata(taskID string, success bool,
// UpdateTaskProgress updates task progress to admin server
func (c *GrpcAdminClient) UpdateTaskProgress(taskID string, progress float64) error {
if !c.IsConnected() {
// If we're currently reconnecting, don't wait - just skip the progress update
reconnecting := c.IsReconnecting()
if reconnecting {
// Don't treat as an error - reconnection is in progress
glog.V(2).Infof("Skipping task progress update during reconnection for task %s", taskID)
return nil
}
// Wait for reconnection for a short time
if err := c.waitForConnection(5 * time.Second); err != nil {
return fmt.Errorf("not connected to admin server: %w", err)
}
}
msg := &worker_pb.WorkerMessage{
WorkerId: c.workerID,
Timestamp: time.Now().Unix(),
@@ -850,37 +591,15 @@ func (c *GrpcAdminClient) UpdateTaskProgress(taskID string, progress float64) er
}
}
// waitForConnection waits for the connection to be established or timeout
func (c *GrpcAdminClient) waitForConnection(timeout time.Duration) error {
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
connected := c.IsConnected()
shouldReconnect := c.ShouldReconnect()
if connected {
return nil
}
if !shouldReconnect {
return fmt.Errorf("reconnection is disabled")
}
time.Sleep(100 * time.Millisecond)
}
return fmt.Errorf("timeout waiting for connection")
}
// GetIncomingChannel returns the incoming message channel for message processing
// This allows the worker to process admin messages directly
func (c *GrpcAdminClient) GetIncomingChannel() <-chan *worker_pb.AdminMessage {
return c.incoming
}
// CreateAdminClient creates an admin client with the provided dial option
func CreateAdminClient(adminServer string, workerID string, dialOption grpc.DialOption) (AdminClient, error) {
return NewGrpcAdminClient(adminServer, workerID, dialOption), nil
// NewAdminClient creates an admin client with the provided dial option
func NewAdminClient(adminServer string, workerID string, dialOption grpc.DialOption) AdminClient {
return NewGrpcAdminClient(adminServer, workerID, dialOption)
}
// getServerFromParams extracts server address from unified sources
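
The polling-style IsConnected, IsReconnecting, and ShouldReconnect queries are gone; Connect now starts connectionProcess, which pushes connectionEvent values onto a channel that the worker obtains via GetEvents and reacts to. The following standalone sketch (not part of this diff) illustrates that push-based pattern; the connectionEvent struct mirrors the unexported one above, while monitor and the remaining names are invented for the example.

package main

import (
	"fmt"
	"time"
)

// connectionEvent mirrors the unexported struct in weed/worker/client.go:
// one value per connection state change, with the error that caused a drop.
type connectionEvent struct {
	connected bool
	err       error
}

// monitor reacts to pushed events instead of polling a connection flag,
// roughly the role connectionMonitorProcess plays in the new worker.go.
func monitor(events <-chan connectionEvent, stop <-chan struct{}) {
	var connected bool
	for {
		select {
		case <-stop:
			return
		case ev := <-events:
			if ev.connected != connected {
				if ev.connected {
					fmt.Println("connection restored")
				} else {
					fmt.Printf("connection lost: %v\n", ev.err)
				}
				connected = ev.connected
			}
		}
	}
}

func main() {
	events := make(chan connectionEvent)
	stop := make(chan struct{})
	go monitor(events, stop)

	events <- connectionEvent{connected: true}
	events <- connectionEvent{connected: false, err: fmt.Errorf("stream closed")}

	time.Sleep(100 * time.Millisecond) // let monitor print before exiting
	close(stop)
}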

weed/worker/worker.go (633 changed lines)

@@ -25,57 +25,44 @@ type Worker struct {
id string
config *types.WorkerConfig
registry *tasks.TaskRegistry
cmds chan workerCommand
state *workerState
client AdminClient
taskLogHandler *tasks.TaskLogHandler
comms workerChannels
state workerState
}
type workerChannels struct {
stop chan struct{}
connectionEvents chan connectionEvent
taskReqs chan taskRequest
taskCompl chan taskCompletion
metricsQuery chan chan metricsResponse
loadQuery chan chan int
}
type metricsResponse struct {
success, failure int
}
type workerState struct {
running bool
adminClient AdminClient
startTime time.Time
stopChan chan struct{}
heartbeatTicker *time.Ticker
requestTicker *time.Ticker
currentTasks map[string]*types.TaskInput
tasksCompleted int
tasksFailed int
running bool
startTime time.Time
}
type taskRequest struct {
task *types.TaskInput
resp chan taskResponse
}
type workerAction string
const (
ActionStart workerAction = "start"
ActionStop workerAction = "stop"
ActionGetStatus workerAction = "getstatus"
ActionGetTaskLoad workerAction = "getload"
ActionSetTask workerAction = "settask"
ActionSetAdmin workerAction = "setadmin"
ActionRemoveTask workerAction = "removetask"
ActionGetAdmin workerAction = "getadmin"
ActionIncTaskFail workerAction = "inctaskfail"
ActionIncTaskComplete workerAction = "inctaskcomplete"
ActionGetHbTick workerAction = "gethbtick"
ActionGetReqTick workerAction = "getreqtick"
ActionGetStopChan workerAction = "getstopchan"
ActionSetHbTick workerAction = "sethbtick"
ActionSetReqTick workerAction = "setreqtick"
ActionGetStartTime workerAction = "getstarttime"
ActionGetCompletedTasks workerAction = "getcompletedtasks"
ActionGetFailedTasks workerAction = "getfailedtasks"
ActionCancelTask workerAction = "canceltask"
// ... other worker actions like Stop, Status, etc.
)
type taskResponse struct {
accepted bool
reason error
}
type statusResponse chan types.WorkerStatus
type workerCommand struct {
action workerAction
data any
resp chan error // for reporting success/failure
type taskCompletion struct {
success bool
}
// AdminClient defines the interface for communicating with the admin server
type AdminClient interface {
Connect() error
Connect(workerInfo *types.WorkerData) error
Disconnect() error
RegisterWorker(worker *types.WorkerData) error
SendHeartbeat(workerID string, status *types.WorkerStatus) error
@@ -83,7 +70,7 @@ type AdminClient interface {
CompleteTask(taskID string, success bool, errorMsg string) error
CompleteTaskWithMetadata(taskID string, success bool, errorMsg string, metadata map[string]string) error
UpdateTaskProgress(taskID string, progress float64) error
IsConnected() bool
GetEvents() chan connectionEvent
}
// GenerateOrLoadWorkerID generates a unique worker ID or loads existing one from working directory
@@ -157,290 +144,109 @@ func GenerateOrLoadWorkerID(workingDir string) (string, error) {
return workerID, nil
}
// NewWorker creates a new worker instance
func NewWorker(config *types.WorkerConfig) (*Worker, error) {
func setWorkingDir(workingDir string) (string, error) {
// Set working directory and create task-specific subdirectories
var baseWorkingDir string
if workingDir != "" {
glog.Infof("Setting working directory to: %s", workingDir)
if err := os.Chdir(workingDir); err != nil {
return "", fmt.Errorf("failed to change working directory: %v", err)
}
wd, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("failed to get working directory: %v", err)
}
baseWorkingDir = wd
glog.Infof("Current working directory: %s", baseWorkingDir)
} else {
// Use default working directory when not specified
wd, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("failed to get current working directory: %v", err)
}
baseWorkingDir = wd
glog.Infof("Using current working directory: %s", baseWorkingDir)
}
return baseWorkingDir, nil
}
func makeDirectories(capabilities []types.TaskType, baseWorkingDir string) error {
// Create task-specific subdirectories
for _, capability := range capabilities {
taskDir := filepath.Join(baseWorkingDir, string(capability))
if err := os.MkdirAll(taskDir, 0755); err != nil {
return fmt.Errorf("failed to create task directory %s: %v", taskDir, err)
}
glog.Infof("Created task directory: %s", taskDir)
}
return nil
}
func NewWorkerWithDefaults(config *types.WorkerConfig) (*Worker, error) {
if config == nil {
config = types.DefaultWorkerConfig()
}
baseWorkingDir, err := setWorkingDir(config.BaseWorkingDir)
if err != nil {
return nil, err
}
config.BaseWorkingDir = baseWorkingDir
if err := makeDirectories(config.Capabilities, config.BaseWorkingDir); err != nil {
return nil, err
}
// Generate or load persistent worker ID
workerID, err := GenerateOrLoadWorkerID(config.BaseWorkingDir)
workerID, err := GenerateOrLoadWorkerID(baseWorkingDir)
if err != nil {
return nil, fmt.Errorf("failed to generate or load worker ID: %w", err)
}
client := NewAdminClient(config.AdminServer, workerID, config.GrpcDialOption)
// Use the global unified registry that already has all tasks registered
registry := tasks.GetGlobalTaskRegistry()
// Initialize task log handler
logDir := filepath.Join(config.BaseWorkingDir, "task_logs")
logDir := filepath.Join(baseWorkingDir, "task_logs")
// Ensure the base task log directory exists to avoid errors when admin requests logs
if err := os.MkdirAll(logDir, 0755); err != nil {
glog.Warningf("Failed to create task log base directory %s: %v", logDir, err)
return nil, fmt.Errorf("failed to create task log base directory %s: %v", logDir, err)
}
taskLogHandler := tasks.NewTaskLogHandler(logDir)
return NewWorker(workerID, config, registry, client, taskLogHandler), nil
}
// NewWorker creates a new worker instance
func NewWorker(workerID string, config *types.WorkerConfig, registry *tasks.TaskRegistry, client AdminClient, taskLogHandler *tasks.TaskLogHandler) *Worker {
worker := &Worker{
id: workerID,
config: config,
registry: registry,
client: client,
taskLogHandler: taskLogHandler,
cmds: make(chan workerCommand),
}
glog.V(1).Infof("Worker created with %d registered task types", len(registry.GetAll()))
go worker.managerLoop()
return worker, nil
}
func (w *Worker) managerLoop() {
w.state = &workerState{
startTime: time.Now(),
stopChan: make(chan struct{}),
currentTasks: make(map[string]*types.TaskInput),
}
out:
for cmd := range w.cmds {
switch cmd.action {
case ActionStart:
w.handleStart(cmd)
case ActionStop:
w.handleStop(cmd)
break out
case ActionGetStatus:
respCh := cmd.data.(statusResponse)
var currentTasks []types.TaskInput
for _, task := range w.state.currentTasks {
currentTasks = append(currentTasks, *task)
}
statusStr := "active"
if len(w.state.currentTasks) >= w.config.MaxConcurrent {
statusStr = "busy"
}
status := types.WorkerStatus{
WorkerID: w.id,
Status: statusStr,
Capabilities: w.config.Capabilities,
MaxConcurrent: w.config.MaxConcurrent,
CurrentLoad: len(w.state.currentTasks),
LastHeartbeat: time.Now(),
CurrentTasks: currentTasks,
Uptime: time.Since(w.state.startTime),
TasksCompleted: w.state.tasksCompleted,
TasksFailed: w.state.tasksFailed,
}
respCh <- status
case ActionGetTaskLoad:
respCh := cmd.data.(chan int)
respCh <- len(w.state.currentTasks)
case ActionSetTask:
currentLoad := len(w.state.currentTasks)
if currentLoad >= w.config.MaxConcurrent {
cmd.resp <- fmt.Errorf("worker is at capacity")
}
task := cmd.data.(*types.TaskInput)
w.state.currentTasks[task.ID] = task
cmd.resp <- nil
case ActionSetAdmin:
admin := cmd.data.(AdminClient)
w.state.adminClient = admin
case ActionRemoveTask:
taskID := cmd.data.(string)
delete(w.state.currentTasks, taskID)
case ActionGetAdmin:
respCh := cmd.data.(chan AdminClient)
respCh <- w.state.adminClient
case ActionIncTaskFail:
w.state.tasksFailed++
case ActionIncTaskComplete:
w.state.tasksCompleted++
case ActionGetHbTick:
respCh := cmd.data.(chan *time.Ticker)
respCh <- w.state.heartbeatTicker
case ActionGetReqTick:
respCh := cmd.data.(chan *time.Ticker)
respCh <- w.state.requestTicker
case ActionSetHbTick:
w.state.heartbeatTicker = cmd.data.(*time.Ticker)
case ActionSetReqTick:
w.state.requestTicker = cmd.data.(*time.Ticker)
case ActionGetStopChan:
cmd.data.(chan chan struct{}) <- w.state.stopChan
case ActionGetStartTime:
cmd.data.(chan time.Time) <- w.state.startTime
case ActionGetCompletedTasks:
cmd.data.(chan int) <- w.state.tasksCompleted
case ActionGetFailedTasks:
cmd.data.(chan int) <- w.state.tasksFailed
case ActionCancelTask:
taskID := cmd.data.(string)
if task, exists := w.state.currentTasks[taskID]; exists {
glog.Infof("Cancelling task %s", task.ID)
// TODO: Implement actual task cancellation logic
} else {
glog.Warningf("Cannot cancel task %s: task not found", taskID)
}
}
}
}
func (w *Worker) getTaskLoad() int {
respCh := make(chan int, 1)
w.cmds <- workerCommand{
action: ActionGetTaskLoad,
data: respCh,
resp: nil,
}
return <-respCh
}
func (w *Worker) setTask(task *types.TaskInput) error {
resp := make(chan error)
w.cmds <- workerCommand{
action: ActionSetTask,
data: task,
resp: resp,
}
if err := <-resp; err != nil {
glog.Errorf("TASK REJECTED: Worker %s at capacity (%d/%d) - rejecting task %s",
w.id, w.getTaskLoad(), w.config.MaxConcurrent, task.ID)
return err
}
newLoad := w.getTaskLoad()
glog.Infof("TASK ACCEPTED: Worker %s accepted task %s - current load: %d/%d",
w.id, task.ID, newLoad, w.config.MaxConcurrent)
return nil
}
func (w *Worker) removeTask(task *types.TaskInput) int {
w.cmds <- workerCommand{
action: ActionRemoveTask,
data: task.ID,
}
return w.getTaskLoad()
}
func (w *Worker) getAdmin() AdminClient {
respCh := make(chan AdminClient, 1)
w.cmds <- workerCommand{
action: ActionGetAdmin,
data: respCh,
}
return <-respCh
}
func (w *Worker) getStopChan() chan struct{} {
respCh := make(chan chan struct{}, 1)
w.cmds <- workerCommand{
action: ActionGetStopChan,
data: respCh,
}
return <-respCh
}
func (w *Worker) getHbTick() *time.Ticker {
respCh := make(chan *time.Ticker, 1)
w.cmds <- workerCommand{
action: ActionGetHbTick,
data: respCh,
}
return <-respCh
}
func (w *Worker) getReqTick() *time.Ticker {
respCh := make(chan *time.Ticker, 1)
w.cmds <- workerCommand{
action: ActionGetReqTick,
data: respCh,
}
return <-respCh
}
func (w *Worker) setHbTick(tick *time.Ticker) *time.Ticker {
w.cmds <- workerCommand{
action: ActionSetHbTick,
data: tick,
}
return w.getHbTick()
}
func (w *Worker) setReqTick(tick *time.Ticker) *time.Ticker {
w.cmds <- workerCommand{
action: ActionSetReqTick,
data: tick,
}
return w.getReqTick()
}
func (w *Worker) getStartTime() time.Time {
respCh := make(chan time.Time, 1)
w.cmds <- workerCommand{
action: ActionGetStartTime,
data: respCh,
}
return <-respCh
}
func (w *Worker) getCompletedTasks() int {
respCh := make(chan int, 1)
w.cmds <- workerCommand{
action: ActionGetCompletedTasks,
data: respCh,
}
return <-respCh
}
func (w *Worker) getFailedTasks() int {
respCh := make(chan int, 1)
w.cmds <- workerCommand{
action: ActionGetFailedTasks,
data: respCh,
}
return <-respCh
}
// getTaskLoggerConfig returns the task logger configuration with worker's log directory
func (w *Worker) getTaskLoggerConfig() tasks.TaskLoggerConfig {
config := tasks.DefaultTaskLoggerConfig()
// Use worker's configured log directory (BaseWorkingDir is guaranteed to be non-empty)
logDir := filepath.Join(w.config.BaseWorkingDir, "task_logs")
config.BaseLogDir = logDir
return config
}
// ID returns the worker ID
func (w *Worker) ID() string {
return w.id
return worker
}
func (w *Worker) Start() error {
resp := make(chan error)
w.cmds <- workerCommand{
action: ActionStart,
resp: resp,
}
return <-resp
}
// Start starts the worker
func (w *Worker) handleStart(cmd workerCommand) {
if w.state.running {
cmd.resp <- fmt.Errorf("worker is already running")
return
return fmt.Errorf("worker is already running")
}
if w.state.adminClient == nil {
cmd.resp <- fmt.Errorf("admin client is not set")
return
if w.getAdmin() == nil {
return fmt.Errorf("admin client is not set")
}
w.state.running = true
w.state.startTime = time.Now()
w.comms.stop = make(chan struct{})
w.comms.taskReqs = make(chan taskRequest)
w.comms.taskCompl = make(chan taskCompletion)
w.comms.loadQuery = make(chan chan int)
w.comms.metricsQuery = make(chan chan metricsResponse)
// Prepare worker info for registration
workerInfo := &types.WorkerData{
ID: w.id,
@@ -451,58 +257,46 @@ func (w *Worker) handleStart(cmd workerCommand) {
LastHeartbeat: time.Now(),
}
// Register worker info with client first (this stores it for use during connection)
if err := w.state.adminClient.RegisterWorker(workerInfo); err != nil {
glog.V(1).Infof("Worker info stored for registration: %v", err)
// This is expected if not connected yet
}
// Start connection attempt (will register immediately if successful)
glog.Infof("WORKER STARTING: Worker %s starting with capabilities %v, max concurrent: %d",
w.id, w.config.Capabilities, w.config.MaxConcurrent)
// Try initial connection, but don't fail if it doesn't work immediately
if err := w.state.adminClient.Connect(); err != nil {
if err := w.getAdmin().Connect(workerInfo); err != nil {
glog.Warningf("INITIAL CONNECTION FAILED: Worker %s initial connection to admin server failed, will keep retrying: %v", w.id, err)
// Don't return error - let the reconnection loop handle it
} else {
glog.Infof("INITIAL CONNECTION SUCCESS: Worker %s successfully connected to admin server", w.id)
}
w.comms.connectionEvents = w.getAdmin().GetEvents()
// Start worker loops regardless of initial connection status
// They will handle connection failures gracefully
glog.V(1).Infof("STARTING LOOPS: Worker %s starting background loops", w.id)
go w.heartbeatLoop()
go w.taskRequestLoop()
go w.connectionMonitorLoop()
go w.connectionMonitorProcess()
go w.messageProcessingLoop()
go w.taskProcess()
glog.Infof("WORKER STARTED: Worker %s started successfully (connection attempts will continue in background)", w.id)
cmd.resp <- nil
return nil
}
func (w *Worker) getTaskLoad() int {
loadCh := make(chan int)
w.comms.loadQuery <- loadCh
return <-loadCh
}
func (w *Worker) Stop() error {
resp := make(chan error)
w.cmds <- workerCommand{
action: ActionStop,
resp: resp,
}
if err := <-resp; err != nil {
return err
if !w.state.running {
return nil
}
// Wait for tasks to finish
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
out:
for w.getTaskLoad() > 0 {
select {
case <-timeout.C:
glog.Warningf("Worker %s stopping with %d tasks still running", w.id, w.getTaskLoad())
break out
case <-time.After(100 * time.Millisecond):
}
}
w.state.running = false
close(w.comms.stop)
// Disconnect from admin server
if adminClient := w.getAdmin(); adminClient != nil {
@@ -514,25 +308,85 @@ out:
return nil
}
// Stop stops the worker
func (w *Worker) handleStop(cmd workerCommand) {
if !w.state.running {
cmd.resp <- nil
return
}
// Task Process owns ALL task state
func (w *Worker) taskProcess() {
var currentLoad int
var success int
var failure int
var maxConcurrent = w.config.MaxConcurrent
doneCh := make(chan chan int)
w.state.running = false
close(w.state.stopChan)
for {
select {
case <-w.comms.stop:
if currentLoad > 0 {
glog.Warningf("Worker %s stopping with %d tasks still running", w.id, currentLoad)
}
return
// Stop tickers
if w.state.heartbeatTicker != nil {
w.state.heartbeatTicker.Stop()
}
if w.state.requestTicker != nil {
w.state.requestTicker.Stop()
case req := <-w.comms.taskReqs:
if currentLoad >= maxConcurrent {
req.resp <- taskResponse{
accepted: false,
reason: fmt.Errorf("worker is at capacity"),
}
glog.Errorf("TASK REJECTED: Worker %s at capacity (%d/%d) - rejecting task %s",
w.id, currentLoad, maxConcurrent, req.task.ID)
continue
}
// Accept task and update our owned state
currentLoad++
req.resp <- taskResponse{accepted: true}
glog.Infof("TASK ACCEPTED: Worker %s accepted task %s - current load: %d/%d",
w.id, req.task.ID, currentLoad, maxConcurrent)
// Execute task and manage our own load count
go w.executeTask(req.task, doneCh)
case loadCh := <-doneCh:
currentLoad--
loadCh <- currentLoad
case compl := <-w.comms.taskCompl:
if compl.success {
success++
} else {
failure++
}
case resp := <-w.comms.metricsQuery:
resp <- metricsResponse{success: success, failure: failure}
case resp := <-w.comms.loadQuery:
resp <- currentLoad
}
}
}
func (w *Worker) getAdmin() AdminClient {
return w.client
}
func (w *Worker) getStopChan() <-chan struct{} {
return w.comms.stop
}
cmd.resp <- nil
func (w *Worker) getStartTime() time.Time {
return w.state.startTime
}
// getTaskLoggerConfig returns the task logger configuration with worker's log directory
func (w *Worker) getTaskLoggerConfig() tasks.TaskLoggerConfig {
config := tasks.DefaultTaskLoggerConfig()
// Use worker's configured log directory (BaseWorkingDir is guaranteed to be non-empty)
logDir := filepath.Join(w.config.BaseWorkingDir, "task_logs")
config.BaseLogDir = logDir
return config
}
// ID returns the worker ID
func (w *Worker) ID() string {
return w.id
}
// RegisterTask registers a task factory
@@ -545,28 +399,19 @@ func (w *Worker) GetCapabilities() []types.TaskType {
return w.config.Capabilities
}
// GetStatus returns the current worker status
func (w *Worker) GetStatus() types.WorkerStatus {
respCh := make(statusResponse, 1)
w.cmds <- workerCommand{
action: ActionGetStatus,
data: respCh,
resp: nil,
}
return <-respCh
}
// HandleTask handles a task execution
func (w *Worker) HandleTask(task *types.TaskInput) error {
glog.V(1).Infof("Worker %s received task %s (type: %s, volume: %d)",
w.id, task.ID, task.Type, task.VolumeID)
if err := w.setTask(task); err != nil {
return err
resp := make(chan taskResponse)
if w.comms.taskReqs == nil {
return fmt.Errorf("worker is shutting down")
}
w.comms.taskReqs <- taskRequest{task: task, resp: resp}
result := <-resp
if !result.accepted {
return result.reason
}
// Execute task in goroutine
go w.executeTask(task)
return nil
}
@@ -593,18 +438,17 @@ func (w *Worker) SetTaskRequestInterval(interval time.Duration) {
// SetAdminClient sets the admin client
func (w *Worker) SetAdminClient(client AdminClient) {
w.cmds <- workerCommand{
action: ActionSetAdmin,
data: client,
}
w.client = client
}
// executeTask executes a task
func (w *Worker) executeTask(task *types.TaskInput) {
func (w *Worker) executeTask(task *types.TaskInput, done chan<- chan int) {
startTime := time.Now()
defer func() {
currentLoad := w.removeTask(task)
currentLoadCh := make(chan int)
done <- currentLoadCh
currentLoad := <-currentLoadCh
duration := time.Since(startTime)
glog.Infof("TASK EXECUTION FINISHED: Worker %s finished executing task %s after %v - current load: %d/%d",
@@ -708,9 +552,7 @@ func (w *Worker) executeTask(task *types.TaskInput) {
// Report completion
if err != nil {
w.completeTask(task.ID, false, err.Error())
w.cmds <- workerCommand{
action: ActionIncTaskFail,
}
w.comms.taskCompl <- taskCompletion{success: false}
glog.Errorf("Worker %s failed to execute task %s: %v", w.id, task.ID, err)
if fileLogger != nil {
fileLogger.LogStatus("failed", err.Error())
@@ -718,9 +560,7 @@ func (w *Worker) executeTask(task *types.TaskInput) {
}
} else {
w.completeTask(task.ID, true, "")
w.cmds <- workerCommand{
action: ActionIncTaskComplete,
}
w.comms.taskCompl <- taskCompletion{success: true}
glog.Infof("Worker %s completed task %s successfully", w.id, task.ID)
if fileLogger != nil {
fileLogger.Info("Task %s completed successfully", task.ID)
@@ -739,8 +579,8 @@ func (w *Worker) completeTask(taskID string, success bool, errorMsg string) {
// heartbeatLoop sends periodic heartbeats to the admin server
func (w *Worker) heartbeatLoop() {
defer w.setHbTick(time.NewTicker(w.config.HeartbeatInterval)).Stop()
ticker := w.getHbTick()
ticker := time.NewTicker(w.config.HeartbeatInterval)
defer ticker.Stop()
stopChan := w.getStopChan()
for {
select {
@@ -754,8 +594,8 @@ func (w *Worker) heartbeatLoop() {
// taskRequestLoop periodically requests new tasks from the admin server
func (w *Worker) taskRequestLoop() {
defer w.setReqTick(time.NewTicker(w.config.TaskRequestInterval)).Stop()
ticker := w.getReqTick()
ticker := time.NewTicker(w.config.TaskRequestInterval)
defer ticker.Stop()
stopChan := w.getStopChan()
for {
select {
@@ -820,49 +660,25 @@ func (w *Worker) GetTaskRegistry() *tasks.TaskRegistry {
return w.registry
}
// registerWorker registers the worker with the admin server
func (w *Worker) registerWorker() {
workerInfo := &types.WorkerData{
ID: w.id,
Capabilities: w.config.Capabilities,
MaxConcurrent: w.config.MaxConcurrent,
Status: "active",
CurrentLoad: 0,
LastHeartbeat: time.Now(),
}
if err := w.getAdmin().RegisterWorker(workerInfo); err != nil {
glog.Warningf("Failed to register worker (will retry on next heartbeat): %v", err)
} else {
glog.Infof("Worker %s registered successfully with admin server", w.id)
}
}
// connectionMonitorLoop monitors connection status
func (w *Worker) connectionMonitorLoop() {
ticker := time.NewTicker(30 * time.Second) // Check every 30 seconds
defer ticker.Stop()
lastConnectionStatus := false
// connectionMonitorProcess monitors connection status
func (w *Worker) connectionMonitorProcess() {
var connected bool
stopChan := w.getStopChan()
for {
select {
case <-stopChan:
glog.V(1).Infof("CONNECTION MONITOR STOPPING: Worker %s connection monitor loop stopping", w.id)
return
case <-ticker.C:
// Monitor connection status and log changes
currentConnectionStatus := w.getAdmin() != nil && w.getAdmin().IsConnected()
if currentConnectionStatus != lastConnectionStatus {
if currentConnectionStatus {
case event := <-w.comms.connectionEvents:
if event.connected != connected {
if event.connected {
glog.Infof("CONNECTION RESTORED: Worker %s connection status changed: connected", w.id)
} else {
glog.Warningf("CONNECTION LOST: Worker %s connection status changed: disconnected", w.id)
}
lastConnectionStatus = currentConnectionStatus
connected = event.connected
} else {
if currentConnectionStatus {
if event.connected {
glog.V(3).Infof("CONNECTION OK: Worker %s connection status: connected", w.id)
} else {
glog.V(1).Infof("CONNECTION DOWN: Worker %s connection status: disconnected, reconnection in progress", w.id)
@@ -880,16 +696,22 @@ func (w *Worker) GetConfig() *types.WorkerConfig {
// GetPerformanceMetrics returns performance metrics
func (w *Worker) GetPerformanceMetrics() *types.WorkerPerformance {
metricsCh := make(chan metricsResponse)
w.comms.metricsQuery <- metricsCh
metrics := <-metricsCh
success := metrics.success
failure := metrics.failure
uptime := time.Since(w.getStartTime())
var successRate float64
totalTasks := w.getCompletedTasks() + w.getFailedTasks()
totalTasks := success + failure
if totalTasks > 0 {
successRate = float64(w.getCompletedTasks()) / float64(totalTasks) * 100
successRate = float64(success) / float64(totalTasks) * 100
}
return &types.WorkerPerformance{
TasksCompleted: w.getCompletedTasks(),
TasksFailed: w.getFailedTasks(),
TasksCompleted: success,
TasksFailed: failure,
AverageTaskTime: 0, // Would need to track this
Uptime: uptime,
SuccessRate: successRate,
@@ -1006,11 +828,8 @@ func (w *Worker) handleTaskLogRequest(request *worker_pb.TaskLogRequest) {
func (w *Worker) handleTaskCancellation(cancellation *worker_pb.TaskCancellation) {
glog.Infof("Worker %s received task cancellation for task %s", w.id, cancellation.TaskId)
w.cmds <- workerCommand{
action: ActionCancelTask,
data: cancellation.TaskId,
resp: nil,
}
// TODO: To implement task cancellation, each task type should define how
// a task can be cancelled.
}
// handleAdminShutdown processes admin shutdown notifications
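
Overall, the worker drops the workerCommand manager loop and its long list of actions in favor of taskProcess, a single goroutine that owns the current load and the success/failure counters and serves task requests, completions, and queries over dedicated channels, so no mutex is needed around that state. A standalone sketch of this ownership pattern is below (not part of this diff; all names are invented for the example).

package main

import "fmt"

// taskRequest asks the owning goroutine to accept a task; the reply says
// whether it was accepted, mirroring the accept/reject path in taskProcess.
type taskRequest struct {
	id   string
	resp chan bool
}

// taskOwner is the only goroutine that touches currentLoad, so the counter
// needs no locking; everything else talks to it through channels.
func taskOwner(maxConcurrent int, reqs <-chan taskRequest, done <-chan struct{}, loadQuery <-chan chan int, stop <-chan struct{}) {
	currentLoad := 0
	for {
		select {
		case <-stop:
			return
		case req := <-reqs:
			if currentLoad >= maxConcurrent {
				req.resp <- false // at capacity, reject
				continue
			}
			currentLoad++
			req.resp <- true
		case <-done:
			currentLoad-- // a task finished
		case q := <-loadQuery:
			q <- currentLoad // answer load queries without exposing the variable
		}
	}
}

func main() {
	reqs := make(chan taskRequest)
	done := make(chan struct{})
	loadQuery := make(chan chan int)
	stop := make(chan struct{})
	go taskOwner(1, reqs, done, loadQuery, stop)

	resp := make(chan bool)
	reqs <- taskRequest{id: "t1", resp: resp}
	fmt.Println("t1 accepted:", <-resp) // true

	reqs <- taskRequest{id: "t2", resp: resp}
	fmt.Println("t2 accepted:", <-resp) // false: already at capacity

	q := make(chan int)
	loadQuery <- q
	fmt.Println("current load:", <-q) // 1

	close(stop)
}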
