1 changed files with 701 additions and 0 deletions
@ -0,0 +1,701 @@ |
|||
package pluginworker |
|||
|
|||
import ( |
|||
"context" |
|||
"fmt" |
|||
"sort" |
|||
"strings" |
|||
"time" |
|||
|
|||
"github.com/seaweedfs/seaweedfs/weed/admin/topology" |
|||
"github.com/seaweedfs/seaweedfs/weed/glog" |
|||
"github.com/seaweedfs/seaweedfs/weed/operation" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" |
|||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" |
|||
ecrepair "github.com/seaweedfs/seaweedfs/weed/worker/tasks/ec_repair" |
|||
"google.golang.org/grpc" |
|||
"google.golang.org/protobuf/proto" |
|||
) |
|||
|
|||
type ecRepairWorkerConfig struct { |
|||
MinIntervalSeconds int |
|||
} |
|||
|
|||
// EcRepairHandler is the plugin job handler for EC shard repair.
|
|||
type EcRepairHandler struct { |
|||
grpcDialOption grpc.DialOption |
|||
} |
|||
|
|||
func NewEcRepairHandler(grpcDialOption grpc.DialOption) *EcRepairHandler { |
|||
return &EcRepairHandler{grpcDialOption: grpcDialOption} |
|||
} |
|||
|
|||
func (h *EcRepairHandler) Capability() *plugin_pb.JobTypeCapability { |
|||
return &plugin_pb.JobTypeCapability{ |
|||
JobType: "ec_repair", |
|||
CanDetect: true, |
|||
CanExecute: true, |
|||
MaxDetectionConcurrency: 1, |
|||
MaxExecutionConcurrency: 1, |
|||
DisplayName: "EC Repair", |
|||
Description: "Repairs missing or inconsistent EC shards", |
|||
} |
|||
} |
|||
|
|||
func (h *EcRepairHandler) Descriptor() *plugin_pb.JobTypeDescriptor { |
|||
return &plugin_pb.JobTypeDescriptor{ |
|||
JobType: "ec_repair", |
|||
DisplayName: "EC Repair", |
|||
Description: "Detect and repair missing or inconsistent erasure coding shards", |
|||
Icon: "fas fa-toolbox", |
|||
DescriptorVersion: 1, |
|||
AdminConfigForm: &plugin_pb.ConfigForm{ |
|||
FormId: "ec-repair-admin", |
|||
Title: "EC Repair Admin Config", |
|||
Description: "Admin-side controls for EC repair detection scope.", |
|||
Sections: []*plugin_pb.ConfigSection{ |
|||
{ |
|||
SectionId: "scope", |
|||
Title: "Scope", |
|||
Description: "Optional filters applied before EC repair detection.", |
|||
Fields: []*plugin_pb.ConfigField{ |
|||
{ |
|||
Name: "collection_filter", |
|||
Label: "Collection Filter", |
|||
Description: "Only detect EC repairs for this collection when set.", |
|||
Placeholder: "all collections", |
|||
FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING, |
|||
Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT, |
|||
}, |
|||
}, |
|||
}, |
|||
}, |
|||
DefaultValues: map[string]*plugin_pb.ConfigValue{ |
|||
"collection_filter": { |
|||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}, |
|||
}, |
|||
}, |
|||
}, |
|||
WorkerConfigForm: &plugin_pb.ConfigForm{ |
|||
FormId: "ec-repair-worker", |
|||
Title: "EC Repair Worker Config", |
|||
Description: "Worker-side EC repair controls.", |
|||
Sections: []*plugin_pb.ConfigSection{ |
|||
{ |
|||
SectionId: "interval", |
|||
Title: "Detection Interval", |
|||
Description: "Minimum interval between EC repair scans.", |
|||
Fields: []*plugin_pb.ConfigField{ |
|||
{ |
|||
Name: "min_interval_seconds", |
|||
Label: "Minimum Detection Interval (s)", |
|||
Description: "Skip detection if the last successful run is more recent than this interval.", |
|||
FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_INT64, |
|||
Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_NUMBER, |
|||
Required: true, |
|||
MinValue: &plugin_pb.ConfigValue{Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 0}}, |
|||
}, |
|||
}, |
|||
}, |
|||
}, |
|||
DefaultValues: map[string]*plugin_pb.ConfigValue{ |
|||
"min_interval_seconds": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 300}, |
|||
}, |
|||
}, |
|||
}, |
|||
AdminRuntimeDefaults: &plugin_pb.AdminRuntimeDefaults{ |
|||
Enabled: true, |
|||
DetectionIntervalSeconds: 10 * 60, |
|||
DetectionTimeoutSeconds: 300, |
|||
MaxJobsPerDetection: 500, |
|||
GlobalExecutionConcurrency: 8, |
|||
PerWorkerExecutionConcurrency: 2, |
|||
RetryLimit: 1, |
|||
RetryBackoffSeconds: 30, |
|||
}, |
|||
WorkerDefaultValues: map[string]*plugin_pb.ConfigValue{ |
|||
"min_interval_seconds": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 300}, |
|||
}, |
|||
}, |
|||
} |
|||
} |
|||
|
|||
func (h *EcRepairHandler) Detect(ctx context.Context, request *plugin_pb.RunDetectionRequest, sender DetectionSender) error { |
|||
if request == nil { |
|||
return fmt.Errorf("run detection request is nil") |
|||
} |
|||
if sender == nil { |
|||
return fmt.Errorf("detection sender is nil") |
|||
} |
|||
if request.JobType != "" && request.JobType != "ec_repair" { |
|||
return fmt.Errorf("job type %q is not handled by ec_repair worker", request.JobType) |
|||
} |
|||
|
|||
workerConfig := deriveEcRepairWorkerConfig(request.GetWorkerConfigValues()) |
|||
if shouldSkipDetectionByInterval(request.GetLastSuccessfulRun(), workerConfig.MinIntervalSeconds) { |
|||
minInterval := time.Duration(workerConfig.MinIntervalSeconds) * time.Second |
|||
_ = sender.SendActivity(buildDetectorActivity( |
|||
"skipped_by_interval", |
|||
fmt.Sprintf("EC REPAIR: Detection skipped due to min interval (%s)", minInterval), |
|||
map[string]*plugin_pb.ConfigValue{ |
|||
"min_interval_seconds": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(workerConfig.MinIntervalSeconds)}, |
|||
}, |
|||
}, |
|||
)) |
|||
if err := sender.SendProposals(&plugin_pb.DetectionProposals{ |
|||
JobType: "ec_repair", |
|||
Proposals: []*plugin_pb.JobProposal{}, |
|||
HasMore: false, |
|||
}); err != nil { |
|||
return err |
|||
} |
|||
return sender.SendComplete(&plugin_pb.DetectionComplete{ |
|||
JobType: "ec_repair", |
|||
Success: true, |
|||
TotalProposals: 0, |
|||
}) |
|||
} |
|||
|
|||
collectionFilter := strings.TrimSpace(readStringConfig(request.GetAdminConfigValues(), "collection_filter", "")) |
|||
masters := make([]string, 0) |
|||
if request.ClusterContext != nil { |
|||
masters = append(masters, request.ClusterContext.MasterGrpcAddresses...) |
|||
} |
|||
|
|||
response, _, err := h.fetchTopology(ctx, masters) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
maxResults := int(request.MaxResults) |
|||
candidates, hasMore, err := ecrepair.Detect(response.TopologyInfo, collectionFilter, maxResults) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
proposals := make([]*plugin_pb.JobProposal, 0, len(candidates)) |
|||
for _, candidate := range candidates { |
|||
proposal, proposalErr := buildEcRepairProposal(candidate) |
|||
if proposalErr != nil { |
|||
glog.Warningf("Plugin worker skip invalid ec_repair proposal: %v", proposalErr) |
|||
continue |
|||
} |
|||
proposals = append(proposals, proposal) |
|||
} |
|||
|
|||
if err := sender.SendProposals(&plugin_pb.DetectionProposals{ |
|||
JobType: "ec_repair", |
|||
Proposals: proposals, |
|||
HasMore: hasMore, |
|||
}); err != nil { |
|||
return err |
|||
} |
|||
|
|||
return sender.SendComplete(&plugin_pb.DetectionComplete{ |
|||
JobType: "ec_repair", |
|||
Success: true, |
|||
TotalProposals: int32(len(proposals)), |
|||
}) |
|||
} |
|||
|
|||
func (h *EcRepairHandler) Execute(ctx context.Context, request *plugin_pb.ExecuteJobRequest, sender ExecutionSender) error { |
|||
if request == nil || request.Job == nil { |
|||
return fmt.Errorf("execute request/job is nil") |
|||
} |
|||
if sender == nil { |
|||
return fmt.Errorf("execution sender is nil") |
|||
} |
|||
if request.Job.JobType != "" && request.Job.JobType != "ec_repair" { |
|||
return fmt.Errorf("job type %q is not handled by ec_repair worker", request.Job.JobType) |
|||
} |
|||
|
|||
volumeID := readInt64Config(request.Job.Parameters, "volume_id", 0) |
|||
if volumeID <= 0 { |
|||
return fmt.Errorf("missing volume_id in job parameters") |
|||
} |
|||
collection := readStringConfig(request.Job.Parameters, "collection", "") |
|||
diskType := readStringConfig(request.Job.Parameters, "disk_type", "") |
|||
|
|||
masters := make([]string, 0) |
|||
if request.ClusterContext != nil { |
|||
masters = append(masters, request.ClusterContext.MasterGrpcAddresses...) |
|||
} |
|||
|
|||
if err := sendProgress(sender, request.Job, 0, "assigned", "ec repair job accepted"); err != nil { |
|||
return err |
|||
} |
|||
|
|||
response, activeTopology, err := h.fetchTopology(ctx, masters) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
plan, err := ecrepair.BuildRepairPlan(response.TopologyInfo, activeTopology, uint32(volumeID), collection, diskType) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
if len(plan.MissingShards) == 0 && len(plan.DeleteByNode) == 0 { |
|||
resultSummary := fmt.Sprintf("EC repair skipped for volume %d (no issues found)", volumeID) |
|||
return sender.SendCompleted(&plugin_pb.JobCompleted{ |
|||
JobId: request.Job.JobId, |
|||
JobType: request.Job.JobType, |
|||
Success: true, |
|||
Result: &plugin_pb.JobResult{ |
|||
Summary: resultSummary, |
|||
OutputValues: map[string]*plugin_pb.ConfigValue{ |
|||
"volume_id": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: volumeID}, |
|||
}, |
|||
}, |
|||
}, |
|||
Activities: []*plugin_pb.ActivityEvent{ |
|||
buildExecutorActivity("completed", resultSummary), |
|||
}, |
|||
}) |
|||
} |
|||
|
|||
if err := h.executeRepairPlan(ctx, plan, request.Job, sender); err != nil { |
|||
_ = sendProgress(sender, request.Job, 100, "failed", err.Error()) |
|||
return err |
|||
} |
|||
|
|||
resultSummary := fmt.Sprintf("EC repair completed for volume %d", volumeID) |
|||
return sender.SendCompleted(&plugin_pb.JobCompleted{ |
|||
JobId: request.Job.JobId, |
|||
JobType: request.Job.JobType, |
|||
Success: true, |
|||
Result: &plugin_pb.JobResult{ |
|||
Summary: resultSummary, |
|||
OutputValues: map[string]*plugin_pb.ConfigValue{ |
|||
"volume_id": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: volumeID}, |
|||
}, |
|||
"collection": { |
|||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: plan.Collection}, |
|||
}, |
|||
"disk_type": { |
|||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: plan.DiskType}, |
|||
}, |
|||
}, |
|||
}, |
|||
Activities: []*plugin_pb.ActivityEvent{ |
|||
buildExecutorActivity("completed", resultSummary), |
|||
}, |
|||
}) |
|||
} |
|||
|
|||
func (h *EcRepairHandler) executeRepairPlan(ctx context.Context, plan *ecrepair.RepairPlan, job *plugin_pb.JobSpec, sender ExecutionSender) error { |
|||
if plan == nil { |
|||
return fmt.Errorf("repair plan is nil") |
|||
} |
|||
|
|||
if len(plan.MissingShards) > 0 { |
|||
if err := sendProgress(sender, job, 10, "copy_existing_shards", "copying EC shards to rebuilder"); err != nil { |
|||
return err |
|||
} |
|||
if err := h.copyShardsToRebuilder(ctx, plan); err != nil { |
|||
return err |
|||
} |
|||
|
|||
if err := sendProgress(sender, job, 40, "rebuild_missing_shards", "rebuilding missing EC shards"); err != nil { |
|||
return err |
|||
} |
|||
rebuilt, err := h.rebuildMissingShards(ctx, plan) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
|
|||
if err := sendProgress(sender, job, 60, "distribute_shards", "distributing rebuilt EC shards"); err != nil { |
|||
return err |
|||
} |
|||
if err := h.distributeRebuiltShards(ctx, plan, rebuilt); err != nil { |
|||
return err |
|||
} |
|||
|
|||
if err := sendProgress(sender, job, 75, "cleanup_rebuilder", "cleaning up rebuilder temporary shards"); err != nil { |
|||
return err |
|||
} |
|||
if err := h.cleanupRebuilder(ctx, plan, rebuilt); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
if len(plan.DeleteByNode) > 0 { |
|||
if err := sendProgress(sender, job, 85, "delete_extra_shards", "deleting extra or mismatched shards"); err != nil { |
|||
return err |
|||
} |
|||
if err := h.deleteExtraShards(ctx, plan); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
return sendProgress(sender, job, 100, "completed", "ec repair completed") |
|||
} |
|||
|
|||
func (h *EcRepairHandler) copyShardsToRebuilder(ctx context.Context, plan *ecrepair.RepairPlan) error { |
|||
if plan.Rebuilder.NodeAddress == "" { |
|||
return fmt.Errorf("rebuilder node is required") |
|||
} |
|||
if len(plan.CopySources) == 0 { |
|||
return nil |
|||
} |
|||
|
|||
localShardSet := make(map[uint32]struct{}, len(plan.Rebuilder.LocalShards)) |
|||
for _, shardID := range plan.Rebuilder.LocalShards { |
|||
localShardSet[shardID] = struct{}{} |
|||
} |
|||
|
|||
shardIDs := make([]uint32, 0, len(plan.CopySources)) |
|||
for shardID := range plan.CopySources { |
|||
shardIDs = append(shardIDs, shardID) |
|||
} |
|||
sort.Slice(shardIDs, func(i, j int) bool { return shardIDs[i] < shardIDs[j] }) |
|||
|
|||
copyIndexFiles := true |
|||
for _, shardID := range shardIDs { |
|||
source := strings.TrimSpace(plan.CopySources[shardID]) |
|||
if source == "" { |
|||
continue |
|||
} |
|||
if source == plan.Rebuilder.NodeAddress { |
|||
if _, ok := localShardSet[shardID]; ok { |
|||
continue |
|||
} |
|||
} |
|||
|
|||
err := operation.WithVolumeServerClient(false, pb.ServerAddress(plan.Rebuilder.NodeAddress), h.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
_, err := client.VolumeEcShardsCopy(ctx, &volume_server_pb.VolumeEcShardsCopyRequest{ |
|||
VolumeId: plan.VolumeID, |
|||
Collection: plan.Collection, |
|||
ShardIds: []uint32{shardID}, |
|||
CopyEcxFile: copyIndexFiles, |
|||
CopyEcjFile: copyIndexFiles, |
|||
CopyVifFile: copyIndexFiles, |
|||
SourceDataNode: source, |
|||
DiskId: plan.Rebuilder.DiskID, |
|||
}) |
|||
return err |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
copyIndexFiles = false |
|||
} |
|||
|
|||
return nil |
|||
} |
|||
|
|||
func (h *EcRepairHandler) rebuildMissingShards(ctx context.Context, plan *ecrepair.RepairPlan) ([]uint32, error) { |
|||
var rebuilt []uint32 |
|||
if plan.Rebuilder.NodeAddress == "" { |
|||
return nil, fmt.Errorf("rebuilder node is required") |
|||
} |
|||
if err := operation.WithVolumeServerClient(false, pb.ServerAddress(plan.Rebuilder.NodeAddress), h.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
resp, err := client.VolumeEcShardsRebuild(ctx, &volume_server_pb.VolumeEcShardsRebuildRequest{ |
|||
VolumeId: plan.VolumeID, |
|||
Collection: plan.Collection, |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
rebuilt = append(rebuilt, resp.RebuiltShardIds...) |
|||
return nil |
|||
}); err != nil { |
|||
return nil, err |
|||
} |
|||
if len(rebuilt) == 0 { |
|||
rebuilt = append(rebuilt, plan.MissingShards...) |
|||
} |
|||
return rebuilt, nil |
|||
} |
|||
|
|||
func (h *EcRepairHandler) distributeRebuiltShards(ctx context.Context, plan *ecrepair.RepairPlan, rebuilt []uint32) error { |
|||
if len(plan.Targets) == 0 { |
|||
return nil |
|||
} |
|||
if len(rebuilt) == 0 { |
|||
return nil |
|||
} |
|||
|
|||
rebuiltSet := make(map[uint32]struct{}, len(rebuilt)) |
|||
for _, shardID := range rebuilt { |
|||
rebuiltSet[shardID] = struct{}{} |
|||
} |
|||
|
|||
targetCopyIndex := make(map[string]bool) |
|||
for _, target := range plan.Targets { |
|||
if target.NodeAddress == "" { |
|||
continue |
|||
} |
|||
var shardIDs []uint32 |
|||
for _, shardID := range target.ShardIDs { |
|||
if _, ok := rebuiltSet[shardID]; ok { |
|||
shardIDs = append(shardIDs, shardID) |
|||
} |
|||
} |
|||
if len(shardIDs) == 0 { |
|||
continue |
|||
} |
|||
sort.Slice(shardIDs, func(i, j int) bool { return shardIDs[i] < shardIDs[j] }) |
|||
|
|||
copyIndex := !targetCopyIndex[target.NodeAddress] |
|||
targetCopyIndex[target.NodeAddress] = true |
|||
|
|||
err := operation.WithVolumeServerClient(false, pb.ServerAddress(target.NodeAddress), h.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
if target.NodeAddress != plan.Rebuilder.NodeAddress { |
|||
_, err := client.VolumeEcShardsCopy(ctx, &volume_server_pb.VolumeEcShardsCopyRequest{ |
|||
VolumeId: plan.VolumeID, |
|||
Collection: plan.Collection, |
|||
ShardIds: shardIDs, |
|||
CopyEcxFile: copyIndex, |
|||
CopyEcjFile: copyIndex, |
|||
CopyVifFile: copyIndex, |
|||
SourceDataNode: plan.Rebuilder.NodeAddress, |
|||
DiskId: target.DiskID, |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
_, err := client.VolumeEcShardsMount(ctx, &volume_server_pb.VolumeEcShardsMountRequest{ |
|||
VolumeId: plan.VolumeID, |
|||
Collection: plan.Collection, |
|||
ShardIds: shardIDs, |
|||
}) |
|||
return err |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
} |
|||
|
|||
return nil |
|||
} |
|||
|
|||
func (h *EcRepairHandler) cleanupRebuilder(ctx context.Context, plan *ecrepair.RepairPlan, rebuilt []uint32) error { |
|||
if plan.Rebuilder.NodeAddress == "" || len(rebuilt) == 0 { |
|||
return nil |
|||
} |
|||
|
|||
keep := make(map[uint32]struct{}) |
|||
for _, shardID := range plan.Rebuilder.LocalShards { |
|||
keep[shardID] = struct{}{} |
|||
} |
|||
for _, target := range plan.Targets { |
|||
if target.NodeAddress != plan.Rebuilder.NodeAddress { |
|||
continue |
|||
} |
|||
for _, shardID := range target.ShardIDs { |
|||
keep[shardID] = struct{}{} |
|||
} |
|||
} |
|||
|
|||
var toDelete []uint32 |
|||
for _, shardID := range rebuilt { |
|||
if _, ok := keep[shardID]; ok { |
|||
continue |
|||
} |
|||
toDelete = append(toDelete, shardID) |
|||
} |
|||
if len(toDelete) == 0 { |
|||
return nil |
|||
} |
|||
return deleteShardIds(ctx, h.grpcDialOption, plan.Rebuilder.NodeAddress, plan.VolumeID, plan.Collection, toDelete) |
|||
} |
|||
|
|||
func (h *EcRepairHandler) deleteExtraShards(ctx context.Context, plan *ecrepair.RepairPlan) error { |
|||
for nodeAddress, shardIDs := range plan.DeleteByNode { |
|||
if len(shardIDs) == 0 { |
|||
continue |
|||
} |
|||
if err := deleteShardIds(ctx, h.grpcDialOption, nodeAddress, plan.VolumeID, plan.Collection, shardIDs); err != nil { |
|||
return err |
|||
} |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
func deleteShardIds(ctx context.Context, dialOption grpc.DialOption, nodeAddress string, volumeID uint32, collection string, shardIDs []uint32) error { |
|||
sorted := make([]uint32, len(shardIDs)) |
|||
copy(sorted, shardIDs) |
|||
sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) |
|||
|
|||
return operation.WithVolumeServerClient(false, pb.ServerAddress(nodeAddress), dialOption, func(client volume_server_pb.VolumeServerClient) error { |
|||
_, err := client.VolumeEcShardsUnmount(ctx, &volume_server_pb.VolumeEcShardsUnmountRequest{ |
|||
VolumeId: volumeID, |
|||
ShardIds: sorted, |
|||
}) |
|||
if err != nil { |
|||
return err |
|||
} |
|||
_, err = client.VolumeEcShardsDelete(ctx, &volume_server_pb.VolumeEcShardsDeleteRequest{ |
|||
VolumeId: volumeID, |
|||
Collection: collection, |
|||
ShardIds: sorted, |
|||
}) |
|||
return err |
|||
}) |
|||
} |
|||
|
|||
func (h *EcRepairHandler) fetchTopology(ctx context.Context, masterAddresses []string) (*master_pb.VolumeListResponse, *topology.ActiveTopology, error) { |
|||
if h.grpcDialOption == nil { |
|||
return nil, nil, fmt.Errorf("grpc dial option is not configured") |
|||
} |
|||
if len(masterAddresses) == 0 { |
|||
return nil, nil, fmt.Errorf("no master addresses provided in cluster context") |
|||
} |
|||
|
|||
for _, masterAddress := range masterAddresses { |
|||
response, err := h.fetchVolumeList(ctx, masterAddress) |
|||
if err != nil { |
|||
glog.Warningf("Plugin worker failed master volume list at %s: %v", masterAddress, err) |
|||
continue |
|||
} |
|||
if response == nil || response.TopologyInfo == nil { |
|||
continue |
|||
} |
|||
activeTopology := topology.NewActiveTopology(10) |
|||
if err := activeTopology.UpdateTopology(response.TopologyInfo); err != nil { |
|||
return nil, nil, err |
|||
} |
|||
return response, activeTopology, nil |
|||
} |
|||
|
|||
return nil, nil, fmt.Errorf("failed to load topology from all provided masters") |
|||
} |
|||
|
|||
func (h *EcRepairHandler) fetchVolumeList(ctx context.Context, address string) (*master_pb.VolumeListResponse, error) { |
|||
var lastErr error |
|||
for _, candidate := range masterAddressCandidates(address) { |
|||
if ctx.Err() != nil { |
|||
return nil, ctx.Err() |
|||
} |
|||
|
|||
dialCtx, cancelDial := context.WithTimeout(ctx, 5*time.Second) |
|||
conn, err := pb.GrpcDial(dialCtx, candidate, false, h.grpcDialOption) |
|||
cancelDial() |
|||
if err != nil { |
|||
lastErr = err |
|||
continue |
|||
} |
|||
|
|||
client := master_pb.NewSeaweedClient(conn) |
|||
callCtx, cancelCall := context.WithTimeout(ctx, 10*time.Second) |
|||
response, callErr := client.VolumeList(callCtx, &master_pb.VolumeListRequest{}) |
|||
cancelCall() |
|||
_ = conn.Close() |
|||
|
|||
if callErr == nil { |
|||
return response, nil |
|||
} |
|||
lastErr = callErr |
|||
} |
|||
|
|||
if lastErr == nil { |
|||
lastErr = fmt.Errorf("no valid master address candidate") |
|||
} |
|||
return nil, lastErr |
|||
} |
|||
|
|||
func deriveEcRepairWorkerConfig(values map[string]*plugin_pb.ConfigValue) *ecRepairWorkerConfig { |
|||
return &ecRepairWorkerConfig{ |
|||
MinIntervalSeconds: int(readInt64Config(values, "min_interval_seconds", 300)), |
|||
} |
|||
} |
|||
|
|||
func buildEcRepairProposal(candidate *ecrepair.RepairCandidate) (*plugin_pb.JobProposal, error) { |
|||
if candidate == nil { |
|||
return nil, fmt.Errorf("repair candidate is nil") |
|||
} |
|||
|
|||
params := &worker_pb.TaskParams{ |
|||
VolumeId: candidate.VolumeID, |
|||
Collection: candidate.Collection, |
|||
} |
|||
payload, err := proto.Marshal(params) |
|||
if err != nil { |
|||
return nil, fmt.Errorf("marshal task params: %w", err) |
|||
} |
|||
|
|||
proposalID := fmt.Sprintf("ec-repair-%d-%d", candidate.VolumeID, time.Now().UnixNano()) |
|||
dedupeKey := fmt.Sprintf("ec_repair:%d", candidate.VolumeID) |
|||
if candidate.Collection != "" { |
|||
dedupeKey = dedupeKey + ":" + candidate.Collection |
|||
} |
|||
if candidate.DiskType != "" { |
|||
dedupeKey = dedupeKey + ":" + candidate.DiskType |
|||
} |
|||
|
|||
summary := fmt.Sprintf("Repair EC volume %d", candidate.VolumeID) |
|||
if candidate.Collection != "" { |
|||
summary = summary + " (" + candidate.Collection + ")" |
|||
} |
|||
|
|||
detail := fmt.Sprintf("missing shards=%d, extra shards=%d, mismatched shards=%d", candidate.MissingShards, candidate.ExtraShards, candidate.MismatchedShards) |
|||
|
|||
return &plugin_pb.JobProposal{ |
|||
ProposalId: proposalID, |
|||
DedupeKey: dedupeKey, |
|||
JobType: "ec_repair", |
|||
Priority: plugin_pb.JobPriority_JOB_PRIORITY_NORMAL, |
|||
Summary: summary, |
|||
Detail: detail, |
|||
Parameters: map[string]*plugin_pb.ConfigValue{ |
|||
"task_params_pb": { |
|||
Kind: &plugin_pb.ConfigValue_BytesValue{BytesValue: payload}, |
|||
}, |
|||
"volume_id": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(candidate.VolumeID)}, |
|||
}, |
|||
"collection": { |
|||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: candidate.Collection}, |
|||
}, |
|||
"disk_type": { |
|||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: candidate.DiskType}, |
|||
}, |
|||
"missing_shards": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(candidate.MissingShards)}, |
|||
}, |
|||
"extra_shards": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(candidate.ExtraShards)}, |
|||
}, |
|||
"mismatched_shards": { |
|||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(candidate.MismatchedShards)}, |
|||
}, |
|||
}, |
|||
Labels: map[string]string{ |
|||
"task_type": "ec_repair", |
|||
"volume_id": fmt.Sprintf("%d", candidate.VolumeID), |
|||
"collection": candidate.Collection, |
|||
"disk_type": candidate.DiskType, |
|||
"missing_shards": fmt.Sprintf("%d", candidate.MissingShards), |
|||
"extra_shards": fmt.Sprintf("%d", candidate.ExtraShards), |
|||
"mismatched_shards": fmt.Sprintf("%d", candidate.MismatchedShards), |
|||
}, |
|||
}, nil |
|||
} |
|||
|
|||
func sendProgress(sender ExecutionSender, job *plugin_pb.JobSpec, percent float64, stage string, message string) error { |
|||
if sender == nil || job == nil { |
|||
return nil |
|||
} |
|||
return sender.SendProgress(&plugin_pb.JobProgressUpdate{ |
|||
JobId: job.JobId, |
|||
JobType: job.JobType, |
|||
State: plugin_pb.JobState_JOB_STATE_RUNNING, |
|||
ProgressPercent: percent, |
|||
Stage: stage, |
|||
Message: message, |
|||
Activities: []*plugin_pb.ActivityEvent{ |
|||
buildExecutorActivity(stage, message), |
|||
}, |
|||
}) |
|||
} |
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue