package consumer

import (
	"context"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"log"
	"sync"
	"sync/atomic"
	"time"

	"github.com/IBM/sarama"
	"github.com/linkedin/goavro/v2"
	"github.com/seaweedfs/seaweedfs/test/kafka/kafka-client-loadtest/internal/config"
	"github.com/seaweedfs/seaweedfs/test/kafka/kafka-client-loadtest/internal/metrics"
	pb "github.com/seaweedfs/seaweedfs/test/kafka/kafka-client-loadtest/internal/schema/pb"
	"google.golang.org/protobuf/proto"
)

// Consumer represents a Kafka consumer for load testing
type Consumer struct {
	id               int
	config           *config.Config
	metricsCollector *metrics.Collector
	saramaConsumer   sarama.ConsumerGroup
	useConfluent     bool // Always false; Sarama only
	topics           []string
	consumerGroup    string
	avroCodec        *goavro.Codec

	// Schema format tracking per topic
	schemaFormats map[string]string // topic -> schema format (AVRO, JSON, PROTOBUF)

	// Processing tracking
	messagesProcessed int64 // updated atomically; see processMessage
	lastOffset        map[string]map[int32]int64
	offsetMutex       sync.RWMutex
}

// New creates a new consumer instance
func New(cfg *config.Config, collector *metrics.Collector, id int) (*Consumer, error) {
	// All consumers share the same group for load balancing across partitions
	consumerGroup := cfg.Consumers.GroupPrefix

	c := &Consumer{
		id:               id,
		config:           cfg,
		metricsCollector: collector,
		topics:           cfg.GetTopicNames(),
		consumerGroup:    consumerGroup,
		useConfluent:     false, // Use Sarama by default
		lastOffset:       make(map[string]map[int32]int64),
		schemaFormats:    make(map[string]string),
	}

	// Initialize schema formats for each topic (must match producer logic).
	// This mirrors the format distribution in cmd/loadtest/main.go registerSchemas().
	for i, topic := range c.topics {
		var schemaFormat string
		if cfg.Producers.SchemaFormat != "" {
			// Use explicit config if provided
			schemaFormat = cfg.Producers.SchemaFormat
		} else {
			// Distribute across formats (same as producer)
			switch i % 3 {
			case 0:
				schemaFormat = "AVRO"
			case 1:
				schemaFormat = "JSON"
			case 2:
				schemaFormat = "PROTOBUF"
			}
		}
		c.schemaFormats[topic] = schemaFormat
		log.Printf("Consumer %d: Topic %s will use schema format: %s", id, topic, schemaFormat)
	}

	// Initialize the underlying client; useConfluent is always false here, so
	// the Sarama path is taken.
	if c.useConfluent {
		if err := c.initConfluentConsumer(); err != nil {
			return nil, fmt.Errorf("failed to initialize Confluent consumer: %w", err)
		}
	} else {
		if err := c.initSaramaConsumer(); err != nil {
			return nil, fmt.Errorf("failed to initialize Sarama consumer: %w", err)
		}
	}

	// Initialize Avro codec if schemas are enabled
	if cfg.Schemas.Enabled {
		if err := c.initAvroCodec(); err != nil {
			return nil, fmt.Errorf("failed to initialize Avro codec: %w", err)
		}
	}

	log.Printf("Consumer %d initialized for group %s", id, consumerGroup)
	return c, nil
}
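
// Typical usage, sketched as a hedged example (the cfg and collector values
// are assumptions based only on the signatures above, not on actual wiring
// elsewhere in this test):
//
//	c, err := consumer.New(cfg, collector, 0)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer c.Close()
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	c.Run(ctx) // blocks until ctx is cancelled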

// initSaramaConsumer initializes the Sarama consumer group
func (c *Consumer) initSaramaConsumer() error {
	config := sarama.NewConfig()

	// Consumer configuration
	config.Consumer.Return.Errors = true
	config.Consumer.Offsets.Initial = sarama.OffsetOldest
	if c.config.Consumers.AutoOffsetReset == "latest" {
		config.Consumer.Offsets.Initial = sarama.OffsetNewest
	}

	// Auto commit configuration
	config.Consumer.Offsets.AutoCommit.Enable = c.config.Consumers.EnableAutoCommit
	config.Consumer.Offsets.AutoCommit.Interval = time.Duration(c.config.Consumers.AutoCommitIntervalMs) * time.Millisecond

	// Session and heartbeat configuration
	config.Consumer.Group.Session.Timeout = time.Duration(c.config.Consumers.SessionTimeoutMs) * time.Millisecond
	config.Consumer.Group.Heartbeat.Interval = time.Duration(c.config.Consumers.HeartbeatIntervalMs) * time.Millisecond

	// Fetch configuration
	config.Consumer.Fetch.Min = int32(c.config.Consumers.FetchMinBytes)
	config.Consumer.Fetch.Default = 10 * 1024 * 1024 // 10MB per partition (increased from the 1MB default)
	config.Consumer.Fetch.Max = int32(c.config.Consumers.FetchMaxBytes)
	config.Consumer.MaxWaitTime = time.Duration(c.config.Consumers.FetchMaxWaitMs) * time.Millisecond
	config.Consumer.MaxProcessingTime = time.Duration(c.config.Consumers.MaxPollIntervalMs) * time.Millisecond

	// Channel buffer size for partition consumption. 256 is the Sarama
	// default; it is set explicitly here so it can be tuned easily.
	config.ChannelBufferSize = 256

	// Allow more in-flight requests per broker connection (default 5) so
	// Sarama can fetch from multiple partitions in parallel.
	config.Net.MaxOpenRequests = 20

	// Kafka protocol version
	config.Version = sarama.V2_8_0_0

	// Create consumer group
	consumerGroup, err := sarama.NewConsumerGroup(c.config.Kafka.BootstrapServers, c.consumerGroup, config)
	if err != nil {
		return fmt.Errorf("failed to create Sarama consumer group: %w", err)
	}
	c.saramaConsumer = consumerGroup
	return nil
}

// initConfluentConsumer initializes the Confluent Kafka Go consumer
func (c *Consumer) initConfluentConsumer() error {
	// Confluent consumer disabled, using Sarama only
	return fmt.Errorf("confluent consumer not enabled")
}

// initAvroCodec initializes the Avro codec for schema-based messages
func (c *Consumer) initAvroCodec() error {
	// Use the LoadTestMessage schema (matches what the producer uses)
	loadTestSchema := `{
		"type": "record",
		"name": "LoadTestMessage",
		"namespace": "com.seaweedfs.loadtest",
		"fields": [
			{"name": "id", "type": "string"},
			{"name": "timestamp", "type": "long"},
			{"name": "producer_id", "type": "int"},
			{"name": "counter", "type": "long"},
			{"name": "user_id", "type": "string"},
			{"name": "event_type", "type": "string"},
			{"name": "properties", "type": {"type": "map", "values": "string"}}
		]
	}`

	codec, err := goavro.NewCodec(loadTestSchema)
	if err != nil {
		return fmt.Errorf("failed to create Avro codec: %w", err)
	}
	c.avroCodec = codec
	return nil
}
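
// avroRoundTripExample is a hedged sketch (not invoked by the load test) of
// how a LoadTestMessage round-trips through the codec built above. The field
// values are illustrative assumptions; the producer builds the real payloads.
func avroRoundTripExample(codec *goavro.Codec) (interface{}, error) {
	native := map[string]interface{}{
		"id":          "example-1",
		"timestamp":   time.Now().UnixMilli(),
		"producer_id": int32(0),
		"counter":     int64(1),
		"user_id":     "user-42",
		"event_type":  "example",
		"properties":  map[string]interface{}{"source": "sketch"},
	}
	// Encode to Avro binary, then decode back to a native Go value.
	encoded, err := codec.BinaryFromNative(nil, native)
	if err != nil {
		return nil, err
	}
	decoded, _, err := codec.NativeFromBinary(encoded)
	if err != nil {
		return nil, err
	}
	return decoded, nil
}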

// Run starts the consumer and consumes messages until the context is cancelled
func (c *Consumer) Run(ctx context.Context) {
	log.Printf("Consumer %d starting for group %s", c.id, c.consumerGroup)
	defer log.Printf("Consumer %d stopped", c.id)

	if c.useConfluent {
		c.runConfluentConsumer(ctx)
	} else {
		c.runSaramaConsumer(ctx)
	}
}

// runSaramaConsumer runs the Sarama consumer group
func (c *Consumer) runSaramaConsumer(ctx context.Context) {
	handler := &ConsumerGroupHandler{
		consumer: c,
	}

	var wg sync.WaitGroup

	// Start error handler
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case err, ok := <-c.saramaConsumer.Errors():
				if !ok {
					return
				}
				log.Printf("Consumer %d error: %v", c.id, err)
				c.metricsCollector.RecordConsumerError()
			case <-ctx.Done():
				return
			}
		}
	}()

	// Start consumer group session
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-ctx.Done():
				return
			default:
				// Consume blocks for the duration of a session and returns on
				// rebalance or error, so loop to rejoin the group.
				if err := c.saramaConsumer.Consume(ctx, c.topics, handler); err != nil {
					log.Printf("Consumer %d: Error consuming: %v", c.id, err)
					c.metricsCollector.RecordConsumerError()
					// Wait briefly before retrying (1s rather than 5s for faster recovery)
					select {
					case <-time.After(1 * time.Second):
					case <-ctx.Done():
						return
					}
				}
			}
		}
	}()

	// Start lag monitoring
	wg.Add(1)
	go func() {
		defer wg.Done()
		c.monitorConsumerLag(ctx)
	}()

	// Wait for completion
	<-ctx.Done()
	log.Printf("Consumer %d: Context cancelled, shutting down", c.id)
	wg.Wait()
}

// runConfluentConsumer runs the Confluent consumer
func (c *Consumer) runConfluentConsumer(ctx context.Context) {
	// Confluent consumer disabled, using Sarama only
	log.Printf("Consumer %d: Confluent consumer not enabled", c.id)
}

// processMessage processes a consumed message
func (c *Consumer) processMessage(topicPtr *string, partition int32, offset int64, key, value []byte) error {
	topic := ""
	if topicPtr != nil {
		topic = *topicPtr
	}

	// Update offset tracking
	c.updateOffset(topic, partition, offset)

	// Determine the schema format for this topic (if schemas are enabled)
	var schemaFormat string
	if c.config.Schemas.Enabled {
		schemaFormat = c.schemaFormats[topic]
		if schemaFormat == "" {
			// Fall back to config if the topic is not in the map
			schemaFormat = c.config.Producers.ValueType
		}
	} else {
		// No schemas, use the global value type
		schemaFormat = c.config.Producers.ValueType
	}

	// Decode the message based on format
	var decodedMessage interface{}
	var err error
	switch schemaFormat {
	case "avro", "AVRO":
		decodedMessage, err = c.decodeAvroMessage(value)
	case "json", "JSON", "JSON_SCHEMA":
		decodedMessage, err = c.decodeJSONSchemaMessage(value)
	case "protobuf", "PROTOBUF":
		decodedMessage, err = c.decodeProtobufMessage(value)
	case "binary":
		decodedMessage, err = c.decodeBinaryMessage(value)
	default:
		// Fall back to plain JSON
		decodedMessage, err = c.decodeJSONMessage(value)
	}
	if err != nil {
		return fmt.Errorf("failed to decode message: %w", err)
	}

	// Note: there is deliberately no artificial delay here, to allow maximum
	// throughput. If you need to simulate processing time, add a configurable
	// delay setting.

	// Record metrics. ConsumeClaim may run concurrently (one goroutine per
	// claimed partition), so the shared counter is updated atomically.
	c.metricsCollector.RecordConsumedMessage(len(value))
	processed := atomic.AddInt64(&c.messagesProcessed, 1)

	// Log progress
	if c.id == 0 && processed%1000 == 0 {
		log.Printf("Consumer %d: Processed %d messages (latest: %s[%d]@%d)",
			c.id, processed, topic, partition, offset)
	}

	// Optional: validate message content (for testing purposes)
	if c.config.Chaos.Enabled {
		if err := c.validateMessage(decodedMessage); err != nil {
			log.Printf("Consumer %d: Message validation failed: %v", c.id, err)
		}
	}
	return nil
}

// decodeJSONMessage decodes a plain JSON message
func (c *Consumer) decodeJSONMessage(value []byte) (interface{}, error) {
	var message map[string]interface{}
	if err := json.Unmarshal(value, &message); err != nil {
		// DEBUG: log the raw bytes when JSON parsing fails
		log.Printf("Consumer %d: JSON decode failed. Length: %d, Raw bytes (hex): %x, Raw string: %q, Error: %v",
			c.id, len(value), value, string(value), err)
		return nil, err
	}
	return message, nil
}

// decodeAvroMessage decodes an Avro message (handles Confluent Wire Format)
func (c *Consumer) decodeAvroMessage(value []byte) (interface{}, error) {
	if c.avroCodec == nil {
		return nil, fmt.Errorf("Avro codec not initialized")
	}

	// Handle Confluent Wire Format when schemas are enabled
	var avroData []byte
	if c.config.Schemas.Enabled {
		if len(value) < 5 {
			return nil, fmt.Errorf("message too short for Confluent Wire Format: %d bytes", len(value))
		}
		// Check magic byte (should be 0)
		if value[0] != 0 {
			return nil, fmt.Errorf("invalid Confluent Wire Format magic byte: %d", value[0])
		}
		// Extract schema ID (bytes 1-4, big-endian)
		schemaID := binary.BigEndian.Uint32(value[1:5])
		_ = schemaID // TODO: could validate that the schema ID matches the expected schema
		// Extract Avro data (bytes 5+)
		avroData = value[5:]
	} else {
		// No wire format, use raw data
		avroData = value
	}

	native, _, err := c.avroCodec.NativeFromBinary(avroData)
	if err != nil {
		return nil, fmt.Errorf("failed to decode Avro data: %w", err)
	}
	return native, nil
}
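
// confluentWrapExample is a hedged sketch of the producer-side counterpart to
// the wire-format handling above: magic byte 0x00, then the schema ID as a
// 4-byte big-endian integer, then the encoded payload. The real envelope is
// written by the producer/schema-registry client, not by this package.
func confluentWrapExample(schemaID uint32, payload []byte) []byte {
	framed := make([]byte, 5+len(payload))
	framed[0] = 0 // magic byte
	binary.BigEndian.PutUint32(framed[1:5], schemaID)
	copy(framed[5:], payload)
	return framed
}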

// decodeJSONSchemaMessage decodes a JSON Schema message (handles Confluent Wire Format)
func (c *Consumer) decodeJSONSchemaMessage(value []byte) (interface{}, error) {
	// Handle Confluent Wire Format when schemas are enabled
	var jsonData []byte
	if c.config.Schemas.Enabled {
		if len(value) < 5 {
			return nil, fmt.Errorf("message too short for Confluent Wire Format: %d bytes", len(value))
		}
		// Check magic byte (should be 0)
		if value[0] != 0 {
			return nil, fmt.Errorf("invalid Confluent Wire Format magic byte: %d", value[0])
		}
		// Extract schema ID (bytes 1-4, big-endian)
		schemaID := binary.BigEndian.Uint32(value[1:5])
		_ = schemaID // TODO: could validate that the schema ID matches the expected schema
		// Extract JSON data (bytes 5+)
		jsonData = value[5:]
	} else {
		// No wire format, use raw data
		jsonData = value
	}

	// Decode JSON
	var message map[string]interface{}
	if err := json.Unmarshal(jsonData, &message); err != nil {
		return nil, fmt.Errorf("failed to decode JSON data: %w", err)
	}
	return message, nil
}

// decodeProtobufMessage decodes a Protobuf message (handles Confluent Wire Format)
func (c *Consumer) decodeProtobufMessage(value []byte) (interface{}, error) {
	// Handle Confluent Wire Format when schemas are enabled
	var protoData []byte
	if c.config.Schemas.Enabled {
		if len(value) < 5 {
			return nil, fmt.Errorf("message too short for Confluent Wire Format: %d bytes", len(value))
		}
		// Check magic byte (should be 0)
		if value[0] != 0 {
			return nil, fmt.Errorf("invalid Confluent Wire Format magic byte: %d", value[0])
		}
		// Extract schema ID (bytes 1-4, big-endian)
		schemaID := binary.BigEndian.Uint32(value[1:5])
		_ = schemaID // TODO: could validate that the schema ID matches the expected schema
		// Extract Protobuf data (bytes 5+)
		protoData = value[5:]
	} else {
		// No wire format, use raw data
		protoData = value
	}

	// Unmarshal the protobuf message
	var protoMsg pb.LoadTestMessage
	if err := proto.Unmarshal(protoData, &protoMsg); err != nil {
		return nil, fmt.Errorf("failed to unmarshal Protobuf data: %w", err)
	}

	// Convert to a map for consistency with the other decoders
	return map[string]interface{}{
		"id":          protoMsg.Id,
		"timestamp":   protoMsg.Timestamp,
		"producer_id": protoMsg.ProducerId,
		"counter":     protoMsg.Counter,
		"user_id":     protoMsg.UserId,
		"event_type":  protoMsg.EventType,
		"properties":  protoMsg.Properties,
	}, nil
}
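
// protoEncodeExample is a hedged sketch of the producer-side counterpart to
// decodeProtobufMessage. Field names and types are inferred from the decoder
// above and may not match the generated pb types exactly; when schemas are
// enabled the result would additionally be wrapped in the Confluent envelope
// (see confluentWrapExample).
func protoEncodeExample(producerID int32, counter int64) ([]byte, error) {
	msg := &pb.LoadTestMessage{
		Id:         fmt.Sprintf("producer-%d-msg-%d", producerID, counter),
		Timestamp:  time.Now().UnixMilli(),
		ProducerId: producerID,
		Counter:    counter,
		UserId:     "user-example",
		EventType:  "example",
		Properties: map[string]string{"source": "sketch"},
	}
	return proto.Marshal(msg)
}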

// decodeBinaryMessage decodes a binary message
func (c *Consumer) decodeBinaryMessage(value []byte) (interface{}, error) {
	if len(value) < 20 {
		return nil, fmt.Errorf("binary message too short")
	}
	// Extract fields from the binary format:
	// [producer_id:4][counter:8][timestamp:8][random_data:...]
	producerID := int(binary.BigEndian.Uint32(value[0:4]))
	counter := int64(binary.BigEndian.Uint64(value[4:12]))
	timestamp := int64(binary.BigEndian.Uint64(value[12:20]))
	return map[string]interface{}{
		"producer_id": producerID,
		"counter":     counter,
		"timestamp":   timestamp,
		"data_size":   len(value),
	}, nil
}
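
// encodeBinaryExample is a hedged sketch of the layout decodeBinaryMessage
// expects: [producer_id:4][counter:8][timestamp:8][random_data:...], all
// big-endian. The real encoder lives on the producer side of this test.
func encodeBinaryExample(producerID uint32, counter, timestamp int64, payload []byte) []byte {
	buf := make([]byte, 20+len(payload))
	binary.BigEndian.PutUint32(buf[0:4], producerID)
	binary.BigEndian.PutUint64(buf[4:12], uint64(counter))
	binary.BigEndian.PutUint64(buf[12:20], uint64(timestamp))
	copy(buf[20:], payload)
	return buf
}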

// validateMessage performs basic message validation
func (c *Consumer) validateMessage(message interface{}) error {
	// This is a placeholder for message validation logic. In a real load test,
	// you might validate:
	//   - Message structure
	//   - Required fields
	//   - Data consistency
	//   - Schema compliance
	if message == nil {
		return fmt.Errorf("message is nil")
	}
	return nil
}

// updateOffset updates the last seen offset for lag calculation
func (c *Consumer) updateOffset(topic string, partition int32, offset int64) {
	c.offsetMutex.Lock()
	defer c.offsetMutex.Unlock()
	if c.lastOffset[topic] == nil {
		c.lastOffset[topic] = make(map[int32]int64)
	}
	c.lastOffset[topic][partition] = offset
}

// monitorConsumerLag monitors and reports consumer lag
func (c *Consumer) monitorConsumerLag(ctx context.Context) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			c.reportConsumerLag()
		}
	}
}

// reportConsumerLag calculates and reports consumer lag
func (c *Consumer) reportConsumerLag() {
	// This is a simplified lag calculation. In a real implementation, you
	// would query the broker for high-water marks (see the sketch below).
	c.offsetMutex.RLock()
	defer c.offsetMutex.RUnlock()
	for topic, partitions := range c.lastOffset {
		for partition := range partitions {
			// For simplicity, assume lag is always 0 while we are actively
			// consuming; a real test would compare against the high-water mark.
			lag := int64(0)
			c.metricsCollector.UpdateConsumerLag(c.consumerGroup, topic, partition, lag)
		}
	}
}
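
// partitionLagExample is a hedged sketch of the "real" lag calculation that
// reportConsumerLag alludes to: compare the broker high-water mark with the
// last consumed offset. This package does not currently hold a sarama.Client,
// so the caller shown here is an assumption.
func partitionLagExample(client sarama.Client, topic string, partition int32, lastConsumed int64) (int64, error) {
	highWaterMark, err := client.GetOffset(topic, partition, sarama.OffsetNewest)
	if err != nil {
		return 0, err
	}
	// The high-water mark is the offset that will be assigned to the next
	// produced message, so lag is measured against the next offset to read.
	return highWaterMark - (lastConsumed + 1), nil
}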

// Close closes the consumer and cleans up resources
func (c *Consumer) Close() error {
	log.Printf("Consumer %d: Closing", c.id)
	if c.saramaConsumer != nil {
		return c.saramaConsumer.Close()
	}
	return nil
}

// ConsumerGroupHandler implements sarama.ConsumerGroupHandler
type ConsumerGroupHandler struct {
	consumer *Consumer
}

// Setup is run at the beginning of a new session, before ConsumeClaim
func (h *ConsumerGroupHandler) Setup(sarama.ConsumerGroupSession) error {
	log.Printf("Consumer %d: Consumer group session setup", h.consumer.id)
	return nil
}

// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have exited
func (h *ConsumerGroupHandler) Cleanup(sarama.ConsumerGroupSession) error {
	log.Printf("Consumer %d: Consumer group session cleanup", h.consumer.id)
	return nil
}

// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages()
func (h *ConsumerGroupHandler) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error {
	for {
		select {
		case message, ok := <-claim.Messages():
			if !ok {
				return nil
			}
			// Process the message
			if err := h.consumer.processMessage(&message.Topic, message.Partition, message.Offset, message.Key, message.Value); err != nil {
				log.Printf("Consumer %d: Error processing message: %v", h.consumer.id, err)
				h.consumer.metricsCollector.RecordConsumerError()
				// Intentionally no delay here; a brief backoff could be added
				// for schema validation or other processing errors to avoid
				// overloading the pipeline.
			} else {
				// Mark message as processed
				session.MarkMessage(message, "")
			}
		case <-session.Context().Done():
			log.Printf("Consumer %d: Session context cancelled for %s[%d]",
				h.consumer.id, claim.Topic(), claim.Partition())
			return nil
		}
	}
}

// Helper functions

// joinStrings concatenates strs with sep. It is equivalent to strings.Join
// from the standard library.
func joinStrings(strs []string, sep string) string {
	if len(strs) == 0 {
		return ""
	}
	result := strs[0]
	for i := 1; i < len(strs); i++ {
		result += sep + strs[i]
	}
	return result
}