Browse Source
Phase 7: Comprehensive error handling and edge cases
Phase 7: Comprehensive error handling and edge cases
- Added centralized errors.go with complete Kafka error code definitions - Implemented timeout detection and network error classification - Enhanced connection handling with configurable timeouts and better error reporting - Added comprehensive error handling test suite with 21 test cases - Unified error code usage across all protocol handlers - Improved request/response timeout handling with graceful fallbacks - All protocol and E2E tests passing with robust error handlingpull/7231/head
11 changed files with 1057 additions and 271 deletions
-
23weed/mq/kafka/IMPLEMENTATION_PHASES.md
-
2weed/mq/kafka/protocol/api_versions_test.go
-
6weed/mq/kafka/protocol/consumer_coordination.go
-
414weed/mq/kafka/protocol/error_handling_test.go
-
361weed/mq/kafka/protocol/errors.go
-
106weed/mq/kafka/protocol/flexible_versions.go
-
102weed/mq/kafka/protocol/flexible_versions_integration_test.go
-
158weed/mq/kafka/protocol/flexible_versions_test.go
-
131weed/mq/kafka/protocol/handler.go
-
16weed/mq/kafka/protocol/joingroup.go
-
9weed/mq/kafka/protocol/offset_management.go
@ -0,0 +1,414 @@ |
|||||
|
package protocol |
||||
|
|
||||
|
import ( |
||||
|
"context" |
||||
|
"encoding/binary" |
||||
|
"errors" |
||||
|
"testing" |
||||
|
"time" |
||||
|
) |
||||
|
|
||||
|
func TestKafkaErrorCodes(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
errorCode int16 |
||||
|
expectedInfo ErrorInfo |
||||
|
}{ |
||||
|
{ |
||||
|
name: "No error", |
||||
|
errorCode: ErrorCodeNone, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 0, Name: "NONE", Description: "No error", Retriable: false, |
||||
|
}, |
||||
|
}, |
||||
|
{ |
||||
|
name: "Unknown server error", |
||||
|
errorCode: ErrorCodeUnknownServerError, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 1, Name: "UNKNOWN_SERVER_ERROR", Description: "Unknown server error", Retriable: true, |
||||
|
}, |
||||
|
}, |
||||
|
{ |
||||
|
name: "Topic already exists", |
||||
|
errorCode: ErrorCodeTopicAlreadyExists, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 36, Name: "TOPIC_ALREADY_EXISTS", Description: "Topic already exists", Retriable: false, |
||||
|
}, |
||||
|
}, |
||||
|
{ |
||||
|
name: "Invalid partitions", |
||||
|
errorCode: ErrorCodeInvalidPartitions, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 37, Name: "INVALID_PARTITIONS", Description: "Invalid number of partitions", Retriable: false, |
||||
|
}, |
||||
|
}, |
||||
|
{ |
||||
|
name: "Request timed out", |
||||
|
errorCode: ErrorCodeRequestTimedOut, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 7, Name: "REQUEST_TIMED_OUT", Description: "Request timed out", Retriable: true, |
||||
|
}, |
||||
|
}, |
||||
|
{ |
||||
|
name: "Connection timeout", |
||||
|
errorCode: ErrorCodeConnectionTimeout, |
||||
|
expectedInfo: ErrorInfo{ |
||||
|
Code: 61, Name: "CONNECTION_TIMEOUT", Description: "Connection timeout", Retriable: true, |
||||
|
}, |
||||
|
}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
info := GetErrorInfo(tt.errorCode) |
||||
|
if info.Code != tt.expectedInfo.Code { |
||||
|
t.Errorf("GetErrorInfo().Code = %d, want %d", info.Code, tt.expectedInfo.Code) |
||||
|
} |
||||
|
if info.Name != tt.expectedInfo.Name { |
||||
|
t.Errorf("GetErrorInfo().Name = %s, want %s", info.Name, tt.expectedInfo.Name) |
||||
|
} |
||||
|
if info.Description != tt.expectedInfo.Description { |
||||
|
t.Errorf("GetErrorInfo().Description = %s, want %s", info.Description, tt.expectedInfo.Description) |
||||
|
} |
||||
|
if info.Retriable != tt.expectedInfo.Retriable { |
||||
|
t.Errorf("GetErrorInfo().Retriable = %v, want %v", info.Retriable, tt.expectedInfo.Retriable) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestIsRetriableError(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
errorCode int16 |
||||
|
retriable bool |
||||
|
}{ |
||||
|
{"None", ErrorCodeNone, false}, |
||||
|
{"Unknown server error", ErrorCodeUnknownServerError, true}, |
||||
|
{"Topic already exists", ErrorCodeTopicAlreadyExists, false}, |
||||
|
{"Request timed out", ErrorCodeRequestTimedOut, true}, |
||||
|
{"Rebalance in progress", ErrorCodeRebalanceInProgress, true}, |
||||
|
{"Invalid group ID", ErrorCodeInvalidGroupID, false}, |
||||
|
{"Network exception", ErrorCodeNetworkException, true}, |
||||
|
{"Connection timeout", ErrorCodeConnectionTimeout, true}, |
||||
|
{"Read timeout", ErrorCodeReadTimeout, true}, |
||||
|
{"Write timeout", ErrorCodeWriteTimeout, true}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
if got := IsRetriableError(tt.errorCode); got != tt.retriable { |
||||
|
t.Errorf("IsRetriableError() = %v, want %v", got, tt.retriable) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestBuildErrorResponse(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
correlationID uint32 |
||||
|
errorCode int16 |
||||
|
expectedLen int |
||||
|
}{ |
||||
|
{"Basic error response", 12345, ErrorCodeUnknownServerError, 6}, |
||||
|
{"Topic already exists", 67890, ErrorCodeTopicAlreadyExists, 6}, |
||||
|
{"No error", 11111, ErrorCodeNone, 6}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
response := BuildErrorResponse(tt.correlationID, tt.errorCode) |
||||
|
|
||||
|
if len(response) != tt.expectedLen { |
||||
|
t.Errorf("BuildErrorResponse() length = %d, want %d", len(response), tt.expectedLen) |
||||
|
} |
||||
|
|
||||
|
// Verify correlation ID
|
||||
|
if len(response) >= 4 { |
||||
|
correlationID := binary.BigEndian.Uint32(response[0:4]) |
||||
|
if correlationID != tt.correlationID { |
||||
|
t.Errorf("Correlation ID = %d, want %d", correlationID, tt.correlationID) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Verify error code
|
||||
|
if len(response) >= 6 { |
||||
|
errorCode := binary.BigEndian.Uint16(response[4:6]) |
||||
|
if errorCode != uint16(tt.errorCode) { |
||||
|
t.Errorf("Error code = %d, want %d", errorCode, uint16(tt.errorCode)) |
||||
|
} |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestBuildErrorResponseWithMessage(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
correlationID uint32 |
||||
|
errorCode int16 |
||||
|
message string |
||||
|
expectNullMsg bool |
||||
|
}{ |
||||
|
{"Error with message", 12345, ErrorCodeUnknownServerError, "Test error message", false}, |
||||
|
{"Error with empty message", 67890, ErrorCodeTopicAlreadyExists, "", true}, |
||||
|
{"Error with long message", 11111, ErrorCodeInvalidPartitions, "This is a longer error message for testing", false}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
response := BuildErrorResponseWithMessage(tt.correlationID, tt.errorCode, tt.message) |
||||
|
|
||||
|
// Should have at least correlation ID (4) + error code (2) + message length (2)
|
||||
|
minLen := 8 |
||||
|
if len(response) < minLen { |
||||
|
t.Errorf("BuildErrorResponseWithMessage() length = %d, want at least %d", len(response), minLen) |
||||
|
} |
||||
|
|
||||
|
// Verify correlation ID
|
||||
|
correlationID := binary.BigEndian.Uint32(response[0:4]) |
||||
|
if correlationID != tt.correlationID { |
||||
|
t.Errorf("Correlation ID = %d, want %d", correlationID, tt.correlationID) |
||||
|
} |
||||
|
|
||||
|
// Verify error code
|
||||
|
errorCode := binary.BigEndian.Uint16(response[4:6]) |
||||
|
if errorCode != uint16(tt.errorCode) { |
||||
|
t.Errorf("Error code = %d, want %d", errorCode, uint16(tt.errorCode)) |
||||
|
} |
||||
|
|
||||
|
// Verify message
|
||||
|
if tt.expectNullMsg { |
||||
|
// Should have null string marker (0xFF, 0xFF)
|
||||
|
if len(response) >= 8 && (response[6] != 0xFF || response[7] != 0xFF) { |
||||
|
t.Errorf("Expected null string marker, got %x %x", response[6], response[7]) |
||||
|
} |
||||
|
} else { |
||||
|
// Should have message length and message
|
||||
|
if len(response) >= 8 { |
||||
|
messageLen := binary.BigEndian.Uint16(response[6:8]) |
||||
|
if messageLen != uint16(len(tt.message)) { |
||||
|
t.Errorf("Message length = %d, want %d", messageLen, len(tt.message)) |
||||
|
} |
||||
|
|
||||
|
if len(response) >= 8+len(tt.message) { |
||||
|
actualMessage := string(response[8 : 8+len(tt.message)]) |
||||
|
if actualMessage != tt.message { |
||||
|
t.Errorf("Message = %q, want %q", actualMessage, tt.message) |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestClassifyNetworkError(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
err error |
||||
|
expected int16 |
||||
|
}{ |
||||
|
{"No error", nil, ErrorCodeNone}, |
||||
|
{"Generic error", errors.New("generic error"), ErrorCodeUnknownServerError}, |
||||
|
{"Connection refused", errors.New("connection refused"), ErrorCodeConnectionRefused}, |
||||
|
{"Connection timeout", errors.New("connection timeout"), ErrorCodeConnectionTimeout}, |
||||
|
{"Network timeout", &timeoutError{}, ErrorCodeRequestTimedOut}, |
||||
|
{"Network error", &networkError{}, ErrorCodeNetworkException}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
if got := ClassifyNetworkError(tt.err); got != tt.expected { |
||||
|
t.Errorf("ClassifyNetworkError() = %v, want %v", got, tt.expected) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestHandleTimeoutError(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
err error |
||||
|
operation string |
||||
|
expected int16 |
||||
|
}{ |
||||
|
{"No error", nil, "read", ErrorCodeNone}, |
||||
|
{"Read timeout", &timeoutError{}, "read", ErrorCodeReadTimeout}, |
||||
|
{"Write timeout", &timeoutError{}, "write", ErrorCodeWriteTimeout}, |
||||
|
{"Connect timeout", &timeoutError{}, "connect", ErrorCodeConnectionTimeout}, |
||||
|
{"Generic timeout", &timeoutError{}, "unknown", ErrorCodeRequestTimedOut}, |
||||
|
{"Non-timeout error", errors.New("other error"), "read", ErrorCodeUnknownServerError}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
if got := HandleTimeoutError(tt.err, tt.operation); got != tt.expected { |
||||
|
t.Errorf("HandleTimeoutError() = %v, want %v", got, tt.expected) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestDefaultTimeoutConfig(t *testing.T) { |
||||
|
config := DefaultTimeoutConfig() |
||||
|
|
||||
|
if config.ConnectionTimeout != 30*time.Second { |
||||
|
t.Errorf("ConnectionTimeout = %v, want %v", config.ConnectionTimeout, 30*time.Second) |
||||
|
} |
||||
|
if config.ReadTimeout != 10*time.Second { |
||||
|
t.Errorf("ReadTimeout = %v, want %v", config.ReadTimeout, 10*time.Second) |
||||
|
} |
||||
|
if config.WriteTimeout != 10*time.Second { |
||||
|
t.Errorf("WriteTimeout = %v, want %v", config.WriteTimeout, 10*time.Second) |
||||
|
} |
||||
|
if config.RequestTimeout != 30*time.Second { |
||||
|
t.Errorf("RequestTimeout = %v, want %v", config.RequestTimeout, 30*time.Second) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestSafeFormatError(t *testing.T) { |
||||
|
tests := []struct { |
||||
|
name string |
||||
|
err error |
||||
|
expected string |
||||
|
}{ |
||||
|
{"No error", nil, ""}, |
||||
|
{"Generic error", errors.New("test error"), "Error: test error"}, |
||||
|
{"Network error", &networkError{}, "Error: network error"}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
if got := SafeFormatError(tt.err); got != tt.expected { |
||||
|
t.Errorf("SafeFormatError() = %q, want %q", got, tt.expected) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func TestGetErrorInfo_UnknownErrorCode(t *testing.T) { |
||||
|
unknownCode := int16(9999) |
||||
|
info := GetErrorInfo(unknownCode) |
||||
|
|
||||
|
if info.Code != unknownCode { |
||||
|
t.Errorf("Code = %d, want %d", info.Code, unknownCode) |
||||
|
} |
||||
|
if info.Name != "UNKNOWN" { |
||||
|
t.Errorf("Name = %s, want UNKNOWN", info.Name) |
||||
|
} |
||||
|
if info.Description != "Unknown error code" { |
||||
|
t.Errorf("Description = %s, want 'Unknown error code'", info.Description) |
||||
|
} |
||||
|
if info.Retriable != false { |
||||
|
t.Errorf("Retriable = %v, want false", info.Retriable) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Integration test for error handling in protocol context
|
||||
|
func TestErrorHandling_Integration(t *testing.T) { |
||||
|
// Test building various protocol error responses
|
||||
|
tests := []struct { |
||||
|
name string |
||||
|
apiKey uint16 |
||||
|
errorCode int16 |
||||
|
message string |
||||
|
}{ |
||||
|
{"ApiVersions error", 18, ErrorCodeUnsupportedVersion, "Version not supported"}, |
||||
|
{"Metadata error", 3, ErrorCodeUnknownTopicOrPartition, "Topic not found"}, |
||||
|
{"Produce error", 0, ErrorCodeMessageTooLarge, "Message exceeds size limit"}, |
||||
|
{"Fetch error", 1, ErrorCodeOffsetOutOfRange, "Offset out of range"}, |
||||
|
{"CreateTopics error", 19, ErrorCodeTopicAlreadyExists, "Topic already exists"}, |
||||
|
} |
||||
|
|
||||
|
for _, tt := range tests { |
||||
|
t.Run(tt.name, func(t *testing.T) { |
||||
|
correlationID := uint32(12345) |
||||
|
|
||||
|
// Test basic error response
|
||||
|
basicResponse := BuildErrorResponse(correlationID, tt.errorCode) |
||||
|
if len(basicResponse) != 6 { |
||||
|
t.Errorf("Basic response length = %d, want 6", len(basicResponse)) |
||||
|
} |
||||
|
|
||||
|
// Test error response with message
|
||||
|
messageResponse := BuildErrorResponseWithMessage(correlationID, tt.errorCode, tt.message) |
||||
|
expectedMinLen := 8 + len(tt.message) // 4 (correlationID) + 2 (errorCode) + 2 (messageLen) + len(message)
|
||||
|
if len(messageResponse) < expectedMinLen { |
||||
|
t.Errorf("Message response length = %d, want at least %d", len(messageResponse), expectedMinLen) |
||||
|
} |
||||
|
|
||||
|
// Verify error is correctly classified
|
||||
|
info := GetErrorInfo(tt.errorCode) |
||||
|
if info.Code != tt.errorCode { |
||||
|
t.Errorf("Error info code = %d, want %d", info.Code, tt.errorCode) |
||||
|
} |
||||
|
}) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Mock error types for testing
|
||||
|
type timeoutError struct{} |
||||
|
|
||||
|
func (e *timeoutError) Error() string { return "timeout error" } |
||||
|
func (e *timeoutError) Timeout() bool { return true } |
||||
|
func (e *timeoutError) Temporary() bool { return true } |
||||
|
|
||||
|
type networkError struct{} |
||||
|
|
||||
|
func (e *networkError) Error() string { return "network error" } |
||||
|
func (e *networkError) Timeout() bool { return false } |
||||
|
func (e *networkError) Temporary() bool { return true } |
||||
|
|
||||
|
// Test timeout detection
|
||||
|
func TestTimeoutDetection(t *testing.T) { |
||||
|
// Test with context timeout
|
||||
|
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) |
||||
|
defer cancel() |
||||
|
|
||||
|
// Wait for context to timeout
|
||||
|
time.Sleep(2 * time.Millisecond) |
||||
|
|
||||
|
select { |
||||
|
case <-ctx.Done(): |
||||
|
err := ctx.Err() |
||||
|
errorCode := HandleTimeoutError(err, "context") |
||||
|
if errorCode != ErrorCodeRequestTimedOut { |
||||
|
t.Errorf("Context timeout error code = %v, want %v", errorCode, ErrorCodeRequestTimedOut) |
||||
|
} |
||||
|
default: |
||||
|
t.Error("Context should have timed out") |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Benchmark error response building
|
||||
|
func BenchmarkBuildErrorResponse(b *testing.B) { |
||||
|
correlationID := uint32(12345) |
||||
|
errorCode := ErrorCodeUnknownServerError |
||||
|
|
||||
|
b.ResetTimer() |
||||
|
for i := 0; i < b.N; i++ { |
||||
|
BuildErrorResponse(correlationID, errorCode) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func BenchmarkBuildErrorResponseWithMessage(b *testing.B) { |
||||
|
correlationID := uint32(12345) |
||||
|
errorCode := ErrorCodeUnknownServerError |
||||
|
message := "This is a test error message" |
||||
|
|
||||
|
b.ResetTimer() |
||||
|
for i := 0; i < b.N; i++ { |
||||
|
BuildErrorResponseWithMessage(correlationID, errorCode, message) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func BenchmarkClassifyNetworkError(b *testing.B) { |
||||
|
err := &timeoutError{} |
||||
|
|
||||
|
b.ResetTimer() |
||||
|
for i := 0; i < b.N; i++ { |
||||
|
ClassifyNetworkError(err) |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,361 @@ |
|||||
|
package protocol |
||||
|
|
||||
|
import ( |
||||
|
"encoding/binary" |
||||
|
"fmt" |
||||
|
"net" |
||||
|
"time" |
||||
|
) |
||||
|
|
||||
|
// Kafka Protocol Error Codes
|
||||
|
// Based on Apache Kafka protocol specification
|
||||
|
const ( |
||||
|
// Success
|
||||
|
ErrorCodeNone int16 = 0 |
||||
|
|
||||
|
// General server errors
|
||||
|
ErrorCodeUnknownServerError int16 = 1 |
||||
|
ErrorCodeOffsetOutOfRange int16 = 2 |
||||
|
ErrorCodeCorruptMessage int16 = 3 // Also UNKNOWN_TOPIC_OR_PARTITION
|
||||
|
ErrorCodeUnknownTopicOrPartition int16 = 3 |
||||
|
ErrorCodeInvalidFetchSize int16 = 4 |
||||
|
ErrorCodeLeaderNotAvailable int16 = 5 |
||||
|
ErrorCodeNotLeaderOrFollower int16 = 6 // Formerly NOT_LEADER_FOR_PARTITION
|
||||
|
ErrorCodeRequestTimedOut int16 = 7 |
||||
|
ErrorCodeBrokerNotAvailable int16 = 8 |
||||
|
ErrorCodeReplicaNotAvailable int16 = 9 |
||||
|
ErrorCodeMessageTooLarge int16 = 10 |
||||
|
ErrorCodeStaleControllerEpoch int16 = 11 |
||||
|
ErrorCodeOffsetMetadataTooLarge int16 = 12 |
||||
|
ErrorCodeNetworkException int16 = 13 |
||||
|
ErrorCodeOffsetLoadInProgress int16 = 14 |
||||
|
ErrorCodeGroupLoadInProgress int16 = 15 |
||||
|
ErrorCodeNotCoordinatorForGroup int16 = 16 |
||||
|
ErrorCodeNotCoordinatorForTransaction int16 = 17 |
||||
|
|
||||
|
// Consumer group coordination errors
|
||||
|
ErrorCodeIllegalGeneration int16 = 22 |
||||
|
ErrorCodeInconsistentGroupProtocol int16 = 23 |
||||
|
ErrorCodeInvalidGroupID int16 = 24 |
||||
|
ErrorCodeUnknownMemberID int16 = 25 |
||||
|
ErrorCodeInvalidSessionTimeout int16 = 26 |
||||
|
ErrorCodeRebalanceInProgress int16 = 27 |
||||
|
ErrorCodeInvalidCommitOffsetSize int16 = 28 |
||||
|
ErrorCodeTopicAuthorizationFailed int16 = 29 |
||||
|
ErrorCodeGroupAuthorizationFailed int16 = 30 |
||||
|
ErrorCodeClusterAuthorizationFailed int16 = 31 |
||||
|
ErrorCodeInvalidTimestamp int16 = 32 |
||||
|
ErrorCodeUnsupportedSASLMechanism int16 = 33 |
||||
|
ErrorCodeIllegalSASLState int16 = 34 |
||||
|
ErrorCodeUnsupportedVersion int16 = 35 |
||||
|
|
||||
|
// Topic management errors
|
||||
|
ErrorCodeTopicAlreadyExists int16 = 36 |
||||
|
ErrorCodeInvalidPartitions int16 = 37 |
||||
|
ErrorCodeInvalidReplicationFactor int16 = 38 |
||||
|
ErrorCodeInvalidReplicaAssignment int16 = 39 |
||||
|
ErrorCodeInvalidConfig int16 = 40 |
||||
|
ErrorCodeNotController int16 = 41 |
||||
|
ErrorCodeInvalidRecord int16 = 42 |
||||
|
ErrorCodePolicyViolation int16 = 43 |
||||
|
ErrorCodeOutOfOrderSequenceNumber int16 = 44 |
||||
|
ErrorCodeDuplicateSequenceNumber int16 = 45 |
||||
|
ErrorCodeInvalidProducerEpoch int16 = 46 |
||||
|
ErrorCodeInvalidTxnState int16 = 47 |
||||
|
ErrorCodeInvalidProducerIDMapping int16 = 48 |
||||
|
ErrorCodeInvalidTransactionTimeout int16 = 49 |
||||
|
ErrorCodeConcurrentTransactions int16 = 50 |
||||
|
|
||||
|
// Connection and timeout errors
|
||||
|
ErrorCodeConnectionRefused int16 = 60 // Custom for connection issues
|
||||
|
ErrorCodeConnectionTimeout int16 = 61 // Custom for connection timeouts
|
||||
|
ErrorCodeReadTimeout int16 = 62 // Custom for read timeouts
|
||||
|
ErrorCodeWriteTimeout int16 = 63 // Custom for write timeouts
|
||||
|
|
||||
|
// Consumer group specific errors
|
||||
|
ErrorCodeMemberIDRequired int16 = 79 |
||||
|
ErrorCodeFencedInstanceID int16 = 82 |
||||
|
ErrorCodeGroupMaxSizeReached int16 = 84 |
||||
|
ErrorCodeUnstableOffsetCommit int16 = 95 |
||||
|
) |
||||
|
|
||||
|
// ErrorInfo contains metadata about a Kafka error
|
||||
|
type ErrorInfo struct { |
||||
|
Code int16 |
||||
|
Name string |
||||
|
Description string |
||||
|
Retriable bool |
||||
|
} |
||||
|
|
||||
|
// KafkaErrors maps error codes to their metadata
|
||||
|
var KafkaErrors = map[int16]ErrorInfo{ |
||||
|
ErrorCodeNone: { |
||||
|
Code: ErrorCodeNone, Name: "NONE", Description: "No error", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeUnknownServerError: { |
||||
|
Code: ErrorCodeUnknownServerError, Name: "UNKNOWN_SERVER_ERROR", |
||||
|
Description: "Unknown server error", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeOffsetOutOfRange: { |
||||
|
Code: ErrorCodeOffsetOutOfRange, Name: "OFFSET_OUT_OF_RANGE", |
||||
|
Description: "Offset out of range", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeUnknownTopicOrPartition: { |
||||
|
Code: ErrorCodeUnknownTopicOrPartition, Name: "UNKNOWN_TOPIC_OR_PARTITION", |
||||
|
Description: "Topic or partition does not exist", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInvalidFetchSize: { |
||||
|
Code: ErrorCodeInvalidFetchSize, Name: "INVALID_FETCH_SIZE", |
||||
|
Description: "Invalid fetch size", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeLeaderNotAvailable: { |
||||
|
Code: ErrorCodeLeaderNotAvailable, Name: "LEADER_NOT_AVAILABLE", |
||||
|
Description: "Leader not available", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeNotLeaderOrFollower: { |
||||
|
Code: ErrorCodeNotLeaderOrFollower, Name: "NOT_LEADER_OR_FOLLOWER", |
||||
|
Description: "Not leader or follower", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeRequestTimedOut: { |
||||
|
Code: ErrorCodeRequestTimedOut, Name: "REQUEST_TIMED_OUT", |
||||
|
Description: "Request timed out", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeBrokerNotAvailable: { |
||||
|
Code: ErrorCodeBrokerNotAvailable, Name: "BROKER_NOT_AVAILABLE", |
||||
|
Description: "Broker not available", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeMessageTooLarge: { |
||||
|
Code: ErrorCodeMessageTooLarge, Name: "MESSAGE_TOO_LARGE", |
||||
|
Description: "Message size exceeds limit", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeOffsetMetadataTooLarge: { |
||||
|
Code: ErrorCodeOffsetMetadataTooLarge, Name: "OFFSET_METADATA_TOO_LARGE", |
||||
|
Description: "Offset metadata too large", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeNetworkException: { |
||||
|
Code: ErrorCodeNetworkException, Name: "NETWORK_EXCEPTION", |
||||
|
Description: "Network error", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeOffsetLoadInProgress: { |
||||
|
Code: ErrorCodeOffsetLoadInProgress, Name: "OFFSET_LOAD_IN_PROGRESS", |
||||
|
Description: "Offset load in progress", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeNotCoordinatorForGroup: { |
||||
|
Code: ErrorCodeNotCoordinatorForGroup, Name: "NOT_COORDINATOR_FOR_GROUP", |
||||
|
Description: "Not coordinator for group", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeInvalidGroupID: { |
||||
|
Code: ErrorCodeInvalidGroupID, Name: "INVALID_GROUP_ID", |
||||
|
Description: "Invalid group ID", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeUnknownMemberID: { |
||||
|
Code: ErrorCodeUnknownMemberID, Name: "UNKNOWN_MEMBER_ID", |
||||
|
Description: "Unknown member ID", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInvalidSessionTimeout: { |
||||
|
Code: ErrorCodeInvalidSessionTimeout, Name: "INVALID_SESSION_TIMEOUT", |
||||
|
Description: "Invalid session timeout", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeRebalanceInProgress: { |
||||
|
Code: ErrorCodeRebalanceInProgress, Name: "REBALANCE_IN_PROGRESS", |
||||
|
Description: "Group rebalance in progress", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeInvalidCommitOffsetSize: { |
||||
|
Code: ErrorCodeInvalidCommitOffsetSize, Name: "INVALID_COMMIT_OFFSET_SIZE", |
||||
|
Description: "Invalid commit offset size", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeTopicAuthorizationFailed: { |
||||
|
Code: ErrorCodeTopicAuthorizationFailed, Name: "TOPIC_AUTHORIZATION_FAILED", |
||||
|
Description: "Topic authorization failed", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeGroupAuthorizationFailed: { |
||||
|
Code: ErrorCodeGroupAuthorizationFailed, Name: "GROUP_AUTHORIZATION_FAILED", |
||||
|
Description: "Group authorization failed", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeUnsupportedVersion: { |
||||
|
Code: ErrorCodeUnsupportedVersion, Name: "UNSUPPORTED_VERSION", |
||||
|
Description: "Unsupported version", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeTopicAlreadyExists: { |
||||
|
Code: ErrorCodeTopicAlreadyExists, Name: "TOPIC_ALREADY_EXISTS", |
||||
|
Description: "Topic already exists", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInvalidPartitions: { |
||||
|
Code: ErrorCodeInvalidPartitions, Name: "INVALID_PARTITIONS", |
||||
|
Description: "Invalid number of partitions", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInvalidReplicationFactor: { |
||||
|
Code: ErrorCodeInvalidReplicationFactor, Name: "INVALID_REPLICATION_FACTOR", |
||||
|
Description: "Invalid replication factor", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInvalidRecord: { |
||||
|
Code: ErrorCodeInvalidRecord, Name: "INVALID_RECORD", |
||||
|
Description: "Invalid record", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeConnectionRefused: { |
||||
|
Code: ErrorCodeConnectionRefused, Name: "CONNECTION_REFUSED", |
||||
|
Description: "Connection refused", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeConnectionTimeout: { |
||||
|
Code: ErrorCodeConnectionTimeout, Name: "CONNECTION_TIMEOUT", |
||||
|
Description: "Connection timeout", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeReadTimeout: { |
||||
|
Code: ErrorCodeReadTimeout, Name: "READ_TIMEOUT", |
||||
|
Description: "Read operation timeout", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeWriteTimeout: { |
||||
|
Code: ErrorCodeWriteTimeout, Name: "WRITE_TIMEOUT", |
||||
|
Description: "Write operation timeout", Retriable: true, |
||||
|
}, |
||||
|
ErrorCodeIllegalGeneration: { |
||||
|
Code: ErrorCodeIllegalGeneration, Name: "ILLEGAL_GENERATION", |
||||
|
Description: "Illegal generation", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeInconsistentGroupProtocol: { |
||||
|
Code: ErrorCodeInconsistentGroupProtocol, Name: "INCONSISTENT_GROUP_PROTOCOL", |
||||
|
Description: "Inconsistent group protocol", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeMemberIDRequired: { |
||||
|
Code: ErrorCodeMemberIDRequired, Name: "MEMBER_ID_REQUIRED", |
||||
|
Description: "Member ID required", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeFencedInstanceID: { |
||||
|
Code: ErrorCodeFencedInstanceID, Name: "FENCED_INSTANCE_ID", |
||||
|
Description: "Instance ID fenced", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeGroupMaxSizeReached: { |
||||
|
Code: ErrorCodeGroupMaxSizeReached, Name: "GROUP_MAX_SIZE_REACHED", |
||||
|
Description: "Group max size reached", Retriable: false, |
||||
|
}, |
||||
|
ErrorCodeUnstableOffsetCommit: { |
||||
|
Code: ErrorCodeUnstableOffsetCommit, Name: "UNSTABLE_OFFSET_COMMIT", |
||||
|
Description: "Offset commit during rebalance", Retriable: true, |
||||
|
}, |
||||
|
} |
||||
|
|
||||
|
// GetErrorInfo returns error information for the given error code
|
||||
|
func GetErrorInfo(code int16) ErrorInfo { |
||||
|
if info, exists := KafkaErrors[code]; exists { |
||||
|
return info |
||||
|
} |
||||
|
return ErrorInfo{ |
||||
|
Code: code, Name: "UNKNOWN", Description: "Unknown error code", Retriable: false, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// IsRetriableError returns true if the error is retriable
|
||||
|
func IsRetriableError(code int16) bool { |
||||
|
return GetErrorInfo(code).Retriable |
||||
|
} |
||||
|
|
||||
|
// BuildErrorResponse builds a standard Kafka error response
|
||||
|
func BuildErrorResponse(correlationID uint32, errorCode int16) []byte { |
||||
|
response := make([]byte, 0, 8) |
||||
|
|
||||
|
// Correlation ID (4 bytes)
|
||||
|
correlationIDBytes := make([]byte, 4) |
||||
|
binary.BigEndian.PutUint32(correlationIDBytes, correlationID) |
||||
|
response = append(response, correlationIDBytes...) |
||||
|
|
||||
|
// Error code (2 bytes)
|
||||
|
errorCodeBytes := make([]byte, 2) |
||||
|
binary.BigEndian.PutUint16(errorCodeBytes, uint16(errorCode)) |
||||
|
response = append(response, errorCodeBytes...) |
||||
|
|
||||
|
return response |
||||
|
} |
||||
|
|
||||
|
// BuildErrorResponseWithMessage builds a Kafka error response with error message
|
||||
|
func BuildErrorResponseWithMessage(correlationID uint32, errorCode int16, message string) []byte { |
||||
|
response := BuildErrorResponse(correlationID, errorCode) |
||||
|
|
||||
|
// Error message (2 bytes length + message)
|
||||
|
if message == "" { |
||||
|
response = append(response, 0xFF, 0xFF) // Null string
|
||||
|
} else { |
||||
|
messageLen := uint16(len(message)) |
||||
|
messageLenBytes := make([]byte, 2) |
||||
|
binary.BigEndian.PutUint16(messageLenBytes, messageLen) |
||||
|
response = append(response, messageLenBytes...) |
||||
|
response = append(response, []byte(message)...) |
||||
|
} |
||||
|
|
||||
|
return response |
||||
|
} |
||||
|
|
||||
|
// ClassifyNetworkError classifies network errors into appropriate Kafka error codes
|
||||
|
func ClassifyNetworkError(err error) int16 { |
||||
|
if err == nil { |
||||
|
return ErrorCodeNone |
||||
|
} |
||||
|
|
||||
|
// Check for network errors
|
||||
|
if netErr, ok := err.(net.Error); ok { |
||||
|
if netErr.Timeout() { |
||||
|
return ErrorCodeRequestTimedOut |
||||
|
} |
||||
|
return ErrorCodeNetworkException |
||||
|
} |
||||
|
|
||||
|
// Check for specific error types
|
||||
|
switch err.Error() { |
||||
|
case "connection refused": |
||||
|
return ErrorCodeConnectionRefused |
||||
|
case "connection timeout": |
||||
|
return ErrorCodeConnectionTimeout |
||||
|
default: |
||||
|
return ErrorCodeUnknownServerError |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// TimeoutConfig holds timeout configuration for connections and operations
|
||||
|
type TimeoutConfig struct { |
||||
|
ConnectionTimeout time.Duration // Timeout for establishing connections
|
||||
|
ReadTimeout time.Duration // Timeout for read operations
|
||||
|
WriteTimeout time.Duration // Timeout for write operations
|
||||
|
RequestTimeout time.Duration // Overall request timeout
|
||||
|
} |
||||
|
|
||||
|
// DefaultTimeoutConfig returns default timeout configuration
|
||||
|
func DefaultTimeoutConfig() TimeoutConfig { |
||||
|
return TimeoutConfig{ |
||||
|
ConnectionTimeout: 30 * time.Second, |
||||
|
ReadTimeout: 10 * time.Second, |
||||
|
WriteTimeout: 10 * time.Second, |
||||
|
RequestTimeout: 30 * time.Second, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// HandleTimeoutError handles timeout errors and returns appropriate error code
|
||||
|
func HandleTimeoutError(err error, operation string) int16 { |
||||
|
if err == nil { |
||||
|
return ErrorCodeNone |
||||
|
} |
||||
|
|
||||
|
if netErr, ok := err.(net.Error); ok && netErr.Timeout() { |
||||
|
switch operation { |
||||
|
case "read": |
||||
|
return ErrorCodeReadTimeout |
||||
|
case "write": |
||||
|
return ErrorCodeWriteTimeout |
||||
|
case "connect": |
||||
|
return ErrorCodeConnectionTimeout |
||||
|
default: |
||||
|
return ErrorCodeRequestTimedOut |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return ClassifyNetworkError(err) |
||||
|
} |
||||
|
|
||||
|
// SafeFormatError safely formats error messages to avoid information leakage
|
||||
|
func SafeFormatError(err error) string { |
||||
|
if err == nil { |
||||
|
return "" |
||||
|
} |
||||
|
|
||||
|
// For production, we might want to sanitize error messages
|
||||
|
// For now, return the full error for debugging
|
||||
|
return fmt.Sprintf("Error: %v", err) |
||||
|
} |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue