Browse Source
Phase 7: Comprehensive error handling and edge cases
Phase 7: Comprehensive error handling and edge cases
- Added centralized errors.go with complete Kafka error code definitions
- Implemented timeout detection and network error classification
- Enhanced connection handling with configurable timeouts and better error reporting
- Added comprehensive error handling test suite with 21 test cases
- Unified error code usage across all protocol handlers
- Improved request/response timeout handling with graceful fallbacks
- All protocol and E2E tests passing with robust error handling

pull/7231/head
11 changed files with 1057 additions and 271 deletions
-
23weed/mq/kafka/IMPLEMENTATION_PHASES.md
-
2weed/mq/kafka/protocol/api_versions_test.go
-
6weed/mq/kafka/protocol/consumer_coordination.go
-
414weed/mq/kafka/protocol/error_handling_test.go
-
361weed/mq/kafka/protocol/errors.go
-
106weed/mq/kafka/protocol/flexible_versions.go
-
102weed/mq/kafka/protocol/flexible_versions_integration_test.go
-
158weed/mq/kafka/protocol/flexible_versions_test.go
-
131weed/mq/kafka/protocol/handler.go
-
16weed/mq/kafka/protocol/joingroup.go
-
9weed/mq/kafka/protocol/offset_management.go
@ -0,0 +1,414 @@ |
|||
package protocol |
|||
|
|||
import ( |
|||
"context" |
|||
"encoding/binary" |
|||
"errors" |
|||
"testing" |
|||
"time" |
|||
) |
|||
|
|||
func TestKafkaErrorCodes(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
errorCode int16 |
|||
expectedInfo ErrorInfo |
|||
}{ |
|||
{ |
|||
name: "No error", |
|||
errorCode: ErrorCodeNone, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 0, Name: "NONE", Description: "No error", Retriable: false, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "Unknown server error", |
|||
errorCode: ErrorCodeUnknownServerError, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 1, Name: "UNKNOWN_SERVER_ERROR", Description: "Unknown server error", Retriable: true, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "Topic already exists", |
|||
errorCode: ErrorCodeTopicAlreadyExists, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 36, Name: "TOPIC_ALREADY_EXISTS", Description: "Topic already exists", Retriable: false, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "Invalid partitions", |
|||
errorCode: ErrorCodeInvalidPartitions, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 37, Name: "INVALID_PARTITIONS", Description: "Invalid number of partitions", Retriable: false, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "Request timed out", |
|||
errorCode: ErrorCodeRequestTimedOut, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 7, Name: "REQUEST_TIMED_OUT", Description: "Request timed out", Retriable: true, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "Connection timeout", |
|||
errorCode: ErrorCodeConnectionTimeout, |
|||
expectedInfo: ErrorInfo{ |
|||
Code: 61, Name: "CONNECTION_TIMEOUT", Description: "Connection timeout", Retriable: true, |
|||
}, |
|||
}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
info := GetErrorInfo(tt.errorCode) |
|||
if info.Code != tt.expectedInfo.Code { |
|||
t.Errorf("GetErrorInfo().Code = %d, want %d", info.Code, tt.expectedInfo.Code) |
|||
} |
|||
if info.Name != tt.expectedInfo.Name { |
|||
t.Errorf("GetErrorInfo().Name = %s, want %s", info.Name, tt.expectedInfo.Name) |
|||
} |
|||
if info.Description != tt.expectedInfo.Description { |
|||
t.Errorf("GetErrorInfo().Description = %s, want %s", info.Description, tt.expectedInfo.Description) |
|||
} |
|||
if info.Retriable != tt.expectedInfo.Retriable { |
|||
t.Errorf("GetErrorInfo().Retriable = %v, want %v", info.Retriable, tt.expectedInfo.Retriable) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestIsRetriableError(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
errorCode int16 |
|||
retriable bool |
|||
}{ |
|||
{"None", ErrorCodeNone, false}, |
|||
{"Unknown server error", ErrorCodeUnknownServerError, true}, |
|||
{"Topic already exists", ErrorCodeTopicAlreadyExists, false}, |
|||
{"Request timed out", ErrorCodeRequestTimedOut, true}, |
|||
{"Rebalance in progress", ErrorCodeRebalanceInProgress, true}, |
|||
{"Invalid group ID", ErrorCodeInvalidGroupID, false}, |
|||
{"Network exception", ErrorCodeNetworkException, true}, |
|||
{"Connection timeout", ErrorCodeConnectionTimeout, true}, |
|||
{"Read timeout", ErrorCodeReadTimeout, true}, |
|||
{"Write timeout", ErrorCodeWriteTimeout, true}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
if got := IsRetriableError(tt.errorCode); got != tt.retriable { |
|||
t.Errorf("IsRetriableError() = %v, want %v", got, tt.retriable) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestBuildErrorResponse(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
correlationID uint32 |
|||
errorCode int16 |
|||
expectedLen int |
|||
}{ |
|||
{"Basic error response", 12345, ErrorCodeUnknownServerError, 6}, |
|||
{"Topic already exists", 67890, ErrorCodeTopicAlreadyExists, 6}, |
|||
{"No error", 11111, ErrorCodeNone, 6}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
response := BuildErrorResponse(tt.correlationID, tt.errorCode) |
|||
|
|||
if len(response) != tt.expectedLen { |
|||
t.Errorf("BuildErrorResponse() length = %d, want %d", len(response), tt.expectedLen) |
|||
} |
|||
|
|||
// Verify correlation ID
|
|||
if len(response) >= 4 { |
|||
correlationID := binary.BigEndian.Uint32(response[0:4]) |
|||
if correlationID != tt.correlationID { |
|||
t.Errorf("Correlation ID = %d, want %d", correlationID, tt.correlationID) |
|||
} |
|||
} |
|||
|
|||
// Verify error code
|
|||
if len(response) >= 6 { |
|||
errorCode := binary.BigEndian.Uint16(response[4:6]) |
|||
if errorCode != uint16(tt.errorCode) { |
|||
t.Errorf("Error code = %d, want %d", errorCode, uint16(tt.errorCode)) |
|||
} |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestBuildErrorResponseWithMessage(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
correlationID uint32 |
|||
errorCode int16 |
|||
message string |
|||
expectNullMsg bool |
|||
}{ |
|||
{"Error with message", 12345, ErrorCodeUnknownServerError, "Test error message", false}, |
|||
{"Error with empty message", 67890, ErrorCodeTopicAlreadyExists, "", true}, |
|||
{"Error with long message", 11111, ErrorCodeInvalidPartitions, "This is a longer error message for testing", false}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
response := BuildErrorResponseWithMessage(tt.correlationID, tt.errorCode, tt.message) |
|||
|
|||
// Should have at least correlation ID (4) + error code (2) + message length (2)
|
|||
minLen := 8 |
|||
if len(response) < minLen { |
|||
t.Errorf("BuildErrorResponseWithMessage() length = %d, want at least %d", len(response), minLen) |
|||
} |
|||
|
|||
// Verify correlation ID
|
|||
correlationID := binary.BigEndian.Uint32(response[0:4]) |
|||
if correlationID != tt.correlationID { |
|||
t.Errorf("Correlation ID = %d, want %d", correlationID, tt.correlationID) |
|||
} |
|||
|
|||
// Verify error code
|
|||
errorCode := binary.BigEndian.Uint16(response[4:6]) |
|||
if errorCode != uint16(tt.errorCode) { |
|||
t.Errorf("Error code = %d, want %d", errorCode, uint16(tt.errorCode)) |
|||
} |
|||
|
|||
// Verify message
|
|||
if tt.expectNullMsg { |
|||
// Should have null string marker (0xFF, 0xFF)
|
|||
if len(response) >= 8 && (response[6] != 0xFF || response[7] != 0xFF) { |
|||
t.Errorf("Expected null string marker, got %x %x", response[6], response[7]) |
|||
} |
|||
} else { |
|||
// Should have message length and message
|
|||
if len(response) >= 8 { |
|||
messageLen := binary.BigEndian.Uint16(response[6:8]) |
|||
if messageLen != uint16(len(tt.message)) { |
|||
t.Errorf("Message length = %d, want %d", messageLen, len(tt.message)) |
|||
} |
|||
|
|||
if len(response) >= 8+len(tt.message) { |
|||
actualMessage := string(response[8 : 8+len(tt.message)]) |
|||
if actualMessage != tt.message { |
|||
t.Errorf("Message = %q, want %q", actualMessage, tt.message) |
|||
} |
|||
} |
|||
} |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestClassifyNetworkError(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
err error |
|||
expected int16 |
|||
}{ |
|||
{"No error", nil, ErrorCodeNone}, |
|||
{"Generic error", errors.New("generic error"), ErrorCodeUnknownServerError}, |
|||
{"Connection refused", errors.New("connection refused"), ErrorCodeConnectionRefused}, |
|||
{"Connection timeout", errors.New("connection timeout"), ErrorCodeConnectionTimeout}, |
|||
{"Network timeout", &timeoutError{}, ErrorCodeRequestTimedOut}, |
|||
{"Network error", &networkError{}, ErrorCodeNetworkException}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
if got := ClassifyNetworkError(tt.err); got != tt.expected { |
|||
t.Errorf("ClassifyNetworkError() = %v, want %v", got, tt.expected) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestHandleTimeoutError(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
err error |
|||
operation string |
|||
expected int16 |
|||
}{ |
|||
{"No error", nil, "read", ErrorCodeNone}, |
|||
{"Read timeout", &timeoutError{}, "read", ErrorCodeReadTimeout}, |
|||
{"Write timeout", &timeoutError{}, "write", ErrorCodeWriteTimeout}, |
|||
{"Connect timeout", &timeoutError{}, "connect", ErrorCodeConnectionTimeout}, |
|||
{"Generic timeout", &timeoutError{}, "unknown", ErrorCodeRequestTimedOut}, |
|||
{"Non-timeout error", errors.New("other error"), "read", ErrorCodeUnknownServerError}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
if got := HandleTimeoutError(tt.err, tt.operation); got != tt.expected { |
|||
t.Errorf("HandleTimeoutError() = %v, want %v", got, tt.expected) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestDefaultTimeoutConfig(t *testing.T) { |
|||
config := DefaultTimeoutConfig() |
|||
|
|||
if config.ConnectionTimeout != 30*time.Second { |
|||
t.Errorf("ConnectionTimeout = %v, want %v", config.ConnectionTimeout, 30*time.Second) |
|||
} |
|||
if config.ReadTimeout != 10*time.Second { |
|||
t.Errorf("ReadTimeout = %v, want %v", config.ReadTimeout, 10*time.Second) |
|||
} |
|||
if config.WriteTimeout != 10*time.Second { |
|||
t.Errorf("WriteTimeout = %v, want %v", config.WriteTimeout, 10*time.Second) |
|||
} |
|||
if config.RequestTimeout != 30*time.Second { |
|||
t.Errorf("RequestTimeout = %v, want %v", config.RequestTimeout, 30*time.Second) |
|||
} |
|||
} |
|||
|
|||
func TestSafeFormatError(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
err error |
|||
expected string |
|||
}{ |
|||
{"No error", nil, ""}, |
|||
{"Generic error", errors.New("test error"), "Error: test error"}, |
|||
{"Network error", &networkError{}, "Error: network error"}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
if got := SafeFormatError(tt.err); got != tt.expected { |
|||
t.Errorf("SafeFormatError() = %q, want %q", got, tt.expected) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestGetErrorInfo_UnknownErrorCode(t *testing.T) { |
|||
unknownCode := int16(9999) |
|||
info := GetErrorInfo(unknownCode) |
|||
|
|||
if info.Code != unknownCode { |
|||
t.Errorf("Code = %d, want %d", info.Code, unknownCode) |
|||
} |
|||
if info.Name != "UNKNOWN" { |
|||
t.Errorf("Name = %s, want UNKNOWN", info.Name) |
|||
} |
|||
if info.Description != "Unknown error code" { |
|||
t.Errorf("Description = %s, want 'Unknown error code'", info.Description) |
|||
} |
|||
if info.Retriable != false { |
|||
t.Errorf("Retriable = %v, want false", info.Retriable) |
|||
} |
|||
} |
|||
|
|||
// Integration test for error handling in protocol context
|
|||
func TestErrorHandling_Integration(t *testing.T) { |
|||
// Test building various protocol error responses
|
|||
tests := []struct { |
|||
name string |
|||
apiKey uint16 |
|||
errorCode int16 |
|||
message string |
|||
}{ |
|||
{"ApiVersions error", 18, ErrorCodeUnsupportedVersion, "Version not supported"}, |
|||
{"Metadata error", 3, ErrorCodeUnknownTopicOrPartition, "Topic not found"}, |
|||
{"Produce error", 0, ErrorCodeMessageTooLarge, "Message exceeds size limit"}, |
|||
{"Fetch error", 1, ErrorCodeOffsetOutOfRange, "Offset out of range"}, |
|||
{"CreateTopics error", 19, ErrorCodeTopicAlreadyExists, "Topic already exists"}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
correlationID := uint32(12345) |
|||
|
|||
// Test basic error response
|
|||
basicResponse := BuildErrorResponse(correlationID, tt.errorCode) |
|||
if len(basicResponse) != 6 { |
|||
t.Errorf("Basic response length = %d, want 6", len(basicResponse)) |
|||
} |
|||
|
|||
// Test error response with message
|
|||
messageResponse := BuildErrorResponseWithMessage(correlationID, tt.errorCode, tt.message) |
|||
expectedMinLen := 8 + len(tt.message) // 4 (correlationID) + 2 (errorCode) + 2 (messageLen) + len(message)
|
|||
if len(messageResponse) < expectedMinLen { |
|||
t.Errorf("Message response length = %d, want at least %d", len(messageResponse), expectedMinLen) |
|||
} |
|||
|
|||
// Verify error is correctly classified
|
|||
info := GetErrorInfo(tt.errorCode) |
|||
if info.Code != tt.errorCode { |
|||
t.Errorf("Error info code = %d, want %d", info.Code, tt.errorCode) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
// Mock error types for testing

// timeoutError is a test double with Error, Timeout and Temporary methods that
// always reports itself as a timeout.
type timeoutError struct{}

// Error returns the fixed message "timeout error".
func (e *timeoutError) Error() string {
	return "timeout error"
}

// Timeout always reports true.
func (e *timeoutError) Timeout() bool {
	return true
}

// Temporary always reports true.
func (e *timeoutError) Temporary() bool {
	return true
}
|||
|
|||
// networkError is a test double with Error, Timeout and Temporary methods that
// never reports itself as a timeout.
type networkError struct{}

// Error returns the fixed message "network error".
func (e *networkError) Error() string {
	return "network error"
}

// Timeout always reports false.
func (e *networkError) Timeout() bool {
	return false
}

// Temporary always reports true.
func (e *networkError) Temporary() bool {
	return true
}
|||
|
|||
// Test timeout detection
|
|||
func TestTimeoutDetection(t *testing.T) { |
|||
// Test with context timeout
|
|||
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) |
|||
defer cancel() |
|||
|
|||
// Wait for context to timeout
|
|||
time.Sleep(2 * time.Millisecond) |
|||
|
|||
select { |
|||
case <-ctx.Done(): |
|||
err := ctx.Err() |
|||
errorCode := HandleTimeoutError(err, "context") |
|||
if errorCode != ErrorCodeRequestTimedOut { |
|||
t.Errorf("Context timeout error code = %v, want %v", errorCode, ErrorCodeRequestTimedOut) |
|||
} |
|||
default: |
|||
t.Error("Context should have timed out") |
|||
} |
|||
} |
|||
|
|||
// Benchmark error response building
|
|||
func BenchmarkBuildErrorResponse(b *testing.B) { |
|||
correlationID := uint32(12345) |
|||
errorCode := ErrorCodeUnknownServerError |
|||
|
|||
b.ResetTimer() |
|||
for i := 0; i < b.N; i++ { |
|||
BuildErrorResponse(correlationID, errorCode) |
|||
} |
|||
} |
|||
|
|||
func BenchmarkBuildErrorResponseWithMessage(b *testing.B) { |
|||
correlationID := uint32(12345) |
|||
errorCode := ErrorCodeUnknownServerError |
|||
message := "This is a test error message" |
|||
|
|||
b.ResetTimer() |
|||
for i := 0; i < b.N; i++ { |
|||
BuildErrorResponseWithMessage(correlationID, errorCode, message) |
|||
} |
|||
} |
|||
|
|||
func BenchmarkClassifyNetworkError(b *testing.B) { |
|||
err := &timeoutError{} |
|||
|
|||
b.ResetTimer() |
|||
for i := 0; i < b.N; i++ { |
|||
ClassifyNetworkError(err) |
|||
} |
|||
} |
|||
@ -0,0 +1,361 @@ |
|||
package protocol |
|||
|
|||
import (
	"encoding/binary"
	"errors"
	"fmt"
	"net"
	"time"
)
|||
|
|||
// Kafka Protocol Error Codes
// Based on Apache Kafka protocol specification
//
// All codes are int16, matching the 2-byte error code field written by
// BuildErrorResponse.
//
// NOTE(review): several numeric values below do not appear to match the
// upstream Apache Kafka error table (for example, upstream lists
// UNKNOWN_SERVER_ERROR as -1, OFFSET_OUT_OF_RANGE as 1 and CORRUPT_MESSAGE as
// 2). Confirm against the protocol documentation before relying on wire
// compatibility with real clients.
const (
	// Success
	ErrorCodeNone int16 = 0

	// General server errors
	ErrorCodeUnknownServerError           int16 = 1
	ErrorCodeOffsetOutOfRange             int16 = 2
	ErrorCodeCorruptMessage               int16 = 3 // Shares value 3 with ErrorCodeUnknownTopicOrPartition, so the two are indistinguishable
	ErrorCodeUnknownTopicOrPartition      int16 = 3
	ErrorCodeInvalidFetchSize             int16 = 4
	ErrorCodeLeaderNotAvailable           int16 = 5
	ErrorCodeNotLeaderOrFollower          int16 = 6 // Formerly NOT_LEADER_FOR_PARTITION
	ErrorCodeRequestTimedOut              int16 = 7
	ErrorCodeBrokerNotAvailable           int16 = 8
	ErrorCodeReplicaNotAvailable          int16 = 9
	ErrorCodeMessageTooLarge              int16 = 10
	ErrorCodeStaleControllerEpoch         int16 = 11
	ErrorCodeOffsetMetadataTooLarge       int16 = 12
	ErrorCodeNetworkException             int16 = 13
	ErrorCodeOffsetLoadInProgress         int16 = 14
	ErrorCodeGroupLoadInProgress          int16 = 15
	ErrorCodeNotCoordinatorForGroup       int16 = 16
	ErrorCodeNotCoordinatorForTransaction int16 = 17

	// Consumer group coordination errors
	ErrorCodeIllegalGeneration          int16 = 22
	ErrorCodeInconsistentGroupProtocol  int16 = 23
	ErrorCodeInvalidGroupID             int16 = 24
	ErrorCodeUnknownMemberID            int16 = 25
	ErrorCodeInvalidSessionTimeout      int16 = 26
	ErrorCodeRebalanceInProgress        int16 = 27
	ErrorCodeInvalidCommitOffsetSize    int16 = 28
	ErrorCodeTopicAuthorizationFailed   int16 = 29
	ErrorCodeGroupAuthorizationFailed   int16 = 30
	ErrorCodeClusterAuthorizationFailed int16 = 31
	ErrorCodeInvalidTimestamp           int16 = 32
	ErrorCodeUnsupportedSASLMechanism   int16 = 33
	ErrorCodeIllegalSASLState           int16 = 34
	ErrorCodeUnsupportedVersion         int16 = 35

	// Topic management errors
	ErrorCodeTopicAlreadyExists        int16 = 36
	ErrorCodeInvalidPartitions         int16 = 37
	ErrorCodeInvalidReplicationFactor  int16 = 38
	ErrorCodeInvalidReplicaAssignment  int16 = 39
	ErrorCodeInvalidConfig             int16 = 40
	ErrorCodeNotController             int16 = 41
	ErrorCodeInvalidRecord             int16 = 42
	ErrorCodePolicyViolation           int16 = 43
	ErrorCodeOutOfOrderSequenceNumber  int16 = 44
	ErrorCodeDuplicateSequenceNumber   int16 = 45
	ErrorCodeInvalidProducerEpoch      int16 = 46
	ErrorCodeInvalidTxnState           int16 = 47
	ErrorCodeInvalidProducerIDMapping  int16 = 48
	ErrorCodeInvalidTransactionTimeout int16 = 49
	ErrorCodeConcurrentTransactions    int16 = 50

	// Connection and timeout errors
	// NOTE(review): these four are project-defined rather than Kafka codes;
	// verify the values do not collide with codes upstream assigns.
	ErrorCodeConnectionRefused int16 = 60 // Custom for connection issues
	ErrorCodeConnectionTimeout int16 = 61 // Custom for connection timeouts
	ErrorCodeReadTimeout       int16 = 62 // Custom for read timeouts
	ErrorCodeWriteTimeout      int16 = 63 // Custom for write timeouts

	// Consumer group specific errors
	ErrorCodeMemberIDRequired     int16 = 79
	ErrorCodeFencedInstanceID     int16 = 82
	ErrorCodeGroupMaxSizeReached  int16 = 84
	ErrorCodeUnstableOffsetCommit int16 = 95
)
|||
|
|||
// ErrorInfo contains metadata about a Kafka error.
// Values are stored in KafkaErrors and returned by GetErrorInfo.
type ErrorInfo struct {
	Code        int16  // Numeric error code (one of the ErrorCode* constants for known errors)
	Name        string // Upper-case identifier, e.g. "REQUEST_TIMED_OUT"
	Description string // Short human-readable description
	Retriable   bool   // Value reported by IsRetriableError for this code
}
|||
|
|||
// KafkaErrors maps error codes to their metadata
|
|||
var KafkaErrors = map[int16]ErrorInfo{ |
|||
ErrorCodeNone: { |
|||
Code: ErrorCodeNone, Name: "NONE", Description: "No error", Retriable: false, |
|||
}, |
|||
ErrorCodeUnknownServerError: { |
|||
Code: ErrorCodeUnknownServerError, Name: "UNKNOWN_SERVER_ERROR", |
|||
Description: "Unknown server error", Retriable: true, |
|||
}, |
|||
ErrorCodeOffsetOutOfRange: { |
|||
Code: ErrorCodeOffsetOutOfRange, Name: "OFFSET_OUT_OF_RANGE", |
|||
Description: "Offset out of range", Retriable: false, |
|||
}, |
|||
ErrorCodeUnknownTopicOrPartition: { |
|||
Code: ErrorCodeUnknownTopicOrPartition, Name: "UNKNOWN_TOPIC_OR_PARTITION", |
|||
Description: "Topic or partition does not exist", Retriable: false, |
|||
}, |
|||
ErrorCodeInvalidFetchSize: { |
|||
Code: ErrorCodeInvalidFetchSize, Name: "INVALID_FETCH_SIZE", |
|||
Description: "Invalid fetch size", Retriable: false, |
|||
}, |
|||
ErrorCodeLeaderNotAvailable: { |
|||
Code: ErrorCodeLeaderNotAvailable, Name: "LEADER_NOT_AVAILABLE", |
|||
Description: "Leader not available", Retriable: true, |
|||
}, |
|||
ErrorCodeNotLeaderOrFollower: { |
|||
Code: ErrorCodeNotLeaderOrFollower, Name: "NOT_LEADER_OR_FOLLOWER", |
|||
Description: "Not leader or follower", Retriable: true, |
|||
}, |
|||
ErrorCodeRequestTimedOut: { |
|||
Code: ErrorCodeRequestTimedOut, Name: "REQUEST_TIMED_OUT", |
|||
Description: "Request timed out", Retriable: true, |
|||
}, |
|||
ErrorCodeBrokerNotAvailable: { |
|||
Code: ErrorCodeBrokerNotAvailable, Name: "BROKER_NOT_AVAILABLE", |
|||
Description: "Broker not available", Retriable: true, |
|||
}, |
|||
ErrorCodeMessageTooLarge: { |
|||
Code: ErrorCodeMessageTooLarge, Name: "MESSAGE_TOO_LARGE", |
|||
Description: "Message size exceeds limit", Retriable: false, |
|||
}, |
|||
ErrorCodeOffsetMetadataTooLarge: { |
|||
Code: ErrorCodeOffsetMetadataTooLarge, Name: "OFFSET_METADATA_TOO_LARGE", |
|||
Description: "Offset metadata too large", Retriable: false, |
|||
}, |
|||
ErrorCodeNetworkException: { |
|||
Code: ErrorCodeNetworkException, Name: "NETWORK_EXCEPTION", |
|||
Description: "Network error", Retriable: true, |
|||
}, |
|||
ErrorCodeOffsetLoadInProgress: { |
|||
Code: ErrorCodeOffsetLoadInProgress, Name: "OFFSET_LOAD_IN_PROGRESS", |
|||
Description: "Offset load in progress", Retriable: true, |
|||
}, |
|||
ErrorCodeNotCoordinatorForGroup: { |
|||
Code: ErrorCodeNotCoordinatorForGroup, Name: "NOT_COORDINATOR_FOR_GROUP", |
|||
Description: "Not coordinator for group", Retriable: true, |
|||
}, |
|||
ErrorCodeInvalidGroupID: { |
|||
Code: ErrorCodeInvalidGroupID, Name: "INVALID_GROUP_ID", |
|||
Description: "Invalid group ID", Retriable: false, |
|||
}, |
|||
ErrorCodeUnknownMemberID: { |
|||
Code: ErrorCodeUnknownMemberID, Name: "UNKNOWN_MEMBER_ID", |
|||
Description: "Unknown member ID", Retriable: false, |
|||
}, |
|||
ErrorCodeInvalidSessionTimeout: { |
|||
Code: ErrorCodeInvalidSessionTimeout, Name: "INVALID_SESSION_TIMEOUT", |
|||
Description: "Invalid session timeout", Retriable: false, |
|||
}, |
|||
ErrorCodeRebalanceInProgress: { |
|||
Code: ErrorCodeRebalanceInProgress, Name: "REBALANCE_IN_PROGRESS", |
|||
Description: "Group rebalance in progress", Retriable: true, |
|||
}, |
|||
ErrorCodeInvalidCommitOffsetSize: { |
|||
Code: ErrorCodeInvalidCommitOffsetSize, Name: "INVALID_COMMIT_OFFSET_SIZE", |
|||
Description: "Invalid commit offset size", Retriable: false, |
|||
}, |
|||
ErrorCodeTopicAuthorizationFailed: { |
|||
Code: ErrorCodeTopicAuthorizationFailed, Name: "TOPIC_AUTHORIZATION_FAILED", |
|||
Description: "Topic authorization failed", Retriable: false, |
|||
}, |
|||
ErrorCodeGroupAuthorizationFailed: { |
|||
Code: ErrorCodeGroupAuthorizationFailed, Name: "GROUP_AUTHORIZATION_FAILED", |
|||
Description: "Group authorization failed", Retriable: false, |
|||
}, |
|||
ErrorCodeUnsupportedVersion: { |
|||
Code: ErrorCodeUnsupportedVersion, Name: "UNSUPPORTED_VERSION", |
|||
Description: "Unsupported version", Retriable: false, |
|||
}, |
|||
ErrorCodeTopicAlreadyExists: { |
|||
Code: ErrorCodeTopicAlreadyExists, Name: "TOPIC_ALREADY_EXISTS", |
|||
Description: "Topic already exists", Retriable: false, |
|||
}, |
|||
ErrorCodeInvalidPartitions: { |
|||
Code: ErrorCodeInvalidPartitions, Name: "INVALID_PARTITIONS", |
|||
Description: "Invalid number of partitions", Retriable: false, |
|||
}, |
|||
ErrorCodeInvalidReplicationFactor: { |
|||
Code: ErrorCodeInvalidReplicationFactor, Name: "INVALID_REPLICATION_FACTOR", |
|||
Description: "Invalid replication factor", Retriable: false, |
|||
}, |
|||
ErrorCodeInvalidRecord: { |
|||
Code: ErrorCodeInvalidRecord, Name: "INVALID_RECORD", |
|||
Description: "Invalid record", Retriable: false, |
|||
}, |
|||
ErrorCodeConnectionRefused: { |
|||
Code: ErrorCodeConnectionRefused, Name: "CONNECTION_REFUSED", |
|||
Description: "Connection refused", Retriable: true, |
|||
}, |
|||
ErrorCodeConnectionTimeout: { |
|||
Code: ErrorCodeConnectionTimeout, Name: "CONNECTION_TIMEOUT", |
|||
Description: "Connection timeout", Retriable: true, |
|||
}, |
|||
ErrorCodeReadTimeout: { |
|||
Code: ErrorCodeReadTimeout, Name: "READ_TIMEOUT", |
|||
Description: "Read operation timeout", Retriable: true, |
|||
}, |
|||
ErrorCodeWriteTimeout: { |
|||
Code: ErrorCodeWriteTimeout, Name: "WRITE_TIMEOUT", |
|||
Description: "Write operation timeout", Retriable: true, |
|||
}, |
|||
ErrorCodeIllegalGeneration: { |
|||
Code: ErrorCodeIllegalGeneration, Name: "ILLEGAL_GENERATION", |
|||
Description: "Illegal generation", Retriable: false, |
|||
}, |
|||
ErrorCodeInconsistentGroupProtocol: { |
|||
Code: ErrorCodeInconsistentGroupProtocol, Name: "INCONSISTENT_GROUP_PROTOCOL", |
|||
Description: "Inconsistent group protocol", Retriable: false, |
|||
}, |
|||
ErrorCodeMemberIDRequired: { |
|||
Code: ErrorCodeMemberIDRequired, Name: "MEMBER_ID_REQUIRED", |
|||
Description: "Member ID required", Retriable: false, |
|||
}, |
|||
ErrorCodeFencedInstanceID: { |
|||
Code: ErrorCodeFencedInstanceID, Name: "FENCED_INSTANCE_ID", |
|||
Description: "Instance ID fenced", Retriable: false, |
|||
}, |
|||
ErrorCodeGroupMaxSizeReached: { |
|||
Code: ErrorCodeGroupMaxSizeReached, Name: "GROUP_MAX_SIZE_REACHED", |
|||
Description: "Group max size reached", Retriable: false, |
|||
}, |
|||
ErrorCodeUnstableOffsetCommit: { |
|||
Code: ErrorCodeUnstableOffsetCommit, Name: "UNSTABLE_OFFSET_COMMIT", |
|||
Description: "Offset commit during rebalance", Retriable: true, |
|||
}, |
|||
} |
|||
|
|||
// GetErrorInfo returns error information for the given error code
|
|||
func GetErrorInfo(code int16) ErrorInfo { |
|||
if info, exists := KafkaErrors[code]; exists { |
|||
return info |
|||
} |
|||
return ErrorInfo{ |
|||
Code: code, Name: "UNKNOWN", Description: "Unknown error code", Retriable: false, |
|||
} |
|||
} |
|||
|
|||
// IsRetriableError returns true if the error is retriable
|
|||
func IsRetriableError(code int16) bool { |
|||
return GetErrorInfo(code).Retriable |
|||
} |
|||
|
|||
// BuildErrorResponse builds a 6-byte error response: the correlation ID as a
// big-endian uint32 followed by the error code as a big-endian 16-bit value.
func BuildErrorResponse(correlationID uint32, errorCode int16) []byte {
	// Length 6 for the two fields; capacity 8 leaves room for a 2-byte
	// trailer to be appended without reallocating.
	response := make([]byte, 6, 8)

	binary.BigEndian.PutUint32(response[0:4], correlationID)
	binary.BigEndian.PutUint16(response[4:6], uint16(errorCode))

	return response
}
|||
|
|||
// BuildErrorResponseWithMessage builds a Kafka error response with error message
|
|||
func BuildErrorResponseWithMessage(correlationID uint32, errorCode int16, message string) []byte { |
|||
response := BuildErrorResponse(correlationID, errorCode) |
|||
|
|||
// Error message (2 bytes length + message)
|
|||
if message == "" { |
|||
response = append(response, 0xFF, 0xFF) // Null string
|
|||
} else { |
|||
messageLen := uint16(len(message)) |
|||
messageLenBytes := make([]byte, 2) |
|||
binary.BigEndian.PutUint16(messageLenBytes, messageLen) |
|||
response = append(response, messageLenBytes...) |
|||
response = append(response, []byte(message)...) |
|||
} |
|||
|
|||
return response |
|||
} |
|||
|
|||
// ClassifyNetworkError classifies network errors into appropriate Kafka error codes
|
|||
func ClassifyNetworkError(err error) int16 { |
|||
if err == nil { |
|||
return ErrorCodeNone |
|||
} |
|||
|
|||
// Check for network errors
|
|||
if netErr, ok := err.(net.Error); ok { |
|||
if netErr.Timeout() { |
|||
return ErrorCodeRequestTimedOut |
|||
} |
|||
return ErrorCodeNetworkException |
|||
} |
|||
|
|||
// Check for specific error types
|
|||
switch err.Error() { |
|||
case "connection refused": |
|||
return ErrorCodeConnectionRefused |
|||
case "connection timeout": |
|||
return ErrorCodeConnectionTimeout |
|||
default: |
|||
return ErrorCodeUnknownServerError |
|||
} |
|||
} |
|||
|
|||
// TimeoutConfig holds timeout configuration for connections and operations.
// DefaultTimeoutConfig returns a value populated with the defaults.
type TimeoutConfig struct {
	ConnectionTimeout time.Duration // Timeout for establishing connections
	ReadTimeout       time.Duration // Timeout for read operations
	WriteTimeout      time.Duration // Timeout for write operations
	RequestTimeout    time.Duration // Overall request timeout
}
|||
|
|||
// DefaultTimeoutConfig returns default timeout configuration
|
|||
func DefaultTimeoutConfig() TimeoutConfig { |
|||
return TimeoutConfig{ |
|||
ConnectionTimeout: 30 * time.Second, |
|||
ReadTimeout: 10 * time.Second, |
|||
WriteTimeout: 10 * time.Second, |
|||
RequestTimeout: 30 * time.Second, |
|||
} |
|||
} |
|||
|
|||
// HandleTimeoutError handles timeout errors and returns appropriate error code
|
|||
func HandleTimeoutError(err error, operation string) int16 { |
|||
if err == nil { |
|||
return ErrorCodeNone |
|||
} |
|||
|
|||
if netErr, ok := err.(net.Error); ok && netErr.Timeout() { |
|||
switch operation { |
|||
case "read": |
|||
return ErrorCodeReadTimeout |
|||
case "write": |
|||
return ErrorCodeWriteTimeout |
|||
case "connect": |
|||
return ErrorCodeConnectionTimeout |
|||
default: |
|||
return ErrorCodeRequestTimedOut |
|||
} |
|||
} |
|||
|
|||
return ClassifyNetworkError(err) |
|||
} |
|||
|
|||
// SafeFormatError formats err as "Error: <err>" and returns an empty string
// for nil.
//
// Despite the name, no sanitisation is applied yet: the full error text is
// returned, so callers must not assume sensitive details have been removed.
func SafeFormatError(err error) string {
	var formatted string
	if err != nil {
		// The full error is kept for debugging; sanitising for production
		// is still to be done.
		formatted = fmt.Sprintf("Error: %v", err)
	}
	return formatted
}
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue