diff --git a/seaweed-volume/Cargo.toml b/seaweed-volume/Cargo.toml
index ca48e708a..520894811 100644
--- a/seaweed-volume/Cargo.toml
+++ b/seaweed-volume/Cargo.toml
@@ -4,6 +4,12 @@ version = "0.1.0"
 edition = "2021"
 description = "SeaweedFS Volume Server — Rust implementation"
 
+[features]
+default = ["5bytes"]
+# Enable 5-byte offset mode for 8TB max volume size (matches Go production builds with -tags 5BytesOffset).
+# Without this feature, uses 4-byte offsets with 32GB max volume size.
+5bytes = []
+
 [dependencies]
 # Async runtime
 tokio = { version = "1", features = ["full"] }
diff --git a/seaweed-volume/src/storage/needle_map.rs b/seaweed-volume/src/storage/needle_map.rs
index 3ab48106c..600976a30 100644
--- a/seaweed-volume/src/storage/needle_map.rs
+++ b/seaweed-volume/src/storage/needle_map.rs
@@ -31,20 +31,23 @@ pub struct NeedleValue {
     pub size: Size,
 }
 
-/// Pack an (Offset, Size) pair into 8 bytes for redb storage.
-/// Layout: [offset 4 bytes big-endian] [size 4 bytes big-endian]
-fn pack_needle_value(nv: &NeedleValue) -> [u8; 8] {
-    let mut buf = [0u8; 8];
-    nv.offset.to_bytes(&mut buf[..4]);
-    nv.size.to_bytes(&mut buf[4..8]);
+/// Packed size of a NeedleValue in redb storage: OFFSET_SIZE + SIZE_SIZE.
+const PACKED_NEEDLE_VALUE_SIZE: usize = OFFSET_SIZE + SIZE_SIZE;
+
+/// Pack an (Offset, Size) pair into bytes for redb storage.
+/// Layout: [offset OFFSET_SIZE bytes] [size 4 bytes big-endian]
+fn pack_needle_value(nv: &NeedleValue) -> [u8; PACKED_NEEDLE_VALUE_SIZE] {
+    let mut buf = [0u8; PACKED_NEEDLE_VALUE_SIZE];
+    nv.offset.to_bytes(&mut buf[..OFFSET_SIZE]);
+    nv.size.to_bytes(&mut buf[OFFSET_SIZE..]);
     buf
 }
 
-/// Unpack 8 bytes from redb storage into (Offset, Size).
-fn unpack_needle_value(bytes: &[u8; 8]) -> NeedleValue {
+/// Unpack bytes from redb storage into (Offset, Size).
+fn unpack_needle_value(bytes: &[u8; PACKED_NEEDLE_VALUE_SIZE]) -> NeedleValue {
     NeedleValue {
-        offset: Offset::from_bytes(&bytes[..4]),
-        size: Size::from_bytes(&bytes[4..8]),
+        offset: Offset::from_bytes(&bytes[..OFFSET_SIZE]),
+        size: Size::from_bytes(&bytes[OFFSET_SIZE..]),
     }
 }
 
@@ -422,8 +425,8 @@ impl RedbNeedleMap {
             })?;
             let key = NeedleId(key_guard.value());
             let bytes: &[u8] = val_guard.value();
-            if bytes.len() == 8 {
-                let mut arr = [0u8; 8];
+            if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                 arr.copy_from_slice(bytes);
                 let nv = unpack_needle_value(&arr);
                 self.metric.maybe_set_max_file_key(key);
@@ -569,8 +572,8 @@ impl RedbNeedleMap {
         match table.get(key_u64) {
             Ok(Some(guard)) => {
                 let bytes: &[u8] = guard.value();
-                if bytes.len() == 8 {
-                    let mut arr = [0u8; 8];
+                if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                    let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                     arr.copy_from_slice(bytes);
                     Ok(Some(unpack_needle_value(&arr)))
                 } else {
@@ -693,8 +696,8 @@ impl RedbNeedleMap {
         match table.get(key_u64) {
             Ok(Some(guard)) => {
                 let bytes: &[u8] = guard.value();
-                if bytes.len() == 8 {
-                    let mut arr = [0u8; 8];
+                if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                    let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                     arr.copy_from_slice(bytes);
                     Ok(Some(unpack_needle_value(&arr)))
                 } else {
@@ -816,8 +819,8 @@ impl RedbNeedleMap {
             })?;
            let key_u64: u64 = key_guard.value();
             let bytes: &[u8] = val_guard.value();
-            if bytes.len() == 8 {
-                let mut arr = [0u8; 8];
+            if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                 arr.copy_from_slice(bytes);
                 let nv = unpack_needle_value(&arr);
                 if nv.size.is_valid() {
@@ -847,8 +850,8 @@ impl RedbNeedleMap {
             let (key_guard, val_guard) = entry.map_err(|e| format!("redb iter next: {}", e))?;
             let key_u64: u64 = key_guard.value();
             let bytes: &[u8] = val_guard.value();
-            if bytes.len() == 8 {
-                let mut arr = [0u8; 8];
+            if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                 arr.copy_from_slice(bytes);
                 let nv = unpack_needle_value(&arr);
                 f(NeedleId(key_u64), &nv)?;
@@ -876,8 +879,8 @@ impl RedbNeedleMap {
             if let Ok((key_guard, val_guard)) = entry {
                 let key_u64: u64 = key_guard.value();
                 let bytes: &[u8] = val_guard.value();
-                if bytes.len() == 8 {
-                    let mut arr = [0u8; 8];
+                if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
+                    let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
                     arr.copy_from_slice(bytes);
                     let nv = unpack_needle_value(&arr);
                     result.push((NeedleId(key_u64), nv));
diff --git a/seaweed-volume/src/storage/needle_map/compact_map.rs b/seaweed-volume/src/storage/needle_map/compact_map.rs
index dc4552df7..411936dfa 100644
--- a/seaweed-volume/src/storage/needle_map/compact_map.rs
+++ b/seaweed-volume/src/storage/needle_map/compact_map.rs
@@ -249,8 +249,7 @@ mod tests {
     use super::*;
 
     fn offset(v: u32) -> Offset {
-        let bytes = v.to_be_bytes();
-        Offset::from_bytes(&bytes)
+        Offset::from_actual_offset(v as i64 * NEEDLE_PADDING_SIZE as i64)
     }
 
     #[test]
diff --git a/seaweed-volume/src/storage/types.rs b/seaweed-volume/src/storage/types.rs
index acca5f517..f33bfde0f 100644
--- a/seaweed-volume/src/storage/types.rs
+++ b/seaweed-volume/src/storage/types.rs
@@ -12,18 +12,28 @@ use std::fmt;
 pub const NEEDLE_ID_SIZE: usize = 8;
 pub const NEEDLE_ID_EMPTY: u64 = 0;
 pub const COOKIE_SIZE: usize = 4;
-pub const OFFSET_SIZE: usize = 4; // 4-byte offset (32GB max volume, matching Go default build)
 pub const SIZE_SIZE: usize = 4;
 pub const NEEDLE_HEADER_SIZE: usize = COOKIE_SIZE + NEEDLE_ID_SIZE + SIZE_SIZE; // 16
 pub const DATA_SIZE_SIZE: usize = 4;
-pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE; // 16
 pub const TIMESTAMP_SIZE: usize = 8;
 pub const NEEDLE_PADDING_SIZE: usize = 8;
 pub const NEEDLE_CHECKSUM_SIZE: usize = 4;
 
-/// Maximum possible volume size with 4-byte offset: 32GB
-/// Formula: 4 * 1024 * 1024 * 1024 * 8
-pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8;
+/// 5-byte offset mode (matching Go production builds with `-tags 5BytesOffset`).
+/// Max volume size: 8TB. Index entry: 17 bytes (8 + 5 + 4).
+#[cfg(feature = "5bytes")]
+pub const OFFSET_SIZE: usize = 5;
+#[cfg(feature = "5bytes")]
+pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8 * 256; // 8TB
+
+/// 4-byte offset mode (matching Go default build without `5BytesOffset`).
+/// Max volume size: 32GB. Index entry: 16 bytes (8 + 4 + 4).
+#[cfg(not(feature = "5bytes"))]
+pub const OFFSET_SIZE: usize = 4;
+#[cfg(not(feature = "5bytes"))]
+pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8; // 32GB
+
+pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE;
 
 // ============================================================================
 // NeedleId
@@ -175,22 +185,28 @@ impl From<Size> for i32 {
 }
 
 // ============================================================================
-// Offset (5 bytes)
+// Offset
 // ============================================================================
 
-/// 4-byte offset encoding for needle positions in .dat files (matching Go default build).
+/// Offset encoding for needle positions in .dat files.
+///
+/// The offset is stored divided by NEEDLE_PADDING_SIZE (8).
 ///
-/// The offset is stored divided by NEEDLE_PADDING_SIZE (8), so 4 bytes can
-/// address up to 32GB. The on-disk byte layout in .idx files is:
-/// [b3][b2][b1][b0] (big-endian 4 bytes)
+/// With `5bytes` feature (default, matching Go production builds):
+/// 5 bytes can address up to 8TB.
+/// On-disk layout: [b3][b2][b1][b0][b4] (big-endian 4 bytes + 1 high byte)
 ///
-/// actual_offset = stored_value * 8
+/// Without `5bytes` feature (matching Go default build):
+/// 4 bytes can address up to 32GB.
+/// On-disk layout: [b3][b2][b1][b0] (big-endian 4 bytes)
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
 pub struct Offset {
     pub b0: u8,
     pub b1: u8,
     pub b2: u8,
     pub b3: u8,
+    #[cfg(feature = "5bytes")]
+    pub b4: u8,
 }
 
 impl Offset {
@@ -200,6 +216,8 @@ impl Offset {
             + (self.b1 as i64) * 256
             + (self.b2 as i64) * 65536
            + (self.b3 as i64) * 16777216;
+        #[cfg(feature = "5bytes")]
+        let stored = stored + (self.b4 as i64) * 4294967296; // 1 << 32
         stored * NEEDLE_PADDING_SIZE as i64
     }
 
@@ -211,20 +229,27 @@ impl Offset {
             b1: (smaller >> 8) as u8,
             b2: (smaller >> 16) as u8,
             b3: (smaller >> 24) as u8,
+            #[cfg(feature = "5bytes")]
+            b4: (smaller >> 32) as u8,
         }
     }
 
-    /// Serialize to 4 bytes in the .idx file format.
-    /// Layout: [b3][b2][b1][b0] (big-endian)
+    /// Serialize to bytes in the .idx file format.
+    /// 5-byte layout: [b3][b2][b1][b0][b4]
+    /// 4-byte layout: [b3][b2][b1][b0]
     pub fn to_bytes(&self, bytes: &mut [u8]) {
        assert!(bytes.len() >= OFFSET_SIZE);
         bytes[0] = self.b3;
         bytes[1] = self.b2;
         bytes[2] = self.b1;
         bytes[3] = self.b0;
+        #[cfg(feature = "5bytes")]
+        {
+            bytes[4] = self.b4;
+        }
     }
 
-    /// Deserialize from 4 bytes in the .idx file format.
+    /// Deserialize from bytes in the .idx file format.
     pub fn from_bytes(bytes: &[u8]) -> Self {
         assert!(bytes.len() >= OFFSET_SIZE);
         Offset {
@@ -232,11 +257,20 @@ impl Offset {
             b2: bytes[1],
             b1: bytes[2],
             b0: bytes[3],
+            #[cfg(feature = "5bytes")]
+            b4: bytes[4],
         }
     }
 
     pub fn is_zero(&self) -> bool {
-        self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0
+        #[cfg(feature = "5bytes")]
+        {
+            self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 && self.b4 == 0
+        }
+        #[cfg(not(feature = "5bytes"))]
+        {
+            self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0
+        }
     }
 }
 
@@ -483,13 +517,32 @@ mod tests {
 
     #[test]
     fn test_offset_max() {
-        // Max 4-byte stored value = 2^32 - 1
-        let max_stored: i64 = (1i64 << 32) - 1;
+        // Max stored value depends on offset size
+        #[cfg(feature = "5bytes")]
+        let max_stored: i64 = (1i64 << 40) - 1; // 5-byte max
+        #[cfg(not(feature = "5bytes"))]
+        let max_stored: i64 = (1i64 << 32) - 1; // 4-byte max
         let max_actual = max_stored * NEEDLE_PADDING_SIZE as i64;
         let offset = Offset::from_actual_offset(max_actual);
         assert_eq!(offset.to_actual_offset(), max_actual);
     }
 
+    #[test]
+    fn test_offset_size_constants() {
+        #[cfg(feature = "5bytes")]
+        {
+            assert_eq!(OFFSET_SIZE, 5);
+            assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 17); // 8 + 5 + 4
+            assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8 * 256); // 8TB
+        }
+        #[cfg(not(feature = "5bytes"))]
+        {
+            assert_eq!(OFFSET_SIZE, 4);
+            assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 16); // 8 + 4 + 4
+            assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8); // 32GB
+        }
+    }
+
     #[test]
     fn test_idx_entry_round_trip() {
         let key = NeedleId(0xdeadbeef12345678);
diff --git a/seaweed-volume/src/storage/volume.rs b/seaweed-volume/src/storage/volume.rs
index f1bf8b03b..814ad0577 100644
--- a/seaweed-volume/src/storage/volume.rs
+++ b/seaweed-volume/src/storage/volume.rs
@@ -768,6 +768,30 @@ impl Volume {
         n: &mut Needle,
         offset: i64,
         size: Size,
+    ) -> Result<(), VolumeError> {
+        match self.read_needle_blob_and_parse(n, offset, size) {
+            Ok(()) => Ok(()),
+            #[cfg(not(feature = "5bytes"))]
+            Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. }))
+                if o < MAX_POSSIBLE_VOLUME_SIZE as i64 =>
+            {
+                // Double-read: in 4-byte offset mode, the actual data may be
+                // beyond 32GB due to offset wrapping. Retry at offset + 32GB.
+                self.read_needle_blob_and_parse(
+                    n,
+                    offset + MAX_POSSIBLE_VOLUME_SIZE as i64,
+                    size,
+                )
+            }
+            Err(e) => Err(e),
+        }
+    }
+
+    fn read_needle_blob_and_parse(
+        &self,
+        n: &mut Needle,
+        offset: i64,
+        size: Size,
     ) -> Result<(), VolumeError> {
         let dat_file = self.dat_file.as_ref().ok_or_else(|| {
             VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open"))
@@ -857,24 +881,40 @@ impl Volume {
             VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open"))
         })?;
 
-        let offset = nv.offset.to_actual_offset();
+        #[cfg_attr(feature = "5bytes", allow(unused_mut))]
+        let mut offset = nv.offset.to_actual_offset();
         let version = self.version();
         let actual_size = get_actual_size(read_size, version);
 
         // Read the full needle bytes (including data) for metadata parsing.
         // We use read_bytes_meta_only which skips copying the data payload.
-        let mut buf = vec![0u8; actual_size as usize];
-        #[cfg(unix)]
-        {
-            use std::os::unix::fs::FileExt;
-            dat_file.read_exact_at(&mut buf, offset as u64)?;
-        }
-        #[cfg(windows)]
-        {
-            read_exact_at(dat_file, &mut buf, offset as u64)?;
-        }
+        #[cfg_attr(feature = "5bytes", allow(unused_mut))]
+        let mut read_and_parse = |off: i64| -> Result<(), VolumeError> {
+            let mut buf = vec![0u8; actual_size as usize];
+            #[cfg(unix)]
+            {
+                use std::os::unix::fs::FileExt;
+                dat_file.read_exact_at(&mut buf, off as u64)?;
+            }
+            #[cfg(windows)]
+            {
+                read_exact_at(dat_file, &mut buf, off as u64)?;
+            }
+            n.read_bytes_meta_only(&mut buf, off, read_size, version)?;
+            Ok(())
+        };
 
-        n.read_bytes_meta_only(&mut buf, offset, read_size, version)?;
+        match read_and_parse(offset) {
+            Ok(()) => {}
+            #[cfg(not(feature = "5bytes"))]
+            Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. }))
+                if o < MAX_POSSIBLE_VOLUME_SIZE as i64 =>
+            {
+                offset += MAX_POSSIBLE_VOLUME_SIZE as i64;
+                read_and_parse(offset)?;
+            }
+            Err(e) => return Err(e),
+        }
 
         // TTL expiry check
         if n.has_ttl() {
diff --git a/seaweed-volume/src/version.rs b/seaweed-volume/src/version.rs
index 73f0ff576..ddb7bed1a 100644
--- a/seaweed-volume/src/version.rs
+++ b/seaweed-volume/src/version.rs
@@ -2,7 +2,10 @@
 
 use std::sync::OnceLock;
 
-const SIZE_LIMIT: &str = "30GB"; // Go default build (!5BytesOffset)
+#[cfg(feature = "5bytes")]
+const SIZE_LIMIT: &str = "8000GB"; // Matches Go production builds (5BytesOffset)
+#[cfg(not(feature = "5bytes"))]
+const SIZE_LIMIT: &str = "30GB"; // Matches Go default build (!5BytesOffset)
 
 pub fn size_limit() -> &'static str {
     SIZE_LIMIT