Browse Source

feat: add 5-byte offset support for 8TB max volume size

Adds a `5bytes` Cargo feature (enabled by default) matching Go's
production `-tags 5BytesOffset` build. With this feature:
- Offset uses 5 bytes (b0-b4), addressing up to 8TB per volume
- MAX_POSSIBLE_VOLUME_SIZE = 8TB, SIZE_LIMIT = "8000GB"
- .idx entry size = 17 bytes (8 + 5 + 4)

Without the feature (cargo build --no-default-features):
- Offset uses 4 bytes (b0-b3), addressing up to 32GB per volume
- Matches Go's default non-production build
rust-volume-server
Chris Lu 17 hours ago
parent
commit
db7886c886
  1. 6
      seaweed-volume/Cargo.toml
  2. 47
      seaweed-volume/src/storage/needle_map.rs
  3. 3
      seaweed-volume/src/storage/needle_map/compact_map.rs
  4. 87
      seaweed-volume/src/storage/types.rs
  5. 64
      seaweed-volume/src/storage/volume.rs
  6. 5
      seaweed-volume/src/version.rs

6
seaweed-volume/Cargo.toml

@ -4,6 +4,12 @@ version = "0.1.0"
edition = "2021"
description = "SeaweedFS Volume Server — Rust implementation"
[features]
default = ["5bytes"]
# Enable 5-byte offset mode for 8TB max volume size (matches Go production builds with -tags 5BytesOffset).
# Without this feature, uses 4-byte offsets with 32GB max volume size.
5bytes = []
[dependencies]
# Async runtime
tokio = { version = "1", features = ["full"] }

47
seaweed-volume/src/storage/needle_map.rs

@ -31,20 +31,23 @@ pub struct NeedleValue {
pub size: Size,
}
/// Pack an (Offset, Size) pair into 8 bytes for redb storage.
/// Layout: [offset 4 bytes big-endian] [size 4 bytes big-endian]
fn pack_needle_value(nv: &NeedleValue) -> [u8; 8] {
let mut buf = [0u8; 8];
nv.offset.to_bytes(&mut buf[..4]);
nv.size.to_bytes(&mut buf[4..8]);
/// Packed size of a NeedleValue in redb storage: OFFSET_SIZE + SIZE_SIZE.
const PACKED_NEEDLE_VALUE_SIZE: usize = OFFSET_SIZE + SIZE_SIZE;
/// Pack an (Offset, Size) pair into bytes for redb storage.
/// Layout: [offset, OFFSET_SIZE bytes] followed by [size, SIZE_SIZE bytes big-endian].
fn pack_needle_value(nv: &NeedleValue) -> [u8; PACKED_NEEDLE_VALUE_SIZE] {
    let mut packed = [0u8; PACKED_NEEDLE_VALUE_SIZE];
    // Split the buffer once so each field serializes into its own region.
    let (offset_part, size_part) = packed.split_at_mut(OFFSET_SIZE);
    nv.offset.to_bytes(offset_part);
    nv.size.to_bytes(size_part);
    packed
}
/// Unpack 8 bytes from redb storage into (Offset, Size).
fn unpack_needle_value(bytes: &[u8; 8]) -> NeedleValue {
/// Unpack bytes from redb storage into (Offset, Size).
/// Inverse of `pack_needle_value`: offset bytes first, then size bytes.
fn unpack_needle_value(bytes: &[u8; PACKED_NEEDLE_VALUE_SIZE]) -> NeedleValue {
    let (offset_part, size_part) = bytes.split_at(OFFSET_SIZE);
    NeedleValue {
        offset: Offset::from_bytes(offset_part),
        size: Size::from_bytes(size_part),
    }
}
@ -422,8 +425,8 @@ impl RedbNeedleMap {
})?;
let key = NeedleId(key_guard.value());
let bytes: &[u8] = val_guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
let nv = unpack_needle_value(&arr);
self.metric.maybe_set_max_file_key(key);
@ -569,8 +572,8 @@ impl RedbNeedleMap {
match table.get(key_u64) {
Ok(Some(guard)) => {
let bytes: &[u8] = guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
Ok(Some(unpack_needle_value(&arr)))
} else {
@ -693,8 +696,8 @@ impl RedbNeedleMap {
match table.get(key_u64) {
Ok(Some(guard)) => {
let bytes: &[u8] = guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
Ok(Some(unpack_needle_value(&arr)))
} else {
@ -816,8 +819,8 @@ impl RedbNeedleMap {
})?;
let key_u64: u64 = key_guard.value();
let bytes: &[u8] = val_guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
let nv = unpack_needle_value(&arr);
if nv.size.is_valid() {
@ -847,8 +850,8 @@ impl RedbNeedleMap {
let (key_guard, val_guard) = entry.map_err(|e| format!("redb iter next: {}", e))?;
let key_u64: u64 = key_guard.value();
let bytes: &[u8] = val_guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
let nv = unpack_needle_value(&arr);
f(NeedleId(key_u64), &nv)?;
@ -876,8 +879,8 @@ impl RedbNeedleMap {
if let Ok((key_guard, val_guard)) = entry {
let key_u64: u64 = key_guard.value();
let bytes: &[u8] = val_guard.value();
if bytes.len() == 8 {
let mut arr = [0u8; 8];
if bytes.len() == PACKED_NEEDLE_VALUE_SIZE {
let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE];
arr.copy_from_slice(bytes);
let nv = unpack_needle_value(&arr);
result.push((NeedleId(key_u64), nv));

3
seaweed-volume/src/storage/needle_map/compact_map.rs

@ -249,8 +249,7 @@ mod tests {
use super::*;
// Test helper: build an Offset whose *stored* (pre-division) value is `v`.
// `from_actual_offset` expects the byte offset, i.e. stored * NEEDLE_PADDING_SIZE.
// NOTE: the diff view interleaved the old `Offset::from_bytes` body with the
// new one, leaving two trailing expressions; this is the post-diff version.
fn offset(v: u32) -> Offset {
    Offset::from_actual_offset(v as i64 * NEEDLE_PADDING_SIZE as i64)
}
#[test]

87
seaweed-volume/src/storage/types.rs

@ -12,18 +12,28 @@ use std::fmt;
pub const NEEDLE_ID_SIZE: usize = 8;
pub const NEEDLE_ID_EMPTY: u64 = 0;
pub const COOKIE_SIZE: usize = 4;
pub const OFFSET_SIZE: usize = 4; // 4-byte offset (32GB max volume, matching Go default build)
pub const SIZE_SIZE: usize = 4;
pub const NEEDLE_HEADER_SIZE: usize = COOKIE_SIZE + NEEDLE_ID_SIZE + SIZE_SIZE; // 16
pub const DATA_SIZE_SIZE: usize = 4;
pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE; // 16
pub const TIMESTAMP_SIZE: usize = 8;
pub const NEEDLE_PADDING_SIZE: usize = 8;
pub const NEEDLE_CHECKSUM_SIZE: usize = 4;
/// Maximum possible volume size with 4-byte offset: 32GB
/// Formula: 4 * 1024 * 1024 * 1024 * 8
pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8;
/// 5-byte offset mode (matching Go production builds with `-tags 5BytesOffset`).
/// Max volume size: 8TB. Index entry: 17 bytes (8 + 5 + 4).
#[cfg(feature = "5bytes")]
pub const OFFSET_SIZE: usize = 5;
#[cfg(feature = "5bytes")]
pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8 * 256; // 8TB
/// 4-byte offset mode (matching Go default build without `5BytesOffset`).
/// Max volume size: 32GB. Index entry: 16 bytes (8 + 4 + 4).
#[cfg(not(feature = "5bytes"))]
pub const OFFSET_SIZE: usize = 4;
#[cfg(not(feature = "5bytes"))]
pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8; // 32GB
pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE;
// ============================================================================
// NeedleId
@ -175,22 +185,28 @@ impl From<Size> for i32 {
}
// ============================================================================
// Offset (5 bytes)
// Offset
// ============================================================================
/// 4-byte offset encoding for needle positions in .dat files (matching Go default build).
/// Offset encoding for needle positions in .dat files.
///
/// The offset is stored divided by NEEDLE_PADDING_SIZE (8).
///
/// The offset is stored divided by NEEDLE_PADDING_SIZE (8), so 4 bytes can
/// address up to 32GB. The on-disk byte layout in .idx files is:
/// [b3][b2][b1][b0] (big-endian 4 bytes)
/// With `5bytes` feature (default, matching Go production builds):
/// 5 bytes can address up to 8TB.
/// On-disk layout: [b3][b2][b1][b0][b4] (big-endian 4 bytes + 1 high byte)
///
/// actual_offset = stored_value * 8
/// Without `5bytes` feature (matching Go default build):
/// 4 bytes can address up to 32GB.
/// On-disk layout: [b3][b2][b1][b0] (big-endian 4 bytes)
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct Offset {
pub b0: u8,
pub b1: u8,
pub b2: u8,
pub b3: u8,
#[cfg(feature = "5bytes")]
pub b4: u8,
}
impl Offset {
@ -200,6 +216,8 @@ impl Offset {
+ (self.b1 as i64) * 256
+ (self.b2 as i64) * 65536
+ (self.b3 as i64) * 16777216;
#[cfg(feature = "5bytes")]
let stored = stored + (self.b4 as i64) * 4294967296; // 1 << 32
stored * NEEDLE_PADDING_SIZE as i64
}
@ -211,20 +229,27 @@ impl Offset {
b1: (smaller >> 8) as u8,
b2: (smaller >> 16) as u8,
b3: (smaller >> 24) as u8,
#[cfg(feature = "5bytes")]
b4: (smaller >> 32) as u8,
}
}
/// Serialize to 4 bytes in the .idx file format.
/// Layout: [b3][b2][b1][b0] (big-endian)
/// Serialize to bytes in the .idx file format.
/// 5-byte layout: [b3][b2][b1][b0][b4] (big-endian low 4 bytes + high byte).
/// 4-byte layout: [b3][b2][b1][b0] (big-endian).
pub fn to_bytes(&self, bytes: &mut [u8]) {
    assert!(bytes.len() >= OFFSET_SIZE);
    // Low 32 bits go out big-endian; the optional high byte trails them.
    bytes[..4].copy_from_slice(&[self.b3, self.b2, self.b1, self.b0]);
    #[cfg(feature = "5bytes")]
    {
        bytes[4] = self.b4;
    }
}
/// Deserialize from 4 bytes in the .idx file format.
/// Deserialize from bytes in the .idx file format.
pub fn from_bytes(bytes: &[u8]) -> Self {
assert!(bytes.len() >= OFFSET_SIZE);
Offset {
@ -232,11 +257,20 @@ impl Offset {
b2: bytes[1],
b1: bytes[2],
b0: bytes[3],
#[cfg(feature = "5bytes")]
b4: bytes[4],
}
}
pub fn is_zero(&self) -> bool {
self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0
#[cfg(feature = "5bytes")]
{
self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 && self.b4 == 0
}
#[cfg(not(feature = "5bytes"))]
{
self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0
}
}
}
@ -483,13 +517,32 @@ mod tests {
#[test]
fn test_offset_max() {
    // Largest stored (pre-division) value representable by the offset
    // encoding; depends on whether the 5-byte feature is enabled.
    // (The diff view duplicated `max_stored` from the old hunk; this is
    // the post-diff version with a single cfg-selected binding.)
    #[cfg(feature = "5bytes")]
    let max_stored: i64 = (1i64 << 40) - 1; // 5-byte max
    #[cfg(not(feature = "5bytes"))]
    let max_stored: i64 = (1i64 << 32) - 1; // 4-byte max
    let max_actual = max_stored * NEEDLE_PADDING_SIZE as i64;
    let offset = Offset::from_actual_offset(max_actual);
    assert_eq!(offset.to_actual_offset(), max_actual);
}
#[test]
fn test_offset_size_constants() {
    // Pin the feature-dependent layout constants. 8TB = 8 * 2^40 and
    // 32GB = 32 * 2^30, written as powers of 1024 for readability.
    #[cfg(feature = "5bytes")]
    {
        assert_eq!(OFFSET_SIZE, 5);
        assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 17); // key 8 + offset 5 + size 4
        assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 8 * 1024 * 1024 * 1024 * 1024); // 8TB
    }
    #[cfg(not(feature = "5bytes"))]
    {
        assert_eq!(OFFSET_SIZE, 4);
        assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 16); // key 8 + offset 4 + size 4
        assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 32 * 1024 * 1024 * 1024); // 32GB
    }
}
#[test]
fn test_idx_entry_round_trip() {
let key = NeedleId(0xdeadbeef12345678);

64
seaweed-volume/src/storage/volume.rs

@ -768,6 +768,30 @@ impl Volume {
n: &mut Needle,
offset: i64,
size: Size,
) -> Result<(), VolumeError> {
match self.read_needle_blob_and_parse(n, offset, size) {
Ok(()) => Ok(()),
#[cfg(not(feature = "5bytes"))]
Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. }))
if o < MAX_POSSIBLE_VOLUME_SIZE as i64 =>
{
// Double-read: in 4-byte offset mode, the actual data may be
// beyond 32GB due to offset wrapping. Retry at offset + 32GB.
self.read_needle_blob_and_parse(
n,
offset + MAX_POSSIBLE_VOLUME_SIZE as i64,
size,
)
}
Err(e) => Err(e),
}
}
fn read_needle_blob_and_parse(
&self,
n: &mut Needle,
offset: i64,
size: Size,
) -> Result<(), VolumeError> {
let dat_file = self.dat_file.as_ref().ok_or_else(|| {
VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open"))
@ -857,24 +881,40 @@ impl Volume {
VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open"))
})?;
let offset = nv.offset.to_actual_offset();
#[cfg_attr(feature = "5bytes", allow(unused_mut))]
let mut offset = nv.offset.to_actual_offset();
let version = self.version();
let actual_size = get_actual_size(read_size, version);
// Read the full needle bytes (including data) for metadata parsing.
// We use read_bytes_meta_only which skips copying the data payload.
let mut buf = vec![0u8; actual_size as usize];
#[cfg(unix)]
{
use std::os::unix::fs::FileExt;
dat_file.read_exact_at(&mut buf, offset as u64)?;
}
#[cfg(windows)]
{
read_exact_at(dat_file, &mut buf, offset as u64)?;
}
#[cfg_attr(feature = "5bytes", allow(unused_mut))]
let mut read_and_parse = |off: i64| -> Result<(), VolumeError> {
let mut buf = vec![0u8; actual_size as usize];
#[cfg(unix)]
{
use std::os::unix::fs::FileExt;
dat_file.read_exact_at(&mut buf, off as u64)?;
}
#[cfg(windows)]
{
read_exact_at(dat_file, &mut buf, off as u64)?;
}
n.read_bytes_meta_only(&mut buf, off, read_size, version)?;
Ok(())
};
n.read_bytes_meta_only(&mut buf, offset, read_size, version)?;
match read_and_parse(offset) {
Ok(()) => {}
#[cfg(not(feature = "5bytes"))]
Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. }))
if o < MAX_POSSIBLE_VOLUME_SIZE as i64 =>
{
offset += MAX_POSSIBLE_VOLUME_SIZE as i64;
read_and_parse(offset)?;
}
Err(e) => return Err(e),
}
// TTL expiry check
if n.has_ttl() {

5
seaweed-volume/src/version.rs

@ -2,7 +2,10 @@
use std::sync::OnceLock;
const SIZE_LIMIT: &str = "30GB"; // Go default build (!5BytesOffset)
#[cfg(feature = "5bytes")]
const SIZE_LIMIT: &str = "8000GB"; // Matches Go production builds (5BytesOffset)
#[cfg(not(feature = "5bytes"))]
const SIZE_LIMIT: &str = "30GB"; // Matches Go default build (!5BytesOffset)
pub fn size_limit() -> &'static str {
SIZE_LIMIT

Loading…
Cancel
Save