From f71c7d12b692c6a5410f5a4cd0ee4cc4b50e88df Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 22 Mar 2025 00:00:31 +0100 Subject: [PATCH 001/613] v3 block format wip --- src/key.rs | 7 + src/lib.rs | 2 + src/super_segment/mod.rs | 462 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 471 insertions(+) create mode 100644 src/super_segment/mod.rs diff --git a/src/key.rs b/src/key.rs index fb2b5ca6..b987371d 100644 --- a/src/key.rs +++ b/src/key.rs @@ -19,6 +19,13 @@ pub struct InternalKey { pub value_type: ValueType, } +impl std::hash::Hash for InternalKey { + fn hash(&self, state: &mut H) { + state.write(&self.user_key); + state.write_u64(self.seqno); + } +} + impl std::fmt::Debug for InternalKey { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/src/lib.rs b/src/lib.rs index 3d785aca..1156e8e6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -184,6 +184,8 @@ mod tree; mod value; mod version; +mod super_segment; + /// KV-tuple, typically returned by an iterator pub type KvPair = (UserKey, UserValue); diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs new file mode 100644 index 00000000..f5a729d7 --- /dev/null +++ b/src/super_segment/mod.rs @@ -0,0 +1,462 @@ +use std::{ + hash::Hash, + io::{Cursor, Seek, Write}, +}; + +use crate::{ + key::InternalKey, segment::block::header::Header, CompressionType, Decode, DecodeError, Encode, + EncodeError, InternalValue, Slice, ValueType, +}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use varint_rs::{VarintReader, VarintWriter}; +use xxhash_rust::xxh3::xxh3_64; + +/// A block on disk. +/// +/// Consists of a header and some bytes (the data/payload) +pub struct Block { + header: Header, + data: Slice, +} + +/* impl Decode for Block { + fn decode_from(reader: &mut R) -> Result + where + Self: Sized, + { + let header = Header::decode_from(reader)?; + let data = Slice::from_reader(reader, header.data_length as usize)?; + let data = match header.compression { + CompressionType::None => data, + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => lz4_flex::decompress_size_prepended(&data) + .map(Into::into) + .map_err(|_| crate::Error::Decompress(header.compression))?, + + #[cfg(feature = "miniz")] + CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(&data) + .map(Into::into) + .map_err(|_| crate::Error::Decompress(header.compression))?, + }; + + Ok(Self { header, data }) + } +} */ + +fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { + s1.iter() + .zip(s2.iter()) + .take_while(|(c1, c2)| c1 == c2) + .count() +} + +const MARKER_FREE: u8 = u8::MAX - 1; +const MARKER_CONFLICT: u8 = u8::MAX; + +/// Block that contains key-value pairs (user data) +pub struct DataBlock { + inner: Block, +} + +impl DataBlock { + pub fn point_read(&self, key: &[u8]) -> crate::Result<()> { + let bytes = &self.inner.data; + + let mut cursor = &bytes[bytes.len() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::()..]; + + let binary_index_offset: usize = cursor.read_u32::().unwrap() as usize; + let binary_index_len: usize = cursor.read_u32::().unwrap() as usize; + eprintln!( + "we got binary_idx_offset={binary_index_offset}, binary_index_len={binary_index_len}" + ); + + let hash_index_offset: usize = cursor.read_u32::().unwrap() as usize; + let hash_bucket_count: usize = cursor.read_u8().unwrap().into(); + eprintln!( + "we got hash_idx_offset={hash_index_offset}, hash_bucket_count={hash_bucket_count}" + ); + + if hash_index_offset 
> 0 { + let hash = xxh3_64(key); + let bucket_no = (hash % hash_bucket_count as u64) as usize; + + eprintln!( + "{:?} may be in bucket {bucket_no}", + String::from_utf8_lossy(key) + ); + + let bucket_value_pos = hash_index_offset + bucket_no; + let bucket_value = bytes[bucket_value_pos] as usize; + + if bucket_value < MARKER_FREE.into() { + eprintln!("binary index hash short circuit idx = {bucket_value}"); + + let binary_index_pos = + binary_index_offset + bucket_value * std::mem::size_of::(); + + let mut cursor = &bytes[binary_index_pos..]; + + let restart_entry_pos = cursor.read_u32::()?; + + eprintln!("we have to jump to {restart_entry_pos}"); + + todo!(); + } else { + // NOTE: Fallback to binary search + + unimplemented!() + } + } + + Ok(()) + } + + pub fn iter(&self) -> crate::Result<()> { + let bytes = &self.inner.data; + + let mut cursor = &bytes[bytes.len() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::() + - std::mem::size_of::()..]; + + let item_count = cursor.read_u32::().unwrap(); + let restart_count: usize = cursor.read_u8().unwrap().into(); + eprintln!("we got item_count={item_count}, restart_interval={restart_count}"); + + let mut cursor = Cursor::new(&bytes[..]); + let mut base_key: Option = None; + + for idx in 0..item_count as usize { + if idx % restart_count == 0 { + eprintln!("-- full item"); + + let seqno = cursor.read_u64_varint()?; + + let value_type = cursor.read_u8()?; + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; + + let key_len: usize = cursor.read_u16_varint()?.into(); + + let offset = cursor.position() as usize; + let key = bytes.slice(offset..(offset + key_len)); + cursor.seek_relative(key_len as i64)?; + // eprintln!("{:?}", String::from_utf8_lossy(&key)); + + let val_len: usize = cursor.read_u32_varint()? as usize; + + let offset = cursor.position() as usize; + let value = bytes.slice(offset..(offset + val_len)); + cursor.seek_relative(val_len as i64)?; + + // eprintln!("{:?}", String::from_utf8_lossy(&value)); + + let item = InternalValue::from_components(key, value, seqno, value_type); + eprintln!("{item:?}"); + + base_key = Some(item.key.clone()); + } else { + eprintln!("-- truncated item"); + + let seqno = cursor.read_u64_varint()?; + + let value_type = cursor.read_u8()?; + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; + + let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); + let rest_key_len: usize = cursor.read_u16_varint()?.into(); + + // eprintln!("shared={shared_prefix_len}, rest={rest_key_len}"); + + let key = if shared_prefix_len > 0 { + // Stitch key + + // TODO: use Slice::with_size_unzeroed + let mut key = Vec::with_capacity(shared_prefix_len + rest_key_len); + key.extend_from_slice( + &base_key.as_ref().unwrap().user_key[0..shared_prefix_len], + ); + + for _ in 0..rest_key_len { + key.push(cursor.read_u8()?); + } + + Slice::from(key) + } else { + // Is full key already + let offset = cursor.position() as usize; + let key = bytes.slice(offset..(offset + rest_key_len)); + cursor.seek_relative(rest_key_len as i64)?; + key + }; + // eprintln!("{:?}", String::from_utf8_lossy(&key)); + + if value_type == ValueType::Value { + let val_len: usize = cursor.read_u32_varint()? 
as usize; + + // eprintln!("val len={val_len}"); + + let offset = cursor.position() as usize; + let value = bytes.slice(offset..(offset + val_len)); + cursor.seek_relative(val_len as i64)?; + + // eprintln!("{:?}", String::from_utf8_lossy(&value)); + + let item = InternalValue::from_components(key, value, seqno, value_type); + eprintln!("{item:?}"); + } else { + let item = InternalValue::from_components(key, b"", seqno, value_type); + eprintln!("{item:?}"); + } + } + } + + Ok(()) + } + + pub fn encode_items(items: &[InternalValue], restart_interval: u8) -> crate::Result> { + let mut writer = Vec::with_capacity(u16::MAX.into()); + + eprintln!("encoding {} items", items.len()); + + let mut binary_index = Vec::::with_capacity(items.len()); + + let hash_bucket_count = items.len(); + let mut hash_index: Vec = vec![MARKER_FREE; hash_bucket_count]; + + let mut base_key: &Slice = &items + .first() + .expect("chunk should not be empty") + .key + .user_key; + + let mut restart_count: u32 = 0; + + #[cfg(debug_assertions)] + let mut hash_conflicts = 0; + + // Serialize each value + for (idx, kv) in items.iter().enumerate() { + // We encode restart markers as + // [seqno] [value type] [user key len] [user key] [value len] [value] + if idx % usize::from(restart_interval) == 0 { + eprintln!("restart!"); + restart_count += 1; + + binary_index.push(writer.len() as u32); + + kv.key.encode_into(&mut writer)?; + + base_key = &kv.key.user_key; + } else { + // We encode truncated values as + // [seqno] [value type] [shared prefix len] [rest key len] [rest key] [value len] [value] + + eprintln!("encode with prefix truncation"); + eprintln!("base key is {:?}", String::from_utf8_lossy(base_key)); + + writer.write_u64_varint(kv.key.seqno)?; + writer.write_u8(u8::from(kv.key.value_type))?; + + let shared_prefix_len = + longest_shared_prefix_length(base_key, &kv.key.user_key) as u16; + + writer.write_u16_varint(shared_prefix_len)?; + + let rest_len = kv.key.user_key.len() as u16 - shared_prefix_len; + writer.write_u16_varint(rest_len)?; + + let truncated_user_key: &[u8] = &kv.key.user_key; + let truncated_user_key = &truncated_user_key[shared_prefix_len as usize..]; + writer.write_all(truncated_user_key)?; + + eprintln!( + "shared prefix is {:?}", + String::from_utf8_lossy(&base_key[0..shared_prefix_len as usize]), + ); + } + + let hash = xxh3_64(&kv.key.user_key); + let pos = (hash % hash_bucket_count as u64) as usize; + + if hash_index[pos] == MARKER_FREE { + // Free slot + hash_index[pos] = (restart_count as u8) - 1; + + eprintln!( + "hash ref for {:?} => bucket={}->{}", + String::from_utf8_lossy(&kv.key.user_key), + pos, + restart_count - 1, + ); + } else if hash_index[pos] < MARKER_FREE { + // Mark as conflicted + hash_index[pos] = MARKER_CONFLICT; + + eprintln!("{pos} is now conflicted"); + hash_conflicts += 1; + } + + // NOTE: Only write value len + value if we are actually a value + if !kv.is_tombstone() { + // NOTE: We know values are limited to 32-bit length + #[allow(clippy::cast_possible_truncation)] + writer.write_u32_varint(kv.value.len() as u32)?; + writer.write_all(&kv.value)?; + } + } + + let binary_index_offset = writer.len() as u32; + + eprintln!("binary index @ {binary_index_offset}: {binary_index:?}"); + + for &offset in &binary_index { + writer.write_u32::(offset)?; // TODO: benchmark little endian on x86_64 + } + + let mut hash_index_offset = 0u32; + + // TODO: unit test when binary index is too long + if binary_index.len() <= (u8::MAX - 2).into() { + hash_index_offset = writer.len() as u32; + 
+ eprintln!("hash index @ {hash_index_offset}: {hash_index:?}"); + + for &idx in &hash_index { + writer.write_u8(idx)?; + } + } + + // Trailer + writer.write_u32::(items.len() as u32)?; + writer.write_u8(restart_interval)?; + writer.write_u32::(binary_index_offset)?; + writer.write_u32::(binary_index.len() as u32)?; + writer.write_u32::(hash_index_offset)?; + writer.write_u8(if hash_index_offset > 0 { + hash_index.len() as u8 + } else { + 0 + })?; + + #[cfg(debug_assertions)] + eprintln!( + "hash index had {hash_conflicts} conflicts (rate={}%)", + (hash_conflicts as f32 / hash_bucket_count as f32) * 100.0 + ); + + Ok(writer) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + segment::block::{header::Header, offset::BlockOffset}, + super_segment::{Block, DataBlock}, + Checksum, InternalValue, + }; + use test_log::test; + + #[test] + fn v3_data_block_simple() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "planet:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:jupiter:fact", + "Jupiter is big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:jupiter:mass", + "Massive", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:jupiter:name", + "Jupiter", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:jupiter:radius", + "Big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:saturn:name", + "Saturn", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("planet:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "planet:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "planet:venus:name", + "Venus", + 0, + crate::ValueType::Value, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + { + let bytes = lz4_flex::compress_prepend_size(&bytes); + eprintln!("compressed into {} bytes", bytes.len()); + } + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + data_block.iter()?; + + data_block.point_read(b"planet:jupiter:name")?; + + panic!(); + + Ok(()) + } +} From f73706352b3bfbdec009b92ea03c6e73463b459f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 22 Mar 2025 21:46:29 +0100 Subject: [PATCH 002/613] data block point reads --- benches/block.rs | 4 +- src/key.rs | 8 +- src/lib.rs | 2 +- src/super_segment/binary_index/builder.rs | 24 + src/super_segment/binary_index/mod.rs | 5 + src/super_segment/binary_index/reader.rs | 22 + src/super_segment/hash_index/builder.rs | 145 ++++++ src/super_segment/hash_index/mod.rs | 17 + src/super_segment/hash_index/reader.rs | 26 + src/super_segment/mod.rs | 588 +++++++++++++++------- 10 files changed, 643 insertions(+), 198 deletions(-) create mode 100644 src/super_segment/binary_index/builder.rs create mode 100644 src/super_segment/binary_index/mod.rs create mode 100644 src/super_segment/binary_index/reader.rs create mode 100644 src/super_segment/hash_index/builder.rs 
 create mode 100644 src/super_segment/hash_index/mod.rs
 create mode 100644 src/super_segment/hash_index/reader.rs

diff --git a/benches/block.rs b/benches/block.rs
index 58d6c23b..b7107354 100644
--- a/benches/block.rs
+++ b/benches/block.rs
@@ -2,9 +2,9 @@ use criterion::{criterion_group, criterion_main, Criterion};
 use lsm_tree::{
     coding::Encode,
     segment::{
-        block::{header::Header as BlockHeader, ItemSize},
+        block::{header::Header as BlockHeader, offset::BlockOffset, ItemSize},
         meta::CompressionType,
-        value_block::{BlockOffset, ValueBlock},
+        value_block::ValueBlock,
     },
     Checksum, InternalValue,
 };
diff --git a/src/key.rs b/src/key.rs
index 7b4622f5..4e25a531 100644
--- a/src/key.rs
+++ b/src/key.rs
@@ -68,10 +68,10 @@ impl InternalKey {
 
 impl Encode for InternalKey {
     fn encode_into<W: std::io::Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
-        writer.write_u64_varint(self.seqno)?;
-
         writer.write_u8(u8::from(self.value_type))?;
 
+        writer.write_u64_varint(self.seqno)?;
+
         // NOTE: Truncation is okay and actually needed
         #[allow(clippy::cast_possible_truncation)]
         writer.write_u16_varint(self.user_key.len() as u16)?;
@@ -83,13 +83,13 @@ impl Encode for InternalKey {
 
 impl Decode for InternalKey {
     fn decode_from<R: std::io::Read>(reader: &mut R) -> Result<Self, DecodeError> {
-        let seqno = reader.read_u64_varint()?;
-
         let value_type = reader.read_u8()?;
         let value_type = value_type
             .try_into()
             .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?;
 
+        let seqno = reader.read_u64_varint()?;
+
         let key_len = reader.read_u16_varint()?;
         let key = Slice::from_reader(reader, key_len.into())?;
diff --git a/src/lib.rs b/src/lib.rs
index d1fc92f0..1079d3a9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -90,7 +90,7 @@
 #![doc(html_logo_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")]
 #![doc(html_favicon_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")]
-#![forbid(unsafe_code)]
+#![deny(unsafe_code)]
 #![deny(clippy::all, missing_docs, clippy::cargo)]
 #![deny(clippy::unwrap_used)]
 #![deny(clippy::indexing_slicing)]
diff --git a/src/super_segment/binary_index/builder.rs b/src/super_segment/binary_index/builder.rs
new file mode 100644
index 00000000..656d1a19
--- /dev/null
+++ b/src/super_segment/binary_index/builder.rs
@@ -0,0 +1,24 @@
+use byteorder::{BigEndian, WriteBytesExt};
+
+#[derive(Debug)]
+pub struct Builder(Vec<u32>);
+
+impl Builder {
+    pub fn new(capacity: usize) -> Self {
+        Self(Vec::with_capacity(capacity))
+    }
+
+    pub fn insert(&mut self, pos: u32) {
+        self.0.push(pos);
+    }
+
+    pub fn write<W: std::io::Write>(self, writer: &mut W) -> crate::Result<usize> {
+        let len = self.0.len();
+
+        for offset in self.0 {
+            writer.write_u32::<BigEndian>(offset)?; // TODO: benchmark little endian on x86_64
+        }
+
+        Ok(len)
+    }
+}
diff --git a/src/super_segment/binary_index/mod.rs b/src/super_segment/binary_index/mod.rs
new file mode 100644
index 00000000..de6da13c
--- /dev/null
+++ b/src/super_segment/binary_index/mod.rs
@@ -0,0 +1,5 @@
+mod builder;
+mod reader;
+
+pub use builder::Builder;
+pub use reader::Reader;
diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs
new file mode 100644
index 00000000..bbc9eeae
--- /dev/null
+++ b/src/super_segment/binary_index/reader.rs
@@ -0,0 +1,22 @@
+use byteorder::{BigEndian, ReadBytesExt};
+
+pub struct Reader<'a> {
+    bytes: &'a [u8],
+}
+
+impl<'a> Reader<'a> {
+    pub fn new(bytes: &'a [u8]) -> Self {
+        Self { bytes }
+    }
+
+    pub fn len(&self) -> usize {
+        self.bytes.len() / std::mem::size_of::<u32>()
+    }
+
+    pub fn get(&self, idx: usize) -> u32 {
+        let offset = idx * std::mem::size_of::<u32>();
+
+        let mut bytes = self.bytes.get(offset..).expect("should be in array");
+        bytes.read_u32::<BigEndian>().expect("should read")
+    }
+}
diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs
new file mode 100644
index 00000000..7d63ee59
--- /dev/null
+++ b/src/super_segment/hash_index/builder.rs
@@ -0,0 +1,145 @@
+use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE};
+use byteorder::WriteBytesExt;
+
+#[derive(Debug)]
+pub struct Builder(Vec<u8>);
+
+impl Builder {
+    pub fn new(bucket_count: u8) -> Self {
+        Self(vec![MARKER_FREE; bucket_count as usize])
+    }
+
+    // NOTE: We know the hash index has a bucket count <= u8
+    #[allow(clippy::cast_possible_truncation)]
+    pub fn len(&self) -> u8 {
+        self.0.len() as u8
+    }
+
+    pub fn set(&mut self, key: &[u8], binary_index_pos: u8) -> bool {
+        let bucket_pos = calculate_bucket_position(key, self.len());
+
+        // SAFETY: We used modulo
+        #[allow(unsafe_code)]
+        let curr_marker = unsafe { *self.0.get_unchecked(bucket_pos) };
+
+        match curr_marker {
+            MARKER_CONFLICT => false,
+            MARKER_FREE => {
+                // NOTE: Free slot
+
+                // SAFETY: We previously asserted that the slot exists
+                #[allow(unsafe_code)]
+                unsafe {
+                    *self.0.get_unchecked_mut(bucket_pos) = binary_index_pos;
+                }
+
+                eprintln!(
+                    "hash ref for {:?} => bucket={}->{}",
+                    String::from_utf8_lossy(key),
+                    bucket_pos,
+                    binary_index_pos,
+                );
+
+                true
+            }
+            x if x == binary_index_pos => {
+                // NOTE: If different keys map to the same bucket, we can keep
+                // the mapping
+                true
+            }
+            _ => {
+                // NOTE: Mark as conflicted
+
+                // SAFETY: We previously asserted that the slot exists
+                #[allow(unsafe_code)]
+                unsafe {
+                    *self.0.get_unchecked_mut(bucket_pos) = MARKER_CONFLICT;
+                }
+
+                eprintln!(
+                    "hash conflict for {:?} => bucket={}->{}",
+                    String::from_utf8_lossy(key),
+                    bucket_pos,
+                    binary_index_pos,
+                );
+
+                false
+            }
+        }
+    }
+
+    #[cfg(test)]
+    pub fn into_inner(self) -> Vec<u8> {
+        self.0
+    }
+
+    pub fn write<W: std::io::Write>(self, writer: &mut W) -> std::io::Result<()> {
+        for byte in self.0 {
+            writer.write_u8(byte)?;
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use test_log::test;
+
+    #[test]
+    fn v3_hash_index_simple() {
+        let mut hash_index = Builder::new(100);
+
+        hash_index.set(b"a", 5);
+        hash_index.set(b"b", 8);
+        hash_index.set(b"c", 10);
+
+        // NOTE: Hash index bytes need to be consistent across machines and compilations etc.
+        assert_eq!(
+            [
+                254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 10, 254, 254, 254, 8, 254,
+                254, 254, 5, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+                254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+                254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+                254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+                254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+                254, 254
+            ],
+            &*hash_index.into_inner()
+        );
+    }
+
+    #[test]
+    fn v3_hash_index_conflict() {
+        let mut hash_index = Builder::new(1);
+
+        hash_index.set(b"a", 5);
+        hash_index.set(b"b", 8);
+
+        // NOTE: Hash index bytes need to be consistent across machines and compilations etc.
+        assert_eq!([255], &*hash_index.into_inner());
+    }
+
+    #[test]
+    fn v3_hash_index_same_offset() {
+        let mut hash_index = Builder::new(1);
+
+        hash_index.set(b"a", 5);
+        hash_index.set(b"b", 5);
+
+        // NOTE: Hash index bytes need to be consistent across machines and compilations etc.
+        assert_eq!([5], &*hash_index.into_inner());
+    }
+
+    #[test]
+    fn v3_hash_index_mix() {
+        let mut hash_index = Builder::new(1);
+
+        hash_index.set(b"a", 5);
+        hash_index.set(b"b", 5);
+        hash_index.set(b"c", 6);
+
+        // NOTE: Hash index bytes need to be consistent across machines and compilations etc.
+        assert_eq!([255], &*hash_index.into_inner());
+    }
+}
diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/hash_index/mod.rs
new file mode 100644
index 00000000..6ca93fe0
--- /dev/null
+++ b/src/super_segment/hash_index/mod.rs
@@ -0,0 +1,17 @@
+mod builder;
+mod reader;
+
+use xxhash_rust::xxh3::xxh3_64;
+
+const MARKER_FREE: u8 = u8::MAX - 1;
+const MARKER_CONFLICT: u8 = u8::MAX;
+
+// NOTE: We know the hash index has a bucket count <= u8
+#[allow(clippy::cast_possible_truncation)]
+fn calculate_bucket_position(key: &[u8], bucket_count: u8) -> usize {
+    let hash = xxh3_64(key);
+    (hash % u64::from(bucket_count)) as usize
+}
+
+pub use builder::Builder;
+pub use reader::Reader;
diff --git a/src/super_segment/hash_index/reader.rs b/src/super_segment/hash_index/reader.rs
new file mode 100644
index 00000000..a423394f
--- /dev/null
+++ b/src/super_segment/hash_index/reader.rs
@@ -0,0 +1,26 @@
+use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE};
+
+pub struct Reader<'a>(&'a [u8]);
+
+impl<'a> Reader<'a> {
+    pub fn new(bytes: &'a [u8]) -> Self {
+        Self(bytes)
+    }
+
+    pub fn get(&self, key: &[u8]) -> Option<u8> {
+        // NOTE: We know the hash index has a bucket count <= u8
+        #[allow(clippy::cast_possible_truncation)]
+        let bucket_count = self.0.len() as u8;
+
+        let bucket_pos = calculate_bucket_position(key, bucket_count);
+
+        // SAFETY: We used modulo
+        #[allow(unsafe_code)]
+        let marker = unsafe { *self.0.get_unchecked(bucket_pos) };
+
+        match marker {
+            MARKER_CONFLICT | MARKER_FREE => None,
+            idx => Some(idx),
+        }
+    }
+}
diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs
index f5a729d7..b129dbf9 100644
--- a/src/super_segment/mod.rs
+++ b/src/super_segment/mod.rs
@@ -1,22 +1,28 @@
-use std::{
-    hash::Hash,
-    io::{Cursor, Seek, Write},
-};
+mod binary_index;
+mod hash_index;
 
 use crate::{
-    key::InternalKey, segment::block::header::Header, CompressionType, Decode, DecodeError, Encode,
-    EncodeError, InternalValue, Slice, ValueType,
+    coding::{DecodeError, Encode},
+    segment::block::header::Header,
+    InternalValue, SeqNo, Slice, ValueType,
 };
+use binary_index::{Builder as BinaryIndexBuilder, Reader as BinaryIndexReader};
 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
+use hash_index::{Builder as HashIndexBuilder, Reader as HashIndexReader};
+use std::{
+    cmp::Reverse,
+    io::{Cursor, Seek, Write},
+};
 use varint_rs::{VarintReader, VarintWriter};
-use xxhash_rust::xxh3::xxh3_64;
+
+const TERMINATOR_MARKER: u8 = 255;
 
 /// A block on disk.
/// /// Consists of a header and some bytes (the data/payload) pub struct Block { - header: Header, - data: Slice, + pub header: Header, + pub data: Slice, } /* impl Decode for Block { @@ -51,75 +57,193 @@ fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { .count() } -const MARKER_FREE: u8 = u8::MAX - 1; -const MARKER_CONFLICT: u8 = u8::MAX; +fn compare_slices(prefix_part: &[T], rest_key: &[T], needle: &[T]) -> bool { + let full_len = prefix_part.len() + rest_key.len(); + + if full_len != needle.len() { + return false; + } + + needle.ends_with(rest_key) && needle.starts_with(prefix_part) +} /// Block that contains key-value pairs (user data) pub struct DataBlock { - inner: Block, + pub inner: Block, } impl DataBlock { - pub fn point_read(&self, key: &[u8]) -> crate::Result<()> { + pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { + eprintln!("get key at {pos}"); + let bytes = &self.inner.data; + let mut cursor = Cursor::new(&bytes[pos..]); - let mut cursor = &bytes[bytes.len() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::()..]; + let value_type = cursor.read_u8().expect("should read"); - let binary_index_offset: usize = cursor.read_u32::().unwrap() as usize; - let binary_index_len: usize = cursor.read_u32::().unwrap() as usize; - eprintln!( - "we got binary_idx_offset={binary_index_offset}, binary_index_len={binary_index_len}" - ); + let _value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) + .expect("should read"); - let hash_index_offset: usize = cursor.read_u32::().unwrap() as usize; - let hash_bucket_count: usize = cursor.read_u8().unwrap().into(); - eprintln!( - "we got hash_idx_offset={hash_index_offset}, hash_bucket_count={hash_bucket_count}" - ); + let seqno = cursor.read_u64_varint().expect("should read"); - if hash_index_offset > 0 { - let hash = xxh3_64(key); - let bucket_no = (hash % hash_bucket_count as u64) as usize; + let key_len: usize = cursor.read_u16_varint().expect("should read").into(); - eprintln!( - "{:?} may be in bucket {bucket_no}", - String::from_utf8_lossy(key) - ); + let key_offset = pos + cursor.position() as usize; + let key = &bytes[key_offset..(key_offset + key_len)]; + + eprintln!("candidate key: {:?}", String::from_utf8_lossy(key)); - let bucket_value_pos = hash_index_offset + bucket_no; - let bucket_value = bytes[bucket_value_pos] as usize; + (key, Reverse(seqno)) + } + + pub fn walk( + &self, + needle: &[u8], + seqno_watermark: Option, + pos: usize, + restart_interval: usize, + ) -> crate::Result> { + let bytes = &self.inner.data; + let mut cursor = Cursor::new(&bytes[pos..]); + + // eprintln!("cursor initial pos: {:?}", cursor.position()); + + // NOTE: Check the full item + let base_key = { + // eprintln!("-- full item"); + + let value_type = cursor.read_u8().expect("should read"); + + if value_type == TERMINATOR_MARKER { + return Ok(None); + } + + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) + .expect("should read"); + + let seqno = cursor.read_u64_varint().expect("should read"); + + let key_len: usize = cursor.read_u16_varint().expect("should read").into(); + + let key_offset = pos + cursor.position() as usize; + let key = &bytes[key_offset..(key_offset + key_len)]; + cursor.seek_relative(key_len as i64).expect("should read"); - if bucket_value < MARKER_FREE.into() { - eprintln!("binary index hash short circuit idx = {bucket_value}"); 
+ let val_len: usize = cursor.read_u32_varint().expect("should read") as usize; + let val_offset = pos + cursor.position() as usize; - let binary_index_pos = - binary_index_offset + bucket_value * std::mem::size_of::(); + eprintln!("maybe it is {:?}", String::from_utf8_lossy(key)); - let mut cursor = &bytes[binary_index_pos..]; + if key == needle { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); + + if !should_skip { + let key = bytes.slice(key_offset..(key_offset + key_len)); + let value = bytes.slice(val_offset..(val_offset + val_len)); + + return Ok(Some(InternalValue::from_components( + key, value, seqno, value_type, + ))); + } + } + + // TODO: return None if needle < current key + + cursor.seek_relative(val_len as i64).expect("should read"); + + key + }; + + /* eprintln!( + "damn, we did not find the item, but the base key is {:?}", + String::from_utf8_lossy(base_key), + ); */ + + // NOTE: Check the rest items + for _idx in 1..restart_interval { + let value_type = cursor.read_u8()?; + + if value_type == TERMINATOR_MARKER { + return Ok(None); + } - let restart_entry_pos = cursor.read_u32::()?; + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - eprintln!("we have to jump to {restart_entry_pos}"); + eprintln!("type: {value_type:?}"); - todo!(); + let seqno = cursor.read_u64_varint()?; + + let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); + let rest_key_len: usize = cursor.read_u16_varint()?.into(); + + let key_offset = pos + cursor.position() as usize; + + let prefix_part = &base_key[0..shared_prefix_len]; + let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; + cursor.seek_relative(rest_key_len as i64)?; + + let val_len: usize = if value_type == ValueType::Value { + cursor.read_u32_varint().expect("should read") as usize } else { - // NOTE: Fallback to binary search + 0 + }; + + let val_offset = pos + cursor.position() as usize; + + eprintln!( + "maybeee it is \"{}{}\"", + String::from_utf8_lossy(prefix_part), + String::from_utf8_lossy(rest_key), + ); + + if compare_slices(prefix_part, rest_key, needle) { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); + + if !should_skip { + let key = if shared_prefix_len == 0 { + bytes.slice(key_offset..(key_offset + rest_key_len)) + } else { + // Stitch key + Slice::fuse(prefix_part, rest_key) + }; + + return Ok(Some(if value_type == ValueType::Value { + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })); + } + } + + // TODO: return None, if needle is < current key - unimplemented!() + if value_type == ValueType::Value { + cursor.seek_relative(val_len as i64)?; } } - Ok(()) + // eprintln!("damn we found nothing"); + + Ok(None) } - pub fn iter(&self) -> crate::Result<()> { + pub fn point_read(&self, key: &[u8], seqno: Option) -> Option { + eprintln!("searching for {:?}", String::from_utf8_lossy(key)); + let bytes = &self.inner.data; - let mut cursor = &bytes[bytes.len() + let mut reader = &bytes[bytes.len() - std::mem::size_of::() - std::mem::size_of::() - std::mem::size_of::() @@ -127,102 +251,76 @@ impl DataBlock { - std::mem::size_of::() - std::mem::size_of::()..]; - let item_count = cursor.read_u32::().unwrap(); - let restart_count: usize = cursor.read_u8().unwrap().into(); - eprintln!("we got 
item_count={item_count}, restart_interval={restart_count}"); - - let mut cursor = Cursor::new(&bytes[..]); - let mut base_key: Option = None; - - for idx in 0..item_count as usize { - if idx % restart_count == 0 { - eprintln!("-- full item"); + let _item_count = reader.read_u32::().expect("should read") as usize; + let restart_interval = reader.read_u8().expect("should read") as usize; - let seqno = cursor.read_u64_varint()?; - - let value_type = cursor.read_u8()?; - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - - let key_len: usize = cursor.read_u16_varint()?.into(); - - let offset = cursor.position() as usize; - let key = bytes.slice(offset..(offset + key_len)); - cursor.seek_relative(key_len as i64)?; - // eprintln!("{:?}", String::from_utf8_lossy(&key)); - - let val_len: usize = cursor.read_u32_varint()? as usize; - - let offset = cursor.position() as usize; - let value = bytes.slice(offset..(offset + val_len)); - cursor.seek_relative(val_len as i64)?; - - // eprintln!("{:?}", String::from_utf8_lossy(&value)); + let binary_index_offset = reader.read_u32::().expect("should read") as usize; + let binary_index_len = reader.read_u32::().expect("should read") as usize; + let binary_index = BinaryIndexReader::new( + &bytes[binary_index_offset + ..binary_index_offset + binary_index_len * std::mem::size_of::()], + ); - let item = InternalValue::from_components(key, value, seqno, value_type); - eprintln!("{item:?}"); + /* eprintln!( + "binary index @ {}:{}", + binary_index_offset, binary_index_len, + ); */ + // eprintln!("{:?}", &bytes[binary_index_offset..]); - base_key = Some(item.key.clone()); - } else { - eprintln!("-- truncated item"); + // TODO: if the binary index is really dense, don't look into hash index, or maybe don't even build + // TODO: it in the first place - let seqno = cursor.read_u64_varint()?; + let hash_index_offset = reader.read_u32::().expect("should read") as usize; - let value_type = cursor.read_u8()?; - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; + if hash_index_offset > 0 { + let hash_bucket_count = reader.read_u8().expect("should read") as usize; - let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); - let rest_key_len: usize = cursor.read_u16_varint()?.into(); + // eprintln!("hash index @ {}:{}", hash_index_offset, hash_bucket_count); - // eprintln!("shared={shared_prefix_len}, rest={rest_key_len}"); + let hash_index_bytes = &bytes[hash_index_offset..hash_index_offset + hash_bucket_count]; + let hash_index = HashIndexReader::new(hash_index_bytes); - let key = if shared_prefix_len > 0 { - // Stitch key + if let Some(bucket_value) = hash_index.get(key) { + let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - // TODO: use Slice::with_size_unzeroed - let mut key = Vec::with_capacity(shared_prefix_len + rest_key_len); - key.extend_from_slice( - &base_key.as_ref().unwrap().user_key[0..shared_prefix_len], - ); + return self + .walk(key, seqno, restart_entry_pos as usize, restart_interval) + .expect("OH NO"); + } + } - for _ in 0..rest_key_len { - key.push(cursor.read_u8()?); - } + // NOTE: Fallback to binary search + eprintln!("fallback to binary search"); - Slice::from(key) - } else { - // Is full key already - let offset = cursor.position() as usize; - let key = bytes.slice(offset..(offset + rest_key_len)); - cursor.seek_relative(rest_key_len as i64)?; - key - }; - // eprintln!("{:?}", 
String::from_utf8_lossy(&key)); + let mut left = 0; + let mut right = binary_index.len(); - if value_type == ValueType::Value { - let val_len: usize = cursor.read_u32_varint()? as usize; + if right == 0 { + return None; + } - // eprintln!("val len={val_len}"); + let seqno_cmp = Reverse(seqno.map(|x| x - 1).unwrap_or_default()); - let offset = cursor.position() as usize; - let value = bytes.slice(offset..(offset + val_len)); - cursor.seek_relative(val_len as i64)?; + while left < right { + let mid = (left + right) / 2; - // eprintln!("{:?}", String::from_utf8_lossy(&value)); + let offset = binary_index.get(mid); - let item = InternalValue::from_components(key, value, seqno, value_type); - eprintln!("{item:?}"); - } else { - let item = InternalValue::from_components(key, b"", seqno, value_type); - eprintln!("{item:?}"); - } + if (key, seqno_cmp) >= self.get_key_at(offset as usize) { + left = mid + 1; + } else { + right = mid; } } - Ok(()) + if left == 0 { + return None; + } + + let offset = binary_index.get(left - 1); + + self.walk(key, seqno, offset as usize, restart_interval) + .expect("OH NO") } pub fn encode_items(items: &[InternalValue], restart_interval: u8) -> crate::Result> { @@ -230,10 +328,10 @@ impl DataBlock { eprintln!("encoding {} items", items.len()); - let mut binary_index = Vec::::with_capacity(items.len()); + let mut binary_index_builder = + BinaryIndexBuilder::new(items.len() / usize::from(restart_interval)); - let hash_bucket_count = items.len(); - let mut hash_index: Vec = vec![MARKER_FREE; hash_bucket_count]; + let mut hash_index_builder = HashIndexBuilder::new((items.len() as f32 * 0.75) as u8); let mut base_key: &Slice = &items .first() @@ -249,26 +347,27 @@ impl DataBlock { // Serialize each value for (idx, kv) in items.iter().enumerate() { // We encode restart markers as - // [seqno] [value type] [user key len] [user key] [value len] [value] + // [value type] [seqno] [user key len] [user key] [value len] [value] if idx % usize::from(restart_interval) == 0 { eprintln!("restart!"); restart_count += 1; - binary_index.push(writer.len() as u32); + binary_index_builder.insert(writer.len() as u32); kv.key.encode_into(&mut writer)?; base_key = &kv.key.user_key; } else { // We encode truncated values as - // [seqno] [value type] [shared prefix len] [rest key len] [rest key] [value len] [value] + // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] eprintln!("encode with prefix truncation"); eprintln!("base key is {:?}", String::from_utf8_lossy(base_key)); - writer.write_u64_varint(kv.key.seqno)?; writer.write_u8(u8::from(kv.key.value_type))?; + writer.write_u64_varint(kv.key.seqno)?; + let shared_prefix_len = longest_shared_prefix_length(base_key, &kv.key.user_key) as u16; @@ -287,25 +386,11 @@ impl DataBlock { ); } - let hash = xxh3_64(&kv.key.user_key); - let pos = (hash % hash_bucket_count as u64) as usize; - - if hash_index[pos] == MARKER_FREE { - // Free slot - hash_index[pos] = (restart_count as u8) - 1; - - eprintln!( - "hash ref for {:?} => bucket={}->{}", - String::from_utf8_lossy(&kv.key.user_key), - pos, - restart_count - 1, - ); - } else if hash_index[pos] < MARKER_FREE { - // Mark as conflicted - hash_index[pos] = MARKER_CONFLICT; - - eprintln!("{pos} is now conflicted"); - hash_conflicts += 1; + if !hash_index_builder.set(&kv.key.user_key, (restart_count - 1) as u8) { + #[cfg(debug_assertions)] + { + hash_conflicts += 1; + } } // NOTE: Only write value len + value if we are actually a value @@ -317,35 +402,40 @@ impl 
DataBlock { } } - let binary_index_offset = writer.len() as u32; + // IMPORTANT: Terminator marker + writer.write_u8(TERMINATOR_MARKER)?; - eprintln!("binary index @ {binary_index_offset}: {binary_index:?}"); - - for &offset in &binary_index { - writer.write_u32::(offset)?; // TODO: benchmark little endian on x86_64 - } + let binary_index_offset = writer.len() as u32; + eprintln!("binary index @ {binary_index_offset}: {binary_index_builder:?}"); + let binary_index_len = binary_index_builder.write(&mut writer)?; let mut hash_index_offset = 0u32; + let mut hash_index_len = 0u8; // TODO: unit test when binary index is too long - if binary_index.len() <= (u8::MAX - 2).into() { + + // NOTE: We can only use a hash index when there are 254 buckets or less + // Because 254 and 255 are reserved marker values + // + // With the default restart interval of 16, that still gives us support + // for up to ~4000 KVs + if binary_index_len <= (u8::MAX - 2).into() { hash_index_offset = writer.len() as u32; + hash_index_len = hash_index_builder.len(); - eprintln!("hash index @ {hash_index_offset}: {hash_index:?}"); + eprintln!("hash index @ {hash_index_offset}: {hash_index_builder:?}"); - for &idx in &hash_index { - writer.write_u8(idx)?; - } + hash_index_builder.write(&mut writer)?; } // Trailer writer.write_u32::(items.len() as u32)?; writer.write_u8(restart_interval)?; writer.write_u32::(binary_index_offset)?; - writer.write_u32::(binary_index.len() as u32)?; + writer.write_u32::(binary_index_len as u32)?; writer.write_u32::(hash_index_offset)?; writer.write_u8(if hash_index_offset > 0 { - hash_index.len() as u8 + hash_index_len } else { 0 })?; @@ -353,7 +443,7 @@ impl DataBlock { #[cfg(debug_assertions)] eprintln!( "hash index had {hash_conflicts} conflicts (rate={}%)", - (hash_conflicts as f32 / hash_bucket_count as f32) * 100.0 + (hash_conflicts as f32 / hash_index_len as f32) * 100.0 ); Ok(writer) @@ -370,75 +460,187 @@ mod tests { use test_log::test; #[test] - fn v3_data_block_simple() -> crate::Result<()> { + fn v3_data_block_point_read() -> crate::Result<()> { let items = [ InternalValue::from_components( - "planet:earth:fact", + "pla:earth:fact", "eaaaaaaaaarth", 0, crate::ValueType::Value, ), InternalValue::from_components( - "planet:jupiter:fact", + "pla:jupiter:fact", "Jupiter is big", 0, crate::ValueType::Value, ), InternalValue::from_components( - "planet:jupiter:mass", + "pla:jupiter:mass", "Massive", 0, crate::ValueType::Value, ), InternalValue::from_components( - "planet:jupiter:name", + "pla:jupiter:name", "Jupiter", 0, crate::ValueType::Value, ), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), InternalValue::from_components( - "planet:jupiter:radius", - "Big", + "pla:saturn:fact", + "Saturn is pretty big", 0, crate::ValueType::Value, ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), InternalValue::from_components( - "planet:saturn:fact", - "Saturn is pretty big", + "pla:venus:fact", + "Venus exists", 0, crate::ValueType::Value, ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: 
Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + /* use std::time::Instant; + + let start = Instant::now(); + for _ in 0..1_000_000 { + data_block.point_read(&needle.key.user_key); + } + eprintln!("one read took {:?}ns", { + let ns = start.elapsed().as_nanos(); + ns / 1_000_000 + }); */ + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_shadowing() -> crate::Result<()> { + let items = [ InternalValue::from_components( - "planet:saturn:name", - "Saturn", + "pla:saturn:fact", + "Saturn is pretty big", 0, crate::ValueType::Value, ), - InternalValue::from_components("planet:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), InternalValue::from_components( - "planet:venus:fact", + "pla:venus:fact", "Venus exists", 0, crate::ValueType::Value, ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert!(data_block + .point_read(b"pla:venus:fact", None) + .expect("should exist") + .is_tombstone()); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + ), InternalValue::from_components( - "planet:venus:name", - "Venus", + "pla:jupiter:fact", + "Jupiter is big", 0, crate::ValueType::Value, ), + InternalValue::from_components( + "pla:jupiter:mass", + "Massive", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:name", + "Jupiter", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), + InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "pla:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 2)?; + let bytes = DataBlock::encode_items(&items, 1)?; eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - { - let bytes = lz4_flex::compress_prepend_size(&bytes); - eprintln!("compressed into {} bytes", bytes.len()); - } - let data_block = DataBlock { inner: Block { data: bytes.into(), @@ -451,11 +653,15 @@ mod tests { }, }, }; - data_block.iter()?; - 
data_block.point_read(b"planet:jupiter:name")?; + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } - panic!(); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } From 98e6f3fd5c4b5a7b5efb9bab6d5c1fd107222ec6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 22 Mar 2025 23:23:01 +0100 Subject: [PATCH 003/613] remove logs and handle some edge cases --- src/segment/block/header.rs | 2 +- src/super_segment/hash_index/builder.rs | 14 -- src/super_segment/mod.rs | 163 +++++++++++------------- src/value.rs | 23 +++- 4 files changed, 95 insertions(+), 107 deletions(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 03361256..2acd1798 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -27,7 +27,7 @@ pub struct Header { pub data_length: u32, /// Uncompressed size of data segment - pub uncompressed_length: u32, + pub uncompressed_length: u32, // TODO: v3: can remove this, because every block stores its allocation anyway } impl Header { diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 7d63ee59..8a4a6e25 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/hash_index/builder.rs @@ -33,13 +33,6 @@ impl Builder { *self.0.get_unchecked_mut(bucket_pos) = binary_index_pos; } - eprintln!( - "hash ref for {:?} => bucket={}->{}", - String::from_utf8_lossy(key), - bucket_pos, - binary_index_pos, - ); - true } x if x == binary_index_pos => { @@ -56,13 +49,6 @@ impl Builder { *self.0.get_unchecked_mut(bucket_pos) = MARKER_CONFLICT; } - eprintln!( - "hash conflict for {:?} => bucket={}->{}", - String::from_utf8_lossy(key), - bucket_pos, - binary_index_pos, - ); - false } } diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index b129dbf9..dc098882 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -25,6 +25,14 @@ pub struct Block { pub data: Slice, } +impl Block { + /// Returns the uncompressed block size in bytes + #[must_use] + pub fn size(&self) -> usize { + self.data.len() + } +} + /* impl Decode for Block { fn decode_from(reader: &mut R) -> Result where @@ -73,9 +81,13 @@ pub struct DataBlock { } impl DataBlock { - pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { - eprintln!("get key at {pos}"); + /// Returns the uncompressed block size in bytes. 
+ #[must_use] + pub fn size(&self) -> usize { + self.inner.size() + } + pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { let bytes = &self.inner.data; let mut cursor = Cursor::new(&bytes[pos..]); @@ -93,8 +105,6 @@ impl DataBlock { let key_offset = pos + cursor.position() as usize; let key = &bytes[key_offset..(key_offset + key_len)]; - eprintln!("candidate key: {:?}", String::from_utf8_lossy(key)); - (key, Reverse(seqno)) } @@ -105,15 +115,13 @@ impl DataBlock { pos: usize, restart_interval: usize, ) -> crate::Result> { + use std::cmp::Ordering::{Equal, Greater, Less}; + let bytes = &self.inner.data; let mut cursor = Cursor::new(&bytes[pos..]); - // eprintln!("cursor initial pos: {:?}", cursor.position()); - // NOTE: Check the full item let base_key = { - // eprintln!("-- full item"); - let value_type = cursor.read_u8().expect("should read"); if value_type == TERMINATOR_MARKER { @@ -136,35 +144,35 @@ impl DataBlock { let val_len: usize = cursor.read_u32_varint().expect("should read") as usize; let val_offset = pos + cursor.position() as usize; - eprintln!("maybe it is {:?}", String::from_utf8_lossy(key)); - - if key == needle { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); + match key.cmp(needle) { + Equal => { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); - if !should_skip { - let key = bytes.slice(key_offset..(key_offset + key_len)); - let value = bytes.slice(val_offset..(val_offset + val_len)); + if !should_skip { + let key = bytes.slice(key_offset..(key_offset + key_len)); + let value = bytes.slice(val_offset..(val_offset + val_len)); - return Ok(Some(InternalValue::from_components( - key, value, seqno, value_type, - ))); + return Ok(Some(InternalValue::from_components( + key, value, seqno, value_type, + ))); + } + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue } } - // TODO: return None if needle < current key - cursor.seek_relative(val_len as i64).expect("should read"); key }; - /* eprintln!( - "damn, we did not find the item, but the base key is {:?}", - String::from_utf8_lossy(base_key), - ); */ - // NOTE: Check the rest items for _idx in 1..restart_interval { let value_type = cursor.read_u8()?; @@ -177,8 +185,6 @@ impl DataBlock { .try_into() .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - eprintln!("type: {value_type:?}"); - let seqno = cursor.read_u64_varint()?; let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); @@ -198,12 +204,7 @@ impl DataBlock { let val_offset = pos + cursor.position() as usize; - eprintln!( - "maybeee it is \"{}{}\"", - String::from_utf8_lossy(prefix_part), - String::from_utf8_lossy(rest_key), - ); - + // TODO: need cmp result if compare_slices(prefix_part, rest_key, needle) { let should_skip = seqno_watermark .map(|watermark| seqno >= watermark) @@ -233,14 +234,10 @@ impl DataBlock { } } - // eprintln!("damn we found nothing"); - Ok(None) } pub fn point_read(&self, key: &[u8], seqno: Option) -> Option { - eprintln!("searching for {:?}", String::from_utf8_lossy(key)); - let bytes = &self.inner.data; let mut reader = &bytes[bytes.len() @@ -261,12 +258,6 @@ impl DataBlock { ..binary_index_offset + binary_index_len * std::mem::size_of::()], ); - /* eprintln!( - "binary index @ {}:{}", - binary_index_offset, binary_index_len, - ); */ - // eprintln!("{:?}", &bytes[binary_index_offset..]); - // TODO: if the binary index is really dense, don't look into 
hash index, or maybe don't even build // TODO: it in the first place @@ -275,8 +266,6 @@ impl DataBlock { if hash_index_offset > 0 { let hash_bucket_count = reader.read_u8().expect("should read") as usize; - // eprintln!("hash index @ {}:{}", hash_index_offset, hash_bucket_count); - let hash_index_bytes = &bytes[hash_index_offset..hash_index_offset + hash_bucket_count]; let hash_index = HashIndexReader::new(hash_index_bytes); @@ -290,7 +279,6 @@ impl DataBlock { } // NOTE: Fallback to binary search - eprintln!("fallback to binary search"); let mut left = 0; let mut right = binary_index.len(); @@ -299,17 +287,32 @@ impl DataBlock { return None; } - let seqno_cmp = Reverse(seqno.map(|x| x - 1).unwrap_or_default()); + // TODO: try to refactor this somehow + if let Some(seqno) = seqno { + let seqno_cmp = Reverse(seqno - 1); - while left < right { - let mid = (left + right) / 2; + while left < right { + let mid = (left + right) / 2; - let offset = binary_index.get(mid); + let offset = binary_index.get(mid); - if (key, seqno_cmp) >= self.get_key_at(offset as usize) { - left = mid + 1; - } else { - right = mid; + if (key, seqno_cmp) >= self.get_key_at(offset as usize) { + left = mid + 1; + } else { + right = mid; + } + } + } else { + while left < right { + let mid = (left + right) / 2; + + let offset = binary_index.get(mid); + + if key >= self.get_key_at(offset as usize).0 { + left = mid + 1; + } else { + right = mid; + } } } @@ -323,15 +326,18 @@ impl DataBlock { .expect("OH NO") } - pub fn encode_items(items: &[InternalValue], restart_interval: u8) -> crate::Result> { + pub fn encode_items( + items: &[InternalValue], + restart_interval: u8, + hash_index_ratio: f32, + ) -> crate::Result> { let mut writer = Vec::with_capacity(u16::MAX.into()); - eprintln!("encoding {} items", items.len()); - let mut binary_index_builder = BinaryIndexBuilder::new(items.len() / usize::from(restart_interval)); - let mut hash_index_builder = HashIndexBuilder::new((items.len() as f32 * 0.75) as u8); + let bucket_count = (items.len() as f32 * hash_index_ratio) as u8; + let mut hash_index_builder = HashIndexBuilder::new(bucket_count); let mut base_key: &Slice = &items .first() @@ -341,15 +347,11 @@ impl DataBlock { let mut restart_count: u32 = 0; - #[cfg(debug_assertions)] - let mut hash_conflicts = 0; - // Serialize each value for (idx, kv) in items.iter().enumerate() { // We encode restart markers as // [value type] [seqno] [user key len] [user key] [value len] [value] if idx % usize::from(restart_interval) == 0 { - eprintln!("restart!"); restart_count += 1; binary_index_builder.insert(writer.len() as u32); @@ -361,9 +363,6 @@ impl DataBlock { // We encode truncated values as // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] - eprintln!("encode with prefix truncation"); - eprintln!("base key is {:?}", String::from_utf8_lossy(base_key)); - writer.write_u8(u8::from(kv.key.value_type))?; writer.write_u64_varint(kv.key.seqno)?; @@ -379,18 +378,10 @@ impl DataBlock { let truncated_user_key: &[u8] = &kv.key.user_key; let truncated_user_key = &truncated_user_key[shared_prefix_len as usize..]; writer.write_all(truncated_user_key)?; - - eprintln!( - "shared prefix is {:?}", - String::from_utf8_lossy(&base_key[0..shared_prefix_len as usize]), - ); } - if !hash_index_builder.set(&kv.key.user_key, (restart_count - 1) as u8) { - #[cfg(debug_assertions)] - { - hash_conflicts += 1; - } + if bucket_count > 0 { + hash_index_builder.set(&kv.key.user_key, (restart_count - 1) as u8); } // NOTE: 
Only write value len + value if we are actually a value @@ -406,25 +397,21 @@ impl DataBlock { writer.write_u8(TERMINATOR_MARKER)?; let binary_index_offset = writer.len() as u32; - eprintln!("binary index @ {binary_index_offset}: {binary_index_builder:?}"); let binary_index_len = binary_index_builder.write(&mut writer)?; let mut hash_index_offset = 0u32; let mut hash_index_len = 0u8; // TODO: unit test when binary index is too long - // NOTE: We can only use a hash index when there are 254 buckets or less // Because 254 and 255 are reserved marker values // // With the default restart interval of 16, that still gives us support // for up to ~4000 KVs - if binary_index_len <= (u8::MAX - 2).into() { + if bucket_count > 0 && binary_index_len <= (u8::MAX - 2).into() { hash_index_offset = writer.len() as u32; hash_index_len = hash_index_builder.len(); - eprintln!("hash index @ {hash_index_offset}: {hash_index_builder:?}"); - hash_index_builder.write(&mut writer)?; } @@ -440,12 +427,6 @@ impl DataBlock { 0 })?; - #[cfg(debug_assertions)] - eprintln!( - "hash index had {hash_conflicts} conflicts (rate={}%)", - (hash_conflicts as f32 / hash_index_len as f32) * 100.0 - ); - Ok(writer) } } @@ -504,7 +485,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 16)?; + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); @@ -565,7 +546,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 16)?; + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); @@ -636,7 +617,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 1)?; + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); diff --git a/src/value.rs b/src/value.rs index 50ad35a0..f3a3b5e3 100644 --- a/src/value.rs +++ b/src/value.rs @@ -67,7 +67,7 @@ impl From for u8 { /// Internal representation of KV pairs #[allow(clippy::module_name_repetitions)] -#[derive(Clone, Eq, PartialEq)] +#[derive(Clone, Eq)] pub struct InternalValue { /// Internal key pub key: InternalKey, @@ -138,6 +138,27 @@ impl InternalValue { } } +impl PartialEq for InternalValue { + fn eq(&self, other: &Self) -> bool { + self.key == other.key + } +} + +impl PartialOrd for InternalValue { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.key.cmp(&other.key)) + } +} + +// Order by user key, THEN by sequence number +// This is one of the most important functions +// Otherwise queries will not match expected behaviour +impl Ord for InternalValue { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.key.cmp(&other.key) + } +} + impl ItemSize for InternalValue { fn size(&self) -> usize { std::mem::size_of::() From 7fc802afe9ac712a1fd7f6dfe9d1eb1278fa0ae5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 22 Mar 2025 23:23:19 +0100 Subject: [PATCH 004/613] add fuzz tests for data block point reads --- fuzz/Cargo.toml | 19 +++++++++ fuzz/fuzz_targets/data_block.rs | 74 +++++++++++++++++++++++++++++++++ 2 
files changed, 93 insertions(+) create mode 100644 fuzz/Cargo.toml create mode 100644 fuzz/fuzz_targets/data_block.rs diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 00000000..6eaa72bf --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "lsm-tree-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +lsm-tree = { path = ".." } + +[[bin]] +name = "data_block" +path = "fuzz_targets/data_block.rs" +test = false +doc = false +bench = false diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs new file mode 100644 index 00000000..a083818a --- /dev/null +++ b/fuzz/fuzz_targets/data_block.rs @@ -0,0 +1,74 @@ +#![no_main] +use libfuzzer_sys::{ + arbitrary::{Arbitrary, Result, Unstructured}, + fuzz_target, +}; +use lsm_tree::{ + segment::block::offset::BlockOffset, + super_segment::{Block, DataBlock}, + InternalValue, SeqNo, ValueType, +}; + +#[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +struct FuzzyValue(InternalValue); + +impl<'a> Arbitrary<'a> for FuzzyValue { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let key = Vec::::arbitrary(u)?; + let value = Vec::::arbitrary(u)?; + let seqno = u64::arbitrary(u)?; + + let key = if key.is_empty() { vec![0] } else { key }; + + Ok(Self(InternalValue::from_components( + key, + value, + seqno, + ValueType::Value, + ))) + } +} + +fuzz_target!(|data: &[u8]| { + let mut unstructured = Unstructured::new(data); + + let restart_interval = u8::arbitrary(&mut unstructured).unwrap().max(1); + let hash_ratio = (f32::arbitrary(&mut unstructured).unwrap() / f32::MAX) + .min(1.0) + .max(0.0); + + if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { + if !items.is_empty() { + items.sort(); + items.dedup(); + + let items = items.into_iter().map(|value| value.0).collect::>(); + let bytes = + DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: lsm_tree::segment::block::header::Header { + checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), + compression: lsm_tree::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + for needle in items { + if needle.key.seqno == SeqNo::MAX { + continue; + } + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + } + } +}); From a18467209f29b12ef5cf48da7b0daed564024e63 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 22 Mar 2025 23:42:42 +0100 Subject: [PATCH 005/613] wip --- src/super_segment/mod.rs | 78 ++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index dc098882..37b3706e 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -10,7 +10,7 @@ use binary_index::{Builder as BinaryIndexBuilder, Reader as BinaryIndexReader}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use hash_index::{Builder as HashIndexBuilder, Reader as HashIndexReader}; use std::{ - cmp::Reverse, + cmp::{Ordering, Reverse}, io::{Cursor, Seek, Write}, }; use varint_rs::{VarintReader, VarintWriter}; @@ -65,14 +65,26 @@ fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { .count() } -fn compare_slices(prefix_part: &[T], rest_key: &[T], needle: &[T]) -> bool { - let 
full_len = prefix_part.len() + rest_key.len(); +fn compare_slices(prefix_part: &[T], key: &[T], needle: &[T]) -> Ordering { + let combined = prefix_part.iter().chain(key.iter()); + let mut needle_iter = needle.iter(); - if full_len != needle.len() { - return false; + for (a, b) in combined.zip(needle_iter.by_ref()) { + match a.cmp(b) { + Ordering::Equal => continue, + other => return other, + } + } + + if needle_iter.next().is_some() { + return Ordering::Less; } - needle.ends_with(rest_key) && needle.starts_with(prefix_part) + if prefix_part.len() + key.len() > needle.len() { + return Ordering::Greater; + } + + Ordering::Equal } /// Block that contains key-value pairs (user data) @@ -204,31 +216,37 @@ impl DataBlock { let val_offset = pos + cursor.position() as usize; - // TODO: need cmp result - if compare_slices(prefix_part, rest_key, needle) { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); - - if !should_skip { - let key = if shared_prefix_len == 0 { - bytes.slice(key_offset..(key_offset + rest_key_len)) - } else { - // Stitch key - Slice::fuse(prefix_part, rest_key) - }; - - return Ok(Some(if value_type == ValueType::Value { - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) - } else { - InternalValue::from_components(key, b"", seqno, value_type) - })); + match compare_slices(prefix_part, rest_key, needle) { + Equal => { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); + + if !should_skip { + let key = if shared_prefix_len == 0 { + bytes.slice(key_offset..(key_offset + rest_key_len)) + } else { + // Stitch key + Slice::fuse(prefix_part, rest_key) + }; + + return Ok(Some(if value_type == ValueType::Value { + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })); + } + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue } } - // TODO: return None, if needle is < current key - if value_type == ValueType::Value { cursor.seek_relative(val_len as i64)?; } @@ -258,8 +276,8 @@ impl DataBlock { ..binary_index_offset + binary_index_len * std::mem::size_of::()], ); - // TODO: if the binary index is really dense, don't look into hash index, or maybe don't even build - // TODO: it in the first place + // TODO: if the binary index is really dense, don't look into hash index, or + // maybe don't even build it in the first place let hash_index_offset = reader.read_u32::().expect("should read") as usize; From f2d0bdbd810462d03ee16941a47a920bf65f5e9c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 23 Mar 2025 01:34:34 +0100 Subject: [PATCH 006/613] refactor --- src/super_segment/binary_index/reader.rs | 14 +- src/super_segment/data_block/encoder.rs | 191 +++++++ src/super_segment/data_block/mod.rs | 526 +++++++++++++++++++ src/super_segment/hash_index/builder.rs | 9 +- src/super_segment/hash_index/mod.rs | 2 +- src/super_segment/hash_index/reader.rs | 6 +- src/super_segment/mod.rs | 629 +---------------------- 7 files changed, 740 insertions(+), 637 deletions(-) create mode 100644 src/super_segment/data_block/encoder.rs create mode 100644 src/super_segment/data_block/mod.rs diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index bbc9eeae..0e481fb6 100644 --- 
a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -1,20 +1,24 @@ use byteorder::{BigEndian, ReadBytesExt}; +type FencePtr = u32; + pub struct Reader<'a> { bytes: &'a [u8], } impl<'a> Reader<'a> { - pub fn new(bytes: &'a [u8]) -> Self { - Self { bytes } + pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { + Self { + bytes: &bytes[offset..(offset + len * std::mem::size_of::())], + } } pub fn len(&self) -> usize { - self.bytes.len() / std::mem::size_of::() + self.bytes.len() / std::mem::size_of::() } - pub fn get(&self, idx: usize) -> u32 { - let offset = idx * std::mem::size_of::(); + pub fn get(&self, idx: usize) -> FencePtr { + let offset = idx * std::mem::size_of::(); let mut bytes = self.bytes.get(offset..).expect("should be in array"); bytes.read_u32::().expect("should read") diff --git a/src/super_segment/data_block/encoder.rs b/src/super_segment/data_block/encoder.rs new file mode 100644 index 00000000..410c61ac --- /dev/null +++ b/src/super_segment/data_block/encoder.rs @@ -0,0 +1,191 @@ +use super::super::binary_index::Builder as BinaryIndexBuilder; +use super::super::hash_index::Builder as HashIndexBuilder; +use crate::{coding::Encode, InternalValue}; +use byteorder::{BigEndian, WriteBytesExt}; +use std::io::Write; +use varint_rs::VarintWriter; + +pub const TERMINATOR_MARKER: u8 = 255; + +pub const TRAILER_SIZE: usize = (std::mem::size_of::() * 5) + std::mem::size_of::(); + +fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { + s1.iter() + .zip(s2.iter()) + .take_while(|(c1, c2)| c1 == c2) + .count() +} + +pub struct Encoder<'a> { + writer: Vec, + + binary_index_builder: BinaryIndexBuilder, + hash_index_builder: HashIndexBuilder, + + restart_interval: u8, + + base_key: &'a [u8], + + restart_count: usize, + item_count: usize, +} + +impl<'a> Encoder<'a> { + pub fn new( + item_count: usize, + restart_interval: u8, + hash_index_ratio: f32, + first_key: &'a [u8], + ) -> Self { + let binary_index_len = item_count / usize::from(restart_interval); + let bucket_count = (item_count as f32 * hash_index_ratio) as u32; // TODO: verify + + Self { + writer: Vec::with_capacity(u16::MAX.into()), + + binary_index_builder: BinaryIndexBuilder::new(binary_index_len), + hash_index_builder: HashIndexBuilder::new(bucket_count), + + restart_interval, + + base_key: first_key, + + restart_count: 0, + item_count: 0, + } + } + + pub fn write(&mut self, kv: &'a InternalValue) -> crate::Result<()> { + // NOTE: Check if we are a restart marker + if self.item_count % usize::from(self.restart_interval) == 0 { + // We encode restart markers as: + // [value type] [seqno] [user key len] [user key] [value len] [value] + + self.restart_count += 1; + + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + self.binary_index_builder.insert(self.writer.len() as u32); + + kv.key.encode_into(&mut self.writer)?; + + self.base_key = &kv.key.user_key; + } else { + // We encode truncated values as: + // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] + + self.writer.write_u8(u8::from(kv.key.value_type))?; + + self.writer.write_u64_varint(kv.key.seqno)?; + + // NOTE: We can safely cast to u16, because keys are u16 long max + #[allow(clippy::cast_possible_truncation)] + let shared_prefix_len = + longest_shared_prefix_length(self.base_key, &kv.key.user_key) as u16; + + self.writer.write_u16_varint(shared_prefix_len)?; + + // NOTE: We can safely cast to u16, 
because keys are u16 long max + #[allow(clippy::cast_possible_truncation)] + let rest_len = kv.key.user_key.len() as u16 - shared_prefix_len; + + self.writer.write_u16_varint(rest_len)?; + + let truncated_user_key = &kv + .key + .user_key + .get(shared_prefix_len as usize..) + .expect("should be in bounds"); + + self.writer.write_all(truncated_user_key)?; + } + + if self.hash_index_builder.bucket_count() > 0 { + // NOTE: The max binary index is bound by u8 (technically u8::MAX - 2) + #[allow(clippy::cast_possible_truncation)] + self.hash_index_builder + .set(&kv.key.user_key, (self.restart_count - 1) as u8); + } + + // NOTE: Only write value len + value if we are actually a value + if !kv.is_tombstone() { + // NOTE: We know values are limited to 32-bit length + #[allow(clippy::cast_possible_truncation)] + self.writer.write_u32_varint(kv.value.len() as u32)?; + self.writer.write_all(&kv.value)?; + } + + self.item_count += 1; + + Ok(()) + } + + pub fn finish(mut self) -> crate::Result> { + // IMPORTANT: Terminator marker + self.writer.write_u8(TERMINATOR_MARKER)?; + + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + let binary_index_offset = self.writer.len() as u32; + + let binary_index_len = self.binary_index_builder.write(&mut self.writer)?; + + let mut hash_index_offset = 0u32; + let mut hash_index_len = 0u32; + + // TODO: unit test when binary index is too long + // NOTE: We can only use a hash index when there are 254 buckets or less + // Because 254 and 255 are reserved marker values + // + // With the default restart interval of 16, that still gives us support + // for up to ~4000 KVs + if self.hash_index_builder.bucket_count() > 0 && binary_index_len <= (u8::MAX - 2).into() { + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + { + hash_index_offset = self.writer.len() as u32; + } + + hash_index_len = self.hash_index_builder.bucket_count(); + + self.hash_index_builder.write(&mut self.writer)?; + } + + #[cfg(debug_assertions)] + let bytes_before = self.writer.len(); + + // Trailer: + // [item_count] [restart_interval] [binary_index_offset] [binary_index_len] [hash_index_offset] [hash_index_len] + + // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either + #[allow(clippy::cast_possible_truncation)] + self.writer.write_u32::(self.item_count as u32)?; + + self.writer.write_u8(self.restart_interval)?; + + self.writer.write_u32::(binary_index_offset)?; + + // NOTE: Even with a dense index, there can't be more index pointers than items + #[allow(clippy::cast_possible_truncation)] + self.writer + .write_u32::(binary_index_len as u32)?; + + self.writer.write_u32::(hash_index_offset)?; + + self.writer + .write_u32::(if hash_index_offset > 0 { + hash_index_len + } else { + 0 + })?; + + #[cfg(debug_assertions)] + assert_eq!( + TRAILER_SIZE, + self.writer.len() - bytes_before, + "footer size does not match", + ); + + Ok(self.writer) + } +} diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs new file mode 100644 index 00000000..1bd77f9c --- /dev/null +++ b/src/super_segment/data_block/mod.rs @@ -0,0 +1,526 @@ +mod encoder; + +use super::hash_index::Reader as HashIndexReader; +use super::{binary_index::Reader as BinaryIndexReader, Block}; +use crate::{coding::DecodeError, InternalValue, SeqNo, Slice, ValueType}; +use byteorder::{BigEndian, ReadBytesExt}; +use 
encoder::{TERMINATOR_MARKER, TRAILER_SIZE}; +use std::cmp::Ordering; +use std::{ + cmp::Reverse, + io::{Cursor, Seek}, +}; +use varint_rs::VarintReader; + +pub use encoder::Encoder; + +type DataBlockEncoder<'a> = Encoder<'a>; + +fn compare_slices(prefix_part: &[T], key: &[T], needle: &[T]) -> Ordering { + let combined = prefix_part.iter().chain(key.iter()); + let mut needle_iter = needle.iter(); + + for (a, b) in combined.zip(needle_iter.by_ref()) { + match a.cmp(b) { + Ordering::Equal => continue, + other => return other, + } + } + + if needle_iter.next().is_some() { + return Ordering::Less; + } + + if prefix_part.len() + key.len() > needle.len() { + return Ordering::Greater; + } + + Ordering::Equal +} + +/// Block that contains key-value pairs (user data) +pub struct DataBlock { + pub inner: Block, +} + +impl DataBlock { + /// Returns the uncompressed block size in bytes. + #[must_use] + pub fn size(&self) -> usize { + self.inner.size() + } + + pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { + let bytes = &self.inner.data; + let mut cursor = Cursor::new(&bytes[pos..]); + + let value_type = cursor.read_u8().expect("should read"); + + let _value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) + .expect("should read"); + + let seqno = cursor.read_u64_varint().expect("should read"); + + let key_len: usize = cursor.read_u16_varint().expect("should read").into(); + + let key_offset = pos + cursor.position() as usize; + let key = &bytes[key_offset..(key_offset + key_len)]; + + (key, Reverse(seqno)) + } + + pub fn walk( + &self, + needle: &[u8], + seqno_watermark: Option, + pos: usize, + restart_interval: usize, + ) -> crate::Result> { + use std::cmp::Ordering::{Equal, Greater, Less}; + + let bytes = &self.inner.data; + let mut cursor = Cursor::new(&bytes[pos..]); + + // NOTE: Check the full item + let base_key = { + let value_type = cursor.read_u8().expect("should read"); + + if value_type == TERMINATOR_MARKER { + return Ok(None); + } + + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) + .expect("should read"); + + let seqno = cursor.read_u64_varint().expect("should read"); + + let key_len: usize = cursor.read_u16_varint().expect("should read").into(); + + let key_offset = pos + cursor.position() as usize; + let key = &bytes[key_offset..(key_offset + key_len)]; + cursor.seek_relative(key_len as i64).expect("should read"); + + let val_len: usize = cursor.read_u32_varint().expect("should read") as usize; + let val_offset = pos + cursor.position() as usize; + + match key.cmp(needle) { + Equal => { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); + + if !should_skip { + let key = bytes.slice(key_offset..(key_offset + key_len)); + let value = bytes.slice(val_offset..(val_offset + val_len)); + + return Ok(Some(InternalValue::from_components( + key, value, seqno, value_type, + ))); + } + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue + } + } + + cursor.seek_relative(val_len as i64).expect("should read"); + + key + }; + + // NOTE: Check the rest items + for _idx in 1..restart_interval { + let value_type = cursor.read_u8()?; + + if value_type == TERMINATOR_MARKER { + return Ok(None); + } + + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; + + let seqno = 
cursor.read_u64_varint()?; + + let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); + let rest_key_len: usize = cursor.read_u16_varint()?.into(); + + let key_offset = pos + cursor.position() as usize; + + let prefix_part = &base_key[0..shared_prefix_len]; + let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; + cursor.seek_relative(rest_key_len as i64)?; + + let val_len: usize = if value_type == ValueType::Value { + cursor.read_u32_varint().expect("should read") as usize + } else { + 0 + }; + + let val_offset = pos + cursor.position() as usize; + + match compare_slices(prefix_part, rest_key, needle) { + Equal => { + let should_skip = seqno_watermark + .map(|watermark| seqno >= watermark) + .unwrap_or(false); + + if !should_skip { + let key = if shared_prefix_len == 0 { + bytes.slice(key_offset..(key_offset + rest_key_len)) + } else { + // Stitch key + Slice::fuse(prefix_part, rest_key) + }; + + return Ok(Some(if value_type == ValueType::Value { + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })); + } + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue + } + } + + if value_type == ValueType::Value { + cursor.seek_relative(val_len as i64)?; + } + } + + Ok(None) + } + + pub fn point_read(&self, key: &[u8], seqno: Option) -> Option { + let bytes = &self.inner.data; + + let mut reader = &bytes[bytes.len() - TRAILER_SIZE..]; + + let _item_count = reader.read_u32::().expect("should read") as usize; + let restart_interval = reader.read_u8().expect("should read") as usize; + + let binary_index_offset = reader.read_u32::().expect("should read") as usize; + let binary_index_len = reader.read_u32::().expect("should read") as usize; + let binary_index = BinaryIndexReader::new(bytes, binary_index_offset, binary_index_len); + + // TODO: if the binary index is really dense, don't look into hash index, or + // maybe don't even build it in the first place + + let hash_index_offset = reader.read_u32::().expect("should read") as usize; + + if hash_index_offset > 0 { + let hash_bucket_count = reader.read_u32::().expect("should read") as usize; + + let hash_index = HashIndexReader::new(bytes, hash_index_offset, hash_bucket_count); + + if let Some(bucket_value) = hash_index.get(key) { + let restart_entry_pos = binary_index.get(usize::from(bucket_value)); + + return self + .walk(key, seqno, restart_entry_pos as usize, restart_interval) + .expect("OH NO"); + } + } + + // NOTE: Fallback to binary search + + let mut left = 0; + let mut right = binary_index.len(); + + if right == 0 { + return None; + } + + // TODO: try to refactor this somehow + if let Some(seqno) = seqno { + let seqno_cmp = Reverse(seqno - 1); + + while left < right { + let mid = (left + right) / 2; + + let offset = binary_index.get(mid); + + if (key, seqno_cmp) >= self.get_key_at(offset as usize) { + left = mid + 1; + } else { + right = mid; + } + } + } else { + while left < right { + let mid = (left + right) / 2; + + let offset = binary_index.get(mid); + + if key >= self.get_key_at(offset as usize).0 { + left = mid + 1; + } else { + right = mid; + } + } + } + + if left == 0 { + return None; + } + + let offset = binary_index.get(left - 1); + + self.walk(key, seqno, offset as usize, restart_interval) + .expect("OH NO") + } + + pub fn encode_items( + items: &[InternalValue], + restart_interval: u8, + 
hash_index_ratio: f32, + ) -> crate::Result> { + let first_key = &items + .first() + .expect("chunk should not be empty") + .key + .user_key; + + let mut serializer = + DataBlockEncoder::new(items.len(), restart_interval, hash_index_ratio, first_key); + + for item in items { + serializer.write(item)?; + } + + serializer.finish() + } +} + +#[cfg(test)] +mod tests { + use super::DataBlock; + use crate::{ + segment::block::{header::Header, offset::BlockOffset}, + super_segment::Block, + Checksum, InternalValue, + }; + use test_log::test; + + #[test] + fn v3_data_block_point_read() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:fact", + "Jupiter is big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:mass", + "Massive", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:name", + "Jupiter", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), + InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "pla:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + /* use std::time::Instant; + + let start = Instant::now(); + for _ in 0..1_000_000 { + data_block.point_read(&needle.key.user_key); + } + eprintln!("one read took {:?}ns", { + let ns = start.elapsed().as_nanos(); + ns / 1_000_000 + }); */ + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_shadowing() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "pla:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + 
uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert!(data_block + .point_read(b"pla:venus:fact", None) + .expect("should exist") + .is_tombstone()); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:fact", + "Jupiter is big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:mass", + "Massive", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:name", + "Jupiter", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), + InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "pla:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } +} diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 8a4a6e25..594ee581 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/hash_index/builder.rs @@ -5,18 +5,19 @@ use byteorder::WriteBytesExt; pub struct Builder(Vec); impl Builder { - pub fn new(bucket_count: u8) -> Self { + pub fn new(bucket_count: u32) -> Self { Self(vec![MARKER_FREE; bucket_count as usize]) } // NOTE: We know the hash index has a bucket count <= u8 #[allow(clippy::cast_possible_truncation)] - pub fn len(&self) -> u8 { - self.0.len() as u8 + /// Returns the number of buckets + pub fn bucket_count(&self) -> u32 { + self.0.len() as u32 } pub fn set(&mut self, key: &[u8], binary_index_pos: u8) -> bool { - let bucket_pos = calculate_bucket_position(key, self.len()); + let bucket_pos = calculate_bucket_position(key, self.bucket_count()); // SAFETY: We used modulo #[allow(unsafe_code)] diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/hash_index/mod.rs index 6ca93fe0..1e96f726 100644 --- a/src/super_segment/hash_index/mod.rs +++ b/src/super_segment/hash_index/mod.rs @@ -8,7 +8,7 @@ const MARKER_CONFLICT: u8 = u8::MAX; // NOTE: We know the hash index has a bucket count <= u8 #[allow(clippy::cast_possible_truncation)] -fn calculate_bucket_position(key: &[u8], bucket_count: u8) -> usize { +fn calculate_bucket_position(key: &[u8], bucket_count: u32) -> usize { let hash = xxh3_64(key); (hash % u64::from(bucket_count)) as usize } diff --git 
a/src/super_segment/hash_index/reader.rs b/src/super_segment/hash_index/reader.rs index a423394f..67d99e7b 100644 --- a/src/super_segment/hash_index/reader.rs +++ b/src/super_segment/hash_index/reader.rs @@ -3,14 +3,14 @@ use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; pub struct Reader<'a>(&'a [u8]); impl<'a> Reader<'a> { - pub fn new(bytes: &'a [u8]) -> Self { - Self(bytes) + pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { + Self(&bytes[offset..(offset + len)]) } pub fn get(&self, key: &[u8]) -> Option { // NOTE: We know the hash index has a bucket count <= u8 #[allow(clippy::cast_possible_truncation)] - let bucket_count = self.0.len() as u8; + let bucket_count = self.0.len() as u32; let bucket_pos = calculate_bucket_position(key, bucket_count); diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 37b3706e..649cf312 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -1,21 +1,10 @@ -mod binary_index; -mod hash_index; +pub(crate) mod binary_index; +pub(crate) mod data_block; +pub(crate) mod hash_index; -use crate::{ - coding::{DecodeError, Encode}, - segment::block::header::Header, - InternalValue, SeqNo, Slice, ValueType, -}; -use binary_index::{Builder as BinaryIndexBuilder, Reader as BinaryIndexReader}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use hash_index::{Builder as HashIndexBuilder, Reader as HashIndexReader}; -use std::{ - cmp::{Ordering, Reverse}, - io::{Cursor, Seek, Write}, -}; -use varint_rs::{VarintReader, VarintWriter}; +use crate::{segment::block::header::Header, Slice}; -const TERMINATOR_MARKER: u8 = 255; +pub use data_block::DataBlock; /// A block on disk. /// @@ -57,611 +46,3 @@ impl Block { Ok(Self { header, data }) } } */ - -fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { - s1.iter() - .zip(s2.iter()) - .take_while(|(c1, c2)| c1 == c2) - .count() -} - -fn compare_slices(prefix_part: &[T], key: &[T], needle: &[T]) -> Ordering { - let combined = prefix_part.iter().chain(key.iter()); - let mut needle_iter = needle.iter(); - - for (a, b) in combined.zip(needle_iter.by_ref()) { - match a.cmp(b) { - Ordering::Equal => continue, - other => return other, - } - } - - if needle_iter.next().is_some() { - return Ordering::Less; - } - - if prefix_part.len() + key.len() > needle.len() { - return Ordering::Greater; - } - - Ordering::Equal -} - -/// Block that contains key-value pairs (user data) -pub struct DataBlock { - pub inner: Block, -} - -impl DataBlock { - /// Returns the uncompressed block size in bytes. 
- #[must_use] - pub fn size(&self) -> usize { - self.inner.size() - } - - pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { - let bytes = &self.inner.data; - let mut cursor = Cursor::new(&bytes[pos..]); - - let value_type = cursor.read_u8().expect("should read"); - - let _value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) - .expect("should read"); - - let seqno = cursor.read_u64_varint().expect("should read"); - - let key_len: usize = cursor.read_u16_varint().expect("should read").into(); - - let key_offset = pos + cursor.position() as usize; - let key = &bytes[key_offset..(key_offset + key_len)]; - - (key, Reverse(seqno)) - } - - pub fn walk( - &self, - needle: &[u8], - seqno_watermark: Option, - pos: usize, - restart_interval: usize, - ) -> crate::Result> { - use std::cmp::Ordering::{Equal, Greater, Less}; - - let bytes = &self.inner.data; - let mut cursor = Cursor::new(&bytes[pos..]); - - // NOTE: Check the full item - let base_key = { - let value_type = cursor.read_u8().expect("should read"); - - if value_type == TERMINATOR_MARKER { - return Ok(None); - } - - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type))) - .expect("should read"); - - let seqno = cursor.read_u64_varint().expect("should read"); - - let key_len: usize = cursor.read_u16_varint().expect("should read").into(); - - let key_offset = pos + cursor.position() as usize; - let key = &bytes[key_offset..(key_offset + key_len)]; - cursor.seek_relative(key_len as i64).expect("should read"); - - let val_len: usize = cursor.read_u32_varint().expect("should read") as usize; - let val_offset = pos + cursor.position() as usize; - - match key.cmp(needle) { - Equal => { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); - - if !should_skip { - let key = bytes.slice(key_offset..(key_offset + key_len)); - let value = bytes.slice(val_offset..(val_offset + val_len)); - - return Ok(Some(InternalValue::from_components( - key, value, seqno, value_type, - ))); - } - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); - } - Less => { - // NOTE: Continue - } - } - - cursor.seek_relative(val_len as i64).expect("should read"); - - key - }; - - // NOTE: Check the rest items - for _idx in 1..restart_interval { - let value_type = cursor.read_u8()?; - - if value_type == TERMINATOR_MARKER { - return Ok(None); - } - - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - - let seqno = cursor.read_u64_varint()?; - - let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); - let rest_key_len: usize = cursor.read_u16_varint()?.into(); - - let key_offset = pos + cursor.position() as usize; - - let prefix_part = &base_key[0..shared_prefix_len]; - let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; - cursor.seek_relative(rest_key_len as i64)?; - - let val_len: usize = if value_type == ValueType::Value { - cursor.read_u32_varint().expect("should read") as usize - } else { - 0 - }; - - let val_offset = pos + cursor.position() as usize; - - match compare_slices(prefix_part, rest_key, needle) { - Equal => { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); - - if !should_skip { - let key = if shared_prefix_len == 0 { - bytes.slice(key_offset..(key_offset + rest_key_len)) - } else { - // Stitch key - 
Slice::fuse(prefix_part, rest_key) - }; - - return Ok(Some(if value_type == ValueType::Value { - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) - } else { - InternalValue::from_components(key, b"", seqno, value_type) - })); - } - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); - } - Less => { - // NOTE: Continue - } - } - - if value_type == ValueType::Value { - cursor.seek_relative(val_len as i64)?; - } - } - - Ok(None) - } - - pub fn point_read(&self, key: &[u8], seqno: Option) -> Option { - let bytes = &self.inner.data; - - let mut reader = &bytes[bytes.len() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::() - - std::mem::size_of::()..]; - - let _item_count = reader.read_u32::().expect("should read") as usize; - let restart_interval = reader.read_u8().expect("should read") as usize; - - let binary_index_offset = reader.read_u32::().expect("should read") as usize; - let binary_index_len = reader.read_u32::().expect("should read") as usize; - let binary_index = BinaryIndexReader::new( - &bytes[binary_index_offset - ..binary_index_offset + binary_index_len * std::mem::size_of::()], - ); - - // TODO: if the binary index is really dense, don't look into hash index, or - // maybe don't even build it in the first place - - let hash_index_offset = reader.read_u32::().expect("should read") as usize; - - if hash_index_offset > 0 { - let hash_bucket_count = reader.read_u8().expect("should read") as usize; - - let hash_index_bytes = &bytes[hash_index_offset..hash_index_offset + hash_bucket_count]; - let hash_index = HashIndexReader::new(hash_index_bytes); - - if let Some(bucket_value) = hash_index.get(key) { - let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - - return self - .walk(key, seqno, restart_entry_pos as usize, restart_interval) - .expect("OH NO"); - } - } - - // NOTE: Fallback to binary search - - let mut left = 0; - let mut right = binary_index.len(); - - if right == 0 { - return None; - } - - // TODO: try to refactor this somehow - if let Some(seqno) = seqno { - let seqno_cmp = Reverse(seqno - 1); - - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if (key, seqno_cmp) >= self.get_key_at(offset as usize) { - left = mid + 1; - } else { - right = mid; - } - } - } else { - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if key >= self.get_key_at(offset as usize).0 { - left = mid + 1; - } else { - right = mid; - } - } - } - - if left == 0 { - return None; - } - - let offset = binary_index.get(left - 1); - - self.walk(key, seqno, offset as usize, restart_interval) - .expect("OH NO") - } - - pub fn encode_items( - items: &[InternalValue], - restart_interval: u8, - hash_index_ratio: f32, - ) -> crate::Result> { - let mut writer = Vec::with_capacity(u16::MAX.into()); - - let mut binary_index_builder = - BinaryIndexBuilder::new(items.len() / usize::from(restart_interval)); - - let bucket_count = (items.len() as f32 * hash_index_ratio) as u8; - let mut hash_index_builder = HashIndexBuilder::new(bucket_count); - - let mut base_key: &Slice = &items - .first() - .expect("chunk should not be empty") - .key - .user_key; - - let mut restart_count: u32 = 0; - - // Serialize each value - for (idx, kv) in items.iter().enumerate() { - // We encode restart markers as - // [value type] [seqno] [user key len] 
[user key] [value len] [value] - if idx % usize::from(restart_interval) == 0 { - restart_count += 1; - - binary_index_builder.insert(writer.len() as u32); - - kv.key.encode_into(&mut writer)?; - - base_key = &kv.key.user_key; - } else { - // We encode truncated values as - // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] - - writer.write_u8(u8::from(kv.key.value_type))?; - - writer.write_u64_varint(kv.key.seqno)?; - - let shared_prefix_len = - longest_shared_prefix_length(base_key, &kv.key.user_key) as u16; - - writer.write_u16_varint(shared_prefix_len)?; - - let rest_len = kv.key.user_key.len() as u16 - shared_prefix_len; - writer.write_u16_varint(rest_len)?; - - let truncated_user_key: &[u8] = &kv.key.user_key; - let truncated_user_key = &truncated_user_key[shared_prefix_len as usize..]; - writer.write_all(truncated_user_key)?; - } - - if bucket_count > 0 { - hash_index_builder.set(&kv.key.user_key, (restart_count - 1) as u8); - } - - // NOTE: Only write value len + value if we are actually a value - if !kv.is_tombstone() { - // NOTE: We know values are limited to 32-bit length - #[allow(clippy::cast_possible_truncation)] - writer.write_u32_varint(kv.value.len() as u32)?; - writer.write_all(&kv.value)?; - } - } - - // IMPORTANT: Terminator marker - writer.write_u8(TERMINATOR_MARKER)?; - - let binary_index_offset = writer.len() as u32; - let binary_index_len = binary_index_builder.write(&mut writer)?; - - let mut hash_index_offset = 0u32; - let mut hash_index_len = 0u8; - - // TODO: unit test when binary index is too long - // NOTE: We can only use a hash index when there are 254 buckets or less - // Because 254 and 255 are reserved marker values - // - // With the default restart interval of 16, that still gives us support - // for up to ~4000 KVs - if bucket_count > 0 && binary_index_len <= (u8::MAX - 2).into() { - hash_index_offset = writer.len() as u32; - hash_index_len = hash_index_builder.len(); - - hash_index_builder.write(&mut writer)?; - } - - // Trailer - writer.write_u32::(items.len() as u32)?; - writer.write_u8(restart_interval)?; - writer.write_u32::(binary_index_offset)?; - writer.write_u32::(binary_index_len as u32)?; - writer.write_u32::(hash_index_offset)?; - writer.write_u8(if hash_index_offset > 0 { - hash_index_len - } else { - 0 - })?; - - Ok(writer) - } -} - -#[cfg(test)] -mod tests { - use crate::{ - segment::block::{header::Header, offset::BlockOffset}, - super_segment::{Block, DataBlock}, - Checksum, InternalValue, - }; - use test_log::test; - - #[test] - fn v3_data_block_point_read() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:fact", - "Jupiter is big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:mass", - "Massive", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:name", - "Jupiter", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - 
crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); - - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }, - }; - - /* use std::time::Instant; - - let start = Instant::now(); - for _ in 0..1_000_000 { - data_block.point_read(&needle.key.user_key); - } - eprintln!("one read took {:?}ns", { - let ns = start.elapsed().as_nanos(); - ns / 1_000_000 - }); */ - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_shadowing() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); - - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }, - }; - - assert!(data_block - .point_read(b"pla:venus:fact", None) - .expect("should exist") - .is_tombstone()); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:fact", - "Jupiter is big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:mass", - "Massive", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:name", - "Jupiter", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); - - let data_block = 
DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }, - }; - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } -} From 859999f500942223dd8c3fe7da15653ad3aad3aa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 23 Mar 2025 02:37:19 +0100 Subject: [PATCH 007/613] refactor --- src/super_segment/binary_index/reader.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index 0e481fb6..a3628a9e 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -2,6 +2,8 @@ use byteorder::{BigEndian, ReadBytesExt}; type FencePtr = u32; +const FENCE_PTR_SIZE: usize = std::mem::size_of::(); + pub struct Reader<'a> { bytes: &'a [u8], } @@ -9,16 +11,16 @@ pub struct Reader<'a> { impl<'a> Reader<'a> { pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { Self { - bytes: &bytes[offset..(offset + len * std::mem::size_of::())], + bytes: &bytes[offset..(offset + len * FENCE_PTR_SIZE)], } } pub fn len(&self) -> usize { - self.bytes.len() / std::mem::size_of::() + self.bytes.len() / FENCE_PTR_SIZE } pub fn get(&self, idx: usize) -> FencePtr { - let offset = idx * std::mem::size_of::(); + let offset = idx * FENCE_PTR_SIZE; let mut bytes = self.bytes.get(offset..).expect("should be in array"); bytes.read_u32::().expect("should read") From 3cd6b8a9c422fd99bf55d5078cac2c7336ecbe14 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 18:37:07 +0100 Subject: [PATCH 008/613] improve data block fuzz test --- fuzz/Cargo.toml | 1 + fuzz/fuzz_targets/data_block.rs | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 6eaa72bf..1b5f6c16 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" cargo-fuzz = true [dependencies] +arbitrary = { version = "1", features = ["derive"] } libfuzzer-sys = "0.4" lsm-tree = { path = ".." 
} diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index a083818a..2a16fff1 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -1,14 +1,28 @@ #![no_main] -use libfuzzer_sys::{ - arbitrary::{Arbitrary, Result, Unstructured}, - fuzz_target, -}; +use arbitrary::{Arbitrary, Result, Unstructured}; +use libfuzzer_sys::fuzz_target; use lsm_tree::{ segment::block::offset::BlockOffset, super_segment::{Block, DataBlock}, InternalValue, SeqNo, ValueType, }; +#[derive(Arbitrary, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +enum FuzzyValueType { + Value, + Tombstone, + // TODO: single delete +} + +impl Into for FuzzyValueType { + fn into(self) -> ValueType { + match self { + Self::Value => ValueType::Value, + Self::Tombstone => ValueType::Tombstone, + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] struct FuzzyValue(InternalValue); @@ -17,6 +31,7 @@ impl<'a> Arbitrary<'a> for FuzzyValue { let key = Vec::::arbitrary(u)?; let value = Vec::::arbitrary(u)?; let seqno = u64::arbitrary(u)?; + let vtype = FuzzyValueType::arbitrary(u)?; let key = if key.is_empty() { vec![0] } else { key }; @@ -24,7 +39,7 @@ impl<'a> Arbitrary<'a> for FuzzyValue { key, value, seqno, - ValueType::Value, + vtype.into(), ))) } } @@ -33,7 +48,8 @@ fuzz_target!(|data: &[u8]| { let mut unstructured = Unstructured::new(data); let restart_interval = u8::arbitrary(&mut unstructured).unwrap().max(1); - let hash_ratio = (f32::arbitrary(&mut unstructured).unwrap() / f32::MAX) + + let hash_ratio = ((u16::arbitrary(&mut unstructured).unwrap() / u16::MAX) as f32) .min(1.0) .max(0.0); @@ -59,11 +75,15 @@ fuzz_target!(|data: &[u8]| { }, }; + // eprintln!("{items:?}"); + for needle in items { if needle.key.seqno == SeqNo::MAX { continue; } + // eprintln!("needle: {:?}", needle.key); + assert_eq!( Some(needle.clone()), data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), From bc85f5c7d68fcb1a88c9ce96e709d6b5b44e81f6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 18:37:14 +0100 Subject: [PATCH 009/613] wip --- src/segment/block/header.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 2acd1798..9c02b9fe 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -12,7 +12,7 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; /// Header of a disk-based block -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct Header { /// Compression type used pub compression: CompressionType, From 7a2454f4d02e39c8952f39471caf6f84e59a2425 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 18:37:20 +0100 Subject: [PATCH 010/613] wip --- src/super_segment/data_block/encoder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_segment/data_block/encoder.rs b/src/super_segment/data_block/encoder.rs index 410c61ac..feef1d9e 100644 --- a/src/super_segment/data_block/encoder.rs +++ b/src/super_segment/data_block/encoder.rs @@ -7,7 +7,7 @@ use varint_rs::VarintWriter; pub const TERMINATOR_MARKER: u8 = 255; -pub const TRAILER_SIZE: usize = (std::mem::size_of::() * 5) + std::mem::size_of::(); +pub const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + std::mem::size_of::(); fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { s1.iter() @@ -183,7 +183,7 @@ impl<'a> Encoder<'a> { assert_eq!( TRAILER_SIZE, 
self.writer.len() - bytes_before, - "footer size does not match", + "trailer size does not match", ); Ok(self.writer) From e20742de71c2b5358168bbf4b56cd1dd7c7353c8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 18:37:27 +0100 Subject: [PATCH 011/613] wip --- src/super_segment/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 649cf312..dae411db 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -9,6 +9,7 @@ pub use data_block::DataBlock; /// A block on disk. /// /// Consists of a header and some bytes (the data/payload) +#[derive(Clone)] pub struct Block { pub header: Header, pub data: Slice, From f717cccd59dbfac083178ff52de5a20001725536 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 19:26:54 +0100 Subject: [PATCH 012/613] refactor --- fuzz/fuzz_targets/data_block.rs | 16 +- src/key.rs | 8 +- src/lib.rs | 2 +- src/super_segment/data_block/iter.rs | 130 ++++++++ src/super_segment/data_block/mod.rs | 448 +++++++++++++++++++-------- 5 files changed, 470 insertions(+), 134 deletions(-) create mode 100644 src/super_segment/data_block/iter.rs diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 2a16fff1..f904862f 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -53,11 +53,23 @@ fuzz_target!(|data: &[u8]| { .min(1.0) .max(0.0); + // eprintln!("restart_interval={restart_interval}, hash_ratio={hash_ratio}"); + if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { + // let mut items = items.to_vec(); + if !items.is_empty() { items.sort(); items.dedup(); + /* eprintln!("-- items --"); + for item in items.iter().map(|value| &value.0) { + eprintln!( + r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + item.key.user_key, item.value, item.key.seqno, item.key.value_type, + ); + } */ + let items = items.into_iter().map(|value| value.0).collect::>(); let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); @@ -86,7 +98,9 @@ fuzz_target!(|data: &[u8]| { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block + .point_read(&needle.key.user_key, Some(needle.key.seqno + 1)) + .unwrap(), ); } } diff --git a/src/key.rs b/src/key.rs index 4e25a531..37dff6cc 100644 --- a/src/key.rs +++ b/src/key.rs @@ -14,7 +14,7 @@ use std::{ use value_log::Slice; use varint_rs::{VarintReader, VarintWriter}; -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, Eq)] #[allow(clippy::module_name_repetitions)] pub struct InternalKey { pub user_key: UserKey, @@ -22,6 +22,12 @@ pub struct InternalKey { pub value_type: ValueType, } +impl PartialEq for InternalKey { + fn eq(&self, other: &Self) -> bool { + self.user_key == other.user_key && self.seqno == other.seqno + } +} + impl std::hash::Hash for InternalKey { fn hash(&self, state: &mut H) { state.write(&self.user_key); diff --git a/src/lib.rs b/src/lib.rs index 1079d3a9..f11f2c02 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,7 +115,7 @@ macro_rules! 
fail_iter { ($e:expr) => { match $e { Ok(v) => v, - Err(e) => return Some(Err(e)), + Err(e) => return Some(Err(e.into())), } }; } diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs new file mode 100644 index 00000000..88ce41e4 --- /dev/null +++ b/src/super_segment/data_block/iter.rs @@ -0,0 +1,130 @@ +use super::{encoder::TRAILER_SIZE, DataBlock}; +use crate::{ + coding::DecodeError, super_segment::data_block::encoder::TERMINATOR_MARKER, InternalValue, + Slice, ValueType, +}; +use byteorder::{BigEndian, ReadBytesExt}; +use std::io::{Cursor, Seek}; +use varint_rs::VarintReader; + +/// Double-ended iterator over data blocks +pub struct Iter { + block: DataBlock, + + cursor: usize, + idx: usize, + restart_interval: usize, + + base_key: Option, +} + +impl Iter { + pub fn new(block: DataBlock) -> Self { + let bytes = &block.inner.data; + let mut reader = &bytes[bytes.len() - TRAILER_SIZE..]; + + let _item_count = reader.read_u32::().expect("should read") as usize; + let restart_interval = reader.read_u8().expect("should read") as usize; + + Self { + block, + cursor: 0, + idx: 0, + restart_interval, + + base_key: None, + } + } +} + +impl Iterator for Iter { + type Item = crate::Result; + + fn next(&mut self) -> Option { + let is_restart = (self.idx % self.restart_interval) == 0; + + let bytes = &self.block.inner.data; + let mut cursor = Cursor::new(&bytes[self.cursor..]); + + if is_restart { + let parsed = fail_iter!(DataBlock::parse_restart_item(&mut cursor)); + let value_type = parsed.value_type; + let seqno = parsed.seqno; + + let key_start = self.cursor + parsed.key_start; + let key_end = key_start + parsed.key_len; + let key = bytes.slice(key_start..key_end); + + // TODO: check for tombstones!!! TEST!!! + + let val_len: usize = if value_type == ValueType::Value { + cursor.read_u32_varint().expect("should read") as usize + } else { + 0 + }; + let val_offset = self.cursor + cursor.position() as usize; + cursor.seek_relative(val_len as i64).expect("should read"); + + self.cursor += cursor.position() as usize; + self.idx += 1; + self.base_key = Some(key.clone()); + + Some(Ok(if value_type == ValueType::Value { + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })) + } else { + let value_type = fail_iter!(cursor.read_u8()); + + if value_type == TERMINATOR_MARKER { + return None; + } + + let value_type: ValueType = fail_iter!(value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))); + + let seqno = cursor.read_u64_varint().expect("should read"); + + let shared_prefix_len: usize = cursor.read_u16_varint().expect("should read").into(); + let rest_key_len: usize = cursor.read_u16_varint().expect("should read").into(); + + let key_offset = self.cursor + cursor.position() as usize; + + let prefix_part = &self.base_key.as_ref().expect("should exist")[0..shared_prefix_len]; + let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; + cursor + .seek_relative(rest_key_len as i64) + .expect("should read"); + + let val_len: usize = if value_type == ValueType::Value { + cursor.read_u32_varint().expect("should read") as usize + } else { + 0 + }; + let val_offset = self.cursor + cursor.position() as usize; + cursor.seek_relative(val_len as i64).expect("should read"); + + eprintln!("{prefix_part:?} <-> {rest_key:?}"); + + let key = if shared_prefix_len == 0 { + 
bytes.slice(key_offset..(key_offset + rest_key_len)) + } else { + // Stitch key + Slice::fused(&[prefix_part, rest_key]) + }; + + self.cursor += cursor.position() as usize; + self.idx += 1; + + Some(Ok(if value_type == ValueType::Value { + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })) + } + } +} diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 1bd77f9c..6a3e6c53 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -1,4 +1,5 @@ mod encoder; +mod iter; use super::hash_index::Reader as HashIndexReader; use super::{binary_index::Reader as BinaryIndexReader, Block}; @@ -13,36 +14,67 @@ use std::{ use varint_rs::VarintReader; pub use encoder::Encoder; +pub use iter::Iter; type DataBlockEncoder<'a> = Encoder<'a>; -fn compare_slices(prefix_part: &[T], key: &[T], needle: &[T]) -> Ordering { - let combined = prefix_part.iter().chain(key.iter()); - let mut needle_iter = needle.iter(); +// TODO: Fuzz test +fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> Ordering { + if needle.is_empty() { + let combined_len = prefix.len() + suffix.len(); - for (a, b) in combined.zip(needle_iter.by_ref()) { - match a.cmp(b) { - Ordering::Equal => continue, - other => return other, - } - } - - if needle_iter.next().is_some() { - return Ordering::Less; + return if combined_len > 0 { + Ordering::Greater + } else { + Ordering::Equal + }; } - if prefix_part.len() + key.len() > needle.len() { - return Ordering::Greater; + match prefix.len().cmp(&needle.len()) { + Ordering::Equal => match prefix.cmp(needle) { + Ordering::Equal => {} + ordering => return ordering, + }, + Ordering::Greater => { + // SAFETY: We know that the prefix is longer than the needle, so we can safely + // truncate it to the needle's length + #[allow(unsafe_code)] + let prefix = unsafe { prefix.get_unchecked(0..needle.len()) }; + return prefix.cmp(needle); + } + Ordering::Less => { + // SAFETY: We know that the needle is longer than the prefix, so we can safely + // truncate it to the prefix's length + #[allow(unsafe_code)] + let needle = unsafe { needle.get_unchecked(0..prefix.len()) }; + + match prefix.cmp(needle) { + Ordering::Equal => {} + ordering => return ordering, + } + } } - Ordering::Equal + // SAFETY: We know that the prefix is definitely not longer than the needle + // so we can safely truncate + #[allow(unsafe_code)] + let needle = unsafe { needle.get_unchecked(prefix.len()..) }; + suffix.cmp(needle) } /// Block that contains key-value pairs (user data) +#[derive(Clone)] pub struct DataBlock { pub inner: Block, } +struct RestartHead { + value_type: ValueType, + seqno: SeqNo, + key_start: usize, + key_len: usize, +} + impl DataBlock { /// Returns the uncompressed block size in bytes. 
 #[must_use]
@@ -50,28 +82,47 @@ impl DataBlock {
         self.inner.size()
     }
 
-    pub fn get_key_at(&self, pos: usize) -> (&[u8], Reverse<SeqNo>) {
+    #[must_use]
+    pub fn iter(&self) -> Iter {
+        Iter::new(self.clone())
+    }
+
+    fn get_key_at(&self, pos: usize) -> crate::Result<(&[u8], Reverse<SeqNo>)> {
+        // eprintln!("probe {pos}");
+
         let bytes = &self.inner.data;
         let mut cursor = Cursor::new(&bytes[pos..]);
 
-        let value_type = cursor.read_u8().expect("should read");
+        let parsed = Self::parse_restart_item(&mut cursor)?;
+        let key_start = pos + parsed.key_start;
+        let key_end = key_start + parsed.key_len;
+        let key = &bytes[key_start..key_end];
 
-        let _value_type: ValueType = value_type
-            .try_into()
-            .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))
-            .expect("should read");
+        Ok((key, Reverse(parsed.seqno)))
+    }
 
-        let seqno = cursor.read_u64_varint().expect("should read");
+    fn parse_restart_item(cursor: &mut Cursor<&[u8]>) -> crate::Result<RestartHead> {
+        let value_type = cursor.read_u8()?;
+
+        let value_type: ValueType = value_type
+            .try_into()
+            .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?;
 
-        let key_len: usize = cursor.read_u16_varint().expect("should read").into();
+        let seqno = cursor.read_u64_varint()?;
 
-        let key_offset = pos + cursor.position() as usize;
-        let key = &bytes[key_offset..(key_offset + key_len)];
+        let key_len: usize = cursor.read_u16_varint()?.into();
+        let key_start = cursor.position() as usize;
+        cursor.seek_relative(key_len as i64)?;
 
-        (key, Reverse(seqno))
+        Ok(RestartHead {
+            value_type,
+            seqno,
+            key_start,
+            key_len,
+        })
     }
 
-    pub fn walk(
+    fn walk(
        &self,
        needle: &[u8],
        seqno_watermark: Option<SeqNo>,
@@ -83,29 +134,29 @@ impl DataBlock {
         let bytes = &self.inner.data;
         let mut cursor = Cursor::new(&bytes[pos..]);
 
+        let mut base_key_pos = 0;
+
         // NOTE: Check the full item
         let base_key = {
-            let value_type = cursor.read_u8().expect("should read");
-
-            if value_type == TERMINATOR_MARKER {
-                return Ok(None);
-            }
+            let parsed = Self::parse_restart_item(&mut cursor)?;
+            let value_type = parsed.value_type;
+            let seqno = parsed.seqno;
 
-            let value_type: ValueType = value_type
-                .try_into()
-                .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))
-                .expect("should read");
+            let key_start = pos + parsed.key_start;
+            let key_end = key_start + parsed.key_len;
+            let key = &bytes[key_start..key_end];
 
-            let seqno = cursor.read_u64_varint().expect("should read");
+            // eprintln!("walk start at {key:?} : {seqno}");
 
-            let key_len: usize = cursor.read_u16_varint().expect("should read").into();
+            base_key_pos = key_start;
 
-            let key_offset = pos + cursor.position() as usize;
-            let key = &bytes[key_offset..(key_offset + key_len)];
-            cursor.seek_relative(key_len as i64).expect("should read");
+            // TODO: TEST TOMBSTONES + shadowing with tombstone at restart interval etc.!!!
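For orientation, the wire layout that `parse_restart_item` consumes is `[value_type: u8] [seqno: varint] [key_len: u16 varint] [key bytes]`, with a value-length varint and the value bytes following only for `Value` entries. A hedged, self-contained sketch of that decode, reusing the same `byteorder`/`varint-rs` helpers the patch imports (the function name and return shape are illustrative, not crate API):

```rust
// Hedged sketch: decode one restart entry head from a raw block, mirroring
// `parse_restart_item` above. Only the wire layout is taken from the patch.
use std::io::{Cursor, Seek};

use byteorder::ReadBytesExt;
use varint_rs::VarintReader;

fn read_restart_head(block: &[u8], pos: usize) -> std::io::Result<(u8, u64, std::ops::Range<usize>)> {
    let mut cursor = Cursor::new(&block[pos..]);

    let value_type = cursor.read_u8()?; // raw tag byte; 255 is the terminator marker
    let seqno = cursor.read_u64_varint()?; // MVCC sequence number
    let key_len: usize = cursor.read_u16_varint()?.into();

    let key_start = pos + cursor.position() as usize; // key bytes sit inline after the lengths
    cursor.seek_relative(key_len as i64)?; // the value (if any) follows the key

    Ok((value_type, seqno, key_start..key_start + key_len))
}
```

Entries between restart points additionally carry a shared-prefix length, so only a restart head stores its key in full — which is why `get_key_at` may only be pointed at offsets taken from the binary index.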
- let val_len: usize = cursor.read_u32_varint().expect("should read") as usize; - let val_offset = pos + cursor.position() as usize; + let val_len: usize = if value_type == ValueType::Value { + cursor.read_u32_varint().expect("should read") as usize + } else { + 0 + }; match key.cmp(needle) { Equal => { @@ -114,12 +165,17 @@ impl DataBlock { .unwrap_or(false); if !should_skip { - let key = bytes.slice(key_offset..(key_offset + key_len)); - let value = bytes.slice(val_offset..(val_offset + val_len)); + let key = bytes.slice(key_start..key_end); + + // TODO: TEST TOMBSTONES - return Ok(Some(InternalValue::from_components( - key, value, seqno, value_type, - ))); + return Ok(Some(if value_type == ValueType::Value { + let val_offset = pos + cursor.position() as usize; + let value = bytes.slice(val_offset..(val_offset + val_len)); + InternalValue::from_components(key, value, seqno, value_type) + } else { + InternalValue::from_components(key, b"", seqno, value_type) + })); } } Greater => { @@ -159,15 +215,15 @@ impl DataBlock { let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; cursor.seek_relative(rest_key_len as i64)?; + // eprintln!(" maybe it is {:?}+{:?} : {seqno}", prefix_part, rest_key); + let val_len: usize = if value_type == ValueType::Value { cursor.read_u32_varint().expect("should read") as usize } else { 0 }; - let val_offset = pos + cursor.position() as usize; - - match compare_slices(prefix_part, rest_key, needle) { + match compare_prefixed_slice(prefix_part, rest_key, needle) { Equal => { let should_skip = seqno_watermark .map(|watermark| seqno >= watermark) @@ -176,12 +232,15 @@ impl DataBlock { if !should_skip { let key = if shared_prefix_len == 0 { bytes.slice(key_offset..(key_offset + rest_key_len)) + } else if rest_key_len == 0 { + bytes.slice(base_key_pos..(base_key_pos + shared_prefix_len)) } else { // Stitch key - Slice::fuse(prefix_part, rest_key) + Slice::fused(&[prefix_part, rest_key]) }; return Ok(Some(if value_type == ValueType::Value { + let val_offset = pos + cursor.position() as usize; let value = bytes.slice(val_offset..(val_offset + val_len)); InternalValue::from_components(key, value, seqno, value_type) } else { @@ -206,7 +265,11 @@ impl DataBlock { Ok(None) } - pub fn point_read(&self, key: &[u8], seqno: Option) -> Option { + pub fn point_read( + &self, + key: &[u8], + seqno: Option, + ) -> crate::Result> { let bytes = &self.inner.data; let mut reader = &bytes[bytes.len() - TRAILER_SIZE..]; @@ -231,9 +294,7 @@ impl DataBlock { if let Some(bucket_value) = hash_index.get(key) { let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - return self - .walk(key, seqno, restart_entry_pos as usize, restart_interval) - .expect("OH NO"); + return self.walk(key, seqno, restart_entry_pos as usize, restart_interval); } } @@ -243,7 +304,7 @@ impl DataBlock { let mut right = binary_index.len(); if right == 0 { - return None; + return Ok(None); } // TODO: try to refactor this somehow @@ -255,7 +316,7 @@ impl DataBlock { let offset = binary_index.get(mid); - if (key, seqno_cmp) >= self.get_key_at(offset as usize) { + if (key, seqno_cmp) >= self.get_key_at(offset as usize)? 
{ left = mid + 1; } else { right = mid; @@ -267,7 +328,7 @@ impl DataBlock { let offset = binary_index.get(mid); - if key >= self.get_key_at(offset as usize).0 { + if key >= self.get_key_at(offset as usize)?.0 { left = mid + 1; } else { right = mid; @@ -276,15 +337,17 @@ impl DataBlock { } if left == 0 { - return None; + return Ok(None); } let offset = binary_index.get(left - 1); self.walk(key, seqno, offset as usize, restart_interval) - .expect("OH NO") } + // TODO: blocks are often <64K in size, so maybe we can selectively fall back to a + // BinaryIndex to save some space - would need a flag that determines + // what "type" the binary index is pub fn encode_items( items: &[InternalValue], restart_interval: u8, @@ -309,14 +372,33 @@ impl DataBlock { #[cfg(test)] mod tests { - use super::DataBlock; + use super::*; use crate::{ segment::block::{header::Header, offset::BlockOffset}, super_segment::Block, Checksum, InternalValue, + ValueType::{Tombstone, Value}, }; + use std::cmp::Ordering::{Equal, Greater, Less}; use test_log::test; + #[test] + fn v3_compare_prefixed_slice() { + assert_eq!(Equal, compare_prefixed_slice(b"", b"", b"")); + + assert_eq!(Greater, compare_prefixed_slice(b"a", b"", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"", b"a", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"a", b"a", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"b", b"a", b"a")); + assert_eq!(Greater, compare_prefixed_slice(b"a", b"b", b"a")); + + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"y")); + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); + assert_eq!(Less, compare_prefixed_slice(b"yyyy", b"a", b"yyyyb")); + assert_eq!(Less, compare_prefixed_slice(b"yyy", b"b", b"yyyyb")); + } + #[test] fn v3_data_block_point_read() -> crate::Result<()> { let items = [ @@ -380,25 +462,135 @@ mod tests { }, }; - /* use std::time::Instant; + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), + InternalValue::from_components([0], b"", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + for needle in items { + eprintln!("NEEDLE {needle:?}"); - let start = Instant::now(); - for _ in 0..1_000_000 { - data_block.point_read(&needle.key.user_key); + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); } - eprintln!("one read took {:?}ns", { - let ns = start.elapsed().as_nanos(); - ns / 1_000_000 - }); */ + + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], [], 18_446_568_565_776_614_018, Value), + InternalValue::from_components([0], [], 6_989_411_799_330_193_407, Tombstone), + 
InternalValue::from_components([0], [], 864_515_618_921_971_552, Value), + InternalValue::from_components([0], [], 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; for needle in items { + eprintln!("NEEDLE {needle:?}"); + assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + fn v3_data_block_dense_mvcc() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)?); Ok(()) } @@ -406,21 +598,11 @@ mod tests { #[test] fn v3_data_block_point_read_shadowing() -> crate::Result<()> { let items = [ - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; let bytes = DataBlock::encode_items(&items, 16, 0.75)?; @@ -442,8 +624,8 @@ mod tests { }; assert!(data_block - .point_read(b"pla:venus:fact", None) - .expect("should exist") + .point_read(b"pla:venus:fact", None)? 
+ .unwrap() .is_tombstone()); Ok(()) @@ -452,46 +634,16 @@ mod tests { #[test] fn v3_data_block_point_read_dense() -> crate::Result<()> { let items = [ - InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:fact", - "Jupiter is big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:mass", - "Massive", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:name", - "Jupiter", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; let bytes = DataBlock::encode_items(&items, 1, 0.75)?; @@ -515,11 +667,45 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + for item in data_block.iter() { + eprintln!("{item:?}"); + } + + assert_eq!(data_block.iter().count(), items.len()); Ok(()) } From 2f90ba0735ea8a99795455c8d157f2d7e159d680 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 24 Mar 2025 19:38:28 +0100 Subject: [PATCH 013/613] old block bench: use random keys --- benches/block.rs | 32 ++++++++++++++++---------------- 1 file 
changed, 16 insertions(+), 16 deletions(-) diff --git a/benches/block.rs b/benches/block.rs index b7107354..fe203275 100644 --- a/benches/block.rs +++ b/benches/block.rs @@ -8,6 +8,7 @@ use lsm_tree::{ }, Checksum, InternalValue, }; +use rand::Rng; use std::io::Write; /* fn value_block_size(c: &mut Criterion) { @@ -50,19 +51,11 @@ fn value_block_find(c: &mut Criterion) { for item_count in [10, 100, 1_000, 10_000] { let mut items = vec![]; - for seqno in (0..(item_count - 2)).rev() { + for item in 0u64..item_count { items.push(InternalValue::from_components( - *b"a", - *b"a", - seqno, - lsm_tree::ValueType::Value, - )); - } - for seqno in (0..2).rev() { - items.push(InternalValue::from_components( - *b"b", - *b"b", - seqno, + item.to_be_bytes(), + b"", + 0, lsm_tree::ValueType::Value, )); } @@ -78,22 +71,29 @@ fn value_block_find(c: &mut Criterion) { }, }; + let mut rng = rand::rng(); + group.bench_function(format!("{item_count} items (linear)"), |b| { b.iter(|| { + let needle = rng.random_range(0..item_count).to_be_bytes(); + let item = block .items .iter() - .find(|item| &*item.key.user_key == b"b") + .find(|item| &*item.key.user_key == needle) .cloned() .unwrap(); - assert_eq!(item.key.seqno, 1); + + assert_eq!(item.key.user_key, needle); }) }); group.bench_function(format!("{item_count} items (binary search)"), |b| { b.iter(|| { - let item = block.get_latest(b"b").unwrap(); - assert_eq!(item.key.seqno, 1); + let needle = rng.random_range(0..item_count).to_be_bytes(); + + let item = block.get_latest(&needle).unwrap(); + assert_eq!(item.key.user_key, needle); }) }); } From 4250f22de70c0970948784a04bfe7328e56558f7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 27 Mar 2025 04:40:44 +0100 Subject: [PATCH 014/613] wip --- fuzz/fuzz_targets/data_block.rs | 19 +- src/super_segment/binary_index/builder.rs | 29 +- src/super_segment/binary_index/reader.rs | 28 +- src/super_segment/data_block/encoder.rs | 84 ++-- src/super_segment/data_block/iter.rs | 47 ++- src/super_segment/data_block/mod.rs | 469 ++++++++++++++++++++-- src/super_segment/hash_index/builder.rs | 8 +- src/super_segment/hash_index/mod.rs | 2 +- src/super_segment/hash_index/reader.rs | 2 +- src/super_segment/mod.rs | 4 +- 10 files changed, 588 insertions(+), 104 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index f904862f..9b10b385 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -70,6 +70,10 @@ fuzz_target!(|data: &[u8]| { ); } */ + /* if items.len() > 100 { + eprintln!("================== {}. 
", items.len()); + } */ + let items = items.into_iter().map(|value| value.0).collect::>(); let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); @@ -87,9 +91,15 @@ fuzz_target!(|data: &[u8]| { }, }; + if data_block.binary_index_pointer_count() > 254 { + assert!(data_block.hash_bucket_count() == 0); + } else if hash_ratio > 0.0 { + assert!(data_block.hash_bucket_count() > 0); + } + // eprintln!("{items:?}"); - for needle in items { + for needle in &items { if needle.key.seqno == SeqNo::MAX { continue; } @@ -103,6 +113,13 @@ fuzz_target!(|data: &[u8]| { .unwrap(), ); } + + assert_eq!( + items, + data_block.iter().map(|x| x.unwrap()).collect::>(), + ); + + // TODO: add rev and ping-pong iters } } }); diff --git a/src/super_segment/binary_index/builder.rs b/src/super_segment/binary_index/builder.rs index 656d1a19..3212d73d 100644 --- a/src/super_segment/binary_index/builder.rs +++ b/src/super_segment/binary_index/builder.rs @@ -1,4 +1,4 @@ -use byteorder::{BigEndian, WriteBytesExt}; +use byteorder::{LittleEndian, WriteBytesExt}; #[derive(Debug)] pub struct Builder(Vec); @@ -12,13 +12,32 @@ impl Builder { self.0.push(pos); } - pub fn write(self, writer: &mut W) -> crate::Result { + pub fn write(self, writer: &mut W) -> crate::Result<(u8, usize)> { + // NOTE: We check if the pointers may fit in 16-bits + // If so, we halve the index size by storing u16 instead of u32 + let step_size = { + if u16::try_from(*self.0.last().expect("should not be empty")).is_ok() { + 2 + } else { + 4 + } + }; + let len = self.0.len(); - for offset in self.0 { - writer.write_u32::(offset)?; // TODO: benchmark little endian on x86_64 + if step_size == 2 { + // Write u16 index + for offset in self.0 { + let offset = offset as u16; + writer.write_u16::(offset)?; + } + } else { + // Write u32 index + for offset in self.0 { + writer.write_u32::(offset)?; + } } - Ok(len) + Ok((step_size, len)) } } diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index a3628a9e..7c557431 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -1,28 +1,34 @@ -use byteorder::{BigEndian, ReadBytesExt}; - -type FencePtr = u32; - -const FENCE_PTR_SIZE: usize = std::mem::size_of::(); +use byteorder::{LittleEndian, ReadBytesExt}; pub struct Reader<'a> { bytes: &'a [u8], + step_size: usize, } impl<'a> Reader<'a> { - pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { + pub fn new(bytes: &'a [u8], offset: usize, len: usize, step_size: usize) -> Self { Self { - bytes: &bytes[offset..(offset + len * FENCE_PTR_SIZE)], + bytes: &bytes[offset..(offset + len * step_size)], + step_size, } } pub fn len(&self) -> usize { - self.bytes.len() / FENCE_PTR_SIZE + self.bytes.len() / self.step_size } - pub fn get(&self, idx: usize) -> FencePtr { - let offset = idx * FENCE_PTR_SIZE; + pub(crate) fn get(&self, idx: usize) -> usize { + let offset = idx * self.step_size; let mut bytes = self.bytes.get(offset..).expect("should be in array"); - bytes.read_u32::().expect("should read") + + if self.step_size == 2 { + bytes + .read_u16::() + .expect("should read") + .into() + } else { + bytes.read_u32::().expect("should read") as usize + } } } diff --git a/src/super_segment/data_block/encoder.rs b/src/super_segment/data_block/encoder.rs index feef1d9e..673b90a9 100644 --- a/src/super_segment/data_block/encoder.rs +++ b/src/super_segment/data_block/encoder.rs @@ -1,13 +1,15 @@ use super::super::binary_index::Builder as 
BinaryIndexBuilder; use super::super::hash_index::Builder as HashIndexBuilder; -use crate::{coding::Encode, InternalValue}; -use byteorder::{BigEndian, WriteBytesExt}; +use crate::{ + coding::Encode, super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX, InternalValue, +}; +use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; use varint_rs::VarintWriter; pub const TERMINATOR_MARKER: u8 = 255; -pub const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + std::mem::size_of::(); +pub const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + (2 * std::mem::size_of::()); fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { s1.iter() @@ -120,18 +122,23 @@ impl<'a> Encoder<'a> { Ok(()) } + // TODO: maybe change the order of trailer items a bit so we can get to the binary index first pub fn finish(mut self) -> crate::Result> { // IMPORTANT: Terminator marker self.writer.write_u8(TERMINATOR_MARKER)?; + // TODO: version u8? -> add to segment metadata instead + // NOTE: We know that data blocks will never even approach 4 GB in size #[allow(clippy::cast_possible_truncation)] let binary_index_offset = self.writer.len() as u32; - let binary_index_len = self.binary_index_builder.write(&mut self.writer)?; + // Write binary index + let (binary_index_step_size, binary_index_len) = + self.binary_index_builder.write(&mut self.writer)?; let mut hash_index_offset = 0u32; - let mut hash_index_len = 0u32; + let hash_index_len = self.hash_index_builder.bucket_count(); // TODO: unit test when binary index is too long // NOTE: We can only use a hash index when there are 254 buckets or less @@ -139,52 +146,57 @@ impl<'a> Encoder<'a> { // // With the default restart interval of 16, that still gives us support // for up to ~4000 KVs - if self.hash_index_builder.bucket_count() > 0 && binary_index_len <= (u8::MAX - 2).into() { + if self.hash_index_builder.bucket_count() > 0 + && binary_index_len <= MAX_POINTERS_FOR_HASH_INDEX.into() + { // NOTE: We know that data blocks will never even approach 4 GB in size #[allow(clippy::cast_possible_truncation)] { hash_index_offset = self.writer.len() as u32; } - hash_index_len = self.hash_index_builder.bucket_count(); - + // Write hash index self.hash_index_builder.write(&mut self.writer)?; } - #[cfg(debug_assertions)] - let bytes_before = self.writer.len(); - // Trailer: // [item_count] [restart_interval] [binary_index_offset] [binary_index_len] [hash_index_offset] [hash_index_len] + { + #[cfg(debug_assertions)] + let bytes_before = self.writer.len(); - // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either - #[allow(clippy::cast_possible_truncation)] - self.writer.write_u32::(self.item_count as u32)?; + // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either + #[allow(clippy::cast_possible_truncation)] + self.writer + .write_u32::(self.item_count as u32)?; - self.writer.write_u8(self.restart_interval)?; + self.writer.write_u8(self.restart_interval)?; - self.writer.write_u32::(binary_index_offset)?; + self.writer.write_u8(binary_index_step_size)?; - // NOTE: Even with a dense index, there can't be more index pointers than items - #[allow(clippy::cast_possible_truncation)] - self.writer - .write_u32::(binary_index_len as u32)?; - - self.writer.write_u32::(hash_index_offset)?; - - self.writer - .write_u32::(if hash_index_offset > 0 { - hash_index_len - } else { - 0 - })?; - - #[cfg(debug_assertions)] - assert_eq!( - TRAILER_SIZE, - 
self.writer.len() - bytes_before, - "trailer size does not match", - ); + self.writer.write_u32::(binary_index_offset)?; + + // NOTE: Even with a dense index, there can't be more index pointers than items + #[allow(clippy::cast_possible_truncation)] + self.writer + .write_u32::(binary_index_len as u32)?; + + self.writer.write_u32::(hash_index_offset)?; + + self.writer + .write_u32::(if hash_index_offset > 0 { + hash_index_len + } else { + 0 + })?; + + #[cfg(debug_assertions)] + assert_eq!( + TRAILER_SIZE, + self.writer.len() - bytes_before, + "trailer size does not match", + ); + } Ok(self.writer) } diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index 88ce41e4..dd00a153 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -3,7 +3,7 @@ use crate::{ coding::DecodeError, super_segment::data_block::encoder::TERMINATOR_MARKER, InternalValue, Slice, ValueType, }; -use byteorder::{BigEndian, ReadBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Seek}; use varint_rs::VarintReader; @@ -23,7 +23,7 @@ impl Iter { let bytes = &block.inner.data; let mut reader = &bytes[bytes.len() - TRAILER_SIZE..]; - let _item_count = reader.read_u32::().expect("should read") as usize; + let _item_count = reader.read_u32::().expect("should read") as usize; let restart_interval = reader.read_u8().expect("should read") as usize; Self { @@ -48,22 +48,29 @@ impl Iterator for Iter { if is_restart { let parsed = fail_iter!(DataBlock::parse_restart_item(&mut cursor)); - let value_type = parsed.value_type; + + if parsed.value_type == TERMINATOR_MARKER { + return None; + } + + let value_type: ValueType = fail_iter!(parsed + .value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))); + let seqno = parsed.seqno; let key_start = self.cursor + parsed.key_start; let key_end = key_start + parsed.key_len; let key = bytes.slice(key_start..key_end); - // TODO: check for tombstones!!! TEST!!! 
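Putting the encoder changes together: every block now ends with a fixed 22-byte trailer, `[item_count: u32] [restart_interval: u8] [binary_index_step_size: u8] [binary_index_offset: u32] [binary_index_len: u32] [hash_index_offset: u32] [hash_index_len: u32]`, all little-endian, which is exactly what `TRAILER_SIZE = 5 * size_of::<u32>() + 2 * size_of::<u8>()` accounts for (note the older `// Trailer:` comment in the hunk above does not yet list the new step-size byte). A hedged sketch of the matching read side, assuming only `byteorder`; the `Trailer` struct and function name are illustrative, not crate API:

```rust
// Hedged sketch of reading the trailer written by `Encoder::finish` above
// (little-endian as of this patch). Names are illustrative, not the crate's API.
use byteorder::{LittleEndian, ReadBytesExt};

#[derive(Debug)]
struct Trailer {
    item_count: u32,
    restart_interval: u8,
    binary_index_step_size: u8, // 2 = u16 pointers, 4 = u32 pointers
    binary_index_offset: u32,
    binary_index_len: u32,
    hash_index_offset: u32, // 0 means "no hash index was written"
    hash_index_len: u32,    // only meaningful if hash_index_offset > 0
}

fn read_trailer(block: &[u8]) -> std::io::Result<Trailer> {
    const TRAILER_SIZE: usize = 5 * std::mem::size_of::<u32>() + 2 * std::mem::size_of::<u8>();
    let mut r = &block[block.len() - TRAILER_SIZE..];

    // Field order matches the write order in `Encoder::finish`
    Ok(Trailer {
        item_count: r.read_u32::<LittleEndian>()?,
        restart_interval: r.read_u8()?,
        binary_index_step_size: r.read_u8()?,
        binary_index_offset: r.read_u32::<LittleEndian>()?,
        binary_index_len: r.read_u32::<LittleEndian>()?,
        hash_index_offset: r.read_u32::<LittleEndian>()?,
        hash_index_len: r.read_u32::<LittleEndian>()?,
    })
}
```

Keeping the trailer at the very end lets the encoder stream items without knowing counts up front; readers simply seek to `len - TRAILER_SIZE` first.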
- let val_len: usize = if value_type == ValueType::Value { - cursor.read_u32_varint().expect("should read") as usize + fail_iter!(cursor.read_u32_varint()) as usize } else { 0 }; let val_offset = self.cursor + cursor.position() as usize; - cursor.seek_relative(val_len as i64).expect("should read"); + fail_iter!(cursor.seek_relative(val_len as i64)); self.cursor += cursor.position() as usize; self.idx += 1; @@ -86,34 +93,34 @@ impl Iterator for Iter { .try_into() .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))); - let seqno = cursor.read_u64_varint().expect("should read"); + let seqno = fail_iter!(cursor.read_u64_varint()); - let shared_prefix_len: usize = cursor.read_u16_varint().expect("should read").into(); - let rest_key_len: usize = cursor.read_u16_varint().expect("should read").into(); + let shared_prefix_len: usize = fail_iter!(cursor.read_u16_varint()).into(); + let rest_key_len: usize = fail_iter!(cursor.read_u16_varint()).into(); let key_offset = self.cursor + cursor.position() as usize; - let prefix_part = &self.base_key.as_ref().expect("should exist")[0..shared_prefix_len]; + // SAFETY: We always start with a restart item, so the base key is always set to Some(_) + #[warn(unsafe_code)] + let base_key = unsafe { self.base_key.as_ref().unwrap_unchecked() }; + + let prefix_part = &base_key[..shared_prefix_len]; let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; - cursor - .seek_relative(rest_key_len as i64) - .expect("should read"); + fail_iter!(cursor.seek_relative(rest_key_len as i64)); let val_len: usize = if value_type == ValueType::Value { - cursor.read_u32_varint().expect("should read") as usize + fail_iter!(cursor.read_u32_varint()) as usize } else { 0 }; let val_offset = self.cursor + cursor.position() as usize; - cursor.seek_relative(val_len as i64).expect("should read"); - - eprintln!("{prefix_part:?} <-> {rest_key:?}"); + fail_iter!(cursor.seek_relative(val_len as i64)); let key = if shared_prefix_len == 0 { bytes.slice(key_offset..(key_offset + rest_key_len)) } else { // Stitch key - Slice::fused(&[prefix_part, rest_key]) + Slice::fused(prefix_part, rest_key) }; self.cursor += cursor.position() as usize; @@ -123,7 +130,7 @@ impl Iterator for Iter { let value = bytes.slice(val_offset..(val_offset + val_len)); InternalValue::from_components(key, value, seqno, value_type) } else { - InternalValue::from_components(key, b"", seqno, value_type) + InternalValue::new_tombstone(key, seqno) })) } } diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 6a3e6c53..f5388f01 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -4,7 +4,7 @@ mod iter; use super::hash_index::Reader as HashIndexReader; use super::{binary_index::Reader as BinaryIndexReader, Block}; use crate::{coding::DecodeError, InternalValue, SeqNo, Slice, ValueType}; -use byteorder::{BigEndian, ReadBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt}; use encoder::{TERMINATOR_MARKER, TRAILER_SIZE}; use std::cmp::Ordering; use std::{ @@ -69,7 +69,7 @@ pub struct DataBlock { } struct RestartHead { - value_type: ValueType, + value_type: u8, seqno: SeqNo, key_start: usize, key_len: usize, @@ -104,10 +104,6 @@ impl DataBlock { fn parse_restart_item(cursor: &mut Cursor<&[u8]>) -> crate::Result { let value_type = cursor.read_u8()?; - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - let seqno = cursor.read_u64_varint()?; let key_len: usize 
= cursor.read_u16_varint()?.into();
@@ -139,7 +135,12 @@ impl DataBlock {
         // NOTE: Check the full item
         let base_key = {
             let parsed = Self::parse_restart_item(&mut cursor)?;
-            let value_type = parsed.value_type;
+
+            let value_type: ValueType = parsed
+                .value_type
+                .try_into()
+                .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))?;
+
             let seqno = parsed.seqno;
 
             let key_start = pos + parsed.key_start;
@@ -236,7 +237,7 @@ impl DataBlock {
                         bytes.slice(base_key_pos..(base_key_pos + shared_prefix_len))
                     } else {
                         // Stitch key
-                        Slice::fused(&[prefix_part, rest_key])
+                        Slice::fused(prefix_part, rest_key)
                     };
@@ -265,6 +266,68 @@ impl DataBlock {
         Ok(None)
     }
 
+    pub fn binary_index_pointer_count(&self) -> usize {
+        let bytes = &self.inner.data;
+
+        // SAFETY: We know that there is always a trailer, so we cannot go out of bounds
+        #[warn(unsafe_code)]
+        let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) };
+
+        let _item_count = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+        let _restart_interval = reader.read_u8().expect("should read") as usize;
+
+        let _binary_index_step_size = reader.read_u8().expect("should read") as usize;
+
+        let _binary_index_offset = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+
+        reader.read_u32::<LittleEndian>().expect("should read") as usize
+    }
+
+    pub fn hash_bucket_count(&self) -> usize {
+        let bytes = &self.inner.data;
+
+        // SAFETY: We know that there is always a trailer, so we cannot go out of bounds
+        #[warn(unsafe_code)]
+        let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) };
+
+        let _item_count = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+        let _restart_interval = reader.read_u8().expect("should read") as usize;
+
+        let _binary_index_step_size = reader.read_u8().expect("should read") as usize;
+        let _binary_index_offset = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+        let _binary_index_len = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+
+        let hash_index_offset = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+
+        if hash_index_offset > 0 {
+            reader.read_u32::<LittleEndian>().expect("should read") as usize
+        } else {
+            0
+        }
+    }
+
+    fn trailer_offset(&self) -> usize {
+        self.inner.data.len() - TRAILER_SIZE
+    }
+
+    /// Returns the amount of items in the block
+    #[must_use]
+    pub fn len(&self) -> usize {
+        let bytes = &self.inner.data;
+
+        // SAFETY: We know that there is always a trailer, so we cannot go out of bounds
+        #[warn(unsafe_code)]
+        let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) };
+
+        reader.read_u32::<LittleEndian>().expect("should read") as usize
+    }
+
+    /// Always returns false: a block is never empty
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        false
+    }
+
     pub fn point_read(
         &self,
         key: &[u8],
         seqno: Option<SeqNo>,
     ) -> crate::Result<Option<InternalValue>> {
         let bytes = &self.inner.data;
 
-        let mut reader = &bytes[bytes.len() - TRAILER_SIZE..];
+        let start_pos = self.trailer_offset()
+            + /* skip item count */ std::mem::size_of::<u32>();
+
+        // SAFETY: We know that there is always a trailer, so we cannot go out of bounds
+        #[warn(unsafe_code)]
+        let mut reader = unsafe { bytes.get_unchecked(start_pos..) };
 
-        let _item_count = reader.read_u32::<BigEndian>().expect("should read") as usize;
         let restart_interval = reader.read_u8().expect("should read") as usize;
 
-        let binary_index_offset = reader.read_u32::<BigEndian>().expect("should read") as usize;
-        let binary_index_len = reader.read_u32::<BigEndian>().expect("should read") as usize;
-        let binary_index = BinaryIndexReader::new(bytes, binary_index_offset, binary_index_len);
+        let binary_index_step_size = reader.read_u8().expect("should read") as usize;
+
+        debug_assert!(
+            binary_index_step_size == 2 || binary_index_step_size == 4,
+            "invalid binary index step size",
+        );
+
+        // eprintln!("binary index step size={binary_index_step_size}");
+
+        let binary_index_offset = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+        let binary_index_len = reader.read_u32::<LittleEndian>().expect("should read") as usize;
+        let binary_index = BinaryIndexReader::new(
+            bytes,
+            binary_index_offset,
+            binary_index_len,
+            binary_index_step_size,
+        );
 
         // TODO: if the binary index is really dense, don't look into hash index, or
         // maybe don't even build it in the first place
 
-        let hash_index_offset = reader.read_u32::<BigEndian>().expect("should read") as usize;
+        let hash_index_offset = reader.read_u32::<LittleEndian>().expect("should read") as usize;
 
         if hash_index_offset > 0 {
-            let hash_bucket_count = reader.read_u32::<BigEndian>().expect("should read") as usize;
+            let hash_bucket_count =
+                reader.read_u32::<LittleEndian>().expect("should read") as usize;
 
             let hash_index = HashIndexReader::new(bytes, hash_index_offset, hash_bucket_count);
 
             if let Some(bucket_value) = hash_index.get(key) {
                 let restart_entry_pos = binary_index.get(usize::from(bucket_value));
 
-                return self.walk(key, seqno, restart_entry_pos as usize, restart_interval);
+                return self.walk(key, seqno, restart_entry_pos, restart_interval);
             }
         }
@@ -316,7 +398,7 @@ impl DataBlock {
 
             let offset = binary_index.get(mid);
 
-            if (key, seqno_cmp) >= self.get_key_at(offset as usize)? {
+            if (key, seqno_cmp) >= self.get_key_at(offset)? 
{ left = mid + 1; } else { right = mid; @@ -328,7 +410,7 @@ impl DataBlock { let offset = binary_index.get(mid); - if key >= self.get_key_at(offset as usize)?.0 { + if key >= self.get_key_at(offset)?.0 { left = mid + 1; } else { right = mid; @@ -342,7 +424,7 @@ impl DataBlock { let offset = binary_index.get(left - 1); - self.walk(key, seqno, offset as usize, restart_interval) + self.walk(key, seqno, offset, restart_interval) } // TODO: blocks are often <64K in size, so maybe we can selectively fall back to a @@ -371,6 +453,7 @@ impl DataBlock { } #[cfg(test)] +#[allow(clippy::expect_used)] mod tests { use super::*; use crate::{ @@ -462,6 +545,8 @@ mod tests { }, }; + assert!(data_block.hash_bucket_count() > 0); + for needle in items { assert_eq!( Some(needle.clone()), @@ -499,6 +584,8 @@ mod tests { }, }; + assert!(data_block.hash_bucket_count() > 0); + for needle in items { eprintln!("NEEDLE {needle:?}"); @@ -540,6 +627,8 @@ mod tests { }, }; + assert!(data_block.hash_bucket_count() == 0); + for needle in items { eprintln!("NEEDLE {needle:?}"); @@ -555,7 +644,103 @@ mod tests { } #[test] - fn v3_data_block_dense_mvcc() -> crate::Result<()> { + fn v3_data_block_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::from([ + 255, 255, 255, 255, 5, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, + ]), + Slice::from([0, 0, 192]), + 18_446_744_073_701_163_007, + Tombstone, + ), + InternalValue::from_components( + Slice::from([255, 255, 255, 255, 255, 255, 0]), + Slice::from([]), + 0, + Value, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 5, 1.0)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert!(data_block.hash_bucket_count() > 0); + + assert_eq!( + data_block.iter().map(|x| x.expect("should be ok")).count(), + items.len(), + ); + + assert_eq!( + items, + *data_block + .iter() + .map(|x| x.expect("should be ok")) + .collect::>(), + ); + + Ok(()) + } + + #[test] + fn v3_data_block_dense_mvcc_with_hash() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert!(data_block.hash_bucket_count() > 0); + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + fn v3_data_block_dense_mvcc_no_hash() -> crate::Result<()> { let items = [ InternalValue::from_components(b"a", b"a", 3, Value), InternalValue::from_components(b"a", b"a", 2, Value), @@ -581,6 +766,8 @@ mod tests { }, }; + 
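        // Ordering note (illustrative): items in a block are laid out with keys
        // ascending and seqnos *descending* (newest first), which is exactly the
        // order the `(key, Reverse(seqno))` comparisons in `point_read` assume:
        //
        //     use std::cmp::Reverse;
        //     assert!((b"a".as_slice(), Reverse(3u64)) < (b"a".as_slice(), Reverse(2u64)));
        //     assert!((b"a".as_slice(), Reverse(1u64)) < (b"b".as_slice(), Reverse(65u64)));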
assert!(data_block.hash_bucket_count() == 0); + for needle in items { eprintln!("NEEDLE {needle:?}"); @@ -623,9 +810,11 @@ mod tests { }, }; + assert!(data_block.hash_bucket_count() > 0); + assert!(data_block .point_read(b"pla:venus:fact", None)? - .unwrap() + .expect("should exist") .is_tombstone()); Ok(()) @@ -664,6 +853,8 @@ mod tests { }, }; + assert!(data_block.hash_bucket_count() > 0); + for needle in items { assert_eq!( Some(needle.clone()), @@ -676,6 +867,40 @@ mod tests { Ok(()) } + #[test] + fn v3_data_block_iter_forward_one_time() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + Value, + )]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert_eq!( + data_block.iter().map(|x| x.expect("should be ok")).count(), + items.len() + ); + + assert_eq!(data_block.iter().flatten().collect::>(), items); + + Ok(()) + } + #[test] fn v3_data_block_iter_forward() -> crate::Result<()> { let items = [ @@ -701,11 +926,207 @@ mod tests { }, }; - for item in data_block.iter() { - eprintln!("{item:?}"); + assert!(data_block.hash_bucket_count() > 0); + + assert_eq!( + data_block.iter().map(|x| x.expect("should be ok")).count(), + items.len(), + ); + + assert_eq!( + items, + *data_block + .iter() + .map(|x| x.expect("should be ok")) + .collect::>(), + ); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward_dense() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + Value, + )]; + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert_eq!( + items.len(), + data_block.iter().map(|x| x.expect("should be ok")).count(), + ); + + assert_eq!( + items, + *data_block + .iter() + .map(|x| x.expect("should be ok")) + .collect::>(), + ); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_rev() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert!(data_block.hash_bucket_count() > 0); + + assert_eq!( + items.len(), + data_block + .iter() + .rev() + .map(|x| x.expect("should be ok")) + .count(), + ); + + assert_eq!( + items.into_iter().rev().collect::>(), + data_block + .iter() + .rev() + .map(|x| x.expect("should be ok")) + .collect::>(), + ); + + Ok(()) + } + + #[test] + fn 
v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..254) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert_eq!(0, data_block.hash_bucket_count()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); } - assert_eq!(data_block.iter().count(), items.len()); + Ok(()) + } + + #[test] + fn v3_data_block_too_many_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..255) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert_eq!(0, data_block.hash_bucket_count()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_no_hash_index() -> crate::Result<()> { + let items = (0u64..1) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock { + inner: Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + compression: crate::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }, + }; + + assert_eq!(0, data_block.hash_bucket_count()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } Ok(()) } diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 594ee581..3d040fb5 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/hash_index/builder.rs @@ -1,6 +1,8 @@ use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; use byteorder::WriteBytesExt; +pub const MAX_POINTERS_FOR_HASH_INDEX: u8 = u8::MAX - 2; + #[derive(Debug)] pub struct Builder(Vec); @@ -20,7 +22,7 @@ impl Builder { let bucket_pos = calculate_bucket_position(key, self.bucket_count()); // SAFETY: We used modulo - #[allow(unsafe_code)] + #[warn(unsafe_code)] let curr_marker = unsafe { *self.0.get_unchecked(bucket_pos) }; match curr_marker { @@ -29,7 +31,7 @@ impl Builder { // NOTE: Free slot // SAFETY: We previously asserted that the slot exists - #[allow(unsafe_code)] + #[warn(unsafe_code)] unsafe { *self.0.get_unchecked_mut(bucket_pos) = binary_index_pos; } @@ -45,7 +47,7 @@ impl Builder { // NOTE: Mark as conflicted // SAFETY: We previously asserted that the slot exists - #[allow(unsafe_code)] + #[warn(unsafe_code)] unsafe { *self.0.get_unchecked_mut(bucket_pos) = MARKER_CONFLICT; } diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/hash_index/mod.rs index 
1e96f726..9226d660 100644 --- a/src/super_segment/hash_index/mod.rs +++ b/src/super_segment/hash_index/mod.rs @@ -13,5 +13,5 @@ fn calculate_bucket_position(key: &[u8], bucket_count: u32) -> usize { (hash % u64::from(bucket_count)) as usize } -pub use builder::Builder; +pub use builder::{Builder, MAX_POINTERS_FOR_HASH_INDEX}; pub use reader::Reader; diff --git a/src/super_segment/hash_index/reader.rs b/src/super_segment/hash_index/reader.rs index 67d99e7b..8ae91d86 100644 --- a/src/super_segment/hash_index/reader.rs +++ b/src/super_segment/hash_index/reader.rs @@ -15,7 +15,7 @@ impl<'a> Reader<'a> { let bucket_pos = calculate_bucket_position(key, bucket_count); // SAFETY: We used modulo - #[allow(unsafe_code)] + #[warn(unsafe_code)] let marker = unsafe { *self.0.get_unchecked(bucket_pos) }; match marker { diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index dae411db..0703f638 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -2,10 +2,10 @@ pub(crate) mod binary_index; pub(crate) mod data_block; pub(crate) mod hash_index; -use crate::{segment::block::header::Header, Slice}; - pub use data_block::DataBlock; +use crate::{segment::block::header::Header, Slice}; + /// A block on disk. /// /// Consists of a header and some bytes (the data/payload) From ceecaa28fc6955d7a0e62ecf2de42dbc6b54f360 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 30 Mar 2025 18:31:17 +0200 Subject: [PATCH 015/613] version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5d72e83e..1a944d55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "2.7.2" +version = "3.0.0" edition = "2021" rust-version = "1.75.0" readme = "README.md" From a7bf9deb82e7d72d4a5bcfe6c05c59f723feabf8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 30 Mar 2025 18:31:19 +0200 Subject: [PATCH 016/613] wip --- src/super_segment/data_block/iter.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index dd00a153..41139cc8 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -135,3 +135,9 @@ impl Iterator for Iter { } } } + +impl DoubleEndedIterator for Iter { + fn next_back(&mut self) -> Option { + todo!() + } +} From 85c7e9a2fbb8f8d4a0948b5183119c9cd8c19019 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 30 Mar 2025 18:37:55 +0200 Subject: [PATCH 017/613] wip --- UNSAFE.md | 7 +++++++ src/lib.rs | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/UNSAFE.md b/UNSAFE.md index 5b7f6033..9e64c312 100644 --- a/UNSAFE.md +++ b/UNSAFE.md @@ -3,3 +3,10 @@ Currently, the project itself only uses one **1** unsafe block (ignoring dependencies which are tested themselves separately): - https://github.com/fjall-rs/lsm-tree/blob/2d8686e873369bd9c4ff2b562ed988c1cea38331/src/binary_search.rs#L23-L25 + +## Run fuzz testing + +```bash +cargo +nightly fuzz run data_block -- -max_len=8000000 +cargo +nightly fuzz run partition_point -- -max_len=1000000 +``` diff --git a/src/lib.rs b/src/lib.rs index 8db0a67f..c615622c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -186,7 +186,8 @@ mod tree; mod value; mod version; -mod super_segment; +#[doc(hidden)] +pub mod super_segment; /// KV-tuple, typically returned by an iterator pub type KvPair = (UserKey, UserValue); 
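The hash index layered on top of the binary index is a flat array of one-byte buckets: a bucket holds either a binary-index slot, `MARKER_FREE` (254, never written) or `MARKER_CONFLICT` (255, two keys collided), so a point read costs one xxh3 hash plus one byte probe before falling back to binary search. A hedged sketch of that probe, mirroring `calculate_bucket_position` from the diff above (the `probe` helper and its signature are illustrative, not the crate's API):

```rust
// Hedged sketch of the hash index probe on the point-read fast path.
// Marker values and the bucket function mirror src/super_segment/hash_index.
use xxhash_rust::xxh3::xxh3_64;

const MARKER_FREE: u8 = u8::MAX - 1; // 254: bucket was never written
const MARKER_CONFLICT: u8 = u8::MAX; // 255: two keys collided, fall back to binary search

fn calculate_bucket_position(key: &[u8], bucket_count: u32) -> usize {
    let hash = xxh3_64(key);
    (hash % u64::from(bucket_count)) as usize
}

/// Returns the binary index slot for `key`, or `None` if the caller must
/// binary-search instead (free or conflicted bucket).
fn probe(buckets: &[u8], key: &[u8]) -> Option<u8> {
    // One hash, one byte read, no probing chains - which is also why pointers
    // must fit in a byte (hence MAX_POINTERS_FOR_HASH_INDEX in the builder).
    let marker = buckets[calculate_bucket_position(key, buckets.len() as u32)];
    match marker {
        MARKER_FREE | MARKER_CONFLICT => None,
        pointer => Some(pointer),
    }
}
```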
From 56f409b849a4f453b403a28d45cdd1793182a80e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 30 Mar 2025 19:45:15 +0200 Subject: [PATCH 018/613] wip --- fuzz/fuzz_targets/data_block.rs | 20 +- src/super_segment/data_block/mod.rs | 284 ++++++++++++---------------- 2 files changed, 126 insertions(+), 178 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 9b10b385..f867155f 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -78,18 +78,16 @@ fuzz_target!(|data: &[u8]| { let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: lsm_tree::segment::block::header::Header { - checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), - compression: lsm_tree::CompressionType::None, - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: lsm_tree::segment::block::header::Header { + checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), + compression: lsm_tree::CompressionType::None, + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); if data_block.binary_index_pointer_count() > 254 { assert!(data_block.hash_bucket_count() == 0); diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index f5388f01..cda81672 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -76,6 +76,12 @@ struct RestartHead { } impl DataBlock { + #[must_use] + pub fn new(inner: Block) -> Self { + let bytes = &inner.data; + Self { inner } + } + /// Returns the uncompressed block size in bytes. #[must_use] pub fn size(&self) -> usize { @@ -147,12 +153,8 @@ impl DataBlock { let key_end = key_start + parsed.key_len; let key = &bytes[key_start..key_end]; - // eprintln!("walk start at {key:?} : {seqno}"); - base_key_pos = key_start; - // TODO: TEST TOMBSTONES + shadowing with tombstone at restart interval etc.!!! - let val_len: usize = if value_type == ValueType::Value { cursor.read_u32_varint().expect("should read") as usize } else { @@ -168,8 +170,6 @@ impl DataBlock { if !should_skip { let key = bytes.slice(key_start..key_end); - // TODO: TEST TOMBSTONES - return Ok(Some(if value_type == ValueType::Value { let val_offset = pos + cursor.position() as usize; let value = bytes.slice(val_offset..(val_offset + val_len)); @@ -216,8 +216,6 @@ impl DataBlock { let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; cursor.seek_relative(rest_key_len as i64)?; - // eprintln!(" maybe it is {:?}+{:?} : {seqno}", prefix_part, rest_key); - let val_len: usize = if value_type == ValueType::Value { cursor.read_u32_varint().expect("should read") as usize } else { @@ -389,32 +387,17 @@ impl DataBlock { return Ok(None); } - // TODO: try to refactor this somehow - if let Some(seqno) = seqno { - let seqno_cmp = Reverse(seqno - 1); + let seqno_cmp = Reverse(seqno.unwrap_or(u64::MAX) - 1); - while left < right { - let mid = (left + right) / 2; + while left < right { + let mid = (left + right) / 2; let offset = binary_index.get(mid); if (key, seqno_cmp) >= self.get_key_at(offset)? 
{ - left = mid + 1; - } else { - right = mid; - } - } - } else { - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if key >= self.get_key_at(offset)?.0 { - left = mid + 1; - } else { - right = mid; - } + left = mid + 1; + } else { + right = mid; } } @@ -427,9 +410,6 @@ impl DataBlock { self.walk(key, seqno, offset, restart_interval) } - // TODO: blocks are often <64K in size, so maybe we can selectively fall back to a - // BinaryIndex to save some space - would need a flag that determines - // what "type" the binary index is pub fn encode_items( items: &[InternalValue], restart_interval: u8, @@ -532,18 +512,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -571,18 +549,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -614,18 +590,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() == 0); @@ -665,18 +639,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 5, 1.0)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -710,18 +682,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, 
data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -753,18 +723,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() == 0); @@ -797,18 +765,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -840,18 +806,16 @@ mod tests { eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -878,18 +842,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert_eq!( data_block.iter().map(|x| x.expect("should be ok")).count(), @@ -913,18 +875,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -955,18 +915,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = 
DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert_eq!( items.len(), @@ -996,18 +954,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert!(data_block.hash_bucket_count() > 0); @@ -1040,18 +996,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert_eq!(0, data_block.hash_bucket_count()); @@ -1073,18 +1027,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert_eq!(0, data_block.hash_bucket_count()); @@ -1106,18 +1058,16 @@ mod tests { let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - let data_block = DataBlock { - inner: Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), compression: crate::CompressionType::None, data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), }, - }; + }); assert_eq!(0, data_block.hash_bucket_count()); From 019fe48055d2fd832c16e27af0da69aac77b889d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 2 Apr 2025 22:02:55 +0200 Subject: [PATCH 019/613] clipping iter --- src/clipping_iter.rs | 240 +++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 2 files changed, 241 insertions(+) create mode 100644 src/clipping_iter.rs diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs new file mode 100644 index 00000000..fca37fa4 --- /dev/null +++ b/src/clipping_iter.rs @@ -0,0 +1,240 @@ +use std::{ + marker::PhantomData, + ops::{Bound, RangeBounds}, +}; + +use crate::InternalValue; + +/// Clips an iterator to a key range +pub struct ClippingIter<'a, K, R, I> +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator>, +{ + _phantom: std::marker::PhantomData, + + inner: I, + range: &'a R, + + has_entered_lo: bool, + has_entered_hi: 
bool, +} + +impl<'a, K, R, I> ClippingIter<'a, K, R, I> +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator>, +{ + pub fn new(iter: I, range: &'a R) -> Self { + Self { + _phantom: PhantomData, + + inner: iter, + range, + + has_entered_lo: false, + has_entered_hi: false, + } + } +} + +impl Iterator for ClippingIter<'_, K, R, I> +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator>, +{ + type Item = crate::Result; + + fn next(&mut self) -> Option { + loop { + let item = fail_iter!(self.inner.next()?); + + if !self.has_entered_lo { + match self.range.start_bound() { + Bound::Included(start) => { + if item.key.user_key < start.as_ref() { + // Before min key + continue; + } + self.has_entered_lo = true; + } + Bound::Excluded(start) => { + if item.key.user_key <= start.as_ref() { + // Before or equal min key + continue; + } + self.has_entered_lo = true; + } + Bound::Unbounded => {} + } + } + + match self.range.end_bound() { + Bound::Included(start) => { + if item.key.user_key > start.as_ref() { + // After max key + return None; + } + } + Bound::Excluded(start) => { + if item.key.user_key >= start.as_ref() { + // Reached max key + return None; + } + } + Bound::Unbounded => {} + } + + return Some(Ok(item)); + } + } +} + +impl DoubleEndedIterator for ClippingIter<'_, K, R, I> +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator>, +{ + fn next_back(&mut self) -> Option { + loop { + let item = fail_iter!(self.inner.next_back()?); + + match self.range.start_bound() { + Bound::Included(start) => { + if item.key.user_key < start.as_ref() { + // Reached min key + return None; + } + } + Bound::Excluded(start) => { + if item.key.user_key <= start.as_ref() { + // Before min key + return None; + } + } + Bound::Unbounded => {} + } + + if !self.has_entered_hi { + match self.range.end_bound() { + Bound::Included(end) => { + if item.key.user_key > end.as_ref() { + // After max key + continue; + } + self.has_entered_hi = true; + } + Bound::Excluded(end) => { + if item.key.user_key >= end.as_ref() { + // After or equal max key + continue; + } + self.has_entered_hi = true; + } + Bound::Unbounded => {} + } + } + + return Some(Ok(item)); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn v3_clipping_iter_forwards() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), + ]; + let range = "c"..="d"; + + let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + assert_eq!( + Some(b"c" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert_eq!( + Some(b"d" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert!(iter.next().is_none()); + + Ok(()) + } + + #[test] + fn v3_clipping_iter_rev() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), + ]; + let range = 
"c"..="d"; + + let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + assert_eq!( + Some(b"d" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert_eq!( + Some(b"c" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert!(iter.next_back().is_none()); + + Ok(()) + } + + #[test] + fn v3_clipping_iter_ping_pong() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), + InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), + ]; + let range = "b"..="d"; + + let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + assert_eq!( + Some(b"b" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert_eq!( + Some(b"d" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert_eq!( + Some(b"c" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index c615622c..31a9fb3f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -135,6 +135,7 @@ mod cache; #[doc(hidden)] pub mod bloom; +mod clipping_iter; pub mod compaction; mod config; From 8bdb1897b7015abaad54bfe4e501e3197f6a803c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 2 Apr 2025 22:03:25 +0200 Subject: [PATCH 020/613] more fuzz --- fuzz/fuzz_targets/data_block.rs | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index f867155f..054f7c1c 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -62,7 +62,7 @@ fuzz_target!(|data: &[u8]| { items.sort(); items.dedup(); - /* eprintln!("-- items --"); + /* eprintln!("-- items --"); for item in items.iter().map(|value| &value.0) { eprintln!( r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, @@ -89,10 +89,12 @@ fuzz_target!(|data: &[u8]| { }, }); - if data_block.binary_index_pointer_count() > 254 { - assert!(data_block.hash_bucket_count() == 0); + assert_eq!(data_block.item_count(), items.len()); + + if data_block.binary_index_len() > 254 { + assert!(data_block.hash_bucket_count().is_none()); } else if hash_ratio > 0.0 { - assert!(data_block.hash_bucket_count() > 0); + assert!(data_block.hash_bucket_count().unwrap() > 0); } // eprintln!("{items:?}"); @@ -117,7 +119,18 @@ fuzz_target!(|data: &[u8]| { data_block.iter().map(|x| x.unwrap()).collect::>(), ); - // TODO: add rev and ping-pong iters + assert_eq!( + items.iter().rev().cloned().collect::>(), + data_block + .iter() + .rev() + .map(|x| x.unwrap()) + .collect::>(), + ); + + // TODO: add ping-pong iters + + // TODO: add range iter too } } }); From 932e64ec23e4955466ecff2906cee5cdad4f3c53 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:53:16 +0200 Subject: [PATCH 021/613] todo comments --- src/key.rs | 2 ++ src/value.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/key.rs b/src/key.rs index bef731e4..045756f2 100644 --- a/src/key.rs +++ b/src/key.rs @@ -71,6 +71,7 @@ impl InternalKey { } } +// TODO: 3.0.0 remove impl Encode for InternalKey { fn 
encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { writer.write_u8(u8::from(self.value_type))?; @@ -86,6 +87,7 @@ impl Encode for InternalKey { } } +// TODO: 3.0.0 remove impl Decode for InternalKey { fn decode_from(reader: &mut R) -> Result { let value_type = reader.read_u8()?; diff --git a/src/value.rs b/src/value.rs index 8120762f..5bc10a83 100644 --- a/src/value.rs +++ b/src/value.rs @@ -183,6 +183,7 @@ impl std::fmt::Debug for InternalValue { } } +// TODO: 3.0.0 remove impl Encode for InternalValue { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { self.key.encode_into(writer)?; @@ -199,6 +200,7 @@ impl Encode for InternalValue { } } +// TODO: 3.0.0 remove impl Decode for InternalValue { fn decode_from(reader: &mut R) -> Result { let key = InternalKey::decode_from(reader)?; From 7528b376a9d875516e7df684dd27129f7fef633b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:53:22 +0200 Subject: [PATCH 022/613] comments --- src/clipping_iter.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs index fca37fa4..c5845f29 100644 --- a/src/clipping_iter.rs +++ b/src/clipping_iter.rs @@ -1,10 +1,9 @@ +use crate::InternalValue; use std::{ marker::PhantomData, ops::{Bound, RangeBounds}, }; -use crate::InternalValue; - /// Clips an iterator to a key range pub struct ClippingIter<'a, K, R, I> where @@ -52,6 +51,9 @@ where loop { let item = fail_iter!(self.inner.next()?); + // NOTE: PERF: As soon as we enter ->[lo..] + // we don't need to do key comparisons anymore which are + // more expensive than a simple flag check, especially for long keys if !self.has_entered_lo { match self.range.start_bound() { Bound::Included(start) => { @@ -119,6 +121,9 @@ where Bound::Unbounded => {} } + // NOTE: PERF: As soon as we enter [..hi]<- + // we don't need to do key comparisons anymore which are + // more expensive than a simple flag check, especially for long keys if !self.has_entered_hi { match self.range.end_bound() { Bound::Included(end) => { From d3baf1226dca2c48d57668830c77c1f234a897c2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:53:37 +0200 Subject: [PATCH 023/613] adjust fuzz --- fuzz/fuzz_targets/data_block.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 054f7c1c..1d3f2a38 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -89,7 +89,7 @@ fuzz_target!(|data: &[u8]| { }, }); - assert_eq!(data_block.item_count(), items.len()); + assert_eq!(data_block.len(), items.len()); if data_block.binary_index_len() > 254 { assert!(data_block.hash_bucket_count().is_none()); @@ -114,19 +114,15 @@ fuzz_target!(|data: &[u8]| { ); } - assert_eq!( - items, - data_block.iter().map(|x| x.unwrap()).collect::>(), - ); + assert_eq!(items, data_block.iter().collect::>(),); - assert_eq!( + /* assert_eq!( items.iter().rev().cloned().collect::>(), data_block .iter() .rev() - .map(|x| x.unwrap()) .collect::>(), - ); + ); */ // TODO: add ping-pong iters From b72fb54a336e6dd04dc4ae74c0bd90baefc3bce4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:53:58 +0200 Subject: [PATCH 024/613] remove todo --- src/segment/block/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index ea0580e3..1eff7925 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -68,7 +68,6 @@ impl Block { 
}; let mut bytes = Cursor::new(bytes); - // TODO: 3.0.0 varint? // Read number of items let item_count = bytes.read_u32::()? as usize; From 6cfbf4061681cc6b0f633b5133696fc1d08cf822 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:54:38 +0200 Subject: [PATCH 025/613] finish block hash index --- src/super_segment/hash_index/builder.rs | 82 +++------------ src/super_segment/hash_index/mod.rs | 133 ++++++++++++++++++++++-- src/super_segment/hash_index/reader.rs | 57 ++++++++-- 3 files changed, 187 insertions(+), 85 deletions(-) diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 3d040fb5..4444dfb1 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/hash_index/builder.rs @@ -3,35 +3,36 @@ use byteorder::WriteBytesExt; pub const MAX_POINTERS_FOR_HASH_INDEX: u8 = u8::MAX - 2; +/// Builds a block hash index #[derive(Debug)] pub struct Builder(Vec); impl Builder { + /// Initializes a new builder with the given amount of buckets. pub fn new(bucket_count: u32) -> Self { Self(vec![MARKER_FREE; bucket_count as usize]) } // NOTE: We know the hash index has a bucket count <= u8 #[allow(clippy::cast_possible_truncation)] - /// Returns the number of buckets + /// Returns the number of buckets. pub fn bucket_count(&self) -> u32 { self.0.len() as u32 } + /// Tries to map the given key to the binary index position. pub fn set(&mut self, key: &[u8], binary_index_pos: u8) -> bool { let bucket_pos = calculate_bucket_position(key, self.bucket_count()); - // SAFETY: We used modulo - #[warn(unsafe_code)] + // SAFETY: We use modulo in `calculate_bucket_position` + #[allow(unsafe_code)] let curr_marker = unsafe { *self.0.get_unchecked(bucket_pos) }; match curr_marker { MARKER_CONFLICT => false, MARKER_FREE => { - // NOTE: Free slot - // SAFETY: We previously asserted that the slot exists - #[warn(unsafe_code)] + #[allow(unsafe_code)] unsafe { *self.0.get_unchecked_mut(bucket_pos) = binary_index_pos; } @@ -47,7 +48,7 @@ impl Builder { // NOTE: Mark as conflicted // SAFETY: We previously asserted that the slot exists - #[warn(unsafe_code)] + #[allow(unsafe_code)] unsafe { *self.0.get_unchecked_mut(bucket_pos) = MARKER_CONFLICT; } @@ -57,11 +58,15 @@ impl Builder { } } + /// Consumes the builder, returning its raw bytes. + /// + /// Only used for tests #[cfg(test)] pub fn into_inner(self) -> Vec { self.0 } + /// Appends the raw index bytes to a writer. pub fn write(self, writer: &mut W) -> std::io::Result<()> { for byte in self.0 { writer.write_u8(byte)?; @@ -69,66 +74,3 @@ impl Builder { Ok(()) } } - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn v3_hash_index_simple() { - let mut hash_index = Builder::new(100); - - hash_index.set(b"a", 5); - hash_index.set(b"b", 8); - hash_index.set(b"c", 10); - - // NOTE: Hash index bytes need to be consistent across machines and compilations etc. 
- assert_eq!( - [ - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 10, 254, 254, 254, 8, 254, - 254, 254, 5, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254 - ], - &*hash_index.into_inner() - ); - } - - #[test] - fn v3_hash_index_conflict() { - let mut hash_index = Builder::new(1); - - hash_index.set(b"a", 5); - hash_index.set(b"b", 8); - - // NOTE: Hash index bytes need to be consistent across machines and compilations etc. - assert_eq!([255], &*hash_index.into_inner()); - } - - #[test] - fn v3_hash_index_same_offset() { - let mut hash_index = Builder::new(1); - - hash_index.set(b"a", 5); - hash_index.set(b"b", 5); - - // NOTE: Hash index bytes need to be consistent across machines and compilations etc. - assert_eq!([5], &*hash_index.into_inner()); - } - - #[test] - fn v3_hash_index_mix() { - let mut hash_index = Builder::new(1); - - hash_index.set(b"a", 5); - hash_index.set(b"b", 5); - hash_index.set(b"c", 6); - - // NOTE: Hash index bytes need to be consistent across machines and compilations etc. - assert_eq!([255], &*hash_index.into_inner()); - } -} diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/hash_index/mod.rs index 9226d660..3f394a76 100644 --- a/src/super_segment/hash_index/mod.rs +++ b/src/super_segment/hash_index/mod.rs @@ -1,17 +1,136 @@ +//! The hash index is a lightweight (typically <=1 byte per KV) index +//! embeddeded into a block to speed up point reads +//! +//! The index is initialized with `hash_ratio * item_count` buckets. +//! +//! Each bucket is initialized as 254 (FREE). +//! +//! During block building, each key is hashed into a bucket. +//! If the bucket is FREE, it is set to the index of the binary index pointer +//! pointing to the item's restart interval. +//! +//! If the given bucket is already < FREE, it is set to CONFLICT. +//! +//! During a point read, `CONFLICT`ed buckets are skipped, and the binary index +//! is consulted instead. + mod builder; mod reader; -use xxhash_rust::xxh3::xxh3_64; +pub use builder::{Builder, MAX_POINTERS_FOR_HASH_INDEX}; +pub use reader::{Lookup, Reader}; -const MARKER_FREE: u8 = u8::MAX - 1; -const MARKER_CONFLICT: u8 = u8::MAX; +const MARKER_FREE: u8 = u8::MAX - 1; // 254 +const MARKER_CONFLICT: u8 = u8::MAX; // 255 -// NOTE: We know the hash index has a bucket count <= u8 +// NOTE: We know the hash index has a bucket count <= u32 #[allow(clippy::cast_possible_truncation)] +/// Calculates the bucket index for the given key. fn calculate_bucket_position(key: &[u8], bucket_count: u32) -> usize { - let hash = xxh3_64(key); + use xxhash_rust::xxh3::xxh3_64 as hash; + + let hash = hash(key); (hash % u64::from(bucket_count)) as usize } -pub use builder::{Builder, MAX_POINTERS_FOR_HASH_INDEX}; -pub use reader::Reader; +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn v3_hash_index_build_simple() { + let mut hash_index = Builder::new(100); + + hash_index.set(b"a", 5); + hash_index.set(b"b", 8); + hash_index.set(b"c", 10); + + let bytes = hash_index.into_inner(); + + // NOTE: Hash index bytes need to be consistent across machines and compilations etc. 
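+        // (here: "c" hashes into bucket 11, "b" into bucket 15 and "a" into bucket 19;
+        // every other bucket stays FREE = 254)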
+ assert_eq!( + [ + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 10, 254, 254, 254, 8, 254, + 254, 254, 5, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254 + ], + &*bytes + ); + + let reader = Reader::new(&bytes, 0, 100); + assert_eq!(0, reader.conflict_count()); + + assert_eq!(Lookup::Found(5), reader.get(b"a")); + assert_eq!(Lookup::Found(8), reader.get(b"b")); + assert_eq!(Lookup::Found(10), reader.get(b"c")); + assert_eq!(Lookup::NotFound, reader.get(b"d")); + } + + #[test] + fn v3_hash_index_build_conflict() { + let mut hash_index = Builder::new(1); + + hash_index.set(b"a", 5); + hash_index.set(b"b", 8); + + let bytes = hash_index.into_inner(); + + assert_eq!([255], &*bytes); + + assert_eq!(1, Reader::new(&bytes, 0, 1).conflict_count()); + } + + #[test] + fn v3_hash_index_build_same_offset() { + let mut hash_index = Builder::new(1); + + hash_index.set(b"a", 5); + hash_index.set(b"b", 5); + + let bytes = hash_index.into_inner(); + + assert_eq!([5], &*bytes); + + let reader = Reader::new(&bytes, 0, 1); + assert_eq!(0, reader.conflict_count()); + assert_eq!(Lookup::Found(5), reader.get(b"a")); + assert_eq!(Lookup::Found(5), reader.get(b"b")); + } + + #[test] + fn v3_hash_index_build_mix() { + let mut hash_index = Builder::new(1); + + hash_index.set(b"a", 5); + hash_index.set(b"b", 5); + hash_index.set(b"c", 6); + + let bytes = hash_index.into_inner(); + + assert_eq!([255], &*bytes); + + assert_eq!(1, Reader::new(&bytes, 0, 1).conflict_count()); + } + + #[test] + fn v3_hash_index_read_conflict() { + let mut hash_index = Builder::new(1); + + hash_index.set(b"a", 5); + hash_index.set(b"b", 8); + + let bytes = hash_index.into_inner(); + + let reader = Reader::new(&bytes, 0, 1); + assert_eq!(Lookup::Conflicted, reader.get(b"a")); + assert_eq!(Lookup::Conflicted, reader.get(b"b")); + assert_eq!(Lookup::Conflicted, reader.get(b"c")); + + assert_eq!(1, Reader::new(&bytes, 0, 1).conflict_count()); + } +} diff --git a/src/super_segment/hash_index/reader.rs b/src/super_segment/hash_index/reader.rs index 8ae91d86..0579deb3 100644 --- a/src/super_segment/hash_index/reader.rs +++ b/src/super_segment/hash_index/reader.rs @@ -1,26 +1,67 @@ use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; +/// Hash index lookup result +#[derive(Debug, Eq, PartialEq)] +pub enum Lookup { + /// Key is found, can skip the binary index search - fast path + Found(u8), + + /// Key's bucket was still FREE, so it definitely does not exist + NotFound, + + /// Key is conflicted - we need to look in the binary index instead - slow path + Conflicted, +} + +/// Helper to read from an embedded block hash index pub struct Reader<'a>(&'a [u8]); impl<'a> Reader<'a> { - pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { - Self(&bytes[offset..(offset + len)]) + /// Initializes a new hash index reader. 
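+    ///
+    /// `offset` and `len` are expected to come straight from the block's trailer.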
+ pub fn new(bytes: &'a [u8], offset: u32, len: u32) -> Self { + let offset = offset as usize; + let len = len as usize; + let end = offset + len; + + // NOTE: We consider the caller to be trustworthy + #[warn(clippy::indexing_slicing)] + Self(&bytes[offset..end]) + } + + // NOTE: Not used for performance reasons, so no need to be hyper-optimized + #[allow(clippy::naive_bytecount)] + /// Returns the amount of empty slots in the hash index. + pub fn free_count(&self) -> usize { + self.0.iter().filter(|&&byte| byte == MARKER_FREE).count() + } + + // NOTE: Not used for performance reasons, so no need to be hyper-optimized + #[allow(clippy::naive_bytecount)] + /// Returns the amount of conflict markers in the hash index. + pub fn conflict_count(&self) -> usize { + self.0 + .iter() + .filter(|&&byte| byte == MARKER_CONFLICT) + .count() } - pub fn get(&self, key: &[u8]) -> Option { - // NOTE: We know the hash index has a bucket count <= u8 + /// Returns the binary index position if the key is not conflicted. + pub fn get(&self, key: &[u8]) -> Lookup { + // NOTE: Even with very high hash ratio, there will be nearly enough items to + // cause us to create u32 buckets #[allow(clippy::cast_possible_truncation)] let bucket_count = self.0.len() as u32; let bucket_pos = calculate_bucket_position(key, bucket_count); - // SAFETY: We used modulo - #[warn(unsafe_code)] + // SAFETY: We use modulo in `calculate_bucket_position` + #[allow(unsafe_code)] let marker = unsafe { *self.0.get_unchecked(bucket_pos) }; match marker { - MARKER_CONFLICT | MARKER_FREE => None, - idx => Some(idx), + MARKER_CONFLICT => Lookup::Conflicted, + MARKER_FREE => Lookup::NotFound, + idx => Lookup::Found(idx), } } } From cdc57e66c9c1b8871db0dd7cd4a6582fd62cf157 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:54:55 +0200 Subject: [PATCH 026/613] v3 segment utils --- src/super_segment/mod.rs | 1 + src/super_segment/util.rs | 52 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 src/super_segment/util.rs diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 0703f638..973c656f 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -1,6 +1,7 @@ pub(crate) mod binary_index; pub(crate) mod data_block; pub(crate) mod hash_index; +pub(crate) mod util; pub use data_block::DataBlock; diff --git a/src/super_segment/util.rs b/src/super_segment/util.rs new file mode 100644 index 00000000..d993c032 --- /dev/null +++ b/src/super_segment/util.rs @@ -0,0 +1,52 @@ +use std::cmp::Ordering; + +pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { + s1.iter() + .zip(s2.iter()) + .take_while(|(c1, c2)| c1 == c2) + .count() +} + +// TODO: Fuzz test +pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> Ordering { + if needle.is_empty() { + let combined_len = prefix.len() + suffix.len(); + + return if combined_len > 0 { + Ordering::Greater + } else { + Ordering::Equal + }; + } + + match prefix.len().cmp(&needle.len()) { + Ordering::Equal => match prefix.cmp(needle) { + Ordering::Equal => {} + ordering => return ordering, + }, + Ordering::Greater => { + // SAFETY: We know that the prefix is longer than the needle, so we can safely + // truncate it to the needle's length + #[allow(unsafe_code)] + let prefix = unsafe { prefix.get_unchecked(0..needle.len()) }; + return prefix.cmp(needle); + } + Ordering::Less => { + // SAFETY: We know that the needle is longer than the prefix, so we can safely + // truncate it to the prefix's 
length + #[allow(unsafe_code)] + let needle = unsafe { needle.get_unchecked(0..prefix.len()) }; + + match prefix.cmp(needle) { + Ordering::Equal => {} + ordering => return ordering, + } + } + } + + // SAFETY: We know that the prefix is definitely not longer than the needle + // so we can safely truncate + #[allow(unsafe_code)] + let needle = unsafe { needle.get_unchecked(prefix.len()..) }; + suffix.cmp(needle) +} From 25eb79a4615dcf95b06a56cce9b4f21f098bdf58 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:56:31 +0200 Subject: [PATCH 027/613] generic block encoder --- src/super_segment/block/encoder.rs | 121 +++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/super_segment/block/encoder.rs diff --git a/src/super_segment/block/encoder.rs b/src/super_segment/block/encoder.rs new file mode 100644 index 00000000..a6997fe6 --- /dev/null +++ b/src/super_segment/block/encoder.rs @@ -0,0 +1,121 @@ +use super::super::hash_index::Builder as HashIndexBuilder; +use super::{super::binary_index::Builder as BinaryIndexBuilder, Trailer}; +use crate::super_segment::util::longest_shared_prefix_length; +use std::marker::PhantomData; + +pub trait Encodable { + fn key(&self) -> &[u8]; + + fn encode_full_into( + &self, + writer: &mut W, + state: &mut S, + ) -> crate::Result<()> + where + Self: Sized; + + fn encode_truncated_into( + &self, + writer: &mut W, + state: &mut S, + shared_len: usize, + ) -> crate::Result<()> + where + Self: Sized; +} + +/// Block encoder +pub struct Encoder<'a, S: Default, T: Encodable> { + pub(crate) phantom: PhantomData<(S, T)>, + + pub(crate) writer: Vec, + + pub(crate) state: S, + + pub(crate) item_count: usize, + pub(crate) restart_count: usize, + + pub(crate) restart_interval: u8, + pub(crate) use_prefix_truncation: bool, + + pub(crate) binary_index_builder: BinaryIndexBuilder, + pub(crate) hash_index_builder: HashIndexBuilder, + + base_key: &'a [u8], +} + +// TODO: maybe split into Builder +impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { + pub fn new( + item_count: usize, + restart_interval: u8, + hash_index_ratio: f32, + first_key: &'a [u8], + ) -> Self { + let binary_index_len = item_count / usize::from(restart_interval); + let bucket_count = (item_count as f32 * hash_index_ratio) as u32; // TODO: verify + + Self { + phantom: PhantomData, + + writer: Vec::new(), + + state: S::default(), + + item_count: 0, + restart_count: 0, + + restart_interval, + use_prefix_truncation: true, + + binary_index_builder: BinaryIndexBuilder::new(binary_index_len), + hash_index_builder: HashIndexBuilder::new(bucket_count), + + base_key: first_key, + } + } + + /* /// Toggles prefix truncation. 
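+    /// (currently disabled; the index block encoder in the next patch takes the same flag through its constructor instead)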
+ pub fn use_prefix_truncation(mut self, flag: bool) -> Self { + self.use_prefix_truncation = flag; + self + } */ + + pub fn write(&mut self, item: &'a T) -> crate::Result<()> { + // NOTE: Check if we are a restart marker + if self.item_count % usize::from(self.restart_interval) == 0 { + self.restart_count += 1; + + if self.restart_interval > 0 { + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + self.binary_index_builder.insert(self.writer.len() as u32); + } + + item.encode_full_into(&mut self.writer, &mut self.state)?; + + self.base_key = item.key(); + } else { + // NOTE: We can safely cast to u16, because keys are u16 long max + #[allow(clippy::cast_possible_truncation)] + let shared_prefix_len = longest_shared_prefix_length(self.base_key, item.key()); + + item.encode_truncated_into(&mut self.writer, &mut self.state, shared_prefix_len)?; + } + + if self.hash_index_builder.bucket_count() > 0 { + // NOTE: The max binary index is bound by u8 (technically u8::MAX - 2) + #[allow(clippy::cast_possible_truncation)] + self.hash_index_builder + .set(item.key(), (self.restart_count - 1) as u8); + } + + self.item_count += 1; + + Ok(()) + } + + pub fn finish(self) -> crate::Result> { + Trailer::write(self) + } +} From f73a16764ecc9e42285c4a82e5b0fc8959568c34 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:56:59 +0200 Subject: [PATCH 028/613] wip --- .../{data_block => index_block}/encoder.rs | 83 ++++++++++--------- 1 file changed, 42 insertions(+), 41 deletions(-) rename src/super_segment/{data_block => index_block}/encoder.rs (70%) diff --git a/src/super_segment/data_block/encoder.rs b/src/super_segment/index_block/encoder.rs similarity index 70% rename from src/super_segment/data_block/encoder.rs rename to src/super_segment/index_block/encoder.rs index 673b90a9..4ab3aa14 100644 --- a/src/super_segment/data_block/encoder.rs +++ b/src/super_segment/index_block/encoder.rs @@ -1,23 +1,14 @@ -use super::super::binary_index::Builder as BinaryIndexBuilder; use super::super::hash_index::Builder as HashIndexBuilder; +use super::{super::binary_index::Builder as BinaryIndexBuilder, NewKeyedBlockHandle}; +use crate::super_segment::util::longest_shared_prefix_length; use crate::{ - coding::Encode, super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX, InternalValue, + segment::{block::offset::BlockOffset, trailer::TRAILER_SIZE}, + super_segment::{block::TRAILER_START_MARKER, hash_index::MAX_POINTERS_FOR_HASH_INDEX}, }; use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; use varint_rs::VarintWriter; -pub const TERMINATOR_MARKER: u8 = 255; - -pub const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + (2 * std::mem::size_of::()); - -fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { - s1.iter() - .zip(s2.iter()) - .take_while(|(c1, c2)| c1 == c2) - .count() -} - pub struct Encoder<'a> { writer: Vec, @@ -26,8 +17,11 @@ pub struct Encoder<'a> { restart_interval: u8, + use_prefix_truncation: bool, base_key: &'a [u8], + offset: BlockOffset, + restart_count: usize, item_count: usize, } @@ -37,6 +31,7 @@ impl<'a> Encoder<'a> { item_count: usize, restart_interval: u8, hash_index_ratio: f32, + use_prefix_truncation: bool, first_key: &'a [u8], ) -> Self { let binary_index_len = item_count / usize::from(restart_interval); @@ -50,18 +45,21 @@ impl<'a> Encoder<'a> { restart_interval, + use_prefix_truncation, base_key: first_key, + offset: BlockOffset(0), + restart_count: 0, item_count: 0, } } - pub fn 
write(&mut self, kv: &'a InternalValue) -> crate::Result<()> { + pub fn write(&mut self, handle: &'a NewKeyedBlockHandle) -> crate::Result<()> { // NOTE: Check if we are a restart marker if self.item_count % usize::from(self.restart_interval) == 0 { // We encode restart markers as: - // [value type] [seqno] [user key len] [user key] [value len] [value] + // [offset] [size] [key len] [end key] self.restart_count += 1; @@ -69,52 +67,55 @@ impl<'a> Encoder<'a> { #[allow(clippy::cast_possible_truncation)] self.binary_index_builder.insert(self.writer.len() as u32); - kv.key.encode_into(&mut self.writer)?; + self.writer.write_u64_varint(*handle.offset)?; + self.writer.write_u32_varint(handle.size)?; + self.writer.write_u16_varint(handle.end_key.len() as u16)?; + self.writer.write_all(&handle.end_key)?; - self.base_key = &kv.key.user_key; + self.base_key = &handle.end_key; + self.offset = BlockOffset(*handle.offset + u64::from(handle.size)); } else { - // We encode truncated values as: - // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] + // We encode truncated handles as: + // [size] [shared prefix len] [rest key len] [rest key] - self.writer.write_u8(u8::from(kv.key.value_type))?; + self.writer.write_u32_varint(handle.size)?; - self.writer.write_u64_varint(kv.key.seqno)?; + let shared_prefix_len = if self.use_prefix_truncation { + // NOTE: We can safely cast to u16, because keys are u16 long max + #[allow(clippy::cast_possible_truncation)] + let shared_prefix_len = + longest_shared_prefix_length(self.base_key, &handle.end_key) as u16; - // NOTE: We can safely cast to u16, because keys are u16 long max - #[allow(clippy::cast_possible_truncation)] - let shared_prefix_len = - longest_shared_prefix_length(self.base_key, &kv.key.user_key) as u16; + shared_prefix_len + } else { + self.writer.write_u8(0)?; + 0 + }; + // TODO: maybe we can skip this varint altogether if prefix truncation = false self.writer.write_u16_varint(shared_prefix_len)?; // NOTE: We can safely cast to u16, because keys are u16 long max #[allow(clippy::cast_possible_truncation)] - let rest_len = kv.key.user_key.len() as u16 - shared_prefix_len; + let rest_len = handle.end_key.len() as u16 - shared_prefix_len; self.writer.write_u16_varint(rest_len)?; - let truncated_user_key = &kv - .key - .user_key + let truncated_user_key = handle + .end_key .get(shared_prefix_len as usize..) .expect("should be in bounds"); self.writer.write_all(truncated_user_key)?; + + self.offset += u64::from(handle.size); } if self.hash_index_builder.bucket_count() > 0 { // NOTE: The max binary index is bound by u8 (technically u8::MAX - 2) #[allow(clippy::cast_possible_truncation)] self.hash_index_builder - .set(&kv.key.user_key, (self.restart_count - 1) as u8); - } - - // NOTE: Only write value len + value if we are actually a value - if !kv.is_tombstone() { - // NOTE: We know values are limited to 32-bit length - #[allow(clippy::cast_possible_truncation)] - self.writer.write_u32_varint(kv.value.len() as u32)?; - self.writer.write_all(&kv.value)?; + .set(&handle.end_key, (self.restart_count - 1) as u8); } self.item_count += 1; @@ -122,10 +123,11 @@ impl<'a> Encoder<'a> { Ok(()) } - // TODO: maybe change the order of trailer items a bit so we can get to the binary index first + // TODO: trailer of data block and index block are the same... 
consolidate into some + // kind of TrailerWriter or whatever pub fn finish(mut self) -> crate::Result> { // IMPORTANT: Terminator marker - self.writer.write_u8(TERMINATOR_MARKER)?; + self.writer.write_u8(TRAILER_START_MARKER)?; // TODO: version u8? -> add to segment metadata instead @@ -140,7 +142,6 @@ impl<'a> Encoder<'a> { let mut hash_index_offset = 0u32; let hash_index_len = self.hash_index_builder.bucket_count(); - // TODO: unit test when binary index is too long // NOTE: We can only use a hash index when there are 254 buckets or less // Because 254 and 255 are reserved marker values // From 156ae564fd806cb0b0b3e9c72bfbb1a13af4f701 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:57:29 +0200 Subject: [PATCH 029/613] v3 block trailer & header --- src/super_segment/block/header.rs | 14 +++ src/super_segment/block/mod.rs | 26 ++++++ src/super_segment/block/trailer.rs | 142 +++++++++++++++++++++++++++++ src/super_segment/mod.rs | 48 +--------- 4 files changed, 186 insertions(+), 44 deletions(-) create mode 100644 src/super_segment/block/header.rs create mode 100644 src/super_segment/block/mod.rs create mode 100644 src/super_segment/block/trailer.rs diff --git a/src/super_segment/block/header.rs b/src/super_segment/block/header.rs new file mode 100644 index 00000000..f2cbfb5c --- /dev/null +++ b/src/super_segment/block/header.rs @@ -0,0 +1,14 @@ +use crate::{segment::block::offset::BlockOffset, Checksum}; + +/// Header of a disk-based block +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Header { + /// Checksum value to verify integrity of data + pub checksum: Checksum, + + /// File offset of previous block - only used for data blocks + pub previous_block_offset: BlockOffset, + + /// On-disk size of data segment + pub data_length: u32, +} diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs new file mode 100644 index 00000000..9b117f6d --- /dev/null +++ b/src/super_segment/block/mod.rs @@ -0,0 +1,26 @@ +mod encoder; +mod header; +mod trailer; + +pub use encoder::{Encodable, Encoder}; +pub use header::Header; +pub use trailer::{Trailer, TRAILER_START_MARKER}; + +use crate::Slice; + +/// A block on disk. +/// +/// Consists of a header and some bytes (the data/payload) +#[derive(Clone)] +pub struct Block { + pub header: Header, + pub data: Slice, +} + +impl Block { + /// Returns the uncompressed block size in bytes + #[must_use] + pub fn size(&self) -> usize { + self.data.len() + } +} diff --git a/src/super_segment/block/trailer.rs b/src/super_segment/block/trailer.rs new file mode 100644 index 00000000..094d9581 --- /dev/null +++ b/src/super_segment/block/trailer.rs @@ -0,0 +1,142 @@ +use crate::super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX; + +use super::{ + encoder::{Encodable, Encoder}, + Block, +}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; + +pub const TRAILER_START_MARKER: u8 = 255; + +const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + (2 * std::mem::size_of::()); + +/// Block trailer +/// +/// ## Format +/// +/// \[item_count\] \[restart_interval\] \[binary_index_offset\] \[binary_index_len\] \[hash_index_offset\] \[hash_index_len\] +#[allow(clippy::doc_markdown)] +pub struct Trailer<'a> { + block: &'a Block, +} + +impl<'a> Trailer<'a> { + pub fn new(block: &'a Block) -> Self { + Self { block } + } + + /// Returns the trailer position. 
+ pub fn trailer_offset(&self) -> usize { + self.block.data.len() - TRAILER_SIZE + } + + /// Returns the amount of items in the block + #[must_use] + pub fn item_count(&self) -> usize { + let mut reader = self.as_slice(); + + // NOTE: We know the trailer offset is valid, and the trailer has a fixed size + // so the next item must be the item count + #[allow(clippy::expect_used)] + { + reader + .read_u32::() + .expect("should read item count") as usize + } + } + + pub fn as_slice(&self) -> &[u8] { + let start = self.trailer_offset(); + + // SAFETY: We know that a block always has a trailer, so the + // `block_size - TRAILER_SIZE` cannot go out of bounds + #[allow(unsafe_code)] + unsafe { + self.block.data.get_unchecked(start..) + } + } + + pub fn write>( + mut encoder: Encoder<'_, S, T>, + ) -> crate::Result> { + // IMPORTANT: Terminator marker + encoder.writer.write_u8(TRAILER_START_MARKER)?; + + // TODO: version u8? -> add to segment metadata instead + + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + let binary_index_offset = encoder.writer.len() as u32; + + // Write binary index + let (binary_index_step_size, binary_index_len) = + encoder.binary_index_builder.write(&mut encoder.writer)?; + + let mut hash_index_offset = 0u32; + let hash_index_len = encoder.hash_index_builder.bucket_count(); + + // NOTE: We can only use a hash index when there are 254 buckets or less + // Because 254 and 255 are reserved marker values + // + // With the default restart interval of 16, that still gives us support + // for up to ~4000 KVs + if encoder.hash_index_builder.bucket_count() > 0 + && binary_index_len <= MAX_POINTERS_FOR_HASH_INDEX.into() + { + // NOTE: We know that data blocks will never even approach 4 GB in size + #[allow(clippy::cast_possible_truncation)] + { + hash_index_offset = encoder.writer.len() as u32; + } + + // Write hash index + encoder.hash_index_builder.write(&mut encoder.writer)?; + } + + // Write trailer + + #[cfg(debug_assertions)] + let bytes_before = encoder.writer.len(); + + // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either + #[allow(clippy::cast_possible_truncation)] + encoder + .writer + .write_u32::(encoder.item_count as u32)?; + + encoder.writer.write_u8(encoder.restart_interval)?; + + encoder.writer.write_u8(binary_index_step_size)?; + + encoder + .writer + .write_u32::(binary_index_offset)?; + + // NOTE: Even with a dense index, there can't be more index pointers than items + #[allow(clippy::cast_possible_truncation)] + encoder + .writer + .write_u32::(binary_index_len as u32)?; + + encoder + .writer + .write_u32::(hash_index_offset)?; + + encoder + .writer + .write_u32::(if hash_index_offset > 0 { + hash_index_len + } else { + 0 + })?; + + #[cfg(debug_assertions)] + assert_eq!( + TRAILER_SIZE, + encoder.writer.len() - bytes_before, + "trailer size does not match", + ); + + Ok(encoder.writer) + } +} diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 973c656f..1697f904 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -1,50 +1,10 @@ pub(crate) mod binary_index; +mod block; pub(crate) mod data_block; pub(crate) mod hash_index; +mod index_block; pub(crate) mod util; +pub use block::{Block, Header}; pub use data_block::DataBlock; - -use crate::{segment::block::header::Header, Slice}; - -/// A block on disk. 
-/// -/// Consists of a header and some bytes (the data/payload) -#[derive(Clone)] -pub struct Block { - pub header: Header, - pub data: Slice, -} - -impl Block { - /// Returns the uncompressed block size in bytes - #[must_use] - pub fn size(&self) -> usize { - self.data.len() - } -} - -/* impl Decode for Block { - fn decode_from(reader: &mut R) -> Result - where - Self: Sized, - { - let header = Header::decode_from(reader)?; - let data = Slice::from_reader(reader, header.data_length as usize)?; - let data = match header.compression { - CompressionType::None => data, - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => lz4_flex::decompress_size_prepended(&data) - .map(Into::into) - .map_err(|_| crate::Error::Decompress(header.compression))?, - - #[cfg(feature = "miniz")] - CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(&data) - .map(Into::into) - .map_err(|_| crate::Error::Decompress(header.compression))?, - }; - - Ok(Self { header, data }) - } -} */ +pub use index_block::IndexBlock; From c9158f89b3b60e64c24c1523c4fb9f375f35759b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:57:48 +0200 Subject: [PATCH 030/613] wip --- src/super_segment/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 1697f904..1677feae 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -2,9 +2,9 @@ pub(crate) mod binary_index; mod block; pub(crate) mod data_block; pub(crate) mod hash_index; -mod index_block; +// mod index_block; pub(crate) mod util; pub use block::{Block, Header}; pub use data_block::DataBlock; -pub use index_block::IndexBlock; +// pub use index_block::IndexBlock; From b242df235ca1742828d0c96b42c202f96c13e869 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:58:03 +0200 Subject: [PATCH 031/613] refactor block binary index --- src/super_segment/binary_index/builder.rs | 6 +++--- src/super_segment/binary_index/reader.rs | 23 ++++++++++++++++------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/super_segment/binary_index/builder.rs b/src/super_segment/binary_index/builder.rs index 3212d73d..d27e920e 100644 --- a/src/super_segment/binary_index/builder.rs +++ b/src/super_segment/binary_index/builder.rs @@ -12,7 +12,7 @@ impl Builder { self.0.push(pos); } - pub fn write(self, writer: &mut W) -> crate::Result<(u8, usize)> { + pub fn write(&self, writer: &mut W) -> crate::Result<(u8, usize)> { // NOTE: We check if the pointers may fit in 16-bits // If so, we halve the index size by storing u16 instead of u32 let step_size = { @@ -27,13 +27,13 @@ impl Builder { if step_size == 2 { // Write u16 index - for offset in self.0 { + for &offset in &self.0 { let offset = offset as u16; writer.write_u16::(offset)?; } } else { // Write u32 index - for offset in self.0 { + for &offset in &self.0 { writer.write_u32::(offset)?; } } diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index 7c557431..9d8fc81e 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -1,12 +1,24 @@ use byteorder::{LittleEndian, ReadBytesExt}; +macro_rules! 
unwrappy { + ($x:expr) => { + // $x.expect("should read") + + unsafe { $x.unwrap_unchecked() } + }; +} + pub struct Reader<'a> { bytes: &'a [u8], step_size: usize, } impl<'a> Reader<'a> { - pub fn new(bytes: &'a [u8], offset: usize, len: usize, step_size: usize) -> Self { + pub(crate) fn new(bytes: &'a [u8], offset: u32, len: u32, step_size: u8) -> Self { + let offset = offset as usize; + let len = len as usize; + let step_size = step_size as usize; + Self { bytes: &bytes[offset..(offset + len * step_size)], step_size, @@ -20,15 +32,12 @@ impl<'a> Reader<'a> { pub(crate) fn get(&self, idx: usize) -> usize { let offset = idx * self.step_size; - let mut bytes = self.bytes.get(offset..).expect("should be in array"); + let mut bytes = &self.bytes[offset..]; if self.step_size == 2 { - bytes - .read_u16::() - .expect("should read") - .into() + unwrappy!(bytes.read_u16::()).into() } else { - bytes.read_u32::().expect("should read") as usize + unwrappy!(bytes.read_u32::()) as usize } } } From cc2693559ab4a87437fd33b03fd4cef2af886b9e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 18:59:12 +0200 Subject: [PATCH 032/613] data bl,ock finished (for now)? --- src/super_segment/data_block/iter.rs | 339 ++++++-- src/super_segment/data_block/mod.rs | 1190 +++++++++++++++++--------- 2 files changed, 1033 insertions(+), 496 deletions(-) diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index 41139cc8..9a04d59d 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -1,143 +1,292 @@ -use super::{encoder::TRAILER_SIZE, DataBlock}; -use crate::{ - coding::DecodeError, super_segment::data_block::encoder::TERMINATOR_MARKER, InternalValue, - Slice, ValueType, -}; -use byteorder::{LittleEndian, ReadBytesExt}; -use std::io::{Cursor, Seek}; -use varint_rs::VarintReader; +use super::DataBlock; +use crate::{key::InternalKey, InternalValue, SeqNo, Slice}; +use std::io::Cursor; /// Double-ended iterator over data blocks -pub struct Iter { - block: DataBlock, +pub struct Iter<'a> { + block: &'a DataBlock, cursor: usize, - idx: usize, + remaining_in_interval: usize, restart_interval: usize, - base_key: Option, + lo_watermark: usize, + + // base_key: Option<&'a [u8]>, + base_key_offset: Option, + + hi_ptr_idx: usize, + hi_stack: Vec, + // TODO: refactor into two members: LoScanner and HiScanner +} + +/// [start, end] slice indexes +#[derive(Debug)] +pub struct ParsedSlice(pub usize, pub usize); + +#[derive(Debug)] +pub struct ParsedItem { + pub value_type: u8, + pub seqno: SeqNo, + pub prefix: Option, + pub key: ParsedSlice, + pub value: Option, } -impl Iter { - pub fn new(block: DataBlock) -> Self { - let bytes = &block.inner.data; - let mut reader = &bytes[bytes.len() - TRAILER_SIZE..]; +impl ParsedItem { + pub fn materialize(&self, bytes: &Slice) -> InternalValue { + let key = if let Some(prefix) = &self.prefix { + let prefix_key = &bytes[prefix.0..prefix.1]; + let rest_key = &bytes[self.key.0..self.key.1]; + Slice::fused(prefix_key, rest_key) + } else { + bytes.slice(self.key.0..self.key.1) + }; + let key = InternalKey::new( + key, + self.seqno, + self.value_type.try_into().expect("should work"), + ); + + let value = if let Some(value) = &self.value { + bytes.slice(value.0..value.1) + } else { + Slice::empty() + }; - let _item_count = reader.read_u32::().expect("should read") as usize; - let restart_interval = reader.read_u8().expect("should read") as usize; + InternalValue { key, value } + } +} + +impl<'a> Iter<'a> { + pub 
fn new(block: &'a DataBlock) -> Self { + let restart_interval = block.restart_interval.into(); + let binary_index_len = block.binary_index_len as usize; Self { block, + cursor: 0, - idx: 0, + remaining_in_interval: 0, restart_interval, - base_key: None, + lo_watermark: 0, + + // base_key: None, // TODO: remove + base_key_offset: None, + + hi_ptr_idx: binary_index_len, + hi_stack: Vec::new(), } } -} -impl Iterator for Iter { - type Item = crate::Result; + pub fn with_offset(mut self, offset: usize) -> Self { + self.lo_watermark = offset; + self + } - fn next(&mut self) -> Option { - let is_restart = (self.idx % self.restart_interval) == 0; + // TODO: refactor together with deserialize and point_read + // skip should return the basic info, and rename to deserialize + // rename deserialize to materialize by using the return type of deserialize + /* fn skip_restart_item(&mut self) -> crate::Result { + let bytes = &self.block.inner.data; + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(self.cursor..) }); + let parsed = DataBlock::parse_restart_head(&mut reader)?; + + if parsed.value_type == TRAILER_START_MARKER { + return Ok(false); + } + + let value_type: ValueType = parsed + .value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))?; + + let key_start = self.cursor + parsed.key_start; + let key_end = key_start + parsed.key_len; + let key = bytes.slice(key_start..key_end); + + let val_len: usize = if value_type == ValueType::Value { + reader.read_u32_varint()? as usize + } else { + 0 + }; + reader.seek_relative(val_len as i64)?; + + self.cursor += reader.position() as usize; + self.base_key = Some(key); + + Ok(true) + } */ + + // TODO: refactor together with deserialize and point_read + // skip should return the basic info, and rename to deserialize + // rename deserialize to materialize by using the return type of deserialize + /* fn skip_truncated_item(&mut self) -> crate::Result { let bytes = &self.block.inner.data; - let mut cursor = Cursor::new(&bytes[self.cursor..]); - if is_restart { - let parsed = fail_iter!(DataBlock::parse_restart_item(&mut cursor)); + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(self.cursor..) 
}); - if parsed.value_type == TERMINATOR_MARKER { - return None; - } + let value_type = reader.read_u8()?; - let value_type: ValueType = fail_iter!(parsed - .value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))); + if value_type == TRAILER_START_MARKER { + return Ok(false); + } - let seqno = parsed.seqno; + let value_type: ValueType = value_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - let key_start = self.cursor + parsed.key_start; - let key_end = key_start + parsed.key_len; - let key = bytes.slice(key_start..key_end); + let _seqno = reader.read_u64_varint()?; - let val_len: usize = if value_type == ValueType::Value { - fail_iter!(cursor.read_u32_varint()) as usize - } else { - 0 - }; - let val_offset = self.cursor + cursor.position() as usize; - fail_iter!(cursor.seek_relative(val_len as i64)); - - self.cursor += cursor.position() as usize; - self.idx += 1; - self.base_key = Some(key.clone()); - - Some(Ok(if value_type == ValueType::Value { - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) - } else { - InternalValue::from_components(key, b"", seqno, value_type) - })) + let _shared_prefix_len: usize = reader.read_u16_varint()?.into(); + let rest_key_len: usize = reader.read_u16_varint()?.into(); + + reader.seek_relative(rest_key_len as i64)?; + + let val_len: usize = if value_type == ValueType::Value { + reader.read_u32_varint()? as usize } else { - let value_type = fail_iter!(cursor.read_u8()); + 0 + }; + reader.seek_relative(val_len as i64)?; - if value_type == TERMINATOR_MARKER { - return None; - } + self.cursor += reader.position() as usize; - let value_type: ValueType = fail_iter!(value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))); + Ok(true) + } */ - let seqno = fail_iter!(cursor.read_u64_varint()); + fn parse_restart_item(&mut self, offset: usize) -> Option { + let bytes = &self.block.inner.data; - let shared_prefix_len: usize = fail_iter!(cursor.read_u16_varint()).into(); - let rest_key_len: usize = fail_iter!(cursor.read_u16_varint()).into(); + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) 
}); - let key_offset = self.cursor + cursor.position() as usize; + let Some(item) = DataBlock::parse_restart_item(&mut reader, offset) else { + return None; + }; - // SAFETY: We always start with a restart item, so the base key is always set to Some(_) - #[warn(unsafe_code)] - let base_key = unsafe { self.base_key.as_ref().unwrap_unchecked() }; + self.cursor += reader.position() as usize; + self.base_key_offset = Some(item.key.0); - let prefix_part = &base_key[..shared_prefix_len]; - let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; - fail_iter!(cursor.seek_relative(rest_key_len as i64)); + Some(item) + } - let val_len: usize = if value_type == ValueType::Value { - fail_iter!(cursor.read_u32_varint()) as usize - } else { - 0 - }; - let val_offset = self.cursor + cursor.position() as usize; - fail_iter!(cursor.seek_relative(val_len as i64)); + fn parse_truncated_item(&mut self, offset: usize) -> Option { + let bytes = &self.block.inner.data; - let key = if shared_prefix_len == 0 { - bytes.slice(key_offset..(key_offset + rest_key_len)) - } else { - // Stitch key - Slice::fused(prefix_part, rest_key) - }; + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) }); + + let Some(item) = DataBlock::parse_truncated_item( + &mut reader, + offset, + self.base_key_offset.expect("should exist"), + ) else { + return None; + }; + + self.cursor += reader.position() as usize; + + Some(item) + } + + /* fn consume_stack_top(&mut self) -> crate::Result> { + if let Some(offset) = self.hi_stack.pop() { + if self.lo_watermark > 0 && offset <= self.lo_watermark { + return Ok(None); + } + + self.cursor = offset; + + // TODO: pop from stack, check if offset < self.cursor, then also make sure to terminate forwards iteration + // TODO: probably need a lo_cursor - self.cursor += cursor.position() as usize; - self.idx += 1; + let is_restart = self.hi_stack.is_empty(); - Some(Ok(if value_type == ValueType::Value { - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) + if is_restart { + self.deserialize_restart_item() } else { - InternalValue::new_tombstone(key, seqno) - })) + self.deserialize_truncated_item() + } + } else { + Ok(None) } + } */ +} + +impl Iterator for Iter<'_> { + type Item = ParsedItem; + + fn next(&mut self) -> Option { + let is_restart = self.remaining_in_interval == 0; + + self.cursor = self.lo_watermark; + + let item = if is_restart { + self.remaining_in_interval = self.restart_interval; + self.parse_restart_item(self.lo_watermark) + } else { + self.parse_truncated_item(self.lo_watermark) + }; + + self.lo_watermark = self.cursor; + self.remaining_in_interval -= 1; + + item } } -impl DoubleEndedIterator for Iter { +impl DoubleEndedIterator for Iter<'_> { fn next_back(&mut self) -> Option { todo!() + /* if let Some(top) = fail_iter!(self.consume_stack_top()) { + return Some(Ok(top)); + } + + self.hi_ptr_idx = self.hi_ptr_idx.wrapping_sub(1); + + // NOTE: If we wrapped, we are at the end + // This is safe to do, because there cannot be that many restart intervals + if self.hi_ptr_idx == usize::MAX { + return None; + } + + let binary_index = self.block.get_binary_index_reader(); + + { + let offset = binary_index.get(self.hi_ptr_idx); + self.cursor = offset; + + if fail_iter!(self.skip_restart_item()) { + self.hi_stack.push(offset); + } + } + + 
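    (Sketch of the strategy this commented-out code is converging on: entries
    inside a restart interval are only forward-decodable, so a reverse pass
    decodes one interval front-to-back and then drains it from a stack. A
    reduced model, with plain u64s standing in for parsed items and names that
    are illustrative, not this crate's API:

        struct RevScanner {
            intervals: Vec<Vec<u64>>, // decoded restart intervals, in block order
            stack: Vec<u64>,          // current interval; pop() yields it reversed
        }

        impl Iterator for RevScanner {
            type Item = u64;

            fn next(&mut self) -> Option<u64> {
                loop {
                    if let Some(item) = self.stack.pop() {
                        return Some(item);
                    }
                    // Stack drained: decode the next interval from the back.
                    self.stack = self.intervals.pop()?;
                }
            }
        }
    )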
for _ in 1..self.restart_interval { + let cursor = self.cursor; + + if fail_iter!(self.skip_truncated_item()) { + self.hi_stack.push(cursor); + } + } + + if self.hi_stack.is_empty() { + return None; + } + + self.consume_stack_top().transpose() */ } } diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index cda81672..f85f3659 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -1,85 +1,161 @@ -mod encoder; mod iter; +pub use iter::Iter; + +use super::block::Trailer; +use super::block::{Encodable, Encoder}; use super::hash_index::Reader as HashIndexReader; use super::{binary_index::Reader as BinaryIndexReader, Block}; -use crate::{coding::DecodeError, InternalValue, SeqNo, Slice, ValueType}; +use crate::super_segment::block::TRAILER_START_MARKER; +use crate::super_segment::util::compare_prefixed_slice; +use crate::{InternalValue, SeqNo, ValueType}; +use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; -use encoder::{TERMINATOR_MARKER, TRAILER_SIZE}; -use std::cmp::Ordering; -use std::{ - cmp::Reverse, - io::{Cursor, Seek}, -}; -use varint_rs::VarintReader; - -pub use encoder::Encoder; -pub use iter::Iter; - -type DataBlockEncoder<'a> = Encoder<'a>; +use iter::{ParsedItem, ParsedSlice}; +use std::io::Seek; +use std::{cmp::Reverse, io::Cursor}; +use varint_rs::{VarintReader, VarintWriter}; -// TODO: Fuzz test -fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> Ordering { - if needle.is_empty() { - let combined_len = prefix.len() + suffix.len(); +impl Encodable<()> for InternalValue { + fn encode_full_into( + &self, + writer: &mut W, + _state: &mut (), + ) -> crate::Result<()> { + // We encode restart markers as: + // [value type] [seqno] [user key len] [user key] [value len] [value] + // 1 2 3 4 5? 6? 
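        // Worked example (assuming the varints below each fit in one byte):
        // key "kiwi" @ seqno 9 with value "x" and the Value type tag encodes
        // as [tag] [0x09] [0x04] ['k' 'i' 'w' 'i'] [0x01] ['x'] = 9 bytes;
        // a tombstone stops after the key, omitting fields 5 and 6.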
+ + writer.write_u8(u8::from(self.key.value_type))?; // 1 + writer.write_u64_varint(self.key.seqno)?; // 2 + + // NOTE: Truncation is okay and actually needed + #[allow(clippy::cast_possible_truncation)] + writer.write_u16_varint(self.key.user_key.len() as u16)?; // 3 + writer.write_all(&self.key.user_key)?; // 4 + + // NOTE: Only write value len + value if we are actually a value + if !self.is_tombstone() { + // NOTE: We know values are limited to 32-bit length + #[allow(clippy::cast_possible_truncation)] + writer.write_u32_varint(self.value.len() as u32)?; // 5 + writer.write_all(&self.value)?; // 6 + } - return if combined_len > 0 { - Ordering::Greater - } else { - Ordering::Equal - }; + Ok(()) } - match prefix.len().cmp(&needle.len()) { - Ordering::Equal => match prefix.cmp(needle) { - Ordering::Equal => {} - ordering => return ordering, - }, - Ordering::Greater => { - // SAFETY: We know that the prefix is longer than the needle, so we can safely - // truncate it to the needle's length - #[allow(unsafe_code)] - let prefix = unsafe { prefix.get_unchecked(0..needle.len()) }; - return prefix.cmp(needle); - } - Ordering::Less => { - // SAFETY: We know that the needle is longer than the prefix, so we can safely - // truncate it to the prefix's length - #[allow(unsafe_code)] - let needle = unsafe { needle.get_unchecked(0..prefix.len()) }; - - match prefix.cmp(needle) { - Ordering::Equal => {} - ordering => return ordering, - } + fn encode_truncated_into( + &self, + writer: &mut W, + _state: &mut (), + shared_len: usize, + ) -> crate::Result<()> { + // We encode truncated values as: + // [value type] [seqno] [shared prefix len] [rest key len] [rest key] [value len] [value] + // 1 2 3 4 5 6? 7? + + writer.write_u8(u8::from(self.key.value_type))?; // 1 + writer.write_u64_varint(self.key.seqno)?; // 2 + + // TODO: maybe we can skip this varint altogether if prefix truncation = false + writer.write_u16_varint(shared_len as u16)?; // 3 + + let rest_len = self.key().len() - shared_len; + writer.write_u16_varint(rest_len as u16)?; // 4 + + let truncated_user_key = self + .key + .user_key + .get(shared_len..) + .expect("should be in bounds"); + + writer.write_all(truncated_user_key)?; // 5 + + // NOTE: Only write value len + value if we are actually a value + if !self.is_tombstone() { + // NOTE: We know values are limited to 32-bit length + #[allow(clippy::cast_possible_truncation)] + writer.write_u32_varint(self.value.len() as u32)?; // 6 + writer.write_all(&self.value)?; // 7 } + + Ok(()) } - // SAFETY: We know that the prefix is definitely not longer than the needle - // so we can safely truncate - #[allow(unsafe_code)] - let needle = unsafe { needle.get_unchecked(prefix.len()..) }; - suffix.cmp(needle) + fn key(&self) -> &[u8] { + &self.key.user_key + } +} + +// TODO: allow disabling binary index (for meta block) +// -> saves space in metadata blocks +// -> point reads then need to use iter().find() to find stuff (which is fine) + +macro_rules! 
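// Worked example for `encode_truncated_into` above: with restart key
// "pla:saturn:fact", the next key "pla:saturn:name" shares 11 bytes, so the
// entry stores shared_len = 11, rest_len = 4 and only the bytes "name" --
// 4 key bytes on disk instead of 15, at the cost of two small varints.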
unwrappy { + ($x:expr) => { + // $x.expect("should read") + + unsafe { $x.unwrap_unchecked() } + }; } /// Block that contains key-value pairs (user data) #[derive(Clone)] pub struct DataBlock { - pub inner: Block, -} + inner: Block, + + // Cached metadata + restart_interval: u8, + + binary_index_step_size: u8, + binary_index_offset: u32, + binary_index_len: u32, -struct RestartHead { - value_type: u8, - seqno: SeqNo, - key_start: usize, - key_len: usize, + hash_index_offset: u32, + hash_index_len: u32, } impl DataBlock { #[must_use] pub fn new(inner: Block) -> Self { - let bytes = &inner.data; - Self { inner } + let trailer = Trailer::new(&inner); + let mut reader = trailer.as_slice(); + + let _item_count = reader.read_u32::().expect("should read"); + + let restart_interval = unwrappy!(reader.read_u8()); + + let binary_index_step_size = unwrappy!(reader.read_u8()); + let binary_index_offset = unwrappy!(reader.read_u32::()); + let binary_index_len = unwrappy!(reader.read_u32::()); + + let hash_index_offset = unwrappy!(reader.read_u32::()); + let hash_index_len = unwrappy!(reader.read_u32::()); + + debug_assert!( + binary_index_step_size == 2 || binary_index_step_size == 4, + "invalid binary index step size", + ); + + Self { + inner, + + restart_interval, + + binary_index_step_size, + binary_index_offset, + binary_index_len, + + hash_index_offset, + hash_index_len, + } + } + + /// Access the inner raw bytes + #[must_use] + fn bytes(&self) -> &[u8] { + &self.inner.data } /// Returns the uncompressed block size in bytes. @@ -89,32 +165,53 @@ impl DataBlock { } #[must_use] - pub fn iter(&self) -> Iter { - Iter::new(self.clone()) + #[allow(clippy::iter_without_into_iter)] + pub fn iter(&self) -> impl DoubleEndedIterator + '_ { + Iter::new(self).map(|kv| kv.materialize(&self.inner.data)) } - fn get_key_at(&self, pos: usize) -> crate::Result<(&[u8], Reverse)> { - // eprintln!("probe {pos}"); + /* pub fn range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( + &'a self, + range: &'a R, + ) -> impl DoubleEndedIterator> + 'a { + let offset = 0; // TODO: range & seek to range start using binary index/hash index (first matching restart interval) + // TODO: and if range end, seek to range end as well (last matching restart interval) + ClippingIter::new(self.iter().with_offset(offset), range) + } */ + + fn get_key_at(&self, pos: usize) -> crate::Result<(&[u8], Reverse)> { let bytes = &self.inner.data; - let mut cursor = Cursor::new(&bytes[pos..]); - let parsed = Self::parse_restart_item(&mut cursor)?; - let key_start = pos + parsed.key_start; - let key_end = key_start + parsed.key_len; - let key = &bytes[key_start..key_end]; + // NOTE: Skip value type + let pos = pos + std::mem::size_of::(); + + // SAFETY: pos is always retrieved from the binary index, + // which we consider to be trustworthy + #[warn(unsafe_code)] + let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) 
}); + + let seqno = unwrappy!(cursor.read_u64_varint()); + let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + + let key_start = pos + cursor.position() as usize; + let key_end = key_start + key_len; - Ok((key, Reverse(parsed.seqno))) + #[warn(unsafe_code)] + let key = bytes.get(key_start..key_end).expect("should read"); + + Ok((key, Reverse(seqno))) } - fn parse_restart_item(cursor: &mut Cursor<&[u8]>) -> crate::Result { - let value_type = cursor.read_u8()?; + /* fn parse_restart_head(cursor: &mut Cursor<&[u8]>) -> crate::Result { + let value_type = unwrappy!(cursor.read_u8()); - let seqno = cursor.read_u64_varint()?; + let seqno = unwrappy!(cursor.read_u64_varint()); - let key_len: usize = cursor.read_u16_varint()?.into(); + let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); let key_start = cursor.position() as usize; - cursor.seek_relative(key_len as i64)?; + + unwrappy!(cursor.seek_relative(key_len as i64)); Ok(RestartHead { value_type, @@ -122,292 +219,376 @@ impl DataBlock { key_start, key_len, }) - } - - fn walk( - &self, - needle: &[u8], - seqno_watermark: Option, - pos: usize, - restart_interval: usize, - ) -> crate::Result> { - use std::cmp::Ordering::{Equal, Greater, Less}; - - let bytes = &self.inner.data; - let mut cursor = Cursor::new(&bytes[pos..]); + } */ - let mut base_key_pos = 0; + /// Returns the binary index length (number of pointers). + /// + /// The number of pointers is equal to the number of restart intervals. + #[must_use] + pub fn binary_index_len(&self) -> u32 { + self.binary_index_len + } - // NOTE: Check the full item - let base_key = { - let parsed = Self::parse_restart_item(&mut cursor)?; + /// Returns the binary index offset. + #[must_use] + fn binary_index_offset(&self) -> u32 { + self.binary_index_offset + } - let value_type: ValueType = parsed - .value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))?; + /// Returns the binary index step size. + /// + /// The binary index can either store u16 or u32 pointers, + /// depending on the size of the data block. + /// + /// Typically blocks are < 64K, so u16 pointers reduce the index + /// size by half. + #[must_use] + fn binary_index_step_size(&self) -> u8 { + self.binary_index_step_size + } - let seqno = parsed.seqno; + /// Returns the hash index offset. + /// + /// If 0, the hash index does not exist. + #[must_use] + fn hash_index_offset(&self) -> u32 { + self.hash_index_offset + } - let key_start = pos + parsed.key_start; - let key_end = key_start + parsed.key_len; - let key = &bytes[key_start..key_end]; + /// Returns the number of hash buckets. + #[must_use] + pub fn hash_bucket_count(&self) -> Option { + if self.hash_index_offset() > 0 { + Some(self.hash_index_len) + } else { + None + } + } - base_key_pos = key_start; + fn get_binary_index_reader(&self) -> BinaryIndexReader { + BinaryIndexReader::new( + self.bytes(), + self.binary_index_offset(), + self.binary_index_len(), + self.binary_index_step_size(), + ) + } - let val_len: usize = if value_type == ValueType::Value { - cursor.read_u32_varint().expect("should read") as usize - } else { - 0 - }; + fn get_hash_index_reader(&self) -> Option { + self.hash_bucket_count() + .map(|offset| HashIndexReader::new(&self.inner.data, self.hash_index_offset, offset)) + } - match key.cmp(needle) { - Equal => { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); + /// Returns the amount of conflicts in the hash buckets. 
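    // How a point read uses the two indexes above (the dispatch itself is in
    // `point_read` further down):
    //   Found(bucket) -> jump to exactly one restart interval and scan it
    //   NotFound      -> key provably absent; the block body is never touched
    //   Conflicted    -> bucket shared by several keys; fall back to binary
    //                    searching the restart pointers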
+ #[must_use] + pub fn hash_bucket_conflict_count(&self) -> Option { + self.get_hash_index_reader() + .map(|reader| reader.conflict_count()) + } - if !should_skip { - let key = bytes.slice(key_start..key_end); - - return Ok(Some(if value_type == ValueType::Value { - let val_offset = pos + cursor.position() as usize; - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) - } else { - InternalValue::from_components(key, b"", seqno, value_type) - })); - } - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); - } - Less => { - // NOTE: Continue - } - } + /// Returns the amount of empty hash buckets. + #[must_use] + pub fn hash_bucket_free_count(&self) -> Option { + self.get_hash_index_reader() + .map(|reader| reader.free_count()) + } - cursor.seek_relative(val_len as i64).expect("should read"); + /// Returns the amount of items in the block. + #[must_use] + pub fn len(&self) -> usize { + Trailer::new(&self.inner).item_count() + } - key - }; + /// Always returns false: a block is never empty. + #[must_use] + pub fn is_empty(&self) -> bool { + false + } - // NOTE: Check the rest items - for _idx in 1..restart_interval { - let value_type = cursor.read_u8()?; + fn binary_search_for_offset( + &self, + binary_index: &BinaryIndexReader, + needle: &[u8], + seqno: Option, + ) -> Option { + let mut left: usize = 0; + let mut right = binary_index.len(); - if value_type == TERMINATOR_MARKER { - return Ok(None); - } + if right == 0 { + return None; + } - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; + if let Some(seqno) = seqno { + let seqno_cmp = Reverse(seqno - 1); - let seqno = cursor.read_u64_varint()?; + while left < right { + let mid = left + (right - left) / 2; - let shared_prefix_len: usize = cursor.read_u16_varint()?.into(); - let rest_key_len: usize = cursor.read_u16_varint()?.into(); + let offset = binary_index.get(mid); - let key_offset = pos + cursor.position() as usize; + let peter = unwrappy!(self.get_key_at(offset)); - let prefix_part = &base_key[0..shared_prefix_len]; - let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; - cursor.seek_relative(rest_key_len as i64)?; + if (needle, seqno_cmp) >= peter { + left = mid + 1; + } else { + right = mid; + } + } + } else { + while left < right { + let mid = left + (right - left) / 2; - let val_len: usize = if value_type == ValueType::Value { - cursor.read_u32_varint().expect("should read") as usize - } else { - 0 - }; + let offset = binary_index.get(mid); - match compare_prefixed_slice(prefix_part, rest_key, needle) { - Equal => { - let should_skip = seqno_watermark - .map(|watermark| seqno >= watermark) - .unwrap_or(false); + let peter = unwrappy!(self.get_key_at(offset)); - if !should_skip { - let key = if shared_prefix_len == 0 { - bytes.slice(key_offset..(key_offset + rest_key_len)) - } else if rest_key_len == 0 { - bytes.slice(base_key_pos..(base_key_pos + shared_prefix_len)) - } else { - // Stitch key - Slice::fused(prefix_part, rest_key) - }; - - return Ok(Some(if value_type == ValueType::Value { - let val_offset = pos + cursor.position() as usize; - let value = bytes.slice(val_offset..(val_offset + val_len)); - InternalValue::from_components(key, value, seqno, value_type) - } else { - InternalValue::from_components(key, b"", seqno, value_type) - })); - } - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); + if needle >= peter.0 { + left 
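                    // (partition-point search: `left` converges on the first
                    // restart key strictly greater than the needle, so the
                    // `left - 1` below is the last interval that can contain
                    // it; keys order by user key ascending, then by seqno
                    // descending via the `Reverse` wrapper)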
= mid + 1; + } else { + right = mid; } - Less => { - // NOTE: Continue - } - } - - if value_type == ValueType::Value { - cursor.seek_relative(val_len as i64)?; } } - Ok(None) - } - - pub fn binary_index_pointer_count(&self) -> usize { - let bytes = &self.inner.data; - - // SAFETY: We know that there is always a trailer, so we cannot go out of bounds - #[warn(unsafe_code)] - let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) }; - - let _item_count = reader.read_u32::().expect("should read") as usize; - let _restart_interval = reader.read_u8().expect("should read") as usize; - - let _binary_index_step_size = reader.read_u8().expect("should read") as usize; + if left == 0 { + return None; + } - let _binary_index_offset = reader.read_u32::().expect("should read") as usize; + let offset = binary_index.get(left - 1); - reader.read_u32::().expect("should read") as usize + Some(offset) } - pub fn hash_bucket_count(&self) -> usize { - let bytes = &self.inner.data; - - // SAFETY: We know that there is always a trailer, so we cannot go out of bounds - #[warn(unsafe_code)] - let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) }; + fn parse_restart_item(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { + let value_type = unwrappy!(reader.read_u8()); - let _item_count = reader.read_u32::().expect("should read") as usize; - let _restart_interval = reader.read_u8().expect("should read") as usize; + if value_type == TRAILER_START_MARKER { + return None; + } - let _binary_index_step_size = reader.read_u8().expect("should read") as usize; - let _binary_index_offset = reader.read_u32::().expect("should read") as usize; - let _binary_index_len = reader.read_u32::().expect("should read") as usize; + let seqno = unwrappy!(reader.read_u64_varint()); - let hash_index_offset = reader.read_u32::().expect("should read") as usize; + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(key_len as i64)); - if hash_index_offset > 0 { - reader.read_u32::().expect("should read") as usize + let val_len: usize = if value_type == ValueType::Value.into() { + unwrappy!(reader.read_u32_varint()) as usize } else { 0 - } + }; + let val_offset = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(val_len as i64)); + + Some(if value_type == ValueType::Value.into() { + ParsedItem { + value_type, + seqno, + prefix: None, + key: ParsedSlice(key_start, key_start + key_len), + value: Some(ParsedSlice(val_offset, val_offset + val_len)), + } + } else { + ParsedItem { + value_type, + seqno, + prefix: None, + key: ParsedSlice(key_start, key_start + key_len), + value: None, // TODO: enum value/tombstone, so value is not Option for values + } + }) } - fn trailer_offset(&self) -> usize { - self.inner.data.len() - TRAILER_SIZE - } + fn parse_truncated_item( + reader: &mut Cursor<&[u8]>, + offset: usize, + base_key_offset: usize, + ) -> Option { + let value_type = unwrappy!(reader.read_u8()); - /// Returns the amount of items in the block - #[must_use] - pub fn len(&self) -> usize { - let bytes = &self.inner.data; + if value_type == TRAILER_START_MARKER { + return None; + } - // SAFETY: We know that there is always a trailer, so we cannot go out of bounds - #[warn(unsafe_code)] - let mut reader = unsafe { bytes.get_unchecked(self.trailer_offset()..) 
}; + let seqno = unwrappy!(reader.read_u64_varint()); - reader.read_u32::().expect("should read") as usize - } + let shared_prefix_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let rest_key_len: usize = unwrappy!(reader.read_u16_varint()).into(); - /// Always returns false: a block is never empty - #[must_use] - pub fn is_empty(&self) -> bool { - false - } + let key_offset = offset + reader.position() as usize; - pub fn point_read( - &self, - key: &[u8], - seqno: Option, - ) -> crate::Result> { - let bytes = &self.inner.data; + unwrappy!(reader.seek_relative(rest_key_len as i64)); - let start_pos = self.trailer_offset() - + /* skip item count */ std::mem::size_of::(); + let val_len: usize = if value_type == ValueType::Value.into() { + unwrappy!(reader.read_u32_varint()) as usize + } else { + 0 + }; + let val_offset = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(val_len as i64)); + + Some(if value_type == ValueType::Value.into() { + ParsedItem { + value_type, + seqno, + prefix: Some(ParsedSlice( + base_key_offset, + base_key_offset + shared_prefix_len, + )), + key: ParsedSlice(key_offset, key_offset + rest_key_len), + value: Some(ParsedSlice(val_offset, val_offset + val_len)), + } + } else { + ParsedItem { + value_type, + seqno, + prefix: Some(ParsedSlice( + base_key_offset, + base_key_offset + shared_prefix_len, + )), + key: ParsedSlice(key_offset, key_offset + rest_key_len), + value: None, + } + }) + } - // SAFETY: We know that there is always a trailer, so we cannot go out of bounds - #[warn(unsafe_code)] - let mut reader = unsafe { bytes.get_unchecked(start_pos..) }; + fn scan(&self, needle: &[u8], seqno: Option, offset: usize) -> Option { + /* let iter = Iter::new(self).with_offset(offset); - let restart_interval = reader.read_u8().expect("should read") as usize; + for kv in iter { + let kv = kv?; - let binary_index_step_size = reader.read_u8().expect("should read") as usize; + let cmp_result = if let Some(prefix) = &kv.prefix { + let prefix = &self.bytes()[prefix.0..prefix.1]; + let rest_key = &self.bytes()[kv.key.0..kv.key.1]; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = &self.bytes()[kv.key.0..kv.key.1]; + key.cmp(needle) + }; - debug_assert!( - binary_index_step_size == 2 || binary_index_step_size == 4, - "invalid binary index step size", - ); + match cmp_result { + std::cmp::Ordering::Equal => { + // TODO: maybe return early if past seqno + let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); - // eprintln!("binary index step size={binary_index_step_size}"); + if !should_skip { + let kv = kv.materialize(&self.inner.data); + return Ok(Some(kv)); + } + } + std::cmp::Ordering::Greater => { + // Already passed needle + return Ok(None); + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + } - let binary_index_offset = reader.read_u32::().expect("should read") as usize; - let binary_index_len = reader.read_u32::().expect("should read") as usize; - let binary_index = BinaryIndexReader::new( - bytes, - binary_index_offset, - binary_index_len, - binary_index_step_size, - ); + Ok(None) */ - // TODO: if the binary index is really dense, don't look into hash index, or - // maybe don't even build it in the first place + let bytes = self.bytes(); - let hash_index_offset = reader.read_u32::().expect("should read") as usize; + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut 
reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) }); - if hash_index_offset > 0 { - let hash_bucket_count = - reader.read_u32::().expect("should read") as usize; + let head = Self::parse_restart_item(&mut reader, offset)?; - let hash_index = HashIndexReader::new(bytes, hash_index_offset, hash_bucket_count); + let key = &bytes[head.key.0..head.key.1]; + let base_key_offset = head.key.0; - if let Some(bucket_value) = hash_index.get(key) { - let restart_entry_pos = binary_index.get(usize::from(bucket_value)); + match key.cmp(needle) { + std::cmp::Ordering::Equal => { + // TODO: maybe return early if past seqno + let should_skip = seqno.is_some_and(|watermark| head.seqno >= watermark); - return self.walk(key, seqno, restart_entry_pos, restart_interval); + if !should_skip { + let kv = head.materialize(&self.inner.data); + return Some(kv); + } + } + std::cmp::Ordering::Greater => { + // Already passed needle + return None; + } + std::cmp::Ordering::Less => { + // Continue to next KV } } - // NOTE: Fallback to binary search - - let mut left = 0; - let mut right = binary_index.len(); + for _ in 0..(self.restart_interval - 1) { + let kv = Self::parse_truncated_item(&mut reader, offset, base_key_offset)?; - if right == 0 { - return Ok(None); - } + let cmp_result = if let Some(prefix) = &kv.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; + key.cmp(needle) + }; - let seqno_cmp = Reverse(seqno.unwrap_or(u64::MAX) - 1); + match cmp_result { + std::cmp::Ordering::Equal => { + // TODO: maybe return early if past seqno + let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); - while left < right { - let mid = (left + right) / 2; + if !should_skip { + let kv = kv.materialize(&self.inner.data); + return Some(kv); + } + } + std::cmp::Ordering::Greater => { + // Already passed needle + return None; + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + } - let offset = binary_index.get(mid); + None + } - if (key, seqno_cmp) >= self.get_key_at(offset)? { - left = mid + 1; - } else { - right = mid; + /// Reads an item by key from the block, if it exists. 
+ pub fn point_read( + &self, + needle: &[u8], + seqno: Option, + ) -> crate::Result> { + let binary_index = self.get_binary_index_reader(); + + // NOTE: Try hash index if it exists + if let Some(lookup) = self + .get_hash_index_reader() + .map(|reader| reader.get(needle)) + { + use super::hash_index::Lookup::{Conflicted, Found, NotFound}; + + match lookup { + Found(bucket_value) => { + let offset = binary_index.get(usize::from(bucket_value)); + return Ok(self.scan(needle, seqno, offset)); + } + NotFound => { + return Ok(None); + } + Conflicted => { + // NOTE: Fallback to binary search + } } } - if left == 0 { + let Some(offset) = self.binary_search_for_offset(&binary_index, needle, seqno) else { return Ok(None); - } - - let offset = binary_index.get(left - 1); + }; - self.walk(key, seqno, offset, restart_interval) + Ok(self.scan(needle, seqno, offset)) } pub fn encode_items( @@ -421,8 +602,12 @@ impl DataBlock { .key .user_key; - let mut serializer = - DataBlockEncoder::new(items.len(), restart_interval, hash_index_ratio, first_key); + let mut serializer = Encoder::<'_, (), InternalValue>::new( + items.len(), + restart_interval, + hash_index_ratio, + first_key, + ); for item in items { serializer.write(item)?; @@ -433,13 +618,13 @@ impl DataBlock { } #[cfg(test)] -#[allow(clippy::expect_used)] +#[allow(clippy::expect_used, clippy::unwrap_used)] mod tests { use super::*; use crate::{ - segment::block::{header::Header, offset::BlockOffset}, - super_segment::Block, - Checksum, InternalValue, + segment::block::offset::BlockOffset, + super_segment::{block::Header, Block}, + Checksum, InternalValue, Slice, ValueType::{Tombstone, Value}, }; use std::cmp::Ordering::{Equal, Greater, Less}; @@ -516,14 +701,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); for needle in items { assert_eq!( @@ -553,18 +737,15 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); for needle in items { - eprintln!("NEEDLE {needle:?}"); - assert_eq!( Some(needle.clone()), data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, @@ -579,9 +760,9 @@ mod tests { #[test] fn v3_data_block_fuzz_2() -> crate::Result<()> { let items = [ - InternalValue::from_components([0], [], 18_446_568_565_776_614_018, Value), - InternalValue::from_components([0], [], 6_989_411_799_330_193_407, Tombstone), - InternalValue::from_components([0], [], 864_515_618_921_971_552, Value), + InternalValue::from_components([0], [], 5, Value), + InternalValue::from_components([0], [], 4, Tombstone), + InternalValue::from_components([0], [], 3, Value), InternalValue::from_components([0], [], 0, Value), ]; @@ -594,14 +775,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - 
assert!(data_block.hash_bucket_count() == 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); for needle in items { eprintln!("NEEDLE {needle:?}"); @@ -643,31 +823,120 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); assert_eq!( - data_block.iter().map(|x| x.expect("should be ok")).count(), + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, items.len(), ); + assert_eq!(items, *data_block.iter().collect::>(),); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_4() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::new(&[0]), + Slice::new(&[]), + 3_834_029_160_418_063_669, + Value, + ), + InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone), + InternalValue::from_components( + Slice::new(&[53, 53, 53]), + Slice::new(&[]), + 18_446_744_073_709_551_615, + Tombstone, + ), + InternalValue::from_components( + Slice::new(&[255]), + Slice::new(&[]), + 18_446_744_069_414_584_831, + Tombstone, + ), + InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 1.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for item in data_block.iter() { + eprintln!("{item:?}"); + } + assert_eq!( - items, - *data_block - .iter() - .map(|x| x.expect("should be ok")) - .collect::>(), + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), ); Ok(()) } + #[test] + fn v3_data_block_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"b", b"b", 2, Value), + InternalValue::from_components(b"c", b"c", 1, Value), + InternalValue::from_components(b"d", b"d", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + eprintln!("encoded into {} bytes", bytes.len()); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, None)?, + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + #[test] fn v3_data_block_dense_mvcc_with_hash() -> crate::Result<()> { let items = [ @@ -686,14 +955,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); for needle in 
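        // (MVCC semantics exercised by these reads: a watermark of Some(w)
        // yields the newest version with seqno < w, e.g. versions [5, 4, 3, 0]
        // read at Some(4) yield seqno 3, while None always yields the latest,
        // 5; passing seqno + 1 therefore pins exactly that version)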
items { eprintln!("NEEDLE {needle:?}"); @@ -727,14 +995,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() == 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); for needle in items { eprintln!("NEEDLE {needle:?}"); @@ -769,14 +1036,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); assert!(data_block .point_read(b"pla:venus:fact", None)? @@ -810,14 +1076,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); for needle in items { assert_eq!( @@ -846,19 +1111,22 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); + assert_eq!(data_block.len(), items.len()); + assert_eq!( - data_block.iter().map(|x| x.expect("should be ok")).count(), + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, items.len() ); - assert_eq!(data_block.iter().flatten().collect::>(), items); + assert_eq!(data_block.iter().collect::>(), items); Ok(()) } @@ -879,27 +1147,23 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); assert_eq!( - data_block.iter().map(|x| x.expect("should be ok")).count(), + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, items.len(), ); - assert_eq!( - items, - *data_block - .iter() - .map(|x| x.expect("should be ok")) - .collect::>(), - ); + assert_eq!(items, *data_block.iter().collect::>(),); Ok(()) } @@ -919,25 +1183,19 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert_eq!( - items.len(), - data_block.iter().map(|x| x.expect("should be ok")).count(), - ); + assert_eq!(data_block.len(), items.len()); - assert_eq!( - items, - *data_block - .iter() - .map(|x| x.expect("should be ok")) - .collect::>(), - ); + assert_eq!(items.len(), { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }); + + assert_eq!(items, *data_block.iter().collect::>(),); Ok(()) } @@ -958,36 +1216,169 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - 
data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert!(data_block.hash_bucket_count() > 0); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); - assert_eq!( - items.len(), - data_block - .iter() - .rev() - .map(|x| x.expect("should be ok")) - .count(), - ); + assert_eq!(items.len(), { + #[allow(clippy::suspicious_map)] + data_block.iter().rev().count() + }); assert_eq!( items.into_iter().rev().collect::>(), - data_block - .iter() - .rev() - .map(|x| x.expect("should be ok")) - .collect::>(), + data_block.iter().rev().collect::>(), ); Ok(()) } + #[test] + fn v3_data_block_iter_ping_pong() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + { + let mut iter = data_block.iter(); + + assert_eq!(b"pla:saturn:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); + + let last = iter.next().unwrap().key; + assert_eq!(b"pla:venus:fact", &*last.user_key); + assert_eq!(Tombstone, last.value_type); + assert_eq!(1, last.seqno); + } + + { + let mut iter = data_block.iter(); + + assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:saturn:fact", + &*iter + .next() + .inspect(|v| { + eprintln!("{:?}", String::from_utf8_lossy(&v.key.user_key)); + }) + .unwrap() + .key + .user_key + ); + assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); + + let last = iter.next_back().unwrap().key; + assert_eq!(b"pla:venus:fact", &*last.user_key); + assert_eq!(Tombstone, last.value_type); + assert_eq!(1, last.seqno); + } + + Ok(()) + } + + #[test] + fn v3_data_block_range() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + /* assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block + .range(&((b"pla:venus:" as &[u8])..)) + .map(|x| 
x.unwrap()) + .count() + }, + 3, + ); */ + + Ok(()) + } + + #[test] + fn v3_data_block_range_rev() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + /* assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block + .range(&((b"pla:venus:" as &[u8])..)) + .rev() + .map(|x| x.unwrap()) + .count() + }, + 3, + ); */ + + Ok(()) + } + #[test] fn v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { let items = (0u64..254) @@ -1000,14 +1391,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert_eq!(0, data_block.hash_bucket_count()); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); for needle in items { assert_eq!( @@ -1031,14 +1421,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert_eq!(0, data_block.hash_bucket_count()); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); for needle in items { assert_eq!( @@ -1062,14 +1451,13 @@ mod tests { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), - compression: crate::CompressionType::None, - data_length: 0, - uncompressed_length: 0, + data_length: 0, previous_block_offset: BlockOffset(0), }, }); - assert_eq!(0, data_block.hash_bucket_count()); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); for needle in items { assert_eq!( From 6d598cd3c2680f98ac3a728df51516acf3dbc574 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 19:00:32 +0200 Subject: [PATCH 033/613] fix fuzz --- fuzz/fuzz_targets/data_block.rs | 2 -- src/super_segment/block/mod.rs | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 1d3f2a38..18ea83d0 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -82,9 +82,7 @@ fuzz_target!(|data: &[u8]| { data: bytes.into(), header: lsm_tree::segment::block::header::Header { checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), - compression: lsm_tree::CompressionType::None, data_length: 0, - uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs index 9b117f6d..37a4e890 100644 --- a/src/super_segment/block/mod.rs +++ b/src/super_segment/block/mod.rs @@ -10,7 +10,7 @@ use crate::Slice; /// A block on disk. 
/// -/// Consists of a header and some bytes (the data/payload) +/// Consists of a header and some bytes (the data/payload). #[derive(Clone)] pub struct Block { pub header: Header, @@ -18,7 +18,7 @@ pub struct Block { } impl Block { - /// Returns the uncompressed block size in bytes + /// Returns the uncompressed block size in bytes. #[must_use] pub fn size(&self) -> usize { self.data.len() From cb3d78d782686fcd64d536428fdcf87827d6415c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 23:17:21 +0200 Subject: [PATCH 034/613] refactor --- src/super_segment/binary_index/reader.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index 9d8fc81e..729cdec3 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -18,9 +18,11 @@ impl<'a> Reader<'a> { let offset = offset as usize; let len = len as usize; let step_size = step_size as usize; + let size = len * step_size; + let end = offset + size; Self { - bytes: &bytes[offset..(offset + len * step_size)], + bytes: &bytes[offset..end], step_size, } } From bb2a3fe1f385427903bb1b1e7594f9d6b1c8d1b9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 9 Apr 2025 23:18:28 +0200 Subject: [PATCH 035/613] refactor --- fuzz/fuzz_targets/data_block.rs | 3 +- src/super_segment/block/header.rs | 77 ++++++++++++++++++++++++++++- src/super_segment/data_block/mod.rs | 22 ++++++++- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 18ea83d0..cb2af88c 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -80,9 +80,10 @@ fuzz_target!(|data: &[u8]| { let data_block = DataBlock::new(Block { data: bytes.into(), - header: lsm_tree::segment::block::header::Header { + header: lsm_tree::super_segment::block::Header { checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); diff --git a/src/super_segment/block/header.rs b/src/super_segment/block/header.rs index f2cbfb5c..855ac7af 100644 --- a/src/super_segment/block/header.rs +++ b/src/super_segment/block/header.rs @@ -1,4 +1,8 @@ -use crate::{segment::block::offset::BlockOffset, Checksum}; +use crate::coding::{Encode, EncodeError,Decode,DecodeError}; +use crate::{file::MAGIC_BYTES, segment::block::offset::BlockOffset, Checksum}; +use byteorder::LittleEndian; +use byteorder::{ReadBytesExt,WriteBytesExt}; +use std::io::{Read, Write}; /// Header of a disk-based block #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -11,4 +15,75 @@ pub struct Header { /// On-disk size of data segment pub data_length: u32, + + /// Uncompressed size of data segment + pub uncompressed_length: u32, +} + +impl Header { + #[must_use] + pub const fn serialized_len() -> usize { + MAGIC_BYTES.len() + // Checksum + + std::mem::size_of::() + // Backlink + + std::mem::size_of::() + // On-disk size + + std::mem::size_of::() + // Uncompressed data length + + std::mem::size_of::() + } +} + +impl Encode for Header { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + // Write header + writer.write_all(&MAGIC_BYTES)?; + + // Write checksum + writer.write_u64::(*self.checksum)?; + + // Write prev offset + writer.write_u64::(*self.previous_block_offset)?; + + // Write on-disk size length + writer.write_u32::(self.data_length)?; + + // Write uncompressed data length + 
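        // (with the new field, `serialized_len` above works out to
        // MAGIC_BYTES.len() + 8 (checksum) + 8 (previous offset)
        // + 4 (data_length) + 4 (uncompressed_length) bytes per header;
        // assuming the crate's usual 4-byte magic, 28 bytes in total)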
writer.write_u32::(self.uncompressed_length)?; + + Ok(()) + } } + +impl Decode for Header { + fn decode_from(reader: &mut R) -> Result { + // Check header + let mut magic = [0u8; MAGIC_BYTES.len()]; + reader.read_exact(&mut magic)?; + + if magic != MAGIC_BYTES { + return Err(DecodeError::InvalidHeader("Block")); + } + + + // Read checksum + let checksum = reader.read_u64::()?; + + // Read prev offset + let previous_block_offset = reader.read_u64::()?; + + // Read data length + let data_length = reader.read_u32::()?; + + // Read data length + let uncompressed_length = reader.read_u32::()?; + + Ok(Self { + checksum: Checksum::from_raw(checksum), + previous_block_offset: BlockOffset(previous_block_offset), + data_length, + uncompressed_length, + }) + } +} \ No newline at end of file diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index f85f3659..065a67ab 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -103,7 +103,7 @@ macro_rules! unwrappy { /// Block that contains key-value pairs (user data) #[derive(Clone)] pub struct DataBlock { - inner: Block, + pub inner: Block, // Cached metadata restart_interval: u8, @@ -702,6 +702,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -738,6 +739,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -776,6 +778,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -824,6 +827,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -876,6 +880,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -917,6 +922,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -956,6 +962,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -996,6 +1003,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1037,6 +1045,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1077,6 +1086,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1112,6 +1122,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1148,6 +1159,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1184,6 +1196,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1217,6 +1230,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, 
previous_block_offset: BlockOffset(0), }, }); @@ -1254,6 +1268,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1319,6 +1334,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1357,6 +1373,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1392,6 +1409,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1422,6 +1440,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); @@ -1452,6 +1471,7 @@ mod tests { header: Header { checksum: Checksum::from_raw(0), data_length: 0, + uncompressed_length: 0, previous_block_offset: BlockOffset(0), }, }); From f0d2203ff71dd3f9210b1a3d7235026917dbdc90 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 00:41:42 +0200 Subject: [PATCH 036/613] data blocks iter working now with ping pong iteration too --- fuzz/Cargo.toml | 2 + fuzz/fuzz_targets/data_block.rs | 94 ++++++++- src/super_segment/data_block/iter.rs | 277 ++++++++++++--------------- src/super_segment/data_block/mod.rs | 192 +++++++++++++------ 4 files changed, 335 insertions(+), 230 deletions(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index d7f3f948..10cbfbe4 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -11,6 +11,8 @@ cargo-fuzz = true arbitrary = { version = "1", features = ["derive"] } libfuzzer-sys = "0.4" lsm-tree = { path = ".." 
} +rand_chacha = "0.9" +rand = "0.9" [[bin]] name = "data_block" diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index cb2af88c..577cca38 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -44,6 +44,15 @@ impl<'a> Arbitrary<'a> for FuzzyValue { } } +fn generate_ping_pong_code(seed: u64, len: usize) -> Vec { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.random_range(0..=1)).collect() +} + fuzz_target!(|data: &[u8]| { let mut unstructured = Unstructured::new(data); @@ -53,6 +62,8 @@ fuzz_target!(|data: &[u8]| { .min(1.0) .max(0.0); + let seed = u64::arbitrary(&mut unstructured).unwrap(); + // eprintln!("restart_interval={restart_interval}, hash_ratio={hash_ratio}"); if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { @@ -113,17 +124,84 @@ fuzz_target!(|data: &[u8]| { ); } - assert_eq!(items, data_block.iter().collect::>(),); + assert_eq!(items, data_block.iter().collect::>()); - /* assert_eq!( + assert_eq!( items.iter().rev().cloned().collect::>(), - data_block - .iter() - .rev() - .collect::>(), - ); */ + data_block.iter().rev().collect::>(), + ); + + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); + + let expected_ping_ponged_items = { + let mut iter = items.iter(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } + + v + }; + + let real_ping_ponged_items = { + let mut iter = data_block.iter(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } + } + + v + }; + + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } + + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); + + let expected_ping_ponged_items = { + let mut iter = items.iter().rev(); + let mut v = vec![]; - // TODO: add ping-pong iters + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } + + v + }; + + let real_ping_ponged_items = { + let mut iter = data_block.iter().rev(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } + } + + v + }; + + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } // TODO: add range iter too } diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index 9a04d59d..e66ccb14 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -2,22 +2,28 @@ use super::DataBlock; use crate::{key::InternalKey, InternalValue, SeqNo, Slice}; use std::io::Cursor; +#[derive(Default, Debug)] +struct LoScanner { + offset: usize, + remaining_in_interval: usize, + base_key_offset: Option, +} + +#[derive(Debug)] +struct HiScanner { + offset: usize, + ptr_idx: usize, + stack: Vec, + base_key_offset: Option, +} + /// Double-ended iterator over data blocks pub struct Iter<'a> { block: &'a DataBlock, - - cursor: usize, - remaining_in_interval: usize, restart_interval: usize, - lo_watermark: usize, - - // base_key: Option<&'a [u8]>, - base_key_offset: Option, - - hi_ptr_idx: usize, - hi_stack: Vec, - // TODO: refactor into two members: LoScanner and HiScanner + lo_scanner: LoScanner, + hi_scanner: HiScanner, } 
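
The switch from a single cursor to a LoScanner/HiScanner pair is what makes the ping-pong test above work: next() mutates only the forward state, next_back() mutates only the backward state, and both directions stop once the two offsets meet. A minimal sketch of that termination rule over a plain slice (illustrative only; PingPong and its fields are not part of this patch):

// Sketch: double-ended iteration with two independent cursors that
// stop when they meet, mirroring the LoScanner/HiScanner split.
struct PingPong<'a, T> {
    items: &'a [T],
    lo: usize, // next index for forward iteration
    hi: usize, // one past the next index for backward iteration
}

impl<'a, T> PingPong<'a, T> {
    fn new(items: &'a [T]) -> Self {
        Self { items, lo: 0, hi: items.len() }
    }
}

impl<'a, T> Iterator for PingPong<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<&'a T> {
        if self.lo >= self.hi {
            return None; // scanners met, iterator is exhausted
        }
        let item = self.items.get(self.lo);
        self.lo += 1;
        item
    }
}

impl<'a, T> DoubleEndedIterator for PingPong<'a, T> {
    fn next_back(&mut self) -> Option<&'a T> {
        if self.lo >= self.hi {
            return None; // scanners met, iterator is exhausted
        }
        self.hi -= 1;
        self.items.get(self.hi)
    }
}

The real iterator below cannot simply decrement an index, because truncated items can only be decoded front-to-back within a restart interval; that is why next_back() re-parses an entire interval and buffers its offsets in hi_scanner.stack.
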
/// [start, end] slice indexes @@ -48,11 +54,10 @@ impl ParsedItem { self.value_type.try_into().expect("should work"), ); - let value = if let Some(value) = &self.value { - bytes.slice(value.0..value.1) - } else { - Slice::empty() - }; + let value = self + .value + .as_ref() + .map_or_else(Slice::empty, |v| bytes.slice(v.0..v.1)); InternalValue { key, value } } @@ -66,184 +71,127 @@ impl<'a> Iter<'a> { Self { block, - cursor: 0, - remaining_in_interval: 0, restart_interval, - lo_watermark: 0, + lo_scanner: LoScanner::default(), - // base_key: None, // TODO: remove - base_key_offset: None, - - hi_ptr_idx: binary_index_len, - hi_stack: Vec::new(), + hi_scanner: HiScanner { + offset: 0, + ptr_idx: binary_index_len, + stack: Vec::new(), + base_key_offset: None, + }, } } - pub fn with_offset(mut self, offset: usize) -> Self { + /* pub fn with_offset(mut self, offset: usize) -> Self { self.lo_watermark = offset; self - } - - // TODO: refactor together with deserialize and point_read - // skip should return the basic info, and rename to deserialize - // rename deserialize to materialize by using the return type of deserialize - /* fn skip_restart_item(&mut self) -> crate::Result { - let bytes = &self.block.inner.data; - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(self.cursor..) }); - - let parsed = DataBlock::parse_restart_head(&mut reader)?; - - if parsed.value_type == TRAILER_START_MARKER { - return Ok(false); - } - - let value_type: ValueType = parsed - .value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", parsed.value_type)))?; - - let key_start = self.cursor + parsed.key_start; - let key_end = key_start + parsed.key_len; - let key = bytes.slice(key_start..key_end); - - let val_len: usize = if value_type == ValueType::Value { - reader.read_u32_varint()? as usize - } else { - 0 - }; - reader.seek_relative(val_len as i64)?; - - self.cursor += reader.position() as usize; - self.base_key = Some(key); - - Ok(true) } */ - // TODO: refactor together with deserialize and point_read - // skip should return the basic info, and rename to deserialize - // rename deserialize to materialize by using the return type of deserialize - /* fn skip_truncated_item(&mut self) -> crate::Result { - let bytes = &self.block.inner.data; + fn parse_restart_item( + block: &DataBlock, + offset: &mut usize, + base_key_offset: &mut Option, + ) -> Option { + let bytes = block.bytes(); // SAFETY: The cursor is advanced by read_ operations which check for EOF, // And the cursor starts at 0 - the slice is never empty #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(self.cursor..) }); + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); - let value_type = reader.read_u8()?; - - if value_type == TRAILER_START_MARKER { - return Ok(false); - } + let item = DataBlock::parse_restart_item(&mut reader, *offset)?; - let value_type: ValueType = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - - let _seqno = reader.read_u64_varint()?; - - let _shared_prefix_len: usize = reader.read_u16_varint()?.into(); - let rest_key_len: usize = reader.read_u16_varint()?.into(); - - reader.seek_relative(rest_key_len as i64)?; - - let val_len: usize = if value_type == ValueType::Value { - reader.read_u32_varint()? 
as usize - } else { - 0 - }; - reader.seek_relative(val_len as i64)?; - - self.cursor += reader.position() as usize; - - Ok(true) - } */ - - fn parse_restart_item(&mut self, offset: usize) -> Option { - let bytes = &self.block.inner.data; - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) }); - - let Some(item) = DataBlock::parse_restart_item(&mut reader, offset) else { - return None; - }; - - self.cursor += reader.position() as usize; - self.base_key_offset = Some(item.key.0); + *offset += reader.position() as usize; + *base_key_offset = Some(item.key.0); Some(item) } - fn parse_truncated_item(&mut self, offset: usize) -> Option { - let bytes = &self.block.inner.data; + fn parse_truncated_item( + block: &DataBlock, + offset: &mut usize, + base_key_offset: usize, + ) -> Option { + let bytes = block.bytes(); // SAFETY: The cursor is advanced by read_ operations which check for EOF, // And the cursor starts at 0 - the slice is never empty #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) }); + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); - let Some(item) = DataBlock::parse_truncated_item( - &mut reader, - offset, - self.base_key_offset.expect("should exist"), - ) else { - return None; - }; + let item = DataBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; - self.cursor += reader.position() as usize; + *offset += reader.position() as usize; Some(item) } - /* fn consume_stack_top(&mut self) -> crate::Result> { - if let Some(offset) = self.hi_stack.pop() { - if self.lo_watermark > 0 && offset <= self.lo_watermark { - return Ok(None); + fn consume_stack_top(&mut self) -> Option { + if let Some(offset) = self.hi_scanner.stack.pop() { + if self.lo_scanner.offset > 0 && offset < self.lo_scanner.offset { + return None; } - self.cursor = offset; - - // TODO: pop from stack, check if offset < self.cursor, then also make sure to terminate forwards iteration - // TODO: probably need a lo_cursor + self.hi_scanner.offset = offset; - let is_restart = self.hi_stack.is_empty(); + let is_restart = self.hi_scanner.stack.is_empty(); if is_restart { - self.deserialize_restart_item() + Self::parse_restart_item( + self.block, + &mut self.hi_scanner.offset, + &mut self.hi_scanner.base_key_offset, + ) } else { - self.deserialize_truncated_item() + Self::parse_truncated_item( + self.block, + &mut self.hi_scanner.offset, + self.hi_scanner.base_key_offset.expect("should exist"), + ) } } else { - Ok(None) + None } - } */ + } } impl Iterator for Iter<'_> { type Item = ParsedItem; fn next(&mut self) -> Option { - let is_restart = self.remaining_in_interval == 0; + if self.hi_scanner.base_key_offset.is_some() + && self.lo_scanner.offset >= self.hi_scanner.offset + { + return None; + } - self.cursor = self.lo_watermark; + let is_restart = self.lo_scanner.remaining_in_interval == 0; let item = if is_restart { - self.remaining_in_interval = self.restart_interval; - self.parse_restart_item(self.lo_watermark) + self.lo_scanner.remaining_in_interval = self.restart_interval; + + Self::parse_restart_item( + self.block, + &mut self.lo_scanner.offset, + &mut self.lo_scanner.base_key_offset, + ) } else { - self.parse_truncated_item(self.lo_watermark) + Self::parse_truncated_item( + self.block, + &mut self.lo_scanner.offset, + 
self.lo_scanner.base_key_offset.expect("should exist"), + ) }; - self.lo_watermark = self.cursor; - self.remaining_in_interval -= 1; + self.lo_scanner.remaining_in_interval -= 1; + + if self.hi_scanner.base_key_offset.is_some() + && self.lo_scanner.offset >= self.hi_scanner.offset + { + return None; + } item } @@ -251,42 +199,53 @@ impl Iterator for Iter<'_> { impl DoubleEndedIterator for Iter<'_> { fn next_back(&mut self) -> Option { - todo!() - /* if let Some(top) = fail_iter!(self.consume_stack_top()) { - return Some(Ok(top)); + if let Some(top) = self.consume_stack_top() { + return Some(top); } - self.hi_ptr_idx = self.hi_ptr_idx.wrapping_sub(1); + self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1); // NOTE: If we wrapped, we are at the end // This is safe to do, because there cannot be that many restart intervals - if self.hi_ptr_idx == usize::MAX { + if self.hi_scanner.ptr_idx == usize::MAX { return None; } let binary_index = self.block.get_binary_index_reader(); { - let offset = binary_index.get(self.hi_ptr_idx); - self.cursor = offset; - - if fail_iter!(self.skip_restart_item()) { - self.hi_stack.push(offset); + self.hi_scanner.offset = binary_index.get(self.hi_scanner.ptr_idx); + let offset = self.hi_scanner.offset; + + if Self::parse_restart_item( + self.block, + &mut self.hi_scanner.offset, + &mut self.hi_scanner.base_key_offset, + ) + .is_some() + { + self.hi_scanner.stack.push(offset); } } for _ in 1..self.restart_interval { - let cursor = self.cursor; - - if fail_iter!(self.skip_truncated_item()) { - self.hi_stack.push(cursor); + let offset = self.hi_scanner.offset; + + if Self::parse_truncated_item( + self.block, + &mut self.hi_scanner.offset, + self.hi_scanner.base_key_offset.expect("should exist"), + ) + .is_some() + { + self.hi_scanner.stack.push(offset); } } - if self.hi_stack.is_empty() { + if self.hi_scanner.stack.is_empty() { return None; } - self.consume_stack_top().transpose() */ + self.consume_stack_top() } } diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 065a67ab..7eb188b4 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -122,7 +122,7 @@ impl DataBlock { let trailer = Trailer::new(&inner); let mut reader = trailer.as_slice(); - let _item_count = reader.read_u32::().expect("should read"); + let _item_count = unwrappy!(reader.read_u32::()); let restart_interval = unwrappy!(reader.read_u8()); @@ -180,7 +180,7 @@ impl DataBlock { ClippingIter::new(self.iter().with_offset(offset), range) } */ - fn get_key_at(&self, pos: usize) -> crate::Result<(&[u8], Reverse)> { + fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { let bytes = &self.inner.data; // NOTE: Skip value type @@ -200,27 +200,9 @@ impl DataBlock { #[warn(unsafe_code)] let key = bytes.get(key_start..key_end).expect("should read"); - Ok((key, Reverse(seqno))) + (key, Reverse(seqno)) } - /* fn parse_restart_head(cursor: &mut Cursor<&[u8]>) -> crate::Result { - let value_type = unwrappy!(cursor.read_u8()); - - let seqno = unwrappy!(cursor.read_u64_varint()); - - let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - let key_start = cursor.position() as usize; - - unwrappy!(cursor.seek_relative(key_len as i64)); - - Ok(RestartHead { - value_type, - seqno, - key_start, - key_len, - }) - } */ - /// Returns the binary index length (number of pointers). /// /// The number of pointers is equal to the number of restart intervals. 
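
Returning the key together with Reverse(seqno) from get_key_at means the binary search below can compare (user_key, seqno) pairs with plain tuple ordering: keys sort ascending, and for equal user keys the higher (newer) seqno sorts first. A self-contained illustration using only std (the values are made up):

fn main() {
    use std::cmp::Reverse;

    // Tuple order compares the user key first; Reverse makes larger
    // (newer) seqnos sort before smaller (older) ones.
    let newer = (b"key".as_slice(), Reverse(7u64));
    let older = (b"key".as_slice(), Reverse(3u64));
    assert!(newer < older); // newest version of "key" comes first

    let other = (b"kez".as_slice(), Reverse(100u64));
    assert!(older < other); // a different user key still dominates
}
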
@@ -326,9 +308,7 @@ impl DataBlock { let offset = binary_index.get(mid); - let peter = unwrappy!(self.get_key_at(offset)); - - if (needle, seqno_cmp) >= peter { + if (needle, seqno_cmp) >= self.get_key_at(offset) { left = mid + 1; } else { right = mid; @@ -340,9 +320,7 @@ impl DataBlock { let offset = binary_index.get(mid); - let peter = unwrappy!(self.get_key_at(offset)); - - if needle >= peter.0 { + if needle >= self.get_key_at(offset).0 { left = mid + 1; } else { right = mid; @@ -453,42 +431,6 @@ impl DataBlock { } fn scan(&self, needle: &[u8], seqno: Option, offset: usize) -> Option { - /* let iter = Iter::new(self).with_offset(offset); - - for kv in iter { - let kv = kv?; - - let cmp_result = if let Some(prefix) = &kv.prefix { - let prefix = &self.bytes()[prefix.0..prefix.1]; - let rest_key = &self.bytes()[kv.key.0..kv.key.1]; - compare_prefixed_slice(prefix, rest_key, needle) - } else { - let key = &self.bytes()[kv.key.0..kv.key.1]; - key.cmp(needle) - }; - - match cmp_result { - std::cmp::Ordering::Equal => { - // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); - - if !should_skip { - let kv = kv.materialize(&self.inner.data); - return Ok(Some(kv)); - } - } - std::cmp::Ordering::Greater => { - // Already passed needle - return Ok(None); - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - } - - Ok(None) */ - let bytes = self.bytes(); // SAFETY: The cursor is advanced by read_ operations which check for EOF, @@ -1488,4 +1430,128 @@ mod tests { Ok(()) } + + #[test] + fn v3_data_block_consume_last_back() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + { + let mut iter = data_block.iter(); + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next_back().unwrap().key.user_key + ); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + { + let mut iter = data_block.iter(); + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next_back().unwrap().key.user_key + ); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + Ok(()) + } + + #[test] + fn v3_data_block_consume_last_forwards() -> crate::Result<()> { + let items = [ + 
InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + { + let mut iter = data_block.iter().rev(); + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + { + let mut iter = data_block.iter().rev(); + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + Ok(()) + } } From 648ac53fb34c92ed35cb609805fec4595d64f4a1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:04:30 +0200 Subject: [PATCH 037/613] license --- src/super_segment/binary_index/builder.rs | 4 ++++ src/super_segment/binary_index/mod.rs | 4 ++++ src/super_segment/binary_index/reader.rs | 4 ++++ src/super_segment/block/encoder.rs | 4 ++++ src/super_segment/block/header.rs | 4 ++++ src/super_segment/block/mod.rs | 4 ++++ src/super_segment/block/trailer.rs | 5 ++++- src/super_segment/data_block/iter.rs | 4 ++++ src/super_segment/data_block/mod.rs | 4 ++++ src/super_segment/hash_index/builder.rs | 4 ++++ src/super_segment/hash_index/mod.rs | 4 ++++ src/super_segment/hash_index/reader.rs | 4 ++++ src/super_segment/mod.rs | 4 ++++ src/super_segment/util.rs | 4 ++++ 14 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/super_segment/binary_index/builder.rs b/src/super_segment/binary_index/builder.rs index d27e920e..f17c481e 100644 --- a/src/super_segment/binary_index/builder.rs +++ b/src/super_segment/binary_index/builder.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use byteorder::{LittleEndian, WriteBytesExt}; #[derive(Debug)] diff --git a/src/super_segment/binary_index/mod.rs b/src/super_segment/binary_index/mod.rs index de6da13c..bdd12251 100644 --- a/src/super_segment/binary_index/mod.rs +++ b/src/super_segment/binary_index/mod.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + mod 
builder; mod reader; diff --git a/src/super_segment/binary_index/reader.rs b/src/super_segment/binary_index/reader.rs index 729cdec3..c10130f8 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/binary_index/reader.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use byteorder::{LittleEndian, ReadBytesExt}; macro_rules! unwrappy { diff --git a/src/super_segment/block/encoder.rs b/src/super_segment/block/encoder.rs index a6997fe6..2d875e22 100644 --- a/src/super_segment/block/encoder.rs +++ b/src/super_segment/block/encoder.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::super::hash_index::Builder as HashIndexBuilder; use super::{super::binary_index::Builder as BinaryIndexBuilder, Trailer}; use crate::super_segment::util::longest_shared_prefix_length; diff --git a/src/super_segment/block/header.rs b/src/super_segment/block/header.rs index 855ac7af..28a72537 100644 --- a/src/super_segment/block/header.rs +++ b/src/super_segment/block/header.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use crate::coding::{Encode, EncodeError,Decode,DecodeError}; use crate::{file::MAGIC_BYTES, segment::block::offset::BlockOffset, Checksum}; use byteorder::LittleEndian; diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs index 37a4e890..64bfdf11 100644 --- a/src/super_segment/block/mod.rs +++ b/src/super_segment/block/mod.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + mod encoder; mod header; mod trailer; diff --git a/src/super_segment/block/trailer.rs b/src/super_segment/block/trailer.rs index 094d9581..273681c7 100644 --- a/src/super_segment/block/trailer.rs +++ b/src/super_segment/block/trailer.rs @@ -1,9 +1,12 @@ -use crate::super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX; +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) use super::{ encoder::{Encodable, Encoder}, Block, }; +use crate::super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; pub const TRAILER_START_MARKER: u8 = 255; diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index e66ccb14..b4e5e2b8 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::DataBlock; use crate::{key::InternalKey, InternalValue, SeqNo, Slice}; use std::io::Cursor; diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 7eb188b4..cbb60e50 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// 
(found in the LICENSE-* files in the repository) + mod iter; pub use iter::Iter; diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 4444dfb1..37aec8f5 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/hash_index/builder.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; use byteorder::WriteBytesExt; diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/hash_index/mod.rs index 3f394a76..760659ad 100644 --- a/src/super_segment/hash_index/mod.rs +++ b/src/super_segment/hash_index/mod.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + //! The hash index is a lightweight (typically <=1 byte per KV) index //! embedded into a block to speed up point reads //! diff --git a/src/super_segment/hash_index/reader.rs b/src/super_segment/hash_index/reader.rs index 0579deb3..369a3bcb 100644 --- a/src/super_segment/hash_index/reader.rs +++ b/src/super_segment/hash_index/reader.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; /// Hash index lookup result diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 1677feae..a41faa57 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + pub(crate) mod binary_index; mod block; pub(crate) mod data_block; diff --git a/src/super_segment/util.rs b/src/super_segment/util.rs index d993c032..3ee25cc2 100644 --- a/src/super_segment/util.rs +++ b/src/super_segment/util.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use std::cmp::Ordering; pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { From 0dcad84eda5ba547618293339dd62efaf9018a9c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:04:34 +0200 Subject: [PATCH 038/613] wip --- src/super_segment/index_block/encoder.rs | 204 ----------------- 1 file changed, 204 deletions(-) delete mode 100644 src/super_segment/index_block/encoder.rs diff --git a/src/super_segment/index_block/encoder.rs b/src/super_segment/index_block/encoder.rs deleted file mode 100644 index 4ab3aa14..00000000 --- a/src/super_segment/index_block/encoder.rs +++ /dev/null @@ -1,204 +0,0 @@ -use super::super::hash_index::Builder as HashIndexBuilder; -use super::{super::binary_index::Builder as BinaryIndexBuilder, NewKeyedBlockHandle}; -use crate::super_segment::util::longest_shared_prefix_length; -use crate::{ - segment::{block::offset::BlockOffset, trailer::TRAILER_SIZE}, - super_segment::{block::TRAILER_START_MARKER, hash_index::MAX_POINTERS_FOR_HASH_INDEX}, -}; -use byteorder::{LittleEndian, WriteBytesExt}; -use std::io::Write; -use varint_rs::VarintWriter; - -pub struct Encoder<'a> { - writer: Vec, - 
binary_index_builder: BinaryIndexBuilder, - hash_index_builder: HashIndexBuilder, - - restart_interval: u8, - - use_prefix_truncation: bool, - base_key: &'a [u8], - - offset: BlockOffset, - - restart_count: usize, - item_count: usize, -} - -impl<'a> Encoder<'a> { - pub fn new( - item_count: usize, - restart_interval: u8, - hash_index_ratio: f32, - use_prefix_truncation: bool, - first_key: &'a [u8], - ) -> Self { - let binary_index_len = item_count / usize::from(restart_interval); - let bucket_count = (item_count as f32 * hash_index_ratio) as u32; // TODO: verify - - Self { - writer: Vec::with_capacity(u16::MAX.into()), - - binary_index_builder: BinaryIndexBuilder::new(binary_index_len), - hash_index_builder: HashIndexBuilder::new(bucket_count), - - restart_interval, - - use_prefix_truncation, - base_key: first_key, - - offset: BlockOffset(0), - - restart_count: 0, - item_count: 0, - } - } - - pub fn write(&mut self, handle: &'a NewKeyedBlockHandle) -> crate::Result<()> { - // NOTE: Check if we are a restart marker - if self.item_count % usize::from(self.restart_interval) == 0 { - // We encode restart markers as: - // [offset] [size] [key len] [end key] - - self.restart_count += 1; - - // NOTE: We know that data blocks will never even approach 4 GB in size - #[allow(clippy::cast_possible_truncation)] - self.binary_index_builder.insert(self.writer.len() as u32); - - self.writer.write_u64_varint(*handle.offset)?; - self.writer.write_u32_varint(handle.size)?; - self.writer.write_u16_varint(handle.end_key.len() as u16)?; - self.writer.write_all(&handle.end_key)?; - - self.base_key = &handle.end_key; - self.offset = BlockOffset(*handle.offset + u64::from(handle.size)); - } else { - // We encode truncated handles as: - // [size] [shared prefix len] [rest key len] [rest key] - - self.writer.write_u32_varint(handle.size)?; - - let shared_prefix_len = if self.use_prefix_truncation { - // NOTE: We can safely cast to u16, because keys are u16 long max - #[allow(clippy::cast_possible_truncation)] - let shared_prefix_len = - longest_shared_prefix_length(self.base_key, &handle.end_key) as u16; - - shared_prefix_len - } else { - self.writer.write_u8(0)?; - 0 - }; - - // TODO: maybe we can skip this varint altogether if prefix truncation = false - self.writer.write_u16_varint(shared_prefix_len)?; - - // NOTE: We can safely cast to u16, because keys are u16 long max - #[allow(clippy::cast_possible_truncation)] - let rest_len = handle.end_key.len() as u16 - shared_prefix_len; - - self.writer.write_u16_varint(rest_len)?; - - let truncated_user_key = handle - .end_key - .get(shared_prefix_len as usize..) - .expect("should be in bounds"); - - self.writer.write_all(truncated_user_key)?; - - self.offset += u64::from(handle.size); - } - - if self.hash_index_builder.bucket_count() > 0 { - // NOTE: The max binary index is bound by u8 (technically u8::MAX - 2) - #[allow(clippy::cast_possible_truncation)] - self.hash_index_builder - .set(&handle.end_key, (self.restart_count - 1) as u8); - } - - self.item_count += 1; - - Ok(()) - } - - // TODO: trailer of data block and index block are the same... consolidate into some - // kind of TrailerWriter or whatever - pub fn finish(mut self) -> crate::Result> { - // IMPORTANT: Terminator marker - self.writer.write_u8(TRAILER_START_MARKER)?; - - // TODO: version u8? 
-> add to segment metadata instead - - // NOTE: We know that data blocks will never even approach 4 GB in size - #[allow(clippy::cast_possible_truncation)] - let binary_index_offset = self.writer.len() as u32; - - // Write binary index - let (binary_index_step_size, binary_index_len) = - self.binary_index_builder.write(&mut self.writer)?; - - let mut hash_index_offset = 0u32; - let hash_index_len = self.hash_index_builder.bucket_count(); - - // NOTE: We can only use a hash index when there are 254 buckets or less - // Because 254 and 255 are reserved marker values - // - // With the default restart interval of 16, that still gives us support - // for up to ~4000 KVs - if self.hash_index_builder.bucket_count() > 0 - && binary_index_len <= MAX_POINTERS_FOR_HASH_INDEX.into() - { - // NOTE: We know that data blocks will never even approach 4 GB in size - #[allow(clippy::cast_possible_truncation)] - { - hash_index_offset = self.writer.len() as u32; - } - - // Write hash index - self.hash_index_builder.write(&mut self.writer)?; - } - - // Trailer: - // [item_count] [restart_interval] [binary_index_offset] [binary_index_len] [hash_index_offset] [hash_index_len] - { - #[cfg(debug_assertions)] - let bytes_before = self.writer.len(); - - // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either - #[allow(clippy::cast_possible_truncation)] - self.writer - .write_u32::(self.item_count as u32)?; - - self.writer.write_u8(self.restart_interval)?; - - self.writer.write_u8(binary_index_step_size)?; - - self.writer.write_u32::(binary_index_offset)?; - - // NOTE: Even with a dense index, there can't be more index pointers than items - #[allow(clippy::cast_possible_truncation)] - self.writer - .write_u32::(binary_index_len as u32)?; - - self.writer.write_u32::(hash_index_offset)?; - - self.writer - .write_u32::(if hash_index_offset > 0 { - hash_index_len - } else { - 0 - })?; - - #[cfg(debug_assertions)] - assert_eq!( - TRAILER_SIZE, - self.writer.len() - bytes_before, - "trailer size does not match", - ); - } - - Ok(self.writer) - } -} From f062c465eefd59b089aa1fdd08cc163e9ee93957 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:22:43 +0200 Subject: [PATCH 039/613] bad data block range (for now) --- fuzz/fuzz_targets/data_block.rs | 30 ++++++++++++++++- src/clipping_iter.rs | 50 ++++++++++++---------------- src/super_segment/data_block/iter.rs | 6 ++-- src/super_segment/data_block/mod.rs | 29 ++++++++-------- 4 files changed, 70 insertions(+), 45 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 577cca38..5311f1d0 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -203,7 +203,35 @@ fuzz_target!(|data: &[u8]| { assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); } - // TODO: add range iter too + { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut lo = rng.random_range(0..items.len()); + let mut hi = rng.random_range(0..items.len()); + + if lo > hi { + std::mem::swap(&mut lo, &mut hi); + } + + let lo_key = &items[lo].key.user_key; + let hi_key = &items[hi].key.user_key; + + let expected_range: Vec<_> = items + .iter() + .filter(|kv| kv.key.user_key >= lo_key && kv.key.user_key <= hi_key) + .cloned() + .collect(); + + assert_eq!( + expected_range, + data_block + .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) + .collect::>(), + ); + } } } }); 
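
With this, the data_block target checks point reads, forward and backward iteration, ping-pong iteration, and now bounded ranges against a reference model. The reference for the range case is just a filter over the sorted input; conceptually (a sketch, reference_range is not a function in this patch):

// Reference model for the inclusive-range assertion above: keep every
// item whose user key falls inside [lo, hi].
fn reference_range<'a>(
    items: &'a [(Vec<u8>, Vec<u8>)], // (user_key, value), sorted by key
    lo: &[u8],
    hi: &[u8],
) -> Vec<&'a (Vec<u8>, Vec<u8>)> {
    items
        .iter()
        .filter(|(key, _)| key.as_slice() >= lo && key.as_slice() <= hi)
        .collect()
}

Because both endpoints are drawn from keys that exist in the block, the inclusive range is never empty. Since fuzz/Cargo.toml declares data_block as a cargo-fuzz binary, the whole harness can be exercised with cargo-fuzz on a nightly toolchain, e.g. cargo +nightly fuzz run data_block.
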
diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs index c5845f29..7c70810f 100644 --- a/src/clipping_iter.rs +++ b/src/clipping_iter.rs @@ -4,12 +4,15 @@ use std::{ ops::{Bound, RangeBounds}, }; +/* crate::Result */ +type Item = InternalValue; + /// Clips an iterator to a key range pub struct ClippingIter<'a, K, R, I> where K: AsRef<[u8]>, R: RangeBounds, - I: DoubleEndedIterator>, + I: DoubleEndedIterator, { _phantom: std::marker::PhantomData, @@ -24,7 +27,7 @@ impl<'a, K, R, I> ClippingIter<'a, K, R, I> where K: AsRef<[u8]>, R: RangeBounds, - I: DoubleEndedIterator>, + I: DoubleEndedIterator, { pub fn new(iter: I, range: &'a R) -> Self { Self { @@ -43,13 +46,13 @@ impl Iterator for ClippingIter<'_, K, R, I> where K: AsRef<[u8]>, R: RangeBounds, - I: DoubleEndedIterator>, + I: DoubleEndedIterator, { - type Item = crate::Result; + type Item = Item; fn next(&mut self) -> Option { loop { - let item = fail_iter!(self.inner.next()?); + let item = self.inner.next()?; // NOTE: PERF: As soon as we enter ->[lo..] // we don't need to do key comparisons anymore which are @@ -90,7 +93,7 @@ where Bound::Unbounded => {} } - return Some(Ok(item)); + return Some(item); } } } @@ -99,11 +102,11 @@ impl DoubleEndedIterator for ClippingIter<'_, K, R, I> where K: AsRef<[u8]>, R: RangeBounds, - I: DoubleEndedIterator>, + I: DoubleEndedIterator, { fn next_back(&mut self) -> Option { loop { - let item = fail_iter!(self.inner.next_back()?); + let item = self.inner.next_back()?; match self.range.start_bound() { Bound::Included(start) => { @@ -144,7 +147,7 @@ where } } - return Some(Ok(item)); + return Some(item); } } } @@ -165,14 +168,14 @@ mod tests { ]; let range = "c"..="d"; - let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + let mut iter = ClippingIter::new(items.into_iter(), &range); assert_eq!( Some(b"c" as &[u8]), - iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + iter.next().map(|x| x.key.user_key).as_deref(), ); assert_eq!( Some(b"d" as &[u8]), - iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + iter.next().map(|x| x.key.user_key).as_deref(), ); assert!(iter.next().is_none()); @@ -190,20 +193,14 @@ mod tests { ]; let range = "c"..="d"; - let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + let mut iter = ClippingIter::new(items.into_iter(), &range); assert_eq!( Some(b"d" as &[u8]), - iter.next_back() - .transpose()? - .map(|x| x.key.user_key) - .as_deref(), + iter.next_back().map(|x| x.key.user_key).as_deref(), ); assert_eq!( Some(b"c" as &[u8]), - iter.next_back() - .transpose()? - .map(|x| x.key.user_key) - .as_deref(), + iter.next_back().map(|x| x.key.user_key).as_deref(), ); assert!(iter.next_back().is_none()); @@ -221,21 +218,18 @@ mod tests { ]; let range = "b"..="d"; - let mut iter = ClippingIter::new(items.into_iter().map(Ok), &range); + let mut iter = ClippingIter::new(items.into_iter(), &range); assert_eq!( Some(b"b" as &[u8]), - iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + iter.next().map(|x| x.key.user_key).as_deref(), ); assert_eq!( Some(b"d" as &[u8]), - iter.next_back() - .transpose()? 
- .map(|x| x.key.user_key) - .as_deref(), + iter.next_back().map(|x| x.key.user_key).as_deref(), ); assert_eq!( Some(b"c" as &[u8]), - iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + iter.next().map(|x| x.key.user_key).as_deref(), ); assert!(iter.next_back().is_none()); assert!(iter.next().is_none()); diff --git a/src/super_segment/data_block/iter.rs b/src/super_segment/data_block/iter.rs index b4e5e2b8..64445696 100644 --- a/src/super_segment/data_block/iter.rs +++ b/src/super_segment/data_block/iter.rs @@ -88,10 +88,10 @@ impl<'a> Iter<'a> { } } - /* pub fn with_offset(mut self, offset: usize) -> Self { - self.lo_watermark = offset; + pub fn with_offset(mut self, offset: usize) -> Self { + self.lo_scanner.offset = offset; self - } */ + } fn parse_restart_item( block: &DataBlock, diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index cbb60e50..3413b247 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -10,6 +10,7 @@ use super::block::Trailer; use super::block::{Encodable, Encoder}; use super::hash_index::Reader as HashIndexReader; use super::{binary_index::Reader as BinaryIndexReader, Block}; +use crate::clipping_iter::ClippingIter; use crate::super_segment::block::TRAILER_START_MARKER; use crate::super_segment::util::compare_prefixed_slice; use crate::{InternalValue, SeqNo, ValueType}; @@ -17,6 +18,7 @@ use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; use iter::{ParsedItem, ParsedSlice}; use std::io::Seek; +use std::ops::RangeBounds; use std::{cmp::Reverse, io::Cursor}; use varint_rs::{VarintReader, VarintWriter}; @@ -174,15 +176,20 @@ impl DataBlock { Iter::new(self).map(|kv| kv.materialize(&self.inner.data)) } - /* pub fn range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( + pub fn range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( &'a self, range: &'a R, - ) -> impl DoubleEndedIterator> + 'a { + ) -> impl DoubleEndedIterator + 'a { let offset = 0; // TODO: range & seek to range start using binary index/hash index (first matching restart interval) // TODO: and if range end, seek to range end as well (last matching restart interval) - ClippingIter::new(self.iter().with_offset(offset), range) - } */ + ClippingIter::new( + Iter::new(self) + .with_offset(offset) + .map(|kv| kv.materialize(&self.inner.data)), + range, + ) + } fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { let bytes = &self.inner.data; @@ -1288,16 +1295,13 @@ mod tests { assert_eq!(data_block.len(), items.len()); assert!(data_block.hash_bucket_count().unwrap() > 0); - /* assert_eq!( + assert_eq!( { #[allow(clippy::suspicious_map)] - data_block - .range(&((b"pla:venus:" as &[u8])..)) - .map(|x| x.unwrap()) - .count() + data_block.range(&((b"pla:venus:" as &[u8])..)).count() }, 3, - ); */ + ); Ok(()) } @@ -1327,17 +1331,16 @@ mod tests { assert_eq!(data_block.len(), items.len()); assert!(data_block.hash_bucket_count().unwrap() > 0); - /* assert_eq!( + assert_eq!( { #[allow(clippy::suspicious_map)] data_block .range(&((b"pla:venus:" as &[u8])..)) .rev() - .map(|x| x.unwrap()) .count() }, 3, - ); */ + ); Ok(()) } From bf6b39e87dc454f7700191bfe975b1766ec10821 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:23:11 +0200 Subject: [PATCH 040/613] add Block::from_file with compression support --- src/super_segment/block/mod.rs | 82 ++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/src/super_segment/block/mod.rs 
b/src/super_segment/block/mod.rs index 64bfdf11..e6f62d16 100644 --- a/src/super_segment/block/mod.rs +++ b/src/super_segment/block/mod.rs @@ -6,11 +6,12 @@ mod encoder; mod header; mod trailer; -pub use encoder::{Encodable, Encoder}; +pub(crate) use encoder::{Encodable, Encoder}; pub use header::Header; -pub use trailer::{Trailer, TRAILER_START_MARKER}; +pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; -use crate::Slice; +use crate::{coding::Decode, segment::block::offset::BlockOffset, CompressionType, Slice}; +use std::fs::File; /// A block on disk. /// @@ -27,4 +28,79 @@ impl Block { pub fn size(&self) -> usize { self.data.len() } + + pub fn from_file( + file: &File, + offset: BlockOffset, + size: usize, + compression: CompressionType, + ) -> crate::Result { + // TODO: use a Slice::get_mut instead... needs value-log update + let mut buf = byteview::ByteView::with_size(size); + + { + let mut mutator = buf.get_mut().expect("should be the owner"); + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.read_at(&mut mutator, *offset)?; + } + + #[cfg(windows)] + { + todo!() + } + + #[cfg(not(any(unix, windows)))] + { + compile_error!("unsupported OS"); + unimplemented!(); + } + } + + let header = Header::decode_from(&mut &*buf)?; + + debug_assert_eq!(header.uncompressed_length, { + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + buf.get(Header::serialized_len()..) + .expect("should be in bounds") + .len() as u32 + } + }); + + let data = match compression { + CompressionType::None => buf.slice(Header::serialized_len()..), + CompressionType::Lz4 => { + // NOTE: We know that a header always exists and data is never empty + // So the slice is fine + #[allow(clippy::indexing_slicing)] + let raw_data = &buf[Header::serialized_len()..]; + + let mut data = byteview::ByteView::with_size(header.uncompressed_length as usize); + { + let mut mutator = data.get_mut().expect("should be the owner"); + lz4_flex::decompress_into(raw_data, &mut mutator) + .map_err(|_| crate::Error::Decompress(compression))?; + } + data + } + CompressionType::Miniz(_) => { + // NOTE: We know that a header always exists and data is never empty + // So the slice is fine + #[allow(clippy::indexing_slicing)] + let raw_data = &buf[Header::serialized_len()..]; + + miniz_oxide::inflate::decompress_to_vec(raw_data) + .map_err(|_| crate::Error::Decompress(compression))? 
+ .into() + } + }; + + Ok(Self { + header, + data: Slice::from(data), + }) + } } From 97e57aa1551541f076019c5b8001f6a8bc6948d5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:24:32 +0200 Subject: [PATCH 041/613] wip --- src/super_segment/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index a41faa57..b749e99e 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -3,12 +3,12 @@ // (found in the LICENSE-* files in the repository) pub(crate) mod binary_index; -mod block; +pub mod block; pub(crate) mod data_block; pub(crate) mod hash_index; // mod index_block; pub(crate) mod util; -pub use block::{Block, Header}; +pub use block::Block; pub use data_block::DataBlock; // pub use index_block::IndexBlock; From d78c4d7532e9d680b2a99577fa49b90ad7b74335 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 01:55:03 +0200 Subject: [PATCH 042/613] index block "point read" kinda works --- src/super_segment/index_block/block_handle.rs | 92 ++++ src/super_segment/index_block/mod.rs | 450 ++++++++++++++++++ src/super_segment/mod.rs | 4 +- 3 files changed, 544 insertions(+), 2 deletions(-) create mode 100644 src/super_segment/index_block/block_handle.rs create mode 100644 src/super_segment/index_block/mod.rs diff --git a/src/super_segment/index_block/block_handle.rs b/src/super_segment/index_block/block_handle.rs new file mode 100644 index 00000000..4a0c0485 --- /dev/null +++ b/src/super_segment/index_block/block_handle.rs @@ -0,0 +1,92 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{segment::block::offset::BlockOffset, super_segment::block::Encodable}; +use value_log::UserKey; +use varint_rs::VarintWriter; + +/// Points to a block on file +#[derive(Clone, Debug, Eq)] +#[allow(clippy::module_name_repetitions)] +pub struct NewKeyedBlockHandle { + /// Key of last item in block + pub end_key: UserKey, + + /// Position of block in file + pub offset: BlockOffset, + + /// Size of block in bytes + pub size: u32, +} + +impl Ord for NewKeyedBlockHandle { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for NewKeyedBlockHandle { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.offset.cmp(&other.offset)) + } +} + +impl PartialEq for NewKeyedBlockHandle { + fn eq(&self, other: &Self) -> bool { + self.offset == other.offset + } +} + +impl Encodable for NewKeyedBlockHandle { + fn encode_full_into( + &self, + writer: &mut W, + state: &mut BlockOffset, + ) -> crate::Result<()> { + // We encode restart markers as: + // [offset] [size] [key len] [end key] + // 1 2 3 4 + + writer.write_u64_varint(*self.offset)?; // 1 + writer.write_u32_varint(self.size)?; // 2 + writer.write_u16_varint(self.end_key.len() as u16)?; // 3 + writer.write_all(&self.end_key)?; // 4 + + *state = BlockOffset(*self.offset + u64::from(self.size)); + + Ok(()) + } + + fn encode_truncated_into( + &self, + writer: &mut W, + state: &mut BlockOffset, + shared_len: usize, + ) -> crate::Result<()> { + // We encode truncated handles as: + // [size] [shared prefix len] [rest key len] [rest key] + + writer.write_u32_varint(self.size)?; + + // TODO: maybe we can skip this varint altogether if prefix truncation = false + writer.write_u16_varint(shared_len as u16)?; + + // NOTE: We can safely cast to u16, because keys are u16 
long max + #[allow(clippy::cast_possible_truncation)] + let rest_len = self.end_key.len() - shared_len; + + writer.write_u16_varint(rest_len as u16)?; + + let truncated_user_key = self.end_key.get(shared_len..).expect("should be in bounds"); + writer.write_all(truncated_user_key)?; + + *state += u64::from(self.size); + + Ok(()) + } + + fn key(&self) -> &[u8] { + &self.end_key + } +} diff --git a/src/super_segment/index_block/mod.rs b/src/super_segment/index_block/mod.rs new file mode 100644 index 00000000..099439aa --- /dev/null +++ b/src/super_segment/index_block/mod.rs @@ -0,0 +1,450 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +mod block_handle; + +pub use block_handle::NewKeyedBlockHandle; + +use super::{binary_index::Reader as BinaryIndexReader, block::Encoder, Block}; +use crate::{ + segment::{block::offset::BlockOffset, trailer::TRAILER_SIZE}, + super_segment::block::Trailer, +}; +use byteorder::{LittleEndian, ReadBytesExt}; +use std::io::{Cursor, Seek}; +use varint_rs::VarintReader; + +macro_rules! unwrappy { + ($x:expr) => { + // $x.expect("should read") + + unsafe { $x.unwrap_unchecked() } + }; +} + +/// Block that contains block handles (file offset + size) +pub struct IndexBlock { + pub inner: Block, + + // Cached metadata + restart_interval: u8, + + binary_index_step_size: u8, + binary_index_offset: u32, + binary_index_len: u32, +} + +struct RestartHead { + offset: BlockOffset, + size: u32, + key_start: usize, + key_len: usize, +} + +impl IndexBlock { + #[must_use] + pub fn new(inner: Block) -> Self { + let trailer = Trailer::new(&inner); + let mut reader = trailer.as_slice(); + + let _item_count = reader.read_u32::().expect("should read"); + + let restart_interval = unwrappy!(reader.read_u8()); + + let binary_index_step_size = unwrappy!(reader.read_u8()); + + debug_assert!( + binary_index_step_size == 2 || binary_index_step_size == 4, + "invalid binary index step size", + ); + + let binary_index_offset = unwrappy!(reader.read_u32::()); + let binary_index_len = unwrappy!(reader.read_u32::()); + + Self { + inner, + + restart_interval, + + binary_index_step_size, + binary_index_offset, + binary_index_len, + } + } + + /// Returns the amount of items in the block + #[must_use] + pub fn item_count(&self) -> usize { + Trailer::new(&self.inner).item_count() + } + + /// Always returns false: a block is never empty + #[must_use] + pub fn is_empty(&self) -> bool { + false + } + + /// Returns the trailer position. + fn trailer_offset(data: &[u8]) -> usize { + data.len() - TRAILER_SIZE + } + + /// Access the inner raw bytes + #[must_use] + fn bytes(&self) -> &[u8] { + &self.inner.data + } + + /// Returns the binary index length (number of pointers). + /// + /// The number of pointers is equal to the number of restart intervals. + #[must_use] + pub fn binary_index_len(&self) -> u32 { + self.binary_index_len + } + + /// Returns the binary index offset. + #[must_use] + fn binary_index_offset(&self) -> u32 { + self.binary_index_offset + } + + /// Returns the binary index step size. + /// + /// The binary index can either store u16 or u32 pointers, + /// depending on the size of the data block. + /// + /// Typically blocks are < 64K, so u16 pointers reduce the index + /// size by half. 
+ #[must_use] + fn binary_index_step_size(&self) -> u8 { + self.binary_index_step_size + } + + fn get_binary_index_reader(&self) -> BinaryIndexReader { + BinaryIndexReader::new( + self.bytes(), + self.binary_index_offset(), + self.binary_index_len(), + self.binary_index_step_size(), + ) + } + + fn parse_restart_head(cursor: &mut Cursor<&[u8]>) -> RestartHead { + let offset = unwrappy!(cursor.read_u64_varint()); + let size = unwrappy!(cursor.read_u32_varint()); + + let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + let key_start = cursor.position() as usize; + + unwrappy!(cursor.seek_relative(key_len as i64)); + + RestartHead { + offset: BlockOffset(offset), + size, + key_start, + key_len, + } + } + + fn get_key_at(&self, pos: usize) -> &[u8] { + let bytes = &self.inner.data; + + // SAFETY: pos is always retrieved from the binary index, + // which we consider to be trustworthy + #[warn(unsafe_code)] + let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) }); + + // TODO: maybe move these behind the key + let _ = unwrappy!(cursor.read_u64_varint()); + let _ = unwrappy!(cursor.read_u32_varint()); + + let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + let key_start = cursor.position() as usize; + + let key_start = pos + key_start; + let key_end = key_start + key_len; + + #[warn(unsafe_code)] + let key = bytes.get(key_start..key_end).expect("should read"); + + key + } + + /* fn walk( + &self, + needle: &[u8], + pos: usize, + restart_interval: usize, + ) -> crate::Result> { + use std::cmp::Ordering::{Equal, Greater, Less}; + + let bytes = &self.inner.data; + let mut cursor = Cursor::new(&bytes[pos..]); + + let mut base_key_pos = 0; + let mut offset = BlockOffset(0); + + // NOTE: Check the full item + let base_key = { + let parsed = unwrappy!(Self::parse_restart_head(&mut cursor)); + + let key_start = pos + parsed.key_start; + let key_end = key_start + parsed.key_len; + let key = &bytes[key_start..key_end]; + + match key.cmp(needle) { + Equal => { + let key = bytes.slice(key_start..key_end); + + return Ok(Some(NewKeyedBlockHandle { + end_key: key, + offset: parsed.offset, + size: parsed.size, + })); + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue + } + } + + base_key_pos = key_start; + offset = BlockOffset(*parsed.offset + u64::from(parsed.size)); + + key + }; + + // NOTE: Check the rest items + for _idx in 1..restart_interval { + let size = cursor.read_u32_varint()?; + + let shared_prefix_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + let rest_key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + + let key_offset = pos + cursor.position() as usize; + + // NOTE: PERF: Slicing seems to be faster than get_unchecked!! 
+ let prefix_part = &base_key[0..shared_prefix_len]; + let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; + + unwrappy!(cursor.seek_relative(rest_key_len as i64)); + + match compare_prefixed_slice(prefix_part, rest_key, needle) { + Equal => { + let key = if shared_prefix_len == 0 { + bytes.slice(key_offset..(key_offset + rest_key_len)) + } else if rest_key_len == 0 { + bytes.slice(base_key_pos..(base_key_pos + shared_prefix_len)) + } else { + // Stitch key + UserKey::fused(prefix_part, rest_key) + }; + + return Ok(Some(NewKeyedBlockHandle { + end_key: key, + offset, + size, + })); + } + Greater => { + // NOTE: Already passed searched key + return Ok(None); + } + Less => { + // NOTE: Continue + } + } + + offset += u64::from(size); + } + + Ok(None) + } */ + + fn binary_search_for_offset( + &self, + binary_index: &BinaryIndexReader, + needle: &[u8], + ) -> Option { + let mut left: usize = 0; + let mut right = binary_index.len(); + + if right == 0 { + return None; + } + + while left < right { + let mid = left + (right - left) / 2; + + let offset = binary_index.get(mid); + + if needle >= self.get_key_at(offset) { + left = mid + 1; + } else { + right = mid; + } + } + + if left == 0 { + return None; + } + + let offset = binary_index.get(left - 1); + + Some(offset) + } + + #[must_use] + pub fn get_lowest_possible_block(&self, needle: &[u8]) -> Option { + let binary_index = self.get_binary_index_reader(); + + let offset = self.binary_search_for_offset(&binary_index, needle)?; + + // SAFETY: pos is always retrieved from the binary index, + // which we consider to be trustworthy + #[warn(unsafe_code)] + let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); + + let item = Self::parse_restart_head(&mut cursor); + + Some(NewKeyedBlockHandle { + offset: item.offset, + size: item.size, + end_key: self + .inner + .data + .slice(item.key_start..(item.key_start + item.key_len)), + }) + + /* let binary_index = self.get_binary_index_reader(); + + // NOTE: Currently, the hash index is never initialized for index blocks + /* // NOTE: Try hash index if it exists + if let Some(bucket_value) = self + .get_hash_index_reader() + .and_then(|reader| reader.get(key)) + { + let restart_entry_pos = binary_index.get(usize::from(bucket_value)); + return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); + } */ + + // NOTE: Fallback to binary search + + let mut left = 0; + let mut right = binary_index.len(); + + if right == 0 { + return Ok(None); + } + + while left < right { + let mid = (left + right) / 2; + + let offset = binary_index.get(mid); + + if key >= self.get_key_at(offset)? 
+                left = mid + 1;
+            } else {
+                right = mid;
+            }
+        }
+
+        if left == 0 {
+            return Ok(None);
+        }
+
+        let offset = binary_index.get(left - 1);
+
+        self.walk(key, offset, self.restart_interval.into()) */
+    }
+
+    pub fn encode_items(
+        items: &[NewKeyedBlockHandle],
+        hash_index_ratio: f32,
+    ) -> crate::Result<Vec<u8>> {
+        let first_key = &items.first().expect("chunk should not be empty").end_key;
+
+        let mut serializer = Encoder::<'_, BlockOffset, NewKeyedBlockHandle>::new(
+            items.len(),
+            1, // TODO: hard-coded for now
+            hash_index_ratio,
+            first_key,
+        );
+
+        for item in items {
+            serializer.write(item)?;
+        }
+
+        serializer.finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{segment::block::offset::BlockOffset, super_segment::block::Header, Checksum};
+    use test_log::test;
+
+    #[test]
+    fn v3_index_block_simple() -> crate::Result<()> {
+        let items = [
+            NewKeyedBlockHandle {
+                end_key: b"a".into(),
+                offset: BlockOffset(0),
+                size: 6_000,
+            },
+            NewKeyedBlockHandle {
+                end_key: b"abcdef".into(),
+                offset: BlockOffset(6_000),
+                size: 7_000,
+            },
+            NewKeyedBlockHandle {
+                end_key: b"def".into(),
+                offset: BlockOffset(13_000),
+                size: 5_000,
+            },
+        ];
+
+        let bytes = IndexBlock::encode_items(&items, 0.0)?;
+        /* eprintln!("{bytes:?}");
+        eprintln!("{}", String::from_utf8_lossy(&bytes));
+        eprintln!("encoded into {} bytes", bytes.len()); */
+
+        let data_block = IndexBlock::new(Block {
+            data: bytes.into(),
+            header: Header {
+                checksum: Checksum::from_raw(0),
+                data_length: 0,
+                uncompressed_length: 0,
+                previous_block_offset: BlockOffset(0),
+            },
+        });
+
+        assert_eq!(data_block.item_count(), items.len());
+
+        for needle in items {
+            // eprintln!("NEEDLE {needle:?}");
+
+            assert_eq!(
+                Some(needle.clone()),
+                data_block.get_lowest_possible_block(&needle.end_key),
+            );
+        }
+
+        assert_eq!(
+            Some(NewKeyedBlockHandle {
+                end_key: b"abcdef".into(),
+                offset: BlockOffset(6_000),
+                size: 7_000,
+            }),
+            data_block.get_lowest_possible_block(b"ccc"),
+        );
+
+        Ok(())
+    }
+}
diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs
index b749e99e..ccdc1c03 100644
--- a/src/super_segment/mod.rs
+++ b/src/super_segment/mod.rs
@@ -6,9 +6,9 @@ pub(crate) mod binary_index;
 pub mod block;
 pub(crate) mod data_block;
 pub(crate) mod hash_index;
-// mod index_block;
+mod index_block;
 pub(crate) mod util;
 
 pub use block::Block;
 pub use data_block::DataBlock;
-// pub use index_block::IndexBlock;
+pub use index_block::IndexBlock;

From bdd1e176585010988333e8841b59712f0f4dfb50 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 10 Apr 2025 01:55:18 +0200
Subject: [PATCH 043/613] refactor

---
 src/super_segment/data_block/mod.rs | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs
index 3413b247..29450ccf 100644
--- a/src/super_segment/data_block/mod.rs
+++ b/src/super_segment/data_block/mod.rs
@@ -646,9 +646,6 @@ mod tests {
         ];
 
         let bytes = DataBlock::encode_items(&items, 16, 0.75)?;
-        eprintln!("{bytes:?}");
-        eprintln!("{}", String::from_utf8_lossy(&bytes));
-        eprintln!("encoded into {} bytes", bytes.len());
 
         let data_block = DataBlock::new(Block {
             data: bytes.into(),
@@ -683,9 +680,6 @@ mod tests {
         ];
 
         let bytes = DataBlock::encode_items(&items, 16, 0.75)?;
-        eprintln!("{bytes:?}");
-        eprintln!("{}", String::from_utf8_lossy(&bytes));
-        eprintln!("encoded into {} bytes", bytes.len());
 
         let data_block = DataBlock::new(Block {
             data: bytes.into(),
@@ -722,9 +716,6 @@ mod tests {
         ];
 
         let bytes =
DataBlock::encode_items(&items, 2, 0.0)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -866,9 +857,6 @@ mod tests { ]; let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -906,9 +894,6 @@ mod tests { ]; let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -947,9 +932,6 @@ mod tests { ]; let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -989,9 +971,6 @@ mod tests { ]; let bytes = DataBlock::encode_items(&items, 16, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1030,9 +1009,6 @@ mod tests { ]; let bytes = DataBlock::encode_items(&items, 1, 0.75)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); let data_block = DataBlock::new(Block { data: bytes.into(), From a88a73aa98c67647d06510733af198ee67c87160 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 10 Apr 2025 02:26:24 +0200 Subject: [PATCH 044/613] wip fuzz --- fuzz/fuzz_targets/data_block.rs | 301 ++++++++++++------------ src/super_segment/hash_index/builder.rs | 2 +- 2 files changed, 158 insertions(+), 145 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 5311f1d0..4b1c12c7 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -58,180 +58,193 @@ fuzz_target!(|data: &[u8]| { let restart_interval = u8::arbitrary(&mut unstructured).unwrap().max(1); - let hash_ratio = ((u16::arbitrary(&mut unstructured).unwrap() / u16::MAX) as f32) - .min(1.0) - .max(0.0); - let seed = u64::arbitrary(&mut unstructured).unwrap(); // eprintln!("restart_interval={restart_interval}, hash_ratio={hash_ratio}"); - if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { - // let mut items = items.to_vec(); - - if !items.is_empty() { - items.sort(); - items.dedup(); - - /* eprintln!("-- items --"); - for item in items.iter().map(|value| &value.0) { - eprintln!( - r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, - item.key.user_key, item.value, item.key.seqno, item.key.value_type, - ); - } */ - - /* if items.len() > 100 { - eprintln!("================== {}. 
", items.len()); - } */ - - let items = items.into_iter().map(|value| value.0).collect::>(); - let bytes = - DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: lsm_tree::super_segment::block::Header { - checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - if data_block.binary_index_len() > 254 { - assert!(data_block.hash_bucket_count().is_none()); - } else if hash_ratio > 0.0 { - assert!(data_block.hash_bucket_count().unwrap() > 0); - } - - // eprintln!("{items:?}"); - - for needle in &items { - if needle.key.seqno == SeqNo::MAX { - continue; - } + let item_count = { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.random_range(1..1_000) + }; + + let hash_ratio = { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.random_range(0.0..4.0) + }; + + let mut items = (0..item_count) + .map(|_| FuzzyValue::arbitrary(&mut unstructured).unwrap()) + .collect::>(); + + assert!(!items.is_empty()); + + items.sort(); + items.dedup(); + + /* eprintln!("-- items --"); + for item in items.iter().map(|value| &value.0) { + eprintln!( + r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + item.key.user_key, item.value, item.key.seqno, item.key.value_type, + ); + } */ + + /* if items.len() > 100 { + eprintln!("================== {}. ", items.len()); + } */ + + let items = items.into_iter().map(|value| value.0).collect::>(); + let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: lsm_tree::super_segment::block::Header { + checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + if data_block.binary_index_len() > 254 { + assert!(data_block.hash_bucket_count().is_none()); + } else if hash_ratio > 0.0 { + assert!(data_block.hash_bucket_count().unwrap() > 0); + } - // eprintln!("needle: {:?}", needle.key); + // eprintln!("{items:?}"); - assert_eq!( - Some(needle.clone()), - data_block - .point_read(&needle.key.user_key, Some(needle.key.seqno + 1)) - .unwrap(), - ); - } + for needle in &items { + if needle.key.seqno == SeqNo::MAX { + continue; + } - assert_eq!(items, data_block.iter().collect::>()); + // eprintln!("needle: {:?}", needle.key); - assert_eq!( - items.iter().rev().cloned().collect::>(), - data_block.iter().rev().collect::>(), - ); + assert_eq!( + Some(needle.clone()), + data_block + .point_read(&needle.key.user_key, Some(needle.key.seqno + 1)) + .unwrap(), + ); + } - { - let ping_pongs = generate_ping_pong_code(seed, items.len()); + assert_eq!(items, data_block.iter().collect::>()); - let expected_ping_ponged_items = { - let mut iter = items.iter(); - let mut v = vec![]; + assert_eq!( + items.iter().rev().cloned().collect::>(), + data_block.iter().rev().collect::>(), + ); - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().cloned().unwrap()); - } else { - v.push(iter.next_back().cloned().unwrap()); - } - } + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); - v - }; + 
let expected_ping_ponged_items = { + let mut iter = items.iter(); + let mut v = vec![]; - let real_ping_ponged_items = { - let mut iter = data_block.iter(); - let mut v = vec![]; + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().unwrap()); - } else { - v.push(iter.next_back().unwrap()); - } - } + v + }; - v - }; + let real_ping_ponged_items = { + let mut iter = data_block.iter(); + let mut v = vec![]; - assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } } - { - let ping_pongs = generate_ping_pong_code(seed, items.len()); + v + }; - let expected_ping_ponged_items = { - let mut iter = items.iter().rev(); - let mut v = vec![]; + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().cloned().unwrap()); - } else { - v.push(iter.next_back().cloned().unwrap()); - } - } + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); - v - }; + let expected_ping_ponged_items = { + let mut iter = items.iter().rev(); + let mut v = vec![]; - let real_ping_ponged_items = { - let mut iter = data_block.iter().rev(); - let mut v = vec![]; + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().unwrap()); - } else { - v.push(iter.next_back().unwrap()); - } - } + v + }; - v - }; + let real_ping_ponged_items = { + let mut iter = data_block.iter().rev(); + let mut v = vec![]; - assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } } - { - use rand::prelude::*; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; + v + }; - let mut rng = ChaCha8Rng::seed_from_u64(seed); - let mut lo = rng.random_range(0..items.len()); - let mut hi = rng.random_range(0..items.len()); + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } - if lo > hi { - std::mem::swap(&mut lo, &mut hi); - } + { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; - let lo_key = &items[lo].key.user_key; - let hi_key = &items[hi].key.user_key; - - let expected_range: Vec<_> = items - .iter() - .filter(|kv| kv.key.user_key >= lo_key && kv.key.user_key <= hi_key) - .cloned() - .collect(); - - assert_eq!( - expected_range, - data_block - .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) - .collect::>(), - ); - } + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut lo = rng.random_range(0..items.len()); + let mut hi = rng.random_range(0..items.len()); + + if lo > hi { + std::mem::swap(&mut lo, &mut hi); } + + let lo_key = &items[lo].key.user_key; + let hi_key = &items[hi].key.user_key; + + let expected_range: Vec<_> = items + .iter() + .filter(|kv| kv.key.user_key >= lo_key && kv.key.user_key <= hi_key) + .cloned() + .collect(); + + assert_eq!( + expected_range, + data_block + .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) + .collect::>(), + ); } }); diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/hash_index/builder.rs index 37aec8f5..cb08108f 100644 --- a/src/super_segment/hash_index/builder.rs +++ 
b/src/super_segment/hash_index/builder.rs @@ -14,7 +14,7 @@ pub struct Builder(Vec); impl Builder { /// Initializes a new builder with the given amount of buckets. pub fn new(bucket_count: u32) -> Self { - Self(vec![MARKER_FREE; bucket_count as usize]) + Self(vec![MARKER_FREE; (bucket_count as usize).max(1)]) } // NOTE: We know the hash index has a bucket count <= u8 From 3232c14e368d1db90043d99469603e19b74a3e81 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 17:37:57 +0200 Subject: [PATCH 045/613] remove old test files --- test_fixture/v1_export | Bin 146 -> 0 bytes test_fixture/v1_export_corrupt | Bin 146 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test_fixture/v1_export delete mode 100644 test_fixture/v1_export_corrupt diff --git a/test_fixture/v1_export b/test_fixture/v1_export deleted file mode 100644 index 6fa42ddefef1495a1f7be4fe5fdcd9a9a2438806..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 146 zcmZQzOk`kS5cSW{QAo+pQ!h~{PEA#)%r8~Q%P(hOOajVDh9_p0z+|&Zi%S&JGs{vH z@=Hq?7?XkW62AFBZdy@(E=XUILViw)LPmaRaVi623P?I6BePf`IkTivAvrNmAt_a% fIA0+rzZ4_~0xT9w&b_p{>J`55nu=anBOWc diff --git a/test_fixture/v1_export_corrupt b/test_fixture/v1_export_corrupt deleted file mode 100644 index 4c734831f8fd68e7b4b34dd164fed3337e71d2be..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 146 zcmZQzOk`kS5cSW{QAo+pQ!h~{PEA#)%r8~Q%P(hOOajVDh9_p0z+|&Zi%S&JGs{vH z@=Hq?7?XkW62AFisS0UD`MDr{MGE;jDGC|+rNyZXj42@Lkc`Y?h2+eVN`>UaJcXoG gh2ngLocvOd90;&jEIIem>W)*2Pq1%@Yeaw{0FOT^4gdfE From 13aa5610bc8d06b1c34bfa021b5d6d263e3d3521 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 17:38:16 +0200 Subject: [PATCH 046/613] adjust v2 load tests --- tests/tree_v2_load_fixture.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/tree_v2_load_fixture.rs b/tests/tree_v2_load_fixture.rs index 7443389c..e85065a6 100644 --- a/tests/tree_v2_load_fixture.rs +++ b/tests/tree_v2_load_fixture.rs @@ -1,12 +1,16 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::Config; use test_log::test; #[test] fn tree_load_v2() -> lsm_tree::Result<()> { let folder = "test_fixture/v2_tree"; - let tree = Config::new(folder).open()?; - assert_eq!(5, tree.len(None, None)?); + let result = Config::new(folder).open(); + + matches!( + result, + Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V2)) + ); Ok(()) } @@ -15,8 +19,12 @@ fn tree_load_v2() -> lsm_tree::Result<()> { fn tree_load_v2_corrupt() -> lsm_tree::Result<()> { let folder = "test_fixture/v2_tree_corrupt"; - let result = Config::new(folder).open()?; - assert_eq!(1, result.verify()?); + let result = Config::new(folder).open(); + + matches!( + result, + Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V2)) + ); Ok(()) } From 53d49f76cb4836de78d2fbcbba97818ff37da7b1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 17:39:08 +0200 Subject: [PATCH 047/613] update file format version --- src/file.rs | 2 +- src/tree/mod.rs | 4 ++-- src/version.rs | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/file.rs b/src/file.rs index f0cc62b7..a258d654 100644 --- a/src/file.rs +++ b/src/file.rs @@ -4,7 +4,7 @@ use std::{io::Write, path::Path}; -pub const MAGIC_BYTES: [u8; 4] = [b'L', b'S', b'M', 2]; +pub const MAGIC_BYTES: [u8; 4] = [b'L', b'S', b'M', 3]; pub const MANIFEST_FILE: &str = "manifest"; pub const SEGMENTS_FOLDER: &str = "segments"; diff --git a/src/tree/mod.rs 
b/src/tree/mod.rs index 2340d687..c84be184 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -806,7 +806,7 @@ impl Tree { let mut bytes = Cursor::new(bytes); let manifest = Manifest::decode_from(&mut bytes)?; - if manifest.version != Version::V2 { + if manifest.version != Version::V3 { return Err(crate::Error::InvalidVersion(manifest.version)); } @@ -861,7 +861,7 @@ impl Tree { // -> the LSM is fully initialized let mut file = File::create(manifest_path)?; Manifest { - version: Version::V2, + version: Version::V3, level_count: config.level_count, tree_type: config.tree_type, table_type: TableType::Block, diff --git a/src/version.rs b/src/version.rs index 9560bbc6..19862ad8 100644 --- a/src/version.rs +++ b/src/version.rs @@ -10,6 +10,9 @@ pub enum Version { /// Version for 2.x.x releases V2, + + /// Version for 3.x.x releases + V3, } impl std::fmt::Display for Version { @@ -23,6 +26,7 @@ impl From for u8 { match value { Version::V1 => 1, Version::V2 => 2, + Version::V3 => 3, } } } @@ -34,6 +38,7 @@ impl TryFrom for Version { match value { 1 => Ok(Self::V1), 2 => Ok(Self::V2), + 3 => Ok(Self::V3), _ => Err(()), } } From d358362206e6faffe68bb7a9c33c525ea1cc3d25 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:35:19 +0200 Subject: [PATCH 048/613] wip --- src/super_segment/index_block/block_handle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_segment/index_block/block_handle.rs b/src/super_segment/index_block/block_handle.rs index 4a0c0485..2e29185b 100644 --- a/src/super_segment/index_block/block_handle.rs +++ b/src/super_segment/index_block/block_handle.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{segment::block::offset::BlockOffset, super_segment::block::Encodable}; +use crate::super_segment::block::{BlockOffset, Encodable}; use value_log::UserKey; use varint_rs::VarintWriter; From e2ca4504b95069e356c71994f4c5094bf46195ba Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:35:53 +0200 Subject: [PATCH 049/613] move files --- .../{ => block}/binary_index/builder.rs | 0 .../{ => block}/binary_index/mod.rs | 0 .../{ => block}/binary_index/reader.rs | 4 +++ .../{ => block}/hash_index/builder.rs | 12 ++++++-- .../{ => block}/hash_index/mod.rs | 0 .../{ => block}/hash_index/reader.rs | 0 src/super_segment/block/mod.rs | 4 +++ src/super_segment/block/offset.rs | 29 +++++++++++++++++++ 8 files changed, 47 insertions(+), 2 deletions(-) rename src/super_segment/{ => block}/binary_index/builder.rs (100%) rename src/super_segment/{ => block}/binary_index/mod.rs (100%) rename src/super_segment/{ => block}/binary_index/reader.rs (85%) rename src/super_segment/{ => block}/hash_index/builder.rs (87%) rename src/super_segment/{ => block}/hash_index/mod.rs (100%) rename src/super_segment/{ => block}/hash_index/reader.rs (100%) create mode 100644 src/super_segment/block/offset.rs diff --git a/src/super_segment/binary_index/builder.rs b/src/super_segment/block/binary_index/builder.rs similarity index 100% rename from src/super_segment/binary_index/builder.rs rename to src/super_segment/block/binary_index/builder.rs diff --git a/src/super_segment/binary_index/mod.rs b/src/super_segment/block/binary_index/mod.rs similarity index 100% rename from src/super_segment/binary_index/mod.rs rename to src/super_segment/block/binary_index/mod.rs diff --git a/src/super_segment/binary_index/reader.rs 
b/src/super_segment/block/binary_index/reader.rs similarity index 85% rename from src/super_segment/binary_index/reader.rs rename to src/super_segment/block/binary_index/reader.rs index c10130f8..3ed69d0a 100644 --- a/src/super_segment/binary_index/reader.rs +++ b/src/super_segment/block/binary_index/reader.rs @@ -26,6 +26,8 @@ impl<'a> Reader<'a> { let end = offset + size; Self { + // NOTE: We consider the caller to be trustworthy + #[warn(clippy::indexing_slicing)] bytes: &bytes[offset..end], step_size, } @@ -38,6 +40,8 @@ impl<'a> Reader<'a> { pub(crate) fn get(&self, idx: usize) -> usize { let offset = idx * self.step_size; + // NOTE: We consider the caller to be trustworthy + #[warn(clippy::indexing_slicing)] let mut bytes = &self.bytes[offset..]; if self.step_size == 2 { diff --git a/src/super_segment/hash_index/builder.rs b/src/super_segment/block/hash_index/builder.rs similarity index 87% rename from src/super_segment/hash_index/builder.rs rename to src/super_segment/block/hash_index/builder.rs index cb08108f..063d91a1 100644 --- a/src/super_segment/hash_index/builder.rs +++ b/src/super_segment/block/hash_index/builder.rs @@ -5,7 +5,8 @@ use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; use byteorder::WriteBytesExt; -pub const MAX_POINTERS_FOR_HASH_INDEX: u8 = u8::MAX - 2; +/// With 254, pointers [0 - 253] can be indexed. +pub const MAX_POINTERS_FOR_HASH_INDEX: usize = 254; /// Builds a block hash index #[derive(Debug)] @@ -14,7 +15,7 @@ pub struct Builder(Vec); impl Builder { /// Initializes a new builder with the given amount of buckets. pub fn new(bucket_count: u32) -> Self { - Self(vec![MARKER_FREE; (bucket_count as usize).max(1)]) + Self(vec![MARKER_FREE; bucket_count as usize]) } // NOTE: We know the hash index has a bucket count <= u8 @@ -26,6 +27,13 @@ impl Builder { /// Tries to map the given key to the binary index position. pub fn set(&mut self, key: &[u8], binary_index_pos: u8) -> bool { + debug_assert!( + binary_index_pos <= 253, + "restart index too high for hash index" + ); + + assert!(self.bucket_count() > 0, "no buckets to insert into"); + let bucket_pos = calculate_bucket_position(key, self.bucket_count()); // SAFETY: We use modulo in `calculate_bucket_position` diff --git a/src/super_segment/hash_index/mod.rs b/src/super_segment/block/hash_index/mod.rs similarity index 100% rename from src/super_segment/hash_index/mod.rs rename to src/super_segment/block/hash_index/mod.rs diff --git a/src/super_segment/hash_index/reader.rs b/src/super_segment/block/hash_index/reader.rs similarity index 100% rename from src/super_segment/hash_index/reader.rs rename to src/super_segment/block/hash_index/reader.rs diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs index e6f62d16..7635a764 100644 --- a/src/super_segment/block/mod.rs +++ b/src/super_segment/block/mod.rs @@ -2,8 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +pub(crate) mod binary_index; +mod checksum; mod encoder; +pub(crate) mod hash_index; mod header; +mod offset; mod trailer; pub(crate) use encoder::{Encodable, Encoder}; diff --git a/src/super_segment/block/offset.rs b/src/super_segment/block/offset.rs new file mode 100644 index 00000000..4f023296 --- /dev/null +++ b/src/super_segment/block/offset.rs @@ -0,0 +1,29 @@ +// TODO: rename FileOffset? 
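The newtype below tracks the absolute byte position of a block inside the segment file. With the `Deref` and `AddAssign` impls that follow, a writer can advance its cursor without unwrapping; illustratively:

    let mut pos = BlockOffset(0);
    pos += 512u64;           // skip past a 512-byte block
    pos += BlockOffset(128); // offsets can be added onto each other, too
    assert_eq!(*pos, 640);   // `Deref` exposes the inner u64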
+#[derive(Copy, Clone, Default, Debug, std::hash::Hash, PartialEq, Eq, Ord, PartialOrd)]
+pub struct BlockOffset(pub u64);
+
+impl std::ops::Deref for BlockOffset {
+    type Target = u64;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl std::ops::AddAssign for BlockOffset {
+    fn add_assign(&mut self, rhs: Self) {
+        *self += *rhs;
+    }
+}
+
+impl std::ops::AddAssign<u64> for BlockOffset {
+    fn add_assign(&mut self, rhs: u64) {
+        self.0 += rhs;
+    }
+}
+
+impl std::fmt::Display for BlockOffset {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}

From cfc6e691d9385687a06510733af198ee67c87160 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Fri, 11 Apr 2025 23:36:07 +0200
Subject: [PATCH 050/613] new block handle

---
 src/super_segment/index_block/block_handle.rs | 134 +++++++++++++++---
 1 file changed, 114 insertions(+), 20 deletions(-)

diff --git a/src/super_segment/index_block/block_handle.rs b/src/super_segment/index_block/block_handle.rs
index 2e29185b..589fa337 100644
--- a/src/super_segment/index_block/block_handle.rs
+++ b/src/super_segment/index_block/block_handle.rs
@@ -2,39 +2,133 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
 
-use crate::super_segment::block::{BlockOffset, Encodable};
+use crate::{
+    coding::{Decode, DecodeError, Encode, EncodeError},
+    super_segment::block::{BlockOffset, Encodable},
+};
 use value_log::UserKey;
-use varint_rs::VarintWriter;
+use varint_rs::{VarintReader, VarintWriter};
 
 /// Points to a block on file
-#[derive(Clone, Debug, Eq)]
+#[derive(Copy, Clone, Debug, Default, Eq)]
 #[allow(clippy::module_name_repetitions)]
-pub struct NewKeyedBlockHandle {
-    /// Key of last item in block
-    pub end_key: UserKey,
-
+pub struct NewBlockHandle {
     /// Position of block in file
-    pub offset: BlockOffset,
+    offset: BlockOffset,
 
     /// Size of block in bytes
-    pub size: u32,
+    size: u32,
 }
 
-impl Ord for NewKeyedBlockHandle {
+impl NewBlockHandle {
+    pub fn new(offset: BlockOffset, size: u32) -> Self {
+        Self { offset, size }
+    }
+
+    pub fn size(&self) -> u32 {
+        self.size
+    }
+
+    pub fn offset(&self) -> BlockOffset {
+        self.offset
+    }
+}
+
+impl PartialEq for NewBlockHandle {
+    fn eq(&self, other: &Self) -> bool {
+        self.offset == other.offset
+    }
+}
+
+impl Ord for NewBlockHandle {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         self.offset.cmp(&other.offset)
     }
 }
 
-impl PartialOrd for NewKeyedBlockHandle {
+impl PartialOrd for NewBlockHandle {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.offset.cmp(&other.offset))
     }
 }
 
+impl Encode for NewBlockHandle {
+    fn encode_into<W: std::io::Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
+        writer.write_u64_varint(*self.offset)?;
+        writer.write_u32_varint(self.size)?;
+        Ok(())
+    }
+}
+
+impl Decode for NewBlockHandle {
+    fn decode_from<R: std::io::Read>(reader: &mut R) -> Result<Self, DecodeError>
+    where
+        Self: Sized,
+    {
+        let offset = reader.read_u64_varint()?;
+        let size = reader.read_u32_varint()?;
+
+        Ok(Self {
+            offset: BlockOffset(offset),
+            size,
+        })
+    }
+}
+
+/// Points to a block on file
+#[derive(Clone, Debug, Eq)]
+#[allow(clippy::module_name_repetitions)]
+pub struct NewKeyedBlockHandle {
+    /// Key of last item in block
+    end_key: UserKey,
+
+    inner: NewBlockHandle,
+}
+
+impl NewKeyedBlockHandle {
+    pub fn shift(&mut self, delta: BlockOffset) {
+        self.inner.offset += delta;
+    }
+
+    pub fn size(&self) -> u32 {
+        self.inner.size()
+    }
+
+    pub fn offset(&self) -> BlockOffset {
+        self.inner.offset()
+    }
+
+    pub
fn end_key(&self) -> &UserKey { + &self.end_key + } + + pub fn into_end_key(self) -> UserKey { + self.end_key + } + + pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { + Self { + end_key, + inner: NewBlockHandle::new(offset, size), + } + } +} + impl PartialEq for NewKeyedBlockHandle { fn eq(&self, other: &Self) -> bool { - self.offset == other.offset + self.offset() == other.offset() + } +} + +impl Ord for NewKeyedBlockHandle { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.offset().cmp(&other.offset()) + } +} + +impl PartialOrd for NewKeyedBlockHandle { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.offset().cmp(&other.offset())) } } @@ -48,12 +142,11 @@ impl Encodable for NewKeyedBlockHandle { // [offset] [size] [key len] [end key] // 1 2 3 4 - writer.write_u64_varint(*self.offset)?; // 1 - writer.write_u32_varint(self.size)?; // 2 + self.inner.encode_into(writer)?; // 1, 2 writer.write_u16_varint(self.end_key.len() as u16)?; // 3 writer.write_all(&self.end_key)?; // 4 - *state = BlockOffset(*self.offset + u64::from(self.size)); + *state = BlockOffset(*self.offset() + u64::from(self.size())); Ok(()) } @@ -66,22 +159,23 @@ impl Encodable for NewKeyedBlockHandle { ) -> crate::Result<()> { // We encode truncated handles as: // [size] [shared prefix len] [rest key len] [rest key] + // 1 2 3 4 - writer.write_u32_varint(self.size)?; + writer.write_u32_varint(self.size())?; // 1 // TODO: maybe we can skip this varint altogether if prefix truncation = false - writer.write_u16_varint(shared_len as u16)?; + writer.write_u16_varint(shared_len as u16)?; // 2 // NOTE: We can safely cast to u16, because keys are u16 long max #[allow(clippy::cast_possible_truncation)] let rest_len = self.end_key.len() - shared_len; - writer.write_u16_varint(rest_len as u16)?; + writer.write_u16_varint(rest_len as u16)?; // 3 let truncated_user_key = self.end_key.get(shared_len..).expect("should be in bounds"); - writer.write_all(truncated_user_key)?; + writer.write_all(truncated_user_key)?; // 4 - *state += u64::from(self.size); + *state += u64::from(self.size()); Ok(()) } From d51cbbdfc44d0a87f01d5c30f1e949f836ddb1b0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:36:25 +0200 Subject: [PATCH 051/613] refactor --- src/super_segment/index_block/mod.rs | 73 ++++++++++------------------ 1 file changed, 25 insertions(+), 48 deletions(-) diff --git a/src/super_segment/index_block/mod.rs b/src/super_segment/index_block/mod.rs index 099439aa..403d88da 100644 --- a/src/super_segment/index_block/mod.rs +++ b/src/super_segment/index_block/mod.rs @@ -4,12 +4,11 @@ mod block_handle; -pub use block_handle::NewKeyedBlockHandle; +pub use block_handle::{NewBlockHandle, NewKeyedBlockHandle}; -use super::{binary_index::Reader as BinaryIndexReader, block::Encoder, Block}; -use crate::{ - segment::{block::offset::BlockOffset, trailer::TRAILER_SIZE}, - super_segment::block::Trailer, +use super::{ + block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer}, + Block, }; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Seek}; @@ -85,11 +84,6 @@ impl IndexBlock { false } - /// Returns the trailer position. 
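To make the `[offset] [size] [key len] [end key]` layout annotated in `encode_full_into` above concrete, here is a worked example, assuming protobuf-style LEB128 varints (which is what `varint_rs` emits). A handle with `offset = 6_000`, `size = 7_000`, `end_key = b"abcdef"` encodes as:

    offset   0xF0 0x2E   (6_000 as u64 varint, 2 bytes)
    size     0xD8 0x36   (7_000 as u32 varint, 2 bytes)
    key len  0x06        (6 as u16 varint, 1 byte)
    end key  "abcdef"    (6 bytes)

That is 11 bytes in total, versus 8 + 4 + 2 + 6 = 20 bytes with fixed-width integers.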
- fn trailer_offset(data: &[u8]) -> usize { - data.len() - TRAILER_SIZE - } - /// Access the inner raw bytes #[must_use] fn bytes(&self) -> &[u8] { @@ -312,14 +306,12 @@ impl IndexBlock { let item = Self::parse_restart_head(&mut cursor); - Some(NewKeyedBlockHandle { - offset: item.offset, - size: item.size, - end_key: self - .inner - .data - .slice(item.key_start..(item.key_start + item.key_len)), - }) + let end_key = self + .inner + .data + .slice(item.key_start..(item.key_start + item.key_len)); + + Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) /* let binary_index = self.get_binary_index_reader(); @@ -363,16 +355,13 @@ impl IndexBlock { self.walk(key, offset, self.restart_interval.into()) */ } - pub fn encode_items( - items: &[NewKeyedBlockHandle], - hash_index_ratio: f32, - ) -> crate::Result> { - let first_key = &items.first().expect("chunk should not be empty").end_key; + pub fn encode_items(items: &[NewKeyedBlockHandle]) -> crate::Result> { + let first_key = items.first().expect("chunk should not be empty").end_key(); let mut serializer = Encoder::<'_, BlockOffset, NewKeyedBlockHandle>::new( items.len(), - 1, // TODO: hard-coded for now - hash_index_ratio, + 1, // TODO: hard-coded for now + 0.0, // TODO: hard-coded for now first_key, ); @@ -387,30 +376,18 @@ impl IndexBlock { #[cfg(test)] mod tests { use super::*; - use crate::{segment::block::offset::BlockOffset, super_segment::block::Header, Checksum}; + use crate::super_segment::block::{Checksum, Header}; use test_log::test; #[test] fn v3_index_block_simple() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle { - end_key: b"a".into(), - offset: BlockOffset(0), - size: 6_000, - }, - NewKeyedBlockHandle { - end_key: b"abcdef".into(), - offset: BlockOffset(6_000), - size: 7_000, - }, - NewKeyedBlockHandle { - end_key: b"def".into(), - offset: BlockOffset(13_000), - size: 5_000, - }, + NewKeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"abcdef".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 0.0)?; + let bytes = IndexBlock::encode_items(&items)?; /* eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); eprintln!("encoded into {} bytes", bytes.len()); */ @@ -432,16 +409,16 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.get_lowest_possible_block(&needle.end_key), + data_block.get_lowest_possible_block(needle.end_key()), ); } assert_eq!( - Some(NewKeyedBlockHandle { - end_key: b"abcdef".into(), - offset: BlockOffset(6_000), - size: 7_000, - }), + Some(NewKeyedBlockHandle::new( + b"abcdef".into(), + BlockOffset(6_000), + 7_000 + )), data_block.get_lowest_possible_block(b"ccc"), ); From 8a5f16be2896a12ca3e8dc408ff11df0cdcedca2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:36:39 +0200 Subject: [PATCH 052/613] fix hash index tests --- src/super_segment/data_block/mod.rs | 84 +++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 10 deletions(-) diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 29450ccf..ef27639b 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -6,12 +6,11 @@ mod iter; pub use iter::Iter; -use super::block::Trailer; -use super::block::{Encodable, Encoder}; -use super::hash_index::Reader as HashIndexReader; -use super::{binary_index::Reader as BinaryIndexReader, Block}; +use super::block::{ + 
binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader, Block, + Encodable, Encoder, Trailer, TRAILER_START_MARKER, +}; use crate::clipping_iter::ClippingIter; -use crate::super_segment::block::TRAILER_START_MARKER; use crate::super_segment::util::compare_prefixed_slice; use crate::{InternalValue, SeqNo, ValueType}; use byteorder::WriteBytesExt; @@ -521,7 +520,7 @@ impl DataBlock { .get_hash_index_reader() .map(|reader| reader.get(needle)) { - use super::hash_index::Lookup::{Conflicted, Found, NotFound}; + use super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; match lookup { Found(bucket_value) => { @@ -575,9 +574,11 @@ impl DataBlock { mod tests { use super::*; use crate::{ - segment::block::offset::BlockOffset, - super_segment::{block::Header, Block}, - Checksum, InternalValue, Slice, + super_segment::{ + block::{BlockOffset, Checksum, Header}, + Block, + }, + InternalValue, Slice, ValueType::{Tombstone, Value}, }; use std::cmp::Ordering::{Equal, Greater, Less}; @@ -1321,6 +1322,38 @@ mod tests { Ok(()) } + #[test] + fn v3_data_block_small_hash_ratio() -> crate::Result<()> { + let items = (0u64..254) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + // NOTE: If >0.0, buckets are at least 1 + let bytes = DataBlock::encode_items(&items, 1, 0.0001)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + Ok(()) + } + #[test] fn v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { let items = (0u64..254) @@ -1340,7 +1373,7 @@ mod tests { }); assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); + assert!(data_block.hash_bucket_count().unwrap() > 0); for needle in items { assert_eq!( @@ -1383,6 +1416,37 @@ mod tests { Ok(()) } + #[test] + fn v3_data_block_way_too_many_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..1_000) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + ); + } + + Ok(()) + } + #[test] fn v3_data_block_no_hash_index() -> crate::Result<()> { let items = (0u64..1) From 89586c666593ebadfa5786a70584f3b7ba69d9b4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:36:47 +0200 Subject: [PATCH 053/613] wip --- src/super_segment/block/trailer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_segment/block/trailer.rs b/src/super_segment/block/trailer.rs index 273681c7..5c72c718 100644 --- a/src/super_segment/block/trailer.rs +++ b/src/super_segment/block/trailer.rs @@ -6,7 +6,7 @@ use 
super::{ encoder::{Encodable, Encoder}, Block, }; -use crate::super_segment::hash_index::MAX_POINTERS_FOR_HASH_INDEX; +use crate::super_segment::block::hash_index::MAX_POINTERS_FOR_HASH_INDEX; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; pub const TRAILER_START_MARKER: u8 = 255; @@ -84,7 +84,7 @@ impl<'a> Trailer<'a> { // With the default restart interval of 16, that still gives us support // for up to ~4000 KVs if encoder.hash_index_builder.bucket_count() > 0 - && binary_index_len <= MAX_POINTERS_FOR_HASH_INDEX.into() + && binary_index_len <= MAX_POINTERS_FOR_HASH_INDEX { // NOTE: We know that data blocks will never even approach 4 GB in size #[allow(clippy::cast_possible_truncation)] From 91c9dbe0fa7bca22dae1196dada5db4ca2643cce Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:37:03 +0200 Subject: [PATCH 054/613] fix --- src/super_segment/block/encoder.rs | 28 ++++++++++++++++++++-------- src/super_segment/block/header.rs | 6 ++++-- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/super_segment/block/encoder.rs b/src/super_segment/block/encoder.rs index 2d875e22..85a038c5 100644 --- a/src/super_segment/block/encoder.rs +++ b/src/super_segment/block/encoder.rs @@ -2,9 +2,14 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::super::hash_index::Builder as HashIndexBuilder; -use super::{super::binary_index::Builder as BinaryIndexBuilder, Trailer}; -use crate::super_segment::util::longest_shared_prefix_length; +use super::{ + super::{ + block::binary_index::Builder as BinaryIndexBuilder, + block::hash_index::{Builder as HashIndexBuilder, MAX_POINTERS_FOR_HASH_INDEX}, + util::longest_shared_prefix_length, + }, + Trailer, +}; use std::marker::PhantomData; pub trait Encodable { @@ -57,7 +62,13 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { first_key: &'a [u8], ) -> Self { let binary_index_len = item_count / usize::from(restart_interval); - let bucket_count = (item_count as f32 * hash_index_ratio) as u32; // TODO: verify + + // TODO: verify + let bucket_count = if hash_index_ratio > 0.0 { + ((item_count as f32 * hash_index_ratio) as u32).max(1) + } else { + 0 + }; Self { phantom: PhantomData, @@ -107,11 +118,12 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { item.encode_truncated_into(&mut self.writer, &mut self.state, shared_prefix_len)?; } - if self.hash_index_builder.bucket_count() > 0 { - // NOTE: The max binary index is bound by u8 (technically u8::MAX - 2) + let restart_idx = self.restart_count - 1; + + if self.hash_index_builder.bucket_count() > 0 && restart_idx < MAX_POINTERS_FOR_HASH_INDEX { + // NOTE: The max binary index is bound to u8 by conditional #[allow(clippy::cast_possible_truncation)] - self.hash_index_builder - .set(item.key(), (self.restart_count - 1) as u8); + self.hash_index_builder.set(item.key(), restart_idx as u8); } self.item_count += 1; diff --git a/src/super_segment/block/header.rs b/src/super_segment/block/header.rs index 28a72537..455f8cc7 100644 --- a/src/super_segment/block/header.rs +++ b/src/super_segment/block/header.rs @@ -2,11 +2,13 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::coding::{Encode, EncodeError,Decode,DecodeError}; -use crate::{file::MAGIC_BYTES, segment::block::offset::BlockOffset, Checksum}; +use crate::coding::{Encode, EncodeError, Decode, DecodeError}; +use crate::file::MAGIC_BYTES; use 
byteorder::LittleEndian; use byteorder::{ReadBytesExt,WriteBytesExt}; use std::io::{Read, Write}; +use super::offset::BlockOffset; +use super::Checksum; /// Header of a disk-based block #[derive(Copy, Clone, Debug, Eq, PartialEq)] From 68c327ccf3346f954786c0c988fc99b927458ed8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:37:28 +0200 Subject: [PATCH 055/613] fix fuzz --- fuzz/fuzz_targets/data_block.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 4b1c12c7..84cac6fa 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -2,8 +2,7 @@ use arbitrary::{Arbitrary, Result, Unstructured}; use libfuzzer_sys::fuzz_target; use lsm_tree::{ - segment::block::offset::BlockOffset, - super_segment::{Block, DataBlock}, + super_segment::{block::BlockOffset, Block, DataBlock}, InternalValue, SeqNo, ValueType, }; @@ -107,7 +106,7 @@ fuzz_target!(|data: &[u8]| { let data_block = DataBlock::new(Block { data: bytes.into(), header: lsm_tree::super_segment::block::Header { - checksum: lsm_tree::segment::block::checksum::Checksum::from_raw(0), + checksum: lsm_tree::super_segment::Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, previous_block_offset: BlockOffset(0), From ed853cd832eec7d807ebc8d7671bd9ee8ff917a5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:38:26 +0200 Subject: [PATCH 056/613] block writer --- src/super_segment/block/checksum.rs | 30 +++ src/super_segment/block/mod.rs | 73 ++++-- src/super_segment/mod.rs | 4 +- src/super_segment/trailer.rs | 162 +++++++++++++ src/super_segment/writer/index.rs | 169 +++++++++++++ src/super_segment/writer/meta.rs | 62 +++++ src/super_segment/writer/mod.rs | 355 ++++++++++++++++++++++++++++ 7 files changed, 838 insertions(+), 17 deletions(-) create mode 100644 src/super_segment/block/checksum.rs create mode 100644 src/super_segment/trailer.rs create mode 100644 src/super_segment/writer/index.rs create mode 100644 src/super_segment/writer/meta.rs create mode 100644 src/super_segment/writer/mod.rs diff --git a/src/super_segment/block/checksum.rs b/src/super_segment/block/checksum.rs new file mode 100644 index 00000000..5b3f0edc --- /dev/null +++ b/src/super_segment/block/checksum.rs @@ -0,0 +1,30 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use xxhash_rust::xxh3::xxh3_64; + +/// A checksum based on xxh3 +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct Checksum(u64); + +impl std::ops::Deref for Checksum { + type Target = u64; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Checksum { + #[must_use] + pub fn from_raw(value: u64) -> Self { + Self(value) + } + + /// Calculates a checksum. 
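A usage sketch for the constructor that follows; xxh3 is a fast, non-cryptographic 64-bit hash, so this checksum detects accidental corruption but is not an integrity proof against an adversary:

    let payload = b"block payload";
    let checksum = Checksum::from_bytes(payload);
    assert_eq!(*checksum, xxh3_64(payload)); // `Deref` yields the raw u64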
+    #[must_use]
+    pub fn from_bytes(bytes: &[u8]) -> Self {
+        Self(xxh3_64(bytes))
+    }
+}
diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs
index 7635a764..21e9e432 100644
--- a/src/super_segment/block/mod.rs
+++ b/src/super_segment/block/mod.rs
@@ -10,12 +10,18 @@ mod header;
 mod offset;
 mod trailer;
 
+pub use checksum::Checksum;
 pub(crate) use encoder::{Encodable, Encoder};
 pub use header::Header;
+pub use offset::BlockOffset;
 pub(crate) use trailer::{Trailer, TRAILER_START_MARKER};
 
-use crate::{coding::Decode, segment::block::offset::BlockOffset, CompressionType, Slice};
+use crate::{
+    coding::{Decode, Encode},
+    CompressionType, Slice,
+};
 use std::fs::File;
+use xxhash_rust::xxh3::xxh3_64;
 
 /// A block on disk.
 ///
@@ -33,14 +39,49 @@ impl Block {
         self.data.len()
     }
 
+    pub fn to_writer<W: std::io::Write>(
+        mut writer: &mut W,
+        data: &[u8],
+        compression: CompressionType,
+    ) -> crate::Result<Header>
{ + let checksum = xxh3_64(data); + + let mut header = Header { + checksum: Checksum::from_raw(checksum), + data_length: 0, // <-- NOTE: Is set later on + uncompressed_length: data.len() as u32, + previous_block_offset: BlockOffset(0), // <-- TODO: + }; + + let data = match compression { + CompressionType::None => data, + CompressionType::Lz4 => &lz4_flex::compress(data), + CompressionType::Miniz(level) => &miniz_oxide::deflate::compress_to_vec(data, level), + }; + header.data_length = data.len() as u32; + + debug_assert!(header.data_length > 0); + + header.encode_into(&mut writer)?; + writer.write_all(data)?; + + log::trace!( + "Writing block with size {}B (compressed: {}B)", + header.uncompressed_length, + header.data_length, + ); + + Ok(header) + } + pub fn from_file( file: &File, offset: BlockOffset, - size: usize, + size: u32, compression: CompressionType, ) -> crate::Result { // TODO: use a Slice::get_mut instead... needs value-log update - let mut buf = byteview::ByteView::with_size(size); + let mut buf = byteview::ByteView::with_size(size as usize); { let mut mutator = buf.get_mut().expect("should be the owner"); @@ -48,6 +89,7 @@ impl Block { #[cfg(unix)] { use std::os::unix::fs::FileExt; + file.read_at(&mut mutator, *offset)?; } @@ -63,35 +105,29 @@ impl Block { } } - let header = Header::decode_from(&mut &*buf)?; - - debug_assert_eq!(header.uncompressed_length, { - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] - { - buf.get(Header::serialized_len()..) - .expect("should be in bounds") - .len() as u32 - } - }); + let header = Header::decode_from(&mut &buf[..])?; let data = match compression { CompressionType::None => buf.slice(Header::serialized_len()..), CompressionType::Lz4 => { - // NOTE: We that a header always exists and data is never empty + // NOTE: We know that a header always exists and data is never empty // So the slice is fine #[allow(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; let mut data = byteview::ByteView::with_size(header.uncompressed_length as usize); { + // NOTE: We know that we are the owner + #[allow(clippy::expect_used)] let mut mutator = data.get_mut().expect("should be the owner"); + lz4_flex::decompress_into(raw_data, &mut mutator) .map_err(|_| crate::Error::Decompress(compression))?; } data } CompressionType::Miniz(_) => { - // NOTE: We that a header always exists and data is never empty + // NOTE: We know that a header always exists and data is never empty // So the slice is fine #[allow(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; @@ -102,6 +138,13 @@ impl Block { } }; + debug_assert_eq!(header.uncompressed_length, { + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + data.len() as u32 + } + }); + Ok(Self { header, data: Slice::from(data), diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index ccdc1c03..14da79c6 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -2,12 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -pub(crate) mod binary_index; pub mod block; pub(crate) mod data_block; -pub(crate) mod hash_index; mod index_block; +mod trailer; pub(crate) mod util; +mod writer; pub use block::Block; pub use data_block::DataBlock; diff --git a/src/super_segment/trailer.rs b/src/super_segment/trailer.rs new file mode 100644 index 00000000..27f05a4a --- /dev/null +++ b/src/super_segment/trailer.rs @@ -0,0 +1,162 @@ +// Copyright (c) 
2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::index_block::NewBlockHandle; +use crate::{ + coding::{Decode, DecodeError, Encode, EncodeError}, + file::MAGIC_BYTES, + segment::trailer::TRAILER_SIZE, +}; +use std::{ + fs::File, + io::{BufReader, Read, Seek, Write}, + path::Path, +}; + +/// The segment trailer stores offsets to the different segment disk file "zones" +/// +/// ---------------- +/// | data blocks | <- implicitly start at 0 +/// |--------------| +/// | index blocks | +/// |--------------| +/// | tli block | +/// |--------------| +/// | filter block | <- may not exist +/// |--------------| +/// | ... TBD ... | +/// |--------------| +/// | meta block | +/// |--------------| +/// | trailer | <- fixed size +/// |--------------| +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +pub struct Trailer { + pub index_block: NewBlockHandle, + pub tli: NewBlockHandle, + pub filter: NewBlockHandle, + + // // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 + // pub range_tombstones: BlockOffset, + + // // TODO: prefix filter for l0, l1? + // pub pfx: BlockOffset, + + // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 + // pub range_filter: BlockOffset, + pub metadata: NewBlockHandle, +} + +impl Trailer { + /// Returns the on-disk size + #[must_use] + pub const fn serialized_len() -> usize { + 4 * std::mem::size_of::() + } + + pub fn write_into(&self, writer: &mut W) -> crate::Result<()> { + let mut v = Vec::with_capacity(TRAILER_SIZE); + + v.write_all(&MAGIC_BYTES)?; + + self.encode_into(&mut v)?; + + // Pad with remaining bytes + v.resize(TRAILER_SIZE, 0); + + assert_eq!( + v.len(), + TRAILER_SIZE, + "segment file trailer has invalid size" + ); + + writer.write_all(&v)?; + + Ok(()) + } + + pub fn from_file(path: &Path) -> crate::Result { + let file = File::open(path)?; + let mut reader = BufReader::new(file); + reader.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; + + // Check trailer magic header + let mut magic = [0u8; MAGIC_BYTES.len()]; + reader.read_exact(&mut magic)?; + + // Parse pointers + let trailer = Self::decode_from(&mut reader)?; + + if magic != MAGIC_BYTES { + return Err(crate::Error::Decode(DecodeError::InvalidHeader( + "SegmentTrailer", + ))); + } + + debug_assert!(*trailer.index_block.offset() > 0); + debug_assert!(*trailer.tli.offset() > 0); + debug_assert!(*trailer.metadata.offset() > 0); + + Ok(trailer) + } +} + +impl Encode for Trailer { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + self.index_block.encode_into(writer)?; + self.tli.encode_into(writer)?; + self.filter.encode_into(writer)?; + self.metadata.encode_into(writer)?; + Ok(()) + } +} + +impl Decode for Trailer { + fn decode_from(reader: &mut R) -> Result { + let index_block = NewBlockHandle::decode_from(reader)?; + let tli = NewBlockHandle::decode_from(reader)?; + let filter = NewBlockHandle::decode_from(reader)?; + let metadata = NewBlockHandle::decode_from(reader)?; + + Ok(Self { + index_block, + tli, + filter, + metadata, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::super_segment::BlockOffset; + use std::io::Cursor; + use test_log::test; + + #[test] + fn file_offsets_roundtrip() -> crate::Result<()> { + let before = Trailer { + index_block: NewBlockHandle::new(BlockOffset(15), 5), + tli: NewBlockHandle::new(BlockOffset(20), 5), + filter: NewBlockHandle::new(BlockOffset(25), 5), + metadata: 
NewBlockHandle::new(BlockOffset(30), 5),
+        };
+
+        let buf = before.encode_into_vec();
+
+        let mut cursor = Cursor::new(buf);
+        let after = Trailer::decode_from(&mut cursor)?;
+
+        assert_eq!(after, before);
+
+        Ok(())
+    }
+
+    #[test]
+    fn file_offsets_serialized_len() {
+        let buf = Trailer::default().encode_into_vec();
+        assert_eq!(Trailer::serialized_len(), buf.len());
+    }
+}
diff --git a/src/super_segment/writer/index.rs b/src/super_segment/writer/index.rs
new file mode 100644
index 00000000..0a7397aa
--- /dev/null
+++ b/src/super_segment/writer/index.rs
@@ -0,0 +1,169 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::{
+    segment::meta::CompressionType,
+    super_segment::{
+        block::Header as BlockHeader,
+        index_block::{NewBlockHandle, NewKeyedBlockHandle},
+        Block, BlockOffset, IndexBlock,
+    },
+    value::UserKey,
+};
+use std::{
+    fs::File,
+    io::{BufWriter, Seek, Write},
+};
+
+pub struct Writer {
+    file_pos: BlockOffset,
+
+    prev_pos: (BlockOffset, BlockOffset),
+
+    write_buffer: Vec<u8>,
+
+    block_size: u32,
+    compression: CompressionType,
+
+    buffer_size: u32,
+
+    block_handles: Vec<NewKeyedBlockHandle>,
+    tli_pointers: Vec<NewKeyedBlockHandle>,
+
+    pub block_count: usize,
+}
+
+impl Writer {
+    pub fn new(block_size: u32) -> Self {
+        Self {
+            file_pos: BlockOffset(0),
+            prev_pos: (BlockOffset(0), BlockOffset(0)),
+            write_buffer: Vec::with_capacity(u16::MAX.into()),
+            buffer_size: 0,
+            block_size,
+            compression: CompressionType::None,
+            block_handles: Vec::new(),
+            tli_pointers: Vec::new(),
+            block_count: 0,
+        }
+    }
+
+    #[must_use]
+    pub fn use_compression(mut self, compression: CompressionType) -> Self {
+        self.compression = compression;
+        self
+    }
+
+    fn write_block(&mut self) -> crate::Result<()> {
+        let bytes = IndexBlock::encode_items(&self.block_handles)?;
+
+        // TODO: prev block offset
+        let _header = Block::to_writer(&mut self.write_buffer, &bytes, self.compression)?;
+
+        let bytes_written = (BlockHeader::serialized_len() + bytes.len()) as u32;
+
+        // NOTE: Expect is fine, because the chunk is not empty
+        //
+        // Also, we are allowed to remove the last item
+        // to get ownership of it, because the chunk is cleared after
+        // this anyway
+        #[allow(clippy::expect_used)]
+        let last = self.block_handles.pop().expect("Chunk should not be empty");
+
+        let index_block_handle =
+            NewKeyedBlockHandle::new(last.into_end_key(), self.file_pos, bytes_written);
+
+        self.tli_pointers.push(index_block_handle);
+
+        // Adjust metadata
+        self.file_pos += bytes_written as u64;
+        self.block_count += 1;
+
+        // Back link stuff
+        self.prev_pos.0 = self.prev_pos.1;
+        self.prev_pos.1 += bytes_written as u64;
+
+        // IMPORTANT: Clear buffer after everything else
+        self.block_handles.clear();
+        self.buffer_size = 0;
+
+        Ok(())
+    }
+
+    pub fn register_block(
+        &mut self,
+        end_key: UserKey,
+        offset: BlockOffset,
+        size: u32,
+    ) -> crate::Result<()> {
+        log::trace!(
+            "Registering block at 0x{:X?} with size {size} [end_key={:?}]",
+            *offset,
+            end_key,
+        );
+
+        // NOTE: Truncation is OK, because a key is bound by 65535 bytes, so can never exceed u32s
+        #[allow(clippy::cast_possible_truncation)]
+        let block_handle_size = (end_key.len() + std::mem::size_of::<NewKeyedBlockHandle>()) as u32;
+
+        let block_handle = NewKeyedBlockHandle::new(end_key, offset, size);
+
+        self.block_handles.push(block_handle);
+
+        self.buffer_size += block_handle_size;
+
+        if self.buffer_size >= self.block_size {
+            self.write_block()?;
+        }
+
+        Ok(())
+    }
+
+    fn
write_top_level_index( + &mut self, + block_file_writer: &mut BufWriter, + file_offset: BlockOffset, + ) -> crate::Result { + block_file_writer.write_all(&self.write_buffer)?; + let tli_ptr = BlockOffset(block_file_writer.stream_position()?); + + log::trace!("Wrote index blocks into segment file"); + + for item in &mut self.tli_pointers { + item.shift(file_offset); + } + + let bytes = IndexBlock::encode_items(&self.tli_pointers)?; + + let _header = Block::to_writer(&mut self.write_buffer, &bytes, self.compression)?; + + let bytes_written = BlockHeader::serialized_len() + bytes.len(); + + block_file_writer.flush()?; + block_file_writer.get_mut().sync_all()?; + + log::trace!( + "Written top level index, with {} pointers ({} bytes)", + self.tli_pointers.len(), + bytes_written, + ); + + Ok(NewBlockHandle::new(tli_ptr, bytes_written as u32)) + } + + /// Returns the offset in the file to TLI + pub fn finish( + &mut self, + block_file_writer: &mut BufWriter, + ) -> crate::Result { + if self.buffer_size > 0 { + self.write_block()?; + } + + let index_block_ptr = BlockOffset(block_file_writer.stream_position()?); + let tli_handle = self.write_top_level_index(block_file_writer, index_block_ptr)?; + + Ok(tli_handle) + } +} diff --git a/src/super_segment/writer/meta.rs b/src/super_segment/writer/meta.rs new file mode 100644 index 00000000..bee160f4 --- /dev/null +++ b/src/super_segment/writer/meta.rs @@ -0,0 +1,62 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{super_segment::BlockOffset, SeqNo, UserKey}; + +pub struct Metadata { + /// Written data block count + pub data_block_count: usize, + + /// Written index block count + pub index_block_count: usize, + + /// Written item count + pub item_count: usize, + + /// Tombstone count + pub tombstone_count: usize, + + // TODO: 3.0.0 - https://github.com/fjall-rs/lsm-tree/issues/101 + /// Written key count (unique keys) + pub key_count: usize, + + /// Current file position of writer + pub file_pos: BlockOffset, + + /// Only takes user data into account + pub uncompressed_size: u64, + + /// First encountered key + pub first_key: Option, + + /// Last encountered key + pub last_key: Option, + + /// Lowest encountered seqno + pub lowest_seqno: SeqNo, + + /// Highest encountered seqno + pub highest_seqno: SeqNo, +} + +impl Default for Metadata { + fn default() -> Self { + Self { + data_block_count: 0, + index_block_count: 0, + + item_count: 0, + tombstone_count: 0, + key_count: 0, + file_pos: BlockOffset(0), + uncompressed_size: 0, + + first_key: None, + last_key: None, + + lowest_seqno: SeqNo::MAX, + highest_seqno: 0, + } + } +} diff --git a/src/super_segment/writer/mod.rs b/src/super_segment/writer/mod.rs new file mode 100644 index 00000000..b8fa3066 --- /dev/null +++ b/src/super_segment/writer/mod.rs @@ -0,0 +1,355 @@ +mod index; +mod meta; + +use super::{block::Header as BlockHeader, trailer::Trailer, Block, BlockOffset, DataBlock}; +use crate::{ + coding::Encode, file::fsync_directory, super_segment::index_block::NewBlockHandle, + CompressionType, InternalValue, SegmentId, UserKey, +}; +use index::Writer as IndexWriter; +use std::{ + fs::File, + io::{BufWriter, Seek, Write}, + path::PathBuf, +}; + +/// Serializes and compresses values into blocks and writes them to disk as segment +pub struct Writer { + /// Segment file + path: PathBuf, + + segment_id: SegmentId, + + data_block_size: u32, + + /// Compression to 
use
+    compression: CompressionType,
+
+    /// Writer of data blocks
+    #[allow(clippy::struct_field_names)]
+    block_writer: BufWriter<File>,
+
+    /// Writer of index blocks
+    index_writer: IndexWriter,
+
+    /// Buffer of KVs
+    chunk: Vec<InternalValue>,
+    chunk_size: usize,
+
+    pub(crate) meta: meta::Metadata,
+
+    /// Stores the previous block position (used for creating back links)
+    prev_pos: (BlockOffset, BlockOffset),
+
+    current_key: Option<UserKey>,
+    // bloom_policy: BloomConstructionPolicy,
+
+    // /// Hashes for bloom filter
+    // ///
+    // /// using enhanced double hashing, so we got two u64s
+    // bloom_hash_buffer: Vec<(u64, u64)>,
+}
+
+impl Writer {
+    pub fn new(path: PathBuf, segment_id: SegmentId) -> crate::Result<Self> {
+        let block_writer = File::create(&path)?;
+        let block_writer = BufWriter::with_capacity(u16::MAX.into(), block_writer);
+
+        Ok(Self {
+            meta: meta::Metadata::default(),
+
+            segment_id,
+
+            data_block_size: 4_096,
+
+            compression: CompressionType::None,
+
+            path: std::path::absolute(path)?,
+
+            index_writer: IndexWriter::new(4_096 /* TODO: hard coded for now */),
+
+            block_writer,
+            chunk: Vec::new(),
+
+            prev_pos: (BlockOffset(0), BlockOffset(0)),
+
+            chunk_size: 0,
+
+            current_key: None,
+        })
+    }
+
+    // TODO: data_block_size setter
+
+    #[must_use]
+    pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self {
+        self.compression = compression;
+        self.index_writer = self.index_writer.use_compression(compression);
+        self
+    }
+
+    /// Writes an item.
+    ///
+    /// # Note
+    ///
+    /// It's important that the incoming stream of items is correctly
+    /// sorted by [`UserKey`], otherwise the block layout will
+    /// be nonsensical.
+    pub fn write(&mut self, item: InternalValue) -> crate::Result<()> {
+        if item.is_tombstone() {
+            self.meta.tombstone_count += 1;
+        }
+
+        // NOTE: Check if we are visiting a new key
+        if Some(&item.key.user_key) != self.current_key.as_ref() {
+            self.meta.key_count += 1;
+            self.current_key = Some(item.key.user_key.clone());
+
+            // TODO:
+            // // IMPORTANT: Do not buffer *every* item's key
+            // // because there may be multiple versions
+            // // of the same key
+            // if self.bloom_policy.is_active() {
+            //     self.bloom_hash_buffer
+            //         .push(BloomFilter::get_hash(&item.key.user_key));
+            // }
+        }
+
+        let seqno = item.key.seqno;
+
+        if self.meta.first_key.is_none() {
+            self.meta.first_key = Some(item.key.user_key.clone());
+        }
+
+        self.chunk_size += item.key.user_key.len() + item.value.len();
+        self.chunk.push(item);
+
+        if self.chunk_size >= self.data_block_size as usize {
+            self.spill_block()?;
+        }
+
+        self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
+        self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
+
+        Ok(())
+    }
+
+    /// Writes a compressed block to disk.
+    ///
+    /// This is triggered when `Writer::write` causes the buffer to grow to the configured `block_size`.
+    ///
+    /// Should only be called when the block has items in it.
+ pub(crate) fn spill_block(&mut self) -> crate::Result<()> { + let Some(last) = self.chunk.last() else { + return Ok(()); + }; + + let bytes = DataBlock::encode_items(&self.chunk, 16, 0.75)?; + + // TODO: prev block offset + let header = Block::to_writer(&mut self.block_writer, &bytes, self.compression)?; + + self.meta.uncompressed_size += u64::from(header.uncompressed_length); + + let bytes_written = (BlockHeader::serialized_len() + bytes.len()) as u32; + + self.index_writer.register_block( + last.key.user_key.clone(), + self.meta.file_pos, + bytes_written, + )?; + + // Adjust metadata + self.meta.file_pos += bytes_written as u64; + self.meta.item_count += self.chunk.len(); + self.meta.data_block_count += 1; + + // Back link stuff + self.prev_pos.0 = self.prev_pos.1; + self.prev_pos.1 += bytes_written as u64; + + // Set last key + self.meta.last_key = Some( + // NOTE: Expect is fine, because the chunk is not empty + // + // Also, we are allowed to remove the last item + // to get ownership of it, because the chunk is cleared after + // this anyway + #[allow(clippy::expect_used)] + self.chunk + .pop() + .expect("chunk should not be empty") + .key + .user_key, + ); + + // IMPORTANT: Clear chunk after everything else + self.chunk.clear(); + self.chunk_size = 0; + + Ok(()) + } + + /// Finishes the segment, making sure all data is written durably + pub fn finish(mut self) -> crate::Result> { + self.spill_block()?; + + // No items written! Just delete segment file and return nothing + if self.meta.item_count == 0 { + std::fs::remove_file(&self.path)?; + return Ok(None); + } + + let index_block_start = BlockOffset(self.block_writer.stream_position()?); + + // // Append index blocks to file + let tli_handle = self.index_writer.finish(&mut self.block_writer)?; + + let index_block_handle = NewBlockHandle::new( + index_block_start, + (*tli_handle.offset() - *index_block_start) as u32, + ); + + self.meta.index_block_count = self.index_writer.block_count; + + // // Write bloom filter + // let bloom_ptr = { + // if self.bloom_hash_buffer.is_empty() { + // BlockOffset(0) + // } else { + // let bloom_ptr = self.block_writer.stream_position()?; + // let n = self.bloom_hash_buffer.len(); + + // log::trace!( + // "Constructing Bloom filter with {n} entries: {:?}", + // self.bloom_policy, + // ); + + // let start = std::time::Instant::now(); + + // let mut filter = self.bloom_policy.build(n); + + // for hash in std::mem::take(&mut self.bloom_hash_buffer) { + // filter.set_with_hash(hash); + // } + + // log::trace!("Built Bloom filter in {:?}", start.elapsed()); + + // filter.encode_into(&mut self.block_writer)?; + + // BlockOffset(bloom_ptr) + // } + // }; + // log::trace!("bloom_ptr={bloom_ptr}"); + + // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - Write range filter + // let rf_ptr = BlockOffset(0); + // log::trace!("rf_ptr={rf_ptr}"); + + // // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - Write range tombstones + // let range_tombstones_ptr = BlockOffset(0); + // log::trace!("range_tombstones_ptr={range_tombstones_ptr}"); + + // // TODO: + // let pfx_ptr = BlockOffset(0); + // log::trace!("pfx_ptr={pfx_ptr}"); + + // Write metadata + let metadata_start = BlockOffset(self.block_writer.stream_position()?); + + let metadata_handle = { + fn meta(key: &str, value: &[u8]) -> InternalValue { + InternalValue::from_components(key, value, 0, crate::ValueType::Value) + } + + let meta_items = [ + meta( + "#data_block_count", + &self.meta.data_block_count.to_le_bytes(), 
+ ), + meta("#id", &self.segment_id.to_le_bytes()), + meta( + "#index_block_count", + &self.meta.index_block_count.to_le_bytes(), + ), + meta("#item_count", &self.meta.item_count.to_le_bytes()), + meta( + "#key#max", + self.meta.last_key.as_ref().expect("should exist"), + ), + meta( + "#key#min", + self.meta.first_key.as_ref().expect("should exist"), + ), + meta("#key_count", &self.meta.key_count.to_le_bytes()), + meta("#seqno#max", &self.meta.highest_seqno.to_le_bytes()), + meta("#seqno#min", &self.meta.lowest_seqno.to_le_bytes()), + meta("#size", &self.meta.file_pos.to_le_bytes()), + meta("#tombstone_count", &self.meta.tombstone_count.to_le_bytes()), + meta( + "#user_data_size", + &self.meta.uncompressed_size.to_le_bytes(), + ), + meta("version#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), + meta("version#table", b"3.0"), + ]; + + #[cfg(debug_assertions)] + { + let mut sorted_copy = meta_items.clone(); + sorted_copy.sort(); + + // Just to make sure the items are definitely sorted + assert_eq!(meta_items, sorted_copy, "meta items not sorted correctly"); + } + + log::trace!( + "Writing metadata to segment file {:?}: {meta_items:#?}", + self.path, + ); + + // TODO: no binary index + let bytes = DataBlock::encode_items(&meta_items, 1, 0.0)?; + let _header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + + let bytes_written = BlockHeader::serialized_len() + bytes.len(); + + NewBlockHandle::new(metadata_start, bytes_written as u32) + }; + + // Bundle all the file offsets + let trailer = Trailer { + index_block: index_block_handle, + tli: tli_handle, + filter: NewBlockHandle::default(), + metadata: metadata_handle, + /* range_filter:range_filter_ptr: rf:rf_ptr, + range_tombstones:range_tombstones_ptr, + pfx:pfx_ptr, */ + }; + + log::trace!( + "Writing trailer to segment file {:?}: {trailer:#?}", + self.path, + ); + + // Write trailer + trailer.write_into(&mut self.block_writer)?; + + // Finally, flush & fsync the blocks file + self.block_writer.flush()?; + self.block_writer.get_mut().sync_all()?; + + // IMPORTANT: fsync folder on Unix + fsync_directory(self.path.parent().expect("should have folder"))?; + + log::debug!( + "Written {} items in {} blocks into new segment file, written {} MiB", + self.meta.item_count, + self.meta.data_block_count, + *self.meta.file_pos / 1_024 / 1_024, + ); + + Ok(Some(trailer)) + } +} From 21455f997f051e5d91bae48b920a6c7299a00aa7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 11 Apr 2025 23:38:35 +0200 Subject: [PATCH 057/613] wip --- src/super_segment/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 14da79c6..6cbf0a50 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -12,3 +12,4 @@ mod writer; pub use block::Block; pub use data_block::DataBlock; pub use index_block::IndexBlock; +pub use writer::Writer; From 9af4e8f3f51cd924b37344f08eb37e93ce981565 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 12 Apr 2025 18:47:20 +0200 Subject: [PATCH 058/613] index block point queries finally work --- src/super_segment/data_block/mod.rs | 43 +++- src/super_segment/index_block/mod.rs | 339 ++++++++++++++++++++++----- 2 files changed, 315 insertions(+), 67 deletions(-) diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index ef27639b..24182397 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -314,11 +314,11 @@ impl DataBlock { let seqno_cmp = Reverse(seqno - 1); while left 
< right { - let mid = left + (right - left) / 2; + let mid = (left + right) / 2; let offset = binary_index.get(mid); - if (needle, seqno_cmp) >= self.get_key_at(offset) { + if self.get_key_at(offset) <= (needle, seqno_cmp) { left = mid + 1; } else { right = mid; @@ -326,11 +326,11 @@ impl DataBlock { } } else { while left < right { - let mid = left + (right - left) / 2; + let mid = (left + right) / 2; let offset = binary_index.get(mid); - if needle >= self.get_key_at(offset).0 { + if self.get_key_at(offset).0 <= needle { left = mid + 1; } else { right = mid; @@ -601,6 +601,41 @@ mod tests { assert_eq!(Less, compare_prefixed_slice(b"yyy", b"b", b"yyyyb")); } + #[test] + fn v3_data_block_point_read_one() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + )]; + + let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, None)?, + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + #[test] fn v3_data_block_point_read() -> crate::Result<()> { let items = [ diff --git a/src/super_segment/index_block/mod.rs b/src/super_segment/index_block/mod.rs index 403d88da..f6939fd2 100644 --- a/src/super_segment/index_block/mod.rs +++ b/src/super_segment/index_block/mod.rs @@ -125,12 +125,12 @@ impl IndexBlock { ) } - fn parse_restart_head(cursor: &mut Cursor<&[u8]>) -> RestartHead { + fn parse_restart_head(cursor: &mut Cursor<&[u8]>, pos: usize) -> RestartHead { let offset = unwrappy!(cursor.read_u64_varint()); let size = unwrappy!(cursor.read_u32_varint()); let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - let key_start = cursor.position() as usize; + let key_start = pos + cursor.position() as usize; unwrappy!(cursor.seek_relative(key_len as i64)); @@ -260,11 +260,8 @@ impl IndexBlock { Ok(None) } */ - fn binary_search_for_offset( - &self, - binary_index: &BinaryIndexReader, - needle: &[u8], - ) -> Option { + /// Search for the lowest block that may possibly contain the needle. + fn search_lowest(&self, binary_index: &BinaryIndexReader, needle: &[u8]) -> Option { let mut left: usize = 0; let mut right = binary_index.len(); @@ -273,86 +270,126 @@ impl IndexBlock { } while left < right { - let mid = left + (right - left) / 2; + let mid = (left + right) / 2; let offset = binary_index.get(mid); - if needle >= self.get_key_at(offset) { + if self.get_key_at(offset) < needle { left = mid + 1; } else { right = mid; } } - if left == 0 { + Some(if left < binary_index.len() { + binary_index.get(left) + } else { + binary_index.get(binary_index.len() - 1) + }) + } + + /// Search for the last block that may possibly contain the needle. 
+ fn search_highest(&self, binary_index: &BinaryIndexReader, needle: &[u8]) -> Option { + let mut left: usize = 0; + let mut right = binary_index.len(); + + if right == 0 { return None; } - let offset = binary_index.get(left - 1); + while left < right { + let mid = (left + right) / 2; + + let offset = binary_index.get(mid); - Some(offset) + if self.get_key_at(offset) <= needle { + left = mid + 1; + } else { + right = mid; + } + } + + if left == 0 { + Some(binary_index.get(0)) + } else if left == binary_index.len() { + Some(binary_index.get(binary_index.len() - 1)) + } else { + Some(binary_index.get(left)) + } } #[must_use] pub fn get_lowest_possible_block(&self, needle: &[u8]) -> Option { let binary_index = self.get_binary_index_reader(); - let offset = self.binary_search_for_offset(&binary_index, needle)?; + /* + // NOTE: Currently, the hash index is never initialized for index blocks + /* // NOTE: Try hash index if it exists + if let Some(bucket_value) = self + .get_hash_index_reader() + .and_then(|reader| reader.get(key)) + { + let restart_entry_pos = binary_index.get(usize::from(bucket_value)); + return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); + } */ + ) */ + + let offset = self.search_lowest(&binary_index, needle)?; // SAFETY: pos is always retrieved from the binary index, // which we consider to be trustworthy #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); - let item = Self::parse_restart_head(&mut cursor); + let item = Self::parse_restart_head(&mut cursor, offset); let end_key = self .inner .data .slice(item.key_start..(item.key_start + item.key_len)); - Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) + if needle > end_key { + return None; + } - /* let binary_index = self.get_binary_index_reader(); + Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) + } - // NOTE: Currently, the hash index is never initialized for index blocks - /* // NOTE: Try hash index if it exists - if let Some(bucket_value) = self - .get_hash_index_reader() - .and_then(|reader| reader.get(key)) - { - let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); - } */ + #[must_use] + pub fn get_highest_possible_block(&self, needle: &[u8]) -> Option { + let binary_index = self.get_binary_index_reader(); - // NOTE: Fallback to binary search + /* + // NOTE: Currently, the hash index is never initialized for index blocks + /* // NOTE: Try hash index if it exists + if let Some(bucket_value) = self + .get_hash_index_reader() + .and_then(|reader| reader.get(key)) + { + let restart_entry_pos = binary_index.get(usize::from(bucket_value)); + return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); + } */ + ) */ - let mut left = 0; - let mut right = binary_index.len(); + let offset = self.search_highest(&binary_index, needle)?; - if right == 0 { - return Ok(None); - } - - while left < right { - let mid = (left + right) / 2; + // SAFETY: pos is always retrieved from the binary index, + // which we consider to be trustworthy + #[warn(unsafe_code)] + let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); - let offset = binary_index.get(mid); + let item = Self::parse_restart_head(&mut cursor, offset); - if key >= self.get_key_at(offset)? 
{ - left = mid + 1; - } else { - right = mid; - } - } + let end_key = self + .inner + .data + .slice(item.key_start..(item.key_start + item.key_len)); - if left == 0 { - return Ok(None); + if needle > end_key { + return None; } - let offset = binary_index.get(left - 1); - - self.walk(key, offset, self.restart_interval.into()) */ + Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) } pub fn encode_items(items: &[NewKeyedBlockHandle]) -> crate::Result> { @@ -380,17 +417,18 @@ mod tests { use test_log::test; #[test] + #[allow(clippy::unwrap_used)] fn v3_index_block_simple() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"abcdef".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items)?; - /* eprintln!("{bytes:?}"); + eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); - eprintln!("encoded into {} bytes", bytes.len()); */ + /* eprintln!("encoded into {} bytes", bytes.len()); */ let data_block = IndexBlock::new(Block { data: bytes.into(), @@ -404,23 +442,198 @@ mod tests { assert_eq!(data_block.item_count(), items.len()); - for needle in items { - // eprintln!("NEEDLE {needle:?}"); + assert_eq!( + Some(items.first().unwrap().clone()), + data_block.get_lowest_possible_block(b"a") + ); + assert_eq!( + Some(items.first().unwrap().clone()), + data_block.get_lowest_possible_block(b"b") + ); + assert_eq!( + Some(items.get(1).unwrap().clone()), + data_block.get_lowest_possible_block(b"ba") + ); + assert_eq!( + Some(items.get(2).unwrap().clone()), + data_block.get_lowest_possible_block(b"d") + ); - assert_eq!( - Some(needle.clone()), - data_block.get_lowest_possible_block(needle.end_key()), - ); - } + // assert_eq!(None, data_block.get_lowest_possible_block(b"zzz")); + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_index_block_span() -> crate::Result<()> { + let items = [ + NewKeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items)?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let data_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.item_count(), items.len()); + + assert_eq!( + Some(items.first().unwrap().clone()), + data_block.get_lowest_possible_block(b"a") + ); + assert_eq!( + Some(items.last().unwrap().clone()), + data_block.get_lowest_possible_block(b"abc") + ); + assert_eq!( + Some(items.last().unwrap().clone()), + data_block.get_lowest_possible_block(b"b") + ); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_index_block_span_highest() -> crate::Result<()> { + let items = [ + NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"c".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"d".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes 
= IndexBlock::encode_items(&items)?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let data_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.item_count(), items.len()); + + assert_eq!( + Some(items.first().unwrap().clone()), + data_block.get_highest_possible_block(b"a") + ); + assert_eq!( + Some(items.get(1).unwrap().clone()), + data_block.get_highest_possible_block(b"abc") + ); + assert_eq!( + Some(items.last().unwrap().clone()), + data_block.get_highest_possible_block(b"c") + ); + assert_eq!( + Some(items.last().unwrap().clone()), + data_block.get_highest_possible_block(b"cef") + ); + assert_eq!( + Some(items.last().unwrap().clone()), + data_block.get_highest_possible_block(b"d") + ); + assert_eq!(None, data_block.get_highest_possible_block(b"zzz")); + + Ok(()) + } + + #[test] + fn v3_index_block_one() -> crate::Result<()> { + let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); + + let bytes = IndexBlock::encode_items(&[item.clone()])?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let data_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.item_count(), 1); + + assert_eq!( + Some(item.clone()), + data_block.get_lowest_possible_block(b"a") + ); + assert_eq!( + Some(item.clone()), + data_block.get_lowest_possible_block(b"asdasd") + ); + assert_eq!( + Some(item.clone()), + data_block.get_lowest_possible_block(b"b") + ); + assert_eq!(Some(item), data_block.get_lowest_possible_block(b"c")); + assert_eq!(None, data_block.get_lowest_possible_block(b"d")); + assert_eq!(None, data_block.get_lowest_possible_block(b"z")); + + Ok(()) + } + + #[test] + fn v3_index_block_one_highest() -> crate::Result<()> { + let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); + + let bytes = IndexBlock::encode_items(&[item.clone()])?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let data_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.item_count(), 1); + + assert_eq!( + Some(item.clone()), + data_block.get_lowest_possible_block(b"a") + ); + assert_eq!( + Some(item.clone()), + data_block.get_lowest_possible_block(b"asdasd") + ); assert_eq!( - Some(NewKeyedBlockHandle::new( - b"abcdef".into(), - BlockOffset(6_000), - 7_000 - )), - data_block.get_lowest_possible_block(b"ccc"), + Some(item.clone()), + data_block.get_lowest_possible_block(b"b") ); + assert_eq!(Some(item), data_block.get_lowest_possible_block(b"c")); + assert_eq!(None, data_block.get_lowest_possible_block(b"d")); + assert_eq!(None, data_block.get_lowest_possible_block(b"z")); Ok(()) } From 95fd2c43e2f6604c3df263d495cc321a89e81ea0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 12 Apr 2025 20:50:57 +0200 Subject: [PATCH 059/613] fix data block lookups for weird MVCC cases --- 
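Note on the fix below: with MVCC, a block can contain several entries for the same user key, so the restart-point binary search has to be a partition point on `key < needle` (strictly less), after which the caller picks `left`, `left - 1`, or clamps to the last entry depending on the restart interval. A minimal, self-contained sketch of that partition point, using a hypothetical helper over plain `&[u8]` keys rather than the crate's actual restart-entry parsing:

```rust
/// First index whose key is >= needle (partition point on `key < needle`).
/// Using `<` rather than `<=` is what keeps duplicate (MVCC) keys reachable:
/// the search lands on the *first* version of a key, not past it.
fn partition_point(keys: &[&[u8]], needle: &[u8]) -> usize {
    let (mut left, mut right) = (0, keys.len());
    while left < right {
        let mid = (left + right) / 2;
        if keys[mid] < needle {
            left = mid + 1;
        } else {
            right = mid;
        }
    }
    left
}

fn main() {
    let keys: &[&[u8]] = &[b"a", b"a", b"a", b"b"];
    // All MVCC versions of "a" start at index 0:
    assert_eq!(0, partition_point(keys, b"a"));
    assert_eq!(3, partition_point(keys, b"b"));
    // Needle past the last key: the caller must clamp to the last restart entry.
    assert_eq!(4, partition_point(keys, b"z"));
}
```
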
fuzz/fuzz_targets/data_block.rs | 8 + src/super_segment/data_block/mod.rs | 295 ++++++++++++++++++++++----- src/super_segment/index_block/mod.rs | 94 --------- 3 files changed, 256 insertions(+), 141 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 84cac6fa..c68db99a 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -136,6 +136,14 @@ fuzz_target!(|data: &[u8]| { .point_read(&needle.key.user_key, Some(needle.key.seqno + 1)) .unwrap(), ); + + assert_eq!( + data_block.point_read(&needle.key.user_key, None).unwrap(), + items + .iter() + .find(|item| item.key.user_key == needle.key.user_key) + .cloned(), + ); } assert_eq!(items, data_block.iter().collect::>()); diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index 24182397..c4cb5f4a 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -324,27 +324,55 @@ impl DataBlock { right = mid; } } - } else { + + if left == 0 { + return None; + } + + let offset = binary_index.get(left - 1); + + Some(offset) + } else if self.restart_interval == 1 { while left < right { let mid = (left + right) / 2; let offset = binary_index.get(mid); - if self.get_key_at(offset).0 <= needle { + if self.get_key_at(offset).0 < needle { left = mid + 1; } else { right = mid; } } - } - if left == 0 { - return None; - } + Some(if left == 0 { + binary_index.get(0) + } else if left < binary_index.len() { + binary_index.get(left) + } else { + binary_index.get(binary_index.len() - 1) + }) + } else { + while left < right { + let mid = (left + right) / 2; - let offset = binary_index.get(left - 1); + let offset = binary_index.get(mid); + + if self.get_key_at(offset).0 < needle { + left = mid + 1; + } else { + right = mid; + } + } - Some(offset) + Some(if left == 0 { + binary_index.get(0) + } else if left < binary_index.len() { + binary_index.get(left - 1) + } else { + binary_index.get(binary_index.len() - 1) + }) + } } fn parse_restart_item(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { @@ -448,49 +476,19 @@ impl DataBlock { #[warn(unsafe_code)] let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) 
}); - let head = Self::parse_restart_item(&mut reader, offset)?; - - let key = &bytes[head.key.0..head.key.1]; - let base_key_offset = head.key.0; + loop { + let head = Self::parse_restart_item(&mut reader, offset)?; - match key.cmp(needle) { - std::cmp::Ordering::Equal => { - // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| head.seqno >= watermark); + let key = &bytes[head.key.0..head.key.1]; + let base_key_offset = head.key.0; - if !should_skip { - let kv = head.materialize(&self.inner.data); - return Some(kv); - } - } - std::cmp::Ordering::Greater => { - // Already passed needle - return None; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - - for _ in 0..(self.restart_interval - 1) { - let kv = Self::parse_truncated_item(&mut reader, offset, base_key_offset)?; - - let cmp_result = if let Some(prefix) = &kv.prefix { - let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; - let rest_key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; - compare_prefixed_slice(prefix, rest_key, needle) - } else { - let key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; - key.cmp(needle) - }; - - match cmp_result { + match key.cmp(needle) { std::cmp::Ordering::Equal => { // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); + let should_skip = seqno.is_some_and(|watermark| head.seqno >= watermark); if !should_skip { - let kv = kv.materialize(&self.inner.data); + let kv = head.materialize(&self.inner.data); return Some(kv); } } @@ -502,9 +500,39 @@ impl DataBlock { // Continue to next KV } } - } - None + for _ in 0..(self.restart_interval - 1) { + let kv = Self::parse_truncated_item(&mut reader, offset, base_key_offset)?; + + let cmp_result = if let Some(prefix) = &kv.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; + key.cmp(needle) + }; + + match cmp_result { + std::cmp::Ordering::Equal => { + // TODO: maybe return early if past seqno + let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); + + if !should_skip { + let kv = kv.materialize(&self.inner.data); + return Some(kv); + } + } + std::cmp::Ordering::Greater => { + // Already passed needle + return None; + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + } + } } /// Reads an item by key from the block, if it exists. @@ -958,6 +986,179 @@ mod tests { Ok(()) } + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!( + Some(items.first().cloned().unwrap()), + data_block.point_read(b"a", None)? 
+ ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(b"b", None)? + ); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None)? + ); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None)? + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], None)? 
+ ); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], Some(SeqNo::MAX))? + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], Some(SeqNo::MAX))? + ); + assert_eq!(None, data_block.point_read(b"yyy", None)?); + + Ok(()) + } + #[test] fn v3_data_block_dense_mvcc_no_hash() -> crate::Result<()> { let items = [ diff --git a/src/super_segment/index_block/mod.rs b/src/super_segment/index_block/mod.rs index f6939fd2..ac4d64c5 100644 --- a/src/super_segment/index_block/mod.rs +++ b/src/super_segment/index_block/mod.rs @@ -166,100 +166,6 @@ impl IndexBlock { key } - /* fn walk( - &self, - needle: &[u8], - pos: usize, - restart_interval: usize, - ) -> crate::Result> { - use std::cmp::Ordering::{Equal, Greater, Less}; - - let bytes = &self.inner.data; - let mut cursor = Cursor::new(&bytes[pos..]); - - let mut base_key_pos = 0; - let mut offset = BlockOffset(0); - - // NOTE: Check the full item - let base_key = { - let parsed = unwrappy!(Self::parse_restart_head(&mut cursor)); - - let key_start = pos + parsed.key_start; - let key_end = key_start + parsed.key_len; - let key = &bytes[key_start..key_end]; - - match key.cmp(needle) { - Equal => { - let key = bytes.slice(key_start..key_end); - - return Ok(Some(NewKeyedBlockHandle { - end_key: key, - offset: parsed.offset, - size: parsed.size, - })); - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); - } - Less => { - // NOTE: Continue - } - } - - base_key_pos = key_start; - offset = BlockOffset(*parsed.offset + u64::from(parsed.size)); - - key - }; - - // NOTE: Check the rest items - for _idx in 1..restart_interval { - let size = cursor.read_u32_varint()?; - - let shared_prefix_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - let rest_key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - - let key_offset = pos + cursor.position() as usize; - - // NOTE: PERF: Slicing seems to be faster than get_unchecked!! 
- let prefix_part = &base_key[0..shared_prefix_len]; - let rest_key = &bytes[key_offset..(key_offset + rest_key_len)]; - - unwrappy!(cursor.seek_relative(rest_key_len as i64)); - - match compare_prefixed_slice(prefix_part, rest_key, needle) { - Equal => { - let key = if shared_prefix_len == 0 { - bytes.slice(key_offset..(key_offset + rest_key_len)) - } else if rest_key_len == 0 { - bytes.slice(base_key_pos..(base_key_pos + shared_prefix_len)) - } else { - // Stitch key - UserKey::fused(prefix_part, rest_key) - }; - - return Ok(Some(NewKeyedBlockHandle { - end_key: key, - offset, - size, - })); - } - Greater => { - // NOTE: Already passed searched key - return Ok(None); - } - Less => { - // NOTE: Continue - } - } - - offset += u64::from(size); - } - - Ok(None) - } */ - /// Search for the lowest block that may possibly contain the needle. fn search_lowest(&self, binary_index: &BinaryIndexReader, needle: &[u8]) -> Option { let mut left: usize = 0; From 44d3d14b653e17065bb5d4a47917be547ee2dae2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 12 Apr 2025 20:51:22 +0200 Subject: [PATCH 060/613] hide clippy warning in binary search monkeypatch --- src/binary_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/binary_search.rs b/src/binary_search.rs index 0b8278c6..a30a0fed 100644 --- a/src/binary_search.rs +++ b/src/binary_search.rs @@ -21,7 +21,7 @@ where let mid = (left + right) / 2; // SAFETY: See https://github.com/rust-lang/rust/blob/ebf0cf75d368c035f4c7e7246d203bd469ee4a51/library/core/src/slice/mod.rs#L2834-L2836 - #[warn(unsafe_code)] + #[allow(unsafe_code)] let item = unsafe { slice.get_unchecked(mid) }; if pred(item) { From 9959f0215831b102ecaf538b7a5ab95559d473a4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 12 Apr 2025 20:51:39 +0200 Subject: [PATCH 061/613] update docs --- UNSAFE.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/UNSAFE.md b/UNSAFE.md index 9e64c312..0952bb52 100644 --- a/UNSAFE.md +++ b/UNSAFE.md @@ -1,12 +1,11 @@ # Unsafe usage -Currently, the project itself only uses one **1** unsafe block (ignoring dependencies which are tested themselves separately): - -- https://github.com/fjall-rs/lsm-tree/blob/2d8686e873369bd9c4ff2b562ed988c1cea38331/src/binary_search.rs#L23-L25 +... 
## Run fuzz testing ```bash cargo +nightly fuzz run data_block -- -max_len=8000000 +cargo +nightly fuzz run index_block -- -max_len=8000000 cargo +nightly fuzz run partition_point -- -max_len=1000000 ``` From 6280b5e08be519aa5ceb508aace82456b44d4ced Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Apr 2025 18:30:41 +0200 Subject: [PATCH 062/613] move impl --- src/super_segment/index_block/block_handle.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/super_segment/index_block/block_handle.rs b/src/super_segment/index_block/block_handle.rs index 589fa337..bf05df5a 100644 --- a/src/super_segment/index_block/block_handle.rs +++ b/src/super_segment/index_block/block_handle.rs @@ -86,6 +86,13 @@ pub struct NewKeyedBlockHandle { } impl NewKeyedBlockHandle { + pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { + Self { + end_key, + inner: NewBlockHandle::new(offset, size), + } + } + pub fn shift(&mut self, delta: BlockOffset) { self.inner.offset += delta; } @@ -105,13 +112,6 @@ impl NewKeyedBlockHandle { pub fn into_end_key(self) -> UserKey { self.end_key } - - pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { - Self { - end_key, - inner: NewBlockHandle::new(offset, size), - } - } } impl PartialEq for NewKeyedBlockHandle { From f18a64fad6b9414c9d624abbf87ac9625bac295e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 00:04:34 +0200 Subject: [PATCH 063/613] wip --- .gitignore | 2 ++ Cargo.toml | 5 +++-- README.md | 11 +++++++---- fuzz/fuzz_targets/data_block.rs | 1 + 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 6fd45755..f136cb3f 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ Cargo.lock .lsm.data .test* .bench + +mutants* diff --git a/Cargo.toml b/Cargo.toml index 38d48cb2..c0ab3aec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ description = "A K.I.S.S. implementation of log-structured merge trees (LSM-tree license = "MIT OR Apache-2.0" version = "3.0.0" edition = "2021" -rust-version = "1.75.0" +rust-version = "1.80.0" readme = "README.md" include = ["src/**/*", "LICENSE-APACHE", "LICENSE-MIT", "README.md"] repository = "https://github.com/fjall-rs/lsm-tree" @@ -24,6 +24,7 @@ bytes = ["value-log/bytes"] [dependencies] byteorder = "1.5.0" +byteview = "0.6.1" crossbeam-skiplist = "0.1.3" double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" @@ -33,7 +34,7 @@ log = "0.4.22" lz4_flex = { version = "0.11.3", optional = true, default-features = false } miniz_oxide = { version = "0.8.0", optional = true } path-absolutize = "3.1.1" -quick_cache = { version = "0.6.5", default-features = false, features = [] } +quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" tempfile = "3.12.0" diff --git a/README.md b/README.md index b1368150..cc89031f 100644 --- a/README.md +++ b/README.md @@ -19,20 +19,21 @@ A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rus This is the most feature-rich LSM-tree implementation in Rust! 
It features: - Thread-safe BTreeMap-like API -- [99.9% safe](./UNSAFE.md) & stable Rust +- Mostly [safe](./UNSAFE.md) & 100% stable Rust - Block-based tables with compression support + - Optional hash indexes in blocks for faster point lookups [[3]](#footnotes) - Range & prefix searching with forward and reverse iteration - Size-tiered, (concurrent) Leveled and FIFO compaction - Multi-threaded flushing (immutable/sealed memtables) -- Partitioned block index to reduce memory footprint and keep startup time short [[1]](#footnotes) +- Optionally partitioned block index to reduce memory footprint and keep startup time short [[1]](#footnotes) - Block caching to keep hot data in memory - Bloom filters to increase point lookup performance - Snapshots (MVCC) - Key-value separation (optional) [[2]](#footnotes) - Single deletion tombstones ("weak" deletion) -Keys are limited to 65536 bytes, values are limited to 2^32 bytes. As is normal with any kind of storage -engine, larger keys and values have a bigger performance impact. +Keys are limited to 65536 bytes, values are limited to 2^32 bytes. +As is normal with any kind of storage engine, larger keys and values have a bigger performance impact. ## Feature flags @@ -79,3 +80,5 @@ All contributions are to be licensed as MIT OR Apache-2.0. [1] https://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html [2] https://github.com/facebook/rocksdb/wiki/BlobDB + +[3] https://rocksdb.org/blog/2018/08/23/data-block-hash-index.html diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index c68db99a..fc61a741 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -114,6 +114,7 @@ fuzz_target!(|data: &[u8]| { }); assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); if data_block.binary_index_len() > 254 { assert!(data_block.hash_bucket_count().is_none()); From bf7fd7b69a09b962c02400908eaeb78573a4e42c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 00:05:15 +0200 Subject: [PATCH 064/613] well point reads mostly work now --- src/abstract.rs | 10 +- src/blob_tree/cache.rs | 12 +- src/blob_tree/mod.rs | 28 +- src/bloom/mod.rs | 10 +- src/cache.rs | 2 +- src/clipping_iter.rs | 17 +- src/compaction/fifo.rs | 11 +- src/compaction/leveled.rs | 61 +- src/compaction/maintenance.rs | 14 +- src/compaction/major.rs | 2 +- src/compaction/movedown.rs | 2 +- src/compaction/pulldown.rs | 2 +- src/compaction/tiered.rs | 11 +- src/compaction/worker.rs | 62 +-- src/config.rs | 16 +- src/file.rs | 2 + src/level_manifest/level.rs | 11 +- src/level_manifest/mod.rs | 6 +- src/level_reader.rs | 18 +- src/level_scanner.rs | 28 +- src/lib.rs | 15 +- src/new_cache.rs | 216 ++++++++ src/new_descriptor_table.rs | 65 +++ src/range.rs | 6 +- src/segment/block/header.rs | 8 +- src/segment/block/mod.rs | 8 - src/segment/block/offset.rs | 1 + src/segment/value_block.rs | 4 +- src/segment/writer/mod.rs | 2 +- src/super_segment/block/encoder.rs | 10 +- src/super_segment/block/mod.rs | 63 ++- src/super_segment/block_index/mod.rs | 168 ++++++ src/super_segment/data_block/mod.rs | 166 ++++-- src/super_segment/filter/bit_array/mod.rs | 74 +++ src/super_segment/filter/bit_array/sliced.rs | 60 ++ src/super_segment/filter/mod.rs | 6 + .../filter/standard_bloom/builder.rs | 134 +++++ .../filter/standard_bloom/mod.rs | 295 ++++++++++ src/super_segment/index_block/block_handle.rs | 35 +- .../index_block/forward_reader.rs | 235 ++++++++ src/super_segment/index_block/mod.rs | 149 ++--- 
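Context for the new `filter/standard_bloom` module in this patch: the previously commented-out writer code says "using enhanced double hashing, so we got two u64s". The idea is to derive all k probe positions of the Bloom filter from a single pair of 64-bit hashes instead of hashing the key k times. A rough sketch of that scheme, with illustrative names only; the real builder may differ in detail:

```rust
/// Enhanced double hashing: k probe positions from two 64-bit hashes.
/// `m` is the filter size in bits.
fn probe_positions(mut h1: u64, mut h2: u64, k: u64, m: u64) -> Vec<u64> {
    (0..k)
        .map(|i| {
            let pos = h1 % m; // bit index for probe i
            h1 = h1.wrapping_add(h2); // advance the primary hash
            h2 = h2.wrapping_add(i); // perturb the step (the "enhanced" part)
            pos
        })
        .collect()
}

fn main() {
    // e.g. a 1024-bit filter probed with 5 hash functions
    let probes = probe_positions(0xDEAD_BEEF_u64, 0xCAFE_F00D_u64, 5, 1024);
    assert_eq!(5, probes.len());
    assert!(probes.iter().all(|&p| p < 1024));
}
```
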
src/super_segment/inner.rs | 66 +++ src/super_segment/meta.rs | 157 ++++++ src/super_segment/mod.rs | 521 +++++++++++++++++- src/super_segment/multi_writer.rs | 216 ++++++++ src/super_segment/scanner.rs | 77 +++ src/super_segment/trailer.rs | 57 +- src/super_segment/writer/index.rs | 116 +++- src/super_segment/writer/meta.rs | 4 - src/super_segment/writer/mod.rs | 170 +++--- src/tree/ingest.rs | 47 +- src/tree/mod.rs | 89 ++- src/value.rs | 9 +- tests/open_files.rs | 4 +- tests/tree_iter_lifetime.rs | 5 +- 55 files changed, 3079 insertions(+), 504 deletions(-) create mode 100644 src/new_cache.rs create mode 100644 src/new_descriptor_table.rs create mode 100644 src/super_segment/block_index/mod.rs create mode 100644 src/super_segment/filter/bit_array/mod.rs create mode 100644 src/super_segment/filter/bit_array/sliced.rs create mode 100644 src/super_segment/filter/mod.rs create mode 100644 src/super_segment/filter/standard_bloom/builder.rs create mode 100644 src/super_segment/filter/standard_bloom/mod.rs create mode 100644 src/super_segment/index_block/forward_reader.rs create mode 100644 src/super_segment/inner.rs create mode 100644 src/super_segment/meta.rs create mode 100644 src/super_segment/multi_writer.rs create mode 100644 src/super_segment/scanner.rs diff --git a/src/abstract.rs b/src/abstract.rs index e83f0367..5b6e88e3 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -3,8 +3,9 @@ // (found in the LICENSE-* files in the repository) use crate::{ - compaction::CompactionStrategy, config::TreeType, tree::inner::MemtableId, AnyTree, BlobTree, - Config, KvPair, Memtable, Segment, SegmentId, SeqNo, Snapshot, Tree, UserKey, UserValue, + compaction::CompactionStrategy, config::TreeType, super_segment::Segment, + tree::inner::MemtableId, AnyTree, BlobTree, Config, KvPair, Memtable, SegmentId, SeqNo, + Snapshot, Tree, UserKey, UserValue, }; use enum_dispatch::enum_dispatch; use std::{ @@ -44,8 +45,9 @@ pub trait AbstractTree { /// Gets the memory usage of all bloom filters in the tree. fn bloom_filter_size(&self) -> usize; - #[doc(hidden)] - fn verify(&self) -> crate::Result; + // TODO:? + /* #[doc(hidden)] + fn verify(&self) -> crate::Result; */ /// Synchronously flushes a memtable to a disk segment. 
/// diff --git a/src/blob_tree/cache.rs b/src/blob_tree/cache.rs index a0659b5d..5f1db2bc 100644 --- a/src/blob_tree/cache.rs +++ b/src/blob_tree/cache.rs @@ -1,9 +1,9 @@ -use crate::cache::Cache; +use crate::NewCache; use std::sync::Arc; use value_log::BlobCache; #[derive(Clone)] -pub struct MyBlobCache(pub(crate) Arc); +pub struct MyBlobCache(pub(crate) Arc); impl BlobCache for MyBlobCache { fn get( @@ -11,7 +11,9 @@ impl BlobCache for MyBlobCache { vlog_id: value_log::ValueLogId, vhandle: &value_log::ValueHandle, ) -> Option { - self.0.get_blob(vlog_id, vhandle) + todo!() + + // self.0.get_blob(vlog_id, vhandle) } fn insert( @@ -20,6 +22,8 @@ impl BlobCache for MyBlobCache { vhandle: &value_log::ValueHandle, value: value_log::UserValue, ) { - self.0.insert_blob(vlog_id, vhandle, value); + todo!() + + // self.0.insert_blob(vlog_id, vhandle, value); } } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 6eea0cbe..2a968f84 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -13,9 +13,10 @@ use crate::{ compaction::stream::CompactionStream, file::BLOBS_FOLDER, r#abstract::{AbstractTree, RangeItem}, + super_segment::Segment, tree::inner::MemtableId, value::InternalValue, - Config, KvPair, Memtable, Segment, SegmentId, SeqNo, Snapshot, UserKey, UserValue, + Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, UserKey, UserValue, }; use cache::MyBlobCache; use compression::MyCompressor; @@ -333,12 +334,12 @@ impl AbstractTree for BlobTree { self.index.sealed_memtable_count() } - #[doc(hidden)] + /* #[doc(hidden)] fn verify(&self) -> crate::Result { let index_tree_sum = self.index.verify()?; let vlog_sum = self.blobs.verify()?; Ok(index_tree_sum + vlog_sum) - } + } */ fn keys( &self, @@ -364,7 +365,8 @@ impl AbstractTree for BlobTree { ) -> crate::Result> { use crate::{ file::SEGMENTS_FOLDER, - segment::writer::{Options, Writer as SegmentWriter}, + //segment::writer::{Options, Writer as SegmentWriter}, + super_segment::Writer as SegmentWriter, }; use value::MaybeInlineValue; @@ -374,17 +376,21 @@ impl AbstractTree for BlobTree { log::debug!("=> to LSM segments in {:?}", lsm_segment_folder); log::debug!("=> to blob segment at {:?}", self.blobs.path); - let mut segment_writer = SegmentWriter::new(Options { + let mut segment_writer = SegmentWriter::new( + lsm_segment_folder.join(segment_id.to_string()), segment_id, - data_block_size: self.index.config.data_block_size, - index_block_size: self.index.config.index_block_size, - folder: lsm_segment_folder, - })? + /* Options { + segment_id, + data_block_size: self.index.config.data_block_size, + index_block_size: self.index.config.index_block_size, + folder: lsm_segment_folder, + } */ + )? 
.use_compression(self.index.config.compression); - segment_writer = segment_writer.use_bloom_policy( + /* segment_writer = segment_writer.use_bloom_policy( crate::segment::writer::BloomConstructionPolicy::FpRate(0.0001), - ); + ); */ let mut blob_writer = self.blobs.get_writer()?; diff --git a/src/bloom/mod.rs b/src/bloom/mod.rs index cc45b203..63660fae 100644 --- a/src/bloom/mod.rs +++ b/src/bloom/mod.rs @@ -9,7 +9,7 @@ use crate::{ file::MAGIC_BYTES, }; use bit_array::BitArray; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; /// Two hashes that are used for double hashing @@ -47,8 +47,8 @@ impl Encode for BloomFilter { // NOTE: Hash type (unused) writer.write_u8(0)?; - writer.write_u64::(self.m as u64)?; - writer.write_u64::(self.k as u64)?; + writer.write_u64::(self.m as u64)?; + writer.write_u64::(self.k as u64)?; writer.write_all(self.inner.bytes())?; Ok(()) @@ -73,8 +73,8 @@ impl Decode for BloomFilter { let hash_type = reader.read_u8()?; assert_eq!(0, hash_type, "Invalid bloom hash type"); - let m = reader.read_u64::()? as usize; - let k = reader.read_u64::()? as usize; + let m = reader.read_u64::()? as usize; + let k = reader.read_u64::()? as usize; let mut bytes = vec![0; m / 8]; reader.read_exact(&mut bytes)?; diff --git a/src/cache.rs b/src/cache.rs index 0ee51130..0a606fba 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -90,7 +90,7 @@ impl Cache { #[allow(clippy::default_trait_access)] let quick_cache = QuickCache::with( - 1_000_000, + 100_000, bytes, BlockWeighter, Default::default(), diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs index 7c70810f..7fc769fd 100644 --- a/src/clipping_iter.rs +++ b/src/clipping_iter.rs @@ -1,10 +1,13 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use crate::InternalValue; use std::{ marker::PhantomData, ops::{Bound, RangeBounds}, }; -/* crate::Result */ type Item = InternalValue; /// Clips an iterator to a key range @@ -158,7 +161,7 @@ mod tests { use test_log::test; #[test] - fn v3_clipping_iter_forwards() -> crate::Result<()> { + fn v3_clipping_iter_forwards() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), @@ -178,12 +181,10 @@ mod tests { iter.next().map(|x| x.key.user_key).as_deref(), ); assert!(iter.next().is_none()); - - Ok(()) } #[test] - fn v3_clipping_iter_rev() -> crate::Result<()> { + fn v3_clipping_iter_rev() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), @@ -203,12 +204,10 @@ mod tests { iter.next_back().map(|x| x.key.user_key).as_deref(), ); assert!(iter.next_back().is_none()); - - Ok(()) } #[test] - fn v3_clipping_iter_ping_pong() -> crate::Result<()> { + fn v3_clipping_iter_ping_pong() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), @@ -233,7 +232,5 @@ mod tests { ); assert!(iter.next_back().is_none()); assert!(iter.next().is_none()); - - Ok(()) } } diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index c4a75991..4cb169ca 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -58,7 +58,7 @@ impl CompactionStrategy for Strategy { let now = 
unix_timestamp().as_micros(); for segment in resolved_view.iter().flat_map(|lvl| &lvl.segments) { - let lifetime_us = now - segment.metadata.created_at; + let lifetime_us: u128 = /* now - segment.metadata.created_at */ todo!(); let lifetime_sec = lifetime_us / 1000 / 1000; if lifetime_sec > ttl_seconds.into() { @@ -128,8 +128,9 @@ mod tests { block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, - Segment, SegmentInner, + SegmentInner, }, + super_segment::Segment, time::unix_timestamp, HashSet, KeyRange, }; @@ -139,7 +140,9 @@ mod tests { #[allow(clippy::expect_used)] #[allow(clippy::cast_possible_truncation)] fn fixture_segment(id: SegmentId, created_at: u128) -> Segment { - let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + todo!() + + /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); @@ -184,7 +187,7 @@ mod tests { path: "a".into(), is_deleted: AtomicBool::default(), } - .into() + .into() */ } #[test] diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 1e85914c..9f40b53d 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -6,7 +6,7 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, level_manifest::{hidden_set::HiddenSet, level::Level, LevelManifest}, - segment::Segment, + super_segment::Segment, windows::{GrowingWindowsExt, ShrinkingWindowsExt}, HashSet, KeyRange, SegmentId, }; @@ -272,7 +272,7 @@ impl CompactionStrategy for Strategy { target_size: u64::from(self.target_size), }; - // TODO: eventually, this should happen lazily + /*// TODO: eventually, this should happen lazily // if a segment file lives for very long, it should get rewritten // Rocks, by default, rewrites files that are 1 month or older // @@ -284,7 +284,7 @@ impl CompactionStrategy for Strategy { if goes_into_cold_storage { return Choice::Merge(choice); - } + }*/ if can_trivial_move && level.is_disjoint { return Choice::Move(choice); @@ -301,34 +301,30 @@ impl CompactionStrategy for Strategy { return Choice::DoNothing; }; - if first_level.len() >= self.l0_threshold.into() && !busy_levels.contains(&0) { + if busy_levels.contains(&0) { + return Choice::DoNothing; + } + + if first_level.len() >= self.l0_threshold.into() { let first_level_size = first_level.size(); // NOTE: Special handling for disjoint workloads - if levels.is_disjoint() { - if first_level_size < self.target_size.into() { - // TODO: also do this in non-disjoint workloads - // -> intra-L0 compaction - - // NOTE: Force a merge into L0 itself - // ...we seem to have *very* small flushes - return if first_level.len() >= 32 { - Choice::Merge(CompactionInput { - dest_level: 0, - segment_ids: first_level.list_ids(), - // NOTE: Allow a bit of overshooting - target_size: ((self.target_size as f32) * 1.1) as u64, - }) - } else { - Choice::DoNothing - }; - } - - return Choice::Merge(CompactionInput { - dest_level: 1, - segment_ids: first_level.list_ids(), - target_size: ((self.target_size as f32) * 1.1) as u64, - }); + if levels.is_disjoint() && first_level_size < self.target_size.into() { + // TODO: also do this in non-disjoint workloads + // -> intra-L0 compaction + + // NOTE: Force a merge into L0 itself + // ...we seem to have *very* small flushes + return if first_level.len() >= 30 { + Choice::Merge(CompactionInput { + 
dest_level: 0, + segment_ids: first_level.list_ids(), + // NOTE: Allow a bit of overshooting + target_size: ((self.target_size as f32) * 1.1) as u64, + }) + } else { + Choice::DoNothing + }; } if first_level_size < self.target_size.into() { @@ -392,8 +388,9 @@ mod tests { block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, - Segment, SegmentInner, + SegmentInner, }, + super_segment::Segment, time::unix_timestamp, Config, HashSet, KeyRange, }; @@ -418,7 +415,9 @@ mod tests { size: u64, tombstone_ratio: f32, ) -> Segment { - let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + todo!() + + /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); @@ -463,7 +462,7 @@ mod tests { path: "a".into(), is_deleted: AtomicBool::default(), } - .into() + .into() */ } #[allow(clippy::expect_used)] diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 83d604a9..643239d1 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -4,10 +4,8 @@ use super::{Choice, CompactionStrategy}; use crate::{ - config::Config, - level_manifest::LevelManifest, - segment::{meta::SegmentId, Segment}, - HashSet, + config::Config, level_manifest::LevelManifest, segment::meta::SegmentId, + super_segment::Segment, HashSet, }; const L0_SEGMENT_CAP: usize = 20; @@ -93,8 +91,9 @@ mod tests { block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::Metadata, - Segment, SegmentInner, + SegmentInner, }, + super_segment::Segment, KeyRange, }; use std::sync::{atomic::AtomicBool, Arc}; @@ -102,7 +101,8 @@ mod tests { #[allow(clippy::expect_used)] fn fixture_segment(id: SegmentId, created_at: u128) -> Segment { - let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + todo!() + /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); @@ -147,7 +147,7 @@ mod tests { path: "a".into(), is_deleted: AtomicBool::default(), } - .into() + .into() */ } #[test] diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 5cb2c8f7..a9e2a720 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{config::Config, level_manifest::LevelManifest, HashSet, Segment}; +use crate::{config::Config, level_manifest::LevelManifest, super_segment::Segment, HashSet}; /// Major compaction /// diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index 756cd41e..6fc7afdf 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, Config, HashSet, Segment}; +use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; /// Moves down a level into the destination level. 
pub struct Strategy(pub u8, pub u8); diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index 9698e275..2d238759 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, Config, HashSet, Segment}; +use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; /// Pulls down and merges a level into the destination level. /// diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index b57b4470..b4c4b093 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{level_manifest::LevelManifest, Config, HashSet, Segment}; +use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, base_size: u32) -> usize { (ratio as usize).pow(u32::from(level_idx + 1)) * (base_size as usize) @@ -167,8 +167,9 @@ mod tests { block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, - Segment, SegmentInner, + SegmentInner, }, + super_segment::Segment, HashSet, KeyRange, SeqNo, }; use std::sync::{atomic::AtomicBool, Arc}; @@ -176,7 +177,9 @@ mod tests { #[allow(clippy::expect_used)] fn fixture_segment(id: SegmentId, size_mib: u64, max_seqno: SeqNo) -> Segment { - let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + todo!() + + /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); @@ -221,7 +224,7 @@ mod tests { path: "a".into(), is_deleted: AtomicBool::default(), } - .into() + .into() */ } #[test] diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 04364e4f..d6bbd98f 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -9,28 +9,19 @@ use crate::{ level_manifest::LevelManifest, level_scanner::LevelScanner, merge::Merger, - segment::{ - block_index::{ - full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex, BlockIndexImpl, - }, - id::GlobalSegmentId, - multi_writer::MultiWriter, - scanner::CompactionReader, - Segment, SegmentInner, - }, + segment::id::GlobalSegmentId, stop_signal::StopSignal, + super_segment::{multi_writer::MultiWriter, Segment}, tree::inner::TreeId, - Config, SegmentId, SeqNo, + Config, InternalValue, SegmentId, SeqNo, }; use std::{ - path::Path, - sync::{ - atomic::{AtomicBool, AtomicU64}, - Arc, RwLock, RwLockWriteGuard, - }, + sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, time::Instant, }; +pub type CompactionReader<'a> = Box> + 'a>; + /// Compaction options pub struct Options { pub tree_id: TreeId, @@ -102,7 +93,6 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { } fn create_compaction_stream<'a>( - segment_base_folder: &Path, levels: &LevelManifest, to_compact: &[SegmentId], eviction_seqno: SeqNo, @@ -139,7 +129,6 @@ fn create_compaction_stream<'a>( }; readers.push(Box::new(LevelScanner::from_indexes( - segment_base_folder.to_owned(), level.clone(), (Some(lo), Some(hi)), )?)); @@ -149,7 +138,7 @@ fn create_compaction_stream<'a>( for &id in to_compact { if let Some(segment) = level.segments.iter().find(|x| x.id() == id) { 
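                // Each segment selected for compaction contributes one sequential
                // scanner; `LevelScanner` below follows the same pattern, using
                // scanners instead of full readers because compaction visits
                // every block exactly once.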
found += 1; - readers.push(Box::new(segment.scan(segment_base_folder)?)); + readers.push(Box::new(segment.scan()?)); } } } @@ -234,7 +223,6 @@ fn merge_segments( ); let Some(merge_iter) = create_compaction_stream( - &segments_base_folder, &levels, &payload.segment_ids.iter().copied().collect::>(), opts.eviction_seqno, @@ -261,14 +249,17 @@ fn merge_segments( let start = Instant::now(); let Ok(segment_writer) = MultiWriter::new( + segments_base_folder.clone(), opts.segment_id_generator.clone(), payload.target_size, + /* opts.segment_id_generator.clone(), + payload.target_size, crate::segment::writer::Options { folder: segments_base_folder.clone(), segment_id: 0, // TODO: this is never used in MultiWriter data_block_size: opts.config.data_block_size, index_block_size: opts.config.index_block_size, - }, + }, */ ) else { log::error!("Compaction failed"); @@ -281,9 +272,11 @@ fn merge_segments( return Ok(()); }; - let mut segment_writer = segment_writer.use_compression(opts.config.compression); + let mut segment_writer = segment_writer + .use_compression(opts.config.compression) + .use_data_block_size(opts.config.data_block_size); - { + /* { use crate::segment::writer::BloomConstructionPolicy; if opts.config.bloom_bits_per_key >= 0 { @@ -304,7 +297,7 @@ fn merge_segments( segment_writer = segment_writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); } - } + } */ for (idx, item) in merge_iter.enumerate() { let Ok(item) = item else { @@ -362,8 +355,17 @@ fn merge_segments( let Ok(created_segments) = writer_results .into_iter() - .map(|trailer| -> crate::Result { - let segment_id = trailer.metadata.id; + .map(|segment_id| -> crate::Result { + let segment_file_path = segments_base_folder.join(segment_id.to_string()); + + Segment::recover( + &segment_file_path, + opts.tree_id, + opts.config.cache.clone(), + opts.config.descriptor_table.clone(), + ) + + /* let segment_id = trailer.metadata.id; let segment_file_path = segments_base_folder.join(segment_id.to_string()); let block_index = match payload.dest_level { @@ -410,7 +412,7 @@ fn merge_segments( is_deleted: AtomicBool::default(), } - .into()) + .into()) */ }) .collect::>>() else { @@ -455,14 +457,6 @@ fn merge_segments( return Err(e); } - for segment in &created_segments { - let segment_file_path = segments_base_folder.join(segment.id().to_string()); - - opts.config - .descriptor_table - .insert(&segment_file_path, segment.global_id()); - } - // NOTE: If the application were to crash >here< it's fine // The segments are not referenced anymore, and will be // cleaned up upon recovery diff --git a/src/config.rs b/src/config.rs index fe496899..43dc7435 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3,11 +3,9 @@ // (found in the LICENSE-* files in the repository) use crate::{ - cache::Cache, - descriptor_table::FileDescriptorTable, path::absolute_path, segment::meta::{CompressionType, TableType}, - BlobTree, Tree, + BlobTree, NewCache, NewDescriptorTable, Tree, }; use std::{ path::{Path, PathBuf}, @@ -85,7 +83,7 @@ pub struct Config { /// Block cache to use #[doc(hidden)] - pub cache: Arc, + pub cache: Arc, /// Blob file (value log segment) target size in bytes #[doc(hidden)] @@ -97,16 +95,16 @@ pub struct Config { /// Descriptor table to use #[doc(hidden)] - pub descriptor_table: Arc, + pub descriptor_table: Arc, } impl Default for Config { fn default() -> Self { Self { path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)), - descriptor_table: Arc::new(FileDescriptorTable::new(128, 2)), + descriptor_table: 
Arc::new(NewDescriptorTable::new(256)), - cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), + cache: Arc::new(NewCache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), data_block_size: /* 4 KiB */ 4_096, index_block_size: /* 4 KiB */ 4_096, @@ -242,7 +240,7 @@ impl Config { /// /// Defaults to a cache with 8 MiB of capacity *per tree*. #[must_use] - pub fn use_cache(mut self, cache: Arc) -> Self { + pub fn use_cache(mut self, cache: Arc) -> Self { self.cache = cache; self } @@ -280,7 +278,7 @@ impl Config { #[must_use] #[doc(hidden)] - pub fn descriptor_table(mut self, descriptor_table: Arc) -> Self { + pub fn descriptor_table(mut self, descriptor_table: Arc) -> Self { self.descriptor_table = descriptor_table; self } diff --git a/src/file.rs b/src/file.rs index a258d654..6ad1d4a3 100644 --- a/src/file.rs +++ b/src/file.rs @@ -19,6 +19,8 @@ pub fn rewrite_atomic(path: &Path, content: &[u8]) -> std::io::Result<()> { let mut temp_file = tempfile::NamedTempFile::new_in(folder)?; temp_file.write_all(content)?; + temp_file.flush()?; + temp_file.as_file_mut().sync_all()?; temp_file.persist(path)?; // TODO: not sure why it fails on Windows... diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 1648b110..90ebc02b 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -3,7 +3,8 @@ // (found in the LICENSE-* files in the repository) use crate::{ - binary_search::partition_point, segment::meta::SegmentId, HashSet, KeyRange, Segment, UserKey, + binary_search::partition_point, segment::meta::SegmentId, super_segment::Segment, HashSet, + KeyRange, UserKey, }; use std::ops::Bound; @@ -253,8 +254,9 @@ mod tests { block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, - Segment, SegmentInner, + SegmentInner, }, + super_segment::Segment, AbstractTree, KeyRange, Slice, }; use std::sync::{atomic::AtomicBool, Arc}; @@ -262,7 +264,8 @@ mod tests { #[allow(clippy::expect_used)] fn fixture_segment(id: SegmentId, key_range: KeyRange) -> Segment { - let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + todo!() + /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); @@ -307,7 +310,7 @@ mod tests { path: "a".into(), is_deleted: AtomicBool::default(), } - .into() + .into() */ } #[test] diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 92540749..9f4e3ee2 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -8,8 +8,8 @@ pub(crate) mod level; use crate::{ coding::{DecodeError, Encode, EncodeError}, file::{rewrite_atomic, MAGIC_BYTES}, - segment::{meta::SegmentId, Segment}, - HashMap, HashSet, KeyRange, + super_segment::Segment, + HashMap, HashSet, KeyRange, SegmentId, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use hidden_set::HiddenSet; @@ -529,7 +529,7 @@ mod tests { #[rustfmt::skip] let raw = &[ // Magic - b'L', b'S', b'M', 2, + b'L', b'S', b'M', 3, // Count 0, diff --git a/src/level_reader.rs b/src/level_reader.rs index 6bd2b781..2104dd75 100644 --- a/src/level_reader.rs +++ b/src/level_reader.rs @@ -47,7 +47,9 @@ impl LevelReader { (lo, hi): (Option, Option), cache_policy: CachePolicy, ) -> Self { - let lo = lo.unwrap_or_default(); + todo!() + + /* let lo = lo.unwrap_or_default(); let hi = hi.unwrap_or(level.len() - 1); // TODO: 
lazily init readers? @@ -69,7 +71,7 @@ impl LevelReader { lo_reader: Some(lo_reader), hi_reader, cache_policy, - } + } */ } } @@ -77,7 +79,9 @@ impl Iterator for LevelReader { type Item = crate::Result; fn next(&mut self) -> Option { - loop { + todo!() + + /* loop { if let Some(lo_reader) = &mut self.lo_reader { if let Some(item) = lo_reader.next() { return Some(item); @@ -104,13 +108,15 @@ impl Iterator for LevelReader { } else { return None; } - } + } */ } } impl DoubleEndedIterator for LevelReader { fn next_back(&mut self) -> Option { - loop { + todo!() + + /* loop { if let Some(hi_reader) = &mut self.hi_reader { if let Some(item) = hi_reader.next_back() { return Some(item); @@ -137,7 +143,7 @@ impl DoubleEndedIterator for LevelReader { } else { return None; } - } + } */ } } diff --git a/src/level_scanner.rs b/src/level_scanner.rs index c9326402..289fbd4e 100644 --- a/src/level_scanner.rs +++ b/src/level_scanner.rs @@ -2,14 +2,13 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{level_manifest::level::Level, segment::scanner::Scanner, InternalValue}; -use std::{path::PathBuf, sync::Arc}; +use crate::{level_manifest::level::Level, super_segment::Scanner, InternalValue}; +use std::sync::Arc; /// Scans through a disjoint level /// /// Optimized for compaction, by using a `SegmentScanner` instead of `SegmentReader`. pub struct LevelScanner { - base_folder: PathBuf, segments: Arc, lo: usize, hi: usize, @@ -18,7 +17,6 @@ pub struct LevelScanner { impl LevelScanner { pub fn from_indexes( - base_folder: PathBuf, level: Arc, (lo, hi): (Option, Option), ) -> crate::Result { @@ -27,10 +25,9 @@ impl LevelScanner { let lo_segment = level.segments.get(lo).expect("should exist"); - let lo_reader = lo_segment.scan(&base_folder)?; + let lo_reader = lo_segment.scan()?; Ok(Self { - base_folder, segments: level, lo, hi, @@ -54,11 +51,8 @@ impl Iterator for LevelScanner { self.lo += 1; if self.lo <= self.hi { - let scanner = fail_iter!(self - .segments - .get(self.lo) - .expect("should exist") - .scan(&self.base_folder)); + let scanner = + fail_iter!(self.segments.get(self.lo).expect("should exist").scan()); self.lo_reader = Some(scanner); } @@ -110,11 +104,7 @@ mod tests { #[allow(clippy::unwrap_used)] { - let multi_reader = LevelScanner::from_indexes( - tempdir.path().join("segments"), - level.clone(), - (None, None), - )?; + let multi_reader = LevelScanner::from_indexes(level.clone(), (None, None))?; let mut iter = multi_reader.flatten(); @@ -134,11 +124,7 @@ mod tests { #[allow(clippy::unwrap_used)] { - let multi_reader = LevelScanner::from_indexes( - tempdir.path().join("segments"), - level.clone(), - (Some(1), None), - )?; + let multi_reader = LevelScanner::from_indexes(level.clone(), (Some(1), None))?; let mut iter = multi_reader.flatten(); diff --git a/src/lib.rs b/src/lib.rs index 31a9fb3f..3d73582c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -90,7 +90,7 @@ #![doc(html_logo_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")] #![doc(html_favicon_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")] -#![deny(unsafe_code)] +#![warn(unsafe_code)] #![deny(clippy::all, missing_docs, clippy::cargo)] #![deny(clippy::unwrap_used)] #![deny(clippy::indexing_slicing)] @@ -120,6 +120,8 @@ macro_rules! fail_iter { }; } +// TODO: 3.0.0 change everything to LittleEndian? 
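+// A usage sketch for `fail_iter!` (semantics assumed from its call sites,
+// e.g. `LevelScanner::next`): inside an `Iterator::next` that yields
+// `Option<crate::Result<T>>`, it unwraps an `Ok` value or early-returns
+// the error as the next iterator item:
+//
+//     fn next(&mut self) -> Option<crate::Result<InternalValue>> {
+//         let scanner = fail_iter!(segment.scan()); // Err(e) => return Some(Err(e))
+//         /* ... */
+//     }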
+ mod any_tree; mod r#abstract; @@ -159,6 +161,12 @@ mod level_scanner; mod manifest; mod memtable; +#[doc(hidden)] +mod new_cache; + +#[doc(hidden)] +mod new_descriptor_table; + #[doc(hidden)] pub mod merge; @@ -204,17 +212,20 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, + new_cache::NewCache, + new_descriptor_table::NewDescriptorTable, segment::{block::checksum::Checksum, id::GlobalSegmentId, meta::SegmentId}, tree::inner::TreeId, value::InternalValue, }; pub use { - cache::Cache, coding::{DecodeError, EncodeError}, config::{Config, TreeType}, error::{Error, Result}, memtable::Memtable, + new_cache::NewCache as Cache, // <- TODO: rename + new_descriptor_table::NewDescriptorTable as DescriptorTable, r#abstract::AbstractTree, segment::{meta::CompressionType, Segment}, seqno::SequenceNumberCounter, diff --git a/src/new_cache.rs b/src/new_cache.rs new file mode 100644 index 00000000..5e06fa6e --- /dev/null +++ b/src/new_cache.rs @@ -0,0 +1,216 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::segment::id::GlobalSegmentId; +use crate::super_segment::block::Header; +use crate::super_segment::{Block, BlockOffset, DataBlock}; +use quick_cache::Weighter; +use quick_cache::{sync::Cache as QuickCache, Equivalent}; + +const TAG_BLOCK: u8 = 0; +const TAG_BLOB: u8 = 1; + +/* #[derive(Clone)] +enum Item { + DataBlock(Arc), + IndexBlock(Arc), + Blob(UserValue), +} */ + +type Item = Block; + +#[derive(Eq, std::hash::Hash, PartialEq)] +struct CacheKey(u8, u64, u64, u64); + +impl Equivalent for (u8, u64, u64, u64) { + fn equivalent(&self, key: &CacheKey) -> bool { + self.0 == key.0 && self.1 == key.1 && self.2 == key.2 && self.3 == key.3 + } +} + +impl From<(u8, u64, u64, u64)> for CacheKey { + fn from((tag, root_id, segment_id, offset): (u8, u64, u64, u64)) -> Self { + Self(tag, root_id, segment_id, offset) + } +} + +#[derive(Clone)] +struct BlockWeighter; + +impl Weighter for BlockWeighter { + fn weight(&self, _: &CacheKey, block: &Item) -> u64 { + (Header::serialized_len() as u64) + Into::::into(block.header.uncompressed_length) + } +} + +/// Cache, in which blocks or blobs are cached in-memory +/// after being retrieved from disk +/// +/// This speeds up consecutive queries to nearby data, improving +/// read performance for hot data. +/// +/// # Examples +/// +/// Sharing cache between multiple trees +/// +/// ``` +/// # use lsm_tree::{Tree, Config, Cache}; +/// # use std::sync::Arc; +/// # +/// // Provide 40 MB of cache capacity +/// let cache = Arc::new(Cache::with_capacity_bytes(40 * 1_000 * 1_000)); +/// +/// # let folder = tempfile::tempdir()?; +/// let tree1 = Config::new(folder).use_cache(cache.clone()).open()?; +/// # let folder = tempfile::tempdir()?; +/// let tree2 = Config::new(folder).use_cache(cache.clone()).open()?; +/// # +/// # Ok::<(), lsm_tree::Error>(()) +/// ``` +pub struct NewCache { + // NOTE: rustc_hash performed best: https://fjall-rs.github.io/post/fjall-2-1 + /// Concurrent cache implementation + data: QuickCache, + + /// Capacity in bytes + capacity: u64, +} + +impl NewCache { + /// Creates a new block cache with roughly `n` bytes of capacity. 
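+    /// Capacity is approximate because entries are weighed by their
+    /// uncompressed payload length plus the serialized header size (see
+    /// `BlockWeighter`), not by exact heap usage.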
+ #[must_use] + pub fn with_capacity_bytes(bytes: u64) -> Self { + use quick_cache::sync::DefaultLifecycle; + + #[allow(clippy::default_trait_access)] + let quick_cache = QuickCache::with( + 100_000, + bytes, + BlockWeighter, + Default::default(), + DefaultLifecycle::default(), + ); + + Self { + data: quick_cache, + capacity: bytes, + } + } + + /// Returns the amount of cached bytes. + #[must_use] + pub fn size(&self) -> u64 { + self.data.weight() + } + + /// Returns the cache capacity in bytes. + #[must_use] + pub fn capacity(&self) -> u64 { + self.capacity + } + + /// Returns the number of cached blocks. + #[must_use] + pub fn len(&self) -> usize { + self.data.len() + } + + /// Returns `true` if there are no cached blocks. + #[must_use] + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + #[doc(hidden)] + #[must_use] + pub fn get_data_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option { + let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); + self.data.get(&key).map(DataBlock::new) + } + + #[doc(hidden)] + pub fn insert_block(&self, id: GlobalSegmentId, offset: BlockOffset, value: Item) { + self.data.insert( + (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), + value, + ); + } + + /* #[doc(hidden)] + pub fn insert_index_block( + &self, + id: GlobalSegmentId, + offset: BlockOffset, + value: Arc, + ) { + self.data.insert( + (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), + Item::IndexBlock(value), + ); + } */ + + /* #[doc(hidden)] + #[must_use] + pub fn get_data_block( + &self, + id: GlobalSegmentId, + offset: BlockOffset, + ) -> Option> { + let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); + + if let Item::DataBlock(block) = self.data.get(&key)? { + Some(block) + } else { + log::warn!("cache item type was unexpected - this is a bug"); + None + } + } */ + + /* #[doc(hidden)] + #[must_use] + pub fn get_index_block( + &self, + id: GlobalSegmentId, + offset: BlockOffset, + ) -> Option> { + let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); + + if let Item::IndexBlock(block) = self.data.get(&key)? { + Some(block) + } else { + log::warn!("cache item type was unexpected - this is a bug"); + None + } + } */ + + /* #[doc(hidden)] + pub fn insert_blob( + &self, + vlog_id: value_log::ValueLogId, + vhandle: &value_log::ValueHandle, + value: UserValue, + ) { + self.data.insert( + (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(), + Item::Blob(value), + ); + } */ + + /* #[doc(hidden)] + #[must_use] + pub fn get_blob( + &self, + vlog_id: value_log::ValueLogId, + vhandle: &value_log::ValueHandle, + ) -> Option { + let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(); + + if let Item::Blob(blob) = self.data.get(&key)? 
{ + Some(blob) + } else { + log::warn!("cache item type was unexpected - this is a bug"); + None + } + } */ +} diff --git a/src/new_descriptor_table.rs b/src/new_descriptor_table.rs new file mode 100644 index 00000000..7555ded3 --- /dev/null +++ b/src/new_descriptor_table.rs @@ -0,0 +1,65 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::GlobalSegmentId; +use quick_cache::{sync::Cache as QuickCache, UnitWeighter}; +use std::{fs::File, sync::Arc}; + +const TAG_BLOCK: u8 = 0; +const TAG_BLOB: u8 = 1; + +type Item = Arc; + +#[derive(Eq, std::hash::Hash, PartialEq)] +struct CacheKey(u8, u64, u64); + +// TODO: 3.0.0 rename +pub struct NewDescriptorTable { + inner: QuickCache, +} + +impl NewDescriptorTable { + #[must_use] + pub fn new(capacity: usize) -> Self { + use quick_cache::sync::DefaultLifecycle; + + #[allow(clippy::default_trait_access)] + let quick_cache = QuickCache::with( + 100_000, + capacity as u64, + UnitWeighter, + Default::default(), + DefaultLifecycle::default(), + ); + + Self { inner: quick_cache } + } + + #[doc(hidden)] + pub fn clear(&self) { + self.inner.clear(); + } + + #[must_use] + pub fn access_for_table(&self, id: &GlobalSegmentId) -> Option> { + let key = CacheKey(TAG_BLOCK, id.tree_id(), id.segment_id()); + self.inner.get(&key) + } + + pub fn insert_for_table(&self, id: GlobalSegmentId, item: Item) { + let key = CacheKey(TAG_BLOCK, id.tree_id(), id.segment_id()); + self.inner.insert(key, item); + } + + #[must_use] + pub fn access_for_blob_file(&self, id: &GlobalSegmentId) -> Option> { + let key = CacheKey(TAG_BLOB, id.tree_id(), id.segment_id()); + self.inner.get(&key) + } + + pub fn insert_for_blob_file(&self, id: GlobalSegmentId, item: Item) { + let key = CacheKey(TAG_BLOB, id.tree_id(), id.segment_id()); + self.inner.insert(key, item); + } +} diff --git a/src/range.rs b/src/range.rs index 1e174cb0..3af25f42 100644 --- a/src/range.rs +++ b/src/range.rs @@ -144,7 +144,9 @@ impl TreeIter { seqno: Option, level_manifest: ArcRwLockReadGuardian, ) -> Self { - Self::new(guard, |lock| { + todo!() + + /* Self::new(guard, |lock| { let lo = match &bounds.0 { // NOTE: See memtable.rs for range explanation Bound::Included(key) => Bound::Included(InternalKey::new( @@ -282,7 +284,7 @@ impl TreeIter { Ok(value) => !value.key.is_tombstone(), Err(_) => true, })) - }) + }) */ } } diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 9c02b9fe..6c1bbe0b 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -15,7 +15,7 @@ use std::io::{Read, Write}; #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct Header { /// Compression type used - pub compression: CompressionType, + pub compression: CompressionType, // TODO: 3.0.0 store in segment meta instead? 
/// Checksum value to verify integrity of data pub checksum: Checksum, @@ -23,11 +23,11 @@ pub struct Header { /// File offset of previous block - only used for data blocks pub previous_block_offset: BlockOffset, - /// Compressed size of data segment + /// On-disk size of data segment pub data_length: u32, /// Uncompressed size of data segment - pub uncompressed_length: u32, // TODO: v3: can remove this, because every block stores its allocation anyway + pub uncompressed_length: u32, } impl Header { @@ -124,7 +124,7 @@ mod tests { #[rustfmt::skip] let bytes = &[ // Header - b'L', b'S', b'M', 2, + b'L', b'S', b'M', 3, // Compression 0, 0, diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 1eff7925..ee1513db 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -50,9 +50,6 @@ impl Block { let mut bytes = vec![0u8; header.data_length as usize]; reader.read_exact(&mut bytes)?; - // TODO: 3.0.0 when header.compressed is reliable - // can we preallocate a vector to stream the compression into? - // -> saves reallocation costs let bytes = match header.compression { super::meta::CompressionType::None => bytes, @@ -108,9 +105,6 @@ impl Block { #[allow(clippy::cast_possible_truncation)] data_length: packed.len() as u32, - // TODO: 3.0.0 pack_items should return the uncompressed, serialized - // size directly - // NOTE: Truncation is OK because a block cannot possible contain 4 billion items #[allow(clippy::cast_possible_truncation)] uncompressed_length: items.size() as u32, @@ -131,8 +125,6 @@ impl Block { value.encode_into(&mut buf)?; } - // TODO: 3.0.0 return buf.len() - 4 as uncompressed size - Ok(match compression { CompressionType::None => buf, diff --git a/src/segment/block/offset.rs b/src/segment/block/offset.rs index 2233f5c3..4f023296 100644 --- a/src/segment/block/offset.rs +++ b/src/segment/block/offset.rs @@ -1,3 +1,4 @@ +// TODO: rename FileOffset? #[derive(Copy, Clone, Default, Debug, std::hash::Hash, PartialEq, Eq, Ord, PartialOrd)] pub struct BlockOffset(pub u64); diff --git a/src/segment/value_block.rs b/src/segment/value_block.rs index 7668a592..87f7b116 100644 --- a/src/segment/value_block.rs +++ b/src/segment/value_block.rs @@ -4,8 +4,8 @@ use super::{block::Block, id::GlobalSegmentId}; use crate::{ - binary_search::partition_point, descriptor_table::FileDescriptorTable, - segment::block::offset::BlockOffset, value::InternalValue, Cache, + binary_search::partition_point, cache::Cache, descriptor_table::FileDescriptorTable, + segment::block::offset::BlockOffset, value::InternalValue, }; use std::sync::Arc; diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 2b09290a..8ba46df1 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -343,7 +343,7 @@ impl Writer { "Written {} items in {} blocks into new segment file, written {} MiB", self.meta.item_count, self.meta.data_block_count, - *self.meta.file_pos / 1_024 / 1_024 + *self.meta.file_pos / 1_024 / 1_024, ); Ok(Some(trailer)) diff --git a/src/super_segment/block/encoder.rs b/src/super_segment/block/encoder.rs index 85a038c5..0a54c51b 100644 --- a/src/super_segment/block/encoder.rs +++ b/src/super_segment/block/encoder.rs @@ -90,11 +90,17 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { } } - /* /// Toggles prefix truncation. + /// Toggles prefix truncation. 
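+    /// When enabled, non-restart entries store only the shared prefix
+    /// length and the differing key suffix (see `encode_truncated_into`
+    /// on the block handles), trading a little decode work for space.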
pub fn use_prefix_truncation(mut self, flag: bool) -> Self { self.use_prefix_truncation = flag; + + // TODO: + if !flag { + unimplemented!() + } + self - } */ + } pub fn write(&mut self, item: &'a T) -> crate::Result<()> { // NOTE: Check if we are a restart marker diff --git a/src/super_segment/block/mod.rs b/src/super_segment/block/mod.rs index 21e9e432..15ba9586 100644 --- a/src/super_segment/block/mod.rs +++ b/src/super_segment/block/mod.rs @@ -55,13 +55,15 @@ impl Block { let data = match compression { CompressionType::None => data, + + #[cfg(feature = "lz4")] CompressionType::Lz4 => &lz4_flex::compress(data), + + #[cfg(feature = "miniz")] CompressionType::Miniz(level) => &miniz_oxide::deflate::compress_to_vec(data, level), }; header.data_length = data.len() as u32; - debug_assert!(header.data_length > 0); - header.encode_into(&mut writer)?; writer.write_all(data)?; @@ -74,12 +76,54 @@ impl Block { Ok(header) } + pub fn from_reader( + reader: &mut R, + compression: CompressionType, + ) -> crate::Result { + let header = Header::decode_from(reader)?; + let raw_data = Slice::from_reader(reader, header.data_length as usize)?; + + let data = match compression { + CompressionType::None => raw_data, + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => { + let mut data = byteview::ByteView::with_size(header.uncompressed_length as usize); + { + // NOTE: We know that we are the owner + #[allow(clippy::expect_used)] + let mut mutator = data.get_mut().expect("should be the owner"); + + lz4_flex::decompress_into(&raw_data, &mut mutator) + .map_err(|_| crate::Error::Decompress(compression))?; + } + data.into() + } + + #[cfg(feature = "miniz")] + CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(&raw_data) + .map_err(|_| crate::Error::Decompress(compression))? + .into(), + }; + + debug_assert_eq!(header.uncompressed_length, { + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + data.len() as u32 + } + }); + + Ok(Self { header, data }) + } + + // TODO: take non-keyed block handle pub fn from_file( file: &File, offset: BlockOffset, size: u32, compression: CompressionType, ) -> crate::Result { + // TODO: use with_size_unzeroed (or whatever it will be called) // TODO: use a Slice::get_mut instead... 
needs value-log update
+        let mut buf = byteview::ByteView::with_size(size as usize);
@@ -90,12 +134,19 @@ impl Block {
         {
             use std::os::unix::fs::FileExt;

-            file.read_at(&mut mutator, *offset)?;
+            let bytes_read = file.read_at(&mut mutator, *offset)?;
+            assert_eq!(
+                bytes_read,
+                size as usize,
+                "not enough bytes read: file has length {}",
+                file.metadata()?.len(),
+            );
         }

         #[cfg(windows)]
         {
-            todo!()
+            todo!();
+            // assert_eq!(bytes_read, size as usize);
         }

         #[cfg(not(any(unix, windows)))]
@@ -109,6 +160,8 @@ impl Block {
         let data = match compression {
             CompressionType::None => buf.slice(Header::serialized_len()..),
+
+            #[cfg(feature = "lz4")]
             CompressionType::Lz4 => {
                 // NOTE: We know that a header always exists and data is never empty
                 // So the slice is fine
@@ -126,6 +179,8 @@ impl Block {
                 }
                 data
             }
+
+            #[cfg(feature = "miniz")]
             CompressionType::Miniz(_) => {
                 // NOTE: We know that a header always exists and data is never empty
                 // So the slice is fine
diff --git a/src/super_segment/block_index/mod.rs b/src/super_segment/block_index/mod.rs
new file mode 100644
index 00000000..d7721723
--- /dev/null
+++ b/src/super_segment/block_index/mod.rs
@@ -0,0 +1,168 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use super::{IndexBlock, NewKeyedBlockHandle};
+use crate::segment::value_block::CachePolicy;
+
+#[enum_dispatch::enum_dispatch]
+pub trait NewBlockIndex {
+    /// Gets the lowest block handle that can possibly contain the given item.
+    fn get_lowest_block_containing_key(
+        &self,
+        key: &[u8],
+        cache_policy: CachePolicy,
+    ) -> crate::Result<Option<NewKeyedBlockHandle>>; // TODO: return NewBlockHandle (::into_non_keyed)
+
+    /// Gets the last block handle that can possibly contain the given item.
+    fn get_last_block_containing_key(
+        &self,
+        key: &[u8],
+        cache_policy: CachePolicy,
+    ) -> crate::Result<Option<NewKeyedBlockHandle>>;
+
+    /// Returns a handle to the last block.
+    fn get_last_block_handle(
+        &self,
+        cache_policy: CachePolicy,
+    ) -> crate::Result<NewKeyedBlockHandle>;
+}
+
+/// The block index stores references to the positions of blocks on a file and their sizes
+///
+/// __________________
+/// |                |
+/// |     BLOCK0     |
+/// |________________| <- 'G': 0x0
+/// |                |
+/// |     BLOCK1     |
+/// |________________| <- 'M': 0x...
+/// |                |
+/// |     BLOCK2     |
+/// |________________| <- 'Z': 0x...
+///
+/// The block information can be accessed by key.
+/// Because the blocks are sorted, any entries not covered by the index (it is sparse) can be
+/// found by finding the highest block whose end key is lower than or equal to the searched key (by performing in-memory binary search).
+/// In the diagram above, searching for 'J' yields the block starting with 'G'.
+/// 'J' must be in that block, because the next block starts with 'M'.
+#[enum_dispatch::enum_dispatch(NewBlockIndex)]
+#[allow(clippy::module_name_repetitions)]
+pub enum NewBlockIndexImpl {
+    Full(NewFullBlockIndex),
+    // TwoLevel(TwoLevelBlockIndex),
+}
+
+/// Index that translates item keys to data block handles
+///
+/// The index is fully loaded into memory.
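+///
+/// A lookup sketch (illustration only; an `end_key()` accessor returning
+/// `&[u8]` is assumed): since handles are sorted by end key, the lowest
+/// block that can contain a key is found via a partition point:
+///
+/// ```ignore
+/// fn lowest_possible_block<'a>(
+///     handles: &'a [NewKeyedBlockHandle],
+///     key: &[u8],
+/// ) -> Option<&'a NewKeyedBlockHandle> {
+///     // The first handle whose end key is >= the needle must cover it,
+///     // because every earlier block ends strictly before the needle.
+///     let idx = handles.partition_point(|h| h.end_key() < key);
+///     handles.get(idx)
+/// }
+/// ```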
+pub struct NewFullBlockIndex(IndexBlock); + +impl NewFullBlockIndex { + pub fn new(block: IndexBlock) -> Self { + Self(block) + } + + pub fn forward_reader( + &self, + needle: &[u8], + ) -> Option + '_> { + self.0.forward_reader(needle) + } +} + +impl NewBlockIndex for NewFullBlockIndex { + fn get_last_block_containing_key( + &self, + key: &[u8], + _: CachePolicy, + ) -> crate::Result> { + Ok(self.0.get_highest_possible_block(key)) + } + + fn get_lowest_block_containing_key( + &self, + key: &[u8], + _: CachePolicy, + ) -> crate::Result> { + Ok(self.0.get_lowest_possible_block(key)) + } + + fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { + todo!() + } +} + +/* impl std::ops::Deref for FullBlockIndex { + type Target = Box<[NewKeyedBlockHandle]>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} */ + +/* impl NewFullBlockIndex { + /* pub fn from_file( + path: &Path, + metadata: &crate::segment::meta::Metadata, + offsets: &crate::segment::file_offsets::FileOffsets, + ) -> crate::Result { + todo!() + /* let cnt = metadata.index_block_count as usize; + + log::trace!( + "reading full block index from {path:?} at idx_ptr={} ({cnt} index blocks)", + offsets.index_block_ptr, + ); + + let mut file = File::open(path)?; + file.seek(std::io::SeekFrom::Start(*offsets.index_block_ptr))?; + + let mut block_handles = Vec::with_capacity(cnt); + + for _ in 0..cnt { + let idx_block = IndexBlock::from_reader(&mut file)?.items; + // TODO: 1.80? IntoIter impl for Box<[T]> + block_handles.extend(idx_block.into_vec()); + } + + debug_assert!(!block_handles.is_empty()); + + Ok(Self(block_handles.into_boxed_slice())) */ + } */ +} */ + +/* impl BlockIndex for FullBlockIndex { + fn get_lowest_block_containing_key( + &self, + key: &[u8], + _: CachePolicy, + ) -> crate::Result> { + use super::KeyedBlockIndex; + + self.0 + .get_lowest_block_containing_key(key, CachePolicy::Read) + .map(|x| x.map(|x| x.offset)) + } + + /// Gets the last block handle that may contain the given item + fn get_last_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result> { + use super::KeyedBlockIndex; + + self.0 + .get_last_block_containing_key(key, cache_policy) + .map(|x| x.map(|x| x.offset)) + } + + fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { + use super::KeyedBlockIndex; + + self.0 + .get_last_block_handle(CachePolicy::Read) + .map(|x| x.offset) + } +} */ diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index c4cb5f4a..ddc79991 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -536,11 +536,7 @@ impl DataBlock { } /// Reads an item by key from the block, if it exists. 
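    /// Infallible by design: the block is already decoded in memory at
    /// this point, so a miss is simply `None` rather than an I/O error.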
- pub fn point_read( - &self, - needle: &[u8], - seqno: Option, - ) -> crate::Result> { + pub fn point_read(&self, needle: &[u8], seqno: Option) -> Option { let binary_index = self.get_binary_index_reader(); // NOTE: Try hash index if it exists @@ -553,10 +549,10 @@ impl DataBlock { match lookup { Found(bucket_value) => { let offset = binary_index.get(usize::from(bucket_value)); - return Ok(self.scan(needle, seqno, offset)); + return self.scan(needle, seqno, offset); } NotFound => { - return Ok(None); + return None; } Conflicted => { // NOTE: Fallback to binary search @@ -564,11 +560,9 @@ impl DataBlock { } } - let Some(offset) = self.binary_search_for_offset(&binary_index, needle, seqno) else { - return Ok(None); - }; + let offset = self.binary_search_for_offset(&binary_index, needle, seqno)?; - Ok(self.scan(needle, seqno, offset)) + self.scan(needle, seqno, offset) } pub fn encode_items( @@ -639,6 +633,7 @@ mod tests { )]; let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + let serialized_len = bytes.len(); let data_block = DataBlock::new(Block { data: bytes.into(), @@ -651,15 +646,17 @@ mod tests { }); assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); + assert_eq!(data_block.inner.size(), serialized_len); for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None)?, + data_block.point_read(&needle.key.user_key, None), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -709,7 +706,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -727,11 +724,11 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -743,7 +740,7 @@ mod tests { InternalValue::from_components([0], b"", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -761,11 +758,11 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -799,11 +796,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -939,11 +936,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None)?, + data_block.point_read(&needle.key.user_key, None), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -957,7 +954,7 @@ mod tests { 
InternalValue::from_components(b"b", b"b", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -977,11 +974,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -996,7 +993,7 @@ mod tests { InternalValue::from_components(b"b", b"b", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1013,13 +1010,13 @@ mod tests { assert_eq!( Some(items.first().cloned().unwrap()), - data_block.point_read(b"a", None)? + data_block.point_read(b"a", None) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(b"b", None)? + data_block.point_read(b"b", None) ); - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1054,9 +1051,9 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None)? + data_block.point_read(&[233, 233], None) ); - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1099,13 +1096,13 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None)? + data_block.point_read(&[233, 233], None) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], None)? + data_block.point_read(&[255, 255, 0], None) ); - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1148,13 +1145,62 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], Some(SeqNo::MAX))? 
+ data_block.point_read(&[233, 233], Some(SeqNo::MAX)) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], Some(SeqNo::MAX)) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], Some(SeqNo::MAX))? + data_block.point_read(&[255, 255, 0], None) ); - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1188,11 +1234,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1207,7 +1253,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1223,7 +1269,7 @@ mod tests { assert!(data_block.hash_bucket_count().unwrap() > 0); assert!(data_block - .point_read(b"pla:venus:fact", None)? 
+ .point_read(b"pla:venus:fact", None) .expect("should exist") .is_tombstone()); @@ -1245,7 +1291,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1263,11 +1309,11 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)?); + assert_eq!(None, data_block.point_read(b"yyy", None)); Ok(()) } @@ -1281,7 +1327,7 @@ mod tests { Value, )]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1318,7 +1364,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1355,7 +1401,7 @@ mod tests { Value, )]; - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1389,7 +1435,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1427,7 +1473,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1493,7 +1539,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1529,7 +1575,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 0.75)?; + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1583,7 +1629,7 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } @@ -1596,7 +1642,7 @@ mod tests { .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) .collect::>(); - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1614,7 +1660,7 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } @@ -1627,7 +1673,7 @@ mod tests { .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) .collect::>(); - let bytes = 
DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1645,7 +1691,7 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } @@ -1658,7 +1704,7 @@ mod tests { .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) .collect::>(); - let bytes = DataBlock::encode_items(&items, 1, 0.75)?; + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1676,7 +1722,7 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } @@ -1707,7 +1753,7 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1))?, + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); } diff --git a/src/super_segment/filter/bit_array/mod.rs b/src/super_segment/filter/bit_array/mod.rs new file mode 100644 index 00000000..c510c482 --- /dev/null +++ b/src/super_segment/filter/bit_array/mod.rs @@ -0,0 +1,74 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +mod sliced; + +pub use sliced::BitArray as BitArrayReader; + +const BIT_MASK: u8 = 0b1000_0000_u8; + +/// Sets a bit in the byte +pub fn set_bit(byte: u8, idx: usize, value: bool) -> u8 { + let bit_mask = BIT_MASK >> idx; + + if value { + byte | bit_mask + } else { + byte & !bit_mask + } +} + +/// Fixed-size bit array +#[derive(Debug, Eq, PartialEq)] +pub struct Builder(Box<[u8]>); + +impl Builder { + #[must_use] + pub fn with_capacity(bytes: usize) -> Self { + let vec = vec![0; bytes]; + Self(vec.into_boxed_slice()) + } + + #[must_use] + pub fn from_bytes(bytes: Box<[u8]>) -> Self { + Self(bytes) + } + + #[must_use] + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Sets the i-th bit + pub fn set(&mut self, idx: usize, val: bool) { + let byte_idx = idx / 8; + let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); + + let bit_idx = idx % 8; + *byte = set_bit(*byte, bit_idx, val); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn bit_set_true() { + assert_eq!(0b0000_0010, set_bit(0, 6, true)); + assert_eq!(0b1000_0000, set_bit(0, 0, true)); + assert_eq!(0b0100_0000, set_bit(0, 1, true)); + assert_eq!(0b0100_0110, set_bit(0b0000_0110, 1, true)); + } + + #[test] + fn bit_set_false() { + assert_eq!(0b1111_1101, set_bit(0xFF, 6, false)); + assert_eq!(0b0111_1111, set_bit(0xFF, 0, false)); + assert_eq!(0b1011_1111, set_bit(0xFF, 1, false)); + + assert_eq!(0b0000_0110, set_bit(0b0100_0110, 1, false)); + } +} diff --git a/src/super_segment/filter/bit_array/sliced.rs b/src/super_segment/filter/bit_array/sliced.rs new file mode 100644 index 00000000..f6f09810 --- /dev/null +++ b/src/super_segment/filter/bit_array/sliced.rs @@ -0,0 +1,60 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::Slice; + +const BIT_MASK: u8 = 0b1000_0000_u8; + +/// 
Gets a bit from the byte +fn get_bit(byte: u8, idx: usize) -> bool { + let bit_mask = BIT_MASK >> idx; + + let masked = byte & bit_mask; + masked > 0 +} + +/// Fixed-size bit array +#[derive(Debug, Eq, PartialEq)] +pub struct BitArray(Slice); + +impl BitArray { + #[must_use] + pub fn new(slice: Slice) -> Self { + Self(slice) + } + + #[must_use] + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Gets the i-th bit + #[must_use] + pub fn get(&self, idx: usize) -> bool { + let byte_idx = idx / 8; + let byte = self.0.get(byte_idx).expect("should be in bounds"); + + let bit_idx = idx % 8; + get_bit(*byte, bit_idx) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::super_segment::filter::bit_array::set_bit; + use test_log::test; + + #[test] + fn bit_set_get() { + assert_eq!(0b1111_1101, set_bit(0xFF, 6, false)); + assert_eq!(0b0111_1111, set_bit(0xFF, 0, false)); + assert_eq!(0b1011_1111, set_bit(0xFF, 1, false)); + + assert!(!get_bit(0b0100_0110, 0)); + assert!(get_bit(0b0100_0110, 1)); + assert!(get_bit(0b0100_0110, 6)); + assert!(!get_bit(0b0100_0110, 7)); + } +} diff --git a/src/super_segment/filter/mod.rs b/src/super_segment/filter/mod.rs new file mode 100644 index 00000000..4e6962dd --- /dev/null +++ b/src/super_segment/filter/mod.rs @@ -0,0 +1,6 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +pub mod bit_array; +pub mod standard_bloom; diff --git a/src/super_segment/filter/standard_bloom/builder.rs b/src/super_segment/filter/standard_bloom/builder.rs new file mode 100644 index 00000000..8457d45b --- /dev/null +++ b/src/super_segment/filter/standard_bloom/builder.rs @@ -0,0 +1,134 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::super_segment::filter::bit_array::BitArrayReader; + +use super::{super::bit_array::Builder as BitArrayBuilder, StandardBloomFilter}; + +/// Two hashes that are used for double hashing +pub type CompositeHash = (u64, u64); + +#[derive(Debug, Eq, PartialEq)] +#[allow(clippy::module_name_repetitions)] +pub struct Builder { + /// Raw bytes exposed as bit array + inner: BitArrayBuilder, + + /// Bit count + m: usize, + + /// Number of hash functions + k: usize, +} + +#[allow(clippy::len_without_is_empty)] +impl Builder { + pub fn build(self) -> StandardBloomFilter { + StandardBloomFilter { + inner: BitArrayReader::new(self.inner.bytes().into()), + k: self.k, + m: self.m, + } + } + + /// Constructs a bloom filter that can hold `n` items + /// while maintaining a certain false positive rate `fpr`. + #[must_use] + pub fn with_fp_rate(n: usize, fpr: f32) -> Self { + use std::f32::consts::LN_2; + + assert!(n > 0); + + // NOTE: Some sensible minimum + let fpr = fpr.max(0.000_001); + + let m = Self::calculate_m(n, fpr); + let bpk = m / n; + let k = (((bpk as f32) * LN_2) as usize).max(1); + + Self { + inner: BitArrayBuilder::with_capacity(m / 8), + m, + k, + } + } + + /// Constructs a bloom filter that can hold `n` items + /// with `bpk` bits per key. + /// + /// 10 bits per key is a sensible default. 
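+    ///
+    /// For example, using the arithmetic below: `n = 1_000` keys at
+    /// `bpk = 10` yields `m = 10_000` bits, rounded up to 1_250 bytes,
+    /// and `k = (10.0 * ln 2) as usize = 6` hash functions.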
+    #[must_use]
+    pub fn with_bpk(n: usize, bpk: u8) -> Self {
+        use std::f32::consts::LN_2;
+
+        assert!(bpk > 0);
+        assert!(n > 0);
+
+        let bpk = bpk as usize;
+
+        let m = n * bpk;
+        let k = (((bpk as f32) * LN_2) as usize).max(1);
+
+        // NOTE: Round up so we don't get too few bits
+        let bytes = (m as f32 / 8.0).ceil() as usize;
+
+        Self {
+            inner: BitArrayBuilder::with_capacity(bytes),
+            m: bytes * 8,
+            k,
+        }
+    }
+
+    fn calculate_m(n: usize, fp_rate: f32) -> usize {
+        use std::f32::consts::LN_2;
+
+        let n = n as f32;
+        let ln2_squared = LN_2.powi(2);
+
+        let numerator = n * fp_rate.ln();
+        let m = -(numerator / ln2_squared);
+
+        // Round up to next byte
+        ((m / 8.0).ceil() * 8.0) as usize
+    }
+
+    /// Adds the key, given by its composite hash, to the filter.
+    pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) {
+        for i in 0..(self.k as u64) {
+            let idx = h1 % (self.m as u64);
+
+            self.enable_bit(idx as usize);
+
+            h1 = h1.wrapping_add(h2);
+            h2 = h2.wrapping_add(i);
+        }
+    }
+
+    /// Sets the bit at the given index to `true`.
+    fn enable_bit(&mut self, idx: usize) {
+        self.inner.set(idx, true);
+    }
+
+    /// Gets the hash of a key.
+    #[must_use]
+    pub fn get_hash(key: &[u8]) -> CompositeHash {
+        let h0 = xxhash_rust::xxh3::xxh3_128(key);
+        let h1 = (h0 >> 64) as u64;
+        let h2 = h0 as u64;
+        (h1, h2)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use test_log::test;
+
+    #[test]
+    fn bloom_calculate_m() {
+        assert_eq!(9_592, Builder::calculate_m(1_000, 0.01));
+        assert_eq!(4_800, Builder::calculate_m(1_000, 0.1));
+        assert_eq!(4_792_536, Builder::calculate_m(1_000_000, 0.1));
+    }
+}
diff --git a/src/super_segment/filter/standard_bloom/mod.rs b/src/super_segment/filter/standard_bloom/mod.rs
new file mode 100644
index 00000000..dbfc00ac
--- /dev/null
+++ b/src/super_segment/filter/standard_bloom/mod.rs
@@ -0,0 +1,295 @@
+use super::bit_array::BitArrayReader;
+use crate::{
+    coding::{Decode, DecodeError, Encode, EncodeError},
+    file::MAGIC_BYTES,
+};
+use builder::CompositeHash;
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use std::io::{Read, Write};
+
+mod builder;
+
+pub use builder::Builder;
+
+/// A standard bloom filter
+///
+/// Allows buffering the key hashes before actual filter construction,
+/// which is needed to properly calculate the filter size, as the number
+/// of items is unknown during segment construction.
+/// +/// The filter uses double hashing instead of `k` hash functions, see: +/// +pub struct StandardBloomFilter { + /// Raw bytes exposed as bit array + inner: BitArrayReader, + + /// Bit count + m: usize, + + /// Number of hash functions + k: usize, +} + +impl Encode for StandardBloomFilter { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + // Write header + writer.write_all(&MAGIC_BYTES)?; + + // NOTE: Filter type (unused) + writer.write_u8(0)?; + + // NOTE: Hash type (unused) + writer.write_u8(0)?; + + writer.write_u64::(self.m as u64)?; + writer.write_u64::(self.k as u64)?; + writer.write_all(self.inner.bytes())?; + + Ok(()) + } +} + +impl Decode for StandardBloomFilter { + fn decode_from(reader: &mut R) -> Result { + // Check header + let mut magic = [0u8; MAGIC_BYTES.len()]; + reader.read_exact(&mut magic)?; + + if magic != MAGIC_BYTES { + return Err(DecodeError::InvalidHeader("BloomFilter")); + } + + // NOTE: Filter type (unused) + let filter_type = reader.read_u8()?; + assert_eq!(0, filter_type, "Invalid filter type"); + + // NOTE: Hash type (unused) + let hash_type = reader.read_u8()?; + assert_eq!(0, hash_type, "Invalid bloom hash type"); + + let m = reader.read_u64::()? as usize; + let k = reader.read_u64::()? as usize; + + let mut bytes = vec![0; m / 8]; + reader.read_exact(&mut bytes)?; + + Ok(Self::from_raw(m, k, bytes.into())) + } +} + +#[allow(clippy::len_without_is_empty)] +impl StandardBloomFilter { + /// Size of bloom filter in bytes. + #[must_use] + pub fn len(&self) -> usize { + self.inner.bytes().len() + } + + fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { + Self { + inner: BitArrayReader::new(slice), + m, + k, + } + } + + /// Returns `true` if the hash may be contained. + /// + /// Will never have a false negative. + #[must_use] + pub(crate) fn contains_hash(&self, hash: CompositeHash) -> bool { + let (mut h1, mut h2) = hash; + + for i in 0..(self.k as u64) { + let idx = h1 % (self.m as u64); + + // NOTE: should be in bounds because of modulo + #[allow(clippy::expect_used)] + if !self.has_bit(idx as usize) { + return false; + } + + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(i); + } + + true + } + + /// Returns `true` if the item may be contained. + /// + /// Will never have a false negative. + #[must_use] + pub fn contains(&self, key: &[u8]) -> bool { + self.contains_hash(Self::get_hash(key)) + } + + /// Returns `true` if the bit at `idx` is `1`. + fn has_bit(&self, idx: usize) -> bool { + self.inner.get(idx) + } + + /// Gets the hash of a key. 
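+    /// One 128-bit xxh3 is split into two 64-bit halves `(h1, h2)`; the
+    /// probe sequence is then derived via double hashing rather than by
+    /// evaluating `k` independent hash functions.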
+ fn get_hash(key: &[u8]) -> CompositeHash { + let h0 = xxhash_rust::xxh3::xxh3_128(key); + let h1 = (h0 >> 64) as u64; + let h2 = h0 as u64; + (h1, h2) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + use test_log::test; + + /* #[test] + fn bloom_serde_round_trip() -> crate::Result<()> { + let dir = tempfile::tempdir()?; + + let path = dir.path().join("bf"); + let mut file = File::create(&path)?; + + let mut filter = StandardBloomFilter::with_fp_rate(10, 0.0001); + + let keys = &[ + b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", + b"item8", b"item9", + ]; + + for key in keys { + filter.set_with_hash(StandardBloomFilter::get_hash(*key)); + } + + for key in keys { + assert!(filter.contains(&**key)); + } + assert!(!filter.contains(b"asdasads")); + assert!(!filter.contains(b"item10")); + assert!(!filter.contains(b"cxycxycxy")); + + filter.encode_into(&mut file)?; + file.sync_all()?; + drop(file); + + let mut file = File::open(&path)?; + let filter_copy = StandardBloomFilter::decode_from(&mut file)?; + + assert_eq!(filter, filter_copy); + + for key in keys { + assert!(filter.contains(&**key)); + } + assert!(!filter_copy.contains(b"asdasads")); + assert!(!filter_copy.contains(b"item10")); + assert!(!filter_copy.contains(b"cxycxycxy")); + + Ok(()) + } */ + + /* #[test] + fn bloom_basic() { + let mut filter = StandardBloomFilter::with_fp_rate(10, 0.0001); + + for key in [ + b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", + b"item8", b"item9", + ] { + assert!(!filter.contains(key)); + filter.set_with_hash(StandardBloomFilter::get_hash(key)); + assert!(filter.contains(key)); + + assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + } + } */ + + /* #[test] + fn bloom_bpk() { + let item_count = 1_000; + let bpk = 5; + + let mut filter = StandardBloomFilter::with_bpk(item_count, bpk); + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + filter.set_with_hash(StandardBloomFilter::get_hash(key)); + assert!(filter.contains(key)); + } + + let mut false_positives = 0; + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + if filter.contains(key) { + false_positives += 1; + } + } + + #[allow(clippy::cast_precision_loss)] + let fpr = false_positives as f32 / item_count as f32; + assert!(fpr < 0.13); + } + + #[test] + fn bloom_fpr() { + let item_count = 100_000; + let wanted_fpr = 0.1; + + let mut filter = StandardBloomFilter::with_fp_rate(item_count, wanted_fpr); + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + filter.set_with_hash(StandardBloomFilter::get_hash(key)); + assert!(filter.contains(key)); + } + + let mut false_positives = 0; + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + if filter.contains(key) { + false_positives += 1; + } + } + + #[allow(clippy::cast_precision_loss)] + let fpr = false_positives as f32 / item_count as f32; + assert!(fpr > 0.05); + assert!(fpr < 0.13); + } + + #[test] + fn bloom_fpr_2() { + let item_count = 100_000; + let wanted_fpr = 0.5; + + let mut filter = StandardBloomFilter::with_fp_rate(item_count, wanted_fpr); + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + filter.set_with_hash(StandardBloomFilter::get_hash(key)); + assert!(filter.contains(key)); + } + + let mut false_positives = 0; + + for key in (0..item_count).map(|_| nanoid::nanoid!()) { + let key = key.as_bytes(); + + if 
filter.contains(key) { + false_positives += 1; + } + } + + #[allow(clippy::cast_precision_loss)] + let fpr = false_positives as f32 / item_count as f32; + assert!(fpr > 0.45); + assert!(fpr < 0.55); + } */ +} diff --git a/src/super_segment/index_block/block_handle.rs b/src/super_segment/index_block/block_handle.rs index bf05df5a..20fe78b4 100644 --- a/src/super_segment/index_block/block_handle.rs +++ b/src/super_segment/index_block/block_handle.rs @@ -6,6 +6,7 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, super_segment::block::{BlockOffset, Encodable}, }; +use byteorder::WriteBytesExt; use value_log::UserKey; use varint_rs::{VarintReader, VarintWriter}; @@ -85,6 +86,12 @@ pub struct NewKeyedBlockHandle { inner: NewBlockHandle, } +impl AsRef for NewKeyedBlockHandle { + fn as_ref(&self) -> &NewBlockHandle { + &self.inner + } +} + impl NewKeyedBlockHandle { pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { Self { @@ -139,12 +146,16 @@ impl Encodable for NewKeyedBlockHandle { state: &mut BlockOffset, ) -> crate::Result<()> { // We encode restart markers as: - // [offset] [size] [key len] [end key] - // 1 2 3 4 + // [marker=0] [offset] [size] [key len] [end key] + // 1 2 3 4 5 - self.inner.encode_into(writer)?; // 1, 2 - writer.write_u16_varint(self.end_key.len() as u16)?; // 3 - writer.write_all(&self.end_key)?; // 4 + writer.write_u8(0)?; // 1 + + // TODO: maybe move these behind the key + self.inner.encode_into(writer)?; // 2, 3 + + writer.write_u16_varint(self.end_key.len() as u16)?; // 4 + writer.write_all(&self.end_key)?; // 5 *state = BlockOffset(*self.offset() + u64::from(self.size())); @@ -158,22 +169,24 @@ impl Encodable for NewKeyedBlockHandle { shared_len: usize, ) -> crate::Result<()> { // We encode truncated handles as: - // [size] [shared prefix len] [rest key len] [rest key] - // 1 2 3 4 + // [marker=0] [size] [shared prefix len] [rest key len] [rest key] + // 1 2 3 4 5 + + writer.write_u8(0)?; // 1 - writer.write_u32_varint(self.size())?; // 1 + writer.write_u32_varint(self.size())?; // 2 // TODO: maybe we can skip this varint altogether if prefix truncation = false - writer.write_u16_varint(shared_len as u16)?; // 2 + writer.write_u16_varint(shared_len as u16)?; // 3 // NOTE: We can safely cast to u16, because keys are u16 long max #[allow(clippy::cast_possible_truncation)] let rest_len = self.end_key.len() - shared_len; - writer.write_u16_varint(rest_len as u16)?; // 3 + writer.write_u16_varint(rest_len as u16)?; // 4 let truncated_user_key = self.end_key.get(shared_len..).expect("should be in bounds"); - writer.write_all(truncated_user_key)?; // 4 + writer.write_all(truncated_user_key)?; // 5 *state += u64::from(self.size()); diff --git a/src/super_segment/index_block/forward_reader.rs b/src/super_segment/index_block/forward_reader.rs new file mode 100644 index 00000000..79ffe5ad --- /dev/null +++ b/src/super_segment/index_block/forward_reader.rs @@ -0,0 +1,235 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{IndexBlock, NewKeyedBlockHandle}; +use crate::{super_segment::BlockOffset, Slice}; +use std::io::Cursor; + +#[derive(Default, Debug)] +struct LoScanner { + offset: usize, + remaining_in_interval: usize, + base_key_offset: Option, +} + +/// Specialized reader to scan an index block only in forwards direction +/// +/// Is less expensive than a double ended iterator. 
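+///
+/// A sketch of the two handle encodings this reader parses (made-up keys;
+/// see `NewKeyedBlockHandle`): a restart item stores its full end key, while
+/// a truncated item stores only the suffix that differs from the restart key:
+///
+/// ```text
+/// restart:   [marker=0] [offset] [size] [key len=6] [b"abcdef"]
+/// truncated: [marker=0] [size] [shared=3] [rest len=3] [b"xyz"]  // end key "abcxyz"
+/// ```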
+pub struct ForwardReader<'a> { + block: &'a IndexBlock, + restart_interval: usize, + + lo_scanner: LoScanner, +} + +/// [start, end] slice indexes +#[derive(Debug)] +pub struct ParsedSlice(pub usize, pub usize); + +#[derive(Debug)] +pub struct ParsedItem { + pub offset: BlockOffset, + pub size: u32, + pub prefix: Option, + pub end_key: ParsedSlice, +} + +impl ParsedItem { + pub fn materialize(&self, bytes: &Slice) -> NewKeyedBlockHandle { + let end_key = if let Some(prefix) = &self.prefix { + let prefix_key = &bytes[prefix.0..prefix.1]; + let rest_key = &bytes[self.end_key.0..self.end_key.1]; + Slice::fused(prefix_key, rest_key) + } else { + bytes.slice(self.end_key.0..self.end_key.1) + }; + + NewKeyedBlockHandle::new(end_key, self.offset, self.size) + } +} + +impl<'a> ForwardReader<'a> { + pub fn new(block: &'a IndexBlock) -> Self { + let restart_interval = block.restart_interval.into(); + + Self { + block, + + restart_interval, + + lo_scanner: LoScanner::default(), + } + } + + pub fn with_offset(mut self, offset: usize) -> Self { + self.lo_scanner.offset = offset; + self + } + + fn parse_restart_item( + block: &IndexBlock, + offset: &mut usize, + base_key_offset: &mut Option, + ) -> Option { + let bytes = block.bytes(); + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); + + let item = IndexBlock::parse_restart_item(&mut reader, *offset)?; + + *offset += reader.position() as usize; + *base_key_offset = Some(item.end_key.0); + + Some(item) + } + + fn parse_truncated_item( + block: &IndexBlock, + offset: &mut usize, + base_key_offset: usize, + ) -> Option { + let bytes = block.bytes(); + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) 
}); + + let item = IndexBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; + + *offset += reader.position() as usize; + + Some(item) + } +} + +impl Iterator for ForwardReader<'_> { + type Item = ParsedItem; + + fn next(&mut self) -> Option { + let is_restart = self.lo_scanner.remaining_in_interval == 0; + + let item = if is_restart { + self.lo_scanner.remaining_in_interval = self.restart_interval; + + Self::parse_restart_item( + self.block, + &mut self.lo_scanner.offset, + &mut self.lo_scanner.base_key_offset, + ) + } else { + Self::parse_truncated_item( + self.block, + &mut self.lo_scanner.offset, + self.lo_scanner.base_key_offset.expect("should exist"), + ) + }; + + self.lo_scanner.remaining_in_interval -= 1; + + item + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::super_segment::{block::Header, Block, Checksum}; + use test_log::test; + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_index_block_simple() -> crate::Result<()> { + let items = [ + NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items, 1)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(block.item_count(), items.len()); + + let iter = block.forward_reader(b"a").unwrap(); + assert_eq!(&items, &*iter.collect::>()); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_index_block_seek() -> crate::Result<()> { + let items = [ + NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items, 1)?; + eprintln!("{bytes:?}"); + eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(block.item_count(), items.len()); + + { + let iter = block.forward_reader(b"a").unwrap(); + assert_eq!(&items, &*iter.into_iter().collect::>()); + } + + { + let iter = block.forward_reader(b"b").unwrap(); + assert_eq!(&items, &*iter.into_iter().collect::>()); + } + + { + let iter = block.forward_reader(b"c").unwrap(); + assert_eq!( + items.iter().skip(2).cloned().collect::>(), + &*iter.collect::>(), + ); + } + + { + let iter = block.forward_reader(b"def").unwrap(); + assert_eq!( + items.iter().skip(2).cloned().collect::>(), + &*iter.collect::>(), + ); + } + + { + let iter = block.forward_reader(b"zzz"); + assert!(iter.is_none(), "iterator should seek past index block"); + } + + Ok(()) + } +} diff --git a/src/super_segment/index_block/mod.rs b/src/super_segment/index_block/mod.rs index ac4d64c5..e1e00395 100644 --- a/src/super_segment/index_block/mod.rs +++ b/src/super_segment/index_block/mod.rs @@ -3,13 +3,16 @@ // (found in the LICENSE-* files in the repository) mod block_handle; +mod forward_reader; pub use 
block_handle::{NewBlockHandle, NewKeyedBlockHandle}; +use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; use super::{ block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer}, Block, }; +use crate::super_segment::block::TRAILER_START_MARKER; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Seek}; use varint_rs::VarintReader; @@ -34,13 +37,6 @@ pub struct IndexBlock { binary_index_len: u32, } -struct RestartHead { - offset: BlockOffset, - size: u32, - key_start: usize, - key_len: usize, -} - impl IndexBlock { #[must_use] pub fn new(inner: Block) -> Self { @@ -125,21 +121,74 @@ impl IndexBlock { ) } - fn parse_restart_head(cursor: &mut Cursor<&[u8]>, pos: usize) -> RestartHead { - let offset = unwrappy!(cursor.read_u64_varint()); - let size = unwrappy!(cursor.read_u32_varint()); + // TODO: should not return Option<>? + #[must_use] + #[allow(clippy::iter_without_into_iter)] + pub fn forward_reader( + &self, + needle: &[u8], + ) -> Option + '_> { + let offset = self + .search_lowest(&self.get_binary_index_reader(), needle) + .unwrap_or_default(); + + // SAFETY: pos is always retrieved from the binary index, + // which we consider to be trustworthy + #[warn(unsafe_code)] + let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); + + let item = Self::parse_restart_item(&mut cursor, offset)?; - let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - let key_start = pos + cursor.position() as usize; + let key = &self.inner.data[item.end_key.0..item.end_key.1]; - unwrappy!(cursor.seek_relative(key_len as i64)); + if needle > key { + return None; + } - RestartHead { + Some( + ForwardReader::new(self) + .with_offset(offset) + .map(|kv| kv.materialize(&self.inner.data)), + ) + } + + fn parse_restart_item(reader: &mut Cursor<&[u8]>, pos: usize) -> Option { + let marker = unwrappy!(reader.read_u8()); + + if marker == TRAILER_START_MARKER { + return None; + } + + let offset = unwrappy!(reader.read_u64_varint()); + let size = unwrappy!(reader.read_u32_varint()); + + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = pos + reader.position() as usize; + + unwrappy!(reader.seek_relative(key_len as i64)); + + Some(ParsedItem { + prefix: None, + end_key: ParsedSlice(key_start, key_start + key_len), offset: BlockOffset(offset), size, - key_start, - key_len, + }) + } + + fn parse_truncated_item( + reader: &mut Cursor<&[u8]>, + offset: usize, + base_key_offset: usize, + ) -> Option { + let marker = unwrappy!(reader.read_u8()); + + if marker == TRAILER_START_MARKER { + return None; } + + let size = unwrappy!(reader.read_u32_varint()); + + todo!() } fn get_key_at(&self, pos: usize) -> &[u8] { @@ -150,20 +199,9 @@ impl IndexBlock { #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) }); - // TODO: maybe move these behind the key - let _ = unwrappy!(cursor.read_u64_varint()); - let _ = unwrappy!(cursor.read_u32_varint()); - - let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); - let key_start = cursor.position() as usize; + let item = Self::parse_restart_item(&mut cursor, pos).expect("should exist"); - let key_start = pos + key_start; - let key_end = key_start + key_len; - - #[warn(unsafe_code)] - let key = bytes.get(key_start..key_end).expect("should read"); - - key + &bytes[item.end_key.0..item.end_key.1] } /// Search for the lowest block that may possibly contain the needle. 
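+    ///
+    /// A sketch of the lower-bound semantics, using the end keys from the
+    /// tests in this module (`[b"b", b"bcdef", b"def"]`):
+    ///
+    /// ```text
+    /// needle "a"   -> "b"    (first handle)
+    /// needle "c"   -> "def"  (lowest block whose end key >= needle)
+    /// needle "zzz" -> None   (sorts past every end key)
+    /// ```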
@@ -247,36 +285,23 @@ impl IndexBlock { #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); - let item = Self::parse_restart_head(&mut cursor, offset); + let item = Self::parse_restart_item(&mut cursor, offset)?; - let end_key = self - .inner - .data - .slice(item.key_start..(item.key_start + item.key_len)); + let key = &self.inner.data[item.end_key.0..item.end_key.1]; - if needle > end_key { + if needle > key { return None; } - Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) + // TODO: 3.0.0 scan(), delta encoding etc., add test with restart interval > 1 + + Some(item.materialize(&self.inner.data)) } #[must_use] pub fn get_highest_possible_block(&self, needle: &[u8]) -> Option { let binary_index = self.get_binary_index_reader(); - /* - // NOTE: Currently, the hash index is never initialized for index blocks - /* // NOTE: Try hash index if it exists - if let Some(bucket_value) = self - .get_hash_index_reader() - .and_then(|reader| reader.get(key)) - { - let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); - } */ - ) */ - let offset = self.search_highest(&binary_index, needle)?; // SAFETY: pos is always retrieved from the binary index, @@ -284,26 +309,26 @@ impl IndexBlock { #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); - let item = Self::parse_restart_head(&mut cursor, offset); + let item = Self::parse_restart_item(&mut cursor, offset)?; - let end_key = self - .inner - .data - .slice(item.key_start..(item.key_start + item.key_len)); + let key = &self.inner.data[item.end_key.0..item.end_key.1]; - if needle > end_key { + if needle > key { return None; } - Some(NewKeyedBlockHandle::new(end_key, item.offset, item.size)) + Some(item.materialize(&self.inner.data)) } - pub fn encode_items(items: &[NewKeyedBlockHandle]) -> crate::Result> { + pub fn encode_items( + items: &[NewKeyedBlockHandle], + restart_interval: u8, + ) -> crate::Result> { let first_key = items.first().expect("chunk should not be empty").end_key(); let mut serializer = Encoder::<'_, BlockOffset, NewKeyedBlockHandle>::new( items.len(), - 1, // TODO: hard-coded for now + restart_interval, 0.0, // TODO: hard-coded for now first_key, ); @@ -331,7 +356,7 @@ mod tests { NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_items(&items, 1)?; eprintln!("{bytes:?}"); eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ @@ -379,7 +404,7 @@ mod tests { NewKeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_items(&items, 1)?; // eprintln!("{bytes:?}"); // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ @@ -422,7 +447,7 @@ mod tests { NewKeyedBlockHandle::new(b"d".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_items(&items, 1)?; // eprintln!("{bytes:?}"); // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ @@ -468,7 +493,7 @@ mod tests { fn v3_index_block_one() -> crate::Result<()> { let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); - let bytes = 
IndexBlock::encode_items(&[item.clone()])?; + let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; // eprintln!("{bytes:?}"); // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ @@ -508,7 +533,7 @@ mod tests { fn v3_index_block_one_highest() -> crate::Result<()> { let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); - let bytes = IndexBlock::encode_items(&[item.clone()])?; + let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; // eprintln!("{bytes:?}"); // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ diff --git a/src/super_segment/inner.rs b/src/super_segment/inner.rs new file mode 100644 index 00000000..1d45db3d --- /dev/null +++ b/src/super_segment/inner.rs @@ -0,0 +1,66 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{ + block_index::NewBlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, + trailer::Trailer, +}; +use crate::{ + new_cache::NewCache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, + GlobalSegmentId, +}; +use std::{ + path::PathBuf, + sync::{atomic::AtomicBool, Arc}, +}; + +pub struct Inner { + pub path: PathBuf, + + pub(crate) tree_id: TreeId, + + #[doc(hidden)] + pub descriptor_table: Arc, + + /// Segment metadata object + #[doc(hidden)] + pub metadata: ParsedMeta, + + pub(crate) trailer: Trailer, // TODO: remove...? + + /// Translates key (first item of a block) to block offset (address inside file) and (compressed) size + #[doc(hidden)] + pub block_index: Arc, + + /// Block cache + /// + /// Stores index and data blocks + #[doc(hidden)] + pub cache: Arc, + + /// Pinned AMQ filter + pub pinned_filter: Option, + + // /// Pinned filter + // #[doc(hidden)] + // pub bloom_filter: Option, + pub is_deleted: AtomicBool, +} + +impl Drop for Inner { + fn drop(&mut self) { + let global_id: GlobalSegmentId = (self.tree_id, self.metadata.id).into(); + + if self.is_deleted.load(std::sync::atomic::Ordering::Acquire) { + log::trace!("Cleanup deleted segment {global_id:?} at {:?}", self.path); + + if let Err(e) = std::fs::remove_file(&self.path) { + log::warn!( + "Failed to cleanup deleted segment {global_id:?} at {:?}: {e:?}", + self.path, + ); + } + } + } +} diff --git a/src/super_segment/meta.rs b/src/super_segment/meta.rs new file mode 100644 index 00000000..c6e4eb06 --- /dev/null +++ b/src/super_segment/meta.rs @@ -0,0 +1,157 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{trailer::Trailer, Block, DataBlock}; +use crate::{coding::Decode, CompressionType, KeyRange, SegmentId, SeqNo}; +use byteorder::{LittleEndian, ReadBytesExt}; +use std::{fs::File, ops::Deref}; + +/// Nano-second timestamp. 
+pub struct Timestamp(u128);
+
+impl Deref for Timestamp {
+    type Target = u128;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl From<Timestamp> for u128 {
+    fn from(val: Timestamp) -> Self {
+        val.0
+    }
+}
+
+impl From<u128> for Timestamp {
+    fn from(value: u128) -> Self {
+        Self(value)
+    }
+}
+
+pub struct ParsedMeta {
+    pub id: SegmentId,
+    pub created_at: Timestamp,
+    pub data_block_count: u64,
+    pub index_block_count: u64,
+    pub key_range: KeyRange,
+    pub seqnos: (SeqNo, SeqNo),
+    pub file_size: u64,
+    pub item_count: u64,
+
+    pub data_block_compression: CompressionType,
+}
+
+impl ParsedMeta {
+    pub fn from_trailer(file: &File, trailer: &Trailer) -> crate::Result<Self> {
+        let ptr = trailer.metadata;
+        let block = Block::from_file(file, ptr.offset(), ptr.size(), CompressionType::None)?;
+        let block = DataBlock::new(block);
+
+        let id = {
+            let bytes = block
+                .point_read(b"#id", None)
+                .expect("Segment ID should exist");
+
+            let mut bytes = &bytes.value[..];
+            bytes.read_u64::<LittleEndian>()?
+        };
+
+        let created_at = {
+            let bytes = block
+                .point_read(b"#created_at", None)
+                .expect("Segment created_at should exist");
+
+            let mut bytes = &bytes.value[..];
+            bytes.read_u128::<LittleEndian>()?.into()
+        };
+
+        let item_count = {
+            let bytes = block
+                .point_read(b"#item_count", None)
+                .expect("item_count should exist");
+
+            let mut bytes = &bytes.value[..];
+            bytes.read_u64::<LittleEndian>()?
+        };
+
+        let data_block_count = {
+            let bytes = block
+                .point_read(b"#data_block_count", None)
+                .expect("data_block_count should exist");
+
+            let mut bytes = &bytes.value[..];
+            bytes.read_u64::<LittleEndian>()?
+        };
+
+        let index_block_count = {
+            let bytes = block
+                .point_read(b"#index_block_count", None)
+                .expect("index_block_count should exist");
+
+            let mut bytes = &bytes.value[..];
+            bytes.read_u64::<LittleEndian>()?
+        };
+
+        let key_range = KeyRange::new((
+            block
+                .point_read(b"#key#min", None)
+                .expect("key min should exist")
+                .value,
+            block
+                .point_read(b"#key#max", None)
+                .expect("key max should exist")
+                .value,
+        ));
+
+        let seqnos = {
+            let min = {
+                let bytes = block
+                    .point_read(b"#seqno#min", None)
+                    .expect("seqno min should exist")
+                    .value;
+                let mut bytes = &bytes[..];
+                bytes.read_u64::<LittleEndian>()?
+            };
+
+            let max = {
+                let bytes = block
+                    .point_read(b"#seqno#max", None)
+                    .expect("seqno max should exist")
+                    .value;
+                let mut bytes = &bytes[..];
+                bytes.read_u64::<LittleEndian>()?
+            };
+
+            (min, max)
+        };
+
+        let file_size = {
+            let bytes = block.point_read(b"#size", None).expect("size should exist");
+            let mut bytes = &bytes.value[..];
+            bytes.read_u64::<LittleEndian>()?
+        };
+
+        let data_block_compression = {
+            let bytes = block
+                .point_read(b"#compression#data", None)
+                .expect("data compression should exist");
+
+            let mut bytes = &bytes.value[..];
+            CompressionType::decode_from(&mut bytes)?
+ }; + + Ok(Self { + id, + created_at, + data_block_count, + index_block_count, + key_range, + seqnos, + file_size, + item_count, + data_block_compression, + }) + } +} diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 6cbf0a50..2b703442 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -3,13 +3,530 @@ // (found in the LICENSE-* files in the repository) pub mod block; +mod block_index; pub(crate) mod data_block; +mod filter; mod index_block; +mod inner; +mod meta; +pub(crate) mod multi_writer; +mod scanner; mod trailer; pub(crate) mod util; mod writer; -pub use block::Block; +pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; -pub use index_block::IndexBlock; +pub use index_block::{IndexBlock, NewKeyedBlockHandle}; +pub use scanner::Scanner; pub use writer::Writer; + +use crate::{ + bloom::CompositeHash, new_cache::NewCache, new_descriptor_table::NewDescriptorTable, + CompressionType, GlobalSegmentId, InternalValue, SegmentId, SeqNo, TreeId, UserKey, +}; +use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; +use filter::standard_bloom::StandardBloomFilter; +use index_block::NewBlockHandle; +use inner::Inner; +use meta::ParsedMeta; +use std::{ + ops::{Bound, RangeBounds}, + path::Path, + sync::{atomic::AtomicBool, Arc}, +}; + +#[allow(clippy::module_name_repetitions)] +pub type SegmentInner = Inner; + +/// Disk segment (a.k.a. `SSTable`, `SST`, `sorted string table`) that is located on disk +/// +/// A segment is an immutable list of key-value pairs, split into compressed blocks. +/// A reference to the block (`block handle`) is saved in the "block index". +/// +/// Deleted entries are represented by tombstones. +/// +/// Segments can be merged together to improve read performance and reduce disk space by removing outdated item versions. +#[doc(alias("sstable", "sst", "sorted string table"))] +#[derive(Clone)] +pub struct Segment(Arc); + +impl From for Segment { + fn from(value: Inner) -> Self { + Self(Arc::new(value)) + } +} + +impl std::ops::Deref for Segment { + type Target = Inner; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Debug for Segment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Segment:{}({})", self.id(), self.metadata.key_range) + } +} + +impl Segment { + /// Gets the global segment ID. + #[must_use] + pub fn global_id(&self) -> GlobalSegmentId { + (self.tree_id, self.id()).into() + } + + #[must_use] + pub fn bloom_filter_size(&self) -> usize { + if let Some(pinned_filter) = &self.pinned_filter { + pinned_filter.len() + } else { + // TODO: meta.filter_size + todo!() + } + } + + /// Gets the segment ID. + /// + /// The segment ID is unique for this tree, but not + /// across multiple trees, use [`Segment::global_id`] for that. + #[must_use] + pub fn id(&self) -> SegmentId { + self.metadata.id + } + + fn load_data_block(&self, handle: &NewBlockHandle) -> crate::Result { + let id = self.global_id(); + + if let Some(data_block) = self.cache.get_data_block(id, handle.offset()) { + return Ok(data_block); + } + + let cached_fd = self.descriptor_table.access_for_table(&id); + let cache_miss = cached_fd.is_none(); + + let fd = if let Some(fd) = cached_fd { + fd + } else { + Arc::new(std::fs::File::open(&self.path)?) 
+ }; + + let block = Block::from_file( + &fd, + handle.offset(), + handle.size(), + self.metadata.data_block_compression, + ) + .map(DataBlock::new)?; + + // Cache FD + if cache_miss { + self.descriptor_table.insert_for_table(id, fd); + } + + self.cache + .insert_block(id, handle.offset(), block.inner.clone()); + + Ok(block) + } + + pub fn get( + &self, + key: &[u8], + seqno: Option, + key_hash: CompositeHash, + ) -> crate::Result> { + if let Some(seqno) = seqno { + if self.metadata.seqnos.0 >= seqno { + return Ok(None); + } + } + + if let Some(filter) = &self.pinned_filter { + if !filter.contains_hash(key_hash) { + return Ok(None); + } + } + + self.point_read(key, seqno) + } + + fn point_read(&self, key: &[u8], seqno: Option) -> crate::Result> { + use crate::segment::value_block::CachePolicy; + + match seqno { + None => { + let Some(block_handle) = self + .block_index + .get_lowest_block_containing_key(key, CachePolicy::Write)? + else { + return Ok(None); + }; + + let block = self.load_data_block(block_handle.as_ref())?; + + // NOTE: Fastpath for non-seqno reads + return Ok(block.point_read(key, None)); + } + Some(seqno) => { + let NewBlockIndexImpl::Full(block_index) = &*self.block_index else { + todo!(); + }; + + let Some(iter) = block_index.forward_reader(key) else { + return Ok(None); + }; + + for block_handle in iter { + if block_handle.end_key() < &key { + return Ok(None); + } + + let block = self.load_data_block(block_handle.as_ref())?; + + if let Some(item) = block.point_read(key, Some(seqno)) { + return Ok(Some(item)); + } + } + } + } + + Ok(None) + } + + /// Creates a scanner over the `Segment`. + /// + /// The scanner is ĺogically the same as a normal iter(), + /// however it uses its own file descriptor, does not look into the block cache + /// and uses buffered I/O. + /// + /// Used for compactions and thus not available to a user. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + pub fn scan(&self) -> crate::Result { + let block_count = self + .metadata + .data_block_count + .try_into() + .expect("data block count should fit"); + + Scanner::new( + &self.path, + block_count, + self.metadata.data_block_compression, + ) + } + + /// Creates an iterator over the `Segment`. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[must_use] + #[allow(clippy::iter_without_into_iter)] + #[doc(hidden)] + pub fn iter(&self) -> impl DoubleEndedIterator> + '_ { + // self.range(..) + todo!(); + + std::iter::empty() + } + + /// Creates a ranged iterator over the `Segment`. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[must_use] + #[allow(clippy::iter_without_into_iter)] + #[doc(hidden)] + pub fn range, R: RangeBounds>( + &self, + range: R, + ) -> impl DoubleEndedIterator> + '_ { + // self.range((std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)) + todo!(); + + std::iter::empty() + } + + /// Tries to recover a segment from a file. 
+ pub fn recover( + file_path: &Path, + tree_id: TreeId, + cache: Arc, + descriptor_table: Arc, + ) -> crate::Result { + // use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; + use trailer::Trailer; + + log::debug!("Recovering segment from file {file_path:?}"); + let trailer = Trailer::from_file(file_path)?; + log::trace!("Got trailer: {trailer:#?}"); + + log::debug!("Reading meta block, with meta_ptr={:?}", trailer.metadata); + let metadata = ParsedMeta::from_trailer(&std::fs::File::open(file_path)?, &trailer)?; + + /* assert_eq!( + 0, *trailer.range_tombstones_ptr, + "Range tombstones not supported" + ); */ + + let file = std::fs::File::open(file_path)?; + + let tli_block = { + log::debug!("Reading TLI block, with tli_ptr={:?}", trailer.tli); + + let block = Block::from_file( + &file, + trailer.tli.offset(), + trailer.tli.size(), + metadata.data_block_compression, // TODO: index blocks may get their own compression level + )?; + + IndexBlock::new(block) + }; + + let block_index = if let Some(index_block_handle) = trailer.index_blocks { + log::debug!( + "Creating partitioned block index, with tli_ptr={:?}, index_block_ptr={index_block_handle:?}", + trailer.tli, + ); + todo!(); + // BlockIndexImpl::TwoLevel(tli_block, todo!()) + } else { + log::debug!("Creating full block index, with tli_ptr={:?}", trailer.tli); + NewBlockIndexImpl::Full(NewFullBlockIndex::new(tli_block)) + }; + + /* let block_index = if use_full_block_index { + let block_index = + FullBlockIndex::from_file(file_path, &trailer.metadata, &trailer.offsets)?; + + BlockIndexImpl::Full(block_index) + } else { + let block_index = TwoLevelBlockIndex::from_file( + file_path, + &trailer.metadata, + trailer.offsets.tli_ptr, + (tree_id, trailer.metadata.id).into(), + descriptor_table.clone(), + cache.clone(), + )?; + BlockIndexImpl::TwoLevel(block_index) + }; */ + + let pinned_filter = trailer + .filter + .map(|filter_ptr| { + use crate::coding::Decode; + + log::debug!("Reading filter block for pinning, with filter_ptr={filter_ptr:?}"); + + let block = Block::from_file( + &file, + filter_ptr.offset(), + filter_ptr.size(), + crate::CompressionType::None, // NOTE: We never write a filter block with compression + )?; + + let mut reader = &block.data[..]; + StandardBloomFilter::decode_from(&mut reader).map_err(Into::::into) + }) + .transpose()?; + + descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); + + let segment = Self(Arc::new(Inner { + path: file_path.into(), + tree_id, + + metadata, + trailer, + + cache, + + descriptor_table, + + block_index: Arc::new(block_index), + + pinned_filter, + + is_deleted: AtomicBool::default(), + })); + + Ok(segment) + } + + pub(crate) fn mark_as_deleted(&self) { + self.0 + .is_deleted + .store(true, std::sync::atomic::Ordering::Release); + } + + #[must_use] + pub fn is_key_in_key_range(&self, key: &[u8]) -> bool { + self.metadata.key_range.contains_key(key) + } + + /// Checks if a key range is (partially or fully) contained in this segment. + pub(crate) fn check_key_range_overlap( + &self, + bounds: &(Bound, Bound), + ) -> bool { + self.metadata.key_range.overlaps_with_bounds(bounds) + } + + /// Returns the highest sequence number in the segment. + #[must_use] + pub fn get_highest_seqno(&self) -> SeqNo { + self.metadata.seqnos.1 + } + + /// Returns the amount of tombstone markers in the `Segment`. 
+ #[must_use] + #[doc(hidden)] + pub fn tombstone_count(&self) -> u64 { + todo!() + + // self.metadata.tombstone_count + } + + /// Returns the ratio of tombstone markers in the `Segment`. + #[must_use] + #[doc(hidden)] + pub fn tombstone_ratio(&self) -> f32 { + todo!() + + // self.metadata.tombstone_count as f32 / self.metadata.key_count as f32 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + use test_log::test; + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_recover() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + { + let mut writer = crate::super_segment::Writer::new(file.clone(), 5)?; + writer.write(crate::InternalValue::from_components( + b"abc", + b"asdasdasd", + 3, + crate::ValueType::Value, + ))?; + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + &file, + 0, + Arc::new(NewCache::with_capacity_bytes(1_000_000)), + Arc::new(NewDescriptorTable::new(10)), + )?; + + assert_eq!(5, segment.id()); + assert_eq!(1, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.trailer.index_blocks.is_none(), + "should use full index, so only TLI exists", + ); + + assert_eq!( + b"abc", + &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + ); + assert_eq!( + b"abc", + &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + ); + assert_eq!(None, segment.point_read(b"def", None)?); + + assert_eq!( + segment.metadata.key_range, + crate::KeyRange::new((b"abc".into(), b"abc".into())), + ); + } + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_scan() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + let items = [ + crate::InternalValue::from_components(b"abc", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"def", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"xyz", b"asdasdasd", 3, crate::ValueType::Value), + ]; + + { + let mut writer = crate::super_segment::Writer::new(file.clone(), 5)?; + + for item in items.iter().cloned() { + writer.write(item)?; + } + + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + &file, + 0, + Arc::new(NewCache::with_capacity_bytes(1_000_000)), + Arc::new(NewDescriptorTable::new(10)), + )?; + + assert_eq!(5, segment.id()); + assert_eq!(3, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.trailer.index_blocks.is_none(), + "should use full index, so only TLI exists", + ); + + assert_eq!( + b"abc", + &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + ); + assert_eq!( + b"def", + &*segment.point_read(b"def", None)?.unwrap().key.user_key, + ); + assert_eq!( + b"xyz", + &*segment.point_read(b"xyz", None)?.unwrap().key.user_key, + ); + assert_eq!(None, segment.point_read(b"____", None)?); + + assert_eq!(items, &*segment.scan()?.flatten().collect::>()); + + assert_eq!( + segment.metadata.key_range, + crate::KeyRange::new((b"abc".into(), b"xyz".into())), + ); + } + + Ok(()) + } +} diff --git a/src/super_segment/multi_writer.rs b/src/super_segment/multi_writer.rs new file mode 100644 index 00000000..8d401ebe --- /dev/null +++ b/src/super_segment/multi_writer.rs @@ -0,0 +1,216 @@ +// Copyright (c) 2024-present, 
fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::writer::Writer; +use crate::{value::InternalValue, CompressionType, SegmentId, UserKey}; +use std::{ + path::PathBuf, + sync::{atomic::AtomicU64, Arc}, +}; + +/// Like `Writer` but will rotate to a new segment, once a segment grows larger than `target_size` +/// +/// This results in a sorted "run" of segments +#[allow(clippy::module_name_repetitions)] +pub struct MultiWriter { + base_path: PathBuf, + + data_block_size: u32, + + /// Target size of segments in bytes + /// + /// If a segment reaches the target size, a new one is started, + /// resulting in a sorted "run" of segments + pub target_size: u64, + + // pub opts: Options, + results: Vec, + + segment_id_generator: Arc, + current_segment_id: u64, + + pub writer: Writer, + + pub compression: CompressionType, + + // bloom_policy: BloomConstructionPolicy, + current_key: Option, +} + +impl MultiWriter { + /// Sets up a new `MultiWriter` at the given segments folder + pub fn new( + base_path: PathBuf, + segment_id_generator: Arc, + target_size: u64, + // opts: Options, + ) -> crate::Result { + let current_segment_id = + segment_id_generator.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + let path = base_path.join(current_segment_id.to_string()); + let writer = Writer::new(path, current_segment_id)?; + + /* let writer = Writer::new(Options { + segment_id: current_segment_id, + folder: opts.folder.clone(), + data_block_size: opts.data_block_size, + index_block_size: opts.index_block_size, + })?; */ + + Ok(Self { + base_path, + + data_block_size: 4_096, + + target_size, + results: Vec::with_capacity(10), + // opts, + segment_id_generator, + current_segment_id, + writer, + + compression: CompressionType::None, + + // bloom_policy: BloomConstructionPolicy::default(), + current_key: None, + }) + } + + #[must_use] + pub(crate) fn use_data_block_size(mut self, size: u32) -> Self { + assert!( + size <= 4 * 1_024 * 1_024, + "data block size must be <= 4 MiB", + ); + self.data_block_size = size; + self + } + + #[must_use] + pub fn use_compression(mut self, compression: CompressionType) -> Self { + self.compression = compression; + self.writer = self.writer.use_compression(compression); + self + } + + /* #[must_use] + pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { + self.bloom_policy = bloom_policy; + self.writer = self.writer.use_bloom_policy(bloom_policy); + self + } */ + + fn get_next_segment_id(&mut self) -> u64 { + self.current_segment_id = self + .segment_id_generator + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + self.current_segment_id + } + + /// Flushes the current writer, stores its metadata, and sets up a new writer for the next segment + fn rotate(&mut self) -> crate::Result<()> { + log::debug!("Rotating segment writer"); + + let new_segment_id = self.get_next_segment_id(); + let path = self.base_path.join(new_segment_id.to_string()); + + let new_writer = Writer::new(path, new_segment_id)? + .use_compression(self.compression) + .use_data_block_size(self.data_block_size); + + // new_writer = new_writer.use_bloom_policy(self.bloom_policy); + + let old_writer = std::mem::replace(&mut self.writer, new_writer); + + if let Some(segment_id) = old_writer.finish()? 
{ + self.results.push(segment_id); + } + + Ok(()) + } + + /// Writes an item + pub fn write(&mut self, item: InternalValue) -> crate::Result<()> { + let is_next_key = self.current_key.as_ref() < Some(&item.key.user_key); + + if is_next_key { + self.current_key = Some(item.key.user_key.clone()); + + if *self.writer.meta.file_pos >= self.target_size { + self.rotate()?; + } + } + + self.writer.write(item)?; + + Ok(()) + } + + /// Finishes the last segment, making sure all data is written durably + /// + /// Returns the metadata of created segments + pub fn finish(mut self) -> crate::Result> { + if let Some(last_writer_result) = self.writer.finish()? { + self.results.push(last_writer_result); + } + + Ok(self.results) + } +} + +#[cfg(test)] +mod tests { + use crate::{AbstractTree, Config}; + use test_log::test; + + // NOTE: Tests that versions of the same key stay + // in the same segment even if it needs to be rotated + // This avoids segments' key ranges overlapping + #[test] + fn segment_multi_writer_same_key_norotate() -> crate::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(&folder).open()?; + + tree.insert("a", "a1".repeat(4_000), 0); + tree.insert("a", "a2".repeat(4_000), 1); + tree.insert("a", "a3".repeat(4_000), 2); + tree.insert("a", "a4".repeat(4_000), 3); + tree.insert("a", "a5".repeat(4_000), 4); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.len(None, None)?); + + tree.major_compact(1_024, 0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.len(None, None)?); + + Ok(()) + } + + #[test] + fn segment_multi_writer_same_key_norotate_2() -> crate::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(&folder).open()?; + + tree.insert("a", "a1".repeat(4_000), 0); + tree.insert("a", "a1".repeat(4_000), 1); + tree.insert("a", "a1".repeat(4_000), 2); + tree.insert("b", "a1".repeat(4_000), 0); + tree.insert("c", "a1".repeat(4_000), 0); + tree.insert("c", "a1".repeat(4_000), 1); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(3, tree.len(None, None)?); + + tree.major_compact(1_024, 0)?; + assert_eq!(3, tree.segment_count()); + assert_eq!(3, tree.len(None, None)?); + + Ok(()) + } +} diff --git a/src/super_segment/scanner.rs b/src/super_segment/scanner.rs new file mode 100644 index 00000000..c04f50f3 --- /dev/null +++ b/src/super_segment/scanner.rs @@ -0,0 +1,77 @@ +use super::{Block, DataBlock}; +use crate::{CompressionType, InternalValue}; +use self_cell::self_cell; +use std::{fs::File, io::BufReader, path::Path}; + +type BlockIter<'a> = Box + 'a>; + +self_cell!( + pub struct Iter { + owner: DataBlock, + + #[covariant] + dependent: BlockIter, + } +); + +/// Segment reader that is optimized for consuming an entire segment +pub struct Scanner { + reader: BufReader, + iter: Iter, + + compression: CompressionType, + block_count: usize, + read_count: usize, +} + +impl Scanner { + pub fn new( + path: &Path, + block_count: usize, + compression: CompressionType, + ) -> crate::Result { + // TODO: a larger buffer size may be better for HDD, maybe make this configurable + let mut reader = BufReader::with_capacity(8 * 4_096, File::open(path)?); + + let block = Self::fetch_next_block(&mut reader, compression)?; + let iter = Iter::new(block, |block| Box::new(block.iter())); + + Ok(Self { + reader, + iter, + + compression, + block_count, + read_count: 1, + }) + } + + fn fetch_next_block( + reader: &mut BufReader, + compression: CompressionType, + ) -> 
crate::Result<DataBlock> {
+        Block::from_reader(reader, compression).map(DataBlock::new)
+    }
+}
+
+impl Iterator for Scanner {
+    type Item = crate::Result<InternalValue>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(item) = self.iter.with_dependent_mut(|_, iter| iter.next()) {
+                return Some(Ok(item));
+            }
+
+            if self.read_count >= self.block_count {
+                return None;
+            }
+
+            // Init new block
+            let block = fail_iter!(Self::fetch_next_block(&mut self.reader, self.compression));
+            self.iter = Iter::new(block, |block| Box::new(block.iter()));
+
+            self.read_count += 1;
+        }
+    }
+}
diff --git a/src/super_segment/trailer.rs b/src/super_segment/trailer.rs
index 27f05a4a..8d67f4c2 100644
--- a/src/super_segment/trailer.rs
+++ b/src/super_segment/trailer.rs
@@ -19,9 +19,9 @@ use std::{
 /// ----------------
 /// | data blocks  | <- implicitly start at 0
 /// |--------------|
-/// | index blocks |
+/// | tli          |
 /// |--------------|
-/// | tli block    |
+/// | index block  | <- may not exist (if full block index is used, TLI will be dense)
 /// |--------------|
 /// | filter block | <- may not exist
 /// |--------------|
@@ -33,9 +33,9 @@ use std::{
 /// |--------------|
 #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
 pub struct Trailer {
-    pub index_block: NewBlockHandle,
     pub tli: NewBlockHandle,
-    pub filter: NewBlockHandle,
+    pub index_blocks: Option<NewBlockHandle>,
+    pub filter: Option<NewBlockHandle>, // option
 //
 // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2
 // pub range_tombstones: BlockOffset,
@@ -49,11 +49,11 @@ pub struct Trailer {
 }
 
 impl Trailer {
-    /// Returns the on-disk size
+    /* /// Returns the on-disk size
     #[must_use]
     pub const fn serialized_len() -> usize {
         4 * std::mem::size_of::<u64>()
-    }
+    } */
 
     pub fn write_into<W: Write>(&self, writer: &mut W) -> crate::Result<()> {
         let mut v = Vec::with_capacity(TRAILER_SIZE);
@@ -76,6 +76,7 @@ impl Trailer {
         Ok(())
     }
 
+    // TODO: the trailer is fixed size so we can use read_at?!
    pub fn from_file(path: &Path) -> crate::Result<Self> {
         let file = File::open(path)?;
         let mut reader = BufReader::new(file);
@@ -94,7 +95,6 @@ impl Trailer {
             )));
         }
 
-        debug_assert!(*trailer.index_block.offset() > 0);
         debug_assert!(*trailer.tli.offset() > 0);
         debug_assert!(*trailer.metadata.offset() > 0);
 
@@ -102,27 +102,46 @@ impl Trailer {
     }
 }
 
+// TODO: honestly we could just store the meta offset in trailer, and then just store pointers in meta...
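+//
+// A sketch of the fixed-size layout implied by the encode/decode routines
+// below: four handles are always written, each as (offset, size), and a zero
+// offset stands in for `None` when decoding:
+//
+//   [tli] [index blocks | null] [filter | null] [metadata]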
 impl Encode for Trailer {
     fn encode_into<W: std::io::Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
-        self.index_block.encode_into(writer)?;
         self.tli.encode_into(writer)?;
-        self.filter.encode_into(writer)?;
+
+        if let Some(handle) = &self.index_blocks {
+            handle.encode_into(writer)
+        } else {
+            NewBlockHandle::default().encode_into(writer)
+        }?;
+
+        if let Some(handle) = &self.filter {
+            handle.encode_into(writer)
+        } else {
+            NewBlockHandle::default().encode_into(writer)
+        }?;
+
         self.metadata.encode_into(writer)?;
+
         Ok(())
     }
 }
 
 impl Decode for Trailer {
     fn decode_from<R: std::io::Read>(reader: &mut R) -> Result<Self, DecodeError> {
-        let index_block = NewBlockHandle::decode_from(reader)?;
         let tli = NewBlockHandle::decode_from(reader)?;
+        let index_blocks = NewBlockHandle::decode_from(reader)?;
         let filter = NewBlockHandle::decode_from(reader)?;
         let metadata = NewBlockHandle::decode_from(reader)?;
 
         Ok(Self {
-            index_block,
+            index_blocks: match *index_blocks.offset() {
+                0 => None,
+                _ => Some(index_blocks),
+            },
             tli,
-            filter,
+            filter: match *filter.offset() {
+                0 => None,
+                _ => Some(filter),
+            },
             metadata,
         })
     }
 }
@@ -136,11 +155,11 @@ mod tests {
     use test_log::test;
 
     #[test]
-    fn file_offsets_roundtrip() -> crate::Result<()> {
+    fn v3_file_offsets_roundtrip() -> crate::Result<()> {
         let before = Trailer {
-            index_block: NewBlockHandle::new(BlockOffset(15), 5),
-            tli: NewBlockHandle::new(BlockOffset(20), 5),
-            filter: NewBlockHandle::new(BlockOffset(25), 5),
+            tli: NewBlockHandle::new(BlockOffset(15), 5),
+            index_blocks: Some(NewBlockHandle::new(BlockOffset(20), 5)),
+            filter: Some(NewBlockHandle::new(BlockOffset(25), 5)),
             metadata: NewBlockHandle::new(BlockOffset(30), 5),
         };
 
@@ -154,9 +173,9 @@ mod tests {
         Ok(())
     }
 
-    #[test]
-    fn file_offsets_serialized_len() {
+    /* #[test]
+    fn v3_file_offsets_serialized_len() {
         let buf = Trailer::default().encode_into_vec();
         assert_eq!(Trailer::serialized_len(), buf.len());
-    }
+    } */
 }
diff --git a/src/super_segment/writer/index.rs b/src/super_segment/writer/index.rs
index 0a7397aa..23d66f53 100644
--- a/src/super_segment/writer/index.rs
+++ b/src/super_segment/writer/index.rs
@@ -9,14 +9,95 @@ use crate::{
         index_block::{NewBlockHandle, NewKeyedBlockHandle},
         Block, BlockOffset, IndexBlock,
     },
-    value::UserKey,
-};
-use std::{
-    fs::File,
-    io::{BufWriter, Seek, Write},
 };
 
-pub struct Writer {
+pub trait BlockIndexWriter<W: std::io::Write + std::io::Seek> {
+    /// Registers a data block in the block index.
+    fn register_data_block(&mut self, block_handle: NewKeyedBlockHandle) -> crate::Result<()>;
+
+    /// Writes the block index to a file.
+    ///
+    /// Returns the TLI handle and the (optional) index blocks handle.
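+    ///
+    /// A sketch of the contract, matching `FullIndexWriter` below: a full
+    /// (dense) index writes every data block handle into the TLI and returns
+    /// `(tli_handle, None)`, while a partitioned index would return
+    /// `(tli_handle, Some(index_blocks_handle))`.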
+ fn finish( + &mut self, + block_file_writer: &mut W, + ) -> crate::Result<(NewBlockHandle, Option)>; + + fn use_compression(&mut self, compression: CompressionType); + + fn len(&self) -> usize; +} + +pub struct FullIndexWriter { + compression: CompressionType, + block_handles: Vec, +} + +impl FullIndexWriter { + pub fn new() -> Self { + Self { + compression: CompressionType::None, + block_handles: Vec::new(), + } + } +} + +impl BlockIndexWriter for FullIndexWriter { + fn len(&self) -> usize { + 1 + } + + fn use_compression(&mut self, compression: CompressionType) { + self.compression = compression; + } + + fn register_data_block(&mut self, block_handle: NewKeyedBlockHandle) -> crate::Result<()> { + log::trace!( + "Registering block at {:?} with size {} [end_key={:?}]", + block_handle.offset(), + block_handle.size(), + block_handle.end_key(), + ); + + self.block_handles.push(block_handle); + + Ok(()) + } + + fn finish( + &mut self, + block_file_writer: &mut W, + ) -> crate::Result<(NewBlockHandle, Option)> { + let tli_ptr = BlockOffset(block_file_writer.stream_position()?); + + let bytes = + IndexBlock::encode_items(&self.block_handles, 1 /* TODO: hard coded for now */)?; + + let header = Block::to_writer(block_file_writer, &bytes, self.compression)?; + + // NOTE: We know that blocks never even approach u32 size + #[allow(clippy::cast_possible_truncation)] + let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; + + log::trace!( + "Written top level index, with {} pointers ({bytes_written}B)", + self.block_handles.len(), + ); + + Ok((NewBlockHandle::new(tli_ptr, bytes_written), None)) + } +} + +// TODO: we need 2 index writers (enum dispatch or Box then) +// TODO: -> FullIndexWriter +// TODO: -> PartitionedIndexWriter +// +// FullIndexWriter puts all block handles into the TLI, and sets the index blocks handle to NULL +// PartitionedIndexWriter works as Writer does currently +// +// That way, when index_blocks_handle == 0, TLI is a dense index + +/* pub struct Writer { file_pos: BlockOffset, prev_pos: (BlockOffset, BlockOffset), @@ -56,7 +137,8 @@ impl Writer { } fn write_block(&mut self) -> crate::Result<()> { - let bytes = IndexBlock::encode_items(&self.block_handles)?; + let bytes = + IndexBlock::encode_items(&self.block_handles, 1 /* TODO: hard coded for now */)?; // TODO: prev block offset let _header = Block::to_writer(&mut self.write_buffer, &bytes, self.compression)?; @@ -126,22 +208,22 @@ impl Writer { file_offset: BlockOffset, ) -> crate::Result { block_file_writer.write_all(&self.write_buffer)?; - let tli_ptr = BlockOffset(block_file_writer.stream_position()?); - log::trace!("Wrote index blocks into segment file"); + let tli_ptr = BlockOffset(block_file_writer.stream_position()?); + for item in &mut self.tli_pointers { item.shift(file_offset); } - let bytes = IndexBlock::encode_items(&self.tli_pointers)?; - - let _header = Block::to_writer(&mut self.write_buffer, &bytes, self.compression)?; + let bytes = + IndexBlock::encode_items(&self.tli_pointers, 1 /* TODO: hard coded for now */)?; - let bytes_written = BlockHeader::serialized_len() + bytes.len(); + let _header = Block::to_writer(block_file_writer, &bytes, self.compression)?; - block_file_writer.flush()?; - block_file_writer.get_mut().sync_all()?; + // NOTE: We know that blocks never even approach u32 size + #[allow(clippy::cast_possible_truncation)] + let bytes_written = (BlockHeader::serialized_len() + bytes.len()) as u32; log::trace!( "Written top level index, with {} pointers ({} bytes)", @@ 
-149,7 +231,7 @@ impl Writer { bytes_written, ); - Ok(NewBlockHandle::new(tli_ptr, bytes_written as u32)) + Ok(NewBlockHandle::new(tli_ptr, bytes_written)) } /// Returns the offset in the file to TLI @@ -166,4 +248,4 @@ impl Writer { Ok(tli_handle) } -} +} */ diff --git a/src/super_segment/writer/meta.rs b/src/super_segment/writer/meta.rs index bee160f4..0f2ae2ef 100644 --- a/src/super_segment/writer/meta.rs +++ b/src/super_segment/writer/meta.rs @@ -8,9 +8,6 @@ pub struct Metadata { /// Written data block count pub data_block_count: usize, - /// Written index block count - pub index_block_count: usize, - /// Written item count pub item_count: usize, @@ -44,7 +41,6 @@ impl Default for Metadata { fn default() -> Self { Self { data_block_count: 0, - index_block_count: 0, item_count: 0, tombstone_count: 0, diff --git a/src/super_segment/writer/mod.rs b/src/super_segment/writer/mod.rs index b8fa3066..f68c1d44 100644 --- a/src/super_segment/writer/mod.rs +++ b/src/super_segment/writer/mod.rs @@ -1,12 +1,18 @@ mod index; mod meta; -use super::{block::Header as BlockHeader, trailer::Trailer, Block, BlockOffset, DataBlock}; +use super::{ + block::Header as BlockHeader, trailer::Trailer, Block, BlockOffset, DataBlock, + NewKeyedBlockHandle, +}; use crate::{ - coding::Encode, file::fsync_directory, super_segment::index_block::NewBlockHandle, + coding::Encode, + file::fsync_directory, + super_segment::{filter::standard_bloom::Builder, index_block::NewBlockHandle}, + time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, }; -use index::Writer as IndexWriter; +use index::{BlockIndexWriter, FullIndexWriter}; use std::{ fs::File, io::{BufWriter, Seek, Write}, @@ -16,7 +22,7 @@ use std::{ /// Serializes and compresses values into blocks and writes them to disk as segment pub struct Writer { /// Segment file - path: PathBuf, + pub(crate) path: PathBuf, segment_id: SegmentId, @@ -30,7 +36,7 @@ pub struct Writer { block_writer: BufWriter, /// Writer of index blocks - index_writer: IndexWriter, + index_writer: Box>>, /// Buffer of KVs chunk: Vec, @@ -42,12 +48,12 @@ pub struct Writer { prev_pos: (BlockOffset, BlockOffset), current_key: Option, - // bloom_policy: BloomConstructionPolicy, - // /// Hashes for bloom filter - // /// - // /// using enhanced double hashing, so we got two u64s - // bloom_hash_buffer: Vec<(u64, u64)>, + // bloom_policy: BloomConstructionPolicy, + /// Hashes for bloom filter + /// + /// using enhanced double hashing, so we got two u64s + bloom_hash_buffer: Vec<(u64, u64)>, } impl Writer { @@ -66,7 +72,7 @@ impl Writer { path: std::path::absolute(path)?, - index_writer: IndexWriter::new(4_096 /* TODO: hard coded for now */), + index_writer: Box::new(FullIndexWriter::new()), block_writer, chunk: Vec::new(), @@ -76,15 +82,25 @@ impl Writer { chunk_size: 0, current_key: None, + + bloom_hash_buffer: Vec::new(), }) } - // TODO: data_block_size setter + #[must_use] + pub(crate) fn use_data_block_size(mut self, size: u32) -> Self { + assert!( + size <= 4 * 1_024 * 1_024, + "data block size must be <= 4 MiB", + ); + self.data_block_size = size; + self + } #[must_use] pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { self.compression = compression; - self.index_writer = self.index_writer.use_compression(compression); + self.index_writer.use_compression(compression); self } @@ -105,13 +121,14 @@ impl Writer { self.meta.key_count += 1; self.current_key = Some(item.key.user_key.clone()); - // TODO: - // // IMPORTANT: Do not buffer *every* item's key 
- // // because there may be multiple versions - // // of the same key - // if self.bloom_policy.is_active() { - // self.bloom_hash_buffer - // .push(BloomFilter::get_hash(&item.key.user_key)); + // IMPORTANT: Do not buffer *every* item's key + // because there may be multiple versions + // of the same key + + // TODO: policy + //if self.bloom_policy.is_active() { + self.bloom_hash_buffer + .push(Builder::get_hash(&item.key.user_key)); // } } @@ -144,20 +161,21 @@ impl Writer { return Ok(()); }; - let bytes = DataBlock::encode_items(&self.chunk, 16, 0.75)?; + let bytes = DataBlock::encode_items(&self.chunk, 16, 1.33)?; // TODO: prev block offset let header = Block::to_writer(&mut self.block_writer, &bytes, self.compression)?; self.meta.uncompressed_size += u64::from(header.uncompressed_length); - let bytes_written = (BlockHeader::serialized_len() + bytes.len()) as u32; + let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - self.index_writer.register_block( - last.key.user_key.clone(), - self.meta.file_pos, - bytes_written, - )?; + self.index_writer + .register_data_block(NewKeyedBlockHandle::new( + last.key.user_key.clone(), + self.meta.file_pos, + bytes_written, + ))?; // Adjust metadata self.meta.file_pos += bytes_written as u64; @@ -191,7 +209,7 @@ impl Writer { } /// Finishes the segment, making sure all data is written durably - pub fn finish(mut self) -> crate::Result> { + pub fn finish(mut self) -> crate::Result> { self.spill_block()?; // No items written! Just delete segment file and return nothing @@ -200,47 +218,52 @@ impl Writer { return Ok(None); } - let index_block_start = BlockOffset(self.block_writer.stream_position()?); - // // Append index blocks to file - let tli_handle = self.index_writer.finish(&mut self.block_writer)?; + let (tli_handle, index_blocks_handle) = self.index_writer.finish(&mut self.block_writer)?; + log::trace!("tli_ptr={tli_handle:?}"); + log::trace!("index_blocks_ptr={index_blocks_handle:?}"); - let index_block_handle = NewBlockHandle::new( - index_block_start, - (*tli_handle.offset() - *index_block_start) as u32, - ); + // Write filter + let filter_handle = { + if self.bloom_hash_buffer.is_empty() { + None + } else { + let filter_ptr = self.block_writer.stream_position()?; + let n = self.bloom_hash_buffer.len(); - self.meta.index_block_count = self.index_writer.block_count; + // TODO: + /* log::trace!( + "Constructing Bloom filter with {n} entries: {:?}", + self.bloom_policy, + ); */ - // // Write bloom filter - // let bloom_ptr = { - // if self.bloom_hash_buffer.is_empty() { - // BlockOffset(0) - // } else { - // let bloom_ptr = self.block_writer.stream_position()?; - // let n = self.bloom_hash_buffer.len(); + let start = std::time::Instant::now(); - // log::trace!( - // "Constructing Bloom filter with {n} entries: {:?}", - // self.bloom_policy, - // ); + // let mut filter = self.bloom_policy.build(n); - // let start = std::time::Instant::now(); + let filter = { + let mut builder = Builder::with_bpk(n, 10); - // let mut filter = self.bloom_policy.build(n); + for hash in std::mem::take(&mut self.bloom_hash_buffer) { + builder.set_with_hash(hash); + } - // for hash in std::mem::take(&mut self.bloom_hash_buffer) { - // filter.set_with_hash(hash); - // } + builder.build() + }; - // log::trace!("Built Bloom filter in {:?}", start.elapsed()); + log::trace!("Built Bloom filter in {:?}", start.elapsed()); - // filter.encode_into(&mut self.block_writer)?; + let bytes = filter.encode_into_vec(); - // BlockOffset(bloom_ptr) - // } - // 
}; - // log::trace!("bloom_ptr={bloom_ptr}"); + let block = + Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + + let bytes_written = (BlockHeader::serialized_len() as u32) + block.data_length; + + Some(NewBlockHandle::new(BlockOffset(filter_ptr), bytes_written)) + } + }; + log::trace!("filter_ptr={filter_handle:?}"); // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - Write range filter // let rf_ptr = BlockOffset(0); @@ -263,16 +286,19 @@ impl Writer { } let meta_items = [ + meta("#compression#data", &self.compression.encode_into_vec()), + meta("#created_at", &unix_timestamp().as_nanos().to_le_bytes()), meta( "#data_block_count", - &self.meta.data_block_count.to_le_bytes(), + &(self.meta.data_block_count as u64).to_le_bytes(), ), + meta("#hash_type", b"xxh3"), meta("#id", &self.segment_id.to_le_bytes()), meta( "#index_block_count", - &self.meta.index_block_count.to_le_bytes(), + &(self.index_writer.len() as u64).to_le_bytes(), ), - meta("#item_count", &self.meta.item_count.to_le_bytes()), + meta("#item_count", &(self.meta.item_count as u64).to_le_bytes()), meta( "#key#max", self.meta.last_key.as_ref().expect("should exist"), @@ -281,17 +307,21 @@ impl Writer { "#key#min", self.meta.first_key.as_ref().expect("should exist"), ), - meta("#key_count", &self.meta.key_count.to_le_bytes()), + meta("#key_count", &(self.meta.key_count as u64).to_le_bytes()), meta("#seqno#max", &self.meta.highest_seqno.to_le_bytes()), meta("#seqno#min", &self.meta.lowest_seqno.to_le_bytes()), meta("#size", &self.meta.file_pos.to_le_bytes()), - meta("#tombstone_count", &self.meta.tombstone_count.to_le_bytes()), + meta( + "#tombstone_count", + &(self.meta.tombstone_count as u64).to_le_bytes(), + ), meta( "#user_data_size", &self.meta.uncompressed_size.to_le_bytes(), ), - meta("version#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), - meta("version#table", b"3.0"), + meta("v#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), + meta("v#table", b"3.0"), + // TODO: tli_handle_count ]; #[cfg(debug_assertions)] @@ -310,18 +340,18 @@ impl Writer { // TODO: no binary index let bytes = DataBlock::encode_items(&meta_items, 1, 0.0)?; - let _header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + let header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; - let bytes_written = BlockHeader::serialized_len() + bytes.len(); + let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; NewBlockHandle::new(metadata_start, bytes_written as u32) }; // Bundle all the file offsets let trailer = Trailer { - index_block: index_block_handle, tli: tli_handle, - filter: NewBlockHandle::default(), + index_blocks: None, + filter: filter_handle, metadata: metadata_handle, /* range_filter:range_filter_ptr: rf:rf_ptr, range_tombstones:range_tombstones_ptr, @@ -350,6 +380,6 @@ impl Writer { *self.meta.file_pos / 1_024 / 1_024, ); - Ok(Some(trailer)) + Ok(Some(self.segment_id)) } } diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 6ad6c931..f1c81a83 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -5,13 +5,10 @@ use super::Tree; use crate::{ file::SEGMENTS_FOLDER, - segment::{block_index::BlockIndexImpl, multi_writer::MultiWriter, SegmentInner}, - AbstractTree, Segment, UserKey, UserValue, ValueType, -}; -use std::{ - path::PathBuf, - sync::{atomic::AtomicBool, Arc}, + super_segment::{multi_writer::MultiWriter, Segment}, + AbstractTree, UserKey, UserValue, ValueType, }; +use std::{path::PathBuf, sync::Arc}; pub 
struct Ingestion<'a> { folder: PathBuf, @@ -30,19 +27,20 @@ impl<'a> Ingestion<'a> { let folder = tree.config.path.join(SEGMENTS_FOLDER); log::debug!("Ingesting into disk segments in {folder:?}"); - let mut writer = MultiWriter::new( + let writer = MultiWriter::new( + folder.clone(), tree.segment_id_counter.clone(), 128 * 1_024 * 1_024, - crate::segment::writer::Options { + /* crate::segment::writer::Options { folder: folder.clone(), data_block_size: tree.config.data_block_size, index_block_size: tree.config.index_block_size, segment_id: 0, /* TODO: unused */ - }, + }, */ )? .use_compression(tree.config.compression); - { + /* { use crate::segment::writer::BloomConstructionPolicy; if tree.config.bloom_bits_per_key >= 0 { @@ -52,7 +50,7 @@ impl<'a> Ingestion<'a> { } else { writer = writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); } - } + } */ Ok(Self { folder, @@ -71,16 +69,27 @@ impl<'a> Ingestion<'a> { } pub fn finish(self) -> crate::Result<()> { - use crate::{ - compaction::MoveDown, segment::block_index::two_level_index::TwoLevelBlockIndex, - }; + use crate::compaction::MoveDown; let results = self.writer.finish()?; + log::info!("Finished ingestion writer"); + let created_segments = results .into_iter() - .map(|trailer| -> crate::Result { - let segment_id = trailer.metadata.id; + .map(|segment_id| -> crate::Result { + let segment_file_path = self.folder.join(segment_id.to_string()); + + Segment::recover( + &segment_file_path, + self.tree.id, + self.tree.config.cache.clone(), + self.tree.config.descriptor_table.clone(), + ) + + // todo!() + + /* let segment_id = trailer.metadata.id; let segment_file_path = self.folder.join(segment_id.to_string()); let block_index = TwoLevelBlockIndex::from_file( @@ -114,7 +123,7 @@ impl<'a> Ingestion<'a> { path: segment_file_path, is_deleted: AtomicBool::default(), } - .into()) + .into()) */ }) .collect::>>()?; @@ -122,14 +131,14 @@ impl<'a> Ingestion<'a> { self.tree.compact(Arc::new(MoveDown(0, 6)), 0)?; - for segment in &created_segments { + /* for segment in &created_segments { let segment_file_path = self.folder.join(segment.id().to_string()); self.tree .config .descriptor_table .insert(&segment_file_path, segment.global_id()); - } + } */ Ok(()) } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index c84be184..3aaefd0b 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -6,29 +6,24 @@ pub(crate) mod ingest; pub mod inner; use crate::{ - cache::Cache, coding::{Decode, Encode}, compaction::CompactionStrategy, config::Config, - descriptor_table::FileDescriptorTable, level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, - segment::{ - block_index::{full_index::FullBlockIndex, BlockIndexImpl}, - meta::TableType, - Segment, SegmentInner, - }, + segment::meta::TableType, + super_segment::Segment, value::InternalValue, version::Version, - AbstractTree, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, ValueType, + AbstractTree, KvPair, NewCache, NewDescriptorTable, SegmentId, SeqNo, Snapshot, UserKey, + UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ io::Cursor, ops::RangeBounds, path::Path, - sync::atomic::AtomicBool, sync::{atomic::AtomicU64, Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; @@ -133,7 +128,7 @@ impl AbstractTree for Tree { .read() .expect("lock is poisoned") .iter() - .map(super::segment::Segment::bloom_filter_size) + .map(Segment::bloom_filter_size) .sum() } @@ -144,7 +139,7 @@ impl AbstractTree for Tree { .len() } - fn verify(&self) -> crate::Result 
{ + /* fn verify(&self) -> crate::Result { // NOTE: Lock memtable to prevent any tampering with disk segments let _lock = self.lock_active_memtable(); @@ -159,7 +154,7 @@ impl AbstractTree for Tree { } Ok(sum) - } + } */ fn keys( &self, @@ -184,9 +179,7 @@ impl AbstractTree for Tree { seqno_threshold: SeqNo, ) -> crate::Result> { use crate::{ - compaction::stream::CompactionStream, - file::SEGMENTS_FOLDER, - segment::writer::{Options, Writer}, + compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, super_segment::Writer, }; use std::time::Instant; @@ -195,15 +188,20 @@ impl AbstractTree for Tree { let folder = self.config.path.join(SEGMENTS_FOLDER); log::debug!("writing segment to {folder:?}"); - let mut segment_writer = Writer::new(Options { + let mut segment_writer = Writer::new( + folder.join(segment_id.to_string()), segment_id, - folder, - data_block_size: self.config.data_block_size, - index_block_size: self.config.index_block_size, - })? - .use_compression(self.config.compression); - - { + /* Options { + segment_id, + folder, + data_block_size: self.config.data_block_size, + index_block_size: self.config.index_block_size, + } */ + )? + .use_compression(self.config.compression) + .use_data_block_size(self.config.data_block_size); + + /* { use crate::segment::writer::BloomConstructionPolicy; if self.config.bloom_bits_per_key >= 0 { @@ -213,7 +211,7 @@ impl AbstractTree for Tree { segment_writer = segment_writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); } - } + } */ let iter = memtable.iter().map(Ok); let compaction_filter = CompactionStream::new(iter, seqno_threshold); @@ -393,10 +391,7 @@ impl AbstractTree for Tree { fn get_highest_persisted_seqno(&self) -> Option { let levels = self.levels.read().expect("lock is poisoned"); - levels - .iter() - .map(super::segment::Segment::get_highest_seqno) - .max() + levels.iter().map(Segment::get_highest_seqno).max() } fn snapshot(&self, seqno: SeqNo) -> Snapshot { @@ -492,19 +487,18 @@ impl Tree { pub(crate) fn consume_writer( &self, - segment_id: SegmentId, - mut writer: crate::segment::writer::Writer, + segment_id: SegmentId, // TODO: <- remove + writer: crate::super_segment::Writer, ) -> crate::Result> { - let segment_folder = writer.opts.folder.clone(); - let segment_file_path = segment_folder.join(segment_id.to_string()); + let segment_file_path = writer.path.to_path_buf(); - let Some(trailer) = writer.finish()? else { + let Some(_) = writer.finish()? 
else { return Ok(None); }; - log::debug!("Finalized segment write at {segment_folder:?}"); + log::debug!("Finalized segment write at {segment_file_path:?}"); - let block_index = + /* let block_index = FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; let block_index = Arc::new(BlockIndexImpl::Full(block_index)); @@ -524,13 +518,20 @@ impl Tree { is_deleted: AtomicBool::default(), } - .into(); + .into(); */ + + /* self.config + .descriptor_table + .insert(segment_file_path, created_segment.global_id()); */ - self.config - .descriptor_table - .insert(segment_file_path, created_segment.global_id()); + let created_segment = Segment::recover( + &segment_file_path, + self.id, + self.config.cache.clone(), + self.config.descriptor_table.clone(), + )?; - log::debug!("Flushed segment to {segment_folder:?}"); + log::debug!("Flushed segment to {segment_file_path:?}"); Ok(Some(created_segment)) } @@ -881,8 +882,8 @@ impl Tree { fn recover_levels>( tree_path: P, tree_id: TreeId, - cache: &Arc, - descriptor_table: &Arc, + cache: &Arc, + descriptor_table: &Arc, ) -> crate::Result { use crate::{ file::fsync_directory, @@ -951,11 +952,9 @@ impl Tree { tree_id, cache.clone(), descriptor_table.clone(), - level_idx == 0 || level_idx == 1, + // level_idx == 0 || level_idx == 1, )?; - descriptor_table.insert(&segment_file_path, segment.global_id()); - segments.push(segment); log::debug!("Recovered segment from {segment_file_path:?}"); diff --git a/src/value.rs b/src/value.rs index 5bc10a83..4a08e58c 100644 --- a/src/value.rs +++ b/src/value.rs @@ -224,7 +224,6 @@ impl Decode for InternalValue { #[cfg(test)] mod tests { use super::*; - use std::io::Cursor; use test_log::test; #[test] @@ -241,7 +240,7 @@ mod tests { assert!(a > b); } - #[test] + /* #[test] fn value_raw() -> crate::Result<()> { // Create an empty Value instance let value = @@ -251,13 +250,13 @@ mod tests { let bytes = [ // Seqno 1, - + // Type 0, // User key 3, 1, 2, 3, - + // User value 3, 3, 2, 1, ]; @@ -269,7 +268,7 @@ mod tests { assert_eq!(value, deserialized); Ok(()) - } + } */ #[test] fn value_empty_value() -> crate::Result<()> { diff --git a/tests/open_files.rs b/tests/open_files.rs index 24600046..3b867b47 100644 --- a/tests/open_files.rs +++ b/tests/open_files.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Cache, Config}; +use lsm_tree::{AbstractTree, Config, NewCache}; use std::sync::Arc; use test_log::test; @@ -9,7 +9,7 @@ fn open_file_limit() -> lsm_tree::Result<()> { let folder = tempfile::tempdir_in(".test_open_files")?; let tree = Config::new(folder) - .use_cache(Arc::new(Cache::with_capacity_bytes(0))) + .use_cache(Arc::new(NewCache::with_capacity_bytes(0))) .open()?; for _ in 0..2_048 { diff --git a/tests/tree_iter_lifetime.rs b/tests/tree_iter_lifetime.rs index 655de9d3..2fd663ba 100644 --- a/tests/tree_iter_lifetime.rs +++ b/tests/tree_iter_lifetime.rs @@ -15,9 +15,10 @@ fn iterrr( Ok(tree.iter(None, None)) } -#[test] +// TODO: 3.0.0 compiler error +/* #[test] fn tree_iter_lifetime() -> lsm_tree::Result<()> { let folder = tempfile::tempdir().unwrap(); assert_eq!(100, iterrr(folder.path())?.count()); Ok(()) -} +} */ From 345079e071cd04e268402b96709393a3e2d42c3e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:40:46 +0200 Subject: [PATCH 065/613] remove old segment module --- src/abstract.rs | 8 +- src/blob_tree/mod.rs | 8 +- src/bloom/bit_array.rs | 84 -- src/bloom/mod.rs | 391 --------- src/cache.rs | 225 ------ src/compaction/fifo.rs | 3 +- src/compaction/leveled.rs | 3 +- 
src/compaction/maintenance.rs | 6 +- src/compaction/mod.rs | 2 +- src/compaction/tiered.rs | 4 +- src/compaction/worker.rs | 3 +- src/{segment/meta => }/compression.rs | 0 src/config.rs | 15 +- src/descriptor_table/lru.rs | 35 - src/descriptor_table/mod.rs | 282 ------- src/level_manifest/hidden_set.rs | 3 +- src/level_manifest/level.rs | 6 +- src/level_manifest/mod.rs | 10 +- src/level_reader.rs | 10 +- src/lib.rs | 18 +- src/manifest.rs | 13 +- src/memtable/mod.rs | 14 +- src/new_cache.rs | 19 +- src/range.rs | 5 +- src/segment/block/checksum.rs | 30 - src/segment/block/header.rs | 155 ---- src/segment/block/mod.rs | 225 ------ src/segment/block/offset.rs | 29 - src/segment/block_index/block_handle.rs | 110 --- src/segment/block_index/full_index.rs | 85 -- src/segment/block_index/mod.rs | 395 --------- src/segment/block_index/top_level.rs | 78 -- src/segment/block_index/two_level_index.rs | 239 ------ src/segment/block_index/writer.rs | 172 ---- src/segment/file_offsets.rs | 103 --- src/segment/forward_reader.rs | 133 --- src/segment/inner.rs | 59 -- src/segment/meta/mod.rs | 282 ------- src/segment/meta/table_type.rs | 27 - src/segment/mod.rs | 490 ----------- src/segment/multi_writer.rs | 201 ----- src/segment/range.rs | 764 ------------------ src/segment/reader.rs | 270 ------- src/segment/scanner.rs | 53 -- src/segment/trailer.rs | 83 -- src/segment/value_block.rs | 161 ---- src/segment/value_block_consumer.rs | 310 ------- src/segment/writer/meta.rs | 61 -- src/segment/writer/mod.rs | 574 ------------- src/super_segment/block_index/mod.rs | 3 +- src/super_segment/data_block/mod.rs | 8 +- .../filter/standard_bloom/builder.rs | 7 +- .../filter/standard_bloom/mod.rs | 15 +- src/{segment => super_segment}/id.rs | 3 +- src/super_segment/mod.rs | 31 +- src/super_segment/trailer.rs | 3 +- src/super_segment/writer/index.rs | 2 +- src/super_segment/writer/mod.rs | 226 ++++++ src/tree/inner.rs | 2 +- src/tree/mod.rs | 17 +- src/value.rs | 10 - 61 files changed, 335 insertions(+), 6248 deletions(-) delete mode 100644 src/bloom/bit_array.rs delete mode 100644 src/bloom/mod.rs delete mode 100644 src/cache.rs rename src/{segment/meta => }/compression.rs (100%) delete mode 100644 src/descriptor_table/lru.rs delete mode 100644 src/descriptor_table/mod.rs delete mode 100644 src/segment/block/checksum.rs delete mode 100644 src/segment/block/header.rs delete mode 100644 src/segment/block/mod.rs delete mode 100644 src/segment/block/offset.rs delete mode 100644 src/segment/block_index/block_handle.rs delete mode 100644 src/segment/block_index/full_index.rs delete mode 100644 src/segment/block_index/mod.rs delete mode 100644 src/segment/block_index/top_level.rs delete mode 100644 src/segment/block_index/two_level_index.rs delete mode 100644 src/segment/block_index/writer.rs delete mode 100644 src/segment/file_offsets.rs delete mode 100644 src/segment/forward_reader.rs delete mode 100644 src/segment/inner.rs delete mode 100644 src/segment/meta/mod.rs delete mode 100644 src/segment/meta/table_type.rs delete mode 100644 src/segment/mod.rs delete mode 100644 src/segment/multi_writer.rs delete mode 100644 src/segment/range.rs delete mode 100644 src/segment/reader.rs delete mode 100644 src/segment/scanner.rs delete mode 100644 src/segment/trailer.rs delete mode 100644 src/segment/value_block.rs delete mode 100644 src/segment/value_block_consumer.rs delete mode 100644 src/segment/writer/meta.rs delete mode 100644 src/segment/writer/mod.rs rename src/{segment => super_segment}/id.rs (96%) diff --git 
a/src/abstract.rs b/src/abstract.rs index 5b6e88e3..babd4647 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -118,7 +118,7 @@ pub trait AbstractTree { /// Returns the approximate size of the active memtable in bytes. /// /// May be used to flush the memtable if it grows too large. - fn active_memtable_size(&self) -> u32; + fn active_memtable_size(&self) -> u64; /// Returns the tree type. fn tree_type(&self) -> TreeType; @@ -550,7 +550,7 @@ pub trait AbstractTree { key: K, value: V, seqno: SeqNo, - ) -> (u32, u32); + ) -> (u64, u64); /// Removes an item from the tree. /// @@ -579,7 +579,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32); + fn remove>(&self, key: K, seqno: SeqNo) -> (u64, u64); /// Removes an item from the tree. /// @@ -613,5 +613,5 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32); + fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u64, u64); } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 2a968f84..de36bd3d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -523,7 +523,7 @@ impl AbstractTree for BlobTree { self.index.get_highest_seqno() } - fn active_memtable_size(&self) -> u32 { + fn active_memtable_size(&self) -> u64 { self.index.active_memtable_size() } @@ -612,7 +612,7 @@ impl AbstractTree for BlobTree { key: K, value: V, seqno: SeqNo, - ) -> (u32, u32) { + ) -> (u64, u64) { use value::MaybeInlineValue; // NOTE: Initially, we always write an inline value @@ -652,11 +652,11 @@ impl AbstractTree for BlobTree { } } - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove>(&self, key: K, seqno: SeqNo) -> (u64, u64) { self.index.remove(key, seqno) } - fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u64, u64) { self.index.remove_weak(key, seqno) } } diff --git a/src/bloom/bit_array.rs b/src/bloom/bit_array.rs deleted file mode 100644 index 3eae750a..00000000 --- a/src/bloom/bit_array.rs +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -const BIT_MASK: u8 = 0b1000_0000_u8; - -/// Gets a bit from the byte. -fn get_bit(byte: u8, idx: usize) -> bool { - let bit_mask = BIT_MASK >> idx; - let masked = byte & bit_mask; - masked > 0 -} - -/// Enables the given bit in the byte. -fn enable_bit(byte: u8, idx: usize) -> u8 { - let bit_mask = BIT_MASK >> idx; - byte | bit_mask -} - -/// Fixed-size bit array -#[derive(Debug, Eq, PartialEq)] -pub struct BitArray(Box<[u8]>); - -impl BitArray { - /// Creates a new bit array with the given size in bytes. - #[must_use] - pub fn with_capacity(bytes: usize) -> Self { - let vec = vec![0; bytes]; - Self(vec.into_boxed_slice()) - } - - /// Treats the given byte array as bit array. - #[must_use] - pub fn from_bytes(bytes: Box<[u8]>) -> Self { - Self(bytes) - } - - /// Returns the inner data. - #[must_use] - pub fn bytes(&self) -> &[u8] { - &self.0 - } - - /// Sets the i-th bit to `true`. - pub fn enable(&mut self, idx: usize) { - let byte_idx = idx / 8; - let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); - - let bit_idx: usize = idx % 8; - *byte = enable_bit(*byte, bit_idx); - } - - /// Gets the i-th bit. 
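// [annotation, not part of the patch] Worked example of the MSB-first
// indexing used by `BitArray` above: `enable(9)` touches byte_idx = 9 / 8 = 1
// and bit_idx = 9 % 8 = 1, so byte 1 becomes `byte | (0b1000_0000 >> 1)`,
// i.e. `byte | 0b0100_0000`; `get(9)` tests the same mask. This matches the
// `bit_set_true` / `bit_set_get` assertions below.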
- #[must_use] - pub fn get(&self, idx: usize) -> bool { - let byte_idx = idx / 8; - let byte = self.0.get(byte_idx).expect("should be in bounds"); - - let bit_idx = idx % 8; - get_bit(*byte, bit_idx) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn bit_set_true() { - assert_eq!(0b0000_0010, enable_bit(0, 6)); - assert_eq!(0b1000_0000, enable_bit(0, 0)); - assert_eq!(0b0100_0000, enable_bit(0, 1)); - assert_eq!(0b0100_0110, enable_bit(0b0000_0110, 1)); - } - - #[test] - fn bit_set_get() { - assert!(!get_bit(0b0100_0110, 0)); - assert!(get_bit(0b0100_0110, 1)); - assert!(get_bit(0b0100_0110, 6)); - assert!(!get_bit(0b0100_0110, 7)); - } -} diff --git a/src/bloom/mod.rs b/src/bloom/mod.rs deleted file mode 100644 index 68091a45..00000000 --- a/src/bloom/mod.rs +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -mod bit_array; - -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, -}; -use bit_array::BitArray; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use std::io::{Read, Write}; - -/// Two hashes that are used for double hashing -pub type CompositeHash = (u64, u64); - -/// A standard bloom filter -/// -/// Allows buffering the key hashes before actual filter construction -/// which is needed to properly calculate the filter size, as the amount of items -/// are unknown during segment construction. -/// -/// The filter uses double hashing instead of `k` hash functions, see: -/// -#[derive(Debug, Eq, PartialEq)] -#[allow(clippy::module_name_repetitions)] -pub struct BloomFilter { - /// Raw bytes exposed as bit array - inner: BitArray, - - /// Bit count - m: usize, - - /// Number of hash functions - k: usize, -} - -impl Encode for BloomFilter { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - // Write header - writer.write_all(&MAGIC_BYTES)?; - - // NOTE: Filter type - writer.write_u8(0)?; - - // NOTE: Hash type (unused) - writer.write_u8(0)?; - - writer.write_u64::(self.m as u64)?; - writer.write_u64::(self.k as u64)?; - writer.write_all(self.inner.bytes())?; - - Ok(()) - } -} - -impl Decode for BloomFilter { - fn decode_from(reader: &mut R) -> Result { - // Check header - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(DecodeError::InvalidHeader("BloomFilter")); - } - - // NOTE: Filter type - let filter_type = reader.read_u8()?; - assert_eq!(0, filter_type, "Invalid filter type"); - - // NOTE: Hash type (unused) - let hash_type = reader.read_u8()?; - assert_eq!(0, hash_type, "Invalid bloom hash type"); - - let m = reader.read_u64::()? as usize; - let k = reader.read_u64::()? as usize; - - let mut bytes = vec![0; m / 8]; - reader.read_exact(&mut bytes)?; - - Ok(Self::from_raw(m, k, bytes.into_boxed_slice())) - } -} - -#[allow(clippy::len_without_is_empty)] -impl BloomFilter { - /// Returns the size of the bloom filter in bytes. - #[must_use] - pub fn len(&self) -> usize { - self.inner.bytes().len() - } - - /// Returns the amount of hashes used per lookup. 
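// [annotation, not part of the patch] A minimal sketch of the on-disk size
// implied by the `Encode`/`Decode` pair above; the 4-byte MAGIC_BYTES length
// is an assumption taken from the block header test vector later in this
// patch, everything else follows the writes directly:
//
// fn bloom_serialized_len(m: usize) -> usize {
//     4           // MAGIC_BYTES (assumed to be 4 bytes)
//         + 1 + 1 // filter type tag + hash type tag, one u8 each
//         + 8 + 8 // `m` and `k`, each written as a u64
//         + m / 8 // the bit array itself (`m` is kept a multiple of 8)
// }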
- #[must_use] - pub fn hash_fn_count(&self) -> usize { - self.k - } - - fn from_raw(m: usize, k: usize, bytes: Box<[u8]>) -> Self { - Self { - inner: BitArray::from_bytes(bytes), - m, - k, - } - } - - /// Constructs a bloom filter that can hold `n` items - /// while maintaining a certain false positive rate `fpr`. - #[must_use] - pub fn with_fp_rate(n: usize, fpr: f32) -> Self { - use std::f32::consts::LN_2; - - assert!(n > 0); - - // NOTE: Some sensible minimum - let fpr = fpr.max(0.000_001); - - let m = Self::calculate_m(n, fpr); - let bpk = m / n; - let k = (((bpk as f32) * LN_2) as usize).max(1); - - Self { - inner: BitArray::with_capacity(m / 8), - m, - k, - } - } - - /// Constructs a bloom filter that can hold `n` items - /// with `bpk` bits per key. - /// - /// 10 bits per key is a sensible default. - #[must_use] - pub fn with_bpk(n: usize, bpk: u8) -> Self { - use std::f32::consts::LN_2; - - assert!(bpk > 0); - assert!(n > 0); - - let bpk = bpk as usize; - - let m = n * bpk; - let k = (((bpk as f32) * LN_2) as usize).max(1); - - // NOTE: Round up so we don't get too little bits - let bytes = (m as f32 / 8.0).ceil() as usize; - - Self { - inner: BitArray::with_capacity(bytes), - m: bytes * 8, - k, - } - } - - fn calculate_m(n: usize, fp_rate: f32) -> usize { - use std::f32::consts::LN_2; - - let n = n as f32; - let ln2_squared = LN_2.powi(2); - - let numerator = n * fp_rate.ln(); - let m = -(numerator / ln2_squared); - - // Round up to next byte - ((m / 8.0).ceil() * 8.0) as usize - } - - /// Returns `true` if the hash may be contained. - /// - /// Will never have a false negative. - #[must_use] - pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { - for i in 0..(self.k as u64) { - let idx = h1 % (self.m as u64); - - // NOTE: should be in bounds because of modulo - #[allow(clippy::expect_used)] - if !self.has_bit(idx as usize) { - return false; - } - - h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); - } - - true - } - - /// Returns `true` if the item may be contained. - /// - /// Will never have a false negative. - #[must_use] - pub fn contains(&self, key: &[u8]) -> bool { - self.contains_hash(Self::get_hash(key)) - } - - /// Adds the key to the filter. - pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { - for i in 0..(self.k as u64) { - let idx = h1 % (self.m as u64); - - self.enable_bit(idx as usize); - - h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); - } - } - - /// Returns `true` if the bit at `idx` is `1`. - fn has_bit(&self, idx: usize) -> bool { - self.inner.get(idx) - } - - /// Sets the bit at the given index to `true`. - fn enable_bit(&mut self, idx: usize) { - self.inner.enable(idx); - } - - /// Gets the hash of a key. 
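// [annotation, not part of the patch] The probe sequence implemented by
// `contains_hash` / `set_with_hash` above is enhanced double hashing: probe i
// reads bit `h1 % m`, then advances h1 += h2 and h2 += i (wrapping). A
// standalone sketch of the index generation, mirroring the deleted code:
//
// fn probe_indices(mut h1: u64, mut h2: u64, k: u64, m: u64) -> impl Iterator<Item = u64> {
//     (0..k).map(move |i| {
//         let idx = h1 % m;           // bit index for this probe
//         h1 = h1.wrapping_add(h2);   // advance the probe sequence
//         h2 = h2.wrapping_add(i);    // perturb the stride per iteration
//         idx
//     })
// }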
- #[must_use] - pub fn get_hash(key: &[u8]) -> CompositeHash { - let h0 = xxhash_rust::xxh3::xxh3_128(key); - let h1 = (h0 >> 64) as u64; - let h2 = h0 as u64; - (h1, h2) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs::File; - use test_log::test; - - #[test] - fn bloom_serde_round_trip() -> crate::Result<()> { - let dir = tempfile::tempdir()?; - - let path = dir.path().join("bf"); - let mut file = File::create(&path)?; - - let mut filter = BloomFilter::with_fp_rate(10, 0.0001); - - let keys = &[ - b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", - b"item8", b"item9", - ]; - - for key in keys { - filter.set_with_hash(BloomFilter::get_hash(*key)); - } - - for key in keys { - assert!(filter.contains(&**key)); - } - assert!(!filter.contains(b"asdasads")); - assert!(!filter.contains(b"item10")); - assert!(!filter.contains(b"cxycxycxy")); - - filter.encode_into(&mut file)?; - file.sync_all()?; - drop(file); - - let mut file = File::open(&path)?; - let filter_copy = BloomFilter::decode_from(&mut file)?; - - assert_eq!(filter, filter_copy); - - for key in keys { - assert!(filter.contains(&**key)); - } - assert!(!filter_copy.contains(b"asdasads")); - assert!(!filter_copy.contains(b"item10")); - assert!(!filter_copy.contains(b"cxycxycxy")); - - Ok(()) - } - - #[test] - fn bloom_calculate_m() { - assert_eq!(9_592, BloomFilter::calculate_m(1_000, 0.01)); - assert_eq!(4_800, BloomFilter::calculate_m(1_000, 0.1)); - assert_eq!(4_792_536, BloomFilter::calculate_m(1_000_000, 0.1)); - } - - #[test] - fn bloom_basic() { - let mut filter = BloomFilter::with_fp_rate(10, 0.0001); - - for key in [ - b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", - b"item8", b"item9", - ] { - assert!(!filter.contains(key)); - filter.set_with_hash(BloomFilter::get_hash(key)); - assert!(filter.contains(key)); - - assert!(!filter.contains(b"asdasdasdasdasdasdasd")); - } - } - - #[test] - fn bloom_bpk() { - let item_count = 1_000; - let bpk = 5; - - let mut filter = BloomFilter::with_bpk(item_count, bpk); - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - filter.set_with_hash(BloomFilter::get_hash(key)); - assert!(filter.contains(key)); - } - - let mut false_positives = 0; - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - if filter.contains(key) { - false_positives += 1; - } - } - - #[allow(clippy::cast_precision_loss)] - let fpr = false_positives as f32 / item_count as f32; - assert!(fpr < 0.13); - } - - #[test] - fn bloom_fpr() { - let item_count = 100_000; - let wanted_fpr = 0.1; - - let mut filter = BloomFilter::with_fp_rate(item_count, wanted_fpr); - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - filter.set_with_hash(BloomFilter::get_hash(key)); - assert!(filter.contains(key)); - } - - let mut false_positives = 0; - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - if filter.contains(key) { - false_positives += 1; - } - } - - #[allow(clippy::cast_precision_loss)] - let fpr = false_positives as f32 / item_count as f32; - assert!(fpr > 0.05); - assert!(fpr < 0.13); - } - - #[test] - fn bloom_fpr_2() { - let item_count = 100_000; - let wanted_fpr = 0.5; - - let mut filter = BloomFilter::with_fp_rate(item_count, wanted_fpr); - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - filter.set_with_hash(BloomFilter::get_hash(key)); - 
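// [annotation, not part of the patch] Sanity check for this test's bound,
// using the standard Bloom estimate fpr ~= (1 - e^(-k / bpk))^k (an
// assumption, not stated in the code): bpk = 5 gives k = floor(5 * ln 2) = 3,
// so fpr ~= (1 - e^(-0.6))^3 ~= 0.09, consistent with `assert!(fpr < 0.13)`.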
assert!(filter.contains(key)); - } - - let mut false_positives = 0; - - for key in (0..item_count).map(|_| nanoid::nanoid!()) { - let key = key.as_bytes(); - - if filter.contains(key) { - false_positives += 1; - } - } - - #[allow(clippy::cast_precision_loss)] - let fpr = false_positives as f32 / item_count as f32; - assert!(fpr > 0.45); - assert!(fpr < 0.55); - } -} diff --git a/src/cache.rs b/src/cache.rs deleted file mode 100644 index 0a606fba..00000000 --- a/src/cache.rs +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::segment::block::offset::BlockOffset; -use crate::segment::id::GlobalSegmentId; -use crate::segment::{block_index::IndexBlock, value_block::ValueBlock}; -use crate::UserValue; -use quick_cache::Weighter; -use quick_cache::{sync::Cache as QuickCache, Equivalent}; -use std::sync::Arc; - -const TAG_BLOCK: u8 = 0; -const TAG_BLOB: u8 = 1; - -#[derive(Clone)] -enum Item { - DataBlock(Arc), - IndexBlock(Arc), - Blob(UserValue), -} - -#[derive(Eq, std::hash::Hash, PartialEq)] -struct CacheKey(u8, u64, u64, u64); - -impl Equivalent for (u8, u64, u64, u64) { - fn equivalent(&self, key: &CacheKey) -> bool { - self.0 == key.0 && self.1 == key.1 && self.2 == key.2 && self.3 == key.3 - } -} - -impl From<(u8, u64, u64, u64)> for CacheKey { - fn from((tag, root_id, segment_id, offset): (u8, u64, u64, u64)) -> Self { - Self(tag, root_id, segment_id, offset) - } -} - -#[derive(Clone)] -struct BlockWeighter; - -impl Weighter for BlockWeighter { - fn weight(&self, _: &CacheKey, block: &Item) -> u64 { - #[allow(clippy::cast_possible_truncation)] - match block { - Item::DataBlock(block) => block.header.uncompressed_length.into(), - Item::IndexBlock(block) => block.header.uncompressed_length.into(), - Item::Blob(blob) => blob.len() as u64, - } - } -} - -/// Cache, in which blocks or blobs are cached in-memory -/// after being retrieved from disk -/// -/// This speeds up consecutive queries to nearby data, improving -/// read performance for hot data. -/// -/// # Examples -/// -/// Sharing cache between multiple trees -/// -/// ``` -/// # use lsm_tree::{Tree, Config, Cache}; -/// # use std::sync::Arc; -/// # -/// // Provide 40 MB of cache capacity -/// let cache = Arc::new(Cache::with_capacity_bytes(40 * 1_000 * 1_000)); -/// -/// # let folder = tempfile::tempdir()?; -/// let tree1 = Config::new(folder).use_cache(cache.clone()).open()?; -/// # let folder = tempfile::tempdir()?; -/// let tree2 = Config::new(folder).use_cache(cache.clone()).open()?; -/// # -/// # Ok::<(), lsm_tree::Error>(()) -/// ``` -pub struct Cache { - // NOTE: rustc_hash performed best: https://fjall-rs.github.io/post/fjall-2-1 - /// Concurrent cache implementation - data: QuickCache, - - /// Capacity in bytes - capacity: u64, -} - -impl Cache { - /// Creates a new block cache with roughly `n` bytes of capacity. - #[must_use] - pub fn with_capacity_bytes(bytes: u64) -> Self { - use quick_cache::sync::DefaultLifecycle; - - #[allow(clippy::default_trait_access)] - let quick_cache = QuickCache::with( - 100_000, - bytes, - BlockWeighter, - Default::default(), - DefaultLifecycle::default(), - ); - - Self { - data: quick_cache, - capacity: bytes, - } - } - - /// Returns the amount of cached bytes. - #[must_use] - pub fn size(&self) -> u64 { - self.data.weight() - } - - /// Returns the cache capacity in bytes. 
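// [annotation, not part of the patch] Note on the deleted cache above: keys
// are (tag, tree_id, segment_id, offset) tuples, so a single map serves data
// blocks, index blocks and blobs; the u8 tag separates block entries from
// blob entries, and the `Item` variant distinguishes data from index blocks.
// `BlockWeighter` charges each entry its uncompressed byte size, so
// `with_capacity_bytes` is a byte budget rather than an entry count.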
- #[must_use] - pub fn capacity(&self) -> u64 { - self.capacity - } - - /// Returns the number of cached blocks. - #[must_use] - pub fn len(&self) -> usize { - self.data.len() - } - - /// Returns `true` if there are no cached blocks. - #[must_use] - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - #[doc(hidden)] - pub fn insert_data_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - value: Arc, - ) { - if self.capacity > 0 { - self.data.insert( - (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), - Item::DataBlock(value), - ); - } - } - - #[doc(hidden)] - pub fn insert_index_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - value: Arc, - ) { - if self.capacity > 0 { - self.data.insert( - (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), - Item::IndexBlock(value), - ); - } - } - - #[doc(hidden)] - #[must_use] - pub fn get_data_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - ) -> Option> { - let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); - - if let Item::DataBlock(block) = self.data.get(&key)? { - Some(block) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } - - #[doc(hidden)] - #[must_use] - pub fn get_index_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - ) -> Option> { - let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); - - if let Item::IndexBlock(block) = self.data.get(&key)? { - Some(block) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } - - #[doc(hidden)] - pub fn insert_blob( - &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, - value: UserValue, - ) { - if self.capacity > 0 { - self.data.insert( - (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(), - Item::Blob(value), - ); - } - } - - #[doc(hidden)] - #[must_use] - pub fn get_blob( - &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, - ) -> Option { - let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(); - - if let Item::Blob(blob) = self.data.get(&key)? 
{ - Some(blob) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } -} diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 4cb169ca..079ef29f 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -112,7 +112,7 @@ impl CompactionStrategy for Strategy { } } } - +/* #[cfg(test)] mod tests { use super::Strategy; @@ -276,3 +276,4 @@ mod tests { Ok(()) } } + */ diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index a9fad7e4..09f2f96f 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -375,7 +375,7 @@ impl CompactionStrategy for Strategy { Choice::DoNothing } } - +/* #[cfg(test)] mod tests { use super::{Choice, Strategy}; @@ -680,3 +680,4 @@ mod tests { Ok(()) } } + */ diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 643239d1..987fac29 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -4,8 +4,7 @@ use super::{Choice, CompactionStrategy}; use crate::{ - config::Config, level_manifest::LevelManifest, segment::meta::SegmentId, - super_segment::Segment, HashSet, + config::Config, level_manifest::LevelManifest, super_segment::Segment, HashSet, SegmentId, }; const L0_SEGMENT_CAP: usize = 20; @@ -75,7 +74,7 @@ impl CompactionStrategy for Strategy { } } } - +/* #[cfg(test)] mod tests { use super::*; @@ -205,3 +204,4 @@ mod tests { Ok(()) } } + */ diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index eafdcc99..0cf79ef0 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -18,7 +18,7 @@ pub use fifo::Strategy as Fifo; pub use leveled::Strategy as Leveled; pub use tiered::Strategy as SizeTiered; -use crate::{config::Config, level_manifest::LevelManifest, segment::meta::SegmentId, HashSet}; +use crate::{config::Config, level_manifest::LevelManifest, HashSet, SegmentId}; /// Alias for `Leveled` pub type Levelled = Leveled; diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index b4c4b093..36ccac6d 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -150,12 +150,11 @@ impl CompactionStrategy for Strategy { super::maintenance::Strategy.choose(levels, config) } } - +/* #[cfg(test)] mod tests { use super::Strategy; use crate::{ - bloom::BloomFilter, cache::Cache, compaction::{Choice, CompactionStrategy, Input as CompactionInput}, config::Config, @@ -433,3 +432,4 @@ mod tests { Ok(()) } } + */ diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 73a8f990..92011fd8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -9,11 +9,10 @@ use crate::{ level_manifest::LevelManifest, level_scanner::LevelScanner, merge::Merger, - segment::id::GlobalSegmentId, stop_signal::StopSignal, super_segment::{multi_writer::MultiWriter, Segment}, tree::inner::TreeId, - Config, InternalValue, SegmentId, SeqNo, + Config, GlobalSegmentId, InternalValue, SegmentId, SeqNo, }; use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, diff --git a/src/segment/meta/compression.rs b/src/compression.rs similarity index 100% rename from src/segment/meta/compression.rs rename to src/compression.rs diff --git a/src/config.rs b/src/config.rs index 43dc7435..7f8df802 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,11 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{ - path::absolute_path, - segment::meta::{CompressionType, TableType}, - BlobTree, NewCache, NewDescriptorTable, 
Tree, -}; +use crate::{path::absolute_path, BlobTree, CompressionType, NewCache, NewDescriptorTable, Tree}; use std::{ path::{Path, PathBuf}, sync::Arc, @@ -62,10 +58,9 @@ pub struct Config { /// What type of compression is used for blobs pub blob_compression: CompressionType, - /// Table type (unused) - #[allow(unused)] - pub(crate) table_type: TableType, - + // /// Table type (unused) + // #[allow(unused)] + // pub(crate) table_type: TableType, /// Block size of data blocks pub data_block_size: u32, @@ -110,7 +105,7 @@ impl Default for Config { index_block_size: /* 4 KiB */ 4_096, level_count: 7, tree_type: TreeType::Standard, - table_type: TableType::Block, + // table_type: TableType::Block, compression: CompressionType::None, blob_compression: CompressionType::None, bloom_bits_per_key: 10, diff --git a/src/descriptor_table/lru.rs b/src/descriptor_table/lru.rs deleted file mode 100644 index 252879c7..00000000 --- a/src/descriptor_table/lru.rs +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use std::collections::VecDeque; - -#[derive(Default)] -#[allow(clippy::module_name_repetitions)] -pub struct LruList(VecDeque); - -impl LruList { - #[must_use] - pub fn with_capacity(n: usize) -> Self { - Self(VecDeque::with_capacity(n)) - } - - pub fn remove_by(&mut self, f: impl FnMut(&T) -> bool) { - self.0.retain(f); - } - - pub fn remove(&mut self, item: &T) { - self.remove_by(|x| x != item); - } - - pub fn refresh(&mut self, item: T) { - self.remove(&item); - self.0.push_back(item); - } - - pub fn get_least_recently_used(&mut self) -> Option { - let front = self.0.pop_front()?; - self.0.push_back(front.clone()); - Some(front) - } -} diff --git a/src/descriptor_table/mod.rs b/src/descriptor_table/mod.rs deleted file mode 100644 index 1f1c6a10..00000000 --- a/src/descriptor_table/mod.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -mod lru; - -use crate::{segment::id::GlobalSegmentId, HashMap}; -use lru::LruList; -use std::{ - fs::File, - io::BufReader, - path::PathBuf, - sync::{ - atomic::{AtomicBool, AtomicUsize}, - Arc, Mutex, RwLock, RwLockWriteGuard, - }, -}; - -pub struct FileGuard(Arc); - -impl std::ops::Deref for FileGuard { - type Target = Arc; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Drop for FileGuard { - fn drop(&mut self) { - self.0 - .is_used - .store(false, std::sync::atomic::Ordering::Release); - } -} - -pub struct FileDescriptorWrapper { - pub file: Mutex>, - is_used: AtomicBool, -} - -pub struct FileHandle { - descriptors: RwLock>>, - path: PathBuf, -} - -// TODO: FileDescriptorTable should wrap Arc -// TODO: table should probably use a concurrent hashmap - -pub struct FileDescriptorTableInner { - table: HashMap, - lru: Mutex>, - size: AtomicUsize, -} - -/// The descriptor table caches file descriptors to avoid `fopen()` calls -/// -/// See `TableCache` in `RocksDB`. 
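// [annotation, not part of the patch] How the deleted `LruList` above drives
// eviction here: `refresh` moves an id to the back (most recently used), and
// `get_least_recently_used` pops the front id and re-appends it before
// returning it, so the eviction loop in `access` below cycles through
// candidates instead of repeatedly selecting the same victim.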
-#[doc(alias("table cache"))] -#[allow(clippy::module_name_repetitions)] -pub struct FileDescriptorTable { - inner: RwLock, - concurrency: usize, - limit: usize, -} - -impl FileDescriptorTable { - /// Closes all file descriptors - pub fn clear(&self) { - let mut lock = self.inner.write().expect("lock is poisoned"); - lock.table.clear(); - } - - #[must_use] - pub fn new(limit: usize, concurrency: usize) -> Self { - Self { - inner: RwLock::new(FileDescriptorTableInner { - table: HashMap::with_capacity_and_hasher( - 100, - xxhash_rust::xxh3::Xxh3Builder::new(), - ), - lru: Mutex::new(LruList::with_capacity(100)), - size: AtomicUsize::default(), - }), - concurrency, - limit, - } - } - - /// Number of segments - pub fn len(&self) -> usize { - self.inner.read().expect("lock is poisoned").table.len() - } - - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn size(&self) -> usize { - self.inner - .read() - .expect("lock is poisoned") - .size - .load(std::sync::atomic::Ordering::Acquire) - } - - // TODO: on access, adjust hotness of ID -> lock contention though - pub fn access(&self, id: &GlobalSegmentId) -> crate::Result> { - let lock = self.inner.read().expect("lock is poisoned"); - - let Some(item) = lock.table.get(id) else { - return Ok(None); - }; - - let fd_array = item.descriptors.read().expect("lock is poisoned"); - - if fd_array.is_empty() { - drop(fd_array); - drop(lock); - - let lock = self.inner.write().expect("lock is poisoned"); - let mut lru = lock.lru.lock().expect("lock is poisoned"); - lru.refresh(*id); - - let fd = { - let item = lock.table.get(id).expect("should exist"); - let mut fd_lock = item.descriptors.write().expect("lock is poisoned"); - - for _ in 0..(self.concurrency - 1) { - let fd = Arc::new(FileDescriptorWrapper { - file: Mutex::new(BufReader::new(File::open(&item.path)?)), - is_used: AtomicBool::default(), - }); - fd_lock.push(fd.clone()); - } - - let fd = Arc::new(FileDescriptorWrapper { - file: Mutex::new(BufReader::new(File::open(&item.path)?)), - is_used: AtomicBool::new(true), - }); - fd_lock.push(fd.clone()); - - fd - }; - - let mut size_now = lock - .size - .fetch_add(self.concurrency, std::sync::atomic::Ordering::AcqRel) - + self.concurrency; - - while size_now > self.limit { - if let Some(oldest) = lru.get_least_recently_used() { - if &oldest != id { - if let Some(item) = lock.table.get(&oldest) { - let mut oldest_lock = - item.descriptors.write().expect("lock is poisoned"); - - lock.size - .fetch_sub(oldest_lock.len(), std::sync::atomic::Ordering::Release); - size_now -= oldest_lock.len(); - - oldest_lock.clear(); - }; - } - } else { - break; - } - } - - Ok(Some(FileGuard(fd))) - } else { - loop { - for shard in &*fd_array { - if shard.is_used.compare_exchange( - false, - true, - // TODO: could probably be not SeqCst - std::sync::atomic::Ordering::SeqCst, - std::sync::atomic::Ordering::SeqCst, - ) == Ok(false) - { - return Ok(Some(FileGuard(shard.clone()))); - } - } - } - } - } - - fn inner_insert( - mut lock: RwLockWriteGuard<'_, FileDescriptorTableInner>, - path: PathBuf, - id: GlobalSegmentId, - ) { - lock.table.insert( - id, - FileHandle { - descriptors: RwLock::new(vec![]), - path, - }, - ); - - lock.lru.lock().expect("lock is poisoned").refresh(id); - } - - pub fn insert>(&self, path: P, id: GlobalSegmentId) { - let lock = self.inner.write().expect("lock is poisoned"); - Self::inner_insert(lock, path.into(), id); - } - - pub fn remove(&self, id: GlobalSegmentId) { - let mut lock = self.inner.write().expect("lock is 
poisoned"); - - if let Some(item) = lock.table.remove(&id) { - lock.size.fetch_sub( - item.descriptors.read().expect("lock is poisoned").len(), - std::sync::atomic::Ordering::Release, - ); - } - - lock.lru.lock().expect("lock is poisoned").remove(&id); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn descriptor_table_limit() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - let path = folder.path(); - - File::create(path.join("1"))?; - File::create(path.join("2"))?; - File::create(path.join("3"))?; - - let table = FileDescriptorTable::new(2, 1); - - assert_eq!(0, table.size()); - - table.insert(path.join("1"), (0, 1).into()); - assert_eq!(0, table.size()); - - { - let _ = table.access(&(0, 1).into()); - assert_eq!(1, table.size()); - } - - table.insert(path.join("2"), (0, 2).into()); - - { - assert_eq!(1, table.size()); - let _ = table.access(&(0, 1).into()); - } - - { - let _ = table.access(&(0, 2).into()); - assert_eq!(2, table.size()); - } - - table.insert(path.join("3"), (0, 3).into()); - assert_eq!(2, table.size()); - - { - let _ = table.access(&(0, 3).into()); - assert_eq!(2, table.size()); - } - - table.remove((0, 3).into()); - assert_eq!(1, table.size()); - - table.remove((0, 2).into()); - assert_eq!(0, table.size()); - - let _ = table.access(&(0, 1).into()); - assert_eq!(1, table.size()); - - Ok(()) - } -} diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index a37f7df1..b84a349e 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -1,5 +1,4 @@ -use crate::segment::meta::SegmentId; -use crate::HashSet; +use crate::{HashSet, SegmentId}; /// The hidden set keeps track of which segments are currently being compacted /// diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 90ebc02b..c2f56f6e 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -3,8 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - binary_search::partition_point, segment::meta::SegmentId, super_segment::Segment, HashSet, - KeyRange, UserKey, + binary_search::partition_point, super_segment::Segment, HashSet, KeyRange, SegmentId, UserKey, }; use std::ops::Bound; @@ -241,7 +240,7 @@ impl<'a> DisjointLevel<'a> { Some((lo, hi)) } } - +/* #[cfg(test)] #[allow(clippy::expect_used)] mod tests { @@ -534,3 +533,4 @@ mod tests { ); } } + */ diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 9f4e3ee2..559d352a 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -11,7 +11,7 @@ use crate::{ super_segment::Segment, HashMap, HashSet, KeyRange, SegmentId, }; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use hidden_set::HiddenSet; use level::Level; use std::{ @@ -162,10 +162,10 @@ impl LevelManifest { for _ in 0..level_count { let mut level = vec![]; - let segment_count = level_manifest.read_u32::()?; + let segment_count = level_manifest.read_u32::()?; for _ in 0..segment_count { - let id = level_manifest.read_u64::()?; + let id = level_manifest.read_u64::()?; level.push(id); } @@ -451,10 +451,10 @@ impl<'a> Encode for Runs<'a> { for level in self.iter() { // NOTE: "Truncation" is OK, because there are never 4 billion segments in a tree, I hope #[allow(clippy::cast_possible_truncation)] - writer.write_u32::(level.segments.len() as u32)?; + writer.write_u32::(level.segments.len() as u32)?; for segment in &level.segments { - 
writer.write_u64::(segment.id())?; + writer.write_u64::(segment.id())?; } } diff --git a/src/level_reader.rs b/src/level_reader.rs index 2104dd75..506cd241 100644 --- a/src/level_reader.rs +++ b/src/level_reader.rs @@ -2,11 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{ - level_manifest::level::Level, - segment::{range::Range, value_block::CachePolicy}, - InternalValue, UserKey, -}; +use crate::{level_manifest::level::Level, super_segment::CachePolicy, InternalValue, UserKey}; use std::{ops::Bound, sync::Arc}; /// Reads through a disjoint level @@ -14,8 +10,8 @@ pub struct LevelReader { segments: Arc, lo: usize, hi: usize, - lo_reader: Option, - hi_reader: Option, + lo_reader: Option<()>, // TODO: range + hi_reader: Option<()>, // TODO: range cache_policy: CachePolicy, } diff --git a/src/lib.rs b/src/lib.rs index 3d73582c..2aee3af7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -120,8 +120,6 @@ macro_rules! fail_iter { }; } -// TODO: 3.0.0 change everything to LittleEndian? - mod any_tree; mod r#abstract; @@ -132,18 +130,11 @@ pub mod binary_search; #[doc(hidden)] pub mod blob_tree; -mod cache; - -#[doc(hidden)] -pub mod bloom; - mod clipping_iter; pub mod compaction; +mod compression; mod config; -#[doc(hidden)] -pub mod descriptor_table; - mod error; // mod export; @@ -180,9 +171,6 @@ mod path; #[doc(hidden)] pub mod range; -#[doc(hidden)] -pub mod segment; - mod seqno; mod snapshot; mod windows; @@ -214,20 +202,20 @@ pub use { merge::BoxedIterator, new_cache::NewCache, new_descriptor_table::NewDescriptorTable, - segment::{block::checksum::Checksum, id::GlobalSegmentId, meta::SegmentId}, + super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, value::InternalValue, }; pub use { coding::{DecodeError, EncodeError}, + compression::CompressionType, config::{Config, TreeType}, error::{Error, Result}, memtable::Memtable, new_cache::NewCache as Cache, // <- TODO: rename new_descriptor_table::NewDescriptorTable as DescriptorTable, r#abstract::AbstractTree, - segment::{meta::CompressionType, Segment}, seqno::SequenceNumberCounter, snapshot::Snapshot, tree::Tree, diff --git a/src/manifest.rs b/src/manifest.rs index c2dc6fa3..5e8132b7 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -5,7 +5,6 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, - segment::meta::TableType, TreeType, Version, }; use byteorder::{ReadBytesExt, WriteBytesExt}; @@ -14,7 +13,7 @@ use std::io::Write; pub struct Manifest { pub(crate) version: Version, pub(crate) tree_type: TreeType, - pub(crate) table_type: TableType, + // pub(crate) table_type: TableType, pub(crate) level_count: u8, } @@ -22,7 +21,7 @@ impl Encode for Manifest { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { writer.write_all(&MAGIC_BYTES)?; writer.write_u8(self.tree_type.into())?; - writer.write_u8(self.table_type.into())?; + // writer.write_u8(self.table_type.into())?; writer.write_u8(self.level_count)?; Ok(()) } @@ -42,7 +41,7 @@ impl Decode for Manifest { let version = Version::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; let tree_type = reader.read_u8()?; - let table_type = reader.read_u8()?; + // let table_type = reader.read_u8()?; let level_count = reader.read_u8()?; Ok(Self { @@ -51,9 +50,9 @@ impl Decode for Manifest { tree_type: tree_type .try_into() .map_err(|()| DecodeError::InvalidTag(("TreeType", tree_type)))?, - table_type: table_type - 
.try_into() - .map_err(|()| DecodeError::InvalidTag(("TableType", table_type)))?, + // table_type: table_type + // .try_into() + // .map_err(|()| DecodeError::InvalidTag(("TableType", table_type)))?, }) } } diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index f8ba5a6a..d4377e5e 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -3,11 +3,10 @@ // (found in the LICENSE-* files in the repository) use crate::key::InternalKey; -use crate::segment::block::ItemSize; use crate::value::{InternalValue, SeqNo, UserValue, ValueType}; use crossbeam_skiplist::SkipMap; use std::ops::RangeBounds; -use std::sync::atomic::{AtomicU32, AtomicU64}; +use std::sync::atomic::AtomicU64; /// The memtable serves as an intermediary, ephemeral, sorted storage for new items /// @@ -21,7 +20,7 @@ pub struct Memtable { /// Approximate active memtable size. /// /// If this grows too large, a flush is triggered. - pub(crate) approximate_size: AtomicU32, + pub(crate) approximate_size: AtomicU64, /// Highest encountered sequence number. /// @@ -103,7 +102,7 @@ impl Memtable { } /// Gets approximate size of memtable in bytes. - pub fn size(&self) -> u32 { + pub fn size(&self) -> u64 { self.approximate_size .load(std::sync::atomic::Ordering::Acquire) } @@ -121,10 +120,13 @@ impl Memtable { /// Inserts an item into the memtable #[doc(hidden)] - pub fn insert(&self, item: InternalValue) -> (u32, u32) { + pub fn insert(&self, item: InternalValue) -> (u64, u64) { // NOTE: We know values are limited to 32-bit length #[allow(clippy::cast_possible_truncation)] - let item_size = item.size() as u32; + let item_size = + { item.key.user_key.len() + item.value.len() + std::mem::size_of::() } + .try_into() + .expect("should fit into u64"); let size_before = self .approximate_size diff --git a/src/new_cache.rs b/src/new_cache.rs index 5e06fa6e..7a472f95 100644 --- a/src/new_cache.rs +++ b/src/new_cache.rs @@ -2,9 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::segment::id::GlobalSegmentId; use crate::super_segment::block::Header; use crate::super_segment::{Block, BlockOffset, DataBlock}; +use crate::GlobalSegmentId; use quick_cache::Weighter; use quick_cache::{sync::Cache as QuickCache, Equivalent}; @@ -150,23 +150,6 @@ impl NewCache { ); } */ - /* #[doc(hidden)] - #[must_use] - pub fn get_data_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - ) -> Option> { - let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); - - if let Item::DataBlock(block) = self.data.get(&key)? 
{ - Some(block) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } */ - /* #[doc(hidden)] #[must_use] pub fn get_index_block( diff --git a/src/range.rs b/src/range.rs index 3af25f42..4cbd52b6 100644 --- a/src/range.rs +++ b/src/range.rs @@ -3,14 +3,11 @@ // (found in the LICENSE-* files in the repository) use crate::{ - key::InternalKey, level_manifest::{level::Level, LevelManifest}, level_reader::LevelReader, memtable::Memtable, - merge::{BoxedIterator, Merger}, multi_reader::MultiReader, - mvcc_stream::MvccStream, - segment::value_block::CachePolicy, + super_segment::CachePolicy, value::{SeqNo, UserKey}, InternalValue, }; diff --git a/src/segment/block/checksum.rs b/src/segment/block/checksum.rs deleted file mode 100644 index 5b3f0edc..00000000 --- a/src/segment/block/checksum.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use xxhash_rust::xxh3::xxh3_64; - -/// A checksum based on xxh3 -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct Checksum(u64); - -impl std::ops::Deref for Checksum { - type Target = u64; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Checksum { - #[must_use] - pub fn from_raw(value: u64) -> Self { - Self(value) - } - - /// Calculates a checksum. - #[must_use] - pub fn from_bytes(bytes: &[u8]) -> Self { - Self(xxh3_64(bytes)) - } -} diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs deleted file mode 100644 index 6c1bbe0b..00000000 --- a/src/segment/block/header.rs +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{checksum::Checksum, offset::BlockOffset}; -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, - segment::meta::CompressionType, -}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use std::io::{Read, Write}; - -/// Header of a disk-based block -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub struct Header { - /// Compression type used - pub compression: CompressionType, // TODO: 3.0.0 store in segment meta instead? 
- - /// Checksum value to verify integrity of data - pub checksum: Checksum, - - /// File offset of previous block - only used for data blocks - pub previous_block_offset: BlockOffset, - - /// On-disk size of data segment - pub data_length: u32, - - /// Uncompressed size of data segment - pub uncompressed_length: u32, -} - -impl Header { - #[must_use] - pub const fn serialized_len() -> usize { - MAGIC_BYTES.len() - // NOTE: Compression is 2 bytes - + std::mem::size_of::() - + std::mem::size_of::() - // Checksum - + std::mem::size_of::() - // Backlink - + std::mem::size_of::() - // Data length - + std::mem::size_of::() - // Uncompressed data length - + std::mem::size_of::() - } -} - -impl Encode for Header { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - // Write header - writer.write_all(&MAGIC_BYTES)?; - - self.compression.encode_into(writer)?; - - // Write checksum - writer.write_u64::(*self.checksum)?; - - // Write prev offset - writer.write_u64::(*self.previous_block_offset)?; - - // Write data length - writer.write_u32::(self.data_length)?; - - // Write uncompressed data length - writer.write_u32::(self.uncompressed_length)?; - - Ok(()) - } -} - -impl Decode for Header { - fn decode_from(reader: &mut R) -> Result { - // Check header - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(DecodeError::InvalidHeader("Block")); - } - - let compression = CompressionType::decode_from(reader)?; - - // Read checksum - let checksum = reader.read_u64::()?; - - // Read prev offset - let previous_block_offset = reader.read_u64::()?; - - // Read data length - let data_length = reader.read_u32::()?; - - // Read data length - let uncompressed_length = reader.read_u32::()?; - - Ok(Self { - compression, - checksum: Checksum::from_raw(checksum), - previous_block_offset: BlockOffset(previous_block_offset), - data_length, - uncompressed_length, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::io::Cursor; - use test_log::test; - - #[test] - fn block_header_raw() -> crate::Result<()> { - let header = Header { - compression: CompressionType::None, - checksum: Checksum::from_raw(4), - previous_block_offset: BlockOffset(2), - data_length: 15, - uncompressed_length: 15, - }; - - #[rustfmt::skip] - let bytes = &[ - // Header - b'L', b'S', b'M', 3, - - // Compression - 0, 0, - - // Checksum - 0, 0, 0, 0, 0, 0, 0, 4, - - // Backlink - 0, 0, 0, 0, 0, 0, 0, 2, - - // Data length - 0, 0, 0, 0x0F, - - // Uncompressed length - 0, 0, 0, 0x0F, - ]; - - // Deserialize the empty Value - let deserialized = Header::decode_from(&mut Cursor::new(bytes))?; - - // Check if deserialized Value is equivalent to the original empty Value - assert_eq!(header, deserialized); - - assert_eq!(Header::serialized_len(), bytes.len()); - - Ok(()) - } -} diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs deleted file mode 100644 index ee1513db..00000000 --- a/src/segment/block/mod.rs +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -pub mod checksum; -pub mod header; -pub mod offset; - -use super::meta::CompressionType; -use crate::coding::{Decode, Encode}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use checksum::Checksum; -use header::Header as BlockHeader; -use offset::BlockOffset; -use std::io::{Cursor, Read}; - -// TODO: better name -pub trait 
ItemSize { - fn size(&self) -> usize; -} - -impl ItemSize for [T] { - fn size(&self) -> usize { - self.iter().map(ItemSize::size).sum() - } -} - -/// A disk-based block -/// -/// A block is split into its header and a blob of data. -/// The data blob may be compressed. -/// -/// \[ header \] -/// \[ data \] -/// -/// The integrity of a block can be checked using the checksum value that is saved in its header. -#[derive(Clone, Debug)] -pub struct Block { - pub header: BlockHeader, - pub items: Box<[T]>, -} - -impl Block { - pub fn from_reader(reader: &mut R) -> crate::Result { - // Read block header - let header = BlockHeader::decode_from(reader)?; - log::trace!("Got block header: {header:?}"); - - // Read the (possibly compressed) data - let mut bytes = vec![0u8; header.data_length as usize]; - reader.read_exact(&mut bytes)?; - - let bytes = match header.compression { - super::meta::CompressionType::None => bytes, - - #[cfg(feature = "lz4")] - super::meta::CompressionType::Lz4 => lz4_flex::decompress_size_prepended(&bytes) - .map_err(|_| crate::Error::Decompress(header.compression))?, - - #[cfg(feature = "miniz")] - super::meta::CompressionType::Miniz(_) => { - miniz_oxide::inflate::decompress_to_vec(&bytes) - .map_err(|_| crate::Error::Decompress(header.compression))? - } - }; - let mut bytes = Cursor::new(bytes); - - // Read number of items - let item_count = bytes.read_u32::()? as usize; - - // Deserialize each value - let mut items = Vec::with_capacity(item_count); - for _ in 0..item_count { - items.push(T::decode_from(&mut bytes)?); - } - - Ok(Self { - header, - items: items.into_boxed_slice(), - }) - } - - pub fn from_file( - reader: &mut R, - offset: BlockOffset, - ) -> crate::Result { - reader.seek(std::io::SeekFrom::Start(*offset))?; - Self::from_reader(reader) - } - - pub fn to_bytes_compressed( - items: &[T], - previous_block_offset: BlockOffset, - compression: CompressionType, - ) -> crate::Result<(BlockHeader, Vec)> { - let packed = Self::pack_items(items, compression)?; - let checksum = Checksum::from_bytes(&packed); - - let header = BlockHeader { - checksum, - compression, - previous_block_offset, - - // NOTE: Truncation is OK because block size is max 512 KiB - #[allow(clippy::cast_possible_truncation)] - data_length: packed.len() as u32, - - // NOTE: Truncation is OK because a block cannot possible contain 4 billion items - #[allow(clippy::cast_possible_truncation)] - uncompressed_length: items.size() as u32, - }; - - Ok((header, packed)) - } - - fn pack_items(items: &[T], compression: CompressionType) -> crate::Result> { - let mut buf = Vec::with_capacity(u16::MAX.into()); - - // NOTE: There cannot be 4 billion items in a block - #[allow(clippy::cast_possible_truncation)] - buf.write_u32::(items.len() as u32)?; - - // Serialize each value - for value in items { - value.encode_into(&mut buf)?; - } - - Ok(match compression { - CompressionType::None => buf, - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => lz4_flex::compress_prepend_size(&buf), - - #[cfg(feature = "miniz")] - CompressionType::Miniz(level) => miniz_oxide::deflate::compress_to_vec(&buf, level), - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - segment::value_block::ValueBlock, - value::{InternalValue, ValueType}, - }; - use std::io::Write; - use test_log::test; - - #[test] - fn disk_block_deserialization_success() -> crate::Result<()> { - let item1 = - InternalValue::from_components(vec![1, 2, 3], vec![4, 5, 6], 42, ValueType::Value); - let item2 = - 
InternalValue::from_components(vec![7, 8, 9], vec![10, 11, 12], 43, ValueType::Value); - - let items = vec![item1.clone(), item2.clone()]; - - // Serialize to bytes - let mut serialized = Vec::new(); - - let (header, data) = - ValueBlock::to_bytes_compressed(&items, BlockOffset(0), CompressionType::None)?; - - header.encode_into(&mut serialized)?; - serialized.write_all(&data)?; - - assert_eq!(serialized.len(), BlockHeader::serialized_len() + data.len()); - - // Deserialize from bytes - let mut cursor = Cursor::new(serialized); - let block = ValueBlock::from_reader(&mut cursor)?; - - assert_eq!(2, block.items.len()); - assert_eq!(block.items.first().cloned(), Some(item1)); - assert_eq!(block.items.get(1).cloned(), Some(item2)); - - let checksum = { - let (_, data) = ValueBlock::to_bytes_compressed( - &block.items, - block.header.previous_block_offset, - block.header.compression, - )?; - Checksum::from_bytes(&data) - }; - assert_eq!(block.header.checksum, checksum); - - Ok(()) - } - - #[test] - fn disk_block_deserialization_failure_checksum() -> crate::Result<()> { - let item1 = - InternalValue::from_components(vec![1, 2, 3], vec![4, 5, 6], 42, ValueType::Value); - let item2 = - InternalValue::from_components(vec![7, 8, 9], vec![10, 11, 12], 43, ValueType::Value); - - let items = vec![item1, item2]; - - // Serialize to bytes - let mut serialized = Vec::new(); - - let (header, data) = - ValueBlock::to_bytes_compressed(&items, BlockOffset(0), CompressionType::None)?; - - header.encode_into(&mut serialized)?; - serialized.write_all(&data)?; - - // Deserialize from bytes - let mut cursor = Cursor::new(serialized); - let block = ValueBlock::from_reader(&mut cursor)?; - - let checksum = { - let (_, data) = ValueBlock::to_bytes_compressed( - &block.items, - block.header.previous_block_offset, - block.header.compression, - )?; - Checksum::from_bytes(&data) - }; - assert_eq!(block.header.checksum, checksum); - - Ok(()) - } -} diff --git a/src/segment/block/offset.rs b/src/segment/block/offset.rs deleted file mode 100644 index 4f023296..00000000 --- a/src/segment/block/offset.rs +++ /dev/null @@ -1,29 +0,0 @@ -// TODO: rename FileOffset? 
-#[derive(Copy, Clone, Default, Debug, std::hash::Hash, PartialEq, Eq, Ord, PartialOrd)]
-pub struct BlockOffset(pub u64);
-
-impl std::ops::Deref for BlockOffset {
-    type Target = u64;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl std::ops::AddAssign for BlockOffset {
-    fn add_assign(&mut self, rhs: Self) {
-        *self += *rhs;
-    }
-}
-
-impl std::ops::AddAssign<u64> for BlockOffset {
-    fn add_assign(&mut self, rhs: u64) {
-        self.0 += rhs;
-    }
-}
-
-impl std::fmt::Display for BlockOffset {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
diff --git a/src/segment/block_index/block_handle.rs b/src/segment/block_index/block_handle.rs
deleted file mode 100644
index 35be5bc4..00000000
--- a/src/segment/block_index/block_handle.rs
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2024-present, fjall-rs
-// This source code is licensed under both the Apache 2.0 and MIT License
-// (found in the LICENSE-* files in the repository)
-
-use crate::{
-    coding::{Decode, DecodeError, Encode, EncodeError},
-    segment::block::{offset::BlockOffset, ItemSize},
-    value::UserKey,
-    Slice,
-};
-use std::io::{Read, Write};
-use varint_rs::{VarintReader, VarintWriter};
-
-/// Points to a block on file
-#[derive(Clone, Debug)]
-#[allow(clippy::module_name_repetitions)]
-pub struct KeyedBlockHandle {
-    /// Key of last item in block
-    pub end_key: UserKey,
-
-    /// Position of block in file
-    pub offset: BlockOffset,
-}
-
-impl KeyedBlockHandle {
-    #[must_use]
-    pub fn new<K: Into<UserKey>>(end_key: K, offset: BlockOffset) -> Self {
-        Self {
-            end_key: end_key.into(),
-            offset,
-        }
-    }
-}
-
-impl ItemSize for KeyedBlockHandle {
-    fn size(&self) -> usize {
-        std::mem::size_of::<BlockOffset>() + self.end_key.len()
-    }
-}
-
-impl PartialEq for KeyedBlockHandle {
-    fn eq(&self, other: &Self) -> bool {
-        self.offset == other.offset
-    }
-}
-impl Eq for KeyedBlockHandle {}
-
-impl std::hash::Hash for KeyedBlockHandle {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        state.write_u64(*self.offset);
-    }
-}
-
-impl PartialOrd for KeyedBlockHandle {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Ord for KeyedBlockHandle {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        (&self.end_key, self.offset).cmp(&(&other.end_key, other.offset))
-    }
-}
-
-impl Encode for KeyedBlockHandle {
-    fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
-        writer.write_u64_varint(*self.offset)?;
-
-        // NOTE: Truncation is okay and actually needed
-        #[allow(clippy::cast_possible_truncation)]
-        writer.write_u16_varint(self.end_key.len() as u16)?;
-        writer.write_all(&self.end_key)?;
-
-        Ok(())
-    }
-}
-
-impl Decode for KeyedBlockHandle {
-    fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError>
-    where
-        Self: Sized,
-    {
-        let offset = reader.read_u64_varint()?;
-
-        let key_len = reader.read_u16_varint()?;
-        let mut key = vec![0; key_len.into()];
-        reader.read_exact(&mut key)?;
-
-        Ok(Self {
-            offset: BlockOffset(offset),
-            end_key: UserKey::from(key),
-        })
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use test_log::test;
-
-    #[test]
-    fn index_block_size() {
-        let items = [
-            KeyedBlockHandle::new("abcd", BlockOffset(5)),
-            KeyedBlockHandle::new("efghij", BlockOffset(10)),
-        ];
-        assert_eq!(26, items.size());
-    }
-}
diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs
deleted file mode 100644
index 16700dda..00000000
--- a/src/segment/block_index/full_index.rs
+++ /dev/null
@@ -1,85 +0,0 @@
-use super::{block_handle::KeyedBlockHandle, BlockIndex};
-use crate::segment::{
-    block::offset::BlockOffset, block_index::IndexBlock, value_block::CachePolicy,
-};
-use std::{fs::File, io::Seek, path::Path};
-
-/// Index that translates item keys to data block handles
-///
-/// The index is fully loaded into memory.
-///
-/// Currently, a full block index is used for L0 & L1 segments.
-pub struct FullBlockIndex(Box<[KeyedBlockHandle]>);
-
-impl std::ops::Deref for FullBlockIndex {
-    type Target = Box<[KeyedBlockHandle]>;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl FullBlockIndex {
-    pub fn from_file(
-        path: &Path,
-        metadata: &crate::segment::meta::Metadata,
-        offsets: &crate::segment::file_offsets::FileOffsets,
-    ) -> crate::Result<Self> {
-        let cnt = metadata.index_block_count as usize;
-
-        log::trace!(
-            "reading full block index from {path:?} at idx_ptr={} ({cnt} index blocks)",
-            offsets.index_block_ptr,
-        );
-
-        let mut file = File::open(path)?;
-        file.seek(std::io::SeekFrom::Start(*offsets.index_block_ptr))?;
-
-        let mut block_handles = Vec::with_capacity(cnt);
-
-        for _ in 0..cnt {
-            let idx_block = IndexBlock::from_reader(&mut file)?.items;
-            // TODO: 1.80? IntoIter impl for Box<[T]>
-            block_handles.extend(idx_block.into_vec());
-        }
-
-        debug_assert!(!block_handles.is_empty());
-
-        Ok(Self(block_handles.into_boxed_slice()))
-    }
-}
-
-impl BlockIndex for FullBlockIndex {
-    fn get_lowest_block_containing_key(
-        &self,
-        key: &[u8],
-        _: CachePolicy,
-    ) -> crate::Result<Option<BlockOffset>> {
-        use super::KeyedBlockIndex;
-
-        self.0
-            .get_lowest_block_containing_key(key, CachePolicy::Read)
-            .map(|x| x.map(|x| x.offset))
-    }
-
-    /// Gets the last block handle that may contain the given item
-    fn get_last_block_containing_key(
-        &self,
-        key: &[u8],
-        cache_policy: CachePolicy,
-    ) -> crate::Result<Option<BlockOffset>> {
-        use super::KeyedBlockIndex;
-
-        self.0
-            .get_last_block_containing_key(key, cache_policy)
-            .map(|x| x.map(|x| x.offset))
-    }
-
-    fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<BlockOffset> {
-        use super::KeyedBlockIndex;
-
-        self.0
-            .get_last_block_handle(CachePolicy::Read)
-            .map(|x| x.offset)
-    }
-}
diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs
deleted file mode 100644
index 97269d0f..00000000
--- a/src/segment/block_index/mod.rs
+++ /dev/null
@@ -1,395 +0,0 @@
-// Copyright (c) 2024-present, fjall-rs
-// This source code is licensed under both the Apache 2.0 and MIT License
-// (found in the LICENSE-* files in the repository)
-
-pub mod block_handle;
-pub mod full_index;
-pub mod top_level;
-pub mod two_level_index;
-pub mod writer;
-
-use super::{
-    block::{offset::BlockOffset, Block},
-    value_block::CachePolicy,
-};
-use crate::binary_search::partition_point;
-use block_handle::KeyedBlockHandle;
-use full_index::FullBlockIndex;
-use two_level_index::TwoLevelBlockIndex;
-
-pub type IndexBlock = Block<KeyedBlockHandle>;
-
-#[allow(clippy::module_name_repetitions)]
-pub trait KeyedBlockIndex {
-    /// Gets the lowest block handle that may contain the given item
-    fn get_lowest_block_containing_key(
-        &self,
-        key: &[u8],
-        cache_policy: CachePolicy,
-    ) -> crate::Result<Option<&KeyedBlockHandle>>;
-
-    /// Gets the last block handle that may contain the given item
-    fn get_last_block_containing_key(
-        &self,
-        key: &[u8],
-        cache_policy: CachePolicy,
-    ) -> crate::Result<Option<&KeyedBlockHandle>>;
-
-    /// Returns a handle to the last block
-    fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result<&KeyedBlockHandle>;
-}
-
-impl KeyedBlockIndex for [KeyedBlockHandle] {
-    fn get_lowest_block_containing_key(
-        &self,
-        key: &[u8],
-        _: CachePolicy,
-    ) -> crate::Result<Option<&KeyedBlockHandle>> {
-        let idx = partition_point(self, |item| item.end_key < key);
-        Ok(self.get(idx))
-    }
-
-    fn get_last_block_containing_key(
-        &self,
-        key: &[u8],
-        _: CachePolicy,
-    ) -> crate::Result<Option<&KeyedBlockHandle>> {
-        let idx = partition_point(self, |x| &*x.end_key <= key);
-
-        if idx == 0 {
-            return Ok(self.first());
-        }
-        if idx == self.len() {
-            let Some(last_block) = self.last() else {
-                return Ok(None);
-            };
-
-            if last_block.end_key < key {
-                return Ok(None);
-            }
-
-            return Ok(Some(last_block));
-        }
-
-        Ok(self.get(idx))
-    }
-
-    fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> {
-        // NOTE: Index is never empty
-        #[allow(clippy::expect_used)]
-        Ok(self.last().expect("index should not be empty"))
-    }
-}
-
-#[enum_dispatch::enum_dispatch]
-pub trait BlockIndex {
-    /// Gets the lowest block handle that may contain the given item
-    fn get_lowest_block_containing_key(
-        &self,
-        key: &[u8],
-        cache_policy: CachePolicy,
-    ) -> crate::Result<Option<BlockOffset>>;
-
-    /// Gets the last block handle that may contain the given item
-    fn get_last_block_containing_key(
-        &self,
-        key: &[u8],
-        cache_policy: CachePolicy,
-    ) -> crate::Result<Option<BlockOffset>>;
-
-    /// Returns a handle to the last block
-    fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result<BlockOffset>;
-}
-
-/// The block index stores references to the positions of blocks on a file and their size
-///
-/// __________________
-/// |                |
-/// |     BLOCK0     |
-/// |________________| <- 'G': 0x0
-/// |                |
-/// |     BLOCK1     |
-/// |________________| <- 'M': 0x...
-/// |                |
-/// |     BLOCK2     |
-/// |________________| <- 'Z': 0x...
-///
-/// The block information can be accessed by key.
-/// Because the blocks are sorted, any entries not covered by the index (it is sparse) can be
-/// found by finding the highest block that has a lower or equal end key than the searched key
-/// (by performing in-memory binary search).
-/// In the diagram above, searching for 'J' yields the block starting with 'G'.
-/// 'J' must be in that block, because the next block starts with 'M'.
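To make the lookup rule above concrete, here is a minimal, self-contained sketch (an illustration, not part of the patch): it substitutes std's `slice::partition_point` for the crate's own `binary_search::partition_point`, and a stand-in `Handle` type for `KeyedBlockHandle`.

// Stand-in for KeyedBlockHandle: the end key of a block plus its file offset.
struct Handle {
    end_key: &'static [u8],
    offset: u64,
}

/// Returns the lowest block that may contain `key`: the first handle whose
/// end key is not less than `key`. All blocks before it end strictly below
/// `key`, so they cannot contain it.
fn lowest_block_containing_key<'a>(index: &'a [Handle], key: &[u8]) -> Option<&'a Handle> {
    let idx = index.partition_point(|h| h.end_key < key);
    index.get(idx)
}

fn main() {
    // Mirrors the diagram: three blocks ending at 'G', 'M' and 'Z'.
    let index = [
        Handle { end_key: b"G", offset: 0x0 },
        Handle { end_key: b"M", offset: 0x1000 },
        Handle { end_key: b"Z", offset: 0x2000 },
    ];

    // Searching for 'J' lands in the middle block ('G' < 'J' <= 'M').
    let handle = lowest_block_containing_key(&index, b"J").unwrap();
    assert_eq!(handle.offset, 0x1000);

    // A key past the last end key is not covered by the (sparse) index.
    assert!(lowest_block_containing_key(&index, b"a").is_none());
}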
-#[enum_dispatch::enum_dispatch(BlockIndex)] -#[allow(clippy::module_name_repetitions)] -pub enum BlockIndexImpl { - Full(FullBlockIndex), - TwoLevel(TwoLevelBlockIndex), -} - -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use super::*; - use crate::{segment::block::offset::BlockOffset, UserKey}; - use test_log::test; - - fn bh>(end_key: K, offset: BlockOffset) -> KeyedBlockHandle { - KeyedBlockHandle { - end_key: end_key.into(), - offset, - } - } - - #[test] - fn block_handle_array_lowest() { - let index = [ - bh(*b"c", BlockOffset(0)), - bh(*b"g", BlockOffset(10)), - bh(*b"g", BlockOffset(20)), - bh(*b"l", BlockOffset(30)), - bh(*b"t", BlockOffset(40)), - ]; - - { - let handle = index - .get_lowest_block_containing_key(b"a", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"b", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"c", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"d", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"g"); - assert_eq!(handle.offset, BlockOffset(10)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"j", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"l"); - assert_eq!(handle.offset, BlockOffset(30)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"m", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"t"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"t", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"t"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"z", CachePolicy::Read) - .expect("cannot fail"); - - assert!(handle.is_none()); - } - } - - #[test] - fn block_handle_array_spanning_lowest() { - let index = [ - bh(*b"a", BlockOffset(0)), - bh(*b"a", BlockOffset(10)), - bh(*b"a", BlockOffset(20)), - bh(*b"a", BlockOffset(30)), - bh(*b"b", BlockOffset(40)), - bh(*b"b", BlockOffset(50)), - bh(*b"c", BlockOffset(60)), - ]; - - { - let handle = index - .get_lowest_block_containing_key(b"0", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"a"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"a", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"a"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"ab", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"b"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"b", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); 
- - assert_eq!(&*handle.end_key, *b"b"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"c", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(60)); - } - - { - let handle = index - .get_lowest_block_containing_key(b"d", CachePolicy::Read) - .expect("cannot fail"); - - assert!(handle.is_none()); - } - } - - #[test] - fn block_handle_array_last_of_key() { - let index = [ - bh(*b"a", BlockOffset(0)), - bh(*b"a", BlockOffset(10)), - bh(*b"a", BlockOffset(20)), - bh(*b"a", BlockOffset(30)), - bh(*b"b", BlockOffset(40)), - bh(*b"b", BlockOffset(50)), - bh(*b"c", BlockOffset(60)), - ]; - - { - let handle = index - .get_last_block_containing_key(b"0", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"a"); - assert_eq!(handle.offset, BlockOffset(0)); - } - - { - let handle = index - .get_last_block_containing_key(b"a", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"b"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_last_block_containing_key(b"ab", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"b"); - assert_eq!(handle.offset, BlockOffset(40)); - } - - { - let handle = index - .get_last_block_containing_key(b"b", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(60)); - } - - { - let handle = index - .get_last_block_containing_key(b"c", CachePolicy::Read) - .expect("cannot fail") - .expect("should exist"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(60)); - } - - { - let handle = index - .get_last_block_containing_key(b"d", CachePolicy::Read) - .expect("cannot fail"); - - assert!(handle.is_none()); - } - } - - #[test] - fn block_handle_array_last() { - let index = [ - bh(*b"a", BlockOffset(0)), - bh(*b"a", BlockOffset(10)), - bh(*b"a", BlockOffset(20)), - bh(*b"a", BlockOffset(30)), - bh(*b"b", BlockOffset(40)), - bh(*b"b", BlockOffset(50)), - bh(*b"c", BlockOffset(60)), - ]; - - { - let handle = index - .get_last_block_handle(CachePolicy::Read) - .expect("cannot fail"); - - assert_eq!(&*handle.end_key, *b"c"); - assert_eq!(handle.offset, BlockOffset(60)); - } - } -} diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs deleted file mode 100644 index 7c6b9ea2..00000000 --- a/src/segment/block_index/top_level.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{block_handle::KeyedBlockHandle, KeyedBlockIndex}; -use crate::segment::{ - block::offset::BlockOffset, block_index::IndexBlock, value_block::CachePolicy, -}; -use std::{fs::File, path::Path}; - -/// The top-level index (TLI) is the level-0 index in a partitioned (two-level) block index -/// -/// See `top_level_index.rs` for more info. 
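The indirection the TLI buys can be sketched roughly as follows (hypothetical, simplified types; the real lookup goes through the descriptor table and block cache, as shown in `two_level_index.rs` below):

// Hypothetical sketch of a partitioned (two-level) index lookup.
// `tli` is the always-in-memory level-0 index; `index_blocks` stands in
// for "load the level-1 index block at this offset" (cache/disk in reality).
type Key = Vec<u8>;
type Offset = u64;

struct TwoLevel {
    tli: Vec<(Key, Offset)>,
    index_blocks: std::collections::BTreeMap<Offset, Vec<(Key, Offset)>>,
}

impl TwoLevel {
    fn lowest_data_block(&self, key: &[u8]) -> Option<Offset> {
        // Step 1: binary search the TLI for the index block covering `key`.
        let idx = self.tli.partition_point(|(end, _)| end.as_slice() < key);
        let (_, index_block_offset) = self.tli.get(idx)?;

        // Step 2: load that index block and search it the same way for the
        // data block handle.
        let index_block = self.index_blocks.get(index_block_offset)?;
        let idx = index_block.partition_point(|(end, _)| end.as_slice() < key);
        index_block.get(idx).map(|(_, offset)| *offset)
    }
}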
-#[allow(clippy::module_name_repetitions)] -#[derive(Debug)] -pub struct TopLevelIndex(Box<[KeyedBlockHandle]>); - -impl TopLevelIndex { - pub fn from_file( - path: &Path, - _: &crate::segment::meta::Metadata, - tli_ptr: BlockOffset, - ) -> crate::Result { - log::trace!("reading TLI from {path:?} at tli_ptr={tli_ptr}"); - - let mut file = File::open(path)?; - let items = IndexBlock::from_file(&mut file, tli_ptr)?.items; - - log::trace!("loaded TLI ({path:?}): {items:?}"); - debug_assert!(!items.is_empty()); - - Ok(Self::from_boxed_slice(items)) - } - - /// Creates a top-level block index - #[must_use] - pub fn from_boxed_slice(handles: Box<[KeyedBlockHandle]>) -> Self { - Self(handles) - } - - #[must_use] - pub fn len(&self) -> usize { - self.0.len() - } - - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn iter(&self) -> impl Iterator { - self.0.iter() - } -} - -impl KeyedBlockIndex for TopLevelIndex { - fn get_lowest_block_containing_key( - &self, - key: &[u8], - _: CachePolicy, - ) -> crate::Result> { - self.0 - .get_lowest_block_containing_key(key, CachePolicy::Read) - } - - /// Gets the last block handle that may contain the given item - fn get_last_block_containing_key( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result> { - self.0.get_last_block_containing_key(key, cache_policy) - } - - fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> { - self.0.get_last_block_handle(CachePolicy::Read) - } -} diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs deleted file mode 100644 index 9b9e6cd5..00000000 --- a/src/segment/block_index/two_level_index.rs +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{ - super::{id::GlobalSegmentId, value_block::CachePolicy}, - top_level::TopLevelIndex, - BlockIndex, IndexBlock, -}; -use crate::{ - cache::Cache, - descriptor_table::FileDescriptorTable, - segment::{block::offset::BlockOffset, meta::Metadata}, -}; -use std::{path::Path, sync::Arc}; - -/// Allows reading index blocks - just a wrapper around a block cache -#[allow(clippy::module_name_repetitions)] -pub struct IndexBlockFetcher(Arc); - -impl IndexBlockFetcher { - pub fn insert(&self, segment_id: GlobalSegmentId, offset: BlockOffset, value: Arc) { - self.0.insert_index_block(segment_id, offset, value); - } - - #[must_use] - pub fn get(&self, segment_id: GlobalSegmentId, offset: BlockOffset) -> Option> { - self.0.get_index_block(segment_id, offset) - } -} - -/// Index that translates item keys to data block handles -/// -/// The index is only partially loaded into memory. -/// -/// See -#[allow(clippy::module_name_repetitions)] -pub struct TwoLevelBlockIndex { - segment_id: GlobalSegmentId, - - descriptor_table: Arc, - - /// Level-0 index. Is read-only and always fully loaded. - /// - /// This index points to index blocks inside the level-1 index. - pub(crate) top_level_index: TopLevelIndex, - - /// Level-1 index. - /// - /// This index is only partially loaded into memory, decreasing memory usage, compared to a fully loaded one. 
- /// - /// However to find a disk block, one layer of indirection is required: - /// - /// To find a reference to a segment block, first the level-0 index needs to be checked, - /// then the corresponding index block needs to be loaded, which contains the wanted disk block handle. - index_block_fetcher: IndexBlockFetcher, -} - -impl BlockIndex for TwoLevelBlockIndex { - fn get_lowest_block_containing_key( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result> { - self.get_lowest_data_block_handle_containing_item(key, cache_policy) - } - - fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result { - self.get_last_data_block_handle(cache_policy) - } - - fn get_last_block_containing_key( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result> { - self.get_last_data_block_handle_containing_item(key, cache_policy) - } -} - -impl TwoLevelBlockIndex { - /// Gets the lowest block handle that may contain the given item - pub fn get_lowest_data_block_handle_containing_item( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result> { - use super::KeyedBlockIndex; - - let Some(index_block_handle) = self - .top_level_index - .get_lowest_block_containing_key(key, cache_policy) - .expect("cannot fail") - else { - return Ok(None); - }; - - let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; - - Ok({ - use super::KeyedBlockIndex; - - index_block - .items - .get_lowest_block_containing_key(key, cache_policy) - .expect("cannot fail") - .map(|x| x.offset) - }) - } - - /// Gets the last block handle that may contain the given item - pub fn get_last_data_block_handle_containing_item( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result> { - use super::KeyedBlockIndex; - - let Some(index_block_handle) = self - .top_level_index - .get_last_block_containing_key(key, cache_policy) - .expect("cannot fail") - else { - return Ok(Some(self.get_last_data_block_handle(cache_policy)?)); - }; - - let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; - - Ok({ - use super::KeyedBlockIndex; - - index_block - .items - .get_last_block_containing_key(key, cache_policy) - .expect("cannot fail") - .map(|x| x.offset) - }) - } - - pub fn get_last_data_block_handle( - &self, - cache_policy: CachePolicy, - ) -> crate::Result { - use super::KeyedBlockIndex; - - let index_block_handle = self - .top_level_index - .get_last_block_handle(cache_policy) - .expect("cannot fail"); - - let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; - - Ok(index_block - .items - .last() - .expect("index block should not be empty") - .offset) - } - - /// Loads an index block from disk - pub fn load_index_block( - &self, - offset: BlockOffset, - cache_policy: CachePolicy, - ) -> crate::Result> { - log::trace!("loading index block {:?}/{offset:?}", self.segment_id); - - if let Some(block) = self.index_block_fetcher.get(self.segment_id, offset) { - // Cache hit: Copy from block - - Ok(block) - } else { - // Cache miss: load from disk - - let file_guard = self - .descriptor_table - .access(&self.segment_id)? 
- .expect("should acquire file handle"); - - let block = IndexBlock::from_file( - &mut *file_guard.file.lock().expect("lock is poisoned"), - offset, - ) - .map_err(|e| { - log::error!( - "Failed to load index block {:?}/{:?}: {e:?}", - self.segment_id, - offset - ); - e - })?; - // TODO: ^ inspect_err instead: 1.76 - - drop(file_guard); - - let block = Arc::new(block); - - if cache_policy == CachePolicy::Write { - self.index_block_fetcher - .insert(self.segment_id, offset, block.clone()); - } - - Ok(block) - } - } - - #[cfg(test)] - #[allow(dead_code, clippy::expect_used)] - pub(crate) fn new(segment_id: GlobalSegmentId, block_cache: Arc) -> Self { - let index_block_index = IndexBlockFetcher(block_cache); - - Self { - descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - segment_id, - index_block_fetcher: index_block_index, - top_level_index: TopLevelIndex::from_boxed_slice(Box::default()), - } - } - - pub fn from_file( - file_path: &Path, - metadata: &Metadata, - tli_ptr: BlockOffset, - segment_id: GlobalSegmentId, - descriptor_table: Arc, - block_cache: Arc, - ) -> crate::Result { - log::trace!("Reading block index from {file_path:?}"); - - let top_level_index = TopLevelIndex::from_file(file_path, metadata, tli_ptr)?; - - Ok(Self { - descriptor_table, - segment_id, - top_level_index, - index_block_fetcher: IndexBlockFetcher(block_cache), - }) - } -} diff --git a/src/segment/block_index/writer.rs b/src/segment/block_index/writer.rs deleted file mode 100644 index d4e654f9..00000000 --- a/src/segment/block_index/writer.rs +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{IndexBlock, KeyedBlockHandle}; -use crate::{ - coding::Encode, - segment::{ - block::{header::Header as BlockHeader, offset::BlockOffset}, - meta::CompressionType, - }, - value::UserKey, -}; -use std::{ - fs::File, - io::{BufWriter, Seek, Write}, -}; - -pub struct Writer { - file_pos: BlockOffset, - - prev_pos: (BlockOffset, BlockOffset), - - write_buffer: Vec, - - block_size: u32, - compression: CompressionType, - - buffer_size: u32, - - block_handles: Vec, - tli_pointers: Vec, - - pub block_count: usize, -} - -impl Writer { - pub fn new(block_size: u32) -> crate::Result { - Ok(Self { - file_pos: BlockOffset(0), - prev_pos: (BlockOffset(0), BlockOffset(0)), - write_buffer: Vec::with_capacity(u16::MAX.into()), - buffer_size: 0, - block_size, - compression: CompressionType::None, - block_handles: Vec::new(), - tli_pointers: Vec::new(), - block_count: 0, - }) - } - - #[must_use] - pub fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self - } - - fn write_block(&mut self) -> crate::Result<()> { - // Write to file - let (header, data) = IndexBlock::to_bytes_compressed( - &self.block_handles, - self.prev_pos.0, - self.compression, - )?; - - header.encode_into(&mut self.write_buffer)?; - self.write_buffer.write_all(&data)?; - - // NOTE: Expect is fine, the block size definitely fits into u64 - #[allow(clippy::expect_used)] - let bytes_written: u64 = (BlockHeader::serialized_len() + data.len()) - .try_into() - .expect("block size should fit into u64"); - - // NOTE: Expect is fine, because the chunk is not empty - // - // Also, we are allowed to remove the last item - // to get ownership of it, because the chunk is cleared after - // this anyway - #[allow(clippy::expect_used)] - let last = 
self.block_handles.pop().expect("Chunk should not be empty"); - - let index_block_handle = KeyedBlockHandle { - end_key: last.end_key, - offset: self.file_pos, - }; - - self.tli_pointers.push(index_block_handle); - - // Adjust metadata - self.file_pos += bytes_written; - self.block_count += 1; - - // Back link stuff - self.prev_pos.0 = self.prev_pos.1; - self.prev_pos.1 += bytes_written; - - // IMPORTANT: Clear buffer after everything else - self.block_handles.clear(); - self.buffer_size = 0; - - Ok(()) - } - - pub fn register_block(&mut self, end_key: UserKey, offset: BlockOffset) -> crate::Result<()> { - // NOTE: Truncation is OK, because a key is bound by 65535 bytes, so can never exceed u32s - #[allow(clippy::cast_possible_truncation)] - let block_handle_size = (end_key.len() + std::mem::size_of::()) as u32; - - let block_handle = KeyedBlockHandle { end_key, offset }; - - self.block_handles.push(block_handle); - - self.buffer_size += block_handle_size; - - if self.buffer_size >= self.block_size { - self.write_block()?; - } - - Ok(()) - } - - fn write_top_level_index( - &mut self, - block_file_writer: &mut BufWriter, - file_offset: BlockOffset, - ) -> crate::Result { - block_file_writer.write_all(&self.write_buffer)?; - let tli_ptr = block_file_writer.stream_position()?; - - log::trace!("Concatted index blocks onto blocks file"); - - for item in &mut self.tli_pointers { - item.offset += file_offset; - } - - // Write to file - let (header, data) = - IndexBlock::to_bytes_compressed(&self.tli_pointers, BlockOffset(0), self.compression)?; - - header.encode_into(block_file_writer)?; - block_file_writer.write_all(&data)?; - - let bytes_written = BlockHeader::serialized_len() + data.len(); - - block_file_writer.flush()?; - block_file_writer.get_mut().sync_all()?; - - log::trace!( - "Written top level index, with {} pointers ({} bytes)", - self.tli_pointers.len(), - bytes_written, - ); - - Ok(tli_ptr) - } - - /// Returns the offset in the file to TLI - pub fn finish( - &mut self, - block_file_writer: &mut BufWriter, - ) -> crate::Result { - if self.buffer_size > 0 { - self.write_block()?; - } - - let index_block_ptr = BlockOffset(block_file_writer.stream_position()?); - let tli_ptr = self.write_top_level_index(block_file_writer, index_block_ptr)?; - - Ok(BlockOffset(tli_ptr)) - } -} diff --git a/src/segment/file_offsets.rs b/src/segment/file_offsets.rs deleted file mode 100644 index 97cf44f5..00000000 --- a/src/segment/file_offsets.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::block::offset::BlockOffset; -use crate::coding::{Decode, DecodeError, Encode, EncodeError}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use std::io::{Read, Write}; - -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -pub struct FileOffsets { - pub metadata_ptr: BlockOffset, - pub index_block_ptr: BlockOffset, - pub tli_ptr: BlockOffset, - pub bloom_ptr: BlockOffset, - - // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - pub range_filter_ptr: BlockOffset, - - // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - pub range_tombstones_ptr: BlockOffset, - - // TODO: prefix filter for l0, l1? 
- pub pfx_ptr: BlockOffset, -} - -impl FileOffsets { - /// Returns the on-disk size - #[must_use] - pub const fn serialized_len() -> usize { - 7 * std::mem::size_of::() - } -} - -impl Encode for FileOffsets { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - writer.write_u64::(*self.metadata_ptr)?; - writer.write_u64::(*self.index_block_ptr)?; - writer.write_u64::(*self.tli_ptr)?; - writer.write_u64::(*self.bloom_ptr)?; - writer.write_u64::(*self.range_filter_ptr)?; - writer.write_u64::(*self.range_tombstones_ptr)?; - writer.write_u64::(*self.pfx_ptr)?; - Ok(()) - } -} - -impl Decode for FileOffsets { - fn decode_from(reader: &mut R) -> Result { - let metadata_ptr = reader.read_u64::()?; - let index_block_ptr = reader.read_u64::()?; - let tli_ptr = reader.read_u64::()?; - let bloom_ptr = reader.read_u64::()?; - let rf_ptr = reader.read_u64::()?; - let range_tombstones_ptr = reader.read_u64::()?; - let pfx_ptr = reader.read_u64::()?; - - Ok(Self { - index_block_ptr: BlockOffset(index_block_ptr), - tli_ptr: BlockOffset(tli_ptr), - bloom_ptr: BlockOffset(bloom_ptr), - range_filter_ptr: BlockOffset(rf_ptr), - range_tombstones_ptr: BlockOffset(range_tombstones_ptr), - pfx_ptr: BlockOffset(pfx_ptr), - metadata_ptr: BlockOffset(metadata_ptr), - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::io::Cursor; - use test_log::test; - - #[test] - fn file_offsets_roundtrip() -> crate::Result<()> { - let before = FileOffsets { - bloom_ptr: BlockOffset(15), - index_block_ptr: BlockOffset(14), - metadata_ptr: BlockOffset(17), - pfx_ptr: BlockOffset(18), - range_filter_ptr: BlockOffset(13), - range_tombstones_ptr: BlockOffset(5), - tli_ptr: BlockOffset(4), - }; - - let buf = before.encode_into_vec(); - - let mut cursor = Cursor::new(buf); - let after = FileOffsets::decode_from(&mut cursor)?; - - assert_eq!(after, before); - - Ok(()) - } - - #[test] - fn file_offsets_serialized_len() { - let buf = FileOffsets::default().encode_into_vec(); - assert_eq!(FileOffsets::serialized_len(), buf.len()); - } -} diff --git a/src/segment/forward_reader.rs b/src/segment/forward_reader.rs deleted file mode 100644 index 93fdf576..00000000 --- a/src/segment/forward_reader.rs +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{ - block::offset::BlockOffset, - value_block::{CachePolicy, ValueBlock}, - value_block_consumer::ValueBlockConsumer, -}; -use crate::{ - cache::Cache, descriptor_table::FileDescriptorTable, segment::block::header::Header, - value::InternalValue, GlobalSegmentId, -}; - -/// Segment forward reader specialized for point reads -pub struct ForwardReader<'a> { - segment_id: GlobalSegmentId, - - descriptor_table: &'a FileDescriptorTable, - block_cache: &'a Cache, - - data_block_boundary: BlockOffset, - - pub lo_block_offset: BlockOffset, - pub(crate) lo_block_size: u64, - pub(crate) lo_block_items: Option, - pub(crate) lo_initialized: bool, - - cache_policy: CachePolicy, -} - -impl<'a> ForwardReader<'a> { - #[must_use] - pub fn new( - data_block_boundary: BlockOffset, - descriptor_table: &'a FileDescriptorTable, - segment_id: GlobalSegmentId, - block_cache: &'a Cache, - lo_block_offset: BlockOffset, - ) -> Self { - Self { - descriptor_table, - segment_id, - block_cache, - - data_block_boundary, - - lo_block_offset, - lo_block_size: 0, - lo_block_items: None, - lo_initialized: false, - - cache_policy: 
CachePolicy::Write, - } - } - - fn load_data_block( - &self, - offset: BlockOffset, - ) -> crate::Result> { - let block = ValueBlock::load_by_block_handle( - self.descriptor_table, - self.block_cache, - self.segment_id, - offset, - self.cache_policy, - )?; - - // Truncate as many items as possible - block.map_or(Ok(None), |block| { - Ok(Some(( - block.header.data_length.into(), - block.header.previous_block_offset, - ValueBlockConsumer::with_bounds(block, None, None), - ))) - }) - } - - fn initialize_lo(&mut self) -> crate::Result<()> { - if let Some((size, _, items)) = self.load_data_block(self.lo_block_offset)? { - self.lo_block_items = Some(items); - self.lo_block_size = size; - } - - self.lo_initialized = true; - - Ok(()) - } -} - -impl<'a> Iterator for ForwardReader<'a> { - type Item = crate::Result; - - fn next(&mut self) -> Option { - if !self.lo_initialized { - fail_iter!(self.initialize_lo()); - } - - if let Some(head) = self.lo_block_items.as_mut()?.next() { - // Just consume item - return Some(Ok(head)); - } - - // Load next block - let next_block_offset = BlockOffset( - *self.lo_block_offset + Header::serialized_len() as u64 + self.lo_block_size, - ); - - if next_block_offset >= self.data_block_boundary { - // We are done - return None; - } - - assert_ne!( - self.lo_block_offset, next_block_offset, - "invalid next block offset", - ); - - match fail_iter!(self.load_data_block(next_block_offset)) { - Some((size, _, items)) => { - self.lo_block_items = Some(items); - self.lo_block_size = size; - self.lo_block_offset = next_block_offset; - - // We just loaded the block - self.lo_block_items.as_mut()?.next().map(Ok) - } - None => { - panic!("searched for invalid data block"); - } - } - } -} diff --git a/src/segment/inner.rs b/src/segment/inner.rs deleted file mode 100644 index 69ea5483..00000000 --- a/src/segment/inner.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{block_index::BlockIndexImpl, file_offsets::FileOffsets, meta::Metadata}; -use crate::{cache::Cache, descriptor_table::FileDescriptorTable, tree::inner::TreeId}; -use std::{ - path::PathBuf, - sync::{atomic::AtomicBool, Arc}, -}; - -pub struct Inner { - pub path: PathBuf, - - pub(crate) tree_id: TreeId, - - #[doc(hidden)] - pub descriptor_table: Arc, - - /// Segment metadata object - #[doc(hidden)] - pub metadata: Metadata, - - pub(crate) offsets: FileOffsets, - - /// Translates key (first item of a block) to block offset (address inside file) and (compressed) size - #[doc(hidden)] - pub block_index: Arc, - - /// Block cache - /// - /// Stores index and data blocks - #[doc(hidden)] - pub cache: Arc, - - /// Bloom filter - #[doc(hidden)] - pub bloom_filter: Option, - - pub is_deleted: AtomicBool, -} - -impl Drop for Inner { - fn drop(&mut self) { - let global_id = (self.tree_id, self.metadata.id).into(); - - if self.is_deleted.load(std::sync::atomic::Ordering::Acquire) { - if let Err(e) = std::fs::remove_file(&self.path) { - log::warn!( - "Failed to cleanup deleted segment {global_id:?} at {:?}: {e:?}", - self.path, - ); - } - - log::trace!("Closing file handles of deleted segment file {global_id:?}"); - self.descriptor_table.remove(global_id); - } - } -} diff --git a/src/segment/meta/mod.rs b/src/segment/meta/mod.rs deleted file mode 100644 index 204cfac6..00000000 --- a/src/segment/meta/mod.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright (c) 
2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -mod compression; -mod table_type; - -use super::writer::Writer; -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, - time::unix_timestamp, - value::SeqNo, - KeyRange, -}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use std::{ - io::{Cursor, Read, Write}, - path::Path, -}; -pub use {compression::CompressionType, table_type::TableType}; - -pub type SegmentId = u64; - -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct Metadata { - /// Segment ID - pub id: SegmentId, - - /// Creation time as unix timestamp (in µs) - pub created_at: u128, - - /// Number of KV-pairs in the segment - /// - /// This may include tombstones and multiple versions of the same key - pub item_count: u64, - - /// Number of unique keys in the segment - /// - /// This may include tombstones - pub key_count: u64, - - /// Number of tombstones - pub tombstone_count: u64, - - /// Number of range tombstones - pub(crate) range_tombstone_count: u64, - - /// compressed size in bytes (on disk) (without the fixed size trailer) - pub file_size: u64, - - /// true size in bytes (if no compression were used) - pub uncompressed_size: u64, - - /// Data block size (uncompressed) - pub data_block_size: u32, - - /// Index block size (uncompressed) - pub index_block_size: u32, - - /// Number of written data blocks - pub data_block_count: u32, - - /// Number of written index blocks - pub index_block_count: u32, - - /// What type of compression is used - pub compression: CompressionType, - - /// Type of table (unused) - pub(crate) table_type: TableType, - - /// Sequence number range - pub seqnos: (SeqNo, SeqNo), - - /// Key range - pub key_range: KeyRange, -} - -impl Encode for Metadata { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - // Write header - writer.write_all(&MAGIC_BYTES)?; - - writer.write_u64::(self.id)?; - - writer.write_u128::(self.created_at)?; - - writer.write_u64::(self.item_count)?; - writer.write_u64::(self.key_count)?; - writer.write_u64::(self.tombstone_count)?; - writer.write_u64::(self.range_tombstone_count)?; - - writer.write_u64::(self.file_size)?; - writer.write_u64::(self.uncompressed_size)?; - - writer.write_u32::(self.data_block_size)?; - writer.write_u32::(self.index_block_size)?; - - writer.write_u32::(self.data_block_count)?; - writer.write_u32::(self.index_block_count)?; - - self.compression.encode_into(writer)?; - - writer.write_u8(self.table_type.into())?; - - writer.write_u64::(self.seqnos.0)?; - writer.write_u64::(self.seqnos.1)?; - - self.key_range.encode_into(writer)?; - - Ok(()) - } -} - -impl Decode for Metadata { - fn decode_from(reader: &mut R) -> Result { - // Check header - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(DecodeError::InvalidHeader("SegmentMetadata")); - } - - let id = reader.read_u64::()?; - - let created_at = reader.read_u128::()?; - - let item_count = reader.read_u64::()?; - let key_count = reader.read_u64::()?; - let tombstone_count = reader.read_u64::()?; - let range_tombstone_count = reader.read_u64::()?; - - let file_size = reader.read_u64::()?; - let uncompressed_size = reader.read_u64::()?; - - let data_block_size = reader.read_u32::()?; - let index_block_size = reader.read_u32::()?; - - let data_block_count = reader.read_u32::()?; - let index_block_count = 
reader.read_u32::()?; - - let compression = CompressionType::decode_from(reader)?; - - let table_type = reader.read_u8()?; - let table_type = TableType::try_from(table_type) - .map_err(|()| DecodeError::InvalidTag(("TableType", table_type)))?; - - let seqno_min = reader.read_u64::()?; - let seqno_max = reader.read_u64::()?; - - let key_range = KeyRange::decode_from(reader)?; - - Ok(Self { - id, - created_at, - - item_count, - key_count, - tombstone_count, - range_tombstone_count, - - file_size, - uncompressed_size, - - data_block_size, - index_block_size, - - data_block_count, - index_block_count, - - compression, - table_type, - - seqnos: (seqno_min, seqno_max), - - key_range, - }) - } -} - -impl Metadata { - /// Consumes a writer and its metadata to create the segment metadata. - /// - /// The writer should not be empty. - pub fn from_writer(id: SegmentId, writer: &Writer) -> crate::Result { - Ok(Self { - id, - - // NOTE: Using seconds is not granular enough - // But because millis already returns u128, might as well use micros :) - created_at: unix_timestamp().as_micros(), - - compression: CompressionType::None, - table_type: TableType::Block, - - // NOTE: Truncation is OK - even with the smallest block size (1 KiB), 4 billion blocks would be 4 TB - #[allow(clippy::cast_possible_truncation)] - data_block_count: writer.meta.data_block_count as u32, - - // NOTE: Truncation is OK as well - #[allow(clippy::cast_possible_truncation)] - index_block_count: writer.meta.index_block_count as u32, - - data_block_size: writer.opts.data_block_size, - index_block_size: writer.opts.index_block_size, - - file_size: *writer.meta.file_pos, - uncompressed_size: writer.meta.uncompressed_size, - item_count: writer.meta.item_count as u64, - key_count: writer.meta.key_count as u64, - - // NOTE: from_writer should not be called when the writer wrote nothing - #[allow(clippy::expect_used)] - key_range: KeyRange::new(( - writer - .meta - .first_key - .clone() - .expect("should have written at least 1 item"), - writer - .meta - .last_key - .clone() - .expect("should have written at least 1 item"), - )), - - seqnos: (writer.meta.lowest_seqno, writer.meta.highest_seqno), - - tombstone_count: writer.meta.tombstone_count as u64, - - // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - range_tombstone_count: 0, - }) - } - - /// Reads and parses a Segment metadata file - pub fn from_disk(path: &Path) -> crate::Result { - let file_content = std::fs::read(path)?; - let mut cursor = Cursor::new(file_content); - let meta = Self::decode_from(&mut cursor)?; - Ok(meta) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::io::Cursor; - use test_log::test; - - #[test] - fn segment_metadata_serde_round_trip() -> crate::Result<()> { - let metadata = Metadata { - data_block_count: 0, - index_block_count: 0, - data_block_size: 4_096, - index_block_size: 4_096, - created_at: 5, - id: 632_632, - file_size: 1, - compression: CompressionType::None, - table_type: TableType::Block, - item_count: 0, - key_count: 0, - key_range: KeyRange::new((vec![2].into(), vec![5].into())), - tombstone_count: 0, - range_tombstone_count: 0, - uncompressed_size: 0, - seqnos: (0, 5), - }; - - let bytes = metadata.encode_into_vec(); - let mut cursor = Cursor::new(bytes); - let metadata_copy = Metadata::decode_from(&mut cursor)?; - - assert_eq!(metadata, metadata_copy); - - Ok(()) - } -} diff --git a/src/segment/meta/table_type.rs b/src/segment/meta/table_type.rs deleted file mode 100644 index 298d1d41..00000000 --- 
a/src/segment/meta/table_type.rs +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum TableType { - Block, -} - -impl From for u8 { - fn from(val: TableType) -> Self { - match val { - TableType::Block => 0, - } - } -} - -impl TryFrom for TableType { - type Error = (); - - fn try_from(value: u8) -> Result { - match value { - 0 => Ok(Self::Block), - _ => Err(()), - } - } -} diff --git a/src/segment/mod.rs b/src/segment/mod.rs deleted file mode 100644 index 4ce357ec..00000000 --- a/src/segment/mod.rs +++ /dev/null @@ -1,490 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -pub mod block; -pub mod block_index; -pub mod file_offsets; -mod forward_reader; -pub mod id; -pub mod inner; -pub mod meta; -pub mod multi_writer; -pub mod range; -pub mod reader; -pub mod scanner; -pub mod trailer; -pub mod value_block; -pub mod value_block_consumer; -pub mod writer; - -use crate::{ - bloom::{BloomFilter, CompositeHash}, - cache::Cache, - descriptor_table::FileDescriptorTable, - time::unix_timestamp, - tree::inner::TreeId, - value::{InternalValue, SeqNo, UserKey}, -}; -use block_index::BlockIndexImpl; -use forward_reader::ForwardReader; -use id::GlobalSegmentId; -use inner::Inner; -use meta::SegmentId; -use range::Range; -use scanner::Scanner; -use std::{ - ops::Bound, - path::Path, - sync::{atomic::AtomicBool, Arc}, -}; - -#[allow(clippy::module_name_repetitions)] -pub type SegmentInner = Inner; - -/// Disk segment (a.k.a. `SSTable`, `SST`, `sorted string table`) that is located on disk -/// -/// A segment is an immutable list of key-value pairs, split into compressed blocks. -/// A reference to the block (`block handle`) is saved in the "block index". -/// -/// Deleted entries are represented by tombstones. -/// -/// Segments can be merged together to improve read performance and reduce disk space by removing outdated item versions. -#[doc(alias("sstable", "sst", "sorted string table"))] -#[derive(Clone)] -pub struct Segment(Arc); - -impl From for Segment { - fn from(value: Inner) -> Self { - Self(Arc::new(value)) - } -} - -impl std::ops::Deref for Segment { - type Target = Inner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::fmt::Debug for Segment { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Segment:{}({})", self.id(), self.metadata.key_range) - } -} - -impl Segment { - // TODO: in Leveled compaction, compact segments that live very long and have - // many versions (possibly unnecessary space usage of old, stale versions) - /// Calculates how many versions per key there are on average. - #[must_use] - pub fn version_factor(&self) -> f32 { - self.metadata.item_count as f32 / self.metadata.key_count as f32 - } - - /// Gets the segment age in nanoseconds. - #[must_use] - pub fn age(&self) -> u128 { - let now = unix_timestamp().as_nanos(); - let created_at = self.metadata.created_at * 1_000; - now.saturating_sub(created_at) - } - - /// Gets the global segment ID. - #[must_use] - pub fn global_id(&self) -> GlobalSegmentId { - (self.tree_id, self.id()).into() - } - - /// Gets the segment ID. - /// - /// The segment ID is unique for this tree, but not - /// across multiple trees, use [`Segment::global_id`] for that. 
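An aside on the ID scheme: because segment IDs are only unique within one tree, anything shared across trees (cache, descriptor table) keys by tree ID plus segment ID. A rough sketch of that pairing (hypothetical; the real `GlobalSegmentId` lives in `super_segment`, and the width of `TreeId` is an assumption here):

type TreeId = u64; // assumption: the actual alias is defined in tree::inner
type SegmentId = u64;

/// Pairs a tree ID with a per-tree segment ID so that segments from
/// different trees sharing one cache cannot collide.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
struct GlobalSegmentId(TreeId, SegmentId);

impl From<(TreeId, SegmentId)> for GlobalSegmentId {
    fn from((tree_id, segment_id): (TreeId, SegmentId)) -> Self {
        Self(tree_id, segment_id)
    }
}

impl GlobalSegmentId {
    fn tree_id(self) -> TreeId {
        self.0
    }

    fn segment_id(self) -> SegmentId {
        self.1
    }
}

This mirrors the `(self.tree_id, self.id()).into()` conversions used throughout the segment code.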
- #[must_use] - pub fn id(&self) -> SegmentId { - self.metadata.id - } - - pub(crate) fn verify(&self) -> crate::Result { - use block::checksum::Checksum; - use block_index::IndexBlock; - use value_block::ValueBlock; - - let mut data_block_count = 0; - let mut broken_count = 0; - - let guard = self - .descriptor_table - .access(&self.global_id())? - .expect("should have gotten file"); - - let mut file = guard.file.lock().expect("lock is poisoned"); - - // TODO: maybe move to BlockIndexImpl::verify - match &*self.block_index { - BlockIndexImpl::Full(block_index) => { - for handle in block_index.iter() { - let value_block = match ValueBlock::from_file(&mut *file, handle.offset) { - Ok(v) => v, - Err(e) => { - log::error!( - "data block {handle:?} could not be loaded, it is probably corrupted: {e:?}" - ); - broken_count += 1; - data_block_count += 1; - continue; - } - }; - - let (_, data) = ValueBlock::to_bytes_compressed( - &value_block.items, - value_block.header.previous_block_offset, - value_block.header.compression, - )?; - let actual_checksum = Checksum::from_bytes(&data); - - if value_block.header.checksum != actual_checksum { - log::error!("{handle:?} is corrupted, invalid checksum value"); - broken_count += 1; - } - - data_block_count += 1; - - if data_block_count % 1_000 == 0 { - log::debug!("Checked {data_block_count} data blocks"); - } - } - } - BlockIndexImpl::TwoLevel(block_index) => { - // NOTE: TODO: because of 1.74.0 - #[allow(clippy::explicit_iter_loop)] - for handle in block_index.top_level_index.iter() { - let block = match IndexBlock::from_file(&mut *file, handle.offset) { - Ok(v) => v, - Err(e) => { - log::error!( - "index block {handle:?} could not be loaded, it is probably corrupted: {e:?}" - ); - broken_count += 1; - continue; - } - }; - - for handle in &*block.items { - let value_block = match ValueBlock::from_file(&mut *file, handle.offset) { - Ok(v) => v, - Err(e) => { - log::error!( - "data block {handle:?} could not be loaded, it is probably corrupted: {e:?}" - ); - broken_count += 1; - data_block_count += 1; - continue; - } - }; - - let (_, data) = ValueBlock::to_bytes_compressed( - &value_block.items, - value_block.header.previous_block_offset, - value_block.header.compression, - )?; - let actual_checksum = Checksum::from_bytes(&data); - - if value_block.header.checksum != actual_checksum { - log::error!("{handle:?} is corrupted, invalid checksum value"); - broken_count += 1; - } - - data_block_count += 1; - - if data_block_count % 1_000 == 0 { - log::debug!("Checked {data_block_count} data blocks"); - } - } - } - } - } - - if data_block_count != self.metadata.data_block_count { - log::error!( - "Not all data blocks were visited during verification of disk segment {:?}", - self.id(), - ); - broken_count += 1; - } - - Ok(broken_count) - } - - pub(crate) fn load_bloom( - path: &Path, - ptr: block::offset::BlockOffset, - ) -> crate::Result> { - Ok(if *ptr > 0 { - use crate::coding::Decode; - use std::{ - fs::File, - io::{Seek, SeekFrom}, - }; - - let mut reader = File::open(path)?; - reader.seek(SeekFrom::Start(*ptr))?; - Some(BloomFilter::decode_from(&mut reader)?) - } else { - None - }) - } - - /// Tries to recover a segment from a file. 
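For context, `load_bloom` above shows the pattern used for optional trailing structures: a zero offset means "never written", anything else is a file position to seek to and decode. A condensed sketch of that pattern under stated assumptions; the fixed 16-byte payload is a placeholder, not the real `BloomFilter` encoding:

use std::fs::File;
use std::io::{Read, Seek, SeekFrom};

fn load_optional_at(path: &std::path::Path, ptr: u64) -> std::io::Result<Option<Vec<u8>>> {
    if ptr == 0 {
        // Offset 0 marks a structure that was never written.
        return Ok(None);
    }
    let mut file = File::open(path)?;
    file.seek(SeekFrom::Start(ptr))?;
    // Placeholder: read a fixed-size payload instead of decoding a filter.
    let mut buf = vec![0u8; 16];
    file.read_exact(&mut buf)?;
    Ok(Some(buf))
}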
- pub(crate) fn recover( - file_path: &Path, - tree_id: TreeId, - cache: Arc, - descriptor_table: Arc, - use_full_block_index: bool, - ) -> crate::Result { - use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; - use trailer::SegmentFileTrailer; - - log::debug!("Recovering segment from file {file_path:?}"); - let trailer = SegmentFileTrailer::from_file(file_path)?; - - assert_eq!( - 0, *trailer.offsets.range_tombstones_ptr, - "Range tombstones not supported" - ); - - log::debug!( - "Creating block index, with tli_ptr={}", - trailer.offsets.tli_ptr - ); - - let block_index = if use_full_block_index { - let block_index = - FullBlockIndex::from_file(file_path, &trailer.metadata, &trailer.offsets)?; - - BlockIndexImpl::Full(block_index) - } else { - let block_index = TwoLevelBlockIndex::from_file( - file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (tree_id, trailer.metadata.id).into(), - descriptor_table.clone(), - cache.clone(), - )?; - BlockIndexImpl::TwoLevel(block_index) - }; - - let bloom_ptr = trailer.offsets.bloom_ptr; - - Ok(Self(Arc::new(Inner { - path: file_path.into(), - - tree_id, - - descriptor_table, - metadata: trailer.metadata, - offsets: trailer.offsets, - - block_index: Arc::new(block_index), - cache, - - bloom_filter: Self::load_bloom(file_path, bloom_ptr)?, - - is_deleted: AtomicBool::default(), - }))) - } - - pub(crate) fn mark_as_deleted(&self) { - self.0 - .is_deleted - .store(true, std::sync::atomic::Ordering::Release); - } - - #[must_use] - /// Gets the bloom filter size - pub fn bloom_filter_size(&self) -> usize { - self.bloom_filter - .as_ref() - .map(super::bloom::BloomFilter::len) - .unwrap_or_default() - } - - pub fn get( - &self, - key: &[u8], - seqno: Option, - hash: CompositeHash, - ) -> crate::Result> { - if let Some(seqno) = seqno { - if self.metadata.seqnos.0 >= seqno { - return Ok(None); - } - } - - if let Some(bf) = &self.bloom_filter { - if !bf.contains_hash(hash) { - return Ok(None); - } - } - - self.point_read(key, seqno) - } - - fn point_read(&self, key: &[u8], seqno: Option) -> crate::Result> { - use block_index::BlockIndex; - use value_block::{CachePolicy, ValueBlock}; - use value_block_consumer::ValueBlockConsumer; - - let Some(first_block_handle) = self - .block_index - .get_lowest_block_containing_key(key, CachePolicy::Write)? - else { - return Ok(None); - }; - - let Some(block) = ValueBlock::load_by_block_handle( - &self.descriptor_table, - &self.cache, - self.global_id(), - first_block_handle, - CachePolicy::Write, - )? 
- else { - return Ok(None); - }; - - if seqno.is_none() { - // NOTE: Fastpath for non-seqno reads (which are most common) - // This avoids setting up a rather expensive block iterator - // (see explanation for that below) - // This only really works because sequence numbers are sorted - // in descending order - return Ok(block.get_latest(key).cloned()); - } - - let mut reader = ForwardReader::new( - self.offsets.index_block_ptr, - &self.descriptor_table, - self.global_id(), - &self.cache, - first_block_handle, - ); - reader.lo_block_size = block.header.data_length.into(); - reader.lo_block_items = Some(ValueBlockConsumer::with_bounds(block, Some(key), None)); - reader.lo_initialized = true; - - // NOTE: For finding a specific seqno, - // we need to use a reader - // because nothing really prevents the version - // we are searching for to be in the next block - // after the one our key starts in, or the block after that - // - // Example (key:seqno), searching for a:2: - // - // [..., a:5, a:4] [a:3, a:2, b: 4, b:3] - // ^ ^ - // Block A Block B - // - // Based on get_lower_bound_block, "a" is in Block A - // However, we are searching for A with seqno 2, which - // unfortunately is in the next block - // - // Also because of weak tombstones, we may have to look further than the first item we encounter - let mut reader = reader.filter(|x| { - match x { - Ok(entry) => { - // Check for seqno if needed - if let Some(seqno) = seqno { - entry.key.seqno < seqno - } else { - true - } - } - Err(_) => true, - } - }); - - let Some(entry) = reader.next().transpose()? else { - return Ok(None); - }; - - // NOTE: We are past the searched key, so don't return anything - if &*entry.key.user_key > key { - return Ok(None); - } - - Ok(Some(entry)) - } - - #[must_use] - pub fn is_key_in_key_range(&self, key: &[u8]) -> bool { - self.metadata.key_range.contains_key(key) - } - - /// Creates an iterator over the `Segment`. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - #[must_use] - #[allow(clippy::iter_without_into_iter)] - #[doc(hidden)] - pub fn iter(&self) -> Range { - self.range((std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)) - } - - #[doc(hidden)] - pub fn scan>(&self, base_folder: P) -> crate::Result { - let segment_file_path = base_folder.as_ref().join(self.metadata.id.to_string()); - let block_count = self.metadata.data_block_count.try_into().expect("oops"); - Scanner::new(&segment_file_path, block_count) - } - - /// Creates a ranged iterator over the `Segment`. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - #[must_use] - pub(crate) fn range(&self, range: (Bound, Bound)) -> Range { - Range::new( - self.offsets.index_block_ptr, - self.descriptor_table.clone(), - self.global_id(), - self.cache.clone(), - self.block_index.clone(), - range, - ) - } - - /// Returns the highest sequence number in the segment. - #[must_use] - pub fn get_highest_seqno(&self) -> SeqNo { - self.metadata.seqnos.1 - } - - /// Returns the amount of tombstone markers in the `Segment`. - #[must_use] - #[doc(hidden)] - pub fn tombstone_count(&self) -> u64 { - self.metadata.tombstone_count - } - - /// Returns the ratio of tombstone markers in the `Segment`. - #[must_use] - #[doc(hidden)] - pub fn tombstone_ratio(&self) -> f32 { - self.metadata.tombstone_count as f32 / self.metadata.key_count as f32 - } - - /// Checks if a key range is (partially or fully) contained in this segment. 
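The seqno handling in `point_read` above (see the Block A/Block B note) boils down to: among versions of a key sorted by descending seqno, a snapshot at seqno N sees the first version strictly below N. A toy model of that filter, using plain tuples instead of `InternalValue`:

fn snapshot_visible_version() {
    // Versions of key "a", newest first, as (key, seqno) pairs.
    let versions = [("a", 5u64), ("a", 4), ("a", 3), ("a", 2)];

    // A snapshot read at seqno 3 skips seqnos >= 3 and lands on a:2,
    // mirroring the `entry.key.seqno < seqno` filter in point_read.
    let snapshot = 3;
    let visible = versions.iter().find(|(_, seqno)| *seqno < snapshot);
    assert_eq!(Some(&("a", 2u64)), visible);
}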
- pub(crate) fn check_key_range_overlap( - &self, - bounds: &(Bound<UserKey>, Bound<UserKey>), - ) -> bool { - self.metadata.key_range.overlaps_with_bounds(bounds) - } -} diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs deleted file mode 100644 index fb96d40b..00000000 --- a/src/segment/multi_writer.rs +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{ - trailer::SegmentFileTrailer, - writer::{BloomConstructionPolicy, Options, Writer}, -}; -use crate::{value::InternalValue, CompressionType, UserKey}; -use std::sync::{atomic::AtomicU64, Arc}; - -/// Like `Writer` but will rotate to a new segment, once a segment grows larger than `target_size` -/// -/// This results in a sorted "run" of segments -#[allow(clippy::module_name_repetitions)] -pub struct MultiWriter { - /// Target size of segments in bytes - /// - /// If a segment reaches the target size, a new one is started, - /// resulting in a sorted "run" of segments - pub target_size: u64, - - pub opts: Options, - results: Vec<SegmentFileTrailer>, - - segment_id_generator: Arc<AtomicU64>, - current_segment_id: u64, - - pub writer: Writer, - - pub compression: CompressionType, - - bloom_policy: BloomConstructionPolicy, - - current_key: Option<UserKey>, -} - -impl MultiWriter { - /// Sets up a new `MultiWriter` at the given segments folder - pub fn new( - segment_id_generator: Arc<AtomicU64>, - target_size: u64, - opts: Options, - ) -> crate::Result<Self> { - let current_segment_id = - segment_id_generator.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - - let writer = Writer::new(Options { - segment_id: current_segment_id, - folder: opts.folder.clone(), - data_block_size: opts.data_block_size, - index_block_size: opts.index_block_size, - })?; - - Ok(Self { - target_size, - results: Vec::with_capacity(10), - opts, - segment_id_generator, - current_segment_id, - writer, - - compression: CompressionType::None, - - bloom_policy: BloomConstructionPolicy::default(), - - current_key: None, - }) - } - - #[must_use] - pub fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self.writer = self.writer.use_compression(compression); - self - } - - #[must_use] - pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { - self.bloom_policy = bloom_policy; - self.writer = self.writer.use_bloom_policy(bloom_policy); - self - } - - fn get_next_segment_id(&mut self) -> u64 { - self.current_segment_id = self - .segment_id_generator - .fetch_add(1, std::sync::atomic::Ordering::Relaxed); - - self.current_segment_id - } - - /// Flushes the current writer, stores its metadata, and sets up a new writer for the next segment - fn rotate(&mut self) -> crate::Result<()> { - log::debug!("Rotating segment writer"); - - let new_segment_id = self.get_next_segment_id(); - - // NOTE: Feature-dependent - #[allow(unused_mut)] - let mut new_writer = Writer::new(Options { - segment_id: new_segment_id, - folder: self.opts.folder.clone(), - data_block_size: self.opts.data_block_size, - index_block_size: self.opts.index_block_size, - })? - .use_compression(self.compression); - - new_writer = new_writer.use_bloom_policy(self.bloom_policy); - - let mut old_writer = std::mem::replace(&mut self.writer, new_writer); - - if let Some(result) = old_writer.finish()?
{ - self.results.push(result); - } - - Ok(()) - } - - /// Writes an item - pub fn write(&mut self, item: InternalValue) -> crate::Result<()> { - let is_next_key = self.current_key.as_ref() < Some(&item.key.user_key); - - if is_next_key { - self.current_key = Some(item.key.user_key.clone()); - - if *self.writer.meta.file_pos >= self.target_size { - self.rotate()?; - } - } - - self.writer.write(item)?; - - Ok(()) - } - - /// Finishes the last segment, making sure all data is written durably - /// - /// Returns the metadata of created segments - pub fn finish(mut self) -> crate::Result> { - if let Some(last_writer_result) = self.writer.finish()? { - self.results.push(last_writer_result); - } - - Ok(self.results) - } -} - -#[cfg(test)] -mod tests { - use crate::{AbstractTree, Config}; - use test_log::test; - - // NOTE: Tests that versions of the same key stay - // in the same segment even if it needs to be rotated - // This avoids segments' key ranges overlapping - #[test] - fn segment_multi_writer_same_key_norotate() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open()?; - - tree.insert("a", "a1".repeat(4_000), 0); - tree.insert("a", "a2".repeat(4_000), 1); - tree.insert("a", "a3".repeat(4_000), 2); - tree.insert("a", "a4".repeat(4_000), 3); - tree.insert("a", "a5".repeat(4_000), 4); - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); - - tree.major_compact(1_024, 0)?; - assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); - - Ok(()) - } - - #[test] - fn segment_multi_writer_same_key_norotate_2() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open()?; - - tree.insert("a", "a1".repeat(4_000), 0); - tree.insert("a", "a1".repeat(4_000), 1); - tree.insert("a", "a1".repeat(4_000), 2); - tree.insert("b", "a1".repeat(4_000), 0); - tree.insert("c", "a1".repeat(4_000), 0); - tree.insert("c", "a1".repeat(4_000), 1); - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.segment_count()); - assert_eq!(3, tree.len(None, None)?); - - tree.major_compact(1_024, 0)?; - assert_eq!(3, tree.segment_count()); - assert_eq!(3, tree.len(None, None)?); - - Ok(()) - } -} diff --git a/src/segment/range.rs b/src/segment/range.rs deleted file mode 100644 index 5e65b54a..00000000 --- a/src/segment/range.rs +++ /dev/null @@ -1,764 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::block::offset::BlockOffset; -use super::block_index::BlockIndex; -use super::block_index::BlockIndexImpl; -use super::id::GlobalSegmentId; -use super::reader::Reader; -use super::value_block::CachePolicy; -use crate::cache::Cache; -use crate::descriptor_table::FileDescriptorTable; -use crate::value::InternalValue; -use crate::value::UserKey; -use crate::Slice; -use std::ops::Bound; -use std::ops::RangeBounds; -use std::sync::Arc; - -pub struct Range { - block_index: Arc, - - lo_initialized: bool, - hi_initialized: bool, - - pub(crate) range: (Bound, Bound), - - pub(crate) reader: Reader, - - has_entered_lo: bool, - has_entered_hi: bool, -} - -impl Range { - pub fn new( - data_block_boundary: BlockOffset, - descriptor_table: Arc, - segment_id: GlobalSegmentId, - cache: Arc, - block_index: Arc, - range: (Bound, Bound), - ) -> Self { - let reader = Reader::new( - data_block_boundary, - descriptor_table, - segment_id, - cache, - 
BlockOffset(0), - None, - ); - - Self { - lo_initialized: false, - hi_initialized: false, - - block_index, - - reader, - range, - - has_entered_lo: false, - has_entered_hi: false, - } - } - - /// Sets the cache policy - #[must_use] - pub fn cache_policy(mut self, policy: CachePolicy) -> Self { - self.reader = self.reader.cache_policy(policy); - self - } - - fn initialize_lo_bound(&mut self) -> crate::Result<()> { - let start_key = match self.range.start_bound() { - Bound::Unbounded => None, - Bound::Included(start) | Bound::Excluded(start) => { - if let Some(lower_bound) = self - .block_index - .get_lowest_block_containing_key(start, CachePolicy::Write)? - { - self.reader.lo_block_offset = lower_bound; - } - - Some(start) - } - }; - - if let Some(key) = start_key.cloned() { - self.reader.set_lower_bound(key); - } - - self.lo_initialized = true; - - Ok(()) - } - - fn initialize_hi_bound(&mut self) -> crate::Result<()> { - let end_key: Option<&Slice> = match self.range.end_bound() { - Bound::Unbounded => { - let upper_bound = self.block_index.get_last_block_handle(CachePolicy::Write)?; - - self.reader.hi_block_offset = Some(upper_bound); - - None - } - Bound::Included(end) | Bound::Excluded(end) => { - if let Some(upper_bound) = self - .block_index - .get_last_block_containing_key(end, CachePolicy::Write)? - { - self.reader.hi_block_offset = Some(upper_bound); - } else { - self.reader.hi_block_offset = - Some(self.block_index.get_last_block_handle(CachePolicy::Write)?); - } - - Some(end) - } - }; - - if let Some(key) = end_key.cloned() { - self.reader.set_upper_bound(key); - } - - self.hi_initialized = true; - - Ok(()) - } -} - -impl Iterator for Range { - type Item = crate::Result; - - fn next(&mut self) -> Option { - if !self.lo_initialized { - if let Err(e) = self.initialize_lo_bound() { - return Some(Err(e)); - }; - } - - loop { - let entry_result = self.reader.next()?; - - match entry_result { - Ok(entry) => { - if !self.has_entered_lo { - match self.range.start_bound() { - Bound::Included(start) => { - if entry.key.user_key < *start { - // Before min key - continue; - } - self.has_entered_lo = true; - } - Bound::Excluded(start) => { - if entry.key.user_key <= *start { - // Before or equal min key - continue; - } - self.has_entered_lo = true; - } - Bound::Unbounded => {} - } - } - - match self.range.end_bound() { - Bound::Included(start) => { - if entry.key.user_key > *start { - // After max key - return None; - } - } - Bound::Excluded(start) => { - if entry.key.user_key >= *start { - // Reached max key - return None; - } - } - Bound::Unbounded => {} - } - - return Some(Ok(entry)); - } - Err(error) => return Some(Err(error)), - }; - } - } -} - -impl DoubleEndedIterator for Range { - fn next_back(&mut self) -> Option { - if !self.hi_initialized { - if let Err(e) = self.initialize_hi_bound() { - return Some(Err(e)); - }; - } - - loop { - match self.reader.next_back()? 
{ - Ok(entry) => { - match self.range.start_bound() { - Bound::Included(start) => { - if entry.key.user_key < *start { - // Reached min key - return None; - } - } - Bound::Excluded(start) => { - if entry.key.user_key <= *start { - // Before min key - return None; - } - } - Bound::Unbounded => {} - } - - if !self.has_entered_hi { - match self.range.end_bound() { - Bound::Included(end) => { - if entry.key.user_key > *end { - // After max key - continue; - } - self.has_entered_hi = true; - } - Bound::Excluded(end) => { - if entry.key.user_key >= *end { - // After or equal max key - continue; - } - self.has_entered_hi = true; - } - Bound::Unbounded => {} - } - } - - return Some(Ok(entry)); - } - Err(error) => return Some(Err(error)), - }; - } - } -} - -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use crate::{ - cache::Cache, - descriptor_table::FileDescriptorTable, - segment::{ - block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, - range::Range, - writer::{Options, Writer}, - }, - value::{InternalValue, UserKey, ValueType}, - Slice, - }; - use std::ops::{ - Bound::{self, *}, - RangeBounds, - }; - use std::sync::Arc; - use test_log::test; - - const ITEM_COUNT: u64 = 10_000; - - #[test] - #[allow(clippy::expect_used)] - fn segment_range_reader_lower_bound() -> crate::Result<()> { - let chars = (b'a'..=b'z').collect::>(); - - let folder = tempfile::tempdir()?.into_path(); - - let mut writer = Writer::new(Options { - segment_id: 0, - folder: folder.clone(), - data_block_size: 1_000, // NOTE: Block size 1 to for each item to be its own block - index_block_size: 4_096, - })?; - - let items = chars.iter().map(|&key| { - InternalValue::from_components( - &[key][..], - *b"dsgfgfdsgsfdsgfdgfdfgdsgfdhsnreezrzsernszsdaadsadsadsadsadsadsadsadsadsadsdsensnzersnzers", - 0, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - let segment_file_path = folder.join("0"); - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(&segment_file_path, (0, 0).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (0, 0).into(), - table.clone(), - block_cache.clone(), - )?; - let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); - - let iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - (Bound::Unbounded, Bound::Unbounded), - ); - assert_eq!(chars.len(), iter.flatten().count()); - - for start_char in chars { - let key = &[start_char][..]; - let key = Slice::from(key); - - log::debug!("{}..=z", start_char as char); - - // NOTE: Forwards - let expected_range = (start_char..=b'z').collect::>(); - - let iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - (Bound::Included(key.clone()), Bound::Unbounded), - ); - let items = iter - .flatten() - .map(|x| x.key.user_key.first().copied().expect("is ok")) - .collect::>(); - - assert_eq!(items, expected_range); - - // NOTE: Reverse - let expected_range = (start_char..=b'z').rev().collect::>(); - - let iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - (Bound::Included(key), Bound::Unbounded), - ); - let items = iter - .rev() - 
.flatten() - .map(|x| x.key.user_key.first().copied().expect("is ok")) - .collect::>(); - - assert_eq!(items, expected_range); - } - - Ok(()) - } - - #[test] - #[allow(clippy::expect_used)] - fn segment_range_reader_unbounded() -> crate::Result<()> { - let folder = tempfile::tempdir()?.into_path(); - - let mut writer = Writer::new(Options { - segment_id: 0, - folder: folder.clone(), - data_block_size: 4_096, - index_block_size: 4_096, - })?; - - let items = (0u64..ITEM_COUNT).map(|i| { - InternalValue::from_components( - i.to_be_bytes(), - nanoid::nanoid!().as_bytes(), - 1000 + i, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - let segment_file_path = folder.join("0"); - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(&segment_file_path, (0, 0).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (0, 0).into(), - table.clone(), - block_cache.clone(), - )?; - let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); - - { - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - range_bounds_to_tuple(&..), - ); - - for key in (0u64..ITEM_COUNT).map(u64::to_be_bytes) { - let item = iter.next().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - range_bounds_to_tuple(&..), - ); - - for key in (0u64..ITEM_COUNT).rev().map(u64::to_be_bytes) { - let item = iter.next_back().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - } - - { - log::info!("Getting every item (unbounded start)"); - - let end: Slice = 5_000_u64.to_be_bytes().into(); - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - range_bounds_to_tuple::(&..end), - ); - - for key in (0..5_000).map(u64::to_be_bytes) { - let item = iter.next().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - - log::info!("Getting every item in reverse (unbounded start)"); - - let end: Slice = 5_000_u64.to_be_bytes().into(); - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - range_bounds_to_tuple(&..end), - ); - - for key in (1_000..5_000).rev().map(u64::to_be_bytes) { - let item = iter.next_back().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - } - - { - log::info!("Getting every item (unbounded end)"); - - let start: Slice = 1_000_u64.to_be_bytes().into(); - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - range_bounds_to_tuple(&(start..)), - ); - - for key in (1_000..5_000).map(u64::to_be_bytes) { - let item = iter.next().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - - log::info!("Getting every item in reverse (unbounded end)"); - - let start: Slice = 1_000_u64.to_be_bytes().into(); - let end: Slice = 5_000_u64.to_be_bytes().into(); - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table, - (0, 0).into(), - 
block_cache, - block_index, - range_bounds_to_tuple(&(start..end)), - ); - - for key in (1_000..5_000).rev().map(u64::to_be_bytes) { - let item = iter.next_back().expect("item should exist")?; - assert_eq!(key, &*item.key.user_key); - } - } - - Ok(()) - } - - fn range_bounds_to_tuple(range: &impl RangeBounds) -> (Bound, Bound) { - let start_bound = match range.start_bound() { - Included(value) => Included(value.clone()), - Excluded(value) => Excluded(value.clone()), - Unbounded => Unbounded, - }; - - let end_bound = match range.end_bound() { - Included(value) => Included(value.clone()), - Excluded(value) => Excluded(value.clone()), - Unbounded => Unbounded, - }; - - (start_bound, end_bound) - } - - fn bounds_u64_to_bytes(bounds: &(Bound, Bound)) -> (Bound, Bound) { - let start_bytes = match bounds.0 { - Included(start) => Included(start.to_be_bytes().into()), - Excluded(start) => Excluded(start.to_be_bytes().into()), - Unbounded => Unbounded, - }; - - let end_bytes = match bounds.1 { - Included(end) => Included(end.to_be_bytes().into()), - Excluded(end) => Excluded(end.to_be_bytes().into()), - Unbounded => Unbounded, - }; - - (start_bytes, end_bytes) - } - - fn create_range(bounds: (Bound, Bound)) -> (u64, u64) { - let start = match bounds.0 { - Included(value) => value, - Excluded(value) => value + 1, - Unbounded => 0, - }; - - let end = match bounds.1 { - Included(value) => value + 1, - Excluded(value) => value, - Unbounded => u64::MAX, - }; - - (start, end) - } - - #[test] - fn segment_range_reader_bounded_ranges() -> crate::Result<()> { - for data_block_size in [1, 10, 100, 200, 500, 1_000, 4_096] { - let folder = tempfile::tempdir()?.into_path(); - - let mut writer = Writer::new(Options { - segment_id: 0, - folder: folder.clone(), - data_block_size, - index_block_size: 4_096, - })?; - - let items = (0u64..ITEM_COUNT).map(|i| { - InternalValue::from_components( - i.to_be_bytes(), - nanoid::nanoid!().as_bytes(), - 1000 + i, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - let segment_file_path = folder.join("0"); - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(&segment_file_path, (0, 0).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (0, 0).into(), - table.clone(), - block_cache.clone(), - )?; - let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); - - let ranges: Vec<(Bound, Bound)> = vec![ - range_bounds_to_tuple(&(0..1_000)), - range_bounds_to_tuple(&(0..=1_000)), - range_bounds_to_tuple(&(1_000..5_000)), - range_bounds_to_tuple(&(1_000..=5_000)), - range_bounds_to_tuple(&(1_000..ITEM_COUNT)), - range_bounds_to_tuple(&..5_000), - ]; - - for bounds in ranges { - log::info!("Bounds: {bounds:?}"); - - let (start, end) = create_range(bounds); - - log::debug!("Getting every item in range"); - let range = std::ops::Range { start, end }; - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - bounds_u64_to_bytes(&bounds), - ); - - for key in range.map(u64::to_be_bytes) { - let item = iter.next().unwrap_or_else(|| { - panic!("item should exist: {:?} ({})", key, u64::from_be_bytes(key)) - })?; - - assert_eq!(key, &*item.key.user_key); - } - - log::debug!("Getting every item in range in reverse"); - let range 
= std::ops::Range { start, end }; - - let mut iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - bounds_u64_to_bytes(&bounds), - ); - - for key in range.rev().map(u64::to_be_bytes) { - let item = iter.next_back().unwrap_or_else(|| { - panic!("item should exist: {:?} ({})", key, u64::from_be_bytes(key)) - })?; - - assert_eq!(key, &*item.key.user_key); - } - } - } - - Ok(()) - } - - #[test] - #[allow(clippy::expect_used)] - fn segment_range_reader_char_ranges() -> crate::Result<()> { - let chars = (b'a'..=b'z').collect::>(); - - let folder = tempfile::tempdir()?.into_path(); - - let mut writer = Writer::new(Options { - segment_id: 0, - folder: folder.clone(), - data_block_size: 250, - index_block_size: 4_096, - })?; - - let items = chars.iter().map(|&key| { - InternalValue::from_components( - &[key][..], - *b"dsgfgfdsgsfdsgfdgfdfgdsgfdhsnreezrzsernszsdaadsadsadsadsadsdsensnzersnzers", - 0, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - let segment_file_path = folder.join("0"); - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(&segment_file_path, (0, 0).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (0, 0).into(), - table.clone(), - block_cache.clone(), - )?; - let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); - - for (i, &start_char) in chars.iter().enumerate() { - for &end_char in chars.iter().skip(i + 1) { - log::debug!("checking ({}, {})", start_char as char, end_char as char); - - let expected_range = (start_char..=end_char).collect::>(); - - let iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - ( - Included(Slice::from([start_char])), - Included(Slice::from([end_char])), - ), - ); - - let mut range = iter.flatten().map(|x| x.key.user_key); - - for &item in &expected_range { - assert_eq!(&*range.next().expect("should exist"), &[item]); - } - - let iter = Range::new( - trailer.offsets.index_block_ptr, - table.clone(), - (0, 0).into(), - block_cache.clone(), - block_index.clone(), - ( - Included(Slice::from([start_char])), - Included(Slice::from([end_char])), - ), - ); - - let mut range = iter.flatten().map(|x| x.key.user_key); - - for &item in expected_range.iter().rev() { - assert_eq!(&*range.next_back().expect("should exist"), &[item]); - } - } - } - - Ok(()) - } -} diff --git a/src/segment/reader.rs b/src/segment/reader.rs deleted file mode 100644 index 9c84d871..00000000 --- a/src/segment/reader.rs +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{ - block::offset::BlockOffset, - value_block::{CachePolicy, ValueBlock}, - value_block_consumer::ValueBlockConsumer, -}; -use crate::{ - cache::Cache, descriptor_table::FileDescriptorTable, segment::block::header::Header, - value::InternalValue, GlobalSegmentId, UserKey, -}; -use std::sync::Arc; - -pub struct Reader { - segment_id: GlobalSegmentId, - - descriptor_table: Arc, - block_cache: Arc, - - data_block_boundary: BlockOffset, - - pub lo_block_offset: BlockOffset, - pub(crate) lo_block_size: u64, - 
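// The reader is double-ended: the `lo_*` fields track the block currently
// being consumed from the front, the `hi_*` fields the block consumed from
// the back, and iteration ends once the two cursors meet on the same block
// (see next/next_back below).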
pub(crate) lo_block_items: Option<ValueBlockConsumer>, - pub(crate) lo_initialized: bool, - - pub hi_block_offset: Option<BlockOffset>, - pub hi_block_backlink: BlockOffset, - pub hi_block_items: Option<ValueBlockConsumer>, - pub hi_initialized: bool, - - start_key: Option<UserKey>, - end_key: Option<UserKey>, - - cache_policy: CachePolicy, -} - -impl Reader { - #[must_use] - pub fn new( - data_block_boundary: BlockOffset, - descriptor_table: Arc<FileDescriptorTable>, - segment_id: GlobalSegmentId, - block_cache: Arc<Cache>, - lo_block_offset: BlockOffset, - hi_block_offset: Option<BlockOffset>, - ) -> Self { - Self { - data_block_boundary, - - descriptor_table, - segment_id, - block_cache, - - lo_block_offset, - lo_block_size: 0, - lo_block_items: None, - lo_initialized: false, - - hi_block_offset, - hi_block_backlink: BlockOffset(0), - hi_block_items: None, - hi_initialized: false, - - cache_policy: CachePolicy::Write, - - start_key: None, - end_key: None, - } - } - - /// Sets the lower bound block, such that as many blocks as possible can be skipped. - pub fn set_lower_bound(&mut self, key: UserKey) { - self.start_key = Some(key); - } - - /// Sets the upper bound block, such that as many blocks as possible can be skipped. - pub fn set_upper_bound(&mut self, key: UserKey) { - self.end_key = Some(key); - } - - /// Sets the cache policy - #[must_use] - pub fn cache_policy(mut self, policy: CachePolicy) -> Self { - self.cache_policy = policy; - self - } - - fn load_data_block( - &self, - offset: BlockOffset, - ) -> crate::Result<Option<(u64, BlockOffset, ValueBlockConsumer)>> { - let block = ValueBlock::load_by_block_handle( - &self.descriptor_table, - &self.block_cache, - self.segment_id, - offset, - self.cache_policy, - )?; - - // TODO: we only need to truncate items from blocks that are not the first and last block - // TODO: because any block inbetween must (trivially) only contain relevant items - - // Truncate as many items as possible - block.map_or(Ok(None), |block| { - Ok(Some(( - block.header.data_length.into(), - block.header.previous_block_offset, - ValueBlockConsumer::with_bounds( - block, - self.start_key.as_deref(), - self.end_key.as_deref(), - ), - ))) - }) - } - - fn initialize_lo(&mut self) -> crate::Result<()> { - if let Some((size, _, items)) = self.load_data_block(self.lo_block_offset)? { - self.lo_block_items = Some(items); - self.lo_block_size = size; - } - - self.lo_initialized = true; - - Ok(()) - } - - fn initialize_hi(&mut self) -> crate::Result<()> { - let offset = self - .hi_block_offset - .expect("no hi offset configured for segment reader"); - - if let Some((_, backlink, items)) = self.load_data_block(offset)?
{ - self.hi_block_items = Some(items); - self.hi_block_backlink = backlink; - } - - self.hi_initialized = true; - - Ok(()) - } -} - -impl Iterator for Reader { - type Item = crate::Result; - - fn next(&mut self) -> Option { - if !self.lo_initialized { - fail_iter!(self.initialize_lo()); - } - - if let Some(head) = self.lo_block_items.as_mut()?.next() { - // Just consume item - return Some(Ok(head)); - } - - // Front buffer is empty - - // Load next block - let next_block_offset = BlockOffset( - *self.lo_block_offset + Header::serialized_len() as u64 + self.lo_block_size, - ); - - assert_ne!( - self.lo_block_offset, next_block_offset, - "invalid next block offset" - ); - - if next_block_offset >= self.data_block_boundary { - // We are done - return None; - } - - if let Some(hi_offset) = self.hi_block_offset { - if next_block_offset == hi_offset { - if !self.hi_initialized { - fail_iter!(self.initialize_hi()); - } - - // We reached the last block, consume from it instead - return self.hi_block_items.as_mut()?.next().map(Ok); - } - } - - // TODO: when loading the next data block, we unnecessarily do binary search through it - // (ValueBlock::with_bounds), but we may be able to skip it sometimes - match fail_iter!(self.load_data_block(next_block_offset)) { - Some((size, _, items)) => { - self.lo_block_items = Some(items); - self.lo_block_size = size; - self.lo_block_offset = next_block_offset; - - // We just loaded the block - self.lo_block_items.as_mut()?.next().map(Ok) - } - None => { - panic!("searched for invalid data block"); - } - } - } -} - -impl DoubleEndedIterator for Reader { - fn next_back(&mut self) -> Option { - if !self.hi_initialized { - fail_iter!(self.initialize_hi()); - } - - loop { - // NOTE: See init function - let hi_offset = self - .hi_block_offset - .expect("no hi offset configured for segment reader"); - - if hi_offset == self.lo_block_offset { - if !self.lo_initialized { - fail_iter!(self.initialize_lo()); - } - - // We reached the last block, consume from it instead - return self.lo_block_items.as_mut()?.next_back().map(Ok); - } - - if let Some(tail) = self.hi_block_items.as_mut()?.next_back() { - // Just consume item - return Some(Ok(tail)); - } - - // Back buffer is empty - - if hi_offset == BlockOffset(0) { - // We are done - return None; - } - - // Load prev block - let prev_block_offset = self.hi_block_backlink; - - if prev_block_offset == self.lo_block_offset { - if !self.lo_initialized { - fail_iter!(self.initialize_lo()); - } - - // We reached the last block, consume from it instead - return self.lo_block_items.as_mut()?.next_back().map(Ok); - } - - // TODO: when loading the next data block, we unnecessarily do binary search through it - // (ValueBlock::with_bounds), but we may be able to skip it sometimes - match fail_iter!(self.load_data_block(prev_block_offset)) { - Some((_, backlink, items)) => { - self.hi_block_items = Some(items); - self.hi_block_backlink = backlink; - self.hi_block_offset = Some(prev_block_offset); - - // We just loaded the block - if let Some(item) = self.hi_block_items.as_mut()?.next_back() { - return Some(Ok(item)); - } - } - None => { - panic!("searched for invalid data block"); - } - } - } - } -} diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs deleted file mode 100644 index 53191bb6..00000000 --- a/src/segment/scanner.rs +++ /dev/null @@ -1,53 +0,0 @@ -use super::value_block::ValueBlock; -use crate::InternalValue; -use std::{collections::VecDeque, fs::File, io::BufReader, path::Path}; - -/// Segment reader that is 
optimized for consuming an entire segment -pub struct Scanner { - reader: BufReader<File>, - - block_count: usize, - read_count: usize, - - buffer: VecDeque<InternalValue>, -} - -impl Scanner { - pub fn new(path: &Path, block_count: usize) -> crate::Result<Self> { - // TODO: a larger buffer size may be better for HDD - let reader = BufReader::with_capacity(8 * 4_096, File::open(path)?); - - Ok(Self { - reader, - block_count, - read_count: 0, - buffer: VecDeque::new(), - }) - } -} - -impl Iterator for Scanner { - type Item = crate::Result<InternalValue>; - - fn next(&mut self) -> Option<Self::Item> { - loop { - if let Some(item) = self.buffer.pop_front() { - return Some(Ok(item)); - } - - if self.read_count >= self.block_count { - return None; - } - - let block = ValueBlock::from_reader(&mut self.reader); - let block = fail_iter!(block); - - // TODO: 1.80? IntoIter impl for Box<[T]> - self.buffer.extend(block.items.into_vec()); - - self.read_count += 1; - } - } -} - -pub type CompactionReader<'a> = Box<dyn Iterator<Item = crate::Result<InternalValue>> + 'a>; diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs deleted file mode 100644 index 0c921ba3..00000000 --- a/src/segment/trailer.rs +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{file_offsets::FileOffsets, meta::Metadata}; -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, -}; -use std::{ - fs::File, - io::{BufReader, Read, Seek, Write}, - path::Path, -}; - -pub const TRAILER_SIZE: usize = 256; - -#[derive(Debug)] -#[allow(clippy::module_name_repetitions)] -pub struct SegmentFileTrailer { - #[doc(hidden)] - pub metadata: Metadata, - - #[doc(hidden)] - pub offsets: FileOffsets, -} - -impl SegmentFileTrailer { - pub fn from_file(path: &Path) -> crate::Result<Self> { - let file = File::open(path)?; - let mut reader = BufReader::new(file); - reader.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; - - // Parse pointers - let offsets = FileOffsets::decode_from(&mut reader)?; - - let remaining_padding = TRAILER_SIZE - FileOffsets::serialized_len() - MAGIC_BYTES.len(); - reader.seek_relative(remaining_padding as i64)?; - - // Check trailer magic - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(crate::Error::Decode(DecodeError::InvalidHeader( - "SegmentTrailer", - ))); - } - - log::trace!("Trailer offsets: {offsets:#?}"); - - // Jump to metadata and parse - reader.seek(std::io::SeekFrom::Start(*offsets.metadata_ptr))?; - let metadata = Metadata::decode_from(&mut reader)?; - - Ok(Self { metadata, offsets }) - } -} - -impl Encode for SegmentFileTrailer { - fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> { - let mut v = Vec::with_capacity(TRAILER_SIZE); - - // TODO: 3.0.0, magic header, too?
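// Layout written below mirrors what `from_file` above expects when it seeks
// to `End(-TRAILER_SIZE)`: the encoded offsets first, zero padding up to
// TRAILER_SIZE - MAGIC_BYTES.len(), and the magic bytes as the very last
// bytes of the file.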
- - self.offsets.encode_into(&mut v)?; - - // Pad with remaining bytes - v.resize(TRAILER_SIZE - MAGIC_BYTES.len(), 0); - - v.write_all(&MAGIC_BYTES)?; - - assert_eq!( - v.len(), - TRAILER_SIZE, - "segment file trailer has invalid size" - ); - - writer.write_all(&v)?; - - Ok(()) - } -} diff --git a/src/segment/value_block.rs b/src/segment/value_block.rs deleted file mode 100644 index 87f7b116..00000000 --- a/src/segment/value_block.rs +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{block::Block, id::GlobalSegmentId}; -use crate::{ - binary_search::partition_point, cache::Cache, descriptor_table::FileDescriptorTable, - segment::block::offset::BlockOffset, value::InternalValue, -}; -use std::sync::Arc; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum CachePolicy { - /// Read cached blocks, but do not change cache - Read, - - /// Read cached blocks, and update cache - Write, -} - -/// Value blocks are the building blocks of a [`crate::segment::Segment`]. Each block is a sorted list of [`InternalValue`]s, -/// and stored in compressed form on disk, in sorted order. -/// -/// The integrity of a block can be checked using the checksum value that is saved in it. -#[allow(clippy::module_name_repetitions)] -pub type ValueBlock = Block; - -impl ValueBlock { - #[must_use] - pub fn get_latest(&self, key: &[u8]) -> Option<&InternalValue> { - let idx = partition_point(&self.items, |item| &*item.key.user_key < key); - - self.items - .get(idx) - .filter(|&item| &*item.key.user_key == key) - } - - pub fn load_by_block_handle( - descriptor_table: &FileDescriptorTable, - block_cache: &Cache, - segment_id: GlobalSegmentId, - offset: BlockOffset, - cache_policy: CachePolicy, - ) -> crate::Result>> { - Ok( - if let Some(block) = block_cache.get_data_block(segment_id, offset) { - // Cache hit: Copy from block - - Some(block) - } else { - // Cache miss: load from disk - - log::trace!("loading value block from disk: {segment_id:?}/{offset:?}"); - - let file_guard = descriptor_table - .access(&segment_id)? 
- .ok_or(()) - .map_err(|()| { - log::error!("Failed to get file guard for segment {segment_id:?}"); - }) - .expect("should acquire file handle"); - // TODO: ^ use inspect instead: 1.76 - - let block = Self::from_file( - &mut *file_guard.file.lock().expect("lock is poisoned"), - offset, - ) - .map_err(|e| { - log::error!("Failed to load value block {segment_id:?}/{offset:?}: {e:?}"); - e - })?; - // TODO: ^ inspect_err instead: 1.76 - - drop(file_guard); - - let block = Arc::new(block); - - if cache_policy == CachePolicy::Write { - block_cache.insert_data_block(segment_id, offset, block.clone()); - } - - Some(block) - }, - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - segment::{ - block::{checksum::Checksum, header::Header as BlockHeader, ItemSize}, - meta::CompressionType, - }, - ValueType, - }; - use test_log::test; - - #[test] - fn value_block_size() { - let items = [ - InternalValue::from_components(*b"ba", *b"asd", 2, ValueType::Value), - InternalValue::from_components(*b"bb", *b"def", 1, ValueType::Value), - ]; - assert_eq!(28, items.size()); - } - - #[test] - #[allow(clippy::unwrap_used)] - fn value_block_find_latest() { - let items = [ - InternalValue::from_components(*b"b", *b"b", 2, ValueType::Value), - InternalValue::from_components(*b"b", *b"b", 1, ValueType::Value), - InternalValue::from_components(*b"b", *b"b", 0, ValueType::Value), - InternalValue::from_components(*b"c", *b"c", 0, ValueType::Value), - InternalValue::from_components(*b"d", *b"d", 5, ValueType::Value), - ]; - - let block = ValueBlock { - items: items.into(), - header: BlockHeader { - compression: CompressionType::None, - checksum: Checksum::from_raw(0), - data_length: 0, - previous_block_offset: BlockOffset(0), - uncompressed_length: 0, - }, - }; - - assert_eq!(block.get_latest(b"a"), None); - assert_eq!( - block.get_latest(b"b"), - Some(&InternalValue::from_components( - *b"b", - *b"b", - 2, - ValueType::Value - )) - ); - assert_eq!( - block.get_latest(b"c"), - Some(&InternalValue::from_components( - *b"c", - *b"c", - 0, - ValueType::Value - )) - ); - assert_eq!( - block.get_latest(b"d"), - Some(&InternalValue::from_components( - *b"d", - *b"d", - 5, - ValueType::Value - )) - ); - assert_eq!(block.get_latest(b"e"), None); - } -} diff --git a/src/segment/value_block_consumer.rs b/src/segment/value_block_consumer.rs deleted file mode 100644 index f02fca05..00000000 --- a/src/segment/value_block_consumer.rs +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::value_block::ValueBlock; -use crate::{binary_search::partition_point, value::InternalValue}; -use std::sync::Arc; - -pub struct ValueBlockConsumer { - pub(crate) inner: Arc, - lo: usize, - hi: usize, -} - -impl ValueBlockConsumer { - #[must_use] - pub fn new(inner: Arc) -> Self { - Self::with_bounds(inner, None, None) - } - - #[must_use] - pub fn with_bounds( - inner: Arc, - start_key: Option<&[u8]>, - end_key: Option<&[u8]>, - ) -> Self { - let mut lo = start_key.as_ref().map_or(0, |key| { - partition_point(&inner.items, |x| &*x.key.user_key < *key) - }); - - let hi = end_key.as_ref().map_or_else( - || inner.items.len() - 1, - |key| { - let idx = partition_point(&inner.items, |x| &*x.key.user_key <= *key); - - if idx == 0 { - let first = inner - .items - .first() - .expect("value block should not be empty"); - - if &*first.key.user_key > *key { - lo = 1; - } - } - - 
idx.saturating_sub(1) - }, - ); - - Self { inner, lo, hi } - } -} - -impl Iterator for ValueBlockConsumer { - type Item = InternalValue; - - fn next(&mut self) -> Option { - if self.lo > self.hi { - None - } else { - let item = self.inner.items.get(self.lo)?; - self.lo += 1; - - Some(item.clone()) - } - } -} - -impl DoubleEndedIterator for ValueBlockConsumer { - fn next_back(&mut self) -> Option { - if self.hi < self.lo { - None - } else { - let item = self.inner.items.get(self.hi)?; - - if self.hi == 0 { - // Prevent underflow - self.lo += 1; - } else { - self.hi -= 1; - } - - Some(item.clone()) - } - } -} - -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use super::*; - use crate::segment::block::{checksum::Checksum, header::Header, offset::BlockOffset}; - use test_log::test; - - macro_rules! iter_closed { - ($iter:expr) => { - assert!($iter.next().is_none(), "iterator should be closed (done)"); - assert!( - $iter.next_back().is_none(), - "iterator should be closed (done)" - ); - }; - } - - fn block(items: Vec) -> ValueBlock { - ValueBlock { - header: Header { - compression: crate::segment::meta::CompressionType::None, - checksum: Checksum::from_raw(0), - data_length: 0, - previous_block_offset: BlockOffset(0), - uncompressed_length: 0, - }, - items: items.into_boxed_slice(), - } - } - - #[test] - fn block_consumer_simple() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::new(block.into()); - assert_eq!(*b"a", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"b", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"d", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"e", &*iter.next().expect("should exist").key.user_key); - iter_closed!(iter); - } - - #[test] - fn block_consumer_simple_rev() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::new(block.into()); - assert_eq!( - *b"e", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"d", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"c", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"b", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"a", - &*iter.next_back().expect("should exist").key.user_key - ); - iter_closed!(iter); - } - - #[test] - fn block_consumer_simple_ping_pong() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 
0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::new(block.clone().into()); - assert_eq!(*b"a", &*iter.next().expect("should exist").key.user_key); - assert_eq!( - *b"e", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!(*b"b", &*iter.next().expect("should exist").key.user_key); - assert_eq!( - *b"d", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); - iter_closed!(iter); - - let mut iter = ValueBlockConsumer::new(block.into()); - assert_eq!( - *b"e", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!(*b"a", &*iter.next().expect("should exist").key.user_key); - assert_eq!( - *b"d", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!(*b"b", &*iter.next().expect("should exist").key.user_key); - assert_eq!( - *b"c", - &*iter.next_back().expect("should exist").key.user_key - ); - iter_closed!(iter); - } - - #[test] - fn block_consumer_start_key() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), Some(b"c"), None); - assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"d", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"e", &*iter.next().expect("should exist").key.user_key); - iter_closed!(iter); - - let mut iter = ValueBlockConsumer::with_bounds(block.into(), Some(b"c"), None); - assert_eq!( - *b"e", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"d", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"c", - &*iter.next_back().expect("should exist").key.user_key - ); - iter_closed!(iter); - } - - #[test] - fn block_consumer_end_key() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), None, Some(b"c")); - assert_eq!(*b"a", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"b", &*iter.next().expect("should exist").key.user_key); - assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); - iter_closed!(iter); - - let mut iter = ValueBlockConsumer::with_bounds(block.into(), None, Some(b"c")); - assert_eq!( - *b"c", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"b", - &*iter.next_back().expect("should exist").key.user_key - ); - assert_eq!( - *b"a", - &*iter.next_back().expect("should exist").key.user_key - ); - iter_closed!(iter); - } - - #[test] - fn block_consumer_no_range_end() { - let block = block(vec![ - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - 
InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), None, Some(b"a")); - iter_closed!(iter); - - let mut iter = ValueBlockConsumer::with_bounds(block.into(), None, Some(b"a")).rev(); - iter_closed!(iter); - } - - #[test] - fn block_consumer_no_range_start() { - let block = block(vec![ - InternalValue::from_components(*b"a", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"b", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"c", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"d", vec![], 0, crate::ValueType::Value), - InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), - ]); - - let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), Some(b"f"), None); - iter_closed!(iter); - - let mut iter = ValueBlockConsumer::with_bounds(block.into(), Some(b"f"), None).rev(); - iter_closed!(iter); - } -} diff --git a/src/segment/writer/meta.rs b/src/segment/writer/meta.rs deleted file mode 100644 index 01d3e95b..00000000 --- a/src/segment/writer/meta.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{segment::block::offset::BlockOffset, SeqNo, UserKey}; - -pub struct Metadata { - /// Written data block count - pub data_block_count: usize, - - /// Written index block count - pub index_block_count: usize, - - /// Written item count - pub item_count: usize, - - /// Tombstone count - pub tombstone_count: usize, - - /// Written key count (unique keys) - pub key_count: usize, - - /// Current file position of writer - pub file_pos: BlockOffset, - - /// Only takes user data into account - pub uncompressed_size: u64, - - /// First encountered key - pub first_key: Option, - - /// Last encountered key - pub last_key: Option, - - /// Lowest encountered seqno - pub lowest_seqno: SeqNo, - - /// Highest encountered seqno - pub highest_seqno: SeqNo, -} - -impl Default for Metadata { - fn default() -> Self { - Self { - data_block_count: 0, - index_block_count: 0, - - item_count: 0, - tombstone_count: 0, - key_count: 0, - file_pos: BlockOffset(0), - uncompressed_size: 0, - - first_key: None, - last_key: None, - - lowest_seqno: SeqNo::MAX, - highest_seqno: 0, - } - } -} diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs deleted file mode 100644 index 8ba46df1..00000000 --- a/src/segment/writer/mod.rs +++ /dev/null @@ -1,574 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -mod meta; - -use super::{ - block::{header::Header as BlockHeader, offset::BlockOffset}, - block_index::writer::Writer as IndexWriter, - file_offsets::FileOffsets, - meta::{CompressionType, Metadata}, - trailer::SegmentFileTrailer, - value_block::ValueBlock, -}; -use crate::{ - bloom::BloomFilter, - coding::Encode, - file::fsync_directory, - segment::block::ItemSize, - value::{InternalValue, UserKey}, - SegmentId, -}; -use std::{ - fs::File, - io::{BufWriter, Seek, Write}, - path::PathBuf, -}; - -/// Serializes and compresses values into blocks and writes them to disk as segment -pub 
struct Writer { - pub(crate) opts: Options, - - /// Compression to use - compression: CompressionType, - - /// Segment file - segment_file_path: PathBuf, - - /// Writer of data blocks - block_writer: BufWriter<File>, - - /// Writer of index blocks - index_writer: IndexWriter, - - /// Buffer of KVs - chunk: Vec<InternalValue>, - chunk_size: usize, - - pub(crate) meta: meta::Metadata, - - /// Stores the previous block position (used for creating back links) - prev_pos: (BlockOffset, BlockOffset), - - current_key: Option<UserKey>, - - bloom_policy: BloomConstructionPolicy, - - /// Hashes for bloom filter - /// - /// using enhanced double hashing, so we get two u64s - bloom_hash_buffer: Vec<(u64, u64)>, -} - -#[derive(Copy, Clone, Debug)] -pub enum BloomConstructionPolicy { - BitsPerKey(u8), - FpRate(f32), -} - -impl Default for BloomConstructionPolicy { - fn default() -> Self { - Self::BitsPerKey(10) - } -} - -impl BloomConstructionPolicy { - #[must_use] - pub fn build(&self, n: usize) -> BloomFilter { - match self { - Self::BitsPerKey(bpk) => BloomFilter::with_bpk(n, *bpk), - Self::FpRate(fpr) => BloomFilter::with_fp_rate(n, *fpr), - } - } - - #[must_use] - pub fn is_active(&self) -> bool { - match self { - Self::BitsPerKey(bpk) => *bpk > 0, - Self::FpRate(_) => true, - } - } -} - -pub struct Options { - pub folder: PathBuf, - pub data_block_size: u32, - pub index_block_size: u32, - pub segment_id: SegmentId, -} - -impl Writer { - /// Sets up a new `Writer` at the given folder - pub fn new(opts: Options) -> crate::Result<Self> { - let segment_file_path = opts.folder.join(opts.segment_id.to_string()); - - let block_writer = File::create(&segment_file_path)?; - let block_writer = BufWriter::with_capacity(u16::MAX.into(), block_writer); - - let index_writer = IndexWriter::new(opts.index_block_size)?; - - let chunk = Vec::new(); - - Ok(Self { - opts, - meta: meta::Metadata::default(), - - compression: CompressionType::None, - - segment_file_path, - - block_writer, - index_writer, - chunk, - - prev_pos: (BlockOffset(0), BlockOffset(0)), - - chunk_size: 0, - - current_key: None, - - bloom_policy: BloomConstructionPolicy::default(), - - bloom_hash_buffer: Vec::new(), - }) - } - - #[must_use] - pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self.index_writer = self.index_writer.use_compression(compression); - self - } - - #[must_use] - pub(crate) fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { - self.bloom_policy = bloom_policy; - self - } - - /// Writes a compressed block to disk. - /// - /// This is triggered when a `Writer::write` causes the buffer to grow to the configured `block_size`. - /// - /// Should only be called when the block has items in it.
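// A minimal sketch of the textbook Bloom filter math behind
// `BloomConstructionPolicy` above: with `bpk` bits per key, the optimal
// hash count is k = bpk * ln 2 and the false positive rate is roughly
// 0.5^k. `theoretical_fp_rate` is a hypothetical helper for
// illustration only, not crate API.
fn theoretical_fp_rate(bits_per_key: f64) -> f64 {
    // round to a whole number of hash functions, as a real filter must
    let k = (bits_per_key * std::f64::consts::LN_2).round().max(1.0);
    0.5_f64.powf(k)
}
// e.g. the default `BitsPerKey(10)` gives k = 7, i.e. an FP rate of ~0.8%.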
- pub(crate) fn spill_block(&mut self) -> crate::Result<()> { - let Some(last) = self.chunk.last() else { - return Ok(()); - }; - - let (header, data) = - ValueBlock::to_bytes_compressed(&self.chunk, self.prev_pos.0, self.compression)?; - - self.meta.uncompressed_size += u64::from(header.uncompressed_length); - - header.encode_into(&mut self.block_writer)?; - - // Write to file - self.block_writer.write_all(&data)?; - - let bytes_written = (BlockHeader::serialized_len() + data.len()) as u64; - - self.index_writer - .register_block(last.key.user_key.clone(), self.meta.file_pos)?; - - // Adjust metadata - self.meta.file_pos += bytes_written; - self.meta.item_count += self.chunk.len(); - self.meta.data_block_count += 1; - - // Back link stuff - self.prev_pos.0 = self.prev_pos.1; - self.prev_pos.1 += bytes_written; - - // Set last key - self.meta.last_key = Some( - // NOTE: Expect is fine, because the chunk is not empty - // - // Also, we are allowed to remove the last item - // to get ownership of it, because the chunk is cleared after - // this anyway - #[allow(clippy::expect_used)] - self.chunk - .pop() - .expect("chunk should not be empty") - .key - .user_key, - ); - - // IMPORTANT: Clear chunk after everything else - self.chunk.clear(); - self.chunk_size = 0; - - Ok(()) - } - - /// Writes an item. - /// - /// # Note - /// - /// It's important that the incoming stream of items is correctly - /// sorted as described by the [`UserKey`], otherwise the block layout will - /// be nonsense. - pub fn write(&mut self, item: InternalValue) -> crate::Result<()> { - if item.is_tombstone() { - self.meta.tombstone_count += 1; - } - - // NOTE: Check if we visit a new key - if Some(&item.key.user_key) != self.current_key.as_ref() { - self.meta.key_count += 1; - self.current_key = Some(item.key.user_key.clone()); - - // IMPORTANT: Do not buffer *every* item's key - // because there may be multiple versions - // of the same key - if self.bloom_policy.is_active() { - self.bloom_hash_buffer - .push(BloomFilter::get_hash(&item.key.user_key)); - } - } - - let seqno = item.key.seqno; - - if self.meta.first_key.is_none() { - self.meta.first_key = Some(item.key.user_key.clone()); - } - - self.chunk_size += item.size(); - self.chunk.push(item); - - if self.chunk_size >= self.opts.data_block_size as usize { - self.spill_block()?; - } - - self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno); - self.meta.highest_seqno = self.meta.highest_seqno.max(seqno); - - Ok(()) - } - - // TODO: should take mut self to avoid double finish - - /// Finishes the segment, making sure all data is written durably - pub fn finish(&mut self) -> crate::Result<Option<SegmentFileTrailer>> { - self.spill_block()?; - - // No items written!
Just delete segment file and return nothing - if self.meta.item_count == 0 { - std::fs::remove_file(&self.segment_file_path)?; - return Ok(None); - } - - let index_block_ptr = BlockOffset(self.block_writer.stream_position()?); - log::trace!("index_block_ptr={index_block_ptr}"); - - // Append index blocks to file - let tli_ptr = self.index_writer.finish(&mut self.block_writer)?; - log::trace!("tli_ptr={tli_ptr}"); - - self.meta.index_block_count = self.index_writer.block_count; - - // Write bloom filter - let bloom_ptr = { - if self.bloom_hash_buffer.is_empty() { - BlockOffset(0) - } else { - let bloom_ptr = self.block_writer.stream_position()?; - let n = self.bloom_hash_buffer.len(); - - log::trace!( - "Constructing Bloom filter with {n} entries: {:?}", - self.bloom_policy, - ); - - let start = std::time::Instant::now(); - - let mut filter = self.bloom_policy.build(n); - - for hash in std::mem::take(&mut self.bloom_hash_buffer) { - filter.set_with_hash(hash); - } - - log::trace!("Built Bloom filter in {:?}", start.elapsed()); - - filter.encode_into(&mut self.block_writer)?; - - BlockOffset(bloom_ptr) - } - }; - log::trace!("bloom_ptr={bloom_ptr}"); - - // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - Write range filter - let rf_ptr = BlockOffset(0); - log::trace!("rf_ptr={rf_ptr}"); - - // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - Write range tombstones - let range_tombstones_ptr = BlockOffset(0); - log::trace!("range_tombstones_ptr={range_tombstones_ptr}"); - - // TODO: - let pfx_ptr = BlockOffset(0); - log::trace!("pfx_ptr={pfx_ptr}"); - - // Write metadata - let metadata_ptr = BlockOffset(self.block_writer.stream_position()?); - - let metadata = Metadata::from_writer(self.opts.segment_id, self)?; - metadata.encode_into(&mut self.block_writer)?; - - // Bundle all the file offsets - let offsets = FileOffsets { - index_block_ptr, - tli_ptr, - bloom_ptr, - range_filter_ptr: rf_ptr, - range_tombstones_ptr, - pfx_ptr, - metadata_ptr, - }; - - // Write trailer - let trailer = SegmentFileTrailer { metadata, offsets }; - trailer.encode_into(&mut self.block_writer)?; - - // Finally, flush & fsync the blocks file - self.block_writer.flush()?; - self.block_writer.get_mut().sync_all()?; - - // IMPORTANT: fsync folder on Unix - fsync_directory(&self.opts.folder)?; - - log::debug!( - "Written {} items in {} blocks into new segment file, written {} MiB", - self.meta.item_count, - self.meta.data_block_count, - *self.meta.file_pos / 1_024 / 1_024, - ); - - Ok(Some(trailer)) - } -} - -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use super::*; - use crate::cache::Cache; - use crate::descriptor_table::FileDescriptorTable; - use crate::segment::block_index::top_level::TopLevelIndex; - use crate::segment::reader::Reader; - use crate::value::{InternalValue, ValueType}; - use std::sync::Arc; - use test_log::test; - - #[test] - fn segment_writer_seqnos() -> crate::Result<()> { - let folder = tempfile::tempdir()?.into_path(); - - let segment_id = 532; - - let mut writer = Writer::new(Options { - folder, - data_block_size: 4_096, - index_block_size: 4_096, - segment_id, - })?; - - writer.write(InternalValue::from_components( - "a", - nanoid::nanoid!().as_bytes(), - 7, - ValueType::Value, - ))?; - writer.write(InternalValue::from_components( - "b", - nanoid::nanoid!().as_bytes(), - 5, - ValueType::Value, - ))?; - writer.write(InternalValue::from_components( - "c", - nanoid::nanoid!().as_bytes(), - 8, - ValueType::Value, - ))?; - 
writer.write(InternalValue::from_components( - "d", - nanoid::nanoid!().as_bytes(), - 10, - ValueType::Value, - ))?; - - let trailer = writer.finish()?.expect("should exist"); - - assert_eq!(5, trailer.metadata.seqnos.0); - assert_eq!(10, trailer.metadata.seqnos.1); - - Ok(()) - } - - #[test] - fn segment_writer_zero_bpk() -> crate::Result<()> { - const ITEM_COUNT: u64 = 100; - - let folder = tempfile::tempdir()?.into_path(); - - let segment_id = 532; - - let mut writer = Writer::new(Options { - folder, - data_block_size: 4_096, - index_block_size: 4_096, - segment_id, - })? - .use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); - - let items = (0u64..ITEM_COUNT).map(|i| { - InternalValue::from_components( - i.to_be_bytes(), - nanoid::nanoid!().as_bytes(), - 0, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - assert_eq!(ITEM_COUNT, trailer.metadata.item_count); - assert_eq!(ITEM_COUNT, trailer.metadata.key_count); - assert_eq!(trailer.offsets.bloom_ptr, BlockOffset(0)); - - Ok(()) - } - - #[test] - fn segment_writer_write_read() -> crate::Result<()> { - const ITEM_COUNT: u64 = 100; - - let folder = tempfile::tempdir()?.into_path(); - - let segment_id = 532; - - let mut writer = Writer::new(Options { - folder: folder.clone(), - data_block_size: 4_096, - index_block_size: 4_096, - segment_id, - })?; - - let items = (0u64..ITEM_COUNT).map(|i| { - InternalValue::from_components( - i.to_be_bytes(), - nanoid::nanoid!().as_bytes(), - 0, - ValueType::Value, - ) - }); - - for item in items { - writer.write(item)?; - } - - let trailer = writer.finish()?.expect("should exist"); - - assert_eq!(ITEM_COUNT, trailer.metadata.item_count); - assert_eq!(ITEM_COUNT, trailer.metadata.key_count); - - assert!(*trailer.offsets.bloom_ptr > 0); - - let segment_file_path = folder.join(segment_id.to_string()); - - // NOTE: The TLI is bound by the index block count, because we know the index block count is u32 - // the TLI length fits into u32 as well - #[allow(clippy::cast_possible_truncation)] - { - let tli = TopLevelIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - )?; - - assert_eq!(tli.len() as u32, trailer.metadata.index_block_count); - } - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(segment_file_path, (0, segment_id).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - - let iter = Reader::new( - trailer.offsets.index_block_ptr, - table, - (0, segment_id).into(), - block_cache, - BlockOffset(0), - None, - ); - - assert_eq!(ITEM_COUNT, iter.count() as u64); - - Ok(()) - } - - #[test] - fn segment_writer_write_read_mvcc() -> crate::Result<()> { - const ITEM_COUNT: u64 = 1_000; - const VERSION_COUNT: u64 = 5; - - let folder = tempfile::tempdir()?.into_path(); - - let segment_id = 532; - - let mut writer = Writer::new(Options { - folder: folder.clone(), - data_block_size: 4_096, - index_block_size: 4_096, - segment_id, - })?; - - for key in 0u64..ITEM_COUNT { - for seqno in (0..VERSION_COUNT).rev() { - let value = InternalValue::from_components( - key.to_be_bytes(), - nanoid::nanoid!().as_bytes(), - seqno, - ValueType::Value, - ); - - writer.write(value)?; - } - } - - let trailer = writer.finish()?.expect("should exist"); - - assert_eq!(ITEM_COUNT * VERSION_COUNT, trailer.metadata.item_count); - assert_eq!(ITEM_COUNT, trailer.metadata.key_count); - - assert!(*trailer.offsets.bloom_ptr > 0); - - let 
segment_file_path = folder.join(segment_id.to_string()); - - let table = Arc::new(FileDescriptorTable::new(512, 1)); - table.insert(segment_file_path, (0, segment_id).into()); - - let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - - let iter = Reader::new( - trailer.offsets.index_block_ptr, - table, - (0, segment_id).into(), - block_cache, - BlockOffset(0), - None, - ); - - assert_eq!(ITEM_COUNT * VERSION_COUNT, iter.count() as u64); - - Ok(()) - } -} diff --git a/src/super_segment/block_index/mod.rs b/src/super_segment/block_index/mod.rs index d7721723..f6e0eb01 100644 --- a/src/super_segment/block_index/mod.rs +++ b/src/super_segment/block_index/mod.rs @@ -2,8 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{IndexBlock, NewKeyedBlockHandle}; -use crate::segment::value_block::CachePolicy; +use super::{CachePolicy, IndexBlock, NewKeyedBlockHandle}; #[enum_dispatch::enum_dispatch] pub trait NewBlockIndex { diff --git a/src/super_segment/data_block/mod.rs b/src/super_segment/data_block/mod.rs index ddc79991..c12e5fb4 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/super_segment/data_block/mod.rs @@ -388,7 +388,7 @@ impl DataBlock { let key_start = offset + reader.position() as usize; unwrappy!(reader.seek_relative(key_len as i64)); - let val_len: usize = if value_type == ValueType::Value.into() { + let val_len: usize = if value_type == u8::from(ValueType::Value) { unwrappy!(reader.read_u32_varint()) as usize } else { 0 @@ -396,7 +396,7 @@ impl DataBlock { let val_offset = offset + reader.position() as usize; unwrappy!(reader.seek_relative(val_len as i64)); - Some(if value_type == ValueType::Value.into() { + Some(if value_type == u8::from(ValueType::Value) { ParsedItem { value_type, seqno, @@ -435,7 +435,7 @@ impl DataBlock { unwrappy!(reader.seek_relative(rest_key_len as i64)); - let val_len: usize = if value_type == ValueType::Value.into() { + let val_len: usize = if value_type == u8::from(ValueType::Value) { unwrappy!(reader.read_u32_varint()) as usize } else { 0 @@ -443,7 +443,7 @@ impl DataBlock { let val_offset = offset + reader.position() as usize; unwrappy!(reader.seek_relative(val_len as i64)); - Some(if value_type == ValueType::Value.into() { + Some(if value_type == u8::from(ValueType::Value) { ParsedItem { value_type, seqno, diff --git a/src/super_segment/filter/standard_bloom/builder.rs b/src/super_segment/filter/standard_bloom/builder.rs index 8457d45b..a4e596fe 100644 --- a/src/super_segment/filter/standard_bloom/builder.rs +++ b/src/super_segment/filter/standard_bloom/builder.rs @@ -2,9 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::super_segment::filter::bit_array::BitArrayReader; - use super::{super::bit_array::Builder as BitArrayBuilder, StandardBloomFilter}; +use crate::super_segment::filter::bit_array::BitArrayReader; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); @@ -95,13 +94,13 @@ impl Builder { /// Adds the key to the filter. 
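// A minimal sketch of the double-hashing probe sequence that
// `set_with_hash` below (and `contains_hash`) iterate after this patch:
// the i-th probed bit index comes from h1, and (h1, h2) are advanced
// between probes, so one hash of the key yields k distinct indices
// without rehashing. `probe_indices` is a hypothetical helper mirroring
// the patched loop, not crate API; `m` is the filter's bit count, `k`
// its hash count.
fn probe_indices(mut h1: u64, mut h2: u64, m: u64, k: u64) -> Vec<usize> {
    (1..=k)
        .map(|i| {
            let idx = h1 % m; // bit probed in round i
            h1 = h1.wrapping_add(h2);
            h2 = h2.wrapping_mul(i);
            idx as usize
        })
        .collect()
}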
pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { - for i in 0..(self.k as u64) { + for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); self.enable_bit(idx as usize); h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); + h2 = h2.wrapping_mul(i); } } diff --git a/src/super_segment/filter/standard_bloom/mod.rs b/src/super_segment/filter/standard_bloom/mod.rs index dbfc00ac..16d8b413 100644 --- a/src/super_segment/filter/standard_bloom/mod.rs +++ b/src/super_segment/filter/standard_bloom/mod.rs @@ -3,13 +3,12 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, }; -use builder::CompositeHash; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; mod builder; -pub use builder::Builder; +pub use builder::{Builder, CompositeHash}; /// A standard bloom filter /// @@ -30,6 +29,8 @@ pub struct StandardBloomFilter { k: usize, } +// TODO: change encode/decode to be Filter enum + impl Encode for StandardBloomFilter { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { // Write header @@ -100,7 +101,7 @@ impl StandardBloomFilter { pub(crate) fn contains_hash(&self, hash: CompositeHash) -> bool { let (mut h1, mut h2) = hash; - for i in 0..(self.k as u64) { + for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); // NOTE: should be in bounds because of modulo @@ -110,7 +111,7 @@ impl StandardBloomFilter { } h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); + h2 = h2.wrapping_mul(i); } true @@ -131,13 +132,11 @@ impl StandardBloomFilter { /// Gets the hash of a key. fn get_hash(key: &[u8]) -> CompositeHash { - let h0 = xxhash_rust::xxh3::xxh3_128(key); - let h1 = (h0 >> 64) as u64; - let h2 = h0 as u64; - (h1, h2) + Builder::get_hash(key) } } +// TODO: restore #[cfg(test)] mod tests { use super::*; diff --git a/src/segment/id.rs b/src/super_segment/id.rs similarity index 96% rename from src/segment/id.rs rename to src/super_segment/id.rs index 4f2281ba..f89a3ba2 100644 --- a/src/segment/id.rs +++ b/src/super_segment/id.rs @@ -2,9 +2,10 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::meta::SegmentId; use crate::tree::inner::TreeId; +pub type SegmentId = u64; + #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] #[allow(clippy::module_name_repetitions)] pub struct GlobalSegmentId(TreeId, SegmentId); diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 2b703442..9d8dbcd2 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -5,7 +5,8 @@ pub mod block; mod block_index; pub(crate) mod data_block; -mod filter; +pub(crate) mod filter; +mod id; mod index_block; mod inner; mod meta; @@ -17,16 +18,17 @@ mod writer; pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; +pub use id::{GlobalSegmentId, SegmentId}; pub use index_block::{IndexBlock, NewKeyedBlockHandle}; pub use scanner::Scanner; pub use writer::Writer; use crate::{ - bloom::CompositeHash, new_cache::NewCache, new_descriptor_table::NewDescriptorTable, - CompressionType, GlobalSegmentId, InternalValue, SegmentId, SeqNo, TreeId, UserKey, + new_cache::NewCache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, + UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; -use filter::standard_bloom::StandardBloomFilter; +use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; use index_block::NewBlockHandle; use 
inner::Inner; use meta::ParsedMeta; @@ -36,6 +38,25 @@ use std::{ sync::{atomic::AtomicBool, Arc}, }; +// todo + +// TODO: segment iter: +// TODO: we only need to truncate items from blocks that are not the first and last block +// TODO: because any block in between must (trivially) only contain relevant items + +// TODO: in Leveled compaction, compact segments that live very long and have +// many versions (possibly unnecessary space usage of old, stale versions) + +// TODO: move into module +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum CachePolicy { + /// Read cached blocks, but do not change cache + Read, + + /// Read cached blocks, and update cache + Write, +} + #[allow(clippy::module_name_repetitions)] pub type SegmentInner = Inner; @@ -154,8 +175,6 @@ impl Segment { } fn point_read(&self, key: &[u8], seqno: Option<SeqNo>) -> crate::Result<Option<InternalValue>> { - use crate::segment::value_block::CachePolicy; - match seqno { None => { let Some(block_handle) = self diff --git a/src/super_segment/trailer.rs b/src/super_segment/trailer.rs index 8d67f4c2..b0e1d12c 100644 --- a/src/super_segment/trailer.rs +++ b/src/super_segment/trailer.rs @@ -6,7 +6,6 @@ use super::index_block::NewBlockHandle; use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, - segment::trailer::TRAILER_SIZE, }; use std::{ fs::File, @@ -14,6 +13,8 @@ use std::{ path::Path, }; +const TRAILER_SIZE: usize = 128; + /// The segment trailer stores offsets to the different segment disk file "zones" /// /// ---------------- diff --git a/src/super_segment/writer/index.rs b/src/super_segment/writer/index.rs index 23d66f53..ce6c555a 100644 --- a/src/super_segment/writer/index.rs +++ b/src/super_segment/writer/index.rs @@ -3,12 +3,12 @@ // (found in the LICENSE-* files in the repository) use crate::{ - segment::meta::CompressionType, super_segment::{ block::Header as BlockHeader, index_block::{NewBlockHandle, NewKeyedBlockHandle}, Block, BlockOffset, IndexBlock, }, + CompressionType, }; pub trait BlockIndexWriter { diff --git a/src/super_segment/writer/mod.rs b/src/super_segment/writer/mod.rs index f68c1d44..210c600c 100644 --- a/src/super_segment/writer/mod.rs +++ b/src/super_segment/writer/mod.rs @@ -383,3 +383,229 @@ impl Writer { Ok(Some(self.segment_id)) } } + +// TODO: restore +/* +#[cfg(test)] +#[allow(clippy::expect_used)] +mod tests { + use super::*; + use crate::cache::Cache; + use crate::descriptor_table::FileDescriptorTable; + use crate::segment::block_index::top_level::TopLevelIndex; + use crate::segment::reader::Reader; + use crate::value::{InternalValue, ValueType}; + use std::sync::Arc; + use test_log::test; + + #[test] + fn segment_writer_seqnos() -> crate::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let segment_id = 532; + + let mut writer = Writer::new(Options { + folder, + data_block_size: 4_096, + index_block_size: 4_096, + segment_id, + })?; + + writer.write(InternalValue::from_components( + "a", + nanoid::nanoid!().as_bytes(), + 7, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "b", + nanoid::nanoid!().as_bytes(), + 5, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "c", + nanoid::nanoid!().as_bytes(), + 8, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "d", + nanoid::nanoid!().as_bytes(), + 10, + ValueType::Value, + ))?; + + let trailer = writer.finish()?.expect("should exist"); + + assert_eq!(5, trailer.metadata.seqnos.0); + assert_eq!(10, trailer.metadata.seqnos.1); + + Ok(()) + } +
#[test] + fn segment_writer_zero_bpk() -> crate::Result<()> { + const ITEM_COUNT: u64 = 100; + + let folder = tempfile::tempdir()?.into_path(); + + let segment_id = 532; + + let mut writer = Writer::new(Options { + folder, + data_block_size: 4_096, + index_block_size: 4_096, + segment_id, + })? + .use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); + + let items = (0u64..ITEM_COUNT).map(|i| { + InternalValue::from_components( + i.to_be_bytes(), + nanoid::nanoid!().as_bytes(), + 0, + ValueType::Value, + ) + }); + + for item in items { + writer.write(item)?; + } + + let trailer = writer.finish()?.expect("should exist"); + + assert_eq!(ITEM_COUNT, trailer.metadata.item_count); + assert_eq!(ITEM_COUNT, trailer.metadata.key_count); + assert_eq!(trailer.offsets.bloom_ptr, BlockOffset(0)); + + Ok(()) + } + + #[test] + fn segment_writer_write_read() -> crate::Result<()> { + const ITEM_COUNT: u64 = 100; + + let folder = tempfile::tempdir()?.into_path(); + + let segment_id = 532; + + let mut writer = Writer::new(Options { + folder: folder.clone(), + data_block_size: 4_096, + index_block_size: 4_096, + segment_id, + })?; + + let items = (0u64..ITEM_COUNT).map(|i| { + InternalValue::from_components( + i.to_be_bytes(), + nanoid::nanoid!().as_bytes(), + 0, + ValueType::Value, + ) + }); + + for item in items { + writer.write(item)?; + } + + let trailer = writer.finish()?.expect("should exist"); + + assert_eq!(ITEM_COUNT, trailer.metadata.item_count); + assert_eq!(ITEM_COUNT, trailer.metadata.key_count); + + assert!(*trailer.offsets.bloom_ptr > 0); + + let segment_file_path = folder.join(segment_id.to_string()); + + // NOTE: The TLI is bound by the index block count, because we know the index block count is u32 + // the TLI length fits into u32 as well + #[allow(clippy::cast_possible_truncation)] + { + let tli = TopLevelIndex::from_file( + &segment_file_path, + &trailer.metadata, + trailer.offsets.tli_ptr, + )?; + + assert_eq!(tli.len() as u32, trailer.metadata.index_block_count); + } + + let table = Arc::new(FileDescriptorTable::new(512, 1)); + table.insert(segment_file_path, (0, segment_id).into()); + + let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + + let iter = Reader::new( + trailer.offsets.index_block_ptr, + table, + (0, segment_id).into(), + block_cache, + BlockOffset(0), + None, + ); + + assert_eq!(ITEM_COUNT, iter.count() as u64); + + Ok(()) + } + + #[test] + fn segment_writer_write_read_mvcc() -> crate::Result<()> { + const ITEM_COUNT: u64 = 1_000; + const VERSION_COUNT: u64 = 5; + + let folder = tempfile::tempdir()?.into_path(); + + let segment_id = 532; + + let mut writer = Writer::new(Options { + folder: folder.clone(), + data_block_size: 4_096, + index_block_size: 4_096, + segment_id, + })?; + + for key in 0u64..ITEM_COUNT { + for seqno in (0..VERSION_COUNT).rev() { + let value = InternalValue::from_components( + key.to_be_bytes(), + nanoid::nanoid!().as_bytes(), + seqno, + ValueType::Value, + ); + + writer.write(value)?; + } + } + + let trailer = writer.finish()?.expect("should exist"); + + assert_eq!(ITEM_COUNT * VERSION_COUNT, trailer.metadata.item_count); + assert_eq!(ITEM_COUNT, trailer.metadata.key_count); + + assert!(*trailer.offsets.bloom_ptr > 0); + + let segment_file_path = folder.join(segment_id.to_string()); + + let table = Arc::new(FileDescriptorTable::new(512, 1)); + table.insert(segment_file_path, (0, segment_id).into()); + + let block_cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); + + let iter = Reader::new( + 
trailer.offsets.index_block_ptr, + table, + (0, segment_id).into(), + block_cache, + BlockOffset(0), + None, + ); + + assert_eq!(ITEM_COUNT * VERSION_COUNT, iter.count() as u64); + + Ok(()) + } +} + */ diff --git a/src/tree/inner.rs b/src/tree/inner.rs index dfa12639..f2d3d924 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -4,7 +4,7 @@ use crate::{ config::Config, file::LEVELS_MANIFEST_FILE, level_manifest::LevelManifest, memtable::Memtable, - segment::meta::SegmentId, stop_signal::StopSignal, + stop_signal::StopSignal, SegmentId, }; use std::sync::{atomic::AtomicU64, Arc, RwLock}; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 3aaefd0b..703e525e 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -12,7 +12,6 @@ use crate::{ level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, - segment::meta::TableType, super_segment::Segment, value::InternalValue, version::Version, @@ -300,7 +299,7 @@ impl AbstractTree for Tree { &self.config } - fn active_memtable_size(&self) -> u32 { + fn active_memtable_size(&self) -> u64 { use std::sync::atomic::Ordering::Acquire; self.active_memtable @@ -433,17 +432,17 @@ impl AbstractTree for Tree { key: K, value: V, seqno: SeqNo, - ) -> (u32, u32) { + ) -> (u64, u64) { let value = InternalValue::from_components(key, value, seqno, ValueType::Value); self.append_entry(value) } - fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64) { let value = InternalValue::new_tombstone(key, seqno); self.append_entry(value) } - fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64) { let value = InternalValue::new_weak_tombstone(key, seqno); self.append_entry(value) } @@ -619,7 +618,7 @@ impl Tree { ) -> crate::Result<Option<InternalValue>> { // NOTE: Create key hash for hash sharing // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ - let key_hash = crate::bloom::BloomFilter::get_hash(key); + let key_hash = crate::super_segment::filter::standard_bloom::Builder::get_hash(key); let level_manifest = self.levels.read().expect("lock is poisoned"); @@ -787,7 +786,7 @@ impl Tree { /// Returns the added item's size and new size of the memtable.
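// A minimal usage sketch of the widened (u64, u64) return value below;
// `tree`, `value` and `threshold` are hypothetical names:
let (_item_size, memtable_size) = tree.append_entry(value);
if memtable_size > threshold {
    // the caller would seal the active memtable and queue a flush here
}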
#[doc(hidden)] #[must_use] - pub fn append_entry(&self, value: InternalValue) -> (u32, u32) { + pub fn append_entry(&self, value: InternalValue) -> (u64, u64) { let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); memtable_lock.insert(value) } @@ -813,7 +812,7 @@ impl Tree { // IMPORTANT: Restore persisted config config.level_count = manifest.level_count; - config.table_type = manifest.table_type; + // config.table_type = manifest.table_type; config.tree_type = manifest.tree_type; let tree_id = get_next_tree_id(); @@ -865,7 +864,7 @@ impl Tree { version: Version::V3, level_count: config.level_count, tree_type: config.tree_type, - table_type: TableType::Block, + // table_type: TableType::Block, } .encode_into(&mut file)?; file.sync_all()?; diff --git a/src/value.rs b/src/value.rs index 4a08e58c..927096c0 100644 --- a/src/value.rs +++ b/src/value.rs @@ -5,7 +5,6 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, key::InternalKey, - segment::block::ItemSize, Slice, }; use std::io::{Read, Write}; @@ -159,15 +158,6 @@ impl Ord for InternalValue { } } -impl ItemSize for InternalValue { - fn size(&self) -> usize { - std::mem::size_of::() - + std::mem::size_of::() - + self.key.user_key.len() - + self.value.len() - } -} - impl std::fmt::Debug for InternalValue { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( From 3624fa539ab4902163f340059d388c9ef9f75fe5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:41:11 +0200 Subject: [PATCH 066/613] rename new cache --- src/blob_tree/cache.rs | 4 ++-- src/config.rs | 8 ++++---- src/lib.rs | 4 ++-- src/new_cache.rs | 4 ++-- src/super_segment/inner.rs | 4 ++-- src/super_segment/mod.rs | 8 ++++---- src/tree/mod.rs | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/blob_tree/cache.rs b/src/blob_tree/cache.rs index 5f1db2bc..e5753471 100644 --- a/src/blob_tree/cache.rs +++ b/src/blob_tree/cache.rs @@ -1,9 +1,9 @@ -use crate::NewCache; +use crate::Cache; use std::sync::Arc; use value_log::BlobCache; #[derive(Clone)] -pub struct MyBlobCache(pub(crate) Arc<NewCache>); +pub struct MyBlobCache(pub(crate) Arc<Cache>); impl BlobCache for MyBlobCache { fn get( diff --git a/src/config.rs b/src/config.rs index 7f8df802..2f8be1d5 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{path::absolute_path, BlobTree, CompressionType, NewCache, NewDescriptorTable, Tree}; +use crate::{path::absolute_path, BlobTree, CompressionType, Cache, NewDescriptorTable, Tree}; use std::{ path::{Path, PathBuf}, sync::Arc, }; @@ -78,7 +78,7 @@ pub struct Config { /// Block cache to use #[doc(hidden)] - pub cache: Arc<NewCache>, + pub cache: Arc<Cache>, /// Blob file (value log segment) target size in bytes #[doc(hidden)] @@ -99,7 +99,7 @@ impl Default for Config { path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)), descriptor_table: Arc::new(NewDescriptorTable::new(256)), - cache: Arc::new(NewCache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), + cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), data_block_size: /* 4 KiB */ 4_096, index_block_size: /* 4 KiB */ 4_096, @@ -235,7 +235,7 @@ impl Config { /// /// Defaults to a cache with 16 MiB of capacity *per tree*.
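// A minimal usage sketch of `use_cache` below, with hypothetical paths:
// one `Cache` can be shared by several trees so they draw blocks from a
// single capacity-bounded pool instead of one default cache per tree.
let cache = Arc::new(Cache::with_capacity_bytes(32 * 1_024 * 1_024));
let tree_a = Config::new("a").use_cache(cache.clone()).open()?;
let tree_b = Config::new("b").use_cache(cache.clone()).open()?;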
#[must_use] - pub fn use_cache(mut self, cache: Arc) -> Self { + pub fn use_cache(mut self, cache: Arc) -> Self { self.cache = cache; self } diff --git a/src/lib.rs b/src/lib.rs index 2aee3af7..4ef42fa6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,7 +200,7 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, - new_cache::NewCache, + new_cache::Cache, new_descriptor_table::NewDescriptorTable, super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, @@ -213,7 +213,7 @@ pub use { config::{Config, TreeType}, error::{Error, Result}, memtable::Memtable, - new_cache::NewCache as Cache, // <- TODO: rename + new_cache::Cache as Cache, // <- TODO: rename new_descriptor_table::NewDescriptorTable as DescriptorTable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, diff --git a/src/new_cache.rs b/src/new_cache.rs index 7a472f95..4037905a 100644 --- a/src/new_cache.rs +++ b/src/new_cache.rs @@ -68,7 +68,7 @@ impl Weighter for BlockWeighter { /// # /// # Ok::<(), lsm_tree::Error>(()) /// ``` -pub struct NewCache { +pub struct Cache { // NOTE: rustc_hash performed best: https://fjall-rs.github.io/post/fjall-2-1 /// Concurrent cache implementation data: QuickCache, @@ -77,7 +77,7 @@ pub struct NewCache { capacity: u64, } -impl NewCache { +impl Cache { /// Creates a new block cache with roughly `n` bytes of capacity. #[must_use] pub fn with_capacity_bytes(bytes: u64) -> Self { diff --git a/src/super_segment/inner.rs b/src/super_segment/inner.rs index 1d45db3d..647e0424 100644 --- a/src/super_segment/inner.rs +++ b/src/super_segment/inner.rs @@ -7,7 +7,7 @@ use super::{ trailer::Trailer, }; use crate::{ - new_cache::NewCache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, + new_cache::Cache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, GlobalSegmentId, }; use std::{ @@ -37,7 +37,7 @@ pub struct Inner { /// /// Stores index and data blocks #[doc(hidden)] - pub cache: Arc, + pub cache: Arc, /// Pinned AMQ filter pub pinned_filter: Option, diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 9d8dbcd2..d50b0d0f 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -24,7 +24,7 @@ pub use scanner::Scanner; pub use writer::Writer; use crate::{ - new_cache::NewCache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, + new_cache::Cache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; @@ -278,7 +278,7 @@ impl Segment { pub fn recover( file_path: &Path, tree_id: TreeId, - cache: Arc, + cache: Arc, descriptor_table: Arc, ) -> crate::Result { // use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; @@ -453,7 +453,7 @@ mod tests { let segment = Segment::recover( &file, 0, - Arc::new(NewCache::with_capacity_bytes(1_000_000)), + Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(NewDescriptorTable::new(10)), )?; @@ -511,7 +511,7 @@ mod tests { let segment = Segment::recover( &file, 0, - Arc::new(NewCache::with_capacity_bytes(1_000_000)), + Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(NewDescriptorTable::new(10)), )?; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 703e525e..42acee31 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -15,7 +15,7 @@ use crate::{ super_segment::Segment, value::InternalValue, version::Version, - AbstractTree, KvPair, NewCache, NewDescriptorTable, SegmentId, SeqNo, Snapshot, UserKey, + 
AbstractTree, KvPair, Cache, NewDescriptorTable, SegmentId, SeqNo, Snapshot, UserKey, UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; @@ -881,7 +881,7 @@ impl Tree { fn recover_levels>( tree_path: P, tree_id: TreeId, - cache: &Arc, + cache: &Arc, descriptor_table: &Arc, ) -> crate::Result { use crate::{ From 7d2158518f40b3e9847bfa0ac11411ca5b7f9041 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:41:41 +0200 Subject: [PATCH 067/613] move new cache module --- src/{new_cache.rs => cache.rs} | 0 src/lib.rs | 5 ++--- src/super_segment/inner.rs | 2 +- src/super_segment/mod.rs | 3 +-- tests/open_files.rs | 4 ++-- 5 files changed, 6 insertions(+), 8 deletions(-) rename src/{new_cache.rs => cache.rs} (100%) diff --git a/src/new_cache.rs b/src/cache.rs similarity index 100% rename from src/new_cache.rs rename to src/cache.rs diff --git a/src/lib.rs b/src/lib.rs index 4ef42fa6..b6f32bda 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -153,7 +153,7 @@ mod manifest; mod memtable; #[doc(hidden)] -mod new_cache; +mod cache; #[doc(hidden)] mod new_descriptor_table; @@ -200,7 +200,6 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, - new_cache::Cache, new_descriptor_table::NewDescriptorTable, super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, @@ -208,12 +207,12 @@ pub use { }; pub use { + cache::Cache, coding::{DecodeError, EncodeError}, compression::CompressionType, config::{Config, TreeType}, error::{Error, Result}, memtable::Memtable, - new_cache::Cache as Cache, // <- TODO: rename new_descriptor_table::NewDescriptorTable as DescriptorTable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, diff --git a/src/super_segment/inner.rs b/src/super_segment/inner.rs index 647e0424..e247ef98 100644 --- a/src/super_segment/inner.rs +++ b/src/super_segment/inner.rs @@ -7,7 +7,7 @@ use super::{ trailer::Trailer, }; use crate::{ - new_cache::Cache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, + cache::Cache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, GlobalSegmentId, }; use std::{ diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index d50b0d0f..4667bdfe 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -24,8 +24,7 @@ pub use scanner::Scanner; pub use writer::Writer; use crate::{ - new_cache::Cache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, - UserKey, + cache::Cache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; diff --git a/tests/open_files.rs b/tests/open_files.rs index 3b867b47..24600046 100644 --- a/tests/open_files.rs +++ b/tests/open_files.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, NewCache}; +use lsm_tree::{AbstractTree, Cache, Config}; use std::sync::Arc; use test_log::test; @@ -9,7 +9,7 @@ fn open_file_limit() -> lsm_tree::Result<()> { let folder = tempfile::tempdir_in(".test_open_files")?; let tree = Config::new(folder) - .use_cache(Arc::new(NewCache::with_capacity_bytes(0))) + .use_cache(Arc::new(Cache::with_capacity_bytes(0))) .open()?; for _ in 0..2_048 { From 56ca192eb48906fc640533d48857a13cd3469eed Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:41:57 +0200 Subject: [PATCH 068/613] rename new descriptor table --- src/config.rs | 8 ++++---- src/lib.rs | 4 ++-- 
src/new_descriptor_table.rs | 5 ++--- src/super_segment/inner.rs | 5 ++--- src/super_segment/mod.rs | 8 ++++---- src/tree/mod.rs | 6 +++--- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/config.rs b/src/config.rs index 2f8be1d5..bc3e1d6b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{path::absolute_path, BlobTree, CompressionType, Cache, NewDescriptorTable, Tree}; +use crate::{path::absolute_path, BlobTree, CompressionType, Cache, DescriptorTable, Tree}; use std::{ path::{Path, PathBuf}, sync::Arc, @@ -90,14 +90,14 @@ pub struct Config { /// Descriptor table to use #[doc(hidden)] - pub descriptor_table: Arc, + pub descriptor_table: Arc, } impl Default for Config { fn default() -> Self { Self { path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)), - descriptor_table: Arc::new(NewDescriptorTable::new(256)), + descriptor_table: Arc::new(DescriptorTable::new(256)), cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), @@ -273,7 +273,7 @@ impl Config { #[must_use] #[doc(hidden)] - pub fn descriptor_table(mut self, descriptor_table: Arc) -> Self { + pub fn descriptor_table(mut self, descriptor_table: Arc) -> Self { self.descriptor_table = descriptor_table; self } diff --git a/src/lib.rs b/src/lib.rs index b6f32bda..eaa01c1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,7 +200,7 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, - new_descriptor_table::NewDescriptorTable, + new_descriptor_table::DescriptorTable, super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, value::InternalValue, @@ -213,7 +213,7 @@ pub use { config::{Config, TreeType}, error::{Error, Result}, memtable::Memtable, - new_descriptor_table::NewDescriptorTable as DescriptorTable, + new_descriptor_table::DescriptorTable as DescriptorTable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, snapshot::Snapshot, diff --git a/src/new_descriptor_table.rs b/src/new_descriptor_table.rs index 7555ded3..ffa372f1 100644 --- a/src/new_descriptor_table.rs +++ b/src/new_descriptor_table.rs @@ -14,12 +14,11 @@ type Item = Arc; #[derive(Eq, std::hash::Hash, PartialEq)] struct CacheKey(u8, u64, u64); -// TODO: 3.0.0 rename -pub struct NewDescriptorTable { +pub struct DescriptorTable { inner: QuickCache, } -impl NewDescriptorTable { +impl DescriptorTable { #[must_use] pub fn new(capacity: usize) -> Self { use quick_cache::sync::DefaultLifecycle; diff --git a/src/super_segment/inner.rs b/src/super_segment/inner.rs index e247ef98..35e677a6 100644 --- a/src/super_segment/inner.rs +++ b/src/super_segment/inner.rs @@ -7,8 +7,7 @@ use super::{ trailer::Trailer, }; use crate::{ - cache::Cache, new_descriptor_table::NewDescriptorTable, tree::inner::TreeId, - GlobalSegmentId, + cache::Cache, new_descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, }; use std::{ path::PathBuf, @@ -21,7 +20,7 @@ pub struct Inner { pub(crate) tree_id: TreeId, #[doc(hidden)] - pub descriptor_table: Arc, + pub descriptor_table: Arc, /// Segment metadata object #[doc(hidden)] diff --git a/src/super_segment/mod.rs b/src/super_segment/mod.rs index 4667bdfe..a52689e5 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -24,7 +24,7 @@ pub use scanner::Scanner; pub use writer::Writer; use crate::{ - cache::Cache, new_descriptor_table::NewDescriptorTable, InternalValue, SeqNo, TreeId, UserKey, + 
cache::Cache, new_descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; @@ -278,7 +278,7 @@ impl Segment { file_path: &Path, tree_id: TreeId, cache: Arc, - descriptor_table: Arc, + descriptor_table: Arc, ) -> crate::Result { // use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; use trailer::Trailer; @@ -453,7 +453,7 @@ mod tests { &file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), - Arc::new(NewDescriptorTable::new(10)), + Arc::new(DescriptorTable::new(10)), )?; assert_eq!(5, segment.id()); @@ -511,7 +511,7 @@ mod tests { &file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), - Arc::new(NewDescriptorTable::new(10)), + Arc::new(DescriptorTable::new(10)), )?; assert_eq!(5, segment.id()); diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 42acee31..e79c500d 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -15,8 +15,8 @@ use crate::{ super_segment::Segment, value::InternalValue, version::Version, - AbstractTree, KvPair, Cache, NewDescriptorTable, SegmentId, SeqNo, Snapshot, UserKey, - UserValue, ValueType, + AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, + ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ @@ -882,7 +882,7 @@ impl Tree { tree_path: P, tree_id: TreeId, cache: &Arc, - descriptor_table: &Arc, + descriptor_table: &Arc, ) -> crate::Result { use crate::{ file::fsync_directory, From 37bf05875a1eb2ce7dea00506df2eac4e1b7d6cd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:42:13 +0200 Subject: [PATCH 069/613] move new descriptor table module --- src/{new_descriptor_table.rs => descriptor_table.rs} | 0 src/lib.rs | 6 +++--- src/super_segment/inner.rs | 2 +- src/super_segment/mod.rs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename src/{new_descriptor_table.rs => descriptor_table.rs} (100%) diff --git a/src/new_descriptor_table.rs b/src/descriptor_table.rs similarity index 100% rename from src/new_descriptor_table.rs rename to src/descriptor_table.rs diff --git a/src/lib.rs b/src/lib.rs index eaa01c1e..247f3d77 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,7 +156,7 @@ mod memtable; mod cache; #[doc(hidden)] -mod new_descriptor_table; +mod descriptor_table; #[doc(hidden)] pub mod merge; @@ -199,8 +199,8 @@ pub mod coding { #[doc(hidden)] pub use { + descriptor_table::DescriptorTable, merge::BoxedIterator, - new_descriptor_table::DescriptorTable, super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, value::InternalValue, @@ -211,9 +211,9 @@ pub use { coding::{DecodeError, EncodeError}, compression::CompressionType, config::{Config, TreeType}, + descriptor_table::DescriptorTable, error::{Error, Result}, memtable::Memtable, - new_descriptor_table::DescriptorTable as DescriptorTable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, snapshot::Snapshot, diff --git a/src/super_segment/inner.rs b/src/super_segment/inner.rs index 35e677a6..ccd4bd88 100644 --- a/src/super_segment/inner.rs +++ b/src/super_segment/inner.rs @@ -7,7 +7,7 @@ use super::{ trailer::Trailer, }; use crate::{ - cache::Cache, new_descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, + cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, }; use std::{ path::PathBuf, diff --git a/src/super_segment/mod.rs 
b/src/super_segment/mod.rs index a52689e5..c785b387 100644 --- a/src/super_segment/mod.rs +++ b/src/super_segment/mod.rs @@ -24,7 +24,7 @@ pub use scanner::Scanner; pub use writer::Writer; use crate::{ - cache::Cache, new_descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, + cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; From a86da17aba3ca3644e6d2a6ba1416c4dd4648e73 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:42:23 +0200 Subject: [PATCH 070/613] fix --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 247f3d77..38f4ba99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -199,7 +199,6 @@ pub mod coding { #[doc(hidden)] pub use { - descriptor_table::DescriptorTable, merge::BoxedIterator, super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, From a9e9452570ef49d5df4bfdf6bd9060b4fb43b9ea Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:43:12 +0200 Subject: [PATCH 071/613] move new segment module --- src/abstract.rs | 2 +- src/blob_tree/mod.rs | 4 ++-- src/cache.rs | 4 ++-- src/compaction/leveled.rs | 2 +- src/compaction/maintenance.rs | 2 +- src/compaction/major.rs | 2 +- src/compaction/movedown.rs | 2 +- src/compaction/pulldown.rs | 2 +- src/compaction/tiered.rs | 2 +- src/compaction/worker.rs | 2 +- src/level_manifest/level.rs | 2 +- src/level_manifest/mod.rs | 2 +- src/level_reader.rs | 2 +- src/level_scanner.rs | 2 +- src/lib.rs | 4 ++-- src/range.rs | 2 +- .../block/binary_index/builder.rs | 0 src/{super_segment => segment}/block/binary_index/mod.rs | 0 .../block/binary_index/reader.rs | 0 src/{super_segment => segment}/block/checksum.rs | 0 src/{super_segment => segment}/block/encoder.rs | 0 .../block/hash_index/builder.rs | 0 src/{super_segment => segment}/block/hash_index/mod.rs | 0 src/{super_segment => segment}/block/hash_index/reader.rs | 0 src/{super_segment => segment}/block/header.rs | 0 src/{super_segment => segment}/block/mod.rs | 0 src/{super_segment => segment}/block/offset.rs | 0 src/{super_segment => segment}/block/trailer.rs | 2 +- src/{super_segment => segment}/block_index/mod.rs | 0 src/{super_segment => segment}/data_block/iter.rs | 0 src/{super_segment => segment}/data_block/mod.rs | 4 ++-- src/{super_segment => segment}/filter/bit_array/mod.rs | 0 src/{super_segment => segment}/filter/bit_array/sliced.rs | 2 +- src/{super_segment => segment}/filter/mod.rs | 0 .../filter/standard_bloom/builder.rs | 2 +- .../filter/standard_bloom/mod.rs | 0 src/{super_segment => segment}/id.rs | 0 .../index_block/block_handle.rs | 2 +- .../index_block/forward_reader.rs | 4 ++-- src/{super_segment => segment}/index_block/mod.rs | 4 ++-- src/{super_segment => segment}/inner.rs | 0 src/{super_segment => segment}/meta.rs | 0 src/{super_segment => segment}/mod.rs | 4 ++-- src/{super_segment => segment}/multi_writer.rs | 0 src/{super_segment => segment}/scanner.rs | 0 src/{super_segment => segment}/trailer.rs | 2 +- src/{super_segment => segment}/util.rs | 0 src/{super_segment => segment}/writer/index.rs | 2 +- src/{super_segment => segment}/writer/meta.rs | 2 +- src/{super_segment => segment}/writer/mod.rs | 2 +- src/tree/ingest.rs | 2 +- src/tree/mod.rs | 8 ++++---- 52 files changed, 40 insertions(+), 40 deletions(-) rename src/{super_segment => segment}/block/binary_index/builder.rs 
(100%) rename src/{super_segment => segment}/block/binary_index/mod.rs (100%) rename src/{super_segment => segment}/block/binary_index/reader.rs (100%) rename src/{super_segment => segment}/block/checksum.rs (100%) rename src/{super_segment => segment}/block/encoder.rs (100%) rename src/{super_segment => segment}/block/hash_index/builder.rs (100%) rename src/{super_segment => segment}/block/hash_index/mod.rs (100%) rename src/{super_segment => segment}/block/hash_index/reader.rs (100%) rename src/{super_segment => segment}/block/header.rs (100%) rename src/{super_segment => segment}/block/mod.rs (100%) rename src/{super_segment => segment}/block/offset.rs (100%) rename src/{super_segment => segment}/block/trailer.rs (98%) rename src/{super_segment => segment}/block_index/mod.rs (100%) rename src/{super_segment => segment}/data_block/iter.rs (100%) rename src/{super_segment => segment}/data_block/mod.rs (99%) rename src/{super_segment => segment}/filter/bit_array/mod.rs (100%) rename src/{super_segment => segment}/filter/bit_array/sliced.rs (95%) rename src/{super_segment => segment}/filter/mod.rs (100%) rename src/{super_segment => segment}/filter/standard_bloom/builder.rs (98%) rename src/{super_segment => segment}/filter/standard_bloom/mod.rs (100%) rename src/{super_segment => segment}/id.rs (100%) rename src/{super_segment => segment}/index_block/block_handle.rs (98%) rename src/{super_segment => segment}/index_block/forward_reader.rs (98%) rename src/{super_segment => segment}/index_block/mod.rs (99%) rename src/{super_segment => segment}/inner.rs (100%) rename src/{super_segment => segment}/meta.rs (100%) rename src/{super_segment => segment}/mod.rs (99%) rename src/{super_segment => segment}/multi_writer.rs (100%) rename src/{super_segment => segment}/scanner.rs (100%) rename src/{super_segment => segment}/trailer.rs (99%) rename src/{super_segment => segment}/util.rs (100%) rename src/{super_segment => segment}/writer/index.rs (99%) rename src/{super_segment => segment}/writer/meta.rs (95%) rename src/{super_segment => segment}/writer/mod.rs (99%) diff --git a/src/abstract.rs b/src/abstract.rs index babd4647..fc6b475e 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - compaction::CompactionStrategy, config::TreeType, super_segment::Segment, + compaction::CompactionStrategy, config::TreeType, segment::Segment, tree::inner::MemtableId, AnyTree, BlobTree, Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, Tree, UserKey, UserValue, }; diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index de36bd3d..6c9635b8 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -13,7 +13,7 @@ use crate::{ compaction::stream::CompactionStream, file::BLOBS_FOLDER, r#abstract::{AbstractTree, RangeItem}, - super_segment::Segment, + segment::Segment, tree::inner::MemtableId, value::InternalValue, Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, UserKey, UserValue, @@ -366,7 +366,7 @@ impl AbstractTree for BlobTree { use crate::{ file::SEGMENTS_FOLDER, //segment::writer::{Options, Writer as SegmentWriter}, - super_segment::Writer as SegmentWriter, + segment::Writer as SegmentWriter, }; use value::MaybeInlineValue; diff --git a/src/cache.rs b/src/cache.rs index 4037905a..c2247055 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -2,8 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use 
crate::super_segment::block::Header; -use crate::super_segment::{Block, BlockOffset, DataBlock}; +use crate::segment::block::Header; +use crate::segment::{Block, BlockOffset, DataBlock}; use crate::GlobalSegmentId; use quick_cache::Weighter; use quick_cache::{sync::Cache as QuickCache, Equivalent}; diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 09f2f96f..92e40209 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -6,7 +6,7 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, level_manifest::{hidden_set::HiddenSet, level::Level, LevelManifest}, - super_segment::Segment, + segment::Segment, windows::{GrowingWindowsExt, ShrinkingWindowsExt}, HashSet, KeyRange, SegmentId, }; diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 987fac29..16987a47 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -4,7 +4,7 @@ use super::{Choice, CompactionStrategy}; use crate::{ - config::Config, level_manifest::LevelManifest, super_segment::Segment, HashSet, SegmentId, + config::Config, level_manifest::LevelManifest, segment::Segment, HashSet, SegmentId, }; const L0_SEGMENT_CAP: usize = 20; diff --git a/src/compaction/major.rs b/src/compaction/major.rs index a9e2a720..1aa7b5ba 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{config::Config, level_manifest::LevelManifest, super_segment::Segment, HashSet}; +use crate::{config::Config, level_manifest::LevelManifest, segment::Segment, HashSet}; /// Major compaction /// diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index 8990a331..f8de943e 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; +use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; /// Moves down a level into the destination level. pub struct Strategy(pub u8, pub u8); diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index a0e5fa14..3ca1555d 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; +use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; /// Pulls down and merges a level into the destination level. 
/// diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 36ccac6d..3af377f0 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{level_manifest::LevelManifest, super_segment::Segment, Config, HashSet}; +use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, base_size: u32) -> usize { (ratio as usize).pow(u32::from(level_idx + 1)) * (base_size as usize) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 92011fd8..b3e8ee39 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -9,8 +9,8 @@ use crate::{ level_manifest::LevelManifest, level_scanner::LevelScanner, merge::Merger, + segment::{multi_writer::MultiWriter, Segment}, stop_signal::StopSignal, - super_segment::{multi_writer::MultiWriter, Segment}, tree::inner::TreeId, Config, GlobalSegmentId, InternalValue, SegmentId, SeqNo, }; diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index c2f56f6e..6cf9d7e3 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - binary_search::partition_point, super_segment::Segment, HashSet, KeyRange, SegmentId, UserKey, + binary_search::partition_point, segment::Segment, HashSet, KeyRange, SegmentId, UserKey, }; use std::ops::Bound; diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 559d352a..11df5e88 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -8,7 +8,7 @@ pub(crate) mod level; use crate::{ coding::{DecodeError, Encode, EncodeError}, file::{rewrite_atomic, MAGIC_BYTES}, - super_segment::Segment, + segment::Segment, HashMap, HashSet, KeyRange, SegmentId, }; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; diff --git a/src/level_reader.rs b/src/level_reader.rs index 506cd241..5f92d892 100644 --- a/src/level_reader.rs +++ b/src/level_reader.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{level_manifest::level::Level, super_segment::CachePolicy, InternalValue, UserKey}; +use crate::{level_manifest::level::Level, segment::CachePolicy, InternalValue, UserKey}; use std::{ops::Bound, sync::Arc}; /// Reads through a disjoint level diff --git a/src/level_scanner.rs b/src/level_scanner.rs index 289fbd4e..d7f9ef4b 100644 --- a/src/level_scanner.rs +++ b/src/level_scanner.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{level_manifest::level::Level, super_segment::Scanner, InternalValue}; +use crate::{level_manifest::level::Level, segment::Scanner, InternalValue}; use std::sync::Arc; /// Scans through a disjoint level diff --git a/src/lib.rs b/src/lib.rs index 38f4ba99..9253aaae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -184,7 +184,7 @@ mod value; mod version; #[doc(hidden)] -pub mod super_segment; +pub mod segment; /// KV-tuple, typically returned by an iterator pub type KvPair = (UserKey, UserValue); @@ -200,7 +200,7 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, - super_segment::{block::Checksum, GlobalSegmentId, SegmentId}, + segment::{block::Checksum, GlobalSegmentId, SegmentId}, tree::inner::TreeId, 
value::InternalValue, }; diff --git a/src/range.rs b/src/range.rs index 4cbd52b6..e1a6023a 100644 --- a/src/range.rs +++ b/src/range.rs @@ -7,7 +7,7 @@ use crate::{ level_reader::LevelReader, memtable::Memtable, multi_reader::MultiReader, - super_segment::CachePolicy, + segment::CachePolicy, value::{SeqNo, UserKey}, InternalValue, }; diff --git a/src/super_segment/block/binary_index/builder.rs b/src/segment/block/binary_index/builder.rs similarity index 100% rename from src/super_segment/block/binary_index/builder.rs rename to src/segment/block/binary_index/builder.rs diff --git a/src/super_segment/block/binary_index/mod.rs b/src/segment/block/binary_index/mod.rs similarity index 100% rename from src/super_segment/block/binary_index/mod.rs rename to src/segment/block/binary_index/mod.rs diff --git a/src/super_segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs similarity index 100% rename from src/super_segment/block/binary_index/reader.rs rename to src/segment/block/binary_index/reader.rs diff --git a/src/super_segment/block/checksum.rs b/src/segment/block/checksum.rs similarity index 100% rename from src/super_segment/block/checksum.rs rename to src/segment/block/checksum.rs diff --git a/src/super_segment/block/encoder.rs b/src/segment/block/encoder.rs similarity index 100% rename from src/super_segment/block/encoder.rs rename to src/segment/block/encoder.rs diff --git a/src/super_segment/block/hash_index/builder.rs b/src/segment/block/hash_index/builder.rs similarity index 100% rename from src/super_segment/block/hash_index/builder.rs rename to src/segment/block/hash_index/builder.rs diff --git a/src/super_segment/block/hash_index/mod.rs b/src/segment/block/hash_index/mod.rs similarity index 100% rename from src/super_segment/block/hash_index/mod.rs rename to src/segment/block/hash_index/mod.rs diff --git a/src/super_segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs similarity index 100% rename from src/super_segment/block/hash_index/reader.rs rename to src/segment/block/hash_index/reader.rs diff --git a/src/super_segment/block/header.rs b/src/segment/block/header.rs similarity index 100% rename from src/super_segment/block/header.rs rename to src/segment/block/header.rs diff --git a/src/super_segment/block/mod.rs b/src/segment/block/mod.rs similarity index 100% rename from src/super_segment/block/mod.rs rename to src/segment/block/mod.rs diff --git a/src/super_segment/block/offset.rs b/src/segment/block/offset.rs similarity index 100% rename from src/super_segment/block/offset.rs rename to src/segment/block/offset.rs diff --git a/src/super_segment/block/trailer.rs b/src/segment/block/trailer.rs similarity index 98% rename from src/super_segment/block/trailer.rs rename to src/segment/block/trailer.rs index 5c72c718..8707df42 100644 --- a/src/super_segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -6,7 +6,7 @@ use super::{ encoder::{Encodable, Encoder}, Block, }; -use crate::super_segment::block::hash_index::MAX_POINTERS_FOR_HASH_INDEX; +use crate::segment::block::hash_index::MAX_POINTERS_FOR_HASH_INDEX; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; pub const TRAILER_START_MARKER: u8 = 255; diff --git a/src/super_segment/block_index/mod.rs b/src/segment/block_index/mod.rs similarity index 100% rename from src/super_segment/block_index/mod.rs rename to src/segment/block_index/mod.rs diff --git a/src/super_segment/data_block/iter.rs b/src/segment/data_block/iter.rs similarity index 100% rename from 
src/super_segment/data_block/iter.rs rename to src/segment/data_block/iter.rs diff --git a/src/super_segment/data_block/mod.rs b/src/segment/data_block/mod.rs similarity index 99% rename from src/super_segment/data_block/mod.rs rename to src/segment/data_block/mod.rs index c12e5fb4..8be90689 100644 --- a/src/super_segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -11,7 +11,7 @@ use super::block::{ Encodable, Encoder, Trailer, TRAILER_START_MARKER, }; use crate::clipping_iter::ClippingIter; -use crate::super_segment::util::compare_prefixed_slice; +use crate::segment::util::compare_prefixed_slice; use crate::{InternalValue, SeqNo, ValueType}; use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; @@ -596,7 +596,7 @@ impl DataBlock { mod tests { use super::*; use crate::{ - super_segment::{ + segment::{ block::{BlockOffset, Checksum, Header}, Block, }, diff --git a/src/super_segment/filter/bit_array/mod.rs b/src/segment/filter/bit_array/mod.rs similarity index 100% rename from src/super_segment/filter/bit_array/mod.rs rename to src/segment/filter/bit_array/mod.rs diff --git a/src/super_segment/filter/bit_array/sliced.rs b/src/segment/filter/bit_array/sliced.rs similarity index 95% rename from src/super_segment/filter/bit_array/sliced.rs rename to src/segment/filter/bit_array/sliced.rs index f6f09810..83066a3f 100644 --- a/src/super_segment/filter/bit_array/sliced.rs +++ b/src/segment/filter/bit_array/sliced.rs @@ -43,7 +43,7 @@ impl BitArray { #[cfg(test)] mod tests { use super::*; - use crate::super_segment::filter::bit_array::set_bit; + use crate::segment::filter::bit_array::set_bit; use test_log::test; #[test] diff --git a/src/super_segment/filter/mod.rs b/src/segment/filter/mod.rs similarity index 100% rename from src/super_segment/filter/mod.rs rename to src/segment/filter/mod.rs diff --git a/src/super_segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs similarity index 98% rename from src/super_segment/filter/standard_bloom/builder.rs rename to src/segment/filter/standard_bloom/builder.rs index a4e596fe..8dc59d54 100644 --- a/src/super_segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{super::bit_array::Builder as BitArrayBuilder, StandardBloomFilter}; -use crate::super_segment::filter::bit_array::BitArrayReader; +use crate::segment::filter::bit_array::BitArrayReader; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); diff --git a/src/super_segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs similarity index 100% rename from src/super_segment/filter/standard_bloom/mod.rs rename to src/segment/filter/standard_bloom/mod.rs diff --git a/src/super_segment/id.rs b/src/segment/id.rs similarity index 100% rename from src/super_segment/id.rs rename to src/segment/id.rs diff --git a/src/super_segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs similarity index 98% rename from src/super_segment/index_block/block_handle.rs rename to src/segment/index_block/block_handle.rs index 20fe78b4..cd2a35be 100644 --- a/src/super_segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -4,7 +4,7 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, - super_segment::block::{BlockOffset, Encodable}, + segment::block::{BlockOffset, Encodable}, }; use byteorder::WriteBytesExt; use 
value_log::UserKey; diff --git a/src/super_segment/index_block/forward_reader.rs b/src/segment/index_block/forward_reader.rs similarity index 98% rename from src/super_segment/index_block/forward_reader.rs rename to src/segment/index_block/forward_reader.rs index 79ffe5ad..c2d85b02 100644 --- a/src/super_segment/index_block/forward_reader.rs +++ b/src/segment/index_block/forward_reader.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{IndexBlock, NewKeyedBlockHandle}; -use crate::{super_segment::BlockOffset, Slice}; +use crate::{segment::BlockOffset, Slice}; use std::io::Cursor; #[derive(Default, Debug)] @@ -138,7 +138,7 @@ impl Iterator for ForwardReader<'_> { #[cfg(test)] mod tests { use super::*; - use crate::super_segment::{block::Header, Block, Checksum}; + use crate::segment::{block::Header, Block, Checksum}; use test_log::test; #[test] diff --git a/src/super_segment/index_block/mod.rs b/src/segment/index_block/mod.rs similarity index 99% rename from src/super_segment/index_block/mod.rs rename to src/segment/index_block/mod.rs index e1e00395..7295d96e 100644 --- a/src/super_segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -12,7 +12,7 @@ use super::{ block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer}, Block, }; -use crate::super_segment::block::TRAILER_START_MARKER; +use crate::segment::block::TRAILER_START_MARKER; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Seek}; use varint_rs::VarintReader; @@ -344,7 +344,7 @@ impl IndexBlock { #[cfg(test)] mod tests { use super::*; - use crate::super_segment::block::{Checksum, Header}; + use crate::segment::block::{Checksum, Header}; use test_log::test; #[test] diff --git a/src/super_segment/inner.rs b/src/segment/inner.rs similarity index 100% rename from src/super_segment/inner.rs rename to src/segment/inner.rs diff --git a/src/super_segment/meta.rs b/src/segment/meta.rs similarity index 100% rename from src/super_segment/meta.rs rename to src/segment/meta.rs diff --git a/src/super_segment/mod.rs b/src/segment/mod.rs similarity index 99% rename from src/super_segment/mod.rs rename to src/segment/mod.rs index c785b387..ba14625f 100644 --- a/src/super_segment/mod.rs +++ b/src/segment/mod.rs @@ -438,7 +438,7 @@ mod tests { let file = dir.path().join("segment"); { - let mut writer = crate::super_segment::Writer::new(file.clone(), 5)?; + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; writer.write(crate::InternalValue::from_components( b"abc", b"asdasdasd", @@ -497,7 +497,7 @@ mod tests { ]; { - let mut writer = crate::super_segment::Writer::new(file.clone(), 5)?; + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; for item in items.iter().cloned() { writer.write(item)?; diff --git a/src/super_segment/multi_writer.rs b/src/segment/multi_writer.rs similarity index 100% rename from src/super_segment/multi_writer.rs rename to src/segment/multi_writer.rs diff --git a/src/super_segment/scanner.rs b/src/segment/scanner.rs similarity index 100% rename from src/super_segment/scanner.rs rename to src/segment/scanner.rs diff --git a/src/super_segment/trailer.rs b/src/segment/trailer.rs similarity index 99% rename from src/super_segment/trailer.rs rename to src/segment/trailer.rs index b0e1d12c..5453e277 100644 --- a/src/super_segment/trailer.rs +++ b/src/segment/trailer.rs @@ -151,7 +151,7 @@ impl Decode for Trailer { #[cfg(test)] mod tests { use super::*; - use crate::super_segment::BlockOffset; + use 
crate::segment::BlockOffset; use std::io::Cursor; use test_log::test; diff --git a/src/super_segment/util.rs b/src/segment/util.rs similarity index 100% rename from src/super_segment/util.rs rename to src/segment/util.rs diff --git a/src/super_segment/writer/index.rs b/src/segment/writer/index.rs similarity index 99% rename from src/super_segment/writer/index.rs rename to src/segment/writer/index.rs index ce6c555a..c0455e8a 100644 --- a/src/super_segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - super_segment::{ + segment::{ block::Header as BlockHeader, index_block::{NewBlockHandle, NewKeyedBlockHandle}, Block, BlockOffset, IndexBlock, diff --git a/src/super_segment/writer/meta.rs b/src/segment/writer/meta.rs similarity index 95% rename from src/super_segment/writer/meta.rs rename to src/segment/writer/meta.rs index 0f2ae2ef..11ac694b 100644 --- a/src/super_segment/writer/meta.rs +++ b/src/segment/writer/meta.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{super_segment::BlockOffset, SeqNo, UserKey}; +use crate::{segment::BlockOffset, SeqNo, UserKey}; pub struct Metadata { /// Written data block count diff --git a/src/super_segment/writer/mod.rs b/src/segment/writer/mod.rs similarity index 99% rename from src/super_segment/writer/mod.rs rename to src/segment/writer/mod.rs index 210c600c..d7ab10d5 100644 --- a/src/super_segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -8,7 +8,7 @@ use super::{ use crate::{ coding::Encode, file::fsync_directory, - super_segment::{filter::standard_bloom::Builder, index_block::NewBlockHandle}, + segment::{filter::standard_bloom::Builder, index_block::NewBlockHandle}, time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, }; diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index f1c81a83..f9f99380 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -5,7 +5,7 @@ use super::Tree; use crate::{ file::SEGMENTS_FOLDER, - super_segment::{multi_writer::MultiWriter, Segment}, + segment::{multi_writer::MultiWriter, Segment}, AbstractTree, UserKey, UserValue, ValueType, }; use std::{path::PathBuf, sync::Arc}; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index e79c500d..9bd40d52 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -12,7 +12,7 @@ use crate::{ level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, - super_segment::Segment, + segment::Segment, value::InternalValue, version::Version, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, @@ -178,7 +178,7 @@ impl AbstractTree for Tree { seqno_threshold: SeqNo, ) -> crate::Result> { use crate::{ - compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, super_segment::Writer, + compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer, }; use std::time::Instant; @@ -487,7 +487,7 @@ impl Tree { pub(crate) fn consume_writer( &self, segment_id: SegmentId, // TODO: <- remove - writer: crate::super_segment::Writer, + writer: crate::segment::Writer, ) -> crate::Result> { let segment_file_path = writer.path.to_path_buf(); @@ -618,7 +618,7 @@ impl Tree { ) -> crate::Result> { // NOTE: Create key hash for hash sharing // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ - let key_hash = crate::super_segment::filter::standard_bloom::Builder::get_hash(key); + let key_hash = 
crate::segment::filter::standard_bloom::Builder::get_hash(key); let level_manifest = self.levels.read().expect("lock is poisoned"); From 9f8caa0f598ff96bdc9ed3311ee5aeb6741a8a04 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:44:42 +0200 Subject: [PATCH 072/613] wip --- src/lib.rs | 2 +- src/segment/scanner.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 9253aaae..3729040e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,7 +200,7 @@ pub mod coding { #[doc(hidden)] pub use { merge::BoxedIterator, - segment::{block::Checksum, GlobalSegmentId, SegmentId}, + segment::{block::Checksum, GlobalSegmentId, Segment, SegmentId}, tree::inner::TreeId, value::InternalValue, }; diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs index c04f50f3..65e73fd2 100644 --- a/src/segment/scanner.rs +++ b/src/segment/scanner.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::{Block, DataBlock}; use crate::{CompressionType, InternalValue}; use self_cell::self_cell; From f3333cc6856ec898ce091cbcb63c7654708d606c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:47:04 +0200 Subject: [PATCH 073/613] wip --- src/segment/meta.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index c6e4eb06..8517dc2b 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -49,6 +49,15 @@ impl ParsedMeta { let block = Block::from_file(file, ptr.offset(), ptr.size(), CompressionType::None)?; let block = DataBlock::new(block); + assert_eq!( + b"xxh3", + &*block + .point_read(b"#hash_type", None) + .expect("hash type should exist") + .value, + "invalid hash type", + ); + let id = { let bytes = block .point_read(b"#id", None) From 817723d37a3b3d009adbf92542fdb52f6736cb88 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:47:28 +0200 Subject: [PATCH 074/613] wip --- src/segment/meta.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 8517dc2b..10632b2b 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -44,6 +44,7 @@ pub struct ParsedMeta { } impl ParsedMeta { + #[allow(clippy::expect_used)] pub fn from_trailer(file: &File, trailer: &Trailer) -> crate::Result { let ptr = trailer.metadata; let block = Block::from_file(file, ptr.offset(), ptr.size(), CompressionType::None)?; From d1dd45914022fcc898921ddcac970a3adc92f4e5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:47:55 +0200 Subject: [PATCH 075/613] wip --- src/segment/writer/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index d7ab10d5..59c5f3e7 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -208,6 +208,8 @@ impl Writer { Ok(()) } + // TODO: 3.0.0 split meta writing into new function + #[allow(clippy::too_many_lines)] /// Finishes the segment, making sure all data is written durably pub fn finish(mut self) -> crate::Result> { self.spill_block()?; From befbf359eb303c113734c4f859c5a4873a01b9c3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:52:45 +0200 Subject: [PATCH 076/613] wip bloom bench --- benches/bloom.rs | 19 ++++++++++++------- src/segment/filter/standard_bloom/mod.rs | 2 +- src/segment/mod.rs | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git 
a/benches/bloom.rs b/benches/bloom.rs index 4e10f08b..88902a64 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -1,34 +1,39 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use lsm_tree::bloom::BloomFilter; fn filter_construction(c: &mut Criterion) { - let mut filter = BloomFilter::with_fp_rate(1_000_000, 0.01); + use lsm_tree::segment::filter::standard_bloom::Builder; + + let mut filter = Builder::with_fp_rate(500_000_000, 0.01); c.bench_function("bloom filter add key", |b| { b.iter(|| { let key = nanoid::nanoid!(); - filter.set_with_hash(BloomFilter::get_hash(key.as_bytes())); + filter.set_with_hash(Builder::get_hash(key.as_bytes())); }); }); } fn filter_contains(c: &mut Criterion) { + use lsm_tree::segment::filter::standard_bloom::Builder; + let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) .collect::>(); for fpr in [0.01, 0.001, 0.0001, 0.00001] { - let mut filter = BloomFilter::with_fp_rate(100_000, fpr); + let mut filter = Builder::with_fp_rate(100_000_000, fpr); for key in &keys { - filter.set_with_hash(BloomFilter::get_hash(key)); + filter.set_with_hash(Builder::get_hash(key)); } let mut rng = rand::rng(); + let filter = filter.build(); + c.bench_function( &format!( - "bloom filter contains key, true positive ({}%)", + "standard bloom filter contains key, true positive ({}%)", fpr * 100.0, ), |b| { @@ -36,7 +41,7 @@ fn filter_contains(c: &mut Criterion) { use rand::seq::IndexedRandom; let sample = keys.choose(&mut rng).unwrap(); - let hash = BloomFilter::get_hash(sample); + let hash = Builder::get_hash(sample); assert!(filter.contains_hash(hash)); }); }, diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 16d8b413..2fd8bba0 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -98,7 +98,7 @@ impl StandardBloomFilter { /// /// Will never have a false negative. 
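///
/// Query and construction derive the same probe sequence from one
/// `(h1, h2)` composite hash via enhanced double hashing, so a key that
/// went through `Builder::set_with_hash` is always found again. A short
/// sketch of the round trip (`keys` is illustrative):
///
/// ```ignore
/// let mut builder = Builder::with_fp_rate(keys.len(), 0.01);
/// for key in &keys {
///     builder.set_with_hash(Builder::get_hash(key));
/// }
///
/// let filter = builder.build();
/// for key in &keys {
///     // never a false negative
///     assert!(filter.contains_hash(Builder::get_hash(key)));
/// }
/// ```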
#[must_use] - pub(crate) fn contains_hash(&self, hash: CompositeHash) -> bool { + pub fn contains_hash(&self, hash: CompositeHash) -> bool { let (mut h1, mut h2) = hash; for i in 1..=(self.k as u64) { diff --git a/src/segment/mod.rs b/src/segment/mod.rs index ba14625f..d4eec1a5 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -5,7 +5,7 @@ pub mod block; mod block_index; pub(crate) mod data_block; -pub(crate) mod filter; +pub mod filter; mod id; mod index_block; mod inner; From 6ff76d216c185663a07ade247ad6df2dc3c2d72c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 13:59:42 +0200 Subject: [PATCH 077/613] restore some tests --- UNSAFE.md | 6 ++ src/segment/filter/standard_bloom/mod.rs | 72 +++++++++++++++--------- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/UNSAFE.md b/UNSAFE.md index 0952bb52..773d3e44 100644 --- a/UNSAFE.md +++ b/UNSAFE.md @@ -9,3 +9,9 @@ cargo +nightly fuzz run data_block -- -max_len=8000000 cargo +nightly fuzz run index_block -- -max_len=8000000 cargo +nightly fuzz run partition_point -- -max_len=1000000 ``` + +## Run mutation testing + +```bash +cargo-mutants mutants --test-tool=nextest +``` diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 2fd8bba0..7670a640 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -136,21 +136,20 @@ impl StandardBloomFilter { } } -// TODO: restore #[cfg(test)] mod tests { use super::*; use std::fs::File; use test_log::test; - /* #[test] + #[test] fn bloom_serde_round_trip() -> crate::Result<()> { let dir = tempfile::tempdir()?; let path = dir.path().join("bf"); let mut file = File::create(&path)?; - let mut filter = StandardBloomFilter::with_fp_rate(10, 0.0001); + let mut filter = Builder::with_fp_rate(10, 0.0001); let keys = &[ b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", @@ -161,6 +160,8 @@ mod tests { filter.set_with_hash(StandardBloomFilter::get_hash(*key)); } + let filter = filter.build(); + for key in keys { assert!(filter.contains(&**key)); } @@ -175,7 +176,7 @@ mod tests { let mut file = File::open(&path)?; let filter_copy = StandardBloomFilter::decode_from(&mut file)?; - assert_eq!(filter, filter_copy); + assert_eq!(filter.inner, filter_copy.inner); for key in keys { assert!(filter.contains(&**key)); @@ -185,38 +186,53 @@ mod tests { assert!(!filter_copy.contains(b"cxycxycxy")); Ok(()) - } */ + } - /* #[test] + #[test] fn bloom_basic() { - let mut filter = StandardBloomFilter::with_fp_rate(10, 0.0001); + let mut filter = Builder::with_fp_rate(10, 0.0001); + + let keys = [ + b"item0" as &[u8], + b"item1", + b"item2", + b"item3", + b"item4", + b"item5", + b"item6", + b"item7", + b"item8", + b"item9", + ]; - for key in [ - b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", - b"item8", b"item9", - ] { - assert!(!filter.contains(key)); - filter.set_with_hash(StandardBloomFilter::get_hash(key)); - assert!(filter.contains(key)); + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } - assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + let filter = filter.build(); + + for key in &keys { + assert!(filter.contains(key)); } - } */ - /* #[test] + assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + } + + #[test] fn bloom_bpk() { let item_count = 1_000; let bpk = 5; - let mut filter = StandardBloomFilter::with_bpk(item_count, bpk); + let mut filter = Builder::with_bpk(item_count, bpk); for key in 
(0..item_count).map(|_| nanoid::nanoid!()) { let key = key.as_bytes(); - filter.set_with_hash(StandardBloomFilter::get_hash(key)); - assert!(filter.contains(key)); + filter.set_with_hash(Builder::get_hash(key)); } + let filter = filter.build(); + let mut false_positives = 0; for key in (0..item_count).map(|_| nanoid::nanoid!()) { @@ -237,15 +253,16 @@ let item_count = 100_000; let wanted_fpr = 0.1; - let mut filter = StandardBloomFilter::with_fp_rate(item_count, wanted_fpr); + let mut filter = Builder::with_fp_rate(item_count, wanted_fpr); for key in (0..item_count).map(|_| nanoid::nanoid!()) { let key = key.as_bytes(); - filter.set_with_hash(StandardBloomFilter::get_hash(key)); - assert!(filter.contains(key)); + filter.set_with_hash(Builder::get_hash(key)); } + let filter = filter.build(); + let mut false_positives = 0; for key in (0..item_count).map(|_| nanoid::nanoid!()) { @@ -267,15 +284,16 @@ let item_count = 100_000; let wanted_fpr = 0.5; - let mut filter = StandardBloomFilter::with_fp_rate(item_count, wanted_fpr); + let mut filter = Builder::with_fp_rate(item_count, wanted_fpr); for key in (0..item_count).map(|_| nanoid::nanoid!()) { let key = key.as_bytes(); - filter.set_with_hash(StandardBloomFilter::get_hash(key)); - assert!(filter.contains(key)); + filter.set_with_hash(Builder::get_hash(key)); } + let filter = filter.build(); + let mut false_positives = 0; for key in (0..item_count).map(|_| nanoid::nanoid!()) { @@ -290,5 +308,5 @@ let fpr = false_positives as f32 / item_count as f32; assert!(fpr > 0.45); assert!(fpr < 0.55); - } */ + } } From 2ab142e369640c4f3285fb2e7a74aeb8cd433aa0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 14:10:04 +0200 Subject: [PATCH 078/613] wip --- src/segment/meta.rs | 9 +++++++++ src/segment/writer/mod.rs | 1 + 2 files changed, 10 insertions(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 10632b2b..f3bb818e 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -59,6 +59,15 @@ impl ParsedMeta { "invalid hash type", ); + assert_eq!( + b"xxh3", + &*block + .point_read(b"#checksum_type", None) + .expect("checksum type should exist") + .value, + "invalid checksum type", + ); + let id = { let bytes = block .point_read(b"#id", None) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 59c5f3e7..93c60694 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -288,6 +288,7 @@ impl Writer { } let meta_items = [ + meta("#checksum_type", b"xxh3"), meta("#compression#data", &self.compression.encode_into_vec()), meta("#created_at", &unix_timestamp().as_nanos().to_le_bytes()), meta( From 068ec9096d50cb191938495cd8304a1107248ad0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 14:14:25 +0200 Subject: [PATCH 079/613] wip --- src/segment/filter/bit_array/mod.rs | 14 +++----------- src/segment/filter/standard_bloom/builder.rs | 7 +------ 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/src/segment/filter/bit_array/mod.rs b/src/segment/filter/bit_array/mod.rs index c510c482..2be288bc 100644 --- a/src/segment/filter/bit_array/mod.rs +++ b/src/segment/filter/bit_array/mod.rs @@ -9,6 +9,7 @@ pub use sliced::BitArray as BitArrayReader; const BIT_MASK: u8 = 0b1000_0000_u8; /// Sets a bit in the byte +#[must_use] pub fn set_bit(byte: u8, idx: usize, value: bool) -> u8 { let bit_mask = BIT_MASK >> idx; @@ -41,12 +42,12 @@ impl Builder { } /// Sets the i-th bit - pub fn set(&mut self, idx: usize, val: bool) { + pub fn enable_bit(&mut 
self, idx: usize) { let byte_idx = idx / 8; let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); let bit_idx = idx % 8; - *byte = set_bit(*byte, bit_idx, val); + *byte = set_bit(*byte, bit_idx, true); } } @@ -62,13 +63,4 @@ mod tests { assert_eq!(0b0100_0000, set_bit(0, 1, true)); assert_eq!(0b0100_0110, set_bit(0b0000_0110, 1, true)); } - - #[test] - fn bit_set_false() { - assert_eq!(0b1111_1101, set_bit(0xFF, 6, false)); - assert_eq!(0b0111_1111, set_bit(0xFF, 0, false)); - assert_eq!(0b1011_1111, set_bit(0xFF, 1, false)); - - assert_eq!(0b0000_0110, set_bit(0b0100_0110, 1, false)); - } } diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index 8dc59d54..eab00c51 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -97,18 +97,13 @@ impl Builder { for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); - self.enable_bit(idx as usize); + self.inner.enable_bit(idx as usize); h1 = h1.wrapping_add(h2); h2 = h2.wrapping_mul(i); } } - /// Sets the bit at the given index to `true`. - fn enable_bit(&mut self, idx: usize) { - self.inner.set(idx, true); - } - /// Gets the hash of a key. #[must_use] pub fn get_hash(key: &[u8]) -> CompositeHash { From 8fa81776c6e826c56cb59a9cce2d4f393f72dade Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 14:16:07 +0200 Subject: [PATCH 080/613] wip --- src/segment/filter/bit_array/mod.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/segment/filter/bit_array/mod.rs b/src/segment/filter/bit_array/mod.rs index 2be288bc..b9696b13 100644 --- a/src/segment/filter/bit_array/mod.rs +++ b/src/segment/filter/bit_array/mod.rs @@ -63,4 +63,16 @@ mod tests { assert_eq!(0b0100_0000, set_bit(0, 1, true)); assert_eq!(0b0100_0110, set_bit(0b0000_0110, 1, true)); } + + #[test] + fn bit_array_builder_basic() { + let mut builder = Builder::with_capacity(1); + assert_eq!(&[0], builder.bytes()); + + builder.enable_bit(0); + assert_eq!(&[0b1000_0000], builder.bytes()); + + builder.enable_bit(7); + assert_eq!(&[0b1000_0001], builder.bytes()); + } } From dc7a8579f063d51c257e1777ebf93a495e872399 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 14:20:34 +0200 Subject: [PATCH 081/613] wip --- src/segment/filter/standard_bloom/builder.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index eab00c51..e8cdfd65 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -23,6 +23,7 @@ pub struct Builder { #[allow(clippy::len_without_is_empty)] impl Builder { + #[must_use] pub fn build(self) -> StandardBloomFilter { StandardBloomFilter { inner: BitArrayReader::new(self.inner.bytes().into()), From 86f4f38acd556f9e9cbf536be1a3403675921dfb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 14:31:40 +0200 Subject: [PATCH 082/613] blob caching --- src/blob_tree/cache.rs | 8 +--- src/cache.rs | 101 ++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 63 deletions(-) diff --git a/src/blob_tree/cache.rs b/src/blob_tree/cache.rs index e5753471..2d1946ff 100644 --- a/src/blob_tree/cache.rs +++ b/src/blob_tree/cache.rs @@ -11,9 +11,7 @@ impl BlobCache for MyBlobCache { vlog_id: value_log::ValueLogId, vhandle: &value_log::ValueHandle, ) -> Option { - todo!() - - // self.0.get_blob(vlog_id, vhandle) + self.0.get_blob(vlog_id, vhandle) } fn 
insert( @@ -22,8 +20,6 @@ impl BlobCache for MyBlobCache { vhandle: &value_log::ValueHandle, value: value_log::UserValue, ) { - todo!() - - // self.0.insert_blob(vlog_id, vhandle, value); + self.0.insert_blob(vlog_id, vhandle, value); } } diff --git a/src/cache.rs b/src/cache.rs index c2247055..e8b1fdd0 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -3,22 +3,19 @@ // (found in the LICENSE-* files in the repository) use crate::segment::block::Header; -use crate::segment::{Block, BlockOffset, DataBlock}; -use crate::GlobalSegmentId; +use crate::segment::{Block, BlockOffset, DataBlock, IndexBlock}; +use crate::{GlobalSegmentId, UserValue}; use quick_cache::Weighter; use quick_cache::{sync::Cache as QuickCache, Equivalent}; const TAG_BLOCK: u8 = 0; const TAG_BLOB: u8 = 1; -/* #[derive(Clone)] +#[derive(Clone)] enum Item { - DataBlock(Arc), - IndexBlock(Arc), + Block(Block), Blob(UserValue), -} */ - -type Item = Block; +} #[derive(Eq, std::hash::Hash, PartialEq)] struct CacheKey(u8, u64, u64, u64); @@ -39,8 +36,15 @@ impl From<(u8, u64, u64, u64)> for CacheKey { struct BlockWeighter; impl Weighter for BlockWeighter { - fn weight(&self, _: &CacheKey, block: &Item) -> u64 { - (Header::serialized_len() as u64) + Into::::into(block.header.uncompressed_length) + fn weight(&self, _: &CacheKey, item: &Item) -> u64 { + use Item::{Blob, Block}; + + match item { + Block(b) => { + (Header::serialized_len() as u64) + Into::::into(b.header.uncompressed_length) + } + Blob(b) => b.len() as u64, + } } } @@ -122,65 +126,50 @@ impl Cache { self.data.is_empty() } + #[doc(hidden)] + #[must_use] + pub fn get_index_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option { + let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); + + Some(match self.data.get(&key)? { + Item::Block(block) => IndexBlock::new(block), + Item::Blob(_) => unreachable!("invalid cache item"), + }) + } + #[doc(hidden)] #[must_use] pub fn get_data_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option { let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); - self.data.get(&key).map(DataBlock::new) + + Some(match self.data.get(&key)? { + Item::Block(block) => DataBlock::new(block), + Item::Blob(_) => unreachable!("invalid cache item"), + }) } #[doc(hidden)] - pub fn insert_block(&self, id: GlobalSegmentId, offset: BlockOffset, value: Item) { + pub fn insert_block(&self, id: GlobalSegmentId, offset: BlockOffset, block: Block) { self.data.insert( (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), - value, + Item::Block(block), ); } - /* #[doc(hidden)] - pub fn insert_index_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - value: Arc, - ) { - self.data.insert( - (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(), - Item::IndexBlock(value), - ); - } */ - - /* #[doc(hidden)] - #[must_use] - pub fn get_index_block( - &self, - id: GlobalSegmentId, - offset: BlockOffset, - ) -> Option> { - let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); - - if let Item::IndexBlock(block) = self.data.get(&key)? 
{ - Some(block) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } */ - - /* #[doc(hidden)] + #[doc(hidden)] pub fn insert_blob( &self, vlog_id: value_log::ValueLogId, vhandle: &value_log::ValueHandle, value: UserValue, ) { - self.data.insert( - (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(), - Item::Blob(value), - ); - } */ + self.data.insert( + (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(), + Item::Blob(value), + ); + } - /* #[doc(hidden)] + #[doc(hidden)] #[must_use] pub fn get_blob( &self, @@ -189,11 +178,9 @@ impl Cache { ) -> Option { let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(); - if let Item::Blob(blob) = self.data.get(&key)? { - Some(blob) - } else { - log::warn!("cache item type was unexpected - this is a bug"); - None - } - } */ + Some(match self.data.get(&key)? { + Item::Blob(blob) => blob, + Item::Block(_) => unreachable!("invalid cache item"), + }) + } } From acc5108671fbb43739429b263a0cf51c66de2440 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 16:48:42 +0200 Subject: [PATCH 083/613] restore standard bloom construction policy --- src/compaction/worker.rs | 44 ++++++++++++++++------------------ src/segment/data_block/mod.rs | 44 ++++++++++++++++++---------------- src/segment/filter/mod.rs | 34 ++++++++++++++++++++++++++ src/segment/multi_writer.rs | 27 +++++++-------------- src/segment/writer/mod.rs | 35 ++++++++++++++++----------- src/tree/mod.rs | 45 +++++++++++++---------------------- 6 files changed, 123 insertions(+), 106 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index b3e8ee39..c004de80 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -273,30 +273,26 @@ fn merge_segments( let mut segment_writer = segment_writer .use_compression(opts.config.compression) - .use_data_block_size(opts.config.data_block_size); - - /* { - use crate::segment::writer::BloomConstructionPolicy; - - if opts.config.bloom_bits_per_key >= 0 { - // NOTE: Apply some MONKEY to have very high FPR on small levels - // because it's cheap - // - // See https://nivdayan.github.io/monkeykeyvaluestore.pdf - let bloom_policy = match payload.dest_level { - 0 => BloomConstructionPolicy::FpRate(0.00001), - 1 => BloomConstructionPolicy::FpRate(0.0005), - _ => BloomConstructionPolicy::BitsPerKey( - opts.config.bloom_bits_per_key.unsigned_abs(), - ), - }; - - segment_writer = segment_writer.use_bloom_policy(bloom_policy); - } else { - segment_writer = - segment_writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); - } - } */ + .use_data_block_size(opts.config.data_block_size) + .use_bloom_policy({ + use crate::segment::filter::BloomConstructionPolicy; + + if opts.config.bloom_bits_per_key >= 0 { + // NOTE: Apply some MONKEY to have very high FPR on small levels + // because it's cheap + // + // See https://nivdayan.github.io/monkeykeyvaluestore.pdf + match payload.dest_level { + 0 => BloomConstructionPolicy::FpRate(0.00001), + 1 => BloomConstructionPolicy::FpRate(0.0005), + _ => BloomConstructionPolicy::BitsPerKey( + opts.config.bloom_bits_per_key.unsigned_abs(), + ), + } + } else { + BloomConstructionPolicy::BitsPerKey(0) + } + }); for (idx, item) in merge_iter.enumerate() { let Ok(item) = item else { diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 8be90689..2824467f 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -706,30 +706,32 @@ mod tests { 
InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); + for restart_interval in 1..=20 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in &items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); + assert_eq!(None, data_block.point_read(b"yyy", None)); } - assert_eq!(None, data_block.point_read(b"yyy", None)); - Ok(()) } diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 4e6962dd..2f53ab5b 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -4,3 +4,37 @@ pub mod bit_array; pub mod standard_bloom; + +use standard_bloom::Builder as StandardBloomFilterBuilder; + +#[derive(Copy, Clone, Debug)] +pub enum BloomConstructionPolicy { + BitsPerKey(u8), + FpRate(f32), +} + +impl Default for BloomConstructionPolicy { + fn default() -> Self { + Self::BitsPerKey(10) + } +} + +impl BloomConstructionPolicy { + #[must_use] + pub fn init(&self, n: usize) -> StandardBloomFilterBuilder { + use standard_bloom::Builder; + + match self { + Self::BitsPerKey(bpk) => Builder::with_bpk(n, *bpk), + Self::FpRate(fpr) => Builder::with_fp_rate(n, *fpr), + } + } + + #[must_use] + pub fn is_active(&self) -> bool { + match self { + Self::BitsPerKey(bpk) => *bpk > 0, + Self::FpRate(fpr) => *fpr > 0.0, + } + } +} diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 8d401ebe..0fce03be 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::writer::Writer; +use super::{filter::BloomConstructionPolicy, writer::Writer}; use crate::{value::InternalValue, CompressionType, SegmentId, UserKey}; use std::{ path::PathBuf, @@ -24,7 +24,6 @@ pub struct MultiWriter { /// resulting in a sorted "run" of segments pub target_size: u64, - // pub opts: Options, results: Vec, segment_id_generator: Arc, @@ -34,7 +33,8 @@ pub struct MultiWriter { pub compression: CompressionType, - // bloom_policy: BloomConstructionPolicy, + bloom_policy: BloomConstructionPolicy, + current_key: Option, } @@ -44,7 +44,6 @@ impl MultiWriter { base_path: PathBuf, segment_id_generator: Arc, target_size: u64, - // opts: Options, ) -> crate::Result { let current_segment_id = segment_id_generator.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -52,13 +51,6 @@ impl MultiWriter { let path = base_path.join(current_segment_id.to_string()); let writer = Writer::new(path, current_segment_id)?; - /* let writer = Writer::new(Options { - segment_id: 
current_segment_id, - folder: opts.folder.clone(), - data_block_size: opts.data_block_size, - index_block_size: opts.index_block_size, - })?; */ - Ok(Self { base_path, @@ -66,14 +58,14 @@ impl MultiWriter { target_size, results: Vec::with_capacity(10), - // opts, segment_id_generator, current_segment_id, writer, compression: CompressionType::None, - // bloom_policy: BloomConstructionPolicy::default(), + bloom_policy: BloomConstructionPolicy::default(), + current_key: None, }) } @@ -95,12 +87,12 @@ impl MultiWriter { self } - /* #[must_use] + #[must_use] pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { self.bloom_policy = bloom_policy; self.writer = self.writer.use_bloom_policy(bloom_policy); self - } */ + } fn get_next_segment_id(&mut self) -> u64 { self.current_segment_id = self @@ -119,9 +111,8 @@ impl MultiWriter { let new_writer = Writer::new(path, new_segment_id)? .use_compression(self.compression) - .use_data_block_size(self.data_block_size); - - // new_writer = new_writer.use_bloom_policy(self.bloom_policy); + .use_data_block_size(self.data_block_size) + .use_bloom_policy(self.bloom_policy); let old_writer = std::mem::replace(&mut self.writer, new_writer); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 93c60694..369d31b4 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -2,8 +2,8 @@ mod index; mod meta; use super::{ - block::Header as BlockHeader, trailer::Trailer, Block, BlockOffset, DataBlock, - NewKeyedBlockHandle, + block::Header as BlockHeader, filter::BloomConstructionPolicy, trailer::Trailer, Block, + BlockOffset, DataBlock, NewKeyedBlockHandle, }; use crate::{ coding::Encode, @@ -27,6 +27,7 @@ pub struct Writer { segment_id: SegmentId, data_block_size: u32, + index_block_size: u32, // TODO: implement /// Compression to use compression: CompressionType, @@ -49,7 +50,8 @@ pub struct Writer { current_key: Option, - // bloom_policy: BloomConstructionPolicy, + bloom_policy: BloomConstructionPolicy, + /// Hashes for bloom filter /// /// using enhanced double hashing, so we got two u64s @@ -67,6 +69,7 @@ impl Writer { segment_id, data_block_size: 4_096, + index_block_size: 4_096, compression: CompressionType::None, @@ -83,6 +86,8 @@ impl Writer { current_key: None, + bloom_policy: BloomConstructionPolicy::default(), + bloom_hash_buffer: Vec::new(), }) } @@ -104,6 +109,12 @@ impl Writer { self } + #[must_use] + pub(crate) fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { + self.bloom_policy = bloom_policy; + self + } + /// Writes an item. 
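///
/// A typical call chain when building a segment (sketch; the path, block
/// size and bloom policy are illustrative, mirroring the builder methods
/// above):
///
/// ```ignore
/// let mut writer = Writer::new(path, segment_id)?
///     .use_compression(CompressionType::None)
///     .use_data_block_size(4_096)
///     // e.g. a very low FPR for L0, per the MONKEY-style policy
///     .use_bloom_policy(BloomConstructionPolicy::FpRate(0.00001));
///
/// for item in items {
///     writer.write(item)?;
/// }
///
/// writer.finish()?;
/// ```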
/// /// # Note @@ -125,11 +136,10 @@ impl Writer { // because there may be multiple versions // of the same key - // TODO: policy - //if self.bloom_policy.is_active() { - self.bloom_hash_buffer - .push(Builder::get_hash(&item.key.user_key)); - // } + if self.bloom_policy.is_active() { + self.bloom_hash_buffer + .push(Builder::get_hash(&item.key.user_key)); + } } let seqno = item.key.seqno; @@ -233,18 +243,15 @@ impl Writer { let filter_ptr = self.block_writer.stream_position()?; let n = self.bloom_hash_buffer.len(); - // TODO: - /* log::trace!( + log::trace!( "Constructing Bloom filter with {n} entries: {:?}", self.bloom_policy, - ); */ + ); let start = std::time::Instant::now(); - // let mut filter = self.bloom_policy.build(n); - let filter = { - let mut builder = Builder::with_bpk(n, 10); + let mut builder = self.bloom_policy.init(n); for hash in std::mem::take(&mut self.bloom_hash_buffer) { builder.set_with_hash(hash); diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 9bd40d52..ea0585f8 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -177,40 +177,27 @@ impl AbstractTree for Tree { memtable: &Arc, seqno_threshold: SeqNo, ) -> crate::Result> { - use crate::{ - compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer, - }; + use crate::{compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer}; use std::time::Instant; let start = Instant::now(); let folder = self.config.path.join(SEGMENTS_FOLDER); - log::debug!("writing segment to {folder:?}"); - - let mut segment_writer = Writer::new( - folder.join(segment_id.to_string()), - segment_id, - /* Options { - segment_id, - folder, - data_block_size: self.config.data_block_size, - index_block_size: self.config.index_block_size, - } */ - )? - .use_compression(self.config.compression) - .use_data_block_size(self.config.data_block_size); - - /* { - use crate::segment::writer::BloomConstructionPolicy; - - if self.config.bloom_bits_per_key >= 0 { - segment_writer = - segment_writer.use_bloom_policy(BloomConstructionPolicy::FpRate(0.00001)); - } else { - segment_writer = - segment_writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); - } - } */ + let segment_file_path = folder.join(segment_id.to_string()); + log::debug!("writing segment to {segment_file_path:?}"); + + let mut segment_writer = Writer::new(segment_file_path, segment_id)? 
+ .use_compression(self.config.compression) + .use_data_block_size(self.config.data_block_size) + .use_bloom_policy({ + use crate::segment::filter::BloomConstructionPolicy; + + if self.config.bloom_bits_per_key >= 0 { + BloomConstructionPolicy::FpRate(0.00001) + } else { + BloomConstructionPolicy::BitsPerKey(0) + } + }); let iter = memtable.iter().map(Ok); let compaction_filter = CompactionStream::new(iter, seqno_threshold); From 0ba8fd3b2b504237f526442af65306ce8e4f7f0d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 17:27:44 +0200 Subject: [PATCH 084/613] fix fuzz --- fuzz/fuzz_targets/data_block.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index fc61a741..17eda8ca 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -2,7 +2,7 @@ use arbitrary::{Arbitrary, Result, Unstructured}; use libfuzzer_sys::fuzz_target; use lsm_tree::{ - super_segment::{block::BlockOffset, Block, DataBlock}, + segment::{block::BlockOffset, Block, DataBlock}, InternalValue, SeqNo, ValueType, }; @@ -105,8 +105,8 @@ fuzz_target!(|data: &[u8]| { let data_block = DataBlock::new(Block { data: bytes.into(), - header: lsm_tree::super_segment::block::Header { - checksum: lsm_tree::super_segment::Checksum::from_raw(0), + header: lsm_tree::segment::block::Header { + checksum: lsm_tree::segment::Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, previous_block_offset: BlockOffset(0), @@ -133,9 +133,7 @@ fuzz_target!(|data: &[u8]| { assert_eq!( Some(needle.clone()), - data_block - .point_read(&needle.key.user_key, Some(needle.key.seqno + 1)) - .unwrap(), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), ); assert_eq!( @@ -143,7 +141,8 @@ fuzz_target!(|data: &[u8]| { items .iter() .find(|item| item.key.user_key == needle.key.user_key) - .cloned(), + .cloned() + .unwrap(), ); } From 4d10d5164a2d7289dba32d4b272cf3bdaadb4842 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 19:36:40 +0200 Subject: [PATCH 085/613] msrv 1.80 --- .github/workflows/test.yml | 2 +- Cargo.toml | 4 ---- src/path.rs | 5 +---- src/range.rs | 9 ++------- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cce0cb65..aa40f91e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: matrix: rust_version: - stable - - "1.75.0" # MSRV + - "1.80.0" # MSRV os: - ubuntu-latest - windows-latest diff --git a/Cargo.toml b/Cargo.toml index c0ab3aec..5064db0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,6 @@ interval-heap = "0.0.5" log = "0.4.22" lz4_flex = { version = "0.11.3", optional = true, default-features = false } miniz_oxide = { version = "0.8.0", optional = true } -path-absolutize = "3.1.1" quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" @@ -49,9 +48,6 @@ nanoid = "0.4.0" rand = "0.9.0" test-log = "0.2.16" -# half 2.5.0 has MSRV 1.81 -half = "=2.4.0" - [package.metadata.cargo-all-features] denylist = [] diff --git a/src/path.rs b/src/path.rs index 12e13732..2a358f2e 100644 --- a/src/path.rs +++ b/src/path.rs @@ -2,15 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use path_absolutize::Absolutize; use std::path::{Path, PathBuf}; -// TODO: std::path::absolute 1.79 - 
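With the MSRV raised to 1.80, `std::path::absolute` (stable since 1.79) replaces the `path-absolutize` dependency, as this hunk does. For reference, a minimal sketch of the std call being switched to (the example path is made up):

```rust
use std::path::Path;

fn main() -> std::io::Result<()> {
    // Unlike `Path::canonicalize`, `std::path::absolute` does not access
    // the filesystem, so the path does not need to exist.
    let p = std::path::absolute(Path::new("some/relative/file.rs"))?;
    assert!(p.is_absolute());
    Ok(())
}
```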
#[allow(clippy::module_name_repetitions)] pub fn absolute_path(path: &Path) -> PathBuf { // NOTE: Not sure if this can even fail realistically // not much we can do about it #[allow(clippy::expect_used)] - path.absolutize().expect("should be absolute path").into() + std::path::absolute(path).expect("should be absolute path") } diff --git a/src/range.rs b/src/range.rs index e1a6023a..d0191e8f 100644 --- a/src/range.rs +++ b/src/range.rs @@ -301,13 +301,8 @@ mod tests { _ if prefix.is_empty() => Unbounded, _ => Included(Slice::from(prefix)), }, - // TODO: Bound::map 1.77 - match upper_bound { - Unbounded => Unbounded, - Included(x) => Included(Slice::from(x)), - Excluded(x) => Excluded(Slice::from(x)), - } - ) + upper_bound.map(Slice::from), + ), ); } From 5f1f61df3eb354773b43e7e2f074f90895a56686 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 21:43:11 +0200 Subject: [PATCH 086/613] wip --- fuzz/fuzz_targets/data_block.rs | 13 ++++++++----- src/segment/index_block/mod.rs | 16 ++++++++-------- src/segment/mod.rs | 3 +-- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs index 17eda8ca..1b4e67d5 100644 --- a/fuzz/fuzz_targets/data_block.rs +++ b/fuzz/fuzz_targets/data_block.rs @@ -101,9 +101,11 @@ fuzz_target!(|data: &[u8]| { } */ let items = items.into_iter().map(|value| value.0).collect::>(); - let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); - let data_block = DataBlock::new(Block { + for restart_interval in 1..=u8::MAX { + let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); + + let data_block = DataBlock::new(Block { data: bytes.into(), header: lsm_tree::segment::block::Header { checksum: lsm_tree::segment::Checksum::from_raw(0), @@ -250,8 +252,9 @@ fuzz_target!(|data: &[u8]| { assert_eq!( expected_range, data_block - .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) - .collect::>(), - ); + .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) + .collect::>(), + ); + } } }); diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 7295d96e..558fe9cb 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -68,13 +68,13 @@ impl IndexBlock { } } - /// Returns the amount of items in the block + /// Returns the amount of items in the block. #[must_use] - pub fn item_count(&self) -> usize { + pub fn len(&self) -> usize { Trailer::new(&self.inner).item_count() } - /// Always returns false: a block is never empty + /// Always returns false: a block is never empty. 
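/// (Index blocks are only built through `encode_items`, which requires a
/// non-empty list of handles, so an encoded block always has `len() >= 1`.)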
#[must_use] pub fn is_empty(&self) -> bool { false @@ -371,7 +371,7 @@ mod tests { }, }); - assert_eq!(data_block.item_count(), items.len()); + assert_eq!(data_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), @@ -419,7 +419,7 @@ mod tests { }, }); - assert_eq!(data_block.item_count(), items.len()); + assert_eq!(data_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), @@ -462,7 +462,7 @@ mod tests { }, }); - assert_eq!(data_block.item_count(), items.len()); + assert_eq!(data_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), @@ -508,7 +508,7 @@ mod tests { }, }); - assert_eq!(data_block.item_count(), 1); + assert_eq!(data_block.len(), 1); assert_eq!( Some(item.clone()), @@ -548,7 +548,7 @@ mod tests { }, }); - assert_eq!(data_block.item_count(), 1); + assert_eq!(data_block.len(), 1); assert_eq!( Some(item.clone()), diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d4eec1a5..60a9df22 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -19,7 +19,7 @@ mod writer; pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; pub use id::{GlobalSegmentId, SegmentId}; -pub use index_block::{IndexBlock, NewKeyedBlockHandle}; +pub use index_block::{IndexBlock, NewBlockHandle, NewKeyedBlockHandle}; pub use scanner::Scanner; pub use writer::Writer; @@ -28,7 +28,6 @@ use crate::{ }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; -use index_block::NewBlockHandle; use inner::Inner; use meta::ParsedMeta; use std::{ From be7cae4bc76daabce7eeeb7eebcca1b70fb98c8e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Apr 2025 21:45:52 +0200 Subject: [PATCH 087/613] rename structs --- src/segment/block_index/mod.rs | 18 ++++++------ src/segment/index_block/block_handle.rs | 34 ++++++++++----------- src/segment/index_block/forward_reader.rs | 22 +++++++------- src/segment/index_block/mod.rs | 36 +++++++++++------------ src/segment/mod.rs | 4 +-- src/segment/trailer.rs | 30 +++++++++---------- src/segment/writer/index.rs | 30 +++++++++---------- src/segment/writer/mod.rs | 10 +++---- 8 files changed, 92 insertions(+), 92 deletions(-) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index f6e0eb01..e178f570 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{CachePolicy, IndexBlock, NewKeyedBlockHandle}; +use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; #[enum_dispatch::enum_dispatch] pub trait NewBlockIndex { @@ -11,20 +11,20 @@ pub trait NewBlockIndex { &self, key: &[u8], cache_policy: CachePolicy, - ) -> crate::Result>; // TODO: return NewBlockHandle (::into_non_keyed) + ) -> crate::Result>; // TODO: return BlockHandle (::into_non_keyed) /// Gets the last block handle that can possibly contain the given item. fn get_last_block_containing_key( &self, key: &[u8], cache_policy: CachePolicy, - ) -> crate::Result>; + ) -> crate::Result>; /// Returns a handle to the last block. 
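///
/// (For point reads, callers go through `get_lowest_block_containing_key`
/// instead; sketch, with `index` being any implementation of this trait and
/// `CachePolicy::Write` just an example policy:)
///
/// ```ignore
/// if let Some(handle) =
///     index.get_lowest_block_containing_key(key, CachePolicy::Write)?
/// {
///     // load the data block referenced by `handle`,
///     // then point-read `key` inside it
/// }
/// ```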
fn get_last_block_handle( &self, cache_policy: CachePolicy, - ) -> crate::Result; + ) -> crate::Result; } /// The block index stores references to the positions of blocks on a file and their size @@ -65,7 +65,7 @@ impl NewFullBlockIndex { pub fn forward_reader( &self, needle: &[u8], - ) -> Option + '_> { + ) -> Option + '_> { self.0.forward_reader(needle) } } @@ -75,7 +75,7 @@ impl NewBlockIndex for NewFullBlockIndex { &self, key: &[u8], _: CachePolicy, - ) -> crate::Result> { + ) -> crate::Result> { Ok(self.0.get_highest_possible_block(key)) } @@ -83,17 +83,17 @@ impl NewBlockIndex for NewFullBlockIndex { &self, key: &[u8], _: CachePolicy, - ) -> crate::Result> { + ) -> crate::Result> { Ok(self.0.get_lowest_possible_block(key)) } - fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { + fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { todo!() } } /* impl std::ops::Deref for FullBlockIndex { - type Target = Box<[NewKeyedBlockHandle]>; + type Target = Box<[KeyedBlockHandle]>; fn deref(&self) -> &Self::Target { &self.0 diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index cd2a35be..331dfcd3 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -13,7 +13,7 @@ use varint_rs::{VarintReader, VarintWriter}; /// Points to a block on file #[derive(Copy, Clone, Debug, Default, Eq)] #[allow(clippy::module_name_repetitions)] -pub struct NewBlockHandle { +pub struct BlockHandle { /// Position of block in file offset: BlockOffset, @@ -21,7 +21,7 @@ pub struct NewBlockHandle { size: u32, } -impl NewBlockHandle { +impl BlockHandle { pub fn new(offset: BlockOffset, size: u32) -> Self { Self { offset, size } } @@ -35,25 +35,25 @@ impl NewBlockHandle { } } -impl PartialEq for NewBlockHandle { +impl PartialEq for BlockHandle { fn eq(&self, other: &Self) -> bool { self.offset == other.offset } } -impl Ord for NewBlockHandle { +impl Ord for BlockHandle { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.offset.cmp(&other.offset) } } -impl PartialOrd for NewBlockHandle { +impl PartialOrd for BlockHandle { fn partial_cmp(&self, other: &Self) -> Option { Some(self.offset.cmp(&other.offset)) } } -impl Encode for NewBlockHandle { +impl Encode for BlockHandle { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { writer.write_u64_varint(*self.offset)?; writer.write_u32_varint(self.size)?; @@ -61,7 +61,7 @@ impl Encode for NewBlockHandle { } } -impl Decode for NewBlockHandle { +impl Decode for BlockHandle { fn decode_from(reader: &mut R) -> Result where Self: Sized, @@ -79,24 +79,24 @@ impl Decode for NewBlockHandle { /// Points to a block on file #[derive(Clone, Debug, Eq)] #[allow(clippy::module_name_repetitions)] -pub struct NewKeyedBlockHandle { +pub struct KeyedBlockHandle { /// Key of last item in block end_key: UserKey, - inner: NewBlockHandle, + inner: BlockHandle, } -impl AsRef for NewKeyedBlockHandle { - fn as_ref(&self) -> &NewBlockHandle { +impl AsRef for KeyedBlockHandle { + fn as_ref(&self) -> &BlockHandle { &self.inner } } -impl NewKeyedBlockHandle { +impl KeyedBlockHandle { pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { Self { end_key, - inner: NewBlockHandle::new(offset, size), + inner: BlockHandle::new(offset, size), } } @@ -121,25 +121,25 @@ impl NewKeyedBlockHandle { } } -impl PartialEq for NewKeyedBlockHandle { +impl PartialEq for KeyedBlockHandle { fn eq(&self, other: &Self) -> bool { self.offset() == other.offset() } } -impl 
Ord for NewKeyedBlockHandle { +impl Ord for KeyedBlockHandle { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.offset().cmp(&other.offset()) } } -impl PartialOrd for NewKeyedBlockHandle { +impl PartialOrd for KeyedBlockHandle { fn partial_cmp(&self, other: &Self) -> Option { Some(self.offset().cmp(&other.offset())) } } -impl Encodable for NewKeyedBlockHandle { +impl Encodable for KeyedBlockHandle { fn encode_full_into( &self, writer: &mut W, diff --git a/src/segment/index_block/forward_reader.rs b/src/segment/index_block/forward_reader.rs index c2d85b02..88744692 100644 --- a/src/segment/index_block/forward_reader.rs +++ b/src/segment/index_block/forward_reader.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{IndexBlock, NewKeyedBlockHandle}; +use super::{IndexBlock, KeyedBlockHandle}; use crate::{segment::BlockOffset, Slice}; use std::io::Cursor; @@ -36,7 +36,7 @@ pub struct ParsedItem { } impl ParsedItem { - pub fn materialize(&self, bytes: &Slice) -> NewKeyedBlockHandle { + pub fn materialize(&self, bytes: &Slice) -> KeyedBlockHandle { let end_key = if let Some(prefix) = &self.prefix { let prefix_key = &bytes[prefix.0..prefix.1]; let rest_key = &bytes[self.end_key.0..self.end_key.1]; @@ -45,7 +45,7 @@ impl ParsedItem { bytes.slice(self.end_key.0..self.end_key.1) }; - NewKeyedBlockHandle::new(end_key, self.offset, self.size) + KeyedBlockHandle::new(end_key, self.offset, self.size) } } @@ -145,9 +145,9 @@ mod tests { #[allow(clippy::unwrap_used)] fn v3_index_block_simple() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items, 1)?; @@ -165,7 +165,7 @@ mod tests { }, }); - assert_eq!(block.item_count(), items.len()); + assert_eq!(block.len(), items.len()); let iter = block.forward_reader(b"a").unwrap(); assert_eq!(&items, &*iter.collect::>()); @@ -177,9 +177,9 @@ mod tests { #[allow(clippy::unwrap_used)] fn v3_index_block_seek() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items, 1)?; @@ -197,7 +197,7 @@ mod tests { }, }); - assert_eq!(block.item_count(), items.len()); + assert_eq!(block.len(), items.len()); { let iter = block.forward_reader(b"a").unwrap(); diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 558fe9cb..e4b4dd1a 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -5,7 +5,7 @@ mod block_handle; mod forward_reader; -pub use block_handle::{NewBlockHandle, NewKeyedBlockHandle}; +pub use block_handle::{BlockHandle, KeyedBlockHandle}; use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; use super::{ @@ -127,7 +127,7 @@ impl IndexBlock { 
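// Editor's sketch (not part of the patch) of the prefix restoration performed
// by ParsedItem::materialize above: a prefix-truncated index entry stores only
// a (prefix range, rest range) pair into the block's byte slice, and the full
// end key is their concatenation.
fn restore_key(bytes: &[u8], prefix: Option<(usize, usize)>, rest: (usize, usize)) -> Vec<u8> {
    let mut key = Vec::with_capacity((rest.1 - rest.0) + prefix.map_or(0, |(s, e)| e - s));
    if let Some((start, end)) = prefix {
        key.extend_from_slice(&bytes[start..end]); // shared prefix, taken from an earlier entry
    }
    key.extend_from_slice(&bytes[rest.0..rest.1]); // truncated suffix stored inline
    key
}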
pub fn forward_reader( &self, needle: &[u8], - ) -> Option + '_> { + ) -> Option + '_> { let offset = self .search_lowest(&self.get_binary_index_reader(), needle) .unwrap_or_default(); @@ -263,7 +263,7 @@ impl IndexBlock { } #[must_use] - pub fn get_lowest_possible_block(&self, needle: &[u8]) -> Option { + pub fn get_lowest_possible_block(&self, needle: &[u8]) -> Option { let binary_index = self.get_binary_index_reader(); /* @@ -299,7 +299,7 @@ impl IndexBlock { } #[must_use] - pub fn get_highest_possible_block(&self, needle: &[u8]) -> Option { + pub fn get_highest_possible_block(&self, needle: &[u8]) -> Option { let binary_index = self.get_binary_index_reader(); let offset = self.search_highest(&binary_index, needle)?; @@ -321,12 +321,12 @@ impl IndexBlock { } pub fn encode_items( - items: &[NewKeyedBlockHandle], + items: &[KeyedBlockHandle], restart_interval: u8, ) -> crate::Result> { let first_key = items.first().expect("chunk should not be empty").end_key(); - let mut serializer = Encoder::<'_, BlockOffset, NewKeyedBlockHandle>::new( + let mut serializer = Encoder::<'_, BlockOffset, KeyedBlockHandle>::new( items.len(), restart_interval, 0.0, // TODO: hard-coded for now @@ -351,9 +351,9 @@ mod tests { #[allow(clippy::unwrap_used)] fn v3_index_block_simple() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - NewKeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items, 1)?; @@ -399,9 +399,9 @@ mod tests { #[allow(clippy::unwrap_used)] fn v3_index_block_span() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), - NewKeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items, 1)?; @@ -441,10 +441,10 @@ mod tests { #[allow(clippy::unwrap_used)] fn v3_index_block_span_highest() -> crate::Result<()> { let items = [ - NewKeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000), - NewKeyedBlockHandle::new(b"c".into(), BlockOffset(6_000), 7_000), - NewKeyedBlockHandle::new(b"d".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"c".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"d".into(), BlockOffset(13_000), 5_000), ]; let bytes = IndexBlock::encode_items(&items, 1)?; @@ -491,7 +491,7 @@ mod tests { #[test] fn v3_index_block_one() -> crate::Result<()> { - let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); + let item = KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; // eprintln!("{bytes:?}"); @@ -531,7 +531,7 @@ mod tests { #[test] fn v3_index_block_one_highest() -> crate::Result<()> { - let item = NewKeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); + let item = 
KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; // eprintln!("{bytes:?}"); diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 60a9df22..d5c060cc 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -19,7 +19,7 @@ mod writer; pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; pub use id::{GlobalSegmentId, SegmentId}; -pub use index_block::{IndexBlock, NewBlockHandle, NewKeyedBlockHandle}; +pub use index_block::{IndexBlock, BlockHandle, KeyedBlockHandle}; pub use scanner::Scanner; pub use writer::Writer; @@ -116,7 +116,7 @@ impl Segment { self.metadata.id } - fn load_data_block(&self, handle: &NewBlockHandle) -> crate::Result { + fn load_data_block(&self, handle: &BlockHandle) -> crate::Result { let id = self.global_id(); if let Some(data_block) = self.cache.get_data_block(id, handle.offset()) { diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs index 5453e277..d3590830 100644 --- a/src/segment/trailer.rs +++ b/src/segment/trailer.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::index_block::NewBlockHandle; +use super::index_block::BlockHandle; use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, @@ -34,9 +34,9 @@ const TRAILER_SIZE: usize = 128; /// |--------------| #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub struct Trailer { - pub tli: NewBlockHandle, - pub index_blocks: Option, - pub filter: Option, // option + pub tli: BlockHandle, + pub index_blocks: Option, + pub filter: Option, // option // // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 // pub range_tombstones: BlockOffset, @@ -46,7 +46,7 @@ pub struct Trailer { // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 // pub range_filter: BlockOffset, - pub metadata: NewBlockHandle, + pub metadata: BlockHandle, } impl Trailer { @@ -111,13 +111,13 @@ impl Encode for Trailer { if let Some(handle) = &self.index_blocks { handle.encode_into(writer) } else { - NewBlockHandle::default().encode_into(writer) + BlockHandle::default().encode_into(writer) }?; if let Some(handle) = &self.filter { handle.encode_into(writer) } else { - NewBlockHandle::default().encode_into(writer) + BlockHandle::default().encode_into(writer) }?; self.metadata.encode_into(writer)?; @@ -128,10 +128,10 @@ impl Encode for Trailer { impl Decode for Trailer { fn decode_from(reader: &mut R) -> Result { - let tli = NewBlockHandle::decode_from(reader)?; - let index_blocks = NewBlockHandle::decode_from(reader)?; - let filter = NewBlockHandle::decode_from(reader)?; - let metadata = NewBlockHandle::decode_from(reader)?; + let tli = BlockHandle::decode_from(reader)?; + let index_blocks = BlockHandle::decode_from(reader)?; + let filter = BlockHandle::decode_from(reader)?; + let metadata = BlockHandle::decode_from(reader)?; Ok(Self { index_blocks: match *index_blocks.offset() { @@ -158,10 +158,10 @@ mod tests { #[test] fn v3_file_offsets_roundtrip() -> crate::Result<()> { let before = Trailer { - tli: NewBlockHandle::new(BlockOffset(15), 5), - index_blocks: Some(NewBlockHandle::new(BlockOffset(20), 5)), - filter: Some(NewBlockHandle::new(BlockOffset(25), 5)), - metadata: NewBlockHandle::new(BlockOffset(30), 5), + tli: BlockHandle::new(BlockOffset(15), 5), + index_blocks: Some(BlockHandle::new(BlockOffset(20), 5)), + filter: Some(BlockHandle::new(BlockOffset(25), 5)), + 
metadata: BlockHandle::new(BlockOffset(30), 5), }; let buf = before.encode_into_vec(); diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index c0455e8a..898c8ffb 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -5,7 +5,7 @@ use crate::{ segment::{ block::Header as BlockHeader, - index_block::{NewBlockHandle, NewKeyedBlockHandle}, + index_block::{BlockHandle, KeyedBlockHandle}, Block, BlockOffset, IndexBlock, }, CompressionType, @@ -13,7 +13,7 @@ use crate::{ pub trait BlockIndexWriter { /// Registers a data block in the block index. - fn register_data_block(&mut self, block_handle: NewKeyedBlockHandle) -> crate::Result<()>; + fn register_data_block(&mut self, block_handle: KeyedBlockHandle) -> crate::Result<()>; /// Writes the block index to a file. /// @@ -21,7 +21,7 @@ pub trait BlockIndexWriter { fn finish( &mut self, block_file_writer: &mut W, - ) -> crate::Result<(NewBlockHandle, Option)>; + ) -> crate::Result<(BlockHandle, Option)>; fn use_compression(&mut self, compression: CompressionType); @@ -30,7 +30,7 @@ pub trait BlockIndexWriter { pub struct FullIndexWriter { compression: CompressionType, - block_handles: Vec, + block_handles: Vec, } impl FullIndexWriter { @@ -51,7 +51,7 @@ impl BlockIndexWriter for FullIndexWriter self.compression = compression; } - fn register_data_block(&mut self, block_handle: NewKeyedBlockHandle) -> crate::Result<()> { + fn register_data_block(&mut self, block_handle: KeyedBlockHandle) -> crate::Result<()> { log::trace!( "Registering block at {:?} with size {} [end_key={:?}]", block_handle.offset(), @@ -67,7 +67,7 @@ impl BlockIndexWriter for FullIndexWriter fn finish( &mut self, block_file_writer: &mut W, - ) -> crate::Result<(NewBlockHandle, Option)> { + ) -> crate::Result<(BlockHandle, Option)> { let tli_ptr = BlockOffset(block_file_writer.stream_position()?); let bytes = @@ -84,7 +84,7 @@ impl BlockIndexWriter for FullIndexWriter self.block_handles.len(), ); - Ok((NewBlockHandle::new(tli_ptr, bytes_written), None)) + Ok((BlockHandle::new(tli_ptr, bytes_written), None)) } } @@ -109,8 +109,8 @@ impl BlockIndexWriter for FullIndexWriter buffer_size: u32, - block_handles: Vec, - tli_pointers: Vec, + block_handles: Vec, + tli_pointers: Vec, pub block_count: usize, } @@ -154,7 +154,7 @@ impl Writer { let last = self.block_handles.pop().expect("Chunk should not be empty"); let index_block_handle = - NewKeyedBlockHandle::new(last.into_end_key(), self.file_pos, bytes_written); + KeyedBlockHandle::new(last.into_end_key(), self.file_pos, bytes_written); self.tli_pointers.push(index_block_handle); @@ -187,9 +187,9 @@ impl Writer { // NOTE: Truncation is OK, because a key is bound by 65535 bytes, so can never exceed u32s #[allow(clippy::cast_possible_truncation)] - let block_handle_size = (end_key.len() + std::mem::size_of::()) as u32; + let block_handle_size = (end_key.len() + std::mem::size_of::()) as u32; - let block_handle = NewKeyedBlockHandle::new(end_key, offset, size); + let block_handle = KeyedBlockHandle::new(end_key, offset, size); self.block_handles.push(block_handle); @@ -206,7 +206,7 @@ impl Writer { &mut self, block_file_writer: &mut BufWriter, file_offset: BlockOffset, - ) -> crate::Result { + ) -> crate::Result { block_file_writer.write_all(&self.write_buffer)?; log::trace!("Wrote index blocks into segment file"); @@ -231,14 +231,14 @@ impl Writer { bytes_written, ); - Ok(NewBlockHandle::new(tli_ptr, bytes_written)) + Ok(BlockHandle::new(tli_ptr, bytes_written)) } /// Returns the offset in the 
file to TLI pub fn finish( &mut self, block_file_writer: &mut BufWriter, - ) -> crate::Result { + ) -> crate::Result { if self.buffer_size > 0 { self.write_block()?; } diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 369d31b4..0a326cf3 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -3,12 +3,12 @@ mod meta; use super::{ block::Header as BlockHeader, filter::BloomConstructionPolicy, trailer::Trailer, Block, - BlockOffset, DataBlock, NewKeyedBlockHandle, + BlockOffset, DataBlock, KeyedBlockHandle, }; use crate::{ coding::Encode, file::fsync_directory, - segment::{filter::standard_bloom::Builder, index_block::NewBlockHandle}, + segment::{filter::standard_bloom::Builder, index_block::BlockHandle}, time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, }; @@ -181,7 +181,7 @@ impl Writer { let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; self.index_writer - .register_data_block(NewKeyedBlockHandle::new( + .register_data_block(KeyedBlockHandle::new( last.key.user_key.clone(), self.meta.file_pos, bytes_written, @@ -269,7 +269,7 @@ impl Writer { let bytes_written = (BlockHeader::serialized_len() as u32) + block.data_length; - Some(NewBlockHandle::new(BlockOffset(filter_ptr), bytes_written)) + Some(BlockHandle::new(BlockOffset(filter_ptr), bytes_written)) } }; log::trace!("filter_ptr={filter_handle:?}"); @@ -354,7 +354,7 @@ impl Writer { let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - NewBlockHandle::new(metadata_start, bytes_written as u32) + BlockHandle::new(metadata_start, bytes_written as u32) }; // Bundle all the file offsets From 6b07a239a933e0f38178d74aea2c55b3724e29e7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Apr 2025 16:06:20 +0200 Subject: [PATCH 088/613] fmt --- src/segment/block/header.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 455f8cc7..b359a5b3 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -2,13 +2,13 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::coding::{Encode, EncodeError, Decode, DecodeError}; +use super::offset::BlockOffset; +use super::Checksum; +use crate::coding::{Decode, DecodeError, Encode, EncodeError}; use crate::file::MAGIC_BYTES; use byteorder::LittleEndian; -use byteorder::{ReadBytesExt,WriteBytesExt}; +use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; -use super::offset::BlockOffset; -use super::Checksum; /// Header of a disk-based block #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -35,7 +35,7 @@ impl Header { // Backlink + std::mem::size_of::() // On-disk size - + std::mem::size_of::() + + std::mem::size_of::() // Uncompressed data length + std::mem::size_of::() } @@ -72,7 +72,6 @@ impl Decode for Header { return Err(DecodeError::InvalidHeader("Block")); } - // Read checksum let checksum = reader.read_u64::()?; @@ -92,4 +91,4 @@ impl Decode for Header { uncompressed_length, }) } -} \ No newline at end of file +} From b97361ce5bc113262e73028ad63d36a05c8009b1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Apr 2025 16:06:30 +0200 Subject: [PATCH 089/613] fmt --- src/segment/block_index/mod.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index e178f570..252cda5e 100644 --- 
a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -21,10 +21,7 @@ pub trait NewBlockIndex { ) -> crate::Result>; /// Returns a handle to the last block. - fn get_last_block_handle( - &self, - cache_policy: CachePolicy, - ) -> crate::Result; + fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result; } /// The block index stores references to the positions of blocks on a file and their size From 09cc22ac3b9e47e04b68e2a1a1d0a57ea6d8f092 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Apr 2025 16:06:37 +0200 Subject: [PATCH 090/613] fmt --- src/segment/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d5c060cc..e283f535 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -19,7 +19,7 @@ mod writer; pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; pub use id::{GlobalSegmentId, SegmentId}; -pub use index_block::{IndexBlock, BlockHandle, KeyedBlockHandle}; +pub use index_block::{BlockHandle, IndexBlock, KeyedBlockHandle}; pub use scanner::Scanner; pub use writer::Writer; From 71a1ea736cae5e159b335a7edb80fc09274a2821 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Apr 2025 20:07:09 +0200 Subject: [PATCH 091/613] refactor --- src/cache.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index e8b1fdd0..33a7590c 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -40,9 +40,7 @@ impl Weighter for BlockWeighter { use Item::{Blob, Block}; match item { - Block(b) => { - (Header::serialized_len() as u64) + Into::::into(b.header.uncompressed_length) - } + Block(b) => (Header::serialized_len() as u64) + u64::from(b.header.uncompressed_length), Blob(b) => b.len() as u64, } } From e44caab45b8f0081b833f51675157a3dd07dbd41 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Apr 2025 20:07:22 +0200 Subject: [PATCH 092/613] adjust for vlog changes --- Cargo.toml | 2 +- src/blob_tree/mod.rs | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5064db0b..9b051d1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,7 @@ quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" tempfile = "3.12.0" -value-log = { version = "~1.8", default-features = false, features = [] } +value-log = { version = "~1.9", default-features = false, features = [] } varint-rs = "2.2.0" xxhash-rust = { version = "0.8.12", features = ["xxh3"] } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 6c9635b8..97b3a2cf 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -85,7 +85,12 @@ impl BlobTree { let vlog_cfg = value_log::Config::::new(MyBlobCache(config.cache.clone())) .segment_size_bytes(config.blob_file_target_size) - .compression(MyCompressor(config.blob_compression)); + .compression(match config.blob_compression { + crate::CompressionType::None => None, + + #[cfg(any(feature = "lz4", feature = "miniz"))] + c => Some(MyCompressor(c)), + }); let index: IndexTree = config.open()?.into(); From 024b9841b4201e44c92365c75bd2e390dd445c5b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 21 Apr 2025 14:11:56 +0200 Subject: [PATCH 093/613] fmt --- src/tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index ea0585f8..4e9b9041 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -651,7 +651,7 @@ impl Tree { if let Some(entry) = 
memtable_lock.get(key, seqno) { return Ok(ignore_tombstone_value(entry)); - }; + } drop(memtable_lock); From 38b23221322bad10e77f7465b5f0e03734fd7e11 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sun, 20 Apr 2025 12:23:41 +0800 Subject: [PATCH 094/613] feat: implement blocked bloom --- benches/bloom.rs | 62 ++++++++- src/segment/filter/blocked_bloom/builder.rs | 138 ++++++++++++++++++++ src/segment/filter/blocked_bloom/mod.rs | 116 ++++++++++++++++ src/segment/filter/mod.rs | 4 + 4 files changed, 316 insertions(+), 4 deletions(-) create mode 100644 src/segment/filter/blocked_bloom/builder.rs create mode 100644 src/segment/filter/blocked_bloom/mod.rs diff --git a/benches/bloom.rs b/benches/bloom.rs index 88902a64..a19850e8 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -1,11 +1,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; -fn filter_construction(c: &mut Criterion) { +fn standard_filter_construction(c: &mut Criterion) { use lsm_tree::segment::filter::standard_bloom::Builder; let mut filter = Builder::with_fp_rate(500_000_000, 0.01); - c.bench_function("bloom filter add key", |b| { + c.bench_function("standard bloom filter add key", |b| { b.iter(|| { let key = nanoid::nanoid!(); filter.set_with_hash(Builder::get_hash(key.as_bytes())); @@ -13,7 +13,7 @@ fn filter_construction(c: &mut Criterion) { }); } -fn filter_contains(c: &mut Criterion) { +fn standard_filter_contains(c: &mut Criterion) { use lsm_tree::segment::filter::standard_bloom::Builder; let keys = (0..100_000u128) @@ -49,5 +49,59 @@ fn filter_contains(c: &mut Criterion) { } } -criterion_group!(benches, filter_construction, filter_contains,); +fn blocked_filter_construction(c: &mut Criterion) { + use lsm_tree::segment::filter::blocked_bloom::Builder; + + let mut filter = Builder::with_fp_rate(500_000_000, 0.01); + + c.bench_function("blocked bloom filter add key", |b| { + b.iter(|| { + let key = nanoid::nanoid!(); + filter.set_with_hash(Builder::get_hash(key.as_bytes())); + }); + }); +} + +fn blocked_filter_contains(c: &mut Criterion) { + use lsm_tree::segment::filter::blocked_bloom::Builder; + + let keys = (0..100_000u128) + .map(|x| x.to_be_bytes().to_vec()) + .collect::>(); + + for fpr in [0.01, 0.001, 0.0001, 0.00001] { + let mut filter = Builder::with_fp_rate(100_000_000, fpr); + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let mut rng = rand::rng(); + + let filter = filter.build(); + + c.bench_function( + &format!( + "blocked bloom filter contains key, true positive ({}%)", + fpr * 100.0, + ), + |b| { + b.iter(|| { + use rand::seq::IndexedRandom; + + let sample = keys.choose(&mut rng).unwrap(); + let hash = Builder::get_hash(sample); + assert!(filter.contains_hash(hash)); + }); + }, + ); + } +} +criterion_group!( + benches, + standard_filter_construction, + standard_filter_contains, + blocked_filter_construction, + blocked_filter_contains, +); criterion_main!(benches); diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs new file mode 100644 index 00000000..49da48ff --- /dev/null +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -0,0 +1,138 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{super::bit_array::Builder as BitArrayBuilder, BlockedBloomFilter}; +use crate::segment::filter::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; + +/// Two hashes that are used for double hashing 
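// Editor's sketch (not part of the patch): both filters derive all k probe
// positions from a single 128-bit xxh3 hash split into the two 64-bit halves
// of this CompositeHash, instead of running k independent hash functions.
// The recurrence mirrors the probe loops below, in their final form after the
// later "fix: use wrapping_mul" patch in this series.
fn probe_positions(hash: (u64, u64), k: u64, m: u64) -> Vec<u64> {
    let (mut h1, mut h2) = hash;
    (1..=k)
        .map(|i| {
            let pos = h1 % m; // current probe, bounded by the filter size in bits
            h1 = h1.wrapping_add(h2);
            h2 = h2.wrapping_mul(i);
            pos
        })
        .collect()
}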
+pub type CompositeHash = (u64, u64); + +#[derive(Debug, Eq, PartialEq)] +#[allow(clippy::module_name_repetitions)] +pub struct Builder { + /// Raw bytes exposed as bit array + inner: BitArrayBuilder, + + /// Number of hash functions + k: usize, + + /// Number of blocks in the blocked bloom filter + num_blocks: usize, +} + +#[allow(clippy::len_without_is_empty)] +impl Builder { + #[must_use] + pub fn build(self) -> BlockedBloomFilter { + BlockedBloomFilter { + inner: BitArrayReader::new(self.inner.bytes().into()), + k: self.k, + num_blocks: self.num_blocks, + } + } + + /// Constructs a bloom filter that can hold `n` items + /// while maintaining a certain false positive rate `fpr`. + #[must_use] + pub fn with_fp_rate(n: usize, fpr: f32) -> Self { + use std::f32::consts::LN_2; + + assert!(n > 0); + + // NOTE: Some sensible minimum + let fpr = fpr.max(0.000_001); + + // TODO: m and k is still calculated by traditional standard bloom filter formula + let m = Self::calculate_m(n, fpr); + let bpk = m / n; + let k = (((bpk as f32) * LN_2) as usize).max(1); + + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); + + Self { + inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES), + k, + num_blocks, + } + } + + /// Constructs a bloom filter that can hold `n` items + /// with `bpk` bits per key. + /// + /// 10 bits per key is a sensible default. + #[must_use] + pub fn with_bpk(n: usize, bpk: u8) -> Self { + use std::f32::consts::LN_2; + + assert!(bpk > 0); + assert!(n > 0); + + let bpk = bpk as usize; + + let m = n * bpk; + let k = (((bpk as f32) * LN_2) as usize).max(1); + + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); + + Self { + inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES), + k, + num_blocks, + } + } + + fn calculate_m(n: usize, fp_rate: f32) -> usize { + use std::f32::consts::LN_2; + + let n = n as f32; + let ln2_squared = LN_2.powi(2); + + let numerator = n * fp_rate.ln(); + let m = -(numerator / ln2_squared); + + // Round up to next byte + ((m / 8.0).ceil() * 8.0) as usize + } + + /// Adds the key to the filter. + pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { + let block_idx = h1 % (self.num_blocks as u64); + + for i in 1..(self.k as u64) { + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(i); + + let idx = h1 % (CACHE_LINE_BYTES as u64); + + self.inner + .enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize)); + } + } + + pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize { + block_idx * CACHE_LINE_BYTES as usize + idx_in_block + } + + /// Gets the hash of a key. 
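// Editor's worked example (not part of the patch) for the sizing math above:
// calculate_m evaluates m = -n * ln(fpr) / ln(2)^2, rounded up to a whole
// byte, and with_fp_rate then derives k from the resulting bits per key.
fn sizing_example() {
    use std::f32::consts::LN_2;
    let (n, fpr) = (1_000usize, 0.01f32);
    let m = -((n as f32) * fpr.ln()) / LN_2.powi(2);
    let m = ((m / 8.0).ceil() * 8.0) as usize; // 9_592 bits, as the unit test below asserts
    let bpk = m / n; // 9 bits per key (integer division)
    let k = (((bpk as f32) * LN_2) as usize).max(1); // 6 probes
    assert_eq!((m, k), (9_592, 6));
}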
+ #[must_use] + pub fn get_hash(key: &[u8]) -> CompositeHash { + let h0 = xxhash_rust::xxh3::xxh3_128(key); + let h1 = (h0 >> 64) as u64; + let h2 = h0 as u64; + (h1, h2) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn bloom_calculate_m() { + assert_eq!(9_592, Builder::calculate_m(1_000, 0.01)); + assert_eq!(4_800, Builder::calculate_m(1_000, 0.1)); + assert_eq!(4_792_536, Builder::calculate_m(1_000_000, 0.1)); + } +} diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs new file mode 100644 index 00000000..42c4dbc9 --- /dev/null +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -0,0 +1,116 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +mod builder; +use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; +pub use builder::Builder; + +/// Two hashes that are used for double hashing +pub type CompositeHash = (u64, u64); + +pub struct BlockedBloomFilter { + /// Raw bytes exposed as bit array + inner: BitArrayReader, + + /// Number of hash functions + k: usize, + + /// Number of blocks in the blocked bloom filter + num_blocks: usize, +} + +// TODO: Implement Encode and Decode for BlockedBloomFilter + +impl BlockedBloomFilter { + /// Size of bloom filter in bytes + #[must_use] + pub fn len(&self) -> usize { + self.inner.bytes().len() + } + + fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); + Self { + inner: BitArrayReader::new(slice), + k, + num_blocks, + } + } + + /// Returns `true` if the hash may be contained. + /// + /// Will never have a false negative. + #[must_use] + pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { + let block_idx = h1 % (self.num_blocks as u64); + + for i in 1..(self.k as u64) { + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(i); + + let idx = h1 % (CACHE_LINE_BYTES as u64); + + // NOTE: should be in bounds because of modulo + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + if !self.has_bit(block_idx as usize, idx as usize) { + return false; + } + } + + true + } + + /// Returns `true` if the item may be contained. + /// + /// Will never have a false negative. + #[must_use] + pub fn contains(&self, key: &[u8]) -> bool { + self.contains_hash(Self::get_hash(key)) + } + + /// Returns `true` if the bit at `idx` is `1`. + fn has_bit(&self, block_idx: usize, idx_in_block: usize) -> bool { + self.inner + .get(Builder::get_bit_idx(block_idx, idx_in_block)) + } + + /// Gets the hash of a key. 
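// Editor's sketch (not part of the patch): the point of the blocked layout is
// that every probe for a key lands inside one 64-byte (512-bit) cache-line
// sized block, so a lookup touches at most one cache line instead of k
// scattered ones. Global bit addressing, in the bits-per-block form that the
// later "fix bits calculation" patches in this series settle on:
fn global_bit_index(block_idx: usize, bit_in_block: usize) -> usize {
    const BLOCK_BITS: usize = 64 * 8; // CACHE_LINE_BYTES * 8
    debug_assert!(bit_in_block < BLOCK_BITS);
    block_idx * BLOCK_BITS + bit_in_block // e.g. block 3, bit 17 => bit 1553
}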
+ pub fn get_hash(key: &[u8]) -> CompositeHash { + Builder::get_hash(key) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn blocked_bloom_basic() { + let mut filter = Builder::with_fp_rate(10, 0.0001); + let keys = [ + b"item0" as &[u8], + b"item1", + b"item2", + b"item3", + b"item4", + b"item5", + b"item6", + b"item7", + b"item8", + b"item9", + ]; + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let filter = filter.build(); + + for key in &keys { + assert!(filter.contains(key)); + } + + assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + } +} diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 2f53ab5b..f781d62d 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -3,8 +3,12 @@ // (found in the LICENSE-* files in the repository) pub mod bit_array; +pub mod blocked_bloom; pub mod standard_bloom; +const CACHE_LINE_BYTES: usize = 64; + +use blocked_bloom::Builder as BlockedBloomFilterBuilder; use standard_bloom::Builder as StandardBloomFilterBuilder; #[derive(Copy, Clone, Debug)] From 00b1d6e13cbbbfd1818e5db01bcb292fca488cf0 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Thu, 24 Apr 2025 11:52:00 +0800 Subject: [PATCH 095/613] chore: use AMQFilter trait and decode StandardBloomFilter from AMQFilterBuilder --- benches/bloom.rs | 2 +- src/segment/filter/mod.rs | 62 ++++++++++++- src/segment/filter/standard_bloom/mod.rs | 109 +++++++++++------------ src/segment/inner.rs | 5 +- src/segment/mod.rs | 6 +- 5 files changed, 116 insertions(+), 68 deletions(-) diff --git a/benches/bloom.rs b/benches/bloom.rs index a19850e8..25a67a5e 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -14,7 +14,7 @@ fn standard_filter_construction(c: &mut Criterion) { } fn standard_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::standard_bloom::Builder; + use lsm_tree::segment::filter::{standard_bloom::Builder, AMQFilter}; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index f781d62d..90d653e8 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -6,10 +6,13 @@ pub mod bit_array; pub mod blocked_bloom; pub mod standard_bloom; -const CACHE_LINE_BYTES: usize = 64; +use crate::{coding::DecodeError, file::MAGIC_BYTES}; +use byteorder::ReadBytesExt; +use std::io::Read; + +use standard_bloom::{Builder as StandardBloomFilterBuilder, StandardBloomFilter}; -use blocked_bloom::Builder as BlockedBloomFilterBuilder; -use standard_bloom::Builder as StandardBloomFilterBuilder; +const CACHE_LINE_BYTES: usize = 64; #[derive(Copy, Clone, Debug)] pub enum BloomConstructionPolicy { @@ -42,3 +45,56 @@ impl BloomConstructionPolicy { } } } + +enum FilterType { + StandardBloom = 0, + BlockedBloom = 1, +} + +impl TryFrom for FilterType { + type Error = (); + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(Self::StandardBloom), + 1 => Ok(Self::BlockedBloom), + _ => Err(()), + } + } +} + +pub trait AMQFilter: Sync + Send { + fn bytes(&self) -> &[u8]; + fn len(&self) -> usize; + fn contains(&self, item: &[u8]) -> bool; + fn contains_hash(&self, hash: (u64, u64)) -> bool; +} + +pub struct AMQFilterBuilder {} + +impl AMQFilterBuilder { + pub fn decode_from(reader: &mut R) -> Result, DecodeError> { + // Check header + let mut magic = [0u8; MAGIC_BYTES.len()]; + reader.read_exact(&mut magic)?; + + if magic != MAGIC_BYTES { + return Err(DecodeError::InvalidHeader("BloomFilter")); + } + + let filter_type = 
reader.read_u8()?; + let filter_type = FilterType::try_from(filter_type) + .map_err(|_| DecodeError::InvalidHeader("FilterType"))?; + + match filter_type { + FilterType::StandardBloom => StandardBloomFilter::decode_from(reader) + .map(Self::wrap_filter) + .map_err(|e| DecodeError::from(e)), + // TODO: Implement + FilterType::BlockedBloom => Err(DecodeError::InvalidHeader("BlockedBloom")), + } + } + + fn wrap_filter(filter: T) -> Box { + Box::new(filter) + } +} diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 7670a640..5dfaa66f 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -1,4 +1,4 @@ -use super::bit_array::BitArrayReader; +use super::{bit_array::BitArrayReader, AMQFilter}; use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, @@ -30,6 +30,49 @@ pub struct StandardBloomFilter { } // TODO: change encode/decode to be Filter enum +impl AMQFilter for StandardBloomFilter { + /// Size of bloom filter in bytes. + #[must_use] + fn len(&self) -> usize { + self.inner.bytes().len() + } + + /// Returns the raw bytes of the filter. + fn bytes(&self) -> &[u8] { + self.inner.bytes() + } + + /// Returns `true` if the item may be contained. + /// + /// Will never have a false negative. + #[must_use] + fn contains(&self, key: &[u8]) -> bool { + self.contains_hash(Self::get_hash(key)) + } + + /// Returns `true` if the hash may be contained. + /// + /// Will never have a false negative. + #[must_use] + fn contains_hash(&self, hash: CompositeHash) -> bool { + let (mut h1, mut h2) = hash; + + for i in 1..=(self.k as u64) { + let idx = h1 % (self.m as u64); + + // NOTE: should be in bounds because of modulo + #[allow(clippy::expect_used)] + if !self.has_bit(idx as usize) { + return false; + } + + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_mul(i); + } + + true + } +} impl Encode for StandardBloomFilter { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { @@ -50,20 +93,10 @@ impl Encode for StandardBloomFilter { } } -impl Decode for StandardBloomFilter { - fn decode_from(reader: &mut R) -> Result { - // Check header - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(DecodeError::InvalidHeader("BloomFilter")); - } - - // NOTE: Filter type (unused) - let filter_type = reader.read_u8()?; - assert_eq!(0, filter_type, "Invalid filter type"); - +#[allow(clippy::len_without_is_empty)] +impl StandardBloomFilter { + // To be used by AMQFilter after magic bytes and filter type have been read and parsed + pub(super) fn decode_from(reader: &mut R) -> Result { // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; assert_eq!(0, hash_type, "Invalid bloom hash type"); @@ -76,15 +109,6 @@ impl Decode for StandardBloomFilter { Ok(Self::from_raw(m, k, bytes.into())) } -} - -#[allow(clippy::len_without_is_empty)] -impl StandardBloomFilter { - /// Size of bloom filter in bytes. - #[must_use] - pub fn len(&self) -> usize { - self.inner.bytes().len() - } fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { Self { @@ -94,37 +118,6 @@ impl StandardBloomFilter { } } - /// Returns `true` if the hash may be contained. - /// - /// Will never have a false negative. 
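// Editor's note (not part of the patch): "never a false negative" holds
// because set_with_hash and contains_hash walk the exact same probe sequence,
// so a stored key always finds all of its bits set. Only false positives
// remain, at an expected rate of roughly (1 - e^(-k*n/m))^k:
fn expected_fpr(n: f64, m: f64, k: f64) -> f64 {
    (1.0 - (-k * n / m).exp()).powf(k) // ~0.010 for n=1000, m=9592, k=6
}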
- #[must_use] - pub fn contains_hash(&self, hash: CompositeHash) -> bool { - let (mut h1, mut h2) = hash; - - for i in 1..=(self.k as u64) { - let idx = h1 % (self.m as u64); - - // NOTE: should be in bounds because of modulo - #[allow(clippy::expect_used)] - if !self.has_bit(idx as usize) { - return false; - } - - h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_mul(i); - } - - true - } - - /// Returns `true` if the item may be contained. - /// - /// Will never have a false negative. - #[must_use] - pub fn contains(&self, key: &[u8]) -> bool { - self.contains_hash(Self::get_hash(key)) - } - /// Returns `true` if the bit at `idx` is `1`. fn has_bit(&self, idx: usize) -> bool { self.inner.get(idx) @@ -138,6 +131,8 @@ impl StandardBloomFilter { #[cfg(test)] mod tests { + use crate::segment::filter::AMQFilterBuilder; + use super::*; use std::fs::File; use test_log::test; @@ -174,9 +169,9 @@ mod tests { drop(file); let mut file = File::open(&path)?; - let filter_copy = StandardBloomFilter::decode_from(&mut file)?; + let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; - assert_eq!(filter.inner, filter_copy.inner); + assert_eq!(filter.inner.bytes(), filter_copy.bytes()); for key in keys { assert!(filter.contains(&**key)); diff --git a/src/segment/inner.rs b/src/segment/inner.rs index ccd4bd88..8cdda47f 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -3,8 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{ - block_index::NewBlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, - trailer::Trailer, + block_index::NewBlockIndexImpl, filter::AMQFilter, meta::ParsedMeta, trailer::Trailer, }; use crate::{ cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, @@ -39,7 +38,7 @@ pub struct Inner { pub cache: Arc, /// Pinned AMQ filter - pub pinned_filter: Option, + pub pinned_filter: Option>, // /// Pinned filter // #[doc(hidden)] diff --git a/src/segment/mod.rs b/src/segment/mod.rs index e283f535..1e43f838 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -27,7 +27,7 @@ use crate::{ cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; -use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; +use filter::{standard_bloom::CompositeHash, AMQFilterBuilder}; use inner::Inner; use meta::ParsedMeta; use std::{ @@ -341,8 +341,6 @@ impl Segment { let pinned_filter = trailer .filter .map(|filter_ptr| { - use crate::coding::Decode; - log::debug!("Reading filter block for pinning, with filter_ptr={filter_ptr:?}"); let block = Block::from_file( @@ -353,7 +351,7 @@ impl Segment { )?; let mut reader = &block.data[..]; - StandardBloomFilter::decode_from(&mut reader).map_err(Into::::into) + AMQFilterBuilder::decode_from(&mut reader).map_err(Into::::into) }) .transpose()?; From 7e2bad766dee2d6c390d21030b502ea115ce49f7 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Thu, 24 Apr 2025 12:47:27 +0800 Subject: [PATCH 096/613] feat: encode and decode BlockedBloomFilter --- benches/bloom.rs | 2 +- src/segment/filter/blocked_bloom/mod.rs | 126 ++++++++++++++++++++--- src/segment/filter/mod.rs | 10 +- src/segment/filter/standard_bloom/mod.rs | 10 +- 4 files changed, 126 insertions(+), 22 deletions(-) diff --git a/benches/bloom.rs b/benches/bloom.rs index 25a67a5e..d3d8f2bc 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -63,7 +63,7 @@ fn blocked_filter_construction(c: &mut Criterion) { } fn 
blocked_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::blocked_bloom::Builder; + use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQFilter}; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 42c4dbc9..38707ccc 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -3,8 +3,14 @@ // (found in the LICENSE-* files in the repository) mod builder; -use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; +use super::{bit_array::BitArrayReader, AMQFilter, CACHE_LINE_BYTES}; +use crate::{ + coding::{DecodeError, Encode, EncodeError}, + file::MAGIC_BYTES, +}; pub use builder::Builder; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use std::io::{Read, Write}; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); @@ -20,29 +26,22 @@ pub struct BlockedBloomFilter { num_blocks: usize, } -// TODO: Implement Encode and Decode for BlockedBloomFilter +impl AMQFilter for BlockedBloomFilter { + fn bytes(&self) -> &[u8] { + self.inner.bytes() + } -impl BlockedBloomFilter { /// Size of bloom filter in bytes #[must_use] - pub fn len(&self) -> usize { + fn len(&self) -> usize { self.inner.bytes().len() } - fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { - let num_blocks = m.div_ceil(CACHE_LINE_BYTES); - Self { - inner: BitArrayReader::new(slice), - k, - num_blocks, - } - } - /// Returns `true` if the hash may be contained. /// /// Will never have a false negative. #[must_use] - pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { + fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { let block_idx = h1 % (self.num_blocks as u64); for i in 1..(self.k as u64) { @@ -65,10 +64,56 @@ impl BlockedBloomFilter { /// /// Will never have a false negative. #[must_use] - pub fn contains(&self, key: &[u8]) -> bool { + fn contains(&self, key: &[u8]) -> bool { self.contains_hash(Self::get_hash(key)) } + fn filter_type(&self) -> super::FilterType { + super::FilterType::BlockedBloom + } +} + +impl Encode for BlockedBloomFilter { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + // Write header + writer.write_all(&MAGIC_BYTES)?; + + writer.write_u8(super::FilterType::BlockedBloom as u8)?; + + // NOTE: Hash type (unused) + writer.write_u8(0)?; + + writer.write_u64::(self.num_blocks as u64)?; + writer.write_u64::(self.k as u64)?; + writer.write_all(self.inner.bytes())?; + + Ok(()) + } +} + +impl BlockedBloomFilter { + // To be used by AMQFilter after magic bytes and filter type have been read and parsed + pub(super) fn decode_from(reader: &mut R) -> Result { + // NOTE: Hash type (unused) + let hash_type = reader.read_u8()?; + assert_eq!(0, hash_type, "Invalid bloom hash type"); + + let num_blocks = reader.read_u64::()? as usize; + let k = reader.read_u64::()? as usize; + + let mut bytes = vec![0; num_blocks * CACHE_LINE_BYTES]; + reader.read_exact(&mut bytes)?; + + Ok(Self::from_raw(num_blocks, k, bytes.into())) + } + + fn from_raw(num_blocks: usize, k: usize, slice: crate::Slice) -> Self { + Self { + inner: BitArrayReader::new(slice), + k, + num_blocks, + } + } /// Returns `true` if the bit at `idx` is `1`. 
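// Editor's sketch (not part of the patch, and assuming the usual LSB-first
// byte layout): a bit-array read resolves a global bit index into a byte and
// a mask, which is what the BitArrayReader behind has_bit below boils down to.
fn get_bit(bytes: &[u8], idx: usize) -> bool {
    let byte = bytes[idx / 8]; // containing byte
    let mask = 1u8 << (idx % 8); // bit within that byte
    byte & mask != 0
}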
fn has_bit(&self, block_idx: usize, idx_in_block: usize) -> bool { self.inner @@ -84,6 +129,57 @@ impl BlockedBloomFilter { #[cfg(test)] mod tests { use super::*; + use crate::segment::filter::{AMQFilterBuilder, FilterType}; + + use std::fs::File; + use test_log::test; + + #[test] + fn blocked_bloom_serde_round_trip() -> crate::Result<()> { + let dir = tempfile::tempdir()?; + + let path = dir.path().join("bf"); + let mut file = File::create(&path)?; + + let mut filter = Builder::with_fp_rate(10, 0.0001); + + let keys = &[ + b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", + b"item8", b"item9", + ]; + + for key in keys { + filter.set_with_hash(BlockedBloomFilter::get_hash(*key)); + } + + let filter = filter.build(); + + for key in keys { + assert!(filter.contains(&**key)); + } + assert!(!filter.contains(b"asdasads")); + assert!(!filter.contains(b"item10")); + assert!(!filter.contains(b"cxycxycxy")); + + filter.encode_into(&mut file)?; + file.sync_all()?; + drop(file); + + let mut file = File::open(&path)?; + let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; + + assert_eq!(filter.inner.bytes(), filter_copy.bytes()); + assert_eq!(FilterType::BlockedBloom, filter_copy.filter_type()); + + for key in keys { + assert!(filter.contains(&**key)); + } + assert!(!filter_copy.contains(b"asdasads")); + assert!(!filter_copy.contains(b"item10")); + assert!(!filter_copy.contains(b"cxycxycxy")); + + Ok(()) + } #[test] fn blocked_bloom_basic() { diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 90d653e8..bc80ca12 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -7,6 +7,7 @@ pub mod blocked_bloom; pub mod standard_bloom; use crate::{coding::DecodeError, file::MAGIC_BYTES}; +use blocked_bloom::BlockedBloomFilter; use byteorder::ReadBytesExt; use std::io::Read; @@ -46,7 +47,8 @@ impl BloomConstructionPolicy { } } -enum FilterType { +#[derive(PartialEq, Debug)] +pub enum FilterType { StandardBloom = 0, BlockedBloom = 1, } @@ -67,6 +69,7 @@ pub trait AMQFilter: Sync + Send { fn len(&self) -> usize; fn contains(&self, item: &[u8]) -> bool; fn contains_hash(&self, hash: (u64, u64)) -> bool; + fn filter_type(&self) -> FilterType; } pub struct AMQFilterBuilder {} @@ -89,8 +92,9 @@ impl AMQFilterBuilder { FilterType::StandardBloom => StandardBloomFilter::decode_from(reader) .map(Self::wrap_filter) .map_err(|e| DecodeError::from(e)), - // TODO: Implement - FilterType::BlockedBloom => Err(DecodeError::InvalidHeader("BlockedBloom")), + FilterType::BlockedBloom => BlockedBloomFilter::decode_from(reader) + .map(Self::wrap_filter) + .map_err(|e| DecodeError::from(e)), } } diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 5dfaa66f..eb94e79f 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -72,6 +72,10 @@ impl AMQFilter for StandardBloomFilter { true } + + fn filter_type(&self) -> super::FilterType { + super::FilterType::StandardBloom + } } impl Encode for StandardBloomFilter { @@ -79,8 +83,7 @@ impl Encode for StandardBloomFilter { // Write header writer.write_all(&MAGIC_BYTES)?; - // NOTE: Filter type (unused) - writer.write_u8(0)?; + writer.write_u8(super::FilterType::StandardBloom as u8)?; // NOTE: Hash type (unused) writer.write_u8(0)?; @@ -131,7 +134,7 @@ impl StandardBloomFilter { #[cfg(test)] mod tests { - use crate::segment::filter::AMQFilterBuilder; + use crate::segment::filter::{AMQFilterBuilder, FilterType}; use 
super::*; use std::fs::File; @@ -172,6 +175,7 @@ mod tests { let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; assert_eq!(filter.inner.bytes(), filter_copy.bytes()); + assert_eq!(FilterType::StandardBloom, filter_copy.filter_type()); for key in keys { assert!(filter.contains(&**key)); From f16abbe1a7ff09efe4a13625a96181ce5304bc14 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sat, 26 Apr 2025 14:17:07 +0800 Subject: [PATCH 097/613] fix: use wrapping_mul --- src/segment/filter/blocked_bloom/builder.rs | 2 +- src/segment/filter/blocked_bloom/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 49da48ff..d105bb16 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -101,7 +101,7 @@ impl Builder { for i in 1..(self.k as u64) { h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); + h2 = h2.wrapping_mul(i); let idx = h1 % (CACHE_LINE_BYTES as u64); diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 38707ccc..6e01f890 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -46,7 +46,7 @@ impl AMQFilter for BlockedBloomFilter { for i in 1..(self.k as u64) { h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_add(i); + h2 = h2.wrapping_mul(i); let idx = h1 % (CACHE_LINE_BYTES as u64); From a3df96370d6d7f0b8e02ebfb6bd31d4f5f4fbad2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 26 Apr 2025 18:20:52 +0200 Subject: [PATCH 098/613] BinaryIndexReader::get_last --- src/segment/block/binary_index/reader.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs index 3ed69d0a..8cbab201 100644 --- a/src/segment/block/binary_index/reader.rs +++ b/src/segment/block/binary_index/reader.rs @@ -50,4 +50,8 @@ impl<'a> Reader<'a> { unwrappy!(bytes.read_u32::()) as usize } } + + pub(crate) fn get_last(&self) -> usize { + self.get(self.len() - 1) + } } From e161dd3a369cd759db6147a527da788d3483e07d Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sun, 27 Apr 2025 17:52:15 +0800 Subject: [PATCH 099/613] fix: fix bits to bytes calculation from m --- src/segment/filter/blocked_bloom/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index d105bb16..18e93c39 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -48,7 +48,7 @@ impl Builder { let bpk = m / n; let k = (((bpk as f32) * LN_2) as usize).max(1); - let num_blocks = m.div_ceil(CACHE_LINE_BYTES); + let num_blocks = m.div_ceil(CACHE_LINE_BYTES * 8); Self { inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES), @@ -73,7 +73,7 @@ impl Builder { let m = n * bpk; let k = (((bpk as f32) * LN_2) as usize).max(1); - let num_blocks = m.div_ceil(CACHE_LINE_BYTES); + let num_blocks = m.div_ceil(CACHE_LINE_BYTES * 8); Self { inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES), From b9e7d1c9140b648293fc5a7a92e11f31a3fd79bc Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sun, 27 Apr 2025 18:08:31 +0800 Subject: [PATCH 100/613] fix: fix bits calculation --- src/segment/filter/blocked_bloom/builder.rs | 4 ++-- src/segment/filter/blocked_bloom/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 18e93c39..9def8b9f 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -103,7 +103,7 @@ impl Builder { h1 = h1.wrapping_add(h2); h2 = h2.wrapping_mul(i); - let idx = h1 % (CACHE_LINE_BYTES as u64); + let idx = h1 % (CACHE_LINE_BYTES as u64 * 8); self.inner .enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize)); @@ -111,7 +111,7 @@ impl Builder { } pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize { - block_idx * CACHE_LINE_BYTES as usize + idx_in_block + block_idx * CACHE_LINE_BYTES * 8 + idx_in_block } /// Gets the hash of a key. diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 6e01f890..4dae8b8f 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -48,7 +48,7 @@ impl AMQFilter for BlockedBloomFilter { h1 = h1.wrapping_add(h2); h2 = h2.wrapping_mul(i); - let idx = h1 % (CACHE_LINE_BYTES as u64); + let idx = h1 % (CACHE_LINE_BYTES as u64 * 8); // NOTE: should be in bounds because of modulo #[allow(clippy::expect_used, clippy::cast_possible_truncation)] From dce7d017e62892ef44ebfb2a651a1cb6fe68b4d4 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sun, 27 Apr 2025 22:17:49 +0800 Subject: [PATCH 101/613] feat: use enum dispatch for filter type --- src/segment/filter/blocked_bloom/mod.rs | 22 +++++----- src/segment/filter/mod.rs | 54 ++++++++++++------------ src/segment/filter/standard_bloom/mod.rs | 21 ++++----- src/segment/inner.rs | 4 +- src/segment/mod.rs | 2 +- 5 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 4dae8b8f..5d3b68b9 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -3,18 +3,20 @@ // (found in the LICENSE-* files in the repository) mod builder; -use super::{bit_array::BitArrayReader, AMQFilter, CACHE_LINE_BYTES}; +use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter, BloomFilterType, CACHE_LINE_BYTES}; use crate::{ coding::{DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, }; pub use builder::Builder; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use core::num; use std::io::{Read, Write}; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); +#[derive(Debug, PartialEq)] pub struct BlockedBloomFilter { /// Raw bytes exposed as bit array inner: BitArrayReader, @@ -67,10 +69,6 @@ impl AMQFilter for BlockedBloomFilter { fn contains(&self, key: &[u8]) -> bool { self.contains_hash(Self::get_hash(key)) } - - fn filter_type(&self) -> super::FilterType { - super::FilterType::BlockedBloom - } } impl Encode for BlockedBloomFilter { @@ -78,7 +76,7 @@ impl Encode for BlockedBloomFilter { // Write header writer.write_all(&MAGIC_BYTES)?; - writer.write_u8(super::FilterType::BlockedBloom as u8)?; + writer.write_u8(BloomFilterType::BlockedBloom as u8)?; // NOTE: Hash type (unused) writer.write_u8(0)?; @@ -93,7 +91,7 @@ impl Encode for BlockedBloomFilter { impl BlockedBloomFilter { // To be used by AMQFilter after magic bytes and filter type have been read and parsed - pub(super) fn decode_from(reader: &mut R) -> Result { + pub(super) fn decode_from(reader: &mut R) -> Result { // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; assert_eq!(0, hash_type, "Invalid bloom hash type"); @@ -104,7 +102,11 
@@ impl BlockedBloomFilter { let mut bytes = vec![0; num_blocks * CACHE_LINE_BYTES]; reader.read_exact(&mut bytes)?; - Ok(Self::from_raw(num_blocks, k, bytes.into())) + Ok(BloomFilter::BlockedBloom(Self::from_raw( + num_blocks, + k, + bytes.into(), + ))) } fn from_raw(num_blocks: usize, k: usize, slice: crate::Slice) -> Self { @@ -129,7 +131,7 @@ impl BlockedBloomFilter { #[cfg(test)] mod tests { use super::*; - use crate::segment::filter::{AMQFilterBuilder, FilterType}; + use crate::segment::filter::{AMQFilterBuilder, BloomFilter}; use std::fs::File; use test_log::test; @@ -169,7 +171,7 @@ mod tests { let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; assert_eq!(filter.inner.bytes(), filter_copy.bytes()); - assert_eq!(FilterType::BlockedBloom, filter_copy.filter_type()); + assert!(matches!(filter_copy, BloomFilter::BlockedBloom(_))); for key in keys { assert!(filter.contains(&**key)); diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index bc80ca12..b2405cb6 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -9,9 +9,8 @@ pub mod standard_bloom; use crate::{coding::DecodeError, file::MAGIC_BYTES}; use blocked_bloom::BlockedBloomFilter; use byteorder::ReadBytesExt; -use std::io::Read; - use standard_bloom::{Builder as StandardBloomFilterBuilder, StandardBloomFilter}; +use std::io::Read; const CACHE_LINE_BYTES: usize = 64; @@ -47,13 +46,27 @@ impl BloomConstructionPolicy { } } +#[enum_dispatch::enum_dispatch] +pub trait AMQFilter { + fn bytes(&self) -> &[u8]; + fn len(&self) -> usize; + fn contains(&self, item: &[u8]) -> bool; + fn contains_hash(&self, hash: (u64, u64)) -> bool; +} +#[enum_dispatch::enum_dispatch(AMQFilter)] #[derive(PartialEq, Debug)] -pub enum FilterType { +pub enum BloomFilter { + StandardBloom(StandardBloomFilter), + BlockedBloom(BlockedBloomFilter), +} + +#[derive(Debug, PartialEq)] +pub enum BloomFilterType { StandardBloom = 0, BlockedBloom = 1, } -impl TryFrom for FilterType { +impl TryFrom for BloomFilterType { type Error = (); fn try_from(value: u8) -> Result { match value { @@ -64,18 +77,10 @@ impl TryFrom for FilterType { } } -pub trait AMQFilter: Sync + Send { - fn bytes(&self) -> &[u8]; - fn len(&self) -> usize; - fn contains(&self, item: &[u8]) -> bool; - fn contains_hash(&self, hash: (u64, u64)) -> bool; - fn filter_type(&self) -> FilterType; -} - pub struct AMQFilterBuilder {} impl AMQFilterBuilder { - pub fn decode_from(reader: &mut R) -> Result, DecodeError> { + pub fn decode_from(reader: &mut R) -> Result { // Check header let mut magic = [0u8; MAGIC_BYTES.len()]; reader.read_exact(&mut magic)?; @@ -85,20 +90,15 @@ impl AMQFilterBuilder { } let filter_type = reader.read_u8()?; - let filter_type = FilterType::try_from(filter_type) - .map_err(|_| DecodeError::InvalidHeader("FilterType"))?; - - match filter_type { - FilterType::StandardBloom => StandardBloomFilter::decode_from(reader) - .map(Self::wrap_filter) - .map_err(|e| DecodeError::from(e)), - FilterType::BlockedBloom => BlockedBloomFilter::decode_from(reader) - .map(Self::wrap_filter) - .map_err(|e| DecodeError::from(e)), - } - } - fn wrap_filter(filter: T) -> Box { - Box::new(filter) + match BloomFilterType::try_from(filter_type) { + Ok(BloomFilterType::StandardBloom) => { + StandardBloomFilter::decode_from(reader).map_err(|e| DecodeError::from(e)) + } + Ok(BloomFilterType::BlockedBloom) => { + BlockedBloomFilter::decode_from(reader).map_err(|e| DecodeError::from(e)) + } + _ => Err(DecodeError::InvalidHeader("Unknown filter type")), + } } } diff --git 
a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs
index eb94e79f..5b70fb30 100644
--- a/src/segment/filter/standard_bloom/mod.rs
+++ b/src/segment/filter/standard_bloom/mod.rs
@@ -1,4 +1,4 @@
-use super::{bit_array::BitArrayReader, AMQFilter};
+use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter};
 use crate::{
     coding::{Decode, DecodeError, Encode, EncodeError},
     file::MAGIC_BYTES,
@@ -18,6 +18,7 @@ pub use builder::{Builder, CompositeHash};
 ///
 /// The filter uses double hashing instead of `k` hash functions, see:
 ///
+#[derive(Debug, PartialEq)]
 pub struct StandardBloomFilter {
     /// Raw bytes exposed as bit array
     inner: BitArrayReader,
@@ -72,10 +73,6 @@ impl AMQFilter for StandardBloomFilter {
 
         true
     }
-
-    fn filter_type(&self) -> super::FilterType {
-        super::FilterType::StandardBloom
-    }
 }
 
 impl Encode for StandardBloomFilter {
@@ -83,7 +80,7 @@ impl Encode for StandardBloomFilter {
         // Write header
         writer.write_all(&MAGIC_BYTES)?;
 
-        writer.write_u8(super::FilterType::StandardBloom as u8)?;
+        writer.write_u8(0)?; // TODO: How to make this an enum?
 
         // NOTE: Hash type (unused)
         writer.write_u8(0)?;
@@ -99,7 +96,7 @@
 #[allow(clippy::len_without_is_empty)]
 impl StandardBloomFilter {
     // To be used by AMQFilter after magic bytes and filter type have been read and parsed
-    pub(super) fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError> {
+    pub(super) fn decode_from<R: Read>(reader: &mut R) -> Result<BloomFilter, DecodeError> {
         // NOTE: Hash type (unused)
         let hash_type = reader.read_u8()?;
         assert_eq!(0, hash_type, "Invalid bloom hash type");
@@ -110,7 +107,11 @@ impl StandardBloomFilter {
         let mut bytes = vec![0; m / 8];
         reader.read_exact(&mut bytes)?;
 
-        Ok(Self::from_raw(m, k, bytes.into()))
+        Ok(BloomFilter::StandardBloom(Self::from_raw(
+            m,
+            k,
+            bytes.into(),
+        )))
     }
 
     fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self {
@@ -134,7 +135,7 @@
 
 #[cfg(test)]
 mod tests {
-    use crate::segment::filter::{AMQFilterBuilder, FilterType};
+    use crate::segment::filter::{AMQFilterBuilder, BloomFilter};
 
     use super::*;
     use std::fs::File;
@@ -175,7 +176,7 @@
         let filter_copy = AMQFilterBuilder::decode_from(&mut file)?;
 
         assert_eq!(filter.inner.bytes(), filter_copy.bytes());
-        assert_eq!(FilterType::StandardBloom, filter_copy.filter_type());
+        assert!(matches!(filter_copy, BloomFilter::StandardBloom(_)));
 
         for key in keys {
             assert!(filter.contains(&**key));
diff --git a/src/segment/inner.rs b/src/segment/inner.rs
index 8cdda47f..169c7ae4 100644
--- a/src/segment/inner.rs
+++ b/src/segment/inner.rs
@@ -3,7 +3,7 @@
 // (found in the LICENSE-* files in the repository)
 
 use super::{
-    block_index::NewBlockIndexImpl, filter::AMQFilter, meta::ParsedMeta, trailer::Trailer,
+    block_index::NewBlockIndexImpl, filter::BloomFilter, meta::ParsedMeta, trailer::Trailer,
 };
 use crate::{
     cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId,
@@ -38,7 +38,7 @@ pub struct Inner {
     pub cache: Arc<Cache>,
 
     /// Pinned AMQ filter
-    pub pinned_filter: Option<Box<dyn AMQFilter>>,
+    pub pinned_filter: Option<BloomFilter>,
 
     // /// Pinned filter
     // #[doc(hidden)]
diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 1e43f838..155ceac8 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -27,7 +27,7 @@ use crate::{
     cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey,
 };
 use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex};
-use filter::{standard_bloom::CompositeHash, AMQFilterBuilder};
+use filter::{standard_bloom::CompositeHash, AMQFilter, AMQFilterBuilder};
 use inner::Inner;
 use meta::ParsedMeta;
 use std::{

From 610a090fcdc5c8f174c38bc5e1a631fafc29cb3e Mon Sep 17 00:00:00 2001
From: Jing Yang
Date: Sun, 27 Apr 2025 22:31:07 +0800
Subject: [PATCH 102/613] chore: formatting

---
 src/segment/filter/blocked_bloom/mod.rs  | 4 ++--
 src/segment/filter/mod.rs                | 2 +-
 src/segment/filter/standard_bloom/mod.rs | 5 ++---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs
index 5d3b68b9..fa1a5385 100644
--- a/src/segment/filter/blocked_bloom/mod.rs
+++ b/src/segment/filter/blocked_bloom/mod.rs
@@ -50,11 +50,11 @@ impl AMQFilter for BlockedBloomFilter {
             h1 = h1.wrapping_add(h2);
             h2 = h2.wrapping_mul(i);
 
-            let idx = h1 % (CACHE_LINE_BYTES as u64 * 8);
+            let bit_idx = h1 % (CACHE_LINE_BYTES as u64 * 8);
 
             // NOTE: should be in bounds because of modulo
             #[allow(clippy::expect_used, clippy::cast_possible_truncation)]
-            if !self.has_bit(block_idx as usize, idx as usize) {
+            if !self.has_bit(block_idx as usize, bit_idx as usize) {
                 return false;
             }
         }
diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs
index b2405cb6..e8548222 100644
--- a/src/segment/filter/mod.rs
+++ b/src/segment/filter/mod.rs
@@ -53,6 +53,7 @@ pub trait AMQFilter {
     fn contains(&self, item: &[u8]) -> bool;
     fn contains_hash(&self, hash: (u64, u64)) -> bool;
 }
+
 #[enum_dispatch::enum_dispatch(AMQFilter)]
 #[derive(PartialEq, Debug)]
 pub enum BloomFilter {
@@ -60,7 +61,6 @@ pub enum BloomFilter {
     BlockedBloom(BlockedBloomFilter),
 }
 
-#[derive(Debug, PartialEq)]
 pub enum BloomFilterType {
     StandardBloom = 0,
     BlockedBloom = 1,
diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs
index 5b70fb30..4fa78df1 100644
--- a/src/segment/filter/standard_bloom/mod.rs
+++ b/src/segment/filter/standard_bloom/mod.rs
@@ -1,4 +1,4 @@
-use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter};
+use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter, BloomFilterType};
 use crate::{
     coding::{Decode, DecodeError, Encode, EncodeError},
     file::MAGIC_BYTES,
@@ -30,7 +30,6 @@ pub struct StandardBloomFilter {
     k: usize,
 }
 
-// TODO: change encode/decode to be Filter enum
 impl AMQFilter for StandardBloomFilter {
     /// Size of bloom filter in bytes.
     #[must_use]
     fn len(&self) -> usize {
@@ -80,7 +79,7 @@ impl Encode for StandardBloomFilter {
         // Write header
         writer.write_all(&MAGIC_BYTES)?;
 
-        writer.write_u8(0)?; // TODO: How to make this an enum?
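The TODO above is resolved one line later: the tag byte becomes a `BloomFilterType` discriminant written with `as u8` and read back through `TryFrom<u8>`. A minimal, self-contained sketch of that round-trip pattern (hypothetical `FilterTag` enum, not the crate's actual types):

    use std::io::{Cursor, Read, Write};

    #[derive(Debug, PartialEq)]
    enum FilterTag {
        Standard = 0,
        Blocked = 1,
    }

    impl TryFrom<u8> for FilterTag {
        type Error = ();

        fn try_from(value: u8) -> Result<Self, Self::Error> {
            match value {
                0 => Ok(Self::Standard),
                1 => Ok(Self::Blocked),
                _ => Err(()),
            }
        }
    }

    fn main() -> std::io::Result<()> {
        // Encode: the discriminant is written as a single tag byte.
        let mut buf = Vec::new();
        buf.write_all(&[FilterTag::Blocked as u8])?;

        // Decode: an unknown tag byte becomes an error instead of a panic.
        let mut reader = Cursor::new(buf);
        let mut tag = [0u8; 1];
        reader.read_exact(&mut tag)?;
        assert_eq!(Ok(FilterTag::Blocked), FilterTag::try_from(tag[0]));

        Ok(())
    }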
+ writer.write_u8(BloomFilterType::StandardBloom as u8)?; // NOTE: Hash type (unused) writer.write_u8(0)?; From 3e051f54d286762c891336c69aa643f4b0dc14be Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Sun, 27 Apr 2025 22:34:46 +0800 Subject: [PATCH 103/613] chore: rename --- benches/bloom.rs | 4 ++-- src/segment/filter/blocked_bloom/mod.rs | 12 ++++++------ src/segment/filter/mod.rs | 8 ++++---- src/segment/filter/standard_bloom/mod.rs | 16 ++++++---------- src/segment/inner.rs | 4 ++-- src/segment/mod.rs | 2 +- 6 files changed, 21 insertions(+), 25 deletions(-) diff --git a/benches/bloom.rs b/benches/bloom.rs index d3d8f2bc..c6700ad9 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -14,7 +14,7 @@ fn standard_filter_construction(c: &mut Criterion) { } fn standard_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::{standard_bloom::Builder, AMQFilter}; + use lsm_tree::segment::filter::{standard_bloom::Builder, AMQ}; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) @@ -63,7 +63,7 @@ fn blocked_filter_construction(c: &mut Criterion) { } fn blocked_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQFilter}; + use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQ}; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index fa1a5385..203e9e58 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) mod builder; -use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter, BloomFilterType, CACHE_LINE_BYTES}; +use super::{bit_array::BitArrayReader, AMQFilter, BloomFilterType, AMQ, CACHE_LINE_BYTES}; use crate::{ coding::{DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, @@ -28,7 +28,7 @@ pub struct BlockedBloomFilter { num_blocks: usize, } -impl AMQFilter for BlockedBloomFilter { +impl AMQ for BlockedBloomFilter { fn bytes(&self) -> &[u8] { self.inner.bytes() } @@ -91,7 +91,7 @@ impl Encode for BlockedBloomFilter { impl BlockedBloomFilter { // To be used by AMQFilter after magic bytes and filter type have been read and parsed - pub(super) fn decode_from(reader: &mut R) -> Result { + pub(super) fn decode_from(reader: &mut R) -> Result { // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; assert_eq!(0, hash_type, "Invalid bloom hash type"); @@ -102,7 +102,7 @@ impl BlockedBloomFilter { let mut bytes = vec![0; num_blocks * CACHE_LINE_BYTES]; reader.read_exact(&mut bytes)?; - Ok(BloomFilter::BlockedBloom(Self::from_raw( + Ok(AMQFilter::BlockedBloom(Self::from_raw( num_blocks, k, bytes.into(), @@ -131,7 +131,7 @@ impl BlockedBloomFilter { #[cfg(test)] mod tests { use super::*; - use crate::segment::filter::{AMQFilterBuilder, BloomFilter}; + use crate::segment::filter::{AMQFilter, AMQFilterBuilder}; use std::fs::File; use test_log::test; @@ -171,7 +171,7 @@ mod tests { let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; assert_eq!(filter.inner.bytes(), filter_copy.bytes()); - assert!(matches!(filter_copy, BloomFilter::BlockedBloom(_))); + assert!(matches!(filter_copy, AMQFilter::BlockedBloom(_))); for key in keys { assert!(filter.contains(&**key)); diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index e8548222..ca940146 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -47,16 +47,16 @@ impl BloomConstructionPolicy { } 
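Aside on the rename below: `enum_dispatch` generates a forwarding `impl` of the trait for a closed enum, so callers keep static dispatch (a match on the variant) instead of paying for a `Box<dyn ...>` vtable call. A simplified standalone sketch of roughly what the macro expands to (toy names only; the real expansion covers every trait method):

    trait Amq {
        fn len(&self) -> usize;
    }

    struct Standard { bytes: Vec<u8> }
    struct Blocked { bytes: Vec<u8> }

    impl Amq for Standard {
        fn len(&self) -> usize { self.bytes.len() }
    }

    impl Amq for Blocked {
        fn len(&self) -> usize { self.bytes.len() }
    }

    // What #[enum_dispatch] generates, conceptually: a forwarding impl,
    // so no Box and no vtable indirection is needed at the call site.
    enum Filter {
        Standard(Standard),
        Blocked(Blocked),
    }

    impl Amq for Filter {
        fn len(&self) -> usize {
            match self {
                Self::Standard(f) => f.len(),
                Self::Blocked(f) => f.len(),
            }
        }
    }

    fn main() {
        let filter = Filter::Standard(Standard { bytes: vec![0; 8] });
        assert_eq!(8, filter.len());
    }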
#[enum_dispatch::enum_dispatch] -pub trait AMQFilter { +pub trait AMQ { fn bytes(&self) -> &[u8]; fn len(&self) -> usize; fn contains(&self, item: &[u8]) -> bool; fn contains_hash(&self, hash: (u64, u64)) -> bool; } -#[enum_dispatch::enum_dispatch(AMQFilter)] +#[enum_dispatch::enum_dispatch(AMQ)] #[derive(PartialEq, Debug)] -pub enum BloomFilter { +pub enum AMQFilter { StandardBloom(StandardBloomFilter), BlockedBloom(BlockedBloomFilter), } @@ -80,7 +80,7 @@ impl TryFrom for BloomFilterType { pub struct AMQFilterBuilder {} impl AMQFilterBuilder { - pub fn decode_from(reader: &mut R) -> Result { + pub fn decode_from(reader: &mut R) -> Result { // Check header let mut magic = [0u8; MAGIC_BYTES.len()]; reader.read_exact(&mut magic)?; diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 4fa78df1..d20e75aa 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -1,4 +1,4 @@ -use super::{bit_array::BitArrayReader, AMQFilter, BloomFilter, BloomFilterType}; +use super::{bit_array::BitArrayReader, AMQFilter, BloomFilterType, AMQ}; use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, @@ -30,7 +30,7 @@ pub struct StandardBloomFilter { k: usize, } -impl AMQFilter for StandardBloomFilter { +impl AMQ for StandardBloomFilter { /// Size of bloom filter in bytes. #[must_use] fn len(&self) -> usize { @@ -95,7 +95,7 @@ impl Encode for StandardBloomFilter { #[allow(clippy::len_without_is_empty)] impl StandardBloomFilter { // To be used by AMQFilter after magic bytes and filter type have been read and parsed - pub(super) fn decode_from(reader: &mut R) -> Result { + pub(super) fn decode_from(reader: &mut R) -> Result { // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; assert_eq!(0, hash_type, "Invalid bloom hash type"); @@ -106,11 +106,7 @@ impl StandardBloomFilter { let mut bytes = vec![0; m / 8]; reader.read_exact(&mut bytes)?; - Ok(BloomFilter::StandardBloom(Self::from_raw( - m, - k, - bytes.into(), - ))) + Ok(AMQFilter::StandardBloom(Self::from_raw(m, k, bytes.into()))) } fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { @@ -134,7 +130,7 @@ impl StandardBloomFilter { #[cfg(test)] mod tests { - use crate::segment::filter::{AMQFilterBuilder, BloomFilter}; + use crate::segment::filter::{AMQFilter, AMQFilterBuilder}; use super::*; use std::fs::File; @@ -175,7 +171,7 @@ mod tests { let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; assert_eq!(filter.inner.bytes(), filter_copy.bytes()); - assert!(matches!(filter_copy, BloomFilter::StandardBloom(_))); + assert!(matches!(filter_copy, AMQFilter::StandardBloom(_))); for key in keys { assert!(filter.contains(&**key)); diff --git a/src/segment/inner.rs b/src/segment/inner.rs index 169c7ae4..467e4263 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{ - block_index::NewBlockIndexImpl, filter::BloomFilter, meta::ParsedMeta, trailer::Trailer, + block_index::NewBlockIndexImpl, filter::AMQFilter, meta::ParsedMeta, trailer::Trailer, }; use crate::{ cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, @@ -38,7 +38,7 @@ pub struct Inner { pub cache: Arc, /// Pinned AMQ filter - pub pinned_filter: Option, + pub pinned_filter: Option, // /// Pinned filter // #[doc(hidden)] diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 155ceac8..287bdaf6 100644 --- a/src/segment/mod.rs +++ 
b/src/segment/mod.rs @@ -27,7 +27,7 @@ use crate::{ cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; -use filter::{standard_bloom::CompositeHash, AMQFilter, AMQFilterBuilder}; +use filter::{standard_bloom::CompositeHash, AMQ, AMQFilterBuilder}; use inner::Inner; use meta::ParsedMeta; use std::{ From 1cb655e61dad33f74b497f8c577aea4dfc1c07a2 Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Tue, 29 Apr 2025 11:01:32 +0800 Subject: [PATCH 104/613] fix: move hash calculation --- src/segment/filter/blocked_bloom/builder.rs | 6 +++--- src/segment/filter/blocked_bloom/mod.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 9def8b9f..840b8ccb 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -100,13 +100,13 @@ impl Builder { let block_idx = h1 % (self.num_blocks as u64); for i in 1..(self.k as u64) { - h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_mul(i); - let idx = h1 % (CACHE_LINE_BYTES as u64 * 8); self.inner .enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize)); + + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_mul(i); } } diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 203e9e58..b7cb1b2d 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -47,9 +47,6 @@ impl AMQ for BlockedBloomFilter { let block_idx = h1 % (self.num_blocks as u64); for i in 1..(self.k as u64) { - h1 = h1.wrapping_add(h2); - h2 = h2.wrapping_mul(i); - let bit_idx = h1 % (CACHE_LINE_BYTES as u64 * 8); // NOTE: should be in bounds because of modulo @@ -57,6 +54,9 @@ impl AMQ for BlockedBloomFilter { if !self.has_bit(block_idx as usize, bit_idx as usize) { return false; } + + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_mul(i); } true From 141de10f97a6ac0d7f67e52bdb8cc3f848341d3d Mon Sep 17 00:00:00 2001 From: Jing Yang Date: Tue, 29 Apr 2025 20:38:35 +0800 Subject: [PATCH 105/613] chore: add bloom fpr test --- tests/bloom_fpr.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 tests/bloom_fpr.rs diff --git a/tests/bloom_fpr.rs b/tests/bloom_fpr.rs new file mode 100644 index 00000000..40f3c371 --- /dev/null +++ b/tests/bloom_fpr.rs @@ -0,0 +1,112 @@ +use lsm_tree::{ + segment::filter::{ + blocked_bloom::Builder as BlockedBloomBuilder, + standard_bloom::Builder as StandardBloomBuilder, AMQ, + }, + Result, +}; + +// [Theoretical] FPR: 1.0000%, [Empirical] Standard Bloom FPR: 0.0002, Blocked Bloom FPR: 0.0313% +// [Theoretical] FPR: 0.1000%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0303% +// [Theoretical] FPR: 0.0100%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0287% +// [Theoretical] FPR: 0.0010%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0257% +#[test] +fn measure_bloom_fpr_with_fp_rate() -> Result<()> { + let keys = (0..1_000_000u128) + .map(|x| x.to_be_bytes().to_vec()) + .collect::>(); + + let non_existent_keys = (1_000_000..2_000_000u128) + .map(|x| x.to_be_bytes().to_vec()) + .collect::>(); + + let n: usize = 5_000_000; + + for fpr in [0.01, 0.001, 0.0001, 0.00001] { + let mut blocked_builder = BlockedBloomBuilder::with_fp_rate(n, fpr); + let mut standard_builder = StandardBloomBuilder::with_fp_rate(n, fpr); + + for 
key in &keys { + blocked_builder.set_with_hash(BlockedBloomBuilder::get_hash(key.as_slice())); + standard_builder.set_with_hash(StandardBloomBuilder::get_hash(key.as_slice())); + } + + let blocked_filter = blocked_builder.build(); + let standard_filter = standard_builder.build(); + + let mut blocked_fp = 0; + let mut standard_fp = 0; + for non_existent_key in &non_existent_keys { + if blocked_filter + .contains_hash(BlockedBloomBuilder::get_hash(non_existent_key.as_slice())) + { + blocked_fp += 1; + } + if standard_filter + .contains_hash(StandardBloomBuilder::get_hash(non_existent_key.as_slice())) + { + standard_fp += 1; + } + } + + println!( + "[Theoretical] FPR: {:.4}%, [Empirical] Standard Bloom FPR: {:.4}, Blocked Bloom FPR: {:.4}%", + fpr * 100.0, + (standard_fp as f64 / non_existent_keys.len() as f64) * 100.0, + (blocked_fp as f64 / non_existent_keys.len() as f64) * 100.0 + ); + } + + Ok(()) +} + +// n = 5000000, [Empirical] Standard Bloom FPR: 0.0006, Blocked Bloom FPR: 0.0276% +// n = 10000000, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0108% +// n = 15000000, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0086% +#[test] +fn measure_bloom_fpr_with_bpk() -> Result<()> { + let keys = (0..1_000_000u128) + .map(|x| x.to_be_bytes().to_vec()) + .collect::>(); + + let non_existent_keys = (1_000_000..2_000_000u128) + .map(|x| x.to_be_bytes().to_vec()) + .collect::>(); + + for n in [5_000_000, 10_000_000, 15_000_000] { + let mut blocked_builder = BlockedBloomBuilder::with_bpk(n, 10); + let mut standard_builder = StandardBloomBuilder::with_bpk(n, 10); + + for key in &keys { + blocked_builder.set_with_hash(BlockedBloomBuilder::get_hash(key.as_slice())); + standard_builder.set_with_hash(StandardBloomBuilder::get_hash(key.as_slice())); + } + + let blocked_filter = blocked_builder.build(); + let standard_filter = standard_builder.build(); + + let mut blocked_fp = 0; + let mut standard_fp = 0; + for non_existent_key in &non_existent_keys { + if blocked_filter + .contains_hash(BlockedBloomBuilder::get_hash(non_existent_key.as_slice())) + { + blocked_fp += 1; + } + if standard_filter + .contains_hash(StandardBloomBuilder::get_hash(non_existent_key.as_slice())) + { + standard_fp += 1; + } + } + + println!( + "n = {}, [Empirical] Standard Bloom FPR: {:.4}, Blocked Bloom FPR: {:.4}%", + n, + (standard_fp as f64 / non_existent_keys.len() as f64) * 100.0, + (blocked_fp as f64 / non_existent_keys.len() as f64) * 100.0 + ); + } + + Ok(()) +} From 8e49cc8ea90b543e2df621652cf6bfda5e9983bf Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 30 Apr 2025 21:03:11 +0200 Subject: [PATCH 106/613] refactor: Segment::load_block --- src/cache.rs | 15 ++------------- src/segment/mod.rs | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index 33a7590c..72c22d27 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -126,22 +126,11 @@ impl Cache { #[doc(hidden)] #[must_use] - pub fn get_index_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option { + pub fn get_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option { let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into(); Some(match self.data.get(&key)? 
{
-            Item::Block(block) => IndexBlock::new(block),
-            Item::Blob(_) => unreachable!("invalid cache item"),
-        })
-    }
-
-    #[doc(hidden)]
-    #[must_use]
-    pub fn get_data_block(&self, id: GlobalSegmentId, offset: BlockOffset) -> Option<DataBlock> {
-        let key: CacheKey = (TAG_BLOCK, id.tree_id(), id.segment_id(), *offset).into();
-
-        Some(match self.data.get(&key)? {
-            Item::Block(block) => DataBlock::new(block),
+            Item::Block(block) => block,
             Item::Blob(_) => unreachable!("invalid cache item"),
         })
     }
diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index e283f535..d93c809e 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -116,15 +116,15 @@ impl Segment {
         self.metadata.id
     }
 
-    fn load_data_block(&self, handle: &BlockHandle) -> crate::Result<DataBlock> {
+    fn load_block(&self, handle: &BlockHandle) -> crate::Result<Block> {
         let id = self.global_id();
 
-        if let Some(data_block) = self.cache.get_data_block(id, handle.offset()) {
-            return Ok(data_block);
+        if let Some(block) = self.cache.get_block(id, handle.offset()) {
+            return Ok(block);
         }
 
         let cached_fd = self.descriptor_table.access_for_table(&id);
-        let cache_miss = cached_fd.is_none();
+        let fd_cache_miss = cached_fd.is_none();
 
         let fd = if let Some(fd) = cached_fd {
             fd
@@ -137,20 +137,24 @@
             handle.offset(),
             handle.size(),
             self.metadata.data_block_compression,
-        )
-        .map(DataBlock::new)?;
+        )?;
+
+        let id = self.global_id();
 
         // Cache FD
-        if cache_miss {
+        if fd_cache_miss {
             self.descriptor_table.insert_for_table(id, fd);
         }
 
-        self.cache
-            .insert_block(id, handle.offset(), block.inner.clone());
+        self.cache.insert_block(id, handle.offset(), block.clone());
 
         Ok(block)
     }
 
+    fn load_data_block(&self, handle: &BlockHandle) -> crate::Result<DataBlock> {
+        self.load_block(handle).map(DataBlock::new)
+    }
+
     pub fn get(
         &self,
         key: &[u8],

From 81e6dfff3491a900b1a931b21ec803823726b055 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 30 Apr 2025 22:18:03 +0200
Subject: [PATCH 107/613] refactor: block hash index

---
 src/segment/block/encoder.rs            |  9 ++-------
 src/segment/block/hash_index/builder.rs | 27 ++++++++++++++++++++++---
 src/segment/block/hash_index/mod.rs     | 10 ++++-----
 src/segment/block/hash_index/reader.rs  |  4 ++++
 4 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs
index 0a54c51b..c46680b0 100644
--- a/src/segment/block/encoder.rs
+++ b/src/segment/block/encoder.rs
@@ -63,12 +63,7 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> {
     ) -> Self {
         let binary_index_len = item_count / usize::from(restart_interval);
 
-        // TODO: verify
-        let bucket_count = if hash_index_ratio > 0.0 {
-            ((item_count as f32 * hash_index_ratio) as u32).max(1)
-        } else {
-            0
-        };
+        let hash_index_builder = HashIndexBuilder::with_hash_ratio(item_count, hash_index_ratio);
 
         Self {
             phantom: PhantomData,
@@ -84,7 +79,7 @@
             use_prefix_truncation: true,
 
             binary_index_builder: BinaryIndexBuilder::new(binary_index_len),
-            hash_index_builder: HashIndexBuilder::new(bucket_count),
+            hash_index_builder,
 
             base_key: first_key,
         }
diff --git a/src/segment/block/hash_index/builder.rs b/src/segment/block/hash_index/builder.rs
index 063d91a1..4ec1e3a1 100644
--- a/src/segment/block/hash_index/builder.rs
+++ b/src/segment/block/hash_index/builder.rs
@@ -14,13 +14,33 @@ pub struct Builder(Vec<u8>);
 
 impl Builder {
     /// Initializes a new builder with the given number of buckets.
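The ratio-to-bucket-count arithmetic that this patch moves out of the block encoder and into the builder below is simple enough to sanity-check in isolation. A standalone sketch mirroring the two branches visible in the diff (assumption: those branches are the entire behavior):

    // Mirrors the calculate_bucket_count logic shown in the hunks above.
    fn calculate_bucket_count(item_count: usize, hash_ratio: f32) -> u32 {
        assert!(hash_ratio >= 0.0, "hash_ratio may not be negative");

        if hash_ratio > 0.0 {
            // Clamp up to 1 so that even a tiny positive ratio
            // still produces a usable hash index.
            ((item_count as f32 * hash_ratio) as u32).max(1)
        } else {
            0
        }
    }

    fn main() {
        assert_eq!(0, calculate_bucket_count(100, 0.0)); // hash index disabled
        assert_eq!(1, calculate_bucket_count(100, 0.0001)); // clamped up to 1
        assert_eq!(133, calculate_bucket_count(100, 1.33)); // ~1.33 buckets per item
    }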
-    pub fn new(bucket_count: u32) -> Self {
+    #[must_use]
+    pub fn with_bucket_count(bucket_count: u32) -> Self {
         Self(vec![MARKER_FREE; bucket_count as usize])
     }
 
+    #[must_use]
+    pub fn with_hash_ratio(item_count: usize, hash_ratio: f32) -> Self {
+        Self::with_bucket_count(Self::calculate_bucket_count(item_count, hash_ratio))
+    }
+
+    fn calculate_bucket_count(item_count: usize, hash_ratio: f32) -> u32 {
+        assert!(
+            hash_ratio.is_sign_positive(),
+            "hash_ratio may not be negative",
+        );
+
+        if hash_ratio > 0.0 {
+            ((item_count as f32 * hash_ratio) as u32).max(1)
+        } else {
+            0
+        }
+    }
+
     // NOTE: We know the hash index has a bucket count <= u8
     #[allow(clippy::cast_possible_truncation)]
     /// Returns the number of buckets.
+    #[must_use]
     pub fn bucket_count(&self) -> u32 {
         self.0.len() as u32
     }
@@ -72,8 +92,9 @@ impl Builder {
     /// Consumes the builder, returning its raw bytes.
     ///
-    /// Only used for tests
-    #[cfg(test)]
+    /// Only used for tests/benchmarks
+    #[must_use]
+    #[doc(hidden)]
     pub fn into_inner(self) -> Vec<u8> {
         self.0
     }
diff --git a/src/segment/block/hash_index/mod.rs b/src/segment/block/hash_index/mod.rs
index 760659ad..46644a53 100644
--- a/src/segment/block/hash_index/mod.rs
+++ b/src/segment/block/hash_index/mod.rs
@@ -44,7 +44,7 @@ mod tests {
 
     #[test]
     fn v3_hash_index_build_simple() {
-        let mut hash_index = Builder::new(100);
+        let mut hash_index = Builder::with_bucket_count(100);
 
         hash_index.set(b"a", 5);
         hash_index.set(b"b", 8);
@@ -77,7 +77,7 @@ mod tests {
 
     #[test]
     fn v3_hash_index_build_conflict() {
-        let mut hash_index = Builder::new(1);
+        let mut hash_index = Builder::with_bucket_count(1);
 
         hash_index.set(b"a", 5);
         hash_index.set(b"b", 8);
@@ -91,7 +91,7 @@ mod tests {
 
     #[test]
     fn v3_hash_index_build_same_offset() {
-        let mut hash_index = Builder::new(1);
+        let mut hash_index = Builder::with_bucket_count(1);
 
         hash_index.set(b"a", 5);
         hash_index.set(b"b", 5);
@@ -108,7 +108,7 @@ mod tests {
 
     #[test]
     fn v3_hash_index_build_mix() {
-        let mut hash_index = Builder::new(1);
+        let mut hash_index = Builder::with_bucket_count(1);
 
         hash_index.set(b"a", 5);
         hash_index.set(b"b", 5);
@@ -123,7 +123,7 @@ mod tests {
 
     #[test]
     fn v3_hash_index_read_conflict() {
-        let mut hash_index = Builder::new(1);
+        let mut hash_index = Builder::with_bucket_count(1);
 
         hash_index.set(b"a", 5);
         hash_index.set(b"b", 8);
diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs
index 369a3bcb..7b6df72d 100644
--- a/src/segment/block/hash_index/reader.rs
+++ b/src/segment/block/hash_index/reader.rs
@@ -22,6 +22,7 @@ pub struct Reader<'a>(&'a [u8]);
 
 impl<'a> Reader<'a> {
     /// Initializes a new hash index reader.
+    #[must_use]
     pub fn new(bytes: &'a [u8], offset: u32, len: u32) -> Self {
         let offset = offset as usize;
         let len = len as usize;
@@ -35,6 +36,7 @@ impl<'a> Reader<'a> {
     // NOTE: Not used for performance reasons, so no need to be hyper-optimized
     #[allow(clippy::naive_bytecount)]
     /// Returns the number of empty slots in the hash index.
+    #[must_use]
     pub fn free_count(&self) -> usize {
         self.0.iter().filter(|&&byte| byte == MARKER_FREE).count()
     }
@@ -42,6 +44,7 @@ impl<'a> Reader<'a> {
     // NOTE: Not used for performance reasons, so no need to be hyper-optimized
     #[allow(clippy::naive_bytecount)]
     /// Returns the number of conflict markers in the hash index.
+    #[must_use]
     pub fn conflict_count(&self) -> usize {
         self.0
             .iter()
@@ -50,6 +53,7 @@ impl<'a> Reader<'a> {
     }
 
     /// Returns the binary index position if the key is not conflicted.
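Before the lookup implementation below: callers consume the three-way result such that a found bucket jumps straight to one binary-index slot, a miss proves absence, and only a conflicted bucket has to fall back to binary search. A toy model of that control flow (hypothetical types, mirroring the Found/NotFound/Conflicted cases used by the point-read paths later in this series):

    enum Lookup {
        Found(u8),  // binary index position to jump to
        NotFound,   // key was provably never inserted
        Conflicted, // bucket was claimed by two different keys
    }

    // Hypothetical caller, shaped like a data block point read:
    // only the Conflicted arm pays for the O(log n) binary search.
    fn resolve(lookup: Lookup, binary_search: impl Fn() -> Option<usize>) -> Option<usize> {
        match lookup {
            Lookup::Found(pos) => Some(usize::from(pos)),
            Lookup::NotFound => None,
            Lookup::Conflicted => binary_search(),
        }
    }

    fn main() {
        assert_eq!(Some(5), resolve(Lookup::Found(5), || unreachable!()));
        assert_eq!(None, resolve(Lookup::NotFound, || unreachable!()));
        assert_eq!(Some(7), resolve(Lookup::Conflicted, || Some(7)));
    }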
+ #[must_use] pub fn get(&self, key: &[u8]) -> Lookup { // NOTE: Even with very high hash ratio, there will be nearly enough items to // cause us to create u32 buckets From 520a2a743b6c87ac638faf8fc2c8cf7a862bf06d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:04:15 +0200 Subject: [PATCH 108/613] data block forward reader --- src/segment/data_block/forward_reader.rs | 1427 +++++++++++++++++++++ src/segment/data_block/iter.rs | 173 ++- src/segment/data_block/mod.rs | 1402 +------------------- src/segment/index_block/block_handle.rs | 3 + src/segment/index_block/forward_reader.rs | 2 + src/segment/index_block/mod.rs | 6 +- src/segment/util.rs | 48 +- 7 files changed, 1634 insertions(+), 1427 deletions(-) create mode 100644 src/segment/data_block/forward_reader.rs diff --git a/src/segment/data_block/forward_reader.rs b/src/segment/data_block/forward_reader.rs new file mode 100644 index 00000000..3e55c030 --- /dev/null +++ b/src/segment/data_block/forward_reader.rs @@ -0,0 +1,1427 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{iter::ParsedItem, DataBlock}; +use crate::{segment::util::compare_prefixed_slice, InternalValue, SeqNo}; +use std::io::{Cursor, Seek}; + +// TODO: flatten into main struct +#[derive(Default, Debug)] +struct LoScanner { + offset: usize, + remaining_in_interval: usize, + base_key_offset: Option, +} + +/// Specialized reader to scan an index block only in forwards direction +/// +/// Is less expensive than a double ended iterator. +pub struct ForwardReader<'a> { + block: &'a DataBlock, + restart_interval: usize, + lo_scanner: LoScanner, +} + +impl<'a> ForwardReader<'a> { + #[must_use] + pub fn new(block: &'a DataBlock) -> Self { + let restart_interval = block.restart_interval.into(); + + Self { + block, + + restart_interval, + + lo_scanner: LoScanner::default(), + } + } + + #[must_use] + pub fn offset(&self) -> usize { + self.lo_scanner.offset + } + + /// Reads an item by key from the block, if it exists. + #[must_use] + pub fn point_read(&mut self, needle: &[u8], seqno: Option) -> Option { + let may_exist = self.seek(needle, seqno); + + if !may_exist { + return None; + } + + let bytes = self.block.bytes(); + + for item in &mut *self { + let cmp_result = if let Some(prefix) = &item.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(item.key.0..item.key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(item.key.0..item.key.1) }; + key.cmp(needle) + }; + + match cmp_result { + std::cmp::Ordering::Equal => { + // TODO: maybe return early if past seqno + let should_skip = seqno.is_some_and(|watermark| item.seqno >= watermark); + + if !should_skip { + let kv = item.materialize(&self.block.inner.data); + return Some(kv); + } + } + std::cmp::Ordering::Greater => { + // Already passed needle + return None; + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + } + + None + } + + /// Seeks to the lowest item that is eligible based on the requested + /// needle and seqno. + /// + /// Returns `false` if `next()` can be safely skipped because the item definitely + /// does not exist. 
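In other words, `seek` only positions the reader; the boolean is an early-out so the caller can skip the forward scan entirely when the key is provably absent. A simplified standalone model of that contract, using a sorted slice in place of a real block (assumed shape, not the crate's API):

    // Stand-in for a block reader: sorted (key, seqno) pairs plus a cursor.
    struct Reader<'a> {
        items: &'a [(&'a [u8], u64)],
        pos: usize,
    }

    impl<'a> Reader<'a> {
        // Position the cursor at the first candidate >= needle; returning
        // false means a subsequent scan cannot possibly find the needle.
        fn seek(&mut self, needle: &[u8]) -> bool {
            self.pos = self.items.partition_point(|(k, _)| *k < needle);
            self.items.get(self.pos).is_some_and(|(k, _)| *k == needle)
        }
    }

    fn main() {
        let items: &[(&[u8], u64)] = &[(b"a", 3), (b"b", 2), (b"d", 1)];
        let mut reader = Reader { items, pos: 0 };

        assert!(reader.seek(b"b")); // cursor now at "b"; scan from here
        assert_eq!(1, reader.pos);

        assert!(!reader.seek(b"c")); // "c" is absent: next() can be skipped
    }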
+ pub fn seek(&mut self, needle: &[u8], seqno: Option) -> bool { + let binary_index = self.block.get_binary_index_reader(); + + // NOTE: Try hash index if it exists + if let Some(lookup) = self + .block + .get_hash_index_reader() + .map(|reader| reader.get(needle)) + { + use super::super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; + + match lookup { + Found(bucket_value) => { + let offset = binary_index.get(usize::from(bucket_value)); + self.lo_scanner.offset = offset; + self.linear_probe(needle, seqno); + return true; + } + NotFound => { + return false; + } + Conflicted => { + // NOTE: Fallback to binary search + } + } + } + + let offset = self + .block + .binary_search_for_offset(&binary_index, needle, seqno) + .expect("should work"); + + self.lo_scanner.offset = offset; + + self.linear_probe(needle, seqno) + } + + fn linear_probe(&mut self, needle: &[u8], seqno: Option /* TODO: use */) -> bool { + let bytes = self.block.bytes(); + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(bytes); + + reader + .seek_relative(self.lo_scanner.offset as i64) + .expect("should be in bounds"); + + loop { + let Some(head) = DataBlock::parse_restart_item(&mut reader, 0) else { + return false; + }; + + let cmp_result = { + let key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; + key.cmp(needle) + }; + + match cmp_result { + std::cmp::Ordering::Equal => { + // TODO: return true + return true; + } + std::cmp::Ordering::Greater => { + // Already passed needle + + return false; + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + + let base_key_offset = head.key.0; + self.lo_scanner.base_key_offset = Some(base_key_offset); + + self.lo_scanner.remaining_in_interval = self.restart_interval; + self.lo_scanner.offset = reader.position() as usize; + self.lo_scanner.remaining_in_interval -= 1; + + for _ in 0..(self.restart_interval - 1) { + let Some(head) = DataBlock::parse_truncated_item(&mut reader, 0, base_key_offset) + else { + return false; + }; + + let cmp_result = if let Some(prefix) = &head.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; + key.cmp(needle) + }; + + match cmp_result { + std::cmp::Ordering::Equal => { + return true; + } + std::cmp::Ordering::Greater => { + // Already passed needle + + return false; + } + std::cmp::Ordering::Less => { + // Continue to next KV + } + } + + self.lo_scanner.offset = reader.position() as usize; + self.lo_scanner.remaining_in_interval -= 1; + } + } + } + + fn parse_restart_item( + block: &DataBlock, + offset: &mut usize, + base_key_offset: &mut Option, + ) -> Option { + let bytes = block.bytes(); + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) 
}); + + let item = DataBlock::parse_restart_item(&mut reader, *offset)?; + + *offset += reader.position() as usize; + *base_key_offset = Some(item.key.0); + + Some(item) + } + + fn parse_truncated_item( + block: &DataBlock, + offset: &mut usize, + base_key_offset: usize, + ) -> Option { + let bytes = block.bytes(); + + // SAFETY: The cursor is advanced by read_ operations which check for EOF, + // And the cursor starts at 0 - the slice is never empty + #[warn(unsafe_code)] + let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); + + let item = DataBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; + + *offset += reader.position() as usize; + + Some(item) + } +} + +impl Iterator for ForwardReader<'_> { + type Item = ParsedItem; + + fn next(&mut self) -> Option { + let is_restart = self.lo_scanner.remaining_in_interval == 0; + + let item = if is_restart { + self.lo_scanner.remaining_in_interval = self.restart_interval; + + Self::parse_restart_item( + self.block, + &mut self.lo_scanner.offset, + &mut self.lo_scanner.base_key_offset, + ) + } else { + Self::parse_truncated_item( + self.block, + &mut self.lo_scanner.offset, + self.lo_scanner.base_key_offset.expect("should exist"), + ) + }; + + self.lo_scanner.remaining_in_interval -= 1; + + item + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use crate::{ + segment::{block::Header, Block, BlockOffset, Checksum}, + Slice, + ValueType::{Tombstone, Value}, + }; + use test_log::test; + + #[test] + fn v3_data_block_point_read_one() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + )]; + + let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + let serialized_len = bytes.len(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); + assert_eq!(data_block.inner.size(), serialized_len); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, None), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + "pla:earth:fact", + "eaaaaaaaaarth", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:fact", + "Jupiter is big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:mass", + "Massive", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components( + "pla:jupiter:name", + "Jupiter", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), + InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), + InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), + InternalValue::from_components( + "pla:venus:fact", + "Venus exists", + 0, + crate::ValueType::Value, + ), + InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), + ]; + + for restart_interval in 1..=20 { + let bytes = 
DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in &items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + } + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), + InternalValue::from_components([0], b"", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], [], 5, Value), + InternalValue::from_components([0], [], 4, Tombstone), + InternalValue::from_components([0], [], 3, Value), + InternalValue::from_components([0], [], 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::from([ + 255, 255, 255, 255, 5, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, + ]), + Slice::from([0, 0, 192]), + 18_446_744_073_701_163_007, + Tombstone, + ), + InternalValue::from_components( + Slice::from([255, 255, 255, 255, 255, 255, 0]), + Slice::from([]), + 0, + Value, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 5, 1.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), + ); + + assert_eq!(items, *data_block.iter().collect::>(),); + + Ok(()) + } + + #[test] + fn v3_data_block_fuzz_4() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::new(&[0]), + Slice::new(&[]), + 3_834_029_160_418_063_669, + Value, + ), + 
InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone), + InternalValue::from_components( + Slice::new(&[53, 53, 53]), + Slice::new(&[]), + 18_446_744_073_709_551_615, + Tombstone, + ), + InternalValue::from_components( + Slice::new(&[255]), + Slice::new(&[]), + 18_446_744_069_414_584_831, + Tombstone, + ), + InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 1.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for item in data_block.iter() { + eprintln!("{item:?}"); + } + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), + ); + + Ok(()) + } + + #[test] + fn v3_data_block_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"b", b"b", 2, Value), + InternalValue::from_components(b"c", b"c", 1, Value), + InternalValue::from_components(b"d", b"d", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, None), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_dense_mvcc_with_hash() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + eprintln!("NEEDLE {needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + 
assert_eq!( + Some(items.first().cloned().unwrap()), + data_block.point_read(b"a", None) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(b"b", None) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], None) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + 
InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], Some(SeqNo::MAX)) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], Some(SeqNo::MAX)) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], None) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], None) + ); + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_dense_mvcc_no_hash() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + eprintln!("NEEDLE 
{needle:?}"); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_shadowing() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert!(data_block + .point_read(b"pla:venus:fact", None) + .expect("should exist") + .is_tombstone()); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", None)); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward_one_time() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + Value, + )]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len() + ); + + assert_eq!(data_block.iter().collect::>(), items); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + 
InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), + ); + + assert_eq!(items, *data_block.iter().collect::>(),); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward_dense() -> crate::Result<()> { + let items = [InternalValue::from_components( + "pla:saturn:fact", + "Saturn is pretty big", + 0, + Value, + )]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!(items.len(), { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }); + + assert_eq!(items, *data_block.iter().collect::>(),); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_rev() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!(items.len(), { + #[allow(clippy::suspicious_map)] + data_block.iter().rev().count() + }); + + assert_eq!( + items.into_iter().rev().collect::>(), + data_block.iter().rev().collect::>(), + ); + + Ok(()) + } + + #[test] + fn v3_data_block_iter_ping_pong() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + { + let mut iter = data_block.iter(); + + assert_eq!(b"pla:saturn:fact", 
&*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); + + let last = iter.next().unwrap().key; + assert_eq!(b"pla:venus:fact", &*last.user_key); + assert_eq!(Tombstone, last.value_type); + assert_eq!(1, last.seqno); + } + + { + let mut iter = data_block.iter(); + + assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:saturn:fact", + &*iter + .next() + .inspect(|v| { + eprintln!("{:?}", String::from_utf8_lossy(&v.key.user_key)); + }) + .unwrap() + .key + .user_key + ); + assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); + + let last = iter.next_back().unwrap().key; + assert_eq!(b"pla:venus:fact", &*last.user_key); + assert_eq!(Tombstone, last.value_type); + assert_eq!(1, last.seqno); + } + + Ok(()) + } + + #[test] + fn v3_data_block_range() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.range(&((b"pla:venus:" as &[u8])..)).count() + }, + 3, + ); + + Ok(()) + } + + #[test] + fn v3_data_block_range_rev() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block + .range(&((b"pla:venus:" as &[u8])..)) + .rev() + .count() + }, + 3, + ); + + Ok(()) + } + + #[test] + fn v3_data_block_small_hash_ratio() -> crate::Result<()> { + let items = (0u64..254) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + // NOTE: If >0.0, buckets are at least 1 + let bytes = DataBlock::encode_items(&items, 1, 0.0001)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + 
}, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..254) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().unwrap() > 0); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_too_many_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..255) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_way_too_many_pointers_for_hash_bucket() -> crate::Result<()> { + let items = (0u64..1_000) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_no_hash_index() -> crate::Result<()> { + let items = (0u64..1) + .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) + .collect::>(); + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + ); + } + + Ok(()) + } +} diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 64445696..3a316168 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -2,17 +2,10 @@ // This source code is licensed under both the Apache 
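// The tests above pin down the hash index capacity: 254 restart pointers
// still get a hash index, 255 do not. That is consistent with each bucket
// holding a one-byte pointer into the binary index, with the top two byte
// values reserved as "free"/"conflict" markers; the sketch below builds on
// exactly that assumption (hash function and names are stand-ins):
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

const FREE: u8 = u8::MAX - 1; // bucket never written
const CONFLICT: u8 = u8::MAX; // two or more keys landed in this bucket

fn bucket_of(key: &[u8], bucket_count: usize) -> usize {
    // Any hash works for the sizing argument; bucket_count is assumed >= 1
    // (as the "If >0.0, buckets are at least 1" note above guarantees).
    let mut hasher = DefaultHasher::new();
    key.hash(&mut hasher);
    (hasher.finish() % bucket_count as u64) as usize
}

fn build_hash_index(keys: &[&[u8]], bucket_count: usize) -> Option<Vec<u8>> {
    if keys.len() > usize::from(FREE) {
        return None; // pointer 254 would collide with the FREE marker
    }
    let mut buckets = vec![FREE; bucket_count];
    for (pointer, &key) in keys.iter().enumerate() {
        let slot = &mut buckets[bucket_of(key, bucket_count)];
        *slot = match *slot {
            FREE => pointer as u8,
            _ => CONFLICT, // lookups fall back to binary search
        };
    }
    Some(buckets)
}

#[test]
fn hash_index_caps_out_at_254_pointers() {
    let keys: Vec<Vec<u8>> = (0u64..255).map(|x| x.to_be_bytes().to_vec()).collect();
    let refs: Vec<&[u8]> = keys.iter().map(|k| k.as_slice()).collect();
    assert!(build_hash_index(&refs[..254], 100).is_some());
    assert!(build_hash_index(&refs, 100).is_none());
}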
2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::DataBlock; +use super::{forward_reader::ForwardReader, DataBlock}; use crate::{key::InternalKey, InternalValue, SeqNo, Slice}; use std::io::Cursor; -#[derive(Default, Debug)] -struct LoScanner { - offset: usize, - remaining_in_interval: usize, - base_key_offset: Option, -} - #[derive(Debug)] struct HiScanner { offset: usize, @@ -26,7 +19,7 @@ pub struct Iter<'a> { block: &'a DataBlock, restart_interval: usize, - lo_scanner: LoScanner, + lo_scanner: ForwardReader<'a>, hi_scanner: HiScanner, } @@ -45,6 +38,8 @@ pub struct ParsedItem { impl ParsedItem { pub fn materialize(&self, bytes: &Slice) -> InternalValue { + // NOTE: We consider the prefix and key slice indexes to be trustworthy + #[allow(clippy::indexing_slicing)] let key = if let Some(prefix) = &self.prefix { let prefix_key = &bytes[prefix.0..prefix.1]; let rest_key = &bytes[self.key.0..self.key.1]; @@ -55,6 +50,8 @@ impl ParsedItem { let key = InternalKey::new( key, self.seqno, + // NOTE: Value type is (or should be) checked when reading it + #[allow(clippy::expect_used)] self.value_type.try_into().expect("should work"), ); @@ -77,8 +74,9 @@ impl<'a> Iter<'a> { restart_interval, - lo_scanner: LoScanner::default(), + lo_scanner: ForwardReader::new(block), + /* lo_scanner: LoScanner::default(), */ hi_scanner: HiScanner { offset: 0, ptr_idx: binary_index_len, @@ -88,10 +86,10 @@ impl<'a> Iter<'a> { } } - pub fn with_offset(mut self, offset: usize) -> Self { + /* pub fn with_offset(mut self, offset: usize) -> Self { self.lo_scanner.offset = offset; self - } + } */ fn parse_restart_item( block: &DataBlock, @@ -134,7 +132,7 @@ impl<'a> Iter<'a> { fn consume_stack_top(&mut self) -> Option { if let Some(offset) = self.hi_scanner.stack.pop() { - if self.lo_scanner.offset > 0 && offset < self.lo_scanner.offset { + if self.lo_scanner.offset() > 0 && offset < self.lo_scanner.offset() { return None; } @@ -166,12 +164,12 @@ impl Iterator for Iter<'_> { fn next(&mut self) -> Option { if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset >= self.hi_scanner.offset + && self.lo_scanner.offset() >= self.hi_scanner.offset { return None; } - let is_restart = self.lo_scanner.remaining_in_interval == 0; + /* let is_restart = self.lo_scanner.remaining_in_interval == 0; let item = if is_restart { self.lo_scanner.remaining_in_interval = self.restart_interval; @@ -189,10 +187,12 @@ impl Iterator for Iter<'_> { ) }; - self.lo_scanner.remaining_in_interval -= 1; + self.lo_scanner.remaining_in_interval -= 1; */ + + let item = self.lo_scanner.next(); if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset >= self.hi_scanner.offset + && self.lo_scanner.offset() >= self.hi_scanner.offset { return None; } @@ -253,3 +253,142 @@ impl DoubleEndedIterator for Iter<'_> { self.consume_stack_top() } } + +#[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used)] +mod tests { + use super::*; + use crate::{ + segment::{ + block::{BlockOffset, Checksum, Header}, + Block, + }, + InternalValue, + ValueType::Value, + }; + use test_log::test; + + #[test] + fn v3_data_block_consume_last_back() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + 
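// The `LoScanner` deleted above (its logic now lives in `ForwardReader`)
// walks the block in restart-interval cycles: every interval starts with a
// fully stored restart key, and the following items carry only
// (shared_prefix_len, suffix) against that restart key. A simplified
// in-memory model of the forward pass (a sketch, not the real decoder):
enum Item<'a> {
    Restart(&'a [u8]),                           // key stored verbatim
    Truncated { shared: usize, rest: &'a [u8] }, // prefix-truncated key
}

fn decode_keys(items: &[Item<'_>]) -> Vec<Vec<u8>> {
    let mut base: &[u8] = &[];
    let mut out = Vec::new();
    for item in items {
        match *item {
            Item::Restart(key) => {
                base = key; // base for the rest of the interval
                out.push(key.to_vec());
            }
            Item::Truncated { shared, rest } => {
                let mut key = base[..shared].to_vec();
                key.extend_from_slice(rest);
                out.push(key);
            }
        }
    }
    out
}

#[test]
fn prefix_truncated_keys_roundtrip() {
    let items = [
        Item::Restart(b"pla:saturn:fact"),
        Item::Truncated { shared: 11, rest: b"name" },
        Item::Restart(b"pla:venus:fact"),
        Item::Truncated { shared: 10, rest: b"name" },
    ];
    assert_eq!(
        vec![
            b"pla:saturn:fact".to_vec(),
            b"pla:saturn:name".to_vec(),
            b"pla:venus:fact".to_vec(),
            b"pla:venus:name".to_vec(),
        ],
        decode_keys(&items),
    );
}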
InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + { + let mut iter = data_block.iter(); + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next_back().unwrap().key.user_key + ); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + { + let mut iter = data_block.iter(); + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next_back().unwrap().key.user_key + ); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + Ok(()) + } + + #[test] + fn v3_data_block_consume_last_forwards() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + { + let mut iter = data_block.iter().rev(); + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + { + let mut iter = data_block.iter().rev(); + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + Ok(()) + } +} diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 2824467f..e894e9f6 100644 --- a/src/segment/data_block/mod.rs +++ 
b/src/segment/data_block/mod.rs @@ -2,6 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +pub mod forward_reader; mod iter; pub use iter::Iter; @@ -11,10 +12,10 @@ use super::block::{ Encodable, Encoder, Trailer, TRAILER_START_MARKER, }; use crate::clipping_iter::ClippingIter; -use crate::segment::util::compare_prefixed_slice; use crate::{InternalValue, SeqNo, ValueType}; use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; +use forward_reader::ForwardReader; use iter::{ParsedItem, ParsedSlice}; use std::io::Seek; use std::ops::RangeBounds; @@ -99,9 +100,9 @@ impl Encodable<()> for InternalValue { macro_rules! unwrappy { ($x:expr) => { - // $x.expect("should read") + $x.expect("should read") - unsafe { $x.unwrap_unchecked() } + // unsafe { $x.unwrap_unchecked() } }; } @@ -184,7 +185,7 @@ impl DataBlock { ClippingIter::new( Iter::new(self) - .with_offset(offset) + // .with_offset(offset) // TODO: .map(|kv| kv.materialize(&self.inner.data)), range, ) @@ -468,101 +469,10 @@ impl DataBlock { }) } - fn scan(&self, needle: &[u8], seqno: Option, offset: usize) -> Option { - let bytes = self.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(offset..) }); - - loop { - let head = Self::parse_restart_item(&mut reader, offset)?; - - let key = &bytes[head.key.0..head.key.1]; - let base_key_offset = head.key.0; - - match key.cmp(needle) { - std::cmp::Ordering::Equal => { - // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| head.seqno >= watermark); - - if !should_skip { - let kv = head.materialize(&self.inner.data); - return Some(kv); - } - } - std::cmp::Ordering::Greater => { - // Already passed needle - return None; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - - for _ in 0..(self.restart_interval - 1) { - let kv = Self::parse_truncated_item(&mut reader, offset, base_key_offset)?; - - let cmp_result = if let Some(prefix) = &kv.prefix { - let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; - let rest_key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; - compare_prefixed_slice(prefix, rest_key, needle) - } else { - let key = unsafe { bytes.get_unchecked(kv.key.0..kv.key.1) }; - key.cmp(needle) - }; - - match cmp_result { - std::cmp::Ordering::Equal => { - // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| kv.seqno >= watermark); - - if !should_skip { - let kv = kv.materialize(&self.inner.data); - return Some(kv); - } - } - std::cmp::Ordering::Greater => { - // Already passed needle - return None; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - } - } - } - - /// Reads an item by key from the block, if it exists. 
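// The `unwrappy!` flip above swaps the release-oriented `unwrap_unchecked`
// for a checked `expect` while this code is being reworked. A sketch of one
// way to keep both behaviours without editing the macro again later (this
// is not what the patch does; it simply gates on debug assertions):
macro_rules! unwrappy {
    ($x:expr) => {{
        if cfg!(debug_assertions) {
            $x.expect("should read")
        } else {
            // SAFETY: sound only while the encoder guarantees these reads
            // cannot fail, which is the premise of the commented-out
            // `unwrap_unchecked` variant.
            unsafe { $x.unwrap_unchecked() }
        }
    }};
}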
+ #[must_use] pub fn point_read(&self, needle: &[u8], seqno: Option) -> Option { - let binary_index = self.get_binary_index_reader(); - - // NOTE: Try hash index if it exists - if let Some(lookup) = self - .get_hash_index_reader() - .map(|reader| reader.get(needle)) - { - use super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; - - match lookup { - Found(bucket_value) => { - let offset = binary_index.get(usize::from(bucket_value)); - return self.scan(needle, seqno, offset); - } - NotFound => { - return None; - } - Conflicted => { - // NOTE: Fallback to binary search - } - } - } - - let offset = self.binary_search_for_offset(&binary_index, needle, seqno)?; - - self.scan(needle, seqno, offset) + let mut reader = ForwardReader::new(self); + reader.point_read(needle, seqno) } pub fn encode_items( @@ -590,1299 +500,3 @@ impl DataBlock { serializer.finish() } } - -#[cfg(test)] -#[allow(clippy::expect_used, clippy::unwrap_used)] -mod tests { - use super::*; - use crate::{ - segment::{ - block::{BlockOffset, Checksum, Header}, - Block, - }, - InternalValue, Slice, - ValueType::{Tombstone, Value}, - }; - use std::cmp::Ordering::{Equal, Greater, Less}; - use test_log::test; - - #[test] - fn v3_compare_prefixed_slice() { - assert_eq!(Equal, compare_prefixed_slice(b"", b"", b"")); - - assert_eq!(Greater, compare_prefixed_slice(b"a", b"", b"")); - assert_eq!(Greater, compare_prefixed_slice(b"", b"a", b"")); - assert_eq!(Greater, compare_prefixed_slice(b"a", b"a", b"")); - assert_eq!(Greater, compare_prefixed_slice(b"b", b"a", b"a")); - assert_eq!(Greater, compare_prefixed_slice(b"a", b"b", b"a")); - - assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"y")); - assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); - assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); - assert_eq!(Less, compare_prefixed_slice(b"yyyy", b"a", b"yyyyb")); - assert_eq!(Less, compare_prefixed_slice(b"yyy", b"b", b"yyyyb")); - } - - #[test] - fn v3_data_block_point_read_one() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - )]; - - let bytes = DataBlock::encode_items(&items, 16, 0.0)?; - let serialized_len = bytes.len(); - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); - assert_eq!(data_block.inner.size(), serialized_len); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:fact", - "Jupiter is big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:mass", - "Massive", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:name", - "Jupiter", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - 
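// The removed `point_read` body above layered three lookups: consult the
// optional hash index first; a Found bucket jumps straight to one restart
// group, NotFound ends the read, and Conflicted falls back to binary search
// over the restart pointers; either way a short linear scan finishes the
// job. A self-contained miniature of that cascade, with the block reduced
// to sorted restart groups of keys (illustrative only):
enum Lookup {
    Found(usize), // index of a restart group
    NotFound,
    Conflicted,
}

fn point_read<'a>(
    groups: &'a [Vec<&'a [u8]>], // sorted by each group's first key
    hash_lookup: impl Fn(&[u8]) -> Lookup,
    needle: &[u8],
) -> Option<&'a [u8]> {
    let group_idx = match hash_lookup(needle) {
        Lookup::Found(idx) => idx,
        Lookup::NotFound => return None, // hash index misses are definitive
        Lookup::Conflicted => {
            // Binary search: highest group whose first key is <= needle
            let idx = groups.partition_point(|g| {
                g.first().is_some_and(|first| *first <= needle)
            });
            idx.checked_sub(1)?
        }
    };
    // Linear scan within the (small) restart group
    groups.get(group_idx)?.iter().find(|key| **key == needle).copied()
}

#[test]
fn conflicted_bucket_falls_back_to_binary_search() {
    let groups: Vec<Vec<&[u8]>> = vec![vec![b"a", b"b"], vec![b"c", b"d"]];
    assert_eq!(
        Some(b"d" as &[u8]),
        point_read(&groups, |_| Lookup::Conflicted, b"d")
    );
    assert_eq!(None, point_read(&groups, |_| Lookup::NotFound, b"d"));
}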
InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), - ]; - - for restart_interval in 1..=20 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in &items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - } - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_1() -> crate::Result<()> { - let items = [ - InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), - InternalValue::from_components([0], b"", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_2() -> crate::Result<()> { - let items = [ - InternalValue::from_components([0], [], 5, Value), - InternalValue::from_components([0], [], 4, Tombstone), - InternalValue::from_components([0], [], 3, Value), - InternalValue::from_components([0], [], 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_3() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - Slice::from([ - 255, 255, 255, 255, 5, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, - ]), - Slice::from([0, 0, 192]), - 18_446_744_073_701_163_007, - Tombstone, - ), - InternalValue::from_components( - Slice::from([255, 255, 255, 255, 255, 255, 0]), - Slice::from([]), - 0, - Value, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 5, 1.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - 
assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_4() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - Slice::new(&[0]), - Slice::new(&[]), - 3_834_029_160_418_063_669, - Value, - ), - InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone), - InternalValue::from_components( - Slice::new(&[53, 53, 53]), - Slice::new(&[]), - 18_446_744_073_709_551_615, - Tombstone, - ), - InternalValue::from_components( - Slice::new(&[255]), - Slice::new(&[]), - 18_446_744_069_414_584_831, - Tombstone, - ), - InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 1.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for item in data_block.iter() { - eprintln!("{item:?}"); - } - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - Ok(()) - } - - #[test] - fn v3_data_block_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"b", b"b", 2, Value), - InternalValue::from_components(b"c", b"c", 1, Value), - InternalValue::from_components(b"d", b"d", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_dense_mvcc_with_hash() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - InternalValue::from_components(b"b", b"b", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - 
InternalValue::from_components(b"b", b"b", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - Some(items.first().cloned().unwrap()), - data_block.point_read(b"a", None) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(b"b", None) - ); - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_1() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None) - ); - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_2() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], None) - ); - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_3() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - 
InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], Some(SeqNo::MAX)) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], Some(SeqNo::MAX)) - ); - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], None) - ); - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_dense_mvcc_no_hash() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - InternalValue::from_components(b"b", b"b", 65, 
Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_shadowing() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert!(data_block - .point_read(b"pla:venus:fact", None) - .expect("should exist") - .is_tombstone()); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), - InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), - InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), - InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", None)); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward_one_time() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - Value, - )]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - 
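// The "pla:*" keys in these tests are what makes prefix truncation pay off:
// within a restart interval only the restart key is stored in full, and
// every other key stores (shared_prefix_len, suffix) against it, computed
// with the same `longest_shared_prefix_length` helper kept in
// `segment/util.rs`. An encoding-side sketch for a single interval:
fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize {
    s1.iter().zip(s2.iter()).take_while(|(a, b)| a == b).count()
}

/// keys[0] is the restart key and is stored verbatim (shared == 0)
fn encode_interval<'a>(keys: &[&'a [u8]]) -> Vec<(usize, &'a [u8])> {
    let mut out = Vec::with_capacity(keys.len());
    let mut base: &[u8] = &[];
    for (i, &key) in keys.iter().enumerate() {
        if i == 0 {
            base = key;
            out.push((0, key));
        } else {
            let shared = longest_shared_prefix_length(base, key);
            out.push((shared, &key[shared..]));
        }
    }
    out
}

#[test]
fn suffixes_shrink_under_a_common_prefix() {
    let encoded = encode_interval(&[
        b"pla:jupiter:fact",
        b"pla:jupiter:mass",
        b"pla:jupiter:name",
    ]);
    assert_eq!(
        vec![
            (0, b"pla:jupiter:fact" as &[u8]),
            (12, b"mass" as &[u8]),
            (12, b"name" as &[u8]),
        ],
        encoded,
    );
}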
assert_eq!(data_block.len(), items.len()); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len() - ); - - assert_eq!(data_block.iter().collect::>(), items); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward_dense() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - Value, - )]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!(items.len(), { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_rev() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!(items.len(), { - #[allow(clippy::suspicious_map)] - data_block.iter().rev().count() - }); - - assert_eq!( - items.into_iter().rev().collect::>(), - data_block.iter().rev().collect::>(), - ); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_ping_pong() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = 
DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - { - let mut iter = data_block.iter(); - - assert_eq!(b"pla:saturn:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); - - let last = iter.next().unwrap().key; - assert_eq!(b"pla:venus:fact", &*last.user_key); - assert_eq!(Tombstone, last.value_type); - assert_eq!(1, last.seqno); - } - - { - let mut iter = data_block.iter(); - - assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:saturn:fact", - &*iter - .next() - .inspect(|v| { - eprintln!("{:?}", String::from_utf8_lossy(&v.key.user_key)); - }) - .unwrap() - .key - .user_key - ); - assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); - - let last = iter.next_back().unwrap().key; - assert_eq!(b"pla:venus:fact", &*last.user_key); - assert_eq!(Tombstone, last.value_type); - assert_eq!(1, last.seqno); - } - - Ok(()) - } - - #[test] - fn v3_data_block_range() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.range(&((b"pla:venus:" as &[u8])..)).count() - }, - 3, - ); - - Ok(()) - } - - #[test] - fn v3_data_block_range_rev() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block - .range(&((b"pla:venus:" as &[u8])..)) - .rev() - .count() - }, - 3, - ); - - Ok(()) - } - - #[test] - fn v3_data_block_small_hash_ratio() -> crate::Result<()> { - let items = (0u64..254) - .map(|x| 
InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - // NOTE: If >0.0, buckets are at least 1 - let bytes = DataBlock::encode_items(&items, 1, 0.0001)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { - let items = (0u64..254) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_too_many_pointers_for_hash_bucket() -> crate::Result<()> { - let items = (0u64..255) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_way_too_many_pointers_for_hash_bucket() -> crate::Result<()> { - let items = (0u64..1_000) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_no_hash_index() -> crate::Result<()> { - let items = (0u64..1) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in 
items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_consume_last_back() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), - InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), - InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), - InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - { - let mut iter = data_block.iter(); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:radius", - &*iter.next_back().unwrap().key.user_key - ); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } - - { - let mut iter = data_block.iter(); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:radius", - &*iter.next_back().unwrap().key.user_key - ); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - } - - Ok(()) - } - - #[test] - fn v3_data_block_consume_last_forwards() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), - InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), - InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), - InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - { - let mut iter = data_block.iter().rev(); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:fact", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - } - - { - let mut iter = data_block.iter().rev(); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:fact", - 
&*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } - - Ok(()) - } -} diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 331dfcd3..f75f7ab0 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -22,14 +22,17 @@ pub struct BlockHandle { } impl BlockHandle { + #[must_use] pub fn new(offset: BlockOffset, size: u32) -> Self { Self { offset, size } } + #[must_use] pub fn size(&self) -> u32 { self.size } + #[must_use] pub fn offset(&self) -> BlockOffset { self.offset } diff --git a/src/segment/index_block/forward_reader.rs b/src/segment/index_block/forward_reader.rs index 88744692..9a71c90c 100644 --- a/src/segment/index_block/forward_reader.rs +++ b/src/segment/index_block/forward_reader.rs @@ -37,6 +37,8 @@ pub struct ParsedItem { impl ParsedItem { pub fn materialize(&self, bytes: &Slice) -> KeyedBlockHandle { + // NOTE: We consider the prefix and key slice indexes to be trustworthy + #[allow(clippy::indexing_slicing)] let end_key = if let Some(prefix) = &self.prefix { let prefix_key = &bytes[prefix.0..prefix.1]; let rest_key = &bytes[self.end_key.0..self.end_key.1]; diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index e4b4dd1a..59d583ac 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -6,7 +6,6 @@ mod block_handle; mod forward_reader; pub use block_handle::{BlockHandle, KeyedBlockHandle}; -use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; use super::{ block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer}, @@ -14,14 +13,15 @@ use super::{ }; use crate::segment::block::TRAILER_START_MARKER; use byteorder::{LittleEndian, ReadBytesExt}; +use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; use std::io::{Cursor, Seek}; use varint_rs::VarintReader; macro_rules! 
unwrappy { ($x:expr) => { - // $x.expect("should read") + $x.expect("should read") - unsafe { $x.unwrap_unchecked() } + // unsafe { $x.unwrap_unchecked() } }; } diff --git a/src/segment/util.rs b/src/segment/util.rs index 3ee25cc2..79e3a6b0 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -2,8 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use std::cmp::Ordering; - pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { s1.iter() .zip(s2.iter()) @@ -12,37 +10,35 @@ pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { } // TODO: Fuzz test -pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> Ordering { +pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> std::cmp::Ordering { + use std::cmp::Ordering::{Equal, Greater, Less}; + if needle.is_empty() { let combined_len = prefix.len() + suffix.len(); - return if combined_len > 0 { - Ordering::Greater - } else { - Ordering::Equal - }; + return if combined_len > 0 { Greater } else { Equal }; } match prefix.len().cmp(&needle.len()) { - Ordering::Equal => match prefix.cmp(needle) { - Ordering::Equal => {} + Equal => match prefix.cmp(needle) { + Equal => {} ordering => return ordering, }, - Ordering::Greater => { + Greater => { // SAFETY: We know that the prefix is longer than the needle, so we can safely // truncate it to the needle's length #[allow(unsafe_code)] let prefix = unsafe { prefix.get_unchecked(0..needle.len()) }; return prefix.cmp(needle); } - Ordering::Less => { + Less => { // SAFETY: We know that the needle is longer than the prefix, so we can safely // truncate it to the prefix's length #[allow(unsafe_code)] let needle = unsafe { needle.get_unchecked(0..prefix.len()) }; match prefix.cmp(needle) { - Ordering::Equal => {} + Equal => {} ordering => return ordering, } } @@ -54,3 +50,29 @@ pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> Or let needle = unsafe { needle.get_unchecked(prefix.len()..) 
}; suffix.cmp(needle) } + +#[cfg(test)] +#[allow(clippy::expect_used, clippy::unwrap_used)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn v3_compare_prefixed_slice() { + use std::cmp::Ordering::{Equal, Greater, Less}; + + assert_eq!(Equal, compare_prefixed_slice(b"", b"", b"")); + + assert_eq!(Greater, compare_prefixed_slice(b"a", b"", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"", b"a", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"a", b"a", b"")); + assert_eq!(Greater, compare_prefixed_slice(b"b", b"a", b"a")); + assert_eq!(Greater, compare_prefixed_slice(b"a", b"b", b"a")); + + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"y")); + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); + assert_eq!(Less, compare_prefixed_slice(b"a", b"", b"yyy")); + assert_eq!(Less, compare_prefixed_slice(b"yyyy", b"a", b"yyyyb")); + assert_eq!(Less, compare_prefixed_slice(b"yyy", b"b", b"yyyyb")); + } +} From 71f42a8e80f38b803afc227f24428df78f8f21a2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:04:25 +0200 Subject: [PATCH 109/613] drop guardian --- Cargo.toml | 1 - src/tree/mod.rs | 30 ++++++++++++------------------ 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9b051d1f..bbe27064 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,6 @@ byteview = "0.6.1" crossbeam-skiplist = "0.1.3" double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" -guardian = "1.1.0" interval-heap = "0.0.5" log = "0.4.22" lz4_flex = { version = "0.11.3", optional = true, default-features = false } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 2c12deed..4f8beb9e 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -715,30 +715,24 @@ impl Tree { let bounds: (Bound, Bound) = (lo, hi); - log::trace!("range read: acquiring levels manifest read lock"); // NOTE: Mind lock order L -> M -> S - let level_manifest = - guardian::ArcRwLockReadGuardian::take(self.levels.clone()).expect("lock is poisoned"); - log::trace!("range read: acquired level manifest read lock"); + log::trace!("range read: acquiring read locks"); - log::trace!("range read: acquiring active memtable read lock"); - let active = guardian::ArcRwLockReadGuardian::take(self.active_memtable.clone()) - .expect("lock is poisoned"); - log::trace!("range read: acquired active memtable read lock"); + let level_manifest = self.levels.read().expect("lock is poisoned"); - log::trace!("range read: acquiring sealed memtable read lock"); - let sealed = guardian::ArcRwLockReadGuardian::take(self.sealed_memtables.clone()) - .expect("lock is poisoned"); - log::trace!("range read: acquired sealed memtable read lock"); + let iter_state = { + let active = self.active_memtable.read().expect("lock is poisoned"); + let sealed = &self.sealed_memtables.read().expect("lock is poisoned"); - let iter_state = IterState { - active: active.clone(), - sealed: sealed.iter().map(|(_, mt)| mt.clone()).collect(), - ephemeral, - levels: level_manifest.levels.clone(), + IterState { + active: active.clone(), + sealed: sealed.iter().map(|(_, mt)| mt.clone()).collect(), + ephemeral, + levels: level_manifest.levels.clone(), + } }; - TreeIter::create_range(iter_state, bounds, seqno, level_manifest) + TreeIter::create_range(iter_state, bounds, seqno, &level_manifest) } #[doc(hidden)] From b8698fe3347afaa8e0bf3e05013263cc4bcc13a5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:04:51 +0200 Subject: [PATCH 110/613] refactor --- src/compression.rs | 2 +- src/range.rs | 3 
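// compare_prefixed_slice (still marked "TODO: Fuzz test" above) must order
// the logical key `prefix ++ suffix` against `needle` without concatenating.
// The allocating comparison is the natural oracle; a deterministic
// mini-fuzzer over short byte strings, assuming it sits next to the
// function above so that `compare_prefixed_slice` is in scope:
fn oracle(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> std::cmp::Ordering {
    [prefix, suffix].concat().as_slice().cmp(needle)
}

#[test]
fn compare_prefixed_slice_matches_the_allocating_oracle() {
    let alphabet: &[&[u8]] = &[b"", b"a", b"b", b"ab", b"yyy", b"\xff"];
    for prefix in alphabet {
        for suffix in alphabet {
            for needle in alphabet {
                assert_eq!(
                    oracle(prefix, suffix, needle),
                    compare_prefixed_slice(prefix, suffix, needle),
                    "prefix={prefix:?} suffix={suffix:?} needle={needle:?}"
                );
            }
        }
    }
}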
+-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/compression.rs b/src/compression.rs index eac0358a..09a8237b 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -56,7 +56,7 @@ impl Encode for CompressionType { writer.write_u8(2)?; writer.write_u8(*level)?; } - }; + } Ok(()) } diff --git a/src/range.rs b/src/range.rs index d0191e8f..94d2f299 100644 --- a/src/range.rs +++ b/src/range.rs @@ -11,7 +11,6 @@ use crate::{ value::{SeqNo, UserKey}, InternalValue, }; -use guardian::ArcRwLockReadGuardian; use self_cell::self_cell; use std::{ops::Bound, sync::Arc}; @@ -139,7 +138,7 @@ impl TreeIter { guard: IterState, bounds: (Bound, Bound), seqno: Option, - level_manifest: ArcRwLockReadGuardian, + level_manifest: &LevelManifest, ) -> Self { todo!() From e596175d9e33278ff4d916312b6ffeada7c034c3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:04:55 +0200 Subject: [PATCH 111/613] doc --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0130231d..1b97ccb4 100644 --- a/README.md +++ b/README.md @@ -21,15 +21,15 @@ This is the most feature-rich LSM-tree implementation in Rust! It features: - Thread-safe BTreeMap-like API - Mostly [safe](./UNSAFE.md) & 100% stable Rust -- Block-based tables with compression support - - Optional hash indexes in blocks for faster point lookups [[3]](#footnotes) +- Block-based tables with compression support & prefix truncation + - Optional block hash indexes in blocks for faster point lookups [[3]](#footnotes) - Range & prefix searching with forward and reverse iteration -- Size-tiered, (concurrent) Leveled and FIFO compaction -- Multi-threaded flushing (immutable/sealed memtables) -- Optionally partitioned block index to reduce memory footprint and keep startup time short [[1]](#footnotes) - Block caching to keep hot data in memory -- Bloom filters to increase point lookup performance +- AMQ filters (currently Bloom filters) to increase point lookup performance - Snapshots (MVCC) +- Optionally partitioned block index & filters for better cache efficiency [[1]](#footnotes) +- Size-tiered, (concurrent) Leveled and FIFO compaction +- Multi-threaded flushing (immutable/sealed memtables) - Key-value separation (optional) [[2]](#footnotes) - Single deletion tombstones ("weak" deletion) From b6899da4f0eafedd469faf1e4182644362893596 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:06:18 +0200 Subject: [PATCH 112/613] rename structs --- src/segment/block_index/mod.rs | 16 ++++++++-------- src/segment/mod.rs | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 252cda5e..4070ceb1 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -5,7 +5,7 @@ use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; #[enum_dispatch::enum_dispatch] -pub trait NewBlockIndex { +pub trait BlockIndex { /// Gets the lowest block handle that can possibly contain the given item. fn get_lowest_block_containing_key( &self, @@ -42,19 +42,19 @@ pub trait NewBlockIndex { /// found by finding the highest block that has a lower or equal end key than the searched key (by performing in-memory binary search). /// In the diagram above, searching for 'J' yields the block starting with 'G'. /// 'J' must be in that block, because the next block starts with 'M'). 
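The lookup rule in the index doc comment above can be modeled in a few lines. Assuming one handle per data block, sorted by each block's end key (a simplification of the real `KeyedBlockHandle`):

```rust
// Searching the dense TLI: the candidate block for `key` is the first handle
// whose end key is >= key. For blocks ending at "F", "L", "Z", the needle "J"
// selects index 1 -- the block starting with "G" in the diagram above.
fn block_index_for_key(end_keys: &[&[u8]], key: &[u8]) -> Option<usize> {
    let idx = end_keys.partition_point(|end| *end < key);
    (idx < end_keys.len()).then_some(idx)
}

fn main() {
    let end_keys: Vec<&[u8]> = vec![b"F", b"L", b"Z"];
    assert_eq!(Some(1), block_index_for_key(&end_keys, b"J"));
}
```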
-#[enum_dispatch::enum_dispatch(NewBlockIndex)] +#[enum_dispatch::enum_dispatch(BlockIndex)] #[allow(clippy::module_name_repetitions)] -pub enum NewBlockIndexImpl { - Full(NewFullBlockIndex), +pub enum BlockIndexImpl { + Full(FullBlockIndex), // TwoLevel(TwoLevelBlockIndex), } /// Index that translates item keys to data block handles /// /// The index is fully loaded into memory. -pub struct NewFullBlockIndex(IndexBlock); +pub struct FullBlockIndex(IndexBlock); -impl NewFullBlockIndex { +impl FullBlockIndex { pub fn new(block: IndexBlock) -> Self { Self(block) } @@ -67,7 +67,7 @@ impl NewFullBlockIndex { } } -impl NewBlockIndex for NewFullBlockIndex { +impl BlockIndex for FullBlockIndex { fn get_last_block_containing_key( &self, key: &[u8], @@ -97,7 +97,7 @@ impl NewBlockIndex for NewFullBlockIndex { } } */ -/* impl NewFullBlockIndex { +/* impl FullBlockIndex { /* pub fn from_file( path: &Path, metadata: &crate::segment::meta::Metadata, diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d93c809e..87c99f58 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -26,7 +26,7 @@ pub use writer::Writer; use crate::{ cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, }; -use block_index::{NewBlockIndex, NewBlockIndexImpl, NewFullBlockIndex}; +use block_index::{BlockIndex, BlockIndexImpl, FullBlockIndex}; use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; use inner::Inner; use meta::ParsedMeta; @@ -192,7 +192,7 @@ impl Segment { return Ok(block.point_read(key, None)); } Some(seqno) => { - let NewBlockIndexImpl::Full(block_index) = &*self.block_index else { + let BlockIndexImpl::Full(block_index) = &*self.block_index else { todo!(); }; @@ -322,7 +322,7 @@ impl Segment { // BlockIndexImpl::TwoLevel(tli_block, todo!()) } else { log::debug!("Creating full block index, with tli_ptr={:?}", trailer.tli); - NewBlockIndexImpl::Full(NewFullBlockIndex::new(tli_block)) + BlockIndexImpl::Full(FullBlockIndex::new(tli_block)) }; /* let block_index = if use_full_block_index { From 97a64e96ba0c0fd19aedb4a6da4cc6dac1e8da89 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 5 May 2025 22:08:01 +0200 Subject: [PATCH 113/613] fix --- src/segment/inner.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/inner.rs b/src/segment/inner.rs index ccd4bd88..fcb11dea 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{ - block_index::NewBlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, + block_index::BlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, trailer::Trailer, }; use crate::{ @@ -30,7 +30,7 @@ pub struct Inner { /// Translates key (first item of a block) to block offset (address inside file) and (compressed) size #[doc(hidden)] - pub block_index: Arc, + pub block_index: Arc, /// Block cache /// From a87060b1db59b1d219965fae44facf7a7aa3634a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 6 May 2025 01:27:42 +0200 Subject: [PATCH 114/613] add indirection for trailer->block handles also bloom filter unpinning --- README.md | 1 + src/abstract.rs | 10 +- src/blob_tree/mod.rs | 4 +- src/compaction/worker.rs | 1 + src/segment/index_block/block_handle.rs | 1 - src/segment/inner.rs | 8 +- src/segment/meta.rs | 9 +- src/segment/mod.rs | 245 +++++++++++++++++++----- src/segment/regions.rs | 107 +++++++++++ src/segment/trailer.rs | 148 ++++---------- src/segment/writer/mod.rs | 43 
+++-- src/tree/ingest.rs | 3 +- src/tree/mod.rs | 9 +- 13 files changed, 388 insertions(+), 201 deletions(-) create mode 100644 src/segment/regions.rs diff --git a/README.md b/README.md index 1b97ccb4..42c05254 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ This is the most feature-rich LSM-tree implementation in Rust! It features: - Mostly [safe](./UNSAFE.md) & 100% stable Rust - Block-based tables with compression support & prefix truncation - Optional block hash indexes in blocks for faster point lookups [[3]](#footnotes) + - Per-level filter/index block pinning configuration - Range & prefix searching with forward and reverse iteration - Block caching to keep hot data in memory - AMQ filters (currently Bloom filters) to increase point lookup performance diff --git a/src/abstract.rs b/src/abstract.rs index fc6b475e..a7237daf 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -3,9 +3,9 @@ // (found in the LICENSE-* files in the repository) use crate::{ - compaction::CompactionStrategy, config::TreeType, segment::Segment, - tree::inner::MemtableId, AnyTree, BlobTree, Config, KvPair, Memtable, SegmentId, SeqNo, - Snapshot, Tree, UserKey, UserValue, + compaction::CompactionStrategy, config::TreeType, segment::Segment, tree::inner::MemtableId, + AnyTree, BlobTree, Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, Tree, UserKey, + UserValue, }; use enum_dispatch::enum_dispatch; use std::{ @@ -42,8 +42,8 @@ pub trait AbstractTree { /// Will return `Err` if an IO error occurs. fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()>; - /// Gets the memory usage of all bloom filters in the tree. - fn bloom_filter_size(&self) -> usize; + /// Gets the memory usage of all pinned bloom filters in the tree. + fn pinned_bloom_filter_size(&self) -> usize; // TODO:? 
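The rename is deliberate: once filters may be left unpinned, a segment's filter no longer necessarily occupies resident memory, so the old name overstated what was measured. Only pinned filter blocks count; an unpinned filter is fetched through the block cache on demand. Mirroring the `Tree` implementation further down (sketch only):

```rust
// Sum only resident (pinned) filter blocks; unpinned filters contribute 0.
fn pinned_bloom_filter_size(segments: &[Segment]) -> usize {
    segments.iter().map(Segment::pinned_bloom_filter_size).sum()
}
```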
/* #[doc(hidden)] diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 97b3a2cf..d08134d3 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -331,8 +331,8 @@ impl AbstractTree for BlobTree { })) } - fn bloom_filter_size(&self) -> usize { - self.index.bloom_filter_size() + fn pinned_bloom_filter_size(&self) -> usize { + self.index.pinned_bloom_filter_size() } fn sealed_memtable_count(&self) -> usize { diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 32bde052..404d3b32 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -359,6 +359,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), + true, // TODO: look at configuration ) /* let segment_id = trailer.metadata.id; diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index f75f7ab0..8dd20729 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -22,7 +22,6 @@ pub struct BlockHandle { } impl BlockHandle { - #[must_use] pub fn new(offset: BlockOffset, size: u32) -> Self { Self { offset, size } } diff --git a/src/segment/inner.rs b/src/segment/inner.rs index fcb11dea..5b4ada30 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -4,7 +4,7 @@ use super::{ block_index::BlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, - trailer::Trailer, + regions::ParsedRegions, }; use crate::{ cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, @@ -22,11 +22,13 @@ pub struct Inner { #[doc(hidden)] pub descriptor_table: Arc, - /// Segment metadata object + /// Parsed metadata #[doc(hidden)] pub metadata: ParsedMeta, - pub(crate) trailer: Trailer, // TODO: remove...? + /// Parsed region block handles + #[doc(hidden)] + pub regions: ParsedRegions, /// Translates key (first item of a block) to block offset (address inside file) and (compressed) size #[doc(hidden)] diff --git a/src/segment/meta.rs b/src/segment/meta.rs index f3bb818e..cd71a5cc 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -2,12 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{trailer::Trailer, Block, DataBlock}; +use super::{Block, BlockHandle, DataBlock}; use crate::{coding::Decode, CompressionType, KeyRange, SegmentId, SeqNo}; use byteorder::{LittleEndian, ReadBytesExt}; use std::{fs::File, ops::Deref}; -/// Nano-second timestamp. +/// Nanosecond timestamp. 
pub struct Timestamp(u128); impl Deref for Timestamp { @@ -45,9 +45,8 @@ pub struct ParsedMeta { impl ParsedMeta { #[allow(clippy::expect_used)] - pub fn from_trailer(file: &File, trailer: &Trailer) -> crate::Result { - let ptr = trailer.metadata; - let block = Block::from_file(file, ptr.offset(), ptr.size(), CompressionType::None)?; + pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { + let block = Block::from_file(file, handle.offset(), handle.size(), CompressionType::None)?; let block = DataBlock::new(block); assert_eq!( diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 87c99f58..31711bd0 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -11,6 +11,7 @@ mod index_block; mod inner; mod meta; pub(crate) mod multi_writer; +mod regions; mod scanner; mod trailer; pub(crate) mod util; @@ -20,6 +21,7 @@ pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; pub use id::{GlobalSegmentId, SegmentId}; pub use index_block::{BlockHandle, IndexBlock, KeyedBlockHandle}; +use regions::ParsedRegions; pub use scanner::Scanner; pub use writer::Writer; @@ -98,13 +100,11 @@ impl Segment { } #[must_use] - pub fn bloom_filter_size(&self) -> usize { - if let Some(pinned_filter) = &self.pinned_filter { - pinned_filter.len() - } else { - // TODO: meta.filter_size - todo!() - } + pub fn pinned_bloom_filter_size(&self) -> usize { + self.pinned_filter + .as_ref() + .map(|filter| filter.len()) + .unwrap_or_default() } /// Gets the segment ID. @@ -168,6 +168,20 @@ impl Segment { } if let Some(filter) = &self.pinned_filter { + if !filter.contains_hash(key_hash) { + return Ok(None); + } + } else if let Some(filter_block_handle) = &self.regions.filter { + use crate::coding::Decode; + + let block = self.load_block(filter_block_handle)?; + + let mut reader = &block.data[..]; + + // TODO: FilterReader that does not need decoding... 
just use the slice directly + let filter = StandardBloomFilter::decode_from(&mut reader) + .map_err(Into::::into)?; + if !filter.contains_hash(key_hash) { return Ok(None); } @@ -282,46 +296,47 @@ impl Segment { tree_id: TreeId, cache: Arc, descriptor_table: Arc, + pin_filter: bool, ) -> crate::Result { - // use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; use trailer::Trailer; log::debug!("Recovering segment from file {file_path:?}"); - let trailer = Trailer::from_file(file_path)?; - log::trace!("Got trailer: {trailer:#?}"); + let mut file = std::fs::File::open(file_path)?; - log::debug!("Reading meta block, with meta_ptr={:?}", trailer.metadata); - let metadata = ParsedMeta::from_trailer(&std::fs::File::open(file_path)?, &trailer)?; + let trailer = Trailer::from_file(&mut file)?; + log::trace!("Got trailer: {trailer:#?}"); - /* assert_eq!( - 0, *trailer.range_tombstones_ptr, - "Range tombstones not supported" - ); */ + log::debug!( + "Reading regions block, with region_ptr={:?}", + trailer.regions_block_handle(), + ); + let regions = ParsedRegions::load_with_handle(&file, trailer.regions_block_handle())?; - let file = std::fs::File::open(file_path)?; + log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); + let metadata = ParsedMeta::load_with_handle(&file, ®ions.metadata)?; let tli_block = { - log::debug!("Reading TLI block, with tli_ptr={:?}", trailer.tli); + log::debug!("Reading TLI block, with tli_ptr={:?}", regions.tli); let block = Block::from_file( &file, - trailer.tli.offset(), - trailer.tli.size(), + regions.tli.offset(), + regions.tli.size(), metadata.data_block_compression, // TODO: index blocks may get their own compression level )?; IndexBlock::new(block) }; - let block_index = if let Some(index_block_handle) = trailer.index_blocks { + let block_index = if let Some(index_block_handle) = regions.index { log::debug!( "Creating partitioned block index, with tli_ptr={:?}, index_block_ptr={index_block_handle:?}", - trailer.tli, + regions.tli, ); todo!(); // BlockIndexImpl::TwoLevel(tli_block, todo!()) } else { - log::debug!("Creating full block index, with tli_ptr={:?}", trailer.tli); + log::debug!("Creating full block index, with tli_ptr={:?}", regions.tli); BlockIndexImpl::Full(FullBlockIndex::new(tli_block)) }; @@ -342,24 +357,30 @@ impl Segment { BlockIndexImpl::TwoLevel(block_index) }; */ - let pinned_filter = trailer - .filter - .map(|filter_ptr| { - use crate::coding::Decode; + // TODO: load FilterBlock + let pinned_filter = if pin_filter { + regions + .filter + .map(|filter_ptr| { + use crate::coding::Decode; - log::debug!("Reading filter block for pinning, with filter_ptr={filter_ptr:?}"); + log::debug!("Loading and pinning filter block, with filter_ptr={filter_ptr:?}"); - let block = Block::from_file( - &file, + let block = Block::from_file( + &file, filter_ptr.offset(), filter_ptr.size(), crate::CompressionType::None, // NOTE: We never write a filter block with compression - )?; + )?; - let mut reader = &block.data[..]; - StandardBloomFilter::decode_from(&mut reader).map_err(Into::::into) - }) - .transpose()?; + let mut reader = &block.data[..]; + StandardBloomFilter::decode_from(&mut reader) + .map_err(Into::::into) + }) + .transpose()? 
+ } else { + None + }; descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); @@ -368,7 +389,7 @@ impl Segment { tree_id, metadata, - trailer, + regions, cache, @@ -457,6 +478,7 @@ mod tests { 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), + true, )?; assert_eq!(5, segment.id()); @@ -464,19 +486,42 @@ mod tests { assert_eq!(1, segment.metadata.data_block_count); assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index assert!( - segment.trailer.index_blocks.is_none(), + segment.regions.index.is_none(), "should use full index, so only TLI exists", ); assert_eq!( b"abc", - &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + &*segment + .get( + b"abc", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") + )? + .unwrap() + .key + .user_key, ); assert_eq!( b"abc", - &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + &*segment + .get( + b"abc", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") + )? + .unwrap() + .key + .user_key, + ); + assert_eq!( + None, + segment.get( + b"def", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"def") + )? ); - assert_eq!(None, segment.point_read(b"def", None)?); assert_eq!( segment.metadata.key_range, @@ -515,6 +560,7 @@ mod tests { 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), + true, )?; assert_eq!(5, segment.id()); @@ -522,23 +568,54 @@ mod tests { assert_eq!(1, segment.metadata.data_block_count); assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index assert!( - segment.trailer.index_blocks.is_none(), + segment.regions.index.is_none(), "should use full index, so only TLI exists", ); assert_eq!( b"abc", - &*segment.point_read(b"abc", None)?.unwrap().key.user_key, + &*segment + .get( + b"abc", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") + )? + .unwrap() + .key + .user_key, ); assert_eq!( b"def", - &*segment.point_read(b"def", None)?.unwrap().key.user_key, + &*segment + .get( + b"def", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"def") + )? + .unwrap() + .key + .user_key, ); assert_eq!( b"xyz", - &*segment.point_read(b"xyz", None)?.unwrap().key.user_key, + &*segment + .get( + b"xyz", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"xyz") + )? + .unwrap() + .key + .user_key, + ); + assert_eq!( + None, + segment.get( + b"____", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"____") + )? 
); - assert_eq!(None, segment.point_read(b"____", None)?); assert_eq!(items, &*segment.scan()?.flatten().collect::>()); @@ -550,4 +627,82 @@ mod tests { Ok(()) } + + // TODO: when using stats cfg feature: check filter hits += 1 + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_unpinned_filter() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + { + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; + writer.write(crate::InternalValue::from_components( + b"abc", + b"asdasdasd", + 3, + crate::ValueType::Value, + ))?; + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + &file, + 0, + Arc::new(Cache::with_capacity_bytes(1_000_000)), + Arc::new(DescriptorTable::new(10)), + false, + )?; + + assert_eq!(5, segment.id()); + assert_eq!(1, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.regions.index.is_none(), + "should use full index, so only TLI exists", + ); + + assert_eq!( + b"abc", + &*segment + .get( + b"abc", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") + )? + .unwrap() + .key + .user_key, + ); + assert_eq!( + b"abc", + &*segment + .get( + b"abc", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") + )? + .unwrap() + .key + .user_key, + ); + assert_eq!( + None, + segment.get( + b"def", + None, + crate::segment::filter::standard_bloom::Builder::get_hash(b"def") + )? + ); + + assert_eq!( + segment.metadata.key_range, + crate::KeyRange::new((b"abc".into(), b"abc".into())), + ); + } + + Ok(()) + } } diff --git a/src/segment/regions.rs b/src/segment/regions.rs new file mode 100644 index 00000000..c9ec1ffb --- /dev/null +++ b/src/segment/regions.rs @@ -0,0 +1,107 @@ +use super::{Block, BlockHandle}; +use crate::{ + coding::{Decode, Encode}, + segment::DataBlock, + CompressionType, InternalValue, UserValue, +}; +use std::fs::File; + +/// The regions block stores offsets to the different segment disk file "regions" +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] +pub struct ParsedRegions { + pub tli: BlockHandle, + pub index: Option, + pub filter: Option, + pub metadata: BlockHandle, +} + +impl ParsedRegions { + pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { + let block = Block::from_file(file, handle.offset(), handle.size(), CompressionType::None)?; + let block = DataBlock::new(block); + + let tli = { + let bytes = block + .point_read(b"tli", None) + .expect("TLI handle should exist"); + + let mut bytes = &bytes.value[..]; + BlockHandle::decode_from(&mut bytes) + }?; + + let metadata = { + let bytes = block + .point_read(b"meta", None) + .expect("Metadata handle should exist"); + + let mut bytes = &bytes.value[..]; + BlockHandle::decode_from(&mut bytes) + }?; + + let index = { + match block.point_read(b"index", None) { + Some(bytes) if !bytes.value.is_empty() => { + let mut bytes = &bytes.value[..]; + Some(BlockHandle::decode_from(&mut bytes)) + } + _ => None, + } + } + .transpose()?; + + let filter = { + match block.point_read(b"filter", None) { + Some(bytes) if !bytes.value.is_empty() => { + let mut bytes = &bytes.value[..]; + Some(BlockHandle::decode_from(&mut bytes)) + } + _ => None, + } + } + .transpose()?; + + Ok(Self { + tli, + index, + filter, + metadata, + }) + } + + pub fn encode_into_vec(&self) -> crate::Result> { + fn region(key: &str, value: impl Into) -> InternalValue 
{ + InternalValue::from_components(key, value, 0, crate::ValueType::Value) + } + + let items = [ + region( + "filter", + match self.filter { + Some(handle) => handle.encode_into_vec(), + None => vec![], + }, + ), + region( + "index", + match self.index { + Some(handle) => handle.encode_into_vec(), + None => vec![], + }, + ), + region("meta", self.metadata.encode_into_vec()), + region("tli", self.tli.encode_into_vec()), + ]; + + #[cfg(debug_assertions)] + { + let mut sorted_copy = items.clone(); + sorted_copy.sort(); + + // Just to make sure the items are definitely sorted + assert_eq!(items, sorted_copy, "region items not sorted correctly"); + } + + // TODO: no binary index + DataBlock::encode_items(&items, 1, 0.0) + } +} diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs index d3590830..c6504fca 100644 --- a/src/segment/trailer.rs +++ b/src/segment/trailer.rs @@ -4,64 +4,59 @@ use super::index_block::BlockHandle; use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, + coding::{Decode, DecodeError, Encode}, file::MAGIC_BYTES, }; use std::{ fs::File, - io::{BufReader, Read, Seek, Write}, - path::Path, + io::{Read, Seek, Write}, }; -const TRAILER_SIZE: usize = 128; +const TRAILER_SIZE: usize = 32; -/// The segment trailer stores offsets to the different segment disk file "zones" +/// The fixed-size segment trailer stores a block handle to the regions block /// /// ---------------- /// | data blocks | <- implicitly start at 0 /// |--------------| -/// | tli | +/// | tli block | /// |--------------| -/// | index block | <- may not exist (if full block index is used, TLI will be dense) +/// | index block | <- may not exist (if full block index is used, TLI will be dense) /// |--------------| /// | filter block | <- may not exist /// |--------------| /// | ... TBD ... | /// |--------------| -/// | meta block | +/// | meta block | +/// |--------------| +/// | region block | /// |--------------| /// | trailer | <- fixed size /// |--------------| +/// +/// Through this indirection, we can have a variable amount of region block handles. #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub struct Trailer { - pub tli: BlockHandle, - pub index_blocks: Option, - pub filter: Option, // option - - // // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - // pub range_tombstones: BlockOffset, - - // // TODO: prefix filter for l0, l1? - // pub pfx: BlockOffset, - - // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - // pub range_filter: BlockOffset, - pub metadata: BlockHandle, + regions_block_handle: BlockHandle, } impl Trailer { - /* /// Returns the on-disk size - #[must_use] - pub const fn serialized_len() -> usize { - 4 * std::mem::size_of::() - } */ + pub fn from_handle(regions_block_handle: BlockHandle) -> Self { + Self { + regions_block_handle, + } + } + + pub fn regions_block_handle(&self) -> &BlockHandle { + &self.regions_block_handle + } pub fn write_into(&self, writer: &mut W) -> crate::Result<()> { let mut v = Vec::with_capacity(TRAILER_SIZE); v.write_all(&MAGIC_BYTES)?; - self.encode_into(&mut v)?; + self.regions_block_handle.encode_into(&mut v)?; // Pad with remaining bytes v.resize(TRAILER_SIZE, 0); @@ -78,105 +73,28 @@ impl Trailer { } // TODO: the trailer is fixed size so we can use read_at?! 
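This is the point of the new indirection: the trailer stays fixed-size and stores exactly one handle, while the regions block it points to is an ordinary uncompressed data block whose keys ("filter", "index", "meta", "tli") map to encoded `BlockHandle`s, so new regions can be added later without another trailer format break. Recovery therefore reduces to two hops (sketch, mirroring `Segment::recover` with error handling condensed):

```rust
// Two hops from the end of the file to every region:
// trailer (fixed size) -> regions block -> per-region handles.
fn open_segment_regions(file: &mut std::fs::File) -> crate::Result<ParsedRegions> {
    let trailer = Trailer::from_file(file)?;
    ParsedRegions::load_with_handle(file, trailer.regions_block_handle())
}
```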
- pub fn from_file(path: &Path) -> crate::Result { - let file = File::open(path)?; - let mut reader = BufReader::new(file); - reader.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; + // TODO: then we don't need &mut File + pub fn from_file(file: &mut File) -> crate::Result { + file.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; + + let mut trailer_bytes = [0u8; TRAILER_SIZE]; + file.read_exact(&mut trailer_bytes)?; + + let mut reader = &mut &trailer_bytes[..]; // Check trailer magic header let mut magic = [0u8; MAGIC_BYTES.len()]; reader.read_exact(&mut magic)?; - // Parse pointers - let trailer = Self::decode_from(&mut reader)?; - if magic != MAGIC_BYTES { return Err(crate::Error::Decode(DecodeError::InvalidHeader( "SegmentTrailer", ))); } - debug_assert!(*trailer.tli.offset() > 0); - debug_assert!(*trailer.metadata.offset() > 0); + // Get regions block handle + let handle = BlockHandle::decode_from(&mut reader)?; - Ok(trailer) + Ok(Self::from_handle(handle)) } } - -// TODO: honestly we could just store the meta offset in trailer, and the just store pointers in meta... -impl Encode for Trailer { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - self.tli.encode_into(writer)?; - - if let Some(handle) = &self.index_blocks { - handle.encode_into(writer) - } else { - BlockHandle::default().encode_into(writer) - }?; - - if let Some(handle) = &self.filter { - handle.encode_into(writer) - } else { - BlockHandle::default().encode_into(writer) - }?; - - self.metadata.encode_into(writer)?; - - Ok(()) - } -} - -impl Decode for Trailer { - fn decode_from(reader: &mut R) -> Result { - let tli = BlockHandle::decode_from(reader)?; - let index_blocks = BlockHandle::decode_from(reader)?; - let filter = BlockHandle::decode_from(reader)?; - let metadata = BlockHandle::decode_from(reader)?; - - Ok(Self { - index_blocks: match *index_blocks.offset() { - 0 => None, - _ => Some(index_blocks), - }, - tli, - filter: match *filter.offset() { - 0 => None, - _ => Some(filter), - }, - metadata, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::segment::BlockOffset; - use std::io::Cursor; - use test_log::test; - - #[test] - fn v3_file_offsets_roundtrip() -> crate::Result<()> { - let before = Trailer { - tli: BlockHandle::new(BlockOffset(15), 5), - index_blocks: Some(BlockHandle::new(BlockOffset(20), 5)), - filter: Some(BlockHandle::new(BlockOffset(25), 5)), - metadata: BlockHandle::new(BlockOffset(30), 5), - }; - - let buf = before.encode_into_vec(); - - let mut cursor = Cursor::new(buf); - let after = Trailer::decode_from(&mut cursor)?; - - assert_eq!(after, before); - - Ok(()) - } - - /* #[test] - fn v3_file_offsets_serialized_len() { - let buf = Trailer::default().encode_into_vec(); - assert_eq!(Trailer::serialized_len(), buf.len()); - } */ -} diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 0a326cf3..a573d455 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -8,7 +8,7 @@ use super::{ use crate::{ coding::Encode, file::fsync_directory, - segment::{filter::standard_bloom::Builder, index_block::BlockHandle}, + segment::{filter::standard_bloom::Builder, index_block::BlockHandle, regions::ParsedRegions}, time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, }; @@ -343,10 +343,7 @@ impl Writer { assert_eq!(meta_items, sorted_copy, "meta items not sorted correctly"); } - log::trace!( - "Writing metadata to segment file {:?}: {meta_items:#?}", - self.path, - ); + log::trace!("Encoding metadata block: 
{meta_items:#?}"); // TODO: no binary index let bytes = DataBlock::encode_items(&meta_items, 1, 0.0)?; @@ -357,23 +354,29 @@ impl Writer { BlockHandle::new(metadata_start, bytes_written as u32) }; - // Bundle all the file offsets - let trailer = Trailer { - tli: tli_handle, - index_blocks: None, - filter: filter_handle, - metadata: metadata_handle, - /* range_filter:range_filter_ptr: rf:rf_ptr, - range_tombstones:range_tombstones_ptr, - pfx:pfx_ptr, */ - }; + // Write regions block + let regions_block_handle = { + let regions_block_start = BlockOffset(self.block_writer.stream_position()?); - log::trace!( - "Writing trailer to segment file {:?}: {trailer:#?}", - self.path, - ); + let regions = ParsedRegions { + tli: tli_handle, + index: None, + filter: filter_handle, + metadata: metadata_handle, + }; + + log::trace!("Encoding regions: {regions:#?}"); + + let bytes = regions.encode_into_vec()?; + let header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + + let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; + + BlockHandle::new(regions_block_start, bytes_written as u32) + }; - // Write trailer + // Write fixed-size trailer + let trailer = Trailer::from_handle(regions_block_handle); trailer.write_into(&mut self.block_writer)?; // Finally, flush & fsync the blocks file diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index f9f99380..31207873 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -85,7 +85,8 @@ impl<'a> Ingestion<'a> { self.tree.id, self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), - ) + true, + ) // TODO: look at configuration // todo!() diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 4f8beb9e..1ca2eedb 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -122,12 +122,12 @@ impl AbstractTree for Tree { Ok(self.get(key, seqno)?.map(|x| x.len() as u32)) } - fn bloom_filter_size(&self) -> usize { + fn pinned_bloom_filter_size(&self) -> usize { self.levels .read() .expect("lock is poisoned") .iter() - .map(Segment::bloom_filter_size) + .map(Segment::pinned_bloom_filter_size) .sum() } @@ -515,6 +515,7 @@ impl Tree { self.id, self.config.cache.clone(), self.config.descriptor_table.clone(), + true, // TODO: look at configuration )?; log::debug!("Flushed segment to {segment_file_path:?}"); @@ -920,13 +921,13 @@ impl Tree { crate::Error::Unrecoverable })?; - if let Some(&level_idx) = segment_id_map.get(&segment_id) { + if let Some(&_level_idx) = segment_id_map.get(&segment_id) { let segment = Segment::recover( &segment_file_path, tree_id, cache.clone(), descriptor_table.clone(), - // level_idx == 0 || level_idx == 1, + true, // TODO: look at configuration )?; segments.push(segment); From fb890fa7fc8561e6c7ac66d242363d6b58c230b4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:15:12 +0200 Subject: [PATCH 115/613] refactor: bloom filter --- src/segment/filter/bit_array/builder.rs | 69 +++++++++++ src/segment/filter/bit_array/mod.rs | 81 +----------- .../filter/bit_array/{sliced.rs => reader.rs} | 33 +---- src/segment/filter/standard_bloom/builder.rs | 35 ++++-- src/segment/filter/standard_bloom/mod.rs | 115 ++++++------------ 5 files changed, 140 insertions(+), 193 deletions(-) create mode 100644 src/segment/filter/bit_array/builder.rs rename src/segment/filter/bit_array/{sliced.rs => reader.rs} (52%) diff --git a/src/segment/filter/bit_array/builder.rs b/src/segment/filter/bit_array/builder.rs new file mode 100644 index 00000000..a0b4bcf6 --- /dev/null +++ 
b/src/segment/filter/bit_array/builder.rs @@ -0,0 +1,69 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +const BIT_MASK: u8 = 0b1000_0000_u8; + +/// Sets a bit in the byte to `true` +#[must_use] +pub fn enable_bit(byte: u8, idx: usize) -> u8 { + let bit_mask = BIT_MASK >> idx; + byte | bit_mask +} + +/// Fixed-size bit array +#[derive(Debug, Eq, PartialEq)] +pub struct Builder(Box<[u8]>); + +impl Builder { + #[must_use] + pub fn with_capacity(bytes: usize) -> Self { + let vec = vec![0; bytes]; + Self(vec.into_boxed_slice()) + } + + #[must_use] + pub fn from_bytes(bytes: Box<[u8]>) -> Self { + Self(bytes) + } + + #[must_use] + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Sets the i-th bit + pub fn enable_bit(&mut self, idx: usize) { + let byte_idx = idx / 8; + let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); + + let bit_idx = idx % 8; + *byte = enable_bit(*byte, bit_idx); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn bit_set_true() { + assert_eq!(0b0000_0010, enable_bit(0, 6)); + assert_eq!(0b1000_0000, enable_bit(0, 0)); + assert_eq!(0b0100_0000, enable_bit(0, 1)); + assert_eq!(0b0100_0110, enable_bit(0b0000_0110, 1)); + } + + #[test] + fn bit_array_builder_basic() { + let mut builder = Builder::with_capacity(1); + assert_eq!(&[0], builder.bytes()); + + builder.enable_bit(0); + assert_eq!(&[0b1000_0000], builder.bytes()); + + builder.enable_bit(7); + assert_eq!(&[0b1000_0001], builder.bytes()); + } +} diff --git a/src/segment/filter/bit_array/mod.rs b/src/segment/filter/bit_array/mod.rs index b9696b13..2078a6f9 100644 --- a/src/segment/filter/bit_array/mod.rs +++ b/src/segment/filter/bit_array/mod.rs @@ -1,78 +1,5 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) +mod builder; +mod reader; -mod sliced; - -pub use sliced::BitArray as BitArrayReader; - -const BIT_MASK: u8 = 0b1000_0000_u8; - -/// Sets a bit in the byte -#[must_use] -pub fn set_bit(byte: u8, idx: usize, value: bool) -> u8 { - let bit_mask = BIT_MASK >> idx; - - if value { - byte | bit_mask - } else { - byte & !bit_mask - } -} - -/// Fixed-size bit array -#[derive(Debug, Eq, PartialEq)] -pub struct Builder(Box<[u8]>); - -impl Builder { - #[must_use] - pub fn with_capacity(bytes: usize) -> Self { - let vec = vec![0; bytes]; - Self(vec.into_boxed_slice()) - } - - #[must_use] - pub fn from_bytes(bytes: Box<[u8]>) -> Self { - Self(bytes) - } - - #[must_use] - pub fn bytes(&self) -> &[u8] { - &self.0 - } - - /// Sets the i-th bit - pub fn enable_bit(&mut self, idx: usize) { - let byte_idx = idx / 8; - let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); - - let bit_idx = idx % 8; - *byte = set_bit(*byte, bit_idx, true); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn bit_set_true() { - assert_eq!(0b0000_0010, set_bit(0, 6, true)); - assert_eq!(0b1000_0000, set_bit(0, 0, true)); - assert_eq!(0b0100_0000, set_bit(0, 1, true)); - assert_eq!(0b0100_0110, set_bit(0b0000_0110, 1, true)); - } - - #[test] - fn bit_array_builder_basic() { - let mut builder = Builder::with_capacity(1); - assert_eq!(&[0], builder.bytes()); - - builder.enable_bit(0); - assert_eq!(&[0b1000_0000], builder.bytes()); - - builder.enable_bit(7); - assert_eq!(&[0b1000_0001], builder.bytes()); - } 
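Builder and reader agree on MSB-first bit order: bit 0 is the most significant bit of byte 0, matching the `BIT_MASK >> idx` shift above. The index-to-position mapping, spelled out (illustration only):

```rust
// Map a bit index to (byte index, mask) under MSB-first order.
fn bit_position(idx: usize) -> (usize, u8) {
    (idx / 8, 0b1000_0000_u8 >> (idx % 8))
}

fn main() {
    assert_eq!((0, 0b1000_0000), bit_position(0));
    assert_eq!((0, 0b0000_0001), bit_position(7));
    assert_eq!((1, 0b0010_0000), bit_position(10));
}
```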
-} +pub use builder::Builder; +pub use reader::BitArrayReader; diff --git a/src/segment/filter/bit_array/sliced.rs b/src/segment/filter/bit_array/reader.rs similarity index 52% rename from src/segment/filter/bit_array/sliced.rs rename to src/segment/filter/bit_array/reader.rs index 83066a3f..ea8c2fb9 100644 --- a/src/segment/filter/bit_array/sliced.rs +++ b/src/segment/filter/bit_array/reader.rs @@ -2,8 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::Slice; - const BIT_MASK: u8 = 0b1000_0000_u8; /// Gets a bit from the byte @@ -14,19 +12,19 @@ fn get_bit(byte: u8, idx: usize) -> bool { masked > 0 } -/// Fixed-size bit array +/// Fixed-size bit array reader #[derive(Debug, Eq, PartialEq)] -pub struct BitArray(Slice); +pub struct BitArrayReader<'a>(&'a [u8]); -impl BitArray { +impl<'a> BitArrayReader<'a> { #[must_use] - pub fn new(slice: Slice) -> Self { - Self(slice) + pub fn new(bytes: &'a [u8]) -> Self { + Self(bytes) } #[must_use] pub fn bytes(&self) -> &[u8] { - &self.0 + self.0 } /// Gets the i-th bit @@ -39,22 +37,3 @@ impl BitArray { get_bit(*byte, bit_idx) } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::segment::filter::bit_array::set_bit; - use test_log::test; - - #[test] - fn bit_set_get() { - assert_eq!(0b1111_1101, set_bit(0xFF, 6, false)); - assert_eq!(0b0111_1111, set_bit(0xFF, 0, false)); - assert_eq!(0b1011_1111, set_bit(0xFF, 1, false)); - - assert!(!get_bit(0b0100_0110, 0)); - assert!(get_bit(0b0100_0110, 1)); - assert!(get_bit(0b0100_0110, 6)); - assert!(!get_bit(0b0100_0110, 7)); - } -} diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index e8cdfd65..649f77a7 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -2,8 +2,10 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{super::bit_array::Builder as BitArrayBuilder, StandardBloomFilter}; -use crate::segment::filter::bit_array::BitArrayReader; +use super::super::bit_array::Builder as BitArrayBuilder; +use crate::file::MAGIC_BYTES; +use byteorder::{LittleEndian, WriteBytesExt}; +use std::io::Write; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); @@ -15,21 +17,34 @@ pub struct Builder { inner: BitArrayBuilder, /// Bit count - m: usize, + pub(super) m: usize, /// Number of hash functions - k: usize, + pub(super) k: usize, } #[allow(clippy::len_without_is_empty)] impl Builder { #[must_use] - pub fn build(self) -> StandardBloomFilter { - StandardBloomFilter { - inner: BitArrayReader::new(self.inner.bytes().into()), - k: self.k, - m: self.m, - } + pub fn build(&self) -> Vec { + let mut v = vec![]; + + // Write header + v.write_all(&MAGIC_BYTES).expect("should not fail"); + + // NOTE: Filter type (unused) + v.write_u8(0).expect("should not fail"); + + // NOTE: Hash type (unused) + v.write_u8(0).expect("should not fail"); + + v.write_u64::(self.m as u64) + .expect("should not fail"); + v.write_u64::(self.k as u64) + .expect("should not fail"); + v.write_all(self.inner.bytes()).expect("should not fail"); + + v } /// Constructs a bloom filter that can hold `n` items diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 7670a640..b430dd5f 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ 
b/src/segment/filter/standard_bloom/mod.rs @@ -1,15 +1,12 @@ -use super::bit_array::BitArrayReader; -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, -}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use std::io::{Read, Write}; - mod builder; pub use builder::{Builder, CompositeHash}; +use super::bit_array::BitArrayReader; +use crate::file::MAGIC_BYTES; +use byteorder::{LittleEndian, ReadBytesExt}; +use std::io::{Cursor, Read}; + /// A standard bloom filter /// /// Allows buffering the key hashes before actual filter construction @@ -18,9 +15,9 @@ pub use builder::{Builder, CompositeHash}; /// /// The filter uses double hashing instead of `k` hash functions, see: /// -pub struct StandardBloomFilter { +pub struct StandardBloomFilterReader<'a> { /// Raw bytes exposed as bit array - inner: BitArrayReader, + inner: BitArrayReader<'a>, /// Bit count m: usize, @@ -29,35 +26,18 @@ pub struct StandardBloomFilter { k: usize, } -// TODO: change encode/decode to be Filter enum - -impl Encode for StandardBloomFilter { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - // Write header - writer.write_all(&MAGIC_BYTES)?; - - // NOTE: Filter type (unused) - writer.write_u8(0)?; - - // NOTE: Hash type (unused) - writer.write_u8(0)?; - - writer.write_u64::(self.m as u64)?; - writer.write_u64::(self.k as u64)?; - writer.write_all(self.inner.bytes())?; - - Ok(()) - } -} +impl<'a> StandardBloomFilterReader<'a> { + pub fn new(slice: &'a [u8]) -> crate::Result { + let mut reader = Cursor::new(slice); -impl Decode for StandardBloomFilter { - fn decode_from(reader: &mut R) -> Result { // Check header let mut magic = [0u8; MAGIC_BYTES.len()]; reader.read_exact(&mut magic)?; if magic != MAGIC_BYTES { - return Err(DecodeError::InvalidHeader("BloomFilter")); + return Err(crate::Error::Decode(crate::DecodeError::InvalidHeader( + "BloomFilter", + ))); } // NOTE: Filter type (unused) @@ -71,29 +51,25 @@ impl Decode for StandardBloomFilter { let m = reader.read_u64::()? as usize; let k = reader.read_u64::()? as usize; - let mut bytes = vec![0; m / 8]; - reader.read_exact(&mut bytes)?; + let offset = reader.position() as usize; - Ok(Self::from_raw(m, k, bytes.into())) + #[allow(clippy::indexing_slicing)] + Ok(Self { + k, + m, + inner: BitArrayReader::new(slice.get(offset..).expect("should be in bounds")), + }) } } #[allow(clippy::len_without_is_empty)] -impl StandardBloomFilter { +impl StandardBloomFilterReader<'_> { /// Size of bloom filter in bytes. #[must_use] pub fn len(&self) -> usize { self.inner.bytes().len() } - fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self { - Self { - inner: BitArrayReader::new(slice), - m, - k, - } - } - /// Returns `true` if the hash may be contained. /// /// Will never have a false negative. 
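On the double hashing mentioned above: all `k` probe positions are derived from one `CompositeHash` pair `(h1, h2)` instead of `k` independent hash functions. The crate's exact mixing may differ from this, but the shape of the probe sequence is (sketch):

```rust
// Derive k bit indices from a single (h1, h2) pair, stepping h1 by h2 with
// wrapping arithmetic: roughly idx_i = (h1 + i * h2) mod m.
fn probe_indices(hash: (u64, u64), k: usize, m: u64) -> impl Iterator<Item = u64> {
    let (mut h1, h2) = hash;
    (0..k).map(move |_| {
        let idx = h1 % m;
        h1 = h1.wrapping_add(h2);
        idx
    })
}
```

A key is reported as possibly present only if the bit at every yielded index is set, which is why a lookup can return false positives but never false negatives.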
@@ -137,18 +113,13 @@ impl StandardBloomFilter { } #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::*; - use std::fs::File; use test_log::test; #[test] - fn bloom_serde_round_trip() -> crate::Result<()> { - let dir = tempfile::tempdir()?; - - let path = dir.path().join("bf"); - let mut file = File::create(&path)?; - + fn bloom_serde_round_trip() { let mut filter = Builder::with_fp_rate(10, 0.0001); let keys = &[ @@ -157,35 +128,17 @@ mod tests { ]; for key in keys { - filter.set_with_hash(StandardBloomFilter::get_hash(*key)); + filter.set_with_hash(StandardBloomFilterReader::get_hash(*key)); } - let filter = filter.build(); + let filter_bytes = filter.build(); + let filter_copy = StandardBloomFilterReader::new(&filter_bytes).unwrap(); - for key in keys { - assert!(filter.contains(&**key)); - } - assert!(!filter.contains(b"asdasads")); - assert!(!filter.contains(b"item10")); - assert!(!filter.contains(b"cxycxycxy")); - - filter.encode_into(&mut file)?; - file.sync_all()?; - drop(file); - - let mut file = File::open(&path)?; - let filter_copy = StandardBloomFilter::decode_from(&mut file)?; - - assert_eq!(filter.inner, filter_copy.inner); - - for key in keys { - assert!(filter.contains(&**key)); - } + assert_eq!(filter.k, filter_copy.k); + assert_eq!(filter.m, filter_copy.m); assert!(!filter_copy.contains(b"asdasads")); assert!(!filter_copy.contains(b"item10")); assert!(!filter_copy.contains(b"cxycxycxy")); - - Ok(()) } #[test] @@ -209,7 +162,8 @@ mod tests { filter.set_with_hash(Builder::get_hash(key)); } - let filter = filter.build(); + let filter_bytes = filter.build(); + let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); for key in &keys { assert!(filter.contains(key)); @@ -231,7 +185,8 @@ mod tests { filter.set_with_hash(Builder::get_hash(key)); } - let filter = filter.build(); + let filter_bytes = filter.build(); + let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); let mut false_positives = 0; @@ -261,7 +216,8 @@ mod tests { filter.set_with_hash(Builder::get_hash(key)); } - let filter = filter.build(); + let filter_bytes = filter.build(); + let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); let mut false_positives = 0; @@ -292,7 +248,8 @@ mod tests { filter.set_with_hash(Builder::get_hash(key)); } - let filter = filter.build(); + let filter_bytes = filter.build(); + let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); let mut false_positives = 0; From f29d1b7c23c5bdbe3115910863885b77db706645 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:15:58 +0200 Subject: [PATCH 116/613] feat: (un)pinned filter logic --- src/segment/mod.rs | 90 ++++++++++++++++----------------------- src/segment/writer/mod.rs | 12 +++--- 2 files changed, 44 insertions(+), 58 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 31711bd0..613e946f 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -101,9 +101,9 @@ impl Segment { #[must_use] pub fn pinned_bloom_filter_size(&self) -> usize { - self.pinned_filter + self.pinned_filter_block .as_ref() - .map(|filter| filter.len()) + .map(Block::size) .unwrap_or_default() } @@ -116,7 +116,11 @@ impl Segment { self.metadata.id } - fn load_block(&self, handle: &BlockHandle) -> crate::Result { + fn load_block( + &self, + handle: &BlockHandle, + compression: CompressionType, + ) -> crate::Result { let id = self.global_id(); if let Some(block) = self.cache.get_block(id, handle.offset()) { @@ -132,12 +136,7 @@ impl Segment { 
Arc::new(std::fs::File::open(&self.path)?) }; - let block = Block::from_file( - &fd, - handle.offset(), - handle.size(), - self.metadata.data_block_compression, - )?; + let block = Block::from_file(&fd, handle.offset(), handle.size(), compression)?; let id = self.global_id(); @@ -152,7 +151,8 @@ impl Segment { } fn load_data_block(&self, handle: &BlockHandle) -> crate::Result { - self.load_block(handle).map(DataBlock::new) + self.load_block(handle, self.metadata.data_block_compression) + .map(DataBlock::new) } pub fn get( @@ -167,20 +167,15 @@ impl Segment { } } - if let Some(filter) = &self.pinned_filter { + if let Some(block) = &self.pinned_filter_block { + let filter = StandardBloomFilterReader::new(&block.data)?; + if !filter.contains_hash(key_hash) { return Ok(None); } } else if let Some(filter_block_handle) = &self.regions.filter { - use crate::coding::Decode; - - let block = self.load_block(filter_block_handle)?; - - let mut reader = &block.data[..]; - - // TODO: FilterReader that does not need decoding... just use the slice directly - let filter = StandardBloomFilter::decode_from(&mut reader) - .map_err(Into::::into)?; + let block = self.load_block(filter_block_handle, CompressionType::None)?; + let filter = StandardBloomFilterReader::new(&block.data)?; if !filter.contains_hash(key_hash) { return Ok(None); @@ -215,6 +210,7 @@ impl Segment { }; for block_handle in iter { + // TODO: can this ever happen...? if block_handle.end_key() < &key { return Ok(None); } @@ -224,6 +220,12 @@ impl Segment { if let Some(item) = block.point_read(key, Some(seqno)) { return Ok(Some(item)); } + + // NOTE: If the last block key is higher than ours, + // our key cannot be in the next block + if block_handle.end_key() > &key { + return Ok(None); + } } } } @@ -333,55 +335,37 @@ impl Segment { "Creating partitioned block index, with tli_ptr={:?}, index_block_ptr={index_block_handle:?}", regions.tli, ); - todo!(); + + unimplemented!("partitioned index is not supported yet"); + // BlockIndexImpl::TwoLevel(tli_block, todo!()) } else { log::debug!("Creating full block index, with tli_ptr={:?}", regions.tli); BlockIndexImpl::Full(FullBlockIndex::new(tli_block)) }; - /* let block_index = if use_full_block_index { - let block_index = - FullBlockIndex::from_file(file_path, &trailer.metadata, &trailer.offsets)?; - - BlockIndexImpl::Full(block_index) - } else { - let block_index = TwoLevelBlockIndex::from_file( - file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (tree_id, trailer.metadata.id).into(), - descriptor_table.clone(), - cache.clone(), - )?; - BlockIndexImpl::TwoLevel(block_index) - }; */ - // TODO: load FilterBlock - let pinned_filter = if pin_filter { + let pinned_filter_block = if pin_filter { regions .filter - .map(|filter_ptr| { - use crate::coding::Decode; - - log::debug!("Loading and pinning filter block, with filter_ptr={filter_ptr:?}"); + .map(|filter_handle| { + log::debug!( + "Loading and pinning filter block, with filter_ptr={filter_handle:?}" + ); - let block = Block::from_file( + Block::from_file( &file, - filter_ptr.offset(), - filter_ptr.size(), - crate::CompressionType::None, // NOTE: We never write a filter block with compression - )?; - - let mut reader = &block.data[..]; - StandardBloomFilter::decode_from(&mut reader) - .map_err(Into::::into) + filter_handle.offset(), + filter_handle.size(), + crate::CompressionType::None, // NOTE: We never write a filter block with compression + ) }) .transpose()? 
} else { None }; + // NOTE: We already have a file descriptor open, so let's just cache it immediately descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); let segment = Self(Arc::new(Inner { @@ -397,7 +381,7 @@ impl Segment { block_index: Arc::new(block_index), - pinned_filter, + pinned_filter_block, is_deleted: AtomicBool::default(), })); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index a573d455..8c85651f 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -250,7 +250,7 @@ impl Writer { let start = std::time::Instant::now(); - let filter = { + let filter_bytes = { let mut builder = self.bloom_policy.init(n); for hash in std::mem::take(&mut self.bloom_hash_buffer) { @@ -260,12 +260,14 @@ impl Writer { builder.build() }; - log::trace!("Built Bloom filter in {:?}", start.elapsed()); - - let bytes = filter.encode_into_vec(); + log::trace!( + "Built Bloom filter ({} B) in {:?}", + filter_bytes.len(), + start.elapsed(), + ); let block = - Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + Block::to_writer(&mut self.block_writer, &filter_bytes, CompressionType::None)?; let bytes_written = (BlockHeader::serialized_len() as u32) + block.data_length; From c28767f1524a9a55769d15b7b4dae742eeaf618d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:17:01 +0200 Subject: [PATCH 117/613] move tree version --- src/error.rs | 2 +- src/lib.rs | 3 ++- src/tree/mod.rs | 2 +- src/tree_version.rs | 45 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 src/tree_version.rs diff --git a/src/error.rs b/src/error.rs index d97f12d4..49cf17f4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,7 +4,7 @@ use crate::{ coding::{DecodeError, EncodeError}, - version::Version, + tree_version::Version, Checksum, CompressionType, }; diff --git a/src/lib.rs b/src/lib.rs index 3729040e..a4ab2a76 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -180,6 +180,7 @@ pub mod stop_signal; mod time; mod tree; +mod tree_version; mod value; mod version; @@ -217,8 +218,8 @@ pub use { seqno::SequenceNumberCounter, snapshot::Snapshot, tree::Tree, + tree_version::Version as TreeVersion, value::{SeqNo, UserKey, UserValue, ValueType}, - version::Version, }; pub use any_tree::AnyTree; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 1ca2eedb..d78f528b 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -13,8 +13,8 @@ use crate::{ manifest::Manifest, memtable::Memtable, segment::Segment, + tree_version::Version, value::InternalValue, - version::Version, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, ValueType, }; diff --git a/src/tree_version.rs b/src/tree_version.rs new file mode 100644 index 00000000..19862ad8 --- /dev/null +++ b/src/tree_version.rs @@ -0,0 +1,45 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +/// Disk format version +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum Version { + /// Version for 1.x.x releases + V1, + + /// Version for 2.x.x releases + V2, + + /// Version for 3.x.x releases + V3, +} + +impl std::fmt::Display for Version { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", u8::from(*self)) + } +} + +impl From for u8 { + fn from(value: Version) -> Self { + match value { + Version::V1 => 1, + Version::V2 => 2, + Version::V3 
=> 3, + } + } +} + +impl TryFrom for Version { + type Error = (); + + fn try_from(value: u8) -> Result { + match value { + 1 => Ok(Self::V1), + 2 => Ok(Self::V2), + 3 => Ok(Self::V3), + _ => Err(()), + } + } +} From bcffea29e0a9f51e29c8e948a45fc7f3fc3ffae6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:17:52 +0200 Subject: [PATCH 118/613] segment inner pinned filter block --- src/segment/inner.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/segment/inner.rs b/src/segment/inner.rs index 5b4ada30..8b16085c 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -2,10 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{ - block_index::BlockIndexImpl, filter::standard_bloom::StandardBloomFilter, meta::ParsedMeta, - regions::ParsedRegions, -}; +use super::{block_index::BlockIndexImpl, meta::ParsedMeta, regions::ParsedRegions, Block}; use crate::{ cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, }; @@ -41,7 +38,7 @@ pub struct Inner { pub cache: Arc, /// Pinned AMQ filter - pub pinned_filter: Option, + pub pinned_filter_block: Option, // /// Pinned filter // #[doc(hidden)] From b29b000d10801926761b537d1fc03d9e8e51ec4f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:17:59 +0200 Subject: [PATCH 119/613] wip --- src/version.rs | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 src/version.rs diff --git a/src/version.rs b/src/version.rs deleted file mode 100644 index 19862ad8..00000000 --- a/src/version.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -/// Disk format version -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum Version { - /// Version for 1.x.x releases - V1, - - /// Version for 2.x.x releases - V2, - - /// Version for 3.x.x releases - V3, -} - -impl std::fmt::Display for Version { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", u8::from(*self)) - } -} - -impl From for u8 { - fn from(value: Version) -> Self { - match value { - Version::V1 => 1, - Version::V2 => 2, - Version::V3 => 3, - } - } -} - -impl TryFrom for Version { - type Error = (); - - fn try_from(value: u8) -> Result { - match value { - 1 => Ok(Self::V1), - 2 => Ok(Self::V2), - 3 => Ok(Self::V3), - _ => Err(()), - } - } -} From 85d85536a1d4a24e7e627cbf9290ab9dd86c7c51 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:18:12 +0200 Subject: [PATCH 120/613] disable some tests temporarily --- tests/blob_gc.rs | 3 +++ tests/blob_gc_watermark.rs | 1 + tests/blob_tombstone.rs | 1 + tests/blob_tree_flush.rs | 1 + tests/blob_tree_reload_blob.rs | 2 ++ 5 files changed, 8 insertions(+) diff --git a/tests/blob_gc.rs b/tests/blob_gc.rs index ce80a771..a1fa196b 100644 --- a/tests/blob_gc.rs +++ b/tests/blob_gc.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; use test_log::test; #[test] +#[ignore] fn blob_gc_1() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -45,6 +46,7 @@ fn blob_gc_1() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_gc_2() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -94,6 +96,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_gc_3() -> 
lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index 13d1a2e0..6f0ae505 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; use test_log::test; #[test] +#[ignore] fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_tombstone.rs b/tests/blob_tombstone.rs index 0c63562b..c7b0e310 100644 --- a/tests/blob_tombstone.rs +++ b/tests/blob_tombstone.rs @@ -2,6 +2,7 @@ use lsm_tree::AbstractTree; use test_log::test; #[test] +#[ignore] fn blob_tree_tombstone() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/blob_tree_flush.rs b/tests/blob_tree_flush.rs index ff43396c..a3270cc8 100644 --- a/tests/blob_tree_flush.rs +++ b/tests/blob_tree_flush.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; use test_log::test; #[test] +#[ignore] fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index e16dcf20..953ca860 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -5,6 +5,7 @@ use test_log::test; const ITEM_COUNT: usize = 10_000; #[test] +#[ignore] fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -41,6 +42,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_tree_reload() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; From 66d20dfd25a04dc8fcb71b3e0f63cb650293c0f2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:18:51 +0200 Subject: [PATCH 121/613] misc --- src/cache.rs | 2 +- src/level_manifest/level.rs | 1 - src/segment/block/encoder.rs | 1 - src/segment/data_block/iter.rs | 1 + src/segment/regions.rs | 4 ++++ 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index 72c22d27..6bdd042d 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::segment::block::Header; -use crate::segment::{Block, BlockOffset, DataBlock, IndexBlock}; +use crate::segment::{Block, BlockOffset}; use crate::{GlobalSegmentId, UserValue}; use quick_cache::Weighter; use quick_cache::{sync::Cache as QuickCache, Equivalent}; diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 6cf9d7e3..1a81bcc2 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -188,7 +188,6 @@ impl<'a> DisjointLevel<'a> { .cloned() } - // TODO: use a single custom binary search instead of partition_point... 
benchmark it and add some unit tests before pub fn range_indexes( &'a self, key_range: &'a (Bound, Bound), diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index c46680b0..090c9577 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -53,7 +53,6 @@ pub struct Encoder<'a, S: Default, T: Encodable> { base_key: &'a [u8], } -// TODO: maybe split into Builder impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { pub fn new( item_count: usize, diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 3a316168..fe800c9a 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -65,6 +65,7 @@ impl ParsedItem { } impl<'a> Iter<'a> { + #[must_use] pub fn new(block: &'a DataBlock) -> Self { let restart_interval = block.restart_interval.into(); let binary_index_len = block.binary_index_len as usize; diff --git a/src/segment/regions.rs b/src/segment/regions.rs index c9ec1ffb..77bfb048 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use super::{Block, BlockHandle}; use crate::{ coding::{Decode, Encode}, From 2d056961c2839fbfbf359b80084e8f4f487cec16 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:19:30 +0200 Subject: [PATCH 122/613] wip --- src/segment/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 613e946f..ca203fb4 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -3,8 +3,8 @@ // (found in the LICENSE-* files in the repository) pub mod block; -mod block_index; -pub(crate) mod data_block; +pub(crate) mod block_index; +pub mod data_block; pub mod filter; mod id; mod index_block; @@ -26,10 +26,11 @@ pub use scanner::Scanner; pub use writer::Writer; use crate::{ - cache::Cache, descriptor_table::DescriptorTable, InternalValue, SeqNo, TreeId, UserKey, + cache::Cache, descriptor_table::DescriptorTable, CompressionType, InternalValue, SeqNo, TreeId, + UserKey, }; use block_index::{BlockIndex, BlockIndexImpl, FullBlockIndex}; -use filter::standard_bloom::{CompositeHash, StandardBloomFilter}; +use filter::standard_bloom::{CompositeHash, StandardBloomFilterReader}; use inner::Inner; use meta::ParsedMeta; use std::{ From 89f93fae755779a1c503f7b4d82268f01ae94eca Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:19:49 +0200 Subject: [PATCH 123/613] in segment scanner, use data block forward reader --- src/segment/data_block/mod.rs | 5 +++++ src/segment/scanner.rs | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index e894e9f6..7e016b90 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -176,6 +176,11 @@ impl DataBlock { Iter::new(self).map(|kv| kv.materialize(&self.inner.data)) } + #[allow(clippy::iter_without_into_iter)] + pub fn scan(&self) -> impl Iterator + '_ { + ForwardReader::new(self).map(|kv| kv.materialize(&self.inner.data)) + } + pub fn range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( &'a self, range: &'a R, diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs index 65e73fd2..aaf4697f 100644 --- a/src/segment/scanner.rs +++ b/src/segment/scanner.rs @@ -38,7 +38,7 @@ impl Scanner { let mut reader = BufReader::with_capacity(8 * 4_096, File::open(path)?); 
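// Compaction drives this scanner strictly forward through the segment,
// so the blocks are iterated with the forward-only `scan()` reader below
// instead of the double-ended `iter()`.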
let block = Self::fetch_next_block(&mut reader, compression)?; - let iter = Iter::new(block, |block| Box::new(block.iter())); + let iter = Iter::new(block, |block| Box::new(block.scan())); Ok(Self { reader, @@ -73,7 +73,7 @@ impl Iterator for Scanner { // Init new block let block = fail_iter!(Self::fetch_next_block(&mut self.reader, self.compression)); - self.iter = Iter::new(block, |block| Box::new(block.iter())); + self.iter = Iter::new(block, |block| Box::new(block.scan())); self.read_count += 1; } From 45dda56db831364eea16dff4e70b6ef8e4865c69 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:20:35 +0200 Subject: [PATCH 124/613] skip buffer zeroing in block load --- Cargo.toml | 2 +- src/segment/block/mod.rs | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bbe27064..7757276a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ bytes = ["value-log/bytes"] [dependencies] byteorder = "1.5.0" -byteview = "0.6.1" +byteview = "0.7.0" # TODO: remove in favor of Slice wrapper crossbeam-skiplist = "0.1.3" double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 15ba9586..ccbda6fe 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -5,7 +5,7 @@ pub(crate) mod binary_index; mod checksum; mod encoder; -pub(crate) mod hash_index; +pub mod hash_index; mod header; mod offset; mod trailer; @@ -123,8 +123,12 @@ impl Block { size: u32, compression: CompressionType, ) -> crate::Result { - // TODO: use with_size_unzeroed (or whatever it will be called) - // TODO: use a Slice::get_mut instead... needs value-log update + // TODO: toggle with use_unsafe and add bench + + #[cfg(feature = "use_unsafe")] + let mut buf = byteview::ByteView::with_size_unzeroed(size as usize); + + #[cfg(not(feature = "use_unsafe"))] let mut buf = byteview::ByteView::with_size(size as usize); { From 3ad02dab9286172392e122dad670ea8be40a2739 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:20:57 +0200 Subject: [PATCH 125/613] refactor: remove superfluous parameter --- src/blob_tree/mod.rs | 2 +- src/tree/mod.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index d08134d3..f966f90b 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -465,7 +465,7 @@ impl AbstractTree for BlobTree { self.blobs.register_writer(blob_writer)?; log::trace!("Creating LSM-tree segment {segment_id}"); - let segment = self.index.consume_writer(segment_id, segment_writer)?; + let segment = self.index.consume_writer(segment_writer)?; // TODO: this can probably solved in a nicer way if segment.is_some() { diff --git a/src/tree/mod.rs b/src/tree/mod.rs index d78f528b..5437d542 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -206,7 +206,7 @@ impl AbstractTree for Tree { segment_writer.write(item?)?; } - let result = self.consume_writer(segment_id, segment_writer)?; + let result = self.consume_writer(segment_writer)?; log::debug!("Flushed memtable {segment_id:?} in {:?}", start.elapsed()); @@ -473,7 +473,6 @@ impl Tree { pub(crate) fn consume_writer( &self, - segment_id: SegmentId, // TODO: <- remove writer: crate::segment::Writer, ) -> crate::Result> { let segment_file_path = writer.path.to_path_buf(); From fa0c52d706725064de2fa33d0ac756c3a19bb670 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:22:13 +0200 Subject: [PATCH 126/613] fix --- src/manifest.rs | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/src/manifest.rs b/src/manifest.rs index 5e8132b7..e10d1736 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -5,13 +5,13 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, - TreeType, Version, + TreeType, TreeVersion, }; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::Write; pub struct Manifest { - pub(crate) version: Version, + pub(crate) version: TreeVersion, pub(crate) tree_type: TreeType, // pub(crate) table_type: TableType, pub(crate) level_count: u8, @@ -38,7 +38,7 @@ impl Decode for Manifest { #[allow(clippy::expect_used)] let version = *header.get(3).expect("header must be length 4"); - let version = Version::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; + let version = TreeVersion::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; let tree_type = reader.read_u8()?; // let table_type = reader.read_u8()?; From 0a3cf3846f11128a6a4da50e14786287885e7c44 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 10 May 2025 18:30:25 +0200 Subject: [PATCH 127/613] refactor --- src/segment/block/encoder.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 090c9577..3aa7c130 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -60,8 +60,7 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { hash_index_ratio: f32, first_key: &'a [u8], ) -> Self { - let binary_index_len = item_count / usize::from(restart_interval); - + let binary_index_builder = BinaryIndexBuilder::new(item_count / restart_interval as usize); let hash_index_builder = HashIndexBuilder::with_hash_ratio(item_count, hash_index_ratio); Self { @@ -77,7 +76,7 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { restart_interval, use_prefix_truncation: true, - binary_index_builder: BinaryIndexBuilder::new(binary_index_len), + binary_index_builder, hash_index_builder, base_key: first_key, From dd0e5082052a37cf4fa884517316c7665f190d0d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:21:27 +0200 Subject: [PATCH 128/613] fix: data block snapshot read --- src/segment/data_block/forward_reader.rs | 34 ++++++++++++++++++++++++ src/segment/data_block/mod.rs | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/segment/data_block/forward_reader.rs b/src/segment/data_block/forward_reader.rs index 3e55c030..1db08462 100644 --- a/src/segment/data_block/forward_reader.rs +++ b/src/segment/data_block/forward_reader.rs @@ -286,6 +286,40 @@ mod tests { }; use test_log::test; + #[test] + fn v3_data_block_snapshot_read_first() -> crate::Result<()> { + let items = [InternalValue::from_components( + "hello", + "world", + 0, + crate::ValueType::Value, + )]; + + let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + let serialized_len = bytes.len(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); + assert_eq!(data_block.inner.size(), serialized_len); + + assert_eq!( + Some(items[0].clone()), + data_block.point_read(b"hello", Some(777)) + ); + + Ok(()) + } + #[test] fn v3_data_block_point_read_one() -> crate::Result<()> { let items = [InternalValue::from_components( diff --git a/src/segment/data_block/mod.rs 
b/src/segment/data_block/mod.rs index 7e016b90..8bec8083 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -332,7 +332,7 @@ impl DataBlock { } if left == 0 { - return None; + return Some(0); } let offset = binary_index.get(left - 1); From 508f91966910f478d4fac3ef27e81aec89f11f87 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:23:19 +0200 Subject: [PATCH 129/613] rename TreeVersion --- src/error.rs | 4 ++-- src/{tree_version.rs => format_version.rs} | 16 ++++++++-------- src/lib.rs | 4 ++-- src/manifest.rs | 6 +++--- src/tree/mod.rs | 8 ++++---- tests/tree_v1_load_fixture.rs | 4 ++-- tests/tree_v2_load_fixture.rs | 4 ++-- 7 files changed, 23 insertions(+), 23 deletions(-) rename src/{tree_version.rs => format_version.rs} (72%) diff --git a/src/error.rs b/src/error.rs index 49cf17f4..14ab2372 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,7 +4,7 @@ use crate::{ coding::{DecodeError, EncodeError}, - tree_version::Version, + format_version::FormatVersion, Checksum, CompressionType, }; @@ -25,7 +25,7 @@ pub enum Error { Decompress(CompressionType), /// Invalid or unparsable data format version - InvalidVersion(Version), + InvalidVersion(FormatVersion), /// Some required segments could not be recovered from disk Unrecoverable, diff --git a/src/tree_version.rs b/src/format_version.rs similarity index 72% rename from src/tree_version.rs rename to src/format_version.rs index 19862ad8..54a6bc37 100644 --- a/src/tree_version.rs +++ b/src/format_version.rs @@ -4,7 +4,7 @@ /// Disk format version #[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum Version { +pub enum FormatVersion { /// Version for 1.x.x releases V1, @@ -15,23 +15,23 @@ pub enum Version { V3, } -impl std::fmt::Display for Version { +impl std::fmt::Display for FormatVersion { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", u8::from(*self)) } } -impl From for u8 { - fn from(value: Version) -> Self { +impl From for u8 { + fn from(value: FormatVersion) -> Self { match value { - Version::V1 => 1, - Version::V2 => 2, - Version::V3 => 3, + FormatVersion::V1 => 1, + FormatVersion::V2 => 2, + FormatVersion::V3 => 3, } } } -impl TryFrom for Version { +impl TryFrom for FormatVersion { type Error = (); fn try_from(value: u8) -> Result { diff --git a/src/lib.rs b/src/lib.rs index a4ab2a76..020a0e26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -178,9 +178,9 @@ mod windows; #[doc(hidden)] pub mod stop_signal; +mod format_version; mod time; mod tree; -mod tree_version; mod value; mod version; @@ -213,12 +213,12 @@ pub use { config::{Config, TreeType}, descriptor_table::DescriptorTable, error::{Error, Result}, + format_version::FormatVersion, memtable::Memtable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, snapshot::Snapshot, tree::Tree, - tree_version::Version as TreeVersion, value::{SeqNo, UserKey, UserValue, ValueType}, }; diff --git a/src/manifest.rs b/src/manifest.rs index e10d1736..2fea376e 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -5,13 +5,13 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, file::MAGIC_BYTES, - TreeType, TreeVersion, + FormatVersion, TreeType, }; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::Write; pub struct Manifest { - pub(crate) version: TreeVersion, + pub(crate) version: FormatVersion, pub(crate) tree_type: TreeType, // pub(crate) table_type: TableType, pub(crate) level_count: u8, @@ -38,7 +38,7 @@ impl Decode for Manifest { #[allow(clippy::expect_used)] let version = 
*header.get(3).expect("header must be length 4"); - let version = TreeVersion::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; + let version = FormatVersion::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; let tree_type = reader.read_u8()?; // let table_type = reader.read_u8()?; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 5437d542..9201b25b 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -9,11 +9,11 @@ use crate::{ coding::{Decode, Encode}, compaction::CompactionStrategy, config::Config, + format_version::FormatVersion, level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, segment::Segment, - tree_version::Version, value::InternalValue, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, ValueType, @@ -455,7 +455,7 @@ impl Tree { // Check for old version if config.path.join("version").try_exists()? { - return Err(crate::Error::InvalidVersion(Version::V1)); + return Err(crate::Error::InvalidVersion(FormatVersion::V1)); } let tree = if config.path.join(MANIFEST_FILE).try_exists()? { @@ -787,7 +787,7 @@ impl Tree { let mut bytes = Cursor::new(bytes); let manifest = Manifest::decode_from(&mut bytes)?; - if manifest.version != Version::V3 { + if manifest.version != FormatVersion::V3 { return Err(crate::Error::InvalidVersion(manifest.version)); } @@ -842,7 +842,7 @@ impl Tree { // -> the LSM is fully initialized let mut file = File::create(manifest_path)?; Manifest { - version: Version::V3, + version: FormatVersion::V3, level_count: config.level_count, tree_type: config.tree_type, // table_type: TableType::Block, diff --git a/tests/tree_v1_load_fixture.rs b/tests/tree_v1_load_fixture.rs index 1afff2bc..306b9303 100644 --- a/tests/tree_v1_load_fixture.rs +++ b/tests/tree_v1_load_fixture.rs @@ -9,7 +9,7 @@ fn tree_load_v1() -> lsm_tree::Result<()> { matches!( result, - Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V1)) + Err(lsm_tree::Error::InvalidVersion(lsm_tree::FormatVersion::V1)) ); Ok(()) @@ -23,7 +23,7 @@ fn tree_load_v1_corrupt() -> lsm_tree::Result<()> { matches!( result, - Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V1)) + Err(lsm_tree::Error::InvalidVersion(lsm_tree::FormatVersion::V1)) ); Ok(()) diff --git a/tests/tree_v2_load_fixture.rs b/tests/tree_v2_load_fixture.rs index e85065a6..5c04cff9 100644 --- a/tests/tree_v2_load_fixture.rs +++ b/tests/tree_v2_load_fixture.rs @@ -9,7 +9,7 @@ fn tree_load_v2() -> lsm_tree::Result<()> { matches!( result, - Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V2)) + Err(lsm_tree::Error::InvalidVersion(lsm_tree::FormatVersion::V2)) ); Ok(()) @@ -23,7 +23,7 @@ fn tree_load_v2_corrupt() -> lsm_tree::Result<()> { matches!( result, - Err(lsm_tree::Error::InvalidVersion(lsm_tree::Version::V2)) + Err(lsm_tree::Error::InvalidVersion(lsm_tree::FormatVersion::V2)) ); Ok(()) From b4e3adc3d9812531acf47889ecac5a404e9d8ec9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:23:50 +0200 Subject: [PATCH 130/613] doc: fix complexity of operations --- src/abstract.rs | 2 +- src/snapshot.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index a7237daf..4dbfb7c2 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -199,7 +199,7 @@ pub trait AbstractTree { /// Returns `true` if the tree is empty. /// - /// This operation has O(1) complexity. + /// This operation has O(log N) complexity. 
/// /// # Examples /// diff --git a/src/snapshot.rs b/src/snapshot.rs index f6ba9890..5d8348fc 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -323,7 +323,7 @@ impl Snapshot { /// Returns `true` if the snapshot is empty. /// - /// This operation has O(1) complexity. + /// This operation has O(log N) complexity. /// /// # Examples /// From ad46c7fb3e1edd1efea86e4f0aa65bd82d9e6c41 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:24:15 +0200 Subject: [PATCH 131/613] add derives to Timestamp --- src/segment/meta.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index cd71a5cc..d9af86f3 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -8,6 +8,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use std::{fs::File, ops::Deref}; /// Nanosecond timestamp. +#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] pub struct Timestamp(u128); impl Deref for Timestamp { From 80a8cb3ad1c5c167c8abe6fdbe580d87ce2c1a42 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:24:34 +0200 Subject: [PATCH 132/613] wip --- src/segment/index_block/block_handle.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 8dd20729..f75f7ab0 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -22,6 +22,7 @@ pub struct BlockHandle { } impl BlockHandle { + #[must_use] pub fn new(offset: BlockOffset, size: u32) -> Self { Self { offset, size } } From 9be5bf0b814842c99d7b2d7490a1d6e97bc5a2fa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:26:39 +0200 Subject: [PATCH 133/613] refactor: checksum --- src/segment/block/checksum.rs | 10 +--------- src/segment/block/mod.rs | 6 +++--- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/segment/block/checksum.rs b/src/segment/block/checksum.rs index 5b3f0edc..882de155 100644 --- a/src/segment/block/checksum.rs +++ b/src/segment/block/checksum.rs @@ -2,9 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use xxhash_rust::xxh3::xxh3_64; - -/// A checksum based on xxh3 +/// A 64-bit checksum #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub struct Checksum(u64); @@ -21,10 +19,4 @@ impl Checksum { pub fn from_raw(value: u64) -> Self { Self(value) } - - /// Calculates a checksum. - #[must_use] - pub fn from_bytes(bytes: &[u8]) -> Self { - Self(xxh3_64(bytes)) - } } diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index ccbda6fe..31c00ca4 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -44,10 +44,8 @@ impl Block { data: &[u8], compression: CompressionType, ) -> crate::Result<Header>
{ - let checksum = xxh3_64(data); - let mut header = Header { - checksum: Checksum::from_raw(checksum), + checksum: Checksum::from_raw(xxh3_64(data)), data_length: 0, // <-- NOTE: Is set later on uncompressed_length: data.len() as u32, previous_block_offset: BlockOffset(0), // <-- TODO: @@ -197,6 +195,8 @@ impl Block { } }; + // TODO: check checksum + debug_assert_eq!(header.uncompressed_length, { #[allow(clippy::expect_used, clippy::cast_possible_truncation)] { From b7672df01a002ffe2a6cfea81f18b3494f80777e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 23 May 2025 21:29:53 +0200 Subject: [PATCH 134/613] refactor: block --- src/segment/block/mod.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 31c00ca4..906208d2 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -39,6 +39,7 @@ impl Block { self.data.len() } + /// Encodes a block into a writer. pub fn to_writer( mut writer: &mut W, data: &[u8], @@ -74,6 +75,7 @@ impl Block { Ok(header) } + /// Reads a block from a reader. pub fn from_reader( reader: &mut R, compression: CompressionType, @@ -115,6 +117,7 @@ impl Block { } // TODO: take non-keyed block handle + /// Reads a block from a file without needing to seek the file. pub fn from_file( file: &File, offset: BlockOffset, @@ -197,12 +200,10 @@ impl Block { // TODO: check checksum - debug_assert_eq!(header.uncompressed_length, { - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] - { - data.len() as u32 - } - }); + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + debug_assert_eq!(header.uncompressed_length, data.len() as u32); + } Ok(Self { header, From 2e6fd99179ef09e4cb381797af7534a83e3bc71f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 18:49:53 +0200 Subject: [PATCH 135/613] add simple tree reload test --- tests/tree_reload.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/tree_reload.rs b/tests/tree_reload.rs index c9043866..a2808fea 100644 --- a/tests/tree_reload.rs +++ b/tests/tree_reload.rs @@ -4,6 +4,30 @@ use test_log::test; const ITEM_COUNT: usize = 10_000; +#[test] +fn tree_reload_smoke_test() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + { + let tree = Config::new(&folder).open()?; + assert_eq!(0, tree.segment_count()); + + tree.insert("a", "a", 0); + tree.flush_active_memtable(0)?; + + assert_eq!(1, tree.segment_count()); + assert!(tree.contains_key("a", None)?); + } + + { + let tree = Config::new(&folder).open()?; + assert_eq!(1, tree.segment_count()); + assert!(tree.contains_key("a", None)?); + } + + Ok(()) +} + #[test] fn tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; From eebab0b585c39b2488044a8e3acf47db416c5c2e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 18:53:25 +0200 Subject: [PATCH 136/613] add key range partitions structs --- src/version/key_range_partition.rs | 266 +++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 src/version/key_range_partition.rs diff --git a/src/version/key_range_partition.rs b/src/version/key_range_partition.rs new file mode 100644 index 00000000..a3a2c524 --- /dev/null +++ b/src/version/key_range_partition.rs @@ -0,0 +1,266 @@ +use crate::{ + binary_search::partition_point, version::run::Ranged, KeyRange, Segment, SegmentId, UserKey, +}; +use std::{ + collections::{HashSet, VecDeque}, + fmt::Debug, +}; + +pub trait Identifiable { + fn 
id(&self) -> Id; +} + +impl Identifiable for Segment { + fn id(&self) -> SegmentId { + self.id() + } +} + +#[derive(Clone, Debug)] +pub struct Partition> { + key_range: KeyRange, + segments: VecDeque, +} + +#[derive(Clone, Debug, Default)] +pub struct KeyRangePartitions>( + Vec>, +); + +impl> KeyRangePartitions { + pub fn new(pairs: impl Iterator) -> Self { + let mut partitions = vec![]; + + for (start_key, end_key) in pairs { + partitions.push(Partition { + key_range: KeyRange::new((start_key, end_key)), + segments: VecDeque::new(), + }); + } + + Self(partitions) + } + + pub fn index_segment(&mut self, segment: &T) { + let key_range = &segment.key_range(); + let start_key = key_range.min(); + let end_key = key_range.max(); + + let idx = partition_point(&self.0, |x| x.key_range.max() < start_key); + + if let Some(slice) = self.0.get_mut(idx..) { + for partition in slice.iter_mut().filter(|x| x.key_range.max() <= end_key) { + partition.segments.push_back(segment.clone()); + } + } + } + + pub fn into_optimized_runs(mut self) -> Vec> { + let mut optimized = VecDeque::new(); + let mut blacklist = HashSet::::default(); + + loop { + let run = { + let mut v: Vec = vec![]; + + for partition in &mut self.0 { + let Some(front) = partition.segments.front() else { + continue; + }; + + let curr_id = front.id(); + + if blacklist.contains(&curr_id) { + continue; + } + + if v.iter() + .any(|x| x.key_range().overlaps_with_key_range(front.key_range())) + { + continue; + } + + // NOTE: We just got front previously + #[allow(clippy::expect_used)] + v.push(partition.segments.pop_front().expect("front should exist")); + + blacklist.insert(curr_id); + } + + v + }; + + if run.is_empty() { + break; + } + + #[cfg(debug_assertions)] + { + let ranges = run.iter().map(Ranged::key_range).collect::>(); + debug_assert!(KeyRange::is_disjoint(&ranges)); + } + + optimized.push_front(run); + } + + optimized.into() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[derive(Clone, Debug, PartialEq, Eq)] + struct FauxSegment { + key_range: KeyRange, + id: SegmentId, + } + + impl Identifiable for FauxSegment { + fn id(&self) -> SegmentId { + self.id + } + } + + impl Ranged for FauxSegment { + fn key_range(&self) -> &KeyRange { + &self.key_range + } + } + + #[test] + fn key_range_partition_one_segment() { + let segment = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), + id: 0, + }; + + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(b"a"), + UserKey::new(b"b"), + ))); + + index.index_segment(&segment); + + assert_eq!(vec![vec![segment]], index.into_optimized_runs()); + } + + #[test] + fn key_range_partition_two_to_one() { + let a = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), + id: 0, + }; + let b = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"c"), UserKey::new(b"d"))), + id: 1, + }; + + { + let mut index = KeyRangePartitions::::new( + [ + (UserKey::new(b"a"), UserKey::new(b"b")), + (UserKey::new(b"b"), UserKey::new(b"c")), + (UserKey::new(b"c"), UserKey::new(b"d")), + ] + .into_iter(), + ); + + index.index_segment(&a); + index.index_segment(&b); + + assert_eq!( + vec![vec![a.clone(), b.clone()]], + index.into_optimized_runs() + ); + } + + { + let mut index = KeyRangePartitions::::new( + [ + (UserKey::new(b"a"), UserKey::new(b"b")), + (UserKey::new(b"b"), UserKey::new(b"c")), + (UserKey::new(b"c"), UserKey::new(b"d")), + ] + .into_iter(), + ); + + index.index_segment(&b); + 
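+        // `b` is indexed before `a` here; the optimized runs must come out
+        // the same regardless of indexing order.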
index.index_segment(&a); + + assert_eq!(vec![vec![a, b]], index.into_optimized_runs()); + } + } + + #[test] + fn key_range_partition_full_overlap() { + let a = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"z"))), + id: 0, + }; + let b = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"z"))), + id: 1, + }; + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(b"a"), + UserKey::new(b"z"), + ))); + + index.index_segment(&a); + index.index_segment(&b); + + assert_eq!( + vec![vec![b.clone()], vec![a.clone()]], + index.into_optimized_runs() + ); + } + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(b"a"), + UserKey::new(b"z"), + ))); + + index.index_segment(&b); + index.index_segment(&a); + + assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); + } + } + + #[test] + fn key_range_partition_partial_overlap() { + let a = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"k"))), + id: 0, + }; + let b = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"c"), UserKey::new(b"z"))), + id: 1, + }; + + { + let mut index = KeyRangePartitions::::new( + [ + (UserKey::new(b"a"), UserKey::new(b"c")), + (UserKey::new(b"c"), UserKey::new(b"k")), + (UserKey::new(b"k"), UserKey::new(b"z")), + ] + .into_iter(), + ); + + index.index_segment(&a); + index.index_segment(&b); + + assert_eq!( + vec![vec![b.clone()], vec![a.clone()]], + index.into_optimized_runs() + ); + } + } +} From ee7e4777535db59f9881b2829f6e424fb23af74a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 18:54:42 +0200 Subject: [PATCH 137/613] add version & run structs --- src/version/mod.rs | 352 +++++++++++++++++++++++++++++++++++++++++++++ src/version/run.rs | 308 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 660 insertions(+) create mode 100644 src/version/mod.rs create mode 100644 src/version/run.rs diff --git a/src/version/mod.rs b/src/version/mod.rs new file mode 100644 index 00000000..37500ca5 --- /dev/null +++ b/src/version/mod.rs @@ -0,0 +1,352 @@ +pub mod key_range_partition; +pub mod run; + +pub use run::Run; + +use crate::{HashSet, KeyRange, Segment, SegmentId, UserKey}; +use key_range_partition::KeyRangePartitions; +use run::Ranged; +use std::{collections::BTreeSet, ops::Deref, sync::Arc}; + +pub type VersionId = u64; + +impl Ranged for Segment { + fn key_range(&self) -> &KeyRange { + &self.metadata.key_range + } +} + +pub struct GenericLevel { + runs: Vec>>, +} + +impl std::ops::Deref for GenericLevel { + type Target = [Arc>]; + + fn deref(&self) -> &Self::Target { + &self.runs + } +} + +impl GenericLevel { + pub fn new(runs: Vec>>) -> Self { + Self { runs } + } + + pub fn get_runs(&self) -> Vec>> { + self.runs.clone() + } + + pub fn segment_count(&self) -> usize { + self.iter().map(|x| x.len()).sum() + } + + pub fn run_count(&self) -> usize { + self.runs.len() + } + + pub fn is_disjoint(&self) -> bool { + self.run_count() == 1 + } + + pub fn is_empty(&self) -> bool { + self.runs.is_empty() + } + + pub fn iter(&self) -> impl Iterator> { + self.runs.iter().map(std::ops::Deref::deref) + } + + pub fn get_for_key<'a>(&'a self, key: &'a [u8]) -> impl Iterator { + self.iter().filter_map(|x| x.get_for_key(key)) + } + + pub fn get_overlapping<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator { + self.iter().flat_map(|x| x.get_overlapping(key_range)) + } + + pub fn get_contained<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator { + 
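+        // Unlike `get_overlapping`, a segment only qualifies if its whole
+        // key range lies inside `key_range`.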
self.iter().flat_map(|x| x.get_contained(key_range)) + } +} + +#[derive(Clone)] +pub struct Level(Arc>); + +impl std::ops::Deref for Level { + type Target = GenericLevel; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Level { + pub fn empty() -> Self { + Self::from_runs(vec![]) + } + + pub fn from_runs(runs: Vec>>) -> Self { + Self(Arc::new(GenericLevel { runs })) + } + + pub fn list_ids(&self) -> HashSet { + self.iter() + .flat_map(|run| run.iter()) + .map(Segment::id) + .collect() + } + + pub fn first_run(&self) -> Option<&Arc>> { + assert!(self.runs.len() <= 1, "should have at most one run"); + + self.runs.first() + } + + /// Returns the on-disk size of the level. + pub fn size(&self) -> u64 { + self.0 + .iter() + .flat_map(|x| x.iter()) + .map(|x| x.metadata.file_size) + .sum() + } + + pub fn aggregate_key_range(&self) -> KeyRange { + let key_ranges = self + .iter() + .map(Run::aggregate_key_range) + .collect::>(); + + KeyRange::aggregate(key_ranges.iter()) + } +} + +pub struct VersionInner { + id: VersionId, + pub(crate) levels: Vec, +} + +/// A version is a point-in-time view of a tree's structure +/// +/// Any time a segment is created or deleted, a new version is created. +#[derive(Clone)] +pub struct Version(Arc); + +impl std::ops::Deref for Version { + type Target = VersionInner; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub fn optimize_runs(level: Vec>) -> Vec> { + if level.len() <= 1 { + level + } else { + let mut key_range_boundaries: BTreeSet = BTreeSet::::default(); + + for run in &level { + for fragment in run.iter() { + let key_range = &fragment.metadata.key_range; + key_range_boundaries.insert(key_range.min().clone()); + key_range_boundaries.insert(key_range.max().clone()); + } + } + + let mut index = KeyRangePartitions::new( + key_range_boundaries + .into_iter() + .collect::>() + .windows(2) + .map(|pair| { + // NOTE: We are iterating over pairs, so index 0 and 1 always exist + #[allow(clippy::expect_used)] + #[allow(clippy::get_first)] + ( + pair.get(0).expect("exists").clone(), + pair.get(1).expect("exists").clone(), + ) + }), + ); + + // IMPORTANT: Index from bottom to top + for run in level.iter().rev() { + for segment in run.iter() { + index.index_segment(segment); + } + } + + index + .into_optimized_runs() + .into_iter() + .map(Run::new) + .collect() + } +} + +// TODO: impl using generics so we can easily unit test Version transformation functions +impl Version { + pub fn id(&self) -> VersionId { + self.id + } + + pub fn new(id: VersionId) -> Self { + let levels = (0..7).map(|_| Level::empty()).collect(); + + Self(Arc::new(VersionInner { id, levels })) + } + + pub fn from_levels(id: VersionId, levels: Vec) -> Self { + Self(Arc::new(VersionInner { id, levels })) + } + + /// Returns the amount of levels. 
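+    /// (A freshly created version starts out with 7 empty levels, see `Version::new`.)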
+ pub fn level_count(&self) -> usize { + self.levels.len() + } + + pub fn iter_levels(&self) -> impl Iterator { + self.levels.iter() + } + + pub fn segment_count(&self) -> usize { + self.iter_levels().map(|x| x.segment_count()).sum() + } + + pub fn iter_segments(&self) -> impl Iterator { + self.levels + .iter() + .flat_map(|x| x.iter()) + .flat_map(|x| x.iter()) + } + + pub fn level(&self, n: usize) -> Option<&Level> { + self.levels.get(n) + } + + pub fn with_new_l0_segment(&self, run: &[Segment]) -> Self { + let id = self.id + 1; + + let mut levels = vec![]; + + // L0 + levels.push({ + // Copy-on-write the first level with new run at top + let l0 = self.levels.first().expect("L0 should always exist"); + + let prev_runs = l0.get_runs(); + + let mut runs = Vec::with_capacity(prev_runs.len() + 1); + runs.push(Arc::new(Run::new(run.to_vec()))); + runs.extend(prev_runs); + + Level::from_runs(runs) + }); + + // L1+ + levels.extend(self.levels.iter().skip(1).cloned()); + + Self(Arc::new(VersionInner { id, levels })) + } + + pub fn with_dropped(&self, ids: &[SegmentId]) -> Self { + let id = self.id + 1; + + let mut levels = vec![]; + + for level in &self.levels { + let runs = level + .runs + .iter() + .map(|run| { + // TODO: don't clone Arc inner if we don't need to modify + let mut run: Run = run.deref().clone(); + run.retain(|x| !ids.contains(&x.metadata.id)); + run + }) + .filter(|x| !x.is_empty()) + .collect::>(); + + let runs = optimize_runs(runs); + + levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); + } + + Self(Arc::new(VersionInner { id, levels })) + } + + pub fn with_merge( + &self, + old_ids: &[SegmentId], + new_segments: &[Segment], + dest_level: usize, + ) -> Self { + let id = self.id + 1; + + let mut levels = vec![]; + + for (level_idx, level) in self.levels.iter().enumerate() { + let mut runs = level + .runs + .iter() + .map(|run| { + // TODO: don't clone Arc inner if we don't need to modify + let mut run: Run = run.deref().clone(); + run.retain(|x| !old_ids.contains(&x.metadata.id)); + run + }) + .filter(|x| !x.is_empty()) + .collect::>(); + + if level_idx == dest_level { + runs.insert(0, Run::new(new_segments.to_vec())); + } + + let runs = optimize_runs(runs); + + levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); + } + + Self(Arc::new(VersionInner { id, levels })) + } + + pub fn with_moved(&self, ids: &[SegmentId], dest_level: usize) -> Self { + let id = self.id + 1; + + let affected_segments = self + .iter_segments() + .filter(|x| ids.contains(&x.id())) + .cloned() + .collect::>(); + + assert_eq!(affected_segments.len(), ids.len(), "invalid segment IDs"); + + let mut levels = vec![]; + + for (level_idx, level) in self.levels.iter().enumerate() { + let mut runs = level + .runs + .iter() + .map(|run| { + // TODO: don't clone Arc inner if we don't need to modify + let mut run: Run = run.deref().clone(); + run.retain(|x| !ids.contains(&x.metadata.id)); + run + }) + .filter(|x| !x.is_empty()) + .collect::>(); + + if level_idx == dest_level { + runs.insert(0, Run::new(affected_segments.clone())); + } + + let runs = optimize_runs(runs); + + levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); + } + + Self(Arc::new(VersionInner { id, levels })) + } +} diff --git a/src/version/run.rs b/src/version/run.rs new file mode 100644 index 00000000..d98281ef --- /dev/null +++ b/src/version/run.rs @@ -0,0 +1,308 @@ +use crate::{binary_search::partition_point, KeyRange}; +use std::ops::{Bound, RangeBounds}; + +pub trait Ranged { + fn 
key_range(&self) -> &KeyRange; +} + +/// Item inside a run +/// +/// May point to an interval [min, max] of segments in the next run. +pub struct Indexed<T> { + inner: T, + // cascade_indexes: (u32, u32), +} + +/* impl<T: Ranged> Indexed<T> { + pub fn update_cascading(&mut self, next_run: &Run<T>) { + let kr = self.key_range(); + let range = &**kr.min()..=&**kr.max(); + + if let Some((lo, hi)) = next_run.range_indexes(range) { + // NOTE: There are never 4+ billion segments in a run + #[allow(clippy::cast_possible_truncation)] + let interval = (lo as u32, hi as u32); + + self.cascade_indexes = interval; + } else { + self.cascade_indexes = (u32::MAX, u32::MAX); + } + } +} */ + +impl<T: Ranged> Ranged for Indexed<T> { + fn key_range(&self) -> &KeyRange { + self.inner.key_range() + } +} + +impl<T> std::ops::Deref for Indexed<T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +/// A disjoint run of disk segments +#[derive(Clone)] +pub struct Run<T>(Vec<T>); + +impl<T> std::ops::Deref for Run<T> { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<T: Ranged> Run<T> { + pub fn new(items: Vec<T>) -> Self { + Self(items) + } + + pub fn push(&mut self, item: T) { + self.0.push(item); + } + + pub fn extend(&mut self, items: Vec<T>) { + self.0.extend(items); + } + + pub fn retain<F>(&mut self, f: F) + where + F: FnMut(&T) -> bool, + { + self.0.retain(f); + } + + pub fn remove(&mut self, idx: usize) -> T { + self.0.remove(idx) + } + + /// Returns the segment that possibly contains the key. + pub fn get_for_key(&self, key: &[u8]) -> Option<&T> { + let idx = partition_point(self, |x| x.key_range().max() < &key); + + self.0.get(idx).filter(|x| x.key_range().min() <= &key) + } + + pub fn aggregate_key_range(&self) -> KeyRange { + let lo = self.first().expect("runs are never empty"); + let hi = self.last().expect("runs are never empty"); + KeyRange::new((lo.key_range().min().clone(), hi.key_range().max().clone())) + } + + /// Returns an iterator over segments in the run that have a key range + /// overlapping the input key range. + pub fn get_overlapping<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator<Item = &'a T> { + self.iter() + .filter(|x| x.key_range().overlaps_with_key_range(key_range)) + } + + /// Returns an iterator over segments in the run that have a key range + /// fully contained in the input key range. + pub fn get_contained<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator<Item = &'a T> { + self.iter() + .filter(|x| key_range.contains_range(x.key_range())) + } + + /// Returns the indexes of the interval [min, max] of segments that overlap with a given range.
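+    /// Returns `None` if no segment in the run overlaps the given range.
+    ///
+    /// For example, for a run of segments covering [a-d], [e-j], [k-o], [p-z],
+    /// `range_indexes(b"a"..=b"g")` yields `Some((0, 1))`.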
+ pub fn range_indexes<'a, R: RangeBounds<&'a [u8]>>( + &self, + key_range: R, + ) -> Option<(usize, usize)> { + let level = &self.0; + + let lo = match key_range.start_bound() { + Bound::Unbounded => 0, + Bound::Included(start_key) => { + partition_point(level, |x| x.key_range().max() < start_key) + } + Bound::Excluded(start_key) => { + partition_point(level, |x| x.key_range().max() <= start_key) + } + }; + + if lo >= level.len() { + return None; + } + + // NOTE: We check for level length above + #[allow(clippy::indexing_slicing)] + let truncated_level = &level[lo..]; + + let hi = match key_range.end_bound() { + Bound::Unbounded => level.len() - 1, + Bound::Included(end_key) => { + // IMPORTANT: We need to add back `lo` because we sliced it off + let idx = lo + partition_point(truncated_level, |x| x.key_range().min() <= end_key); + + if idx == 0 { + return None; + } + + idx.saturating_sub(1) // To avoid underflow + } + Bound::Excluded(end_key) => { + // IMPORTANT: We need to add back `lo` because we sliced it off + let idx = lo + partition_point(truncated_level, |x| x.key_range().min() < end_key); + + if idx == 0 { + return None; + } + + idx.saturating_sub(1) // To avoid underflow + } + }; + + if lo > hi { + return None; + } + + Some((lo, hi)) + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use test_log::test; + + #[derive(Clone)] + struct FakeSegment { + id: u64, + key_range: KeyRange, + } + + impl Ranged for FakeSegment { + fn key_range(&self) -> &KeyRange { + &self.key_range + } + } + + fn s(id: u64, min: &str, max: &str) -> FakeSegment { + FakeSegment { + id, + key_range: KeyRange::new((min.as_bytes().into(), max.as_bytes().into())), + } + } + + #[test] + fn run_aggregate_key_range() { + let items = vec![ + s(0, "a", "d"), + s(1, "e", "j"), + s(2, "k", "o"), + s(3, "p", "z"), + ]; + + let run = Run(items); + + assert_eq!( + KeyRange::new((b"a".into(), b"z".into())), + run.aggregate_key_range(), + ); + } + + #[test] + fn run_point_lookup() { + let items = vec![ + s(0, "a", "d"), + s(1, "e", "j"), + s(2, "k", "o"), + s(3, "p", "z"), + ]; + + let run = Run(items); + + assert_eq!(0, run.get_for_key(b"a").unwrap().id); + assert_eq!(0, run.get_for_key(b"aaa").unwrap().id); + assert_eq!(0, run.get_for_key(b"b").unwrap().id); + assert_eq!(0, run.get_for_key(b"c").unwrap().id); + assert_eq!(0, run.get_for_key(b"d").unwrap().id); + assert_eq!(1, run.get_for_key(b"e").unwrap().id); + assert_eq!(1, run.get_for_key(b"j").unwrap().id); + assert_eq!(2, run.get_for_key(b"k").unwrap().id); + assert_eq!(2, run.get_for_key(b"o").unwrap().id); + assert_eq!(3, run.get_for_key(b"p").unwrap().id); + assert_eq!(3, run.get_for_key(b"z").unwrap().id); + assert!(run.get_for_key(b"zzz").is_none()); + } + + #[test] + fn run_range_culling() { + let items = vec![ + s(0, "a", "d"), + s(1, "e", "j"), + s(2, "k", "o"), + s(3, "p", "z"), + ]; + + let run = Run(items); + + assert_eq!(Some((0, 3)), run.range_indexes(..)); + assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"a")); + assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"b")); + assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"d")); + assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..b"d")); + assert_eq!(Some((0, 1)), run.range_indexes(b"a" as &[u8]..=b"g")); + assert_eq!(Some((0, 3)), run.range_indexes(b"a" as &[u8]..=b"z")); + assert_eq!(Some((3, 3)), run.range_indexes(b"z" as &[u8]..=b"zzz")); + assert_eq!(Some((3, 3)), run.range_indexes(b"z" as &[u8]..)); + 
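+        // A range starting beyond the greatest max key cannot match any segment: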
assert!(run.range_indexes(b"zzz" as &[u8]..=b"zzzzzzz").is_none()); + } + + #[test] + fn run_range_overlaps() { + let items = vec![ + s(0, "a", "d"), + s(1, "e", "j"), + s(2, "k", "o"), + s(3, "p", "z"), + ]; + + let run = Run(items); + + assert_eq!( + &[0], + &*run + .get_overlapping(&KeyRange::new((b"d".into(), b"d".into()))) + .map(|x| x.id) + .collect::>(), + ); + + assert_eq!( + &[0], + &*run + .get_overlapping(&KeyRange::new((b"a".into(), b"d".into()))) + .map(|x| x.id) + .collect::>(), + ); + + assert_eq!( + &[0, 1], + &*run + .get_overlapping(&KeyRange::new((b"a".into(), b"f".into()))) + .map(|x| x.id) + .collect::>(), + ); + + assert_eq!( + &[0, 1, 2, 3], + &*run + .get_overlapping(&KeyRange::new((b"a".into(), b"zzz".into()))) + .map(|x| x.id) + .collect::>(), + ); + + assert_eq!( + &[] as &[u64], + &*run + .get_overlapping(&KeyRange::new((b"zzz".into(), b"zzzz".into()))) + .map(|x| x.id) + .collect::>(), + ); + } +} From 29fbd0c2d8e6b9a1078119c45c849975d522e699 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 18:56:31 +0200 Subject: [PATCH 138/613] change level_reader and _scanner to use run struct --- src/{level_reader.rs => run_reader.rs} | 39 +++++++++++------------- src/{level_scanner.rs => run_scanner.rs} | 35 ++++++++++----------- 2 files changed, 35 insertions(+), 39 deletions(-) rename src/{level_reader.rs => run_reader.rs} (93%) rename src/{level_scanner.rs => run_scanner.rs} (86%) diff --git a/src/level_reader.rs b/src/run_reader.rs similarity index 93% rename from src/level_reader.rs rename to src/run_reader.rs index 5f92d892..94a7ffc8 100644 --- a/src/level_reader.rs +++ b/src/run_reader.rs @@ -2,12 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{level_manifest::level::Level, segment::CachePolicy, InternalValue, UserKey}; +use crate::{segment::CachePolicy, version::Run, InternalValue, Segment}; use std::{ops::Bound, sync::Arc}; -/// Reads through a disjoint level +/// Reads through a disjoint run pub struct LevelReader { - segments: Arc, + segments: Arc>, lo: usize, hi: usize, lo_reader: Option<()>, // TODO: range @@ -18,28 +18,21 @@ pub struct LevelReader { impl LevelReader { #[must_use] pub fn new( - level: Arc, - range: &(Bound, Bound), + run: Arc>, + range: &(Bound<&[u8]>, Bound<&[u8]>), cache_policy: CachePolicy, ) -> Option { - assert!(!level.is_empty(), "level reader cannot read empty level"); + assert!(!run.is_empty(), "level reader cannot read empty level"); - let disjoint_level = level.as_disjoint().expect("level should be disjoint"); + let (lo, hi) = run.range_indexes(*range)?; - let (lo, hi) = disjoint_level.range_indexes(range)?; - - Some(Self::from_indexes( - level, - range, - (Some(lo), Some(hi)), - cache_policy, - )) + Some(Self::culled(run, range, (Some(lo), Some(hi)), cache_policy)) } #[must_use] - pub fn from_indexes( - level: Arc, - range: &(Bound, Bound), + pub fn culled( + run: Arc>, + range: &(Bound<&[u8]>, Bound<&[u8]>), (lo, hi): (Option, Option), cache_policy: CachePolicy, ) -> Self { @@ -151,7 +144,8 @@ mod tests { use std::ops::Bound::{Included, Unbounded}; use test_log::test; - #[test] + // TODO: restore + /* #[test] fn level_reader_skip() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; @@ -198,9 +192,10 @@ mod tests { .is_none()); Ok(()) - } + } */ - #[test] + // TODO: restore + /* #[test] #[allow(clippy::unwrap_used)] fn level_reader_basic() -> crate::Result<()> { 
let tempdir = tempfile::tempdir()?; @@ -333,5 +328,5 @@ mod tests { } Ok(()) - } + } */ } diff --git a/src/level_scanner.rs b/src/run_scanner.rs similarity index 86% rename from src/level_scanner.rs rename to src/run_scanner.rs index d7f9ef4b..14ec5532 100644 --- a/src/level_scanner.rs +++ b/src/run_scanner.rs @@ -2,33 +2,33 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{level_manifest::level::Level, segment::Scanner, InternalValue}; +use crate::{segment::Scanner, version::Run, InternalValue, Segment}; use std::sync::Arc; -/// Scans through a disjoint level +/// Scans through a disjoint run /// /// Optimized for compaction, by using a `SegmentScanner` instead of `SegmentReader`. -pub struct LevelScanner { - segments: Arc, +pub struct RunScanner { + segments: Arc>, lo: usize, hi: usize, lo_reader: Option, } -impl LevelScanner { - pub fn from_indexes( - level: Arc, +impl RunScanner { + pub fn culled( + run: Arc>, (lo, hi): (Option, Option), ) -> crate::Result { let lo = lo.unwrap_or_default(); - let hi = hi.unwrap_or(level.len() - 1); + let hi = hi.unwrap_or(run.len() - 1); - let lo_segment = level.segments.get(lo).expect("should exist"); + let lo_segment = run.get(lo).expect("should exist"); let lo_reader = lo_segment.scan()?; Ok(Self { - segments: level, + segments: run, lo, hi, lo_reader: Some(lo_reader), @@ -36,7 +36,7 @@ impl LevelScanner { } } -impl Iterator for LevelScanner { +impl Iterator for RunScanner { type Item = crate::Result; fn next(&mut self) -> Option { @@ -70,7 +70,8 @@ mod tests { use crate::{AbstractTree, Slice}; use test_log::test; - #[test] + // TODO: restore + /* #[test] fn level_scanner_basic() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; @@ -90,10 +91,10 @@ mod tests { } let segments = tree - .levels + .version .read() .expect("lock is poisoned") - .iter() + .iter_segments() .cloned() .collect::>(); @@ -104,7 +105,7 @@ mod tests { #[allow(clippy::unwrap_used)] { - let multi_reader = LevelScanner::from_indexes(level.clone(), (None, None))?; + let multi_reader = RunScanner::from_indexes(level.clone(), (None, None))?; let mut iter = multi_reader.flatten(); @@ -124,7 +125,7 @@ mod tests { #[allow(clippy::unwrap_used)] { - let multi_reader = LevelScanner::from_indexes(level.clone(), (Some(1), None))?; + let multi_reader = RunScanner::from_indexes(level.clone(), (Some(1), None))?; let mut iter = multi_reader.flatten(); @@ -140,5 +141,5 @@ mod tests { } Ok(()) - } + } */ } From 56fd4fb876ec1a94d78b007cb8038b287ed0024c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:08:10 +0200 Subject: [PATCH 139/613] update level manifest with new version & run structs --- src/level_manifest/mod.rs | 505 ++++++++++++++++++-------------------- 1 file changed, 235 insertions(+), 270 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 11df5e88..efca42b3 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -3,106 +3,111 @@ // (found in the LICENSE-* files in the repository) pub(crate) mod hidden_set; -pub(crate) mod level; use crate::{ - coding::{DecodeError, Encode, EncodeError}, - file::{rewrite_atomic, MAGIC_BYTES}, + coding::DecodeError, + file::{fsync_directory, rewrite_atomic, MAGIC_BYTES}, segment::Segment, - HashMap, HashSet, KeyRange, SegmentId, + version::{Level, Run, Version, VersionId}, + HashSet, SegmentId, }; use byteorder::{LittleEndian, 
ReadBytesExt, WriteBytesExt}; use hidden_set::HiddenSet; -use level::Level; use std::{ - io::{Cursor, Read, Write}, + io::{BufWriter, Cursor, Read, Write}, path::{Path, PathBuf}, sync::Arc, }; -type Levels = Vec>; - /// Represents the levels of a log-structured merge tree pub struct LevelManifest { - /// Path of level manifest file. - path: PathBuf, + /// Path of tree folder. + folder: PathBuf, - /// Actual levels containing segments. - #[doc(hidden)] - pub levels: Levels, + /// Current version + current: Version, /// Set of segment IDs that are masked. /// /// While consuming segments (because of compaction) they will not appear in the list of segments /// as to not cause conflicts between multiple compaction threads (compacting the same segments). hidden_set: HiddenSet, - - is_disjoint: bool, } impl std::fmt::Display for LevelManifest { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for (idx, level) in self.levels.iter().enumerate() { - write!( + for (idx, level) in self.current.iter_levels().enumerate() { + writeln!( f, - "{idx} [{}]: ", - match (level.is_empty(), level.compute_is_disjoint()) { + "{idx} [{}], r={}: ", + match (level.is_empty(), level.is_disjoint()) { (true, _) => ".", (false, true) => "D", (false, false) => "_", - } + }, + level.len(), )?; - if level.segments.is_empty() { - write!(f, "")?; - } else if level.segments.len() >= 30 { - #[allow(clippy::indexing_slicing)] - for segment in level.segments.iter().take(2) { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( + for run in level.iter() { + write!(f, " ")?; + + if run.is_empty() { + writeln!(f, "")?; + } else if run.len() >= 30 { + #[allow(clippy::indexing_slicing)] + for segment in run.iter().take(2) { + let id = segment.id(); + let is_hidden = self.hidden_set.is_hidden(id); + + write!( + f, + "{}{id}{}", + if is_hidden { "(" } else { "[" }, + if is_hidden { ")" } else { "]" }, + )?; + } + write!(f, " . . . ")?; + + #[allow(clippy::indexing_slicing)] + for segment in run.iter().rev().take(2).rev() { + let id = segment.id(); + let is_hidden = self.hidden_set.is_hidden(id); + + write!( + f, + "{}{id}{}", + if is_hidden { "(" } else { "[" }, + if is_hidden { ")" } else { "]" }, + )?; + } + + writeln!( f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - if is_hidden { ")" } else { "]" }, + " | # = {}, {} MiB", + run.len(), + run.iter().map(|x| x.metadata.file_size).sum::() / 1_024 / 1_024, )?; - } - write!(f, " . . . 
")?; - - #[allow(clippy::indexing_slicing)] - for segment in level.segments.iter().rev().take(2).rev() { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( + } else { + for segment in run.iter() { + let id = segment.id(); + let is_hidden = self.hidden_set.is_hidden(id); + + write!( + f, + "{}{id}{}", + if is_hidden { "(" } else { "[" }, + if is_hidden { ")" } else { "]" }, + )?; + } + + writeln!( f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - if is_hidden { ")" } else { "]" }, - )?; - } - } else { - for segment in &level.segments { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( - f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - /* segment.metadata.file_size / 1_024 / 1_024, */ - if is_hidden { ")" } else { "]" }, + " | # = {}, {} MiB", + run.len(), + run.iter().map(|x| x.metadata.file_size).sum::() / 1_024 / 1_024, )?; } } - - writeln!( - f, - " | # = {}, {} MiB", - level.len(), - level.size() / 1_024 / 1_024, - )?; } Ok(()) @@ -110,40 +115,31 @@ impl std::fmt::Display for LevelManifest { } impl LevelManifest { + #[must_use] + pub fn current_version(&self) -> &Version { + &self.current + } + pub(crate) fn is_compacting(&self) -> bool { !self.hidden_set.is_empty() } - pub(crate) fn create_new>(level_count: u8, path: P) -> crate::Result { - assert!(level_count > 0, "level_count should be >= 1"); - - let levels = (0..level_count).map(|_| Arc::default()).collect::>(); + pub(crate) fn create_new>(folder: P) -> crate::Result { + // assert!(level_count > 0, "level_count should be >= 1"); #[allow(unused_mut)] let mut manifest = Self { - path: path.into(), - levels, + folder: folder.into(), + current: Version::new(0), hidden_set: HiddenSet::default(), - is_disjoint: true, }; - Self::write_to_disk(&manifest.path, &manifest.deep_clone())?; - Ok(manifest) - } + Self::persist_version(&manifest.folder, &manifest.current)?; - fn set_disjoint_flag(&mut self) { - // TODO: store key range in levels precomputed - let key_ranges = self - .levels - .iter() - .filter(|x| !x.is_empty()) - .map(|x| KeyRange::aggregate(x.iter().map(|s| &s.metadata.key_range))) - .collect::>(); - - self.is_disjoint = KeyRange::is_disjoint(&key_ranges.iter().collect::>()); + Ok(manifest) } - pub(crate) fn load_level_manifest(path: &Path) -> crate::Result>> { + pub(crate) fn load_version(path: &Path) -> crate::Result>>> { let mut level_manifest = Cursor::new(std::fs::read(path)?); // Check header @@ -162,11 +158,18 @@ impl LevelManifest { for _ in 0..level_count { let mut level = vec![]; - let segment_count = level_manifest.read_u32::()?; + let run_count = level_manifest.read_u8()?; + + for _ in 0..run_count { + let mut run = vec![]; + let segment_count = level_manifest.read_u32::()?; - for _ in 0..segment_count { - let id = level_manifest.read_u64::()?; - level.push(id); + for _ in 0..segment_count { + let id = level_manifest.read_u64::()?; + run.push(id); + } + + level.push(run); } levels.push(level); @@ -176,151 +179,165 @@ impl LevelManifest { } pub(crate) fn recover_ids( - path: &Path, + folder: &Path, ) -> crate::Result> { - let manifest = Self::load_level_manifest(path)?; + let curr_version = Self::get_current_version(folder)?; + let version_file_path = folder.join(format!("v{curr_version}")); + + let manifest = Self::load_version(&version_file_path)?; let mut result = crate::HashMap::default(); for (level_idx, segment_ids) in manifest.into_iter().enumerate() { - for segment_id in segment_ids { - result.insert( - segment_id, - level_idx - 
.try_into() - .expect("there are less than 256 levels"), - ); + for run in segment_ids { + for segment_id in run { + // NOTE: We know there are always less than 256 levels + #[allow(clippy::expect_used)] + result.insert( + segment_id, + level_idx + .try_into() + .expect("there are less than 256 levels"), + ); + } } } Ok(result) } - fn resolve_levels( - level_manifest: Vec>, - segments: &HashMap, - ) -> Levels { - let mut levels = Vec::with_capacity(level_manifest.len()); + pub fn get_current_version(folder: &Path) -> crate::Result { + let mut buf = [0; 8]; - for level in level_manifest { - let mut created_level = Level::default(); - - for id in level { - let segment = segments.get(&id).cloned().expect("should find segment"); - created_level.insert(segment); - } - - levels.push(Arc::new(created_level)); + { + let mut file = std::fs::File::open(folder.join("current"))?; + file.read_exact(&mut buf)?; } - levels + Ok(u64::from_le_bytes(buf)) } pub(crate) fn recover>( - path: P, - segments: Vec, + folder: P, + segments: &[Segment], ) -> crate::Result { - let path = path.into(); + let folder = folder.into(); - let level_manifest = Self::load_level_manifest(&path)?; + let curr_version = Self::get_current_version(&folder)?; + let version_file_path = folder.join(format!("v{curr_version}")); - let segments: HashMap<_, _> = segments.into_iter().map(|seg| (seg.id(), seg)).collect(); + let version_file = std::path::Path::new(&version_file_path); - let levels = Self::resolve_levels(level_manifest, &segments); + if !version_file.try_exists()? { + log::error!("Cannot find version file {version_file_path:?}"); + return Err(crate::Error::Unrecoverable); + } - let mut manifest = Self { - levels, - hidden_set: HiddenSet::default(), - path, - is_disjoint: false, - }; - manifest.set_disjoint_flag(); + let raw_version = Self::load_version(&version_file_path)?; - Ok(manifest) + let version_levels = raw_version + .iter() + .map(|level| { + let level_runs = level + .iter() + .map(|run| { + let run_segments = run + .iter() + .map(|segment_id| { + segments + .iter() + .find(|x| x.id() == *segment_id) + .cloned() + .ok_or(crate::Error::Unrecoverable) + }) + .collect::>>()?; + + Ok(Arc::new(Run::new(run_segments))) + }) + .collect::>>()?; + + Ok(Level::from_runs(level_runs)) + }) + .collect::>>()?; + + // TODO: 3. 
create free list from versions that are N < CURRENT
+
+        Ok(Self {
+            current: Version::from_levels(curr_version, version_levels),
+            folder,
+            hidden_set: HiddenSet::default(),
+        })
     }
 
-    pub(crate) fn write_to_disk(path: &Path, levels: &[Level]) -> crate::Result<()> {
-        log::trace!("Writing level manifest to {path:?}");
+    fn persist_version(folder: &Path, version: &Version) -> crate::Result<()> {
+        log::trace!("Persisting version {} in {folder:?}", version.id());
 
-        let serialized = Runs(levels).encode_into_vec();
+        let file = std::fs::File::create(folder.join(format!("v{}", version.id())))?;
+        let mut writer = BufWriter::new(file);
 
-        // NOTE: Compaction threads don't have concurrent access to the level manifest
-        // because it is behind a mutex
-        // *However*, the file still needs to be rewritten atomically, because
-        // the system could crash at any moment, so
-        //
-        // a) truncating is not an option, because for a short moment, the file is empty
-        // b) just overwriting corrupts the file content
-        rewrite_atomic(path, &serialized)?;
+        // Magic
+        writer.write_all(&MAGIC_BYTES)?;
 
-        Ok(())
-    }
+        // Level count
+        // NOTE: We know there are always less than 256 levels
+        #[allow(clippy::cast_possible_truncation)]
+        writer.write_u8(version.level_count() as u8)?;
 
-    /// Clones the level to get a mutable copy for atomic swap.
-    fn deep_clone(&self) -> Vec<Level> {
-        self.levels
-            .iter()
-            .map(|x| Level {
-                segments: x.segments.clone(),
-                is_disjoint: x.is_disjoint,
-            })
-            .collect()
+        for level in version.iter_levels() {
+            // Run count
+            // NOTE: We know there are always less than 256 runs
+            #[allow(clippy::cast_possible_truncation)]
+            writer.write_u8(level.len() as u8)?;
+
+            for run in level.iter() {
+                // Segment count
+                // NOTE: We know there are always less than 4 billion segments in a run
+                #[allow(clippy::cast_possible_truncation)]
+                writer.write_u32::<BigEndian>(run.len() as u32)?;
+
+                // Segment IDs
+                for id in run.iter().map(Segment::id) {
+                    writer.write_u64::<BigEndian>(id)?;
+                }
+            }
+        }
+
+        writer.flush()?;
+        writer.get_mut().sync_all()?;
+        fsync_directory(folder)?;
+        // IMPORTANT: ^ wait for fsync and directory sync to fully finish
+
+        rewrite_atomic(&folder.join("current"), &version.id().to_le_bytes())?;
+
+        Ok(())
     }
 
     /// Modifies the level manifest atomically.
-    pub(crate) fn atomic_swap<F: Fn(&mut Vec<Level>)>(&mut self, f: F) -> crate::Result<()> {
+    ///
+    /// It accepts a transition function that receives the current version
+    /// and returns a new version.
+    ///
+    /// It also takes care of persisting the version change on disk.
+    pub(crate) fn atomic_swap<F: Fn(&Version) -> Version>(
+        &mut self,
+        f: F,
+    ) -> crate::Result<()> {
        // NOTE: Copy-on-write...
        //
        // Create a copy of the levels we can operate on
        // without mutating the current level manifest
        // If persisting to disk fails, this way the level manifest
        // is unchanged
-        let mut working_copy = self.deep_clone();
+        let next_version = f(&self.current);
 
-        f(&mut working_copy);
+        Self::persist_version(&self.folder, &next_version)?;
 
-        Self::write_to_disk(&self.path, &working_copy)?;
-        self.levels = working_copy.into_iter().map(Arc::new).collect();
-        self.update_metadata();
-        self.set_disjoint_flag();
+        // TODO: add old version to free list
 
-        log::trace!("Swapped level manifest to:\n{self}");
+        self.current = next_version;
 
-        Ok(())
-    }
+        // TODO: GC version history by traversing free list
 
-    #[allow(unused)]
-    #[cfg(test)]
-    pub(crate) fn add(&mut self, segment: Segment) {
-        self.insert_into_level(0, segment);
-    }
-
-    pub fn update_metadata(&mut self) {
-        for level in &mut self.levels {
-            Arc::get_mut(level)
-                .expect("could not get mutable Arc - this is a bug")
-                .update_metadata();
-        }
-    }
-
-    #[allow(unused)]
-    #[cfg(test)]
-    pub(crate) fn insert_into_level(&mut self, level_no: u8, segment: Segment) {
-        let last_level_index = self.depth() - 1;
-        let index = level_no.clamp(0, last_level_index);
-
-        let level = self
-            .levels
-            .get_mut(index as usize)
-            .expect("level should exist");
-
-        let level = Arc::get_mut(level).expect("only used in tests");
-
-        level.insert(segment);
-    }
-
-    #[must_use]
-    pub fn is_disjoint(&self) -> bool {
-        self.is_disjoint && self.levels.iter().all(|x| x.is_disjoint)
+        Ok(())
     }
 
     /// Returns `true` if there are no segments
@@ -331,29 +348,25 @@ impl LevelManifest {
 
     /// Returns the amount of levels in the tree
     #[must_use]
-    pub fn depth(&self) -> u8 {
+    pub fn level_count(&self) -> u8 {
         // NOTE: Level count is u8
         #[allow(clippy::cast_possible_truncation)]
-        let len = self.levels.len() as u8;
-
-        len
-    }
-
-    #[must_use]
-    pub fn first_level_segment_count(&self) -> usize {
-        self.levels.first().map(|lvl| lvl.len()).unwrap_or_default()
+        {
+            self.current.level_count() as u8
+        }
     }
 
     /// Returns the index of the last level
     #[must_use]
     pub fn last_level_index(&self) -> u8 {
-        self.depth() - 1
+        // NOTE: Currently hard-coded to 7 - 1
+        6
     }
 
     /// Returns the amount of segments, summed over all levels
     #[must_use]
     pub fn len(&self) -> usize {
-        self.levels.iter().map(|lvl| lvl.len()).sum()
+        self.current.segment_count()
     }
 
     /// Returns the (compressed) size of all segments
@@ -367,46 +380,30 @@ impl LevelManifest {
         let mut output =
             HashSet::with_capacity_and_hasher(self.len(), xxhash_rust::xxh3::Xxh3Builder::new());
 
-        for (idx, level) in self.levels.iter().enumerate() {
-            if level.ids().any(|id| self.hidden_set.is_hidden(id)) {
-                // NOTE: Level count is u8
-                #[allow(clippy::cast_possible_truncation)]
-                output.insert(idx as u8);
+        for (idx, level) in self.current.iter_levels().enumerate() {
+            for segment in level.iter().flat_map(|run| run.iter()) {
+                if self.hidden_set.is_hidden(segment.id()) {
+                    // NOTE: Level count is u8
+                    #[allow(clippy::cast_possible_truncation)]
+                    output.insert(idx as u8);
+                }
             }
         }
 
         output
     }
 
-    pub(crate) fn get_segment(&self, id: SegmentId) -> Option<Segment> {
-        for level in &self.levels {
-            if let Some(segment) = level.segments.iter().find(|x| x.id() == id).cloned() {
-                return Some(segment);
-            }
-        }
-        None
+    pub(crate) fn get_segment(&self, id: SegmentId) -> Option<&Segment> {
+        self.current.iter_segments().find(|x| x.metadata.id == id)
     }
 
-    /// Returns a view into the levels, hiding all segments that currently are being compacted
     #[must_use]
-    pub fn
resolved_view(&self) -> Vec { - let mut output = Vec::with_capacity(self.len()); - - for raw_level in &self.levels { - let mut level = raw_level.iter().cloned().collect::>(); - level.retain(|x| !self.hidden_set.is_hidden(x.id())); - - output.push(Level { - segments: level, - is_disjoint: raw_level.is_disjoint, - }); - } - - output + pub fn as_slice(&self) -> &[Level] { + &self.current.levels } - pub fn iter(&self) -> impl Iterator + '_ { - self.levels.iter().flat_map(|x| &x.segments) + pub fn iter(&self) -> impl Iterator { + self.current.iter_segments() } pub(crate) fn should_decline_compaction>( @@ -429,43 +426,9 @@ impl LevelManifest { } } -struct Runs<'a>(&'a [Level]); - -impl<'a> std::ops::Deref for Runs<'a> { - type Target = [Level]; - - fn deref(&self) -> &Self::Target { - self.0 - } -} - -impl<'a> Encode for Runs<'a> { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - // Write header - writer.write_all(&MAGIC_BYTES)?; - - // NOTE: "Truncation" is OK, because levels are created from a u8 - #[allow(clippy::cast_possible_truncation)] - writer.write_u8(self.len() as u8)?; - - for level in self.iter() { - // NOTE: "Truncation" is OK, because there are never 4 billion segments in a tree, I hope - #[allow(clippy::cast_possible_truncation)] - writer.write_u32::(level.segments.len() as u32)?; - - for segment in &level.segments { - writer.write_u64::(segment.id())?; - } - } - - Ok(()) - } -} - #[cfg(test)] #[allow(clippy::expect_used)] mod tests { - use super::Runs; use crate::{ coding::Encode, level_manifest::{hidden_set::HiddenSet, LevelManifest}, @@ -473,7 +436,9 @@ mod tests { }; use test_log::test; - #[test] + // TODO: restore + /* #[test] + #[ignore] fn level_manifest_atomicity() -> crate::Result<()> { let folder = tempfile::tempdir()?; @@ -513,9 +478,9 @@ mod tests { assert_eq!(segment_count_before_major_compact, tree.segment_count()); Ok(()) - } + } */ - #[test] + /* #[test] fn level_manifest_raw_empty() -> crate::Result<()> { let manifest = LevelManifest { hidden_set: HiddenSet::default(), @@ -538,5 +503,5 @@ mod tests { assert_eq!(bytes, raw); Ok(()) - } + } */ } From ce3d6f9a95aea5c274e88d9ebfb90ce41768f387 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:39:04 +0200 Subject: [PATCH 140/613] update Tree to use new levels structs --- src/tree/inner.rs | 14 ++--- src/tree/mod.rs | 141 +++++++++++++++++++++++----------------------- 2 files changed, 77 insertions(+), 78 deletions(-) diff --git a/src/tree/inner.rs b/src/tree/inner.rs index f2d3d924..e8a0add4 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -3,8 +3,8 @@ // (found in the LICENSE-* files in the repository) use crate::{ - config::Config, file::LEVELS_MANIFEST_FILE, level_manifest::LevelManifest, memtable::Memtable, - stop_signal::StopSignal, SegmentId, + config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal, + SegmentId, }; use std::sync::{atomic::AtomicU64, Arc, RwLock}; @@ -64,9 +64,8 @@ pub struct TreeInner { /// Frozen memtables that are being flushed pub(crate) sealed_memtables: Arc>, - /// Level manifest - #[doc(hidden)] - pub levels: Arc>, + /// Current tree version + pub(crate) manifest: Arc>, /// Tree configuration pub config: Config, @@ -80,8 +79,7 @@ pub struct TreeInner { impl TreeInner { pub(crate) fn create_new(config: Config) -> crate::Result { - let levels = - LevelManifest::create_new(config.level_count, config.path.join(LEVELS_MANIFEST_FILE))?; + let manifest = LevelManifest::create_new(&config.path)?; Ok(Self { id: 
get_next_tree_id(), @@ -89,7 +87,7 @@ impl TreeInner { config, active_memtable: Arc::default(), sealed_memtables: Arc::default(), - levels: Arc::new(RwLock::new(levels)), + manifest: Arc::new(RwLock::new(manifest)), stop_signal: StopSignal::default(), major_compaction_lock: RwLock::default(), }) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 9201b25b..3d8f73d9 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -101,21 +101,13 @@ impl AbstractTree for Tree { } fn l0_run_count(&self) -> usize { - let lock = self.levels.read().expect("lock is poisoned"); - - let first_level = lock - .levels - .first() - .expect("first level should always exist"); - - if first_level.is_disjoint { - 1 - } else { - // TODO: in the future, there will be a Vec per Level - // TODO: so this will need to change, - // TODO: but then we also don't need the manual is_disjoint check - first_level.segments.len() - } + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .map(|x| x.run_count()) + .unwrap_or_default() } fn size_of>(&self, key: K, seqno: Option) -> crate::Result> { @@ -123,11 +115,12 @@ impl AbstractTree for Tree { } fn pinned_bloom_filter_size(&self) -> usize { - self.levels + self.manifest .read() .expect("lock is poisoned") - .iter() - .map(Segment::pinned_bloom_filter_size) + .current_version() + .iter_segments() + .map(|x| x.pinned_bloom_filter_size()) .sum() } @@ -216,7 +209,7 @@ impl AbstractTree for Tree { fn register_segments(&self, segments: &[Segment]) -> crate::Result<()> { // NOTE: Mind lock order L -> M -> S log::trace!("register: Acquiring levels manifest write lock"); - let mut original_levels = self.levels.write().expect("lock is poisoned"); + let mut manifest = self.manifest.write().expect("lock is poisoned"); log::trace!("register: Acquired levels manifest write lock"); // NOTE: Mind lock order L -> M -> S @@ -224,16 +217,9 @@ impl AbstractTree for Tree { let mut sealed_memtables = self.sealed_memtables.write().expect("lock is poisoned"); log::trace!("register: Acquired sealed memtables write lock"); - original_levels.atomic_swap(|recipe| { - for segment in segments.iter().cloned() { - recipe - .first_mut() - .expect("first level should exist") - .insert(segment); - } - })?; + manifest.atomic_swap(|version| version.with_new_l0_segment(segments))?; - // eprintln!("{original_levels}"); + // eprintln!("{manifest}"); for segment in segments { log::trace!("releasing sealed memtable {}", segment.id()); @@ -323,26 +309,35 @@ impl AbstractTree for Tree { } fn segment_count(&self) -> usize { - self.levels.read().expect("lock is poisoned").len() + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .segment_count() } fn level_segment_count(&self, idx: usize) -> Option { - self.levels + self.manifest .read() .expect("lock is poisoned") - .levels - .get(idx) - .map(|x| x.len()) + .current_version() + .level(idx) + .map(|x| x.segment_count()) } #[allow(clippy::significant_drop_tightening)] fn approximate_len(&self) -> usize { // NOTE: Mind lock order L -> M -> S - let levels = self.levels.read().expect("lock is poisoned"); + let manifest = self.manifest.read().expect("lock is poisoned"); let memtable = self.active_memtable.read().expect("lock is poisoned"); let sealed = self.sealed_memtables.read().expect("lock is poisoned"); - let segments_item_count = levels.iter().map(|x| x.metadata.item_count).sum::(); + let segments_item_count = manifest + .current_version() + .iter_segments() + .map(|x| x.metadata.item_count) + .sum::(); + let 
memtable_count = memtable.len() as u64; let sealed_count = sealed.iter().map(|(_, mt)| mt.len()).sum::() as u64; @@ -352,8 +347,13 @@ impl AbstractTree for Tree { } fn disk_space(&self) -> u64 { - let levels = self.levels.read().expect("lock is poisoned"); - levels.iter().map(|x| x.metadata.file_size).sum() + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_levels() + .map(|x| x.size()) + .sum() } fn get_highest_memtable_seqno(&self) -> Option { @@ -376,8 +376,13 @@ impl AbstractTree for Tree { } fn get_highest_persisted_seqno(&self) -> Option { - let levels = self.levels.read().expect("lock is poisoned"); - levels.iter().map(Segment::get_highest_seqno).max() + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(|x| x.get_highest_seqno()) + .max() } fn snapshot(&self, seqno: SeqNo) -> Snapshot { @@ -553,8 +558,10 @@ impl Tree { #[doc(hidden)] #[must_use] pub fn is_compacting(&self) -> bool { - let levels = self.levels.read().expect("lock is poisoned"); - levels.is_compacting() + self.manifest + .read() + .expect("lock is poisoned") + .is_compacting() } /// Write-locks the sealed memtables for exclusive access @@ -607,13 +614,13 @@ impl Tree { // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ let key_hash = crate::segment::filter::standard_bloom::Builder::get_hash(key); - let level_manifest = self.levels.read().expect("lock is poisoned"); + let manifest = self.manifest.read().expect("lock is poisoned"); - for level in &level_manifest.levels { - // NOTE: Based on benchmarking, binary search is only worth it with ~4 segments - if level.len() >= 4 { - if let Some(level) = level.as_disjoint() { - if let Some(segment) = level.get_segment_containing_key(key) { + for level in manifest.current_version().iter_levels() { + for run in level.iter() { + // NOTE: Based on benchmarking, binary search is only worth it with ~4 segments + if run.len() >= 4 { + if let Some(segment) = run.get_for_key(key) { if let Some(item) = segment.get(key, seqno, key_hash)? { return Ok(ignore_tombstone_value(item)); } @@ -622,16 +629,16 @@ impl Tree { // NOTE: Go to next level continue; } - } - // NOTE: Fallback to linear search - for segment in &level.segments { - if !segment.is_key_in_key_range(key) { - continue; - } + // NOTE: Fallback to linear search + for segment in run.iter() { + if !segment.is_key_in_key_range(key) { + continue; + } - if let Some(item) = segment.get(key, seqno, key_hash)? { - return Ok(ignore_tombstone_value(item)); + if let Some(item) = segment.get(key, seqno, key_hash)? 
{ + return Ok(ignore_tombstone_value(item)); + } } } } @@ -718,7 +725,7 @@ impl Tree { // NOTE: Mind lock order L -> M -> S log::trace!("range read: acquiring read locks"); - let level_manifest = self.levels.read().expect("lock is poisoned"); + let manifest = self.manifest.read().expect("lock is poisoned"); let iter_state = { let active = self.active_memtable.read().expect("lock is poisoned"); @@ -728,11 +735,11 @@ impl Tree { active: active.clone(), sealed: sealed.iter().map(|(_, mt)| mt.clone()).collect(), ephemeral, - levels: level_manifest.levels.clone(), + version: manifest.current_version().clone(), } }; - TreeIter::create_range(iter_state, bounds, seqno, &level_manifest) + TreeIter::create_range(iter_state, bounds, seqno, &manifest) } #[doc(hidden)] @@ -798,13 +805,12 @@ impl Tree { let tree_id = get_next_tree_id(); - let mut levels = Self::recover_levels( + let levels = Self::recover_levels( &config.path, tree_id, &config.cache, &config.descriptor_table, )?; - levels.update_metadata(); let highest_segment_id = levels.iter().map(Segment::id).max().unwrap_or_default(); @@ -813,7 +819,7 @@ impl Tree { segment_id_counter: Arc::new(AtomicU64::new(highest_segment_id + 1)), active_memtable: Arc::default(), sealed_memtables: Arc::default(), - levels: Arc::new(RwLock::new(levels)), + manifest: Arc::new(RwLock::new(levels)), stop_signal: StopSignal::default(), config, major_compaction_lock: RwLock::default(), @@ -865,18 +871,13 @@ impl Tree { cache: &Arc, descriptor_table: &Arc, ) -> crate::Result { - use crate::{ - file::fsync_directory, - file::{LEVELS_MANIFEST_FILE, SEGMENTS_FOLDER}, - SegmentId, - }; + use crate::{file::fsync_directory, file::SEGMENTS_FOLDER, SegmentId}; let tree_path = tree_path.as_ref(); - let level_manifest_path = tree_path.join(LEVELS_MANIFEST_FILE); - log::info!("Recovering manifest at {level_manifest_path:?}"); + log::info!("Recovering manifest at {tree_path:?}"); - let segment_id_map = LevelManifest::recover_ids(&level_manifest_path)?; + let segment_id_map = LevelManifest::recover_ids(tree_path)?; let cnt = segment_id_map.len(); log::debug!("Recovering {cnt} disk segments from {tree_path:?}"); @@ -951,6 +952,6 @@ impl Tree { log::debug!("Successfully recovered {} segments", segments.len()); - LevelManifest::recover(&level_manifest_path, segments) + LevelManifest::recover(tree_path, &segments) } } From 628a08c2b93af96e20d068f566157042b26936c8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:41:01 +0200 Subject: [PATCH 141/613] update multi reader --- src/multi_reader.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/multi_reader.rs b/src/multi_reader.rs index f319a0ca..9ce0fe5a 100644 --- a/src/multi_reader.rs +++ b/src/multi_reader.rs @@ -76,10 +76,11 @@ mod tests { } let segments = tree - .levels + .manifest .read() .expect("lock is poisoned") - .iter() + .current_version() + .iter_segments() .cloned() .collect::>(); From 72f1961d9409d584b04894fda56d785e785b3296 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:41:34 +0200 Subject: [PATCH 142/613] update leveled compaction --- src/compaction/leveled.rs | 303 ++++++++++++++++++++++++-------------- 1 file changed, 189 insertions(+), 114 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 92e40209..44def4ef 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -5,8 +5,9 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, - 
level_manifest::{hidden_set::HiddenSet, level::Level, LevelManifest}, + level_manifest::{hidden_set::HiddenSet, LevelManifest}, segment::Segment, + version::Run, windows::{GrowingWindowsExt, ShrinkingWindowsExt}, HashSet, KeyRange, SegmentId, }; @@ -20,9 +21,10 @@ fn aggregate_key_range(segments: &[Segment]) -> KeyRange { /// Tries to find the most optimal compaction set from /// one level into the other. fn pick_minimal_compaction( - curr_level: &Level, - next_level: &Level, + curr_run: &Run, + next_run: Option<&Run>, hidden_set: &HiddenSet, + overshoot: u64, ) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); @@ -36,90 +38,89 @@ fn pick_minimal_compaction( let mut choices = vec![]; let mut add_choice = |choice: Choice| { - let mut valid_choice = true; - - // IMPORTANT: Compaction is blocked because of other - // on-going compaction - valid_choice &= !choice.segment_ids.iter().any(|x| hidden_set.is_hidden(*x)); + let valid_choice = if hidden_set.is_blocked(choice.segment_ids.iter().copied()) { + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + false + } else if choice.can_trivial_move { + true + } else { + // TODO: this should not consider the number of segments, but the amount of rewritten data + // which corresponds to the amount of temporary space amp - // NOTE: Keep compactions with 25 or less segments - // to make compactions not too large - valid_choice &= choice.can_trivial_move || choice.segment_ids.len() <= 25; + // NOTE: Keep compactions with N or less segments + // to make compactions not too large + // + // This value is currently manually fine-tuned based on benchmarks + // with 50%/50% read-write workload + // + // Making compactions too granular heavily increases read tail latencies + choice.segment_ids.len() < 100 + // true + }; if valid_choice { choices.push(choice); } }; - for window in next_level.growing_windows() { - if hidden_set.is_blocked(window.iter().map(Segment::id)) { - // IMPORTANT: Compaction is blocked because of other - // on-going compaction - continue; - } + if let Some(next_run) = &next_run { + for window in next_run.growing_windows() { + if hidden_set.is_blocked(window.iter().map(Segment::id)) { + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + continue; + } - let key_range = aggregate_key_range(window); + let key_range = aggregate_key_range(window); - // Pull in all segments in current level into compaction - let curr_level_pull_in: Vec<_> = if curr_level.is_disjoint { - // IMPORTANT: Avoid "infectious spread" of key ranges - // Imagine these levels: - // - // A B C D E F - // L1 | ----- ----- ----- ----- ----- ----- - // L2 | ----- ----- ----- ----- ----- - // 1 2 3 4 5 - // - // If we took 1, we would also have to include B, - // but then we would also have to include 2, - // but then we would also have to include C, - // but then we would also have to include 3, - // ... 
- // - // Instead, we consider a window like 1 - 3 - // and then take B & C, because they are *contained* in that range - // Not including A or D is fine, because we are not shadowing data unexpectedly - curr_level.contained_segments(&key_range).collect() - } else { - // If the level is not disjoint, we just merge everything that overlaps - // to try and "repair" the level - curr_level.overlapping_segments(&key_range).collect() - }; + // Pull in all segments in current level into compaction + let curr_level_pull_in: Vec<_> = curr_run.get_contained(&key_range).collect(); - if hidden_set.is_blocked(curr_level_pull_in.iter().map(|x| x.id())) { - // IMPORTANT: Compaction is blocked because of other - // on-going compaction - continue; - } + if hidden_set.is_blocked(curr_level_pull_in.iter().map(|x| x.id())) { + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + continue; + } - let curr_level_size = curr_level_pull_in - .iter() - .map(|x| x.metadata.file_size) - .sum::(); + let curr_level_size = curr_level_pull_in + .iter() + .map(|x| x.metadata.file_size) + .sum::(); - // NOTE: Only consider compactions where we actually reach the amount - // of bytes we need to merge - if curr_level_size >= 1 { - let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); + // NOTE: Only consider compactions where we actually reach the amount + // of bytes we need to merge? + if curr_level_size >= overshoot { + let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); - let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); - segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id())); + let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); + segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id())); - let write_amp = (next_level_size as f32) / (curr_level_size as f32); + let write_amp = (next_level_size as f32) / (curr_level_size as f32); - add_choice(Choice { - write_amp, - segment_ids, - can_trivial_move: false, - }); + add_choice(Choice { + write_amp, + segment_ids, + can_trivial_move: false, + }); + } } } // NOTE: Find largest trivial move (if it exists) - for window in curr_level.shrinking_windows() { + for window in curr_run.shrinking_windows() { let key_range = aggregate_key_range(window); - if next_level.overlapping_segments(&key_range).next().is_none() { + if let Some(next_run) = &next_run { + if next_run.get_overlapping(&key_range).next().is_none() { + add_choice(Choice { + write_amp: 0.0, + segment_ids: window.iter().map(Segment::id).collect(), + can_trivial_move: true, + }); + break; + } + } else { add_choice(Choice { write_amp: 0.0, segment_ids: window.iter().map(Segment::id).collect(), @@ -215,16 +216,14 @@ impl CompactionStrategy for Strategy { #[allow(clippy::too_many_lines)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let view = &levels.levels; - - // TODO: look at L1+, if not disjoint - // TODO: try to repairing level by rewriting - // TODO: abort if any segment is hidden - // TODO: then make sure, non-disjoint levels cannot be used in subsequent code below - // TODO: add tests - // L1+ compactions - for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2).rev() + for (curr_level_index, level) in levels + .as_slice() + .iter() + .enumerate() + .skip(1) + .take(usize::from(levels.level_count() - 2)) + .rev() { // NOTE: Level count is 255 max #[allow(clippy::cast_possible_truncation)] @@ -237,8 +236,8 @@ impl CompactionStrategy for 
Strategy { } let level_size: u64 = level - .segments .iter() + .flat_map(|x| x.iter()) // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating .filter(|x| !levels.hidden_set().is_hidden(x.id())) @@ -250,43 +249,35 @@ impl CompactionStrategy for Strategy { let overshoot = level_size.saturating_sub(desired_bytes); if overshoot > 0 { - let Some(next_level) = &view.get(next_level_index as usize) else { + let Some(next_level) = levels.current_version().level(next_level_index as usize) + else { break; }; - let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, levels.hidden_set()) - else { + let Some((segment_ids, can_trivial_move)) = pick_minimal_compaction( + level.first_run().expect("should have exactly one run"), + next_level.first_run().map(std::ops::Deref::deref), + levels.hidden_set(), + overshoot, + ) else { break; }; - // eprintln!( - // "merge {} segments, L{}->L{next_level_index}: {segment_ids:?}", - // segment_ids.len(), - // next_level_index - 1, - // ); - let choice = CompactionInput { segment_ids, dest_level: next_level_index, target_size: u64::from(self.target_size), }; - /*// TODO: eventually, this should happen lazily - // if a segment file lives for very long, it should get rewritten - // Rocks, by default, rewrites files that are 1 month or older - // - // TODO: 3.0.0 configuration? - // NOTE: We purposefully not trivially move segments - // if we go from L1 to L2 - // https://github.com/fjall-rs/lsm-tree/issues/63 - let goes_into_cold_storage = next_level_index == 2; - - if goes_into_cold_storage { - return Choice::Merge(choice); - }*/ + eprintln!( + "{} {} segments, L{}->L{next_level_index}: {:?}", + if can_trivial_move { "move" } else { "merge" }, + choice.segment_ids.len(), + next_level_index - 1, + choice.segment_ids, + ); - if can_trivial_move && level.is_disjoint { + if can_trivial_move && level.is_disjoint() { return Choice::Move(choice); } return Choice::Merge(choice); @@ -297,7 +288,7 @@ impl CompactionStrategy for Strategy { { let busy_levels = levels.busy_levels(); - let Some(first_level) = view.first() else { + let Some(first_level) = levels.current_version().level(0) else { return Choice::DoNothing; }; @@ -306,9 +297,9 @@ impl CompactionStrategy for Strategy { } if first_level.len() >= self.l0_threshold.into() { - let first_level_size = first_level.size(); + // let first_level_size = first_level.size(); - // NOTE: Special handling for disjoint workloads + /* // NOTE: Special handling for disjoint workloads if levels.is_disjoint() && first_level_size < self.target_size.into() { // TODO: also do this in non-disjoint workloads // -> intra-L0 compaction @@ -335,24 +326,21 @@ impl CompactionStrategy for Strategy { segment_ids: first_level.list_ids(), target_size: self.target_size.into(), }); - } + } */ if !busy_levels.contains(&1) { - let mut level = (**first_level).clone(); - level.sort_by_key_range(); - - let Some(next_level) = &view.get(1) else { + let Some(next_level) = &levels.current_version().level(1) else { return Choice::DoNothing; }; - // TODO: list_ids() - let mut segment_ids: HashSet = level.iter().map(Segment::id).collect(); + let mut segment_ids: HashSet = first_level.list_ids(); - // Get overlapping segments in next level - let key_range = aggregate_key_range(&level); + let key_range = first_level.aggregate_key_range(); + // Get overlapping segments in next level let next_level_overlapping_segment_ids: Vec<_> = next_level - .overlapping_segments(&key_range) + 
.iter() + .flat_map(|run| run.get_overlapping(&key_range)) .map(Segment::id) .collect(); @@ -364,7 +352,13 @@ impl CompactionStrategy for Strategy { target_size: u64::from(self.target_size), }; - if next_level_overlapping_segment_ids.is_empty() && level.is_disjoint { + eprintln!( + "merge {} segments, L0->L1: {:?}", + choice.segment_ids.len(), + choice.segment_ids, + ); + + if next_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { return Choice::Move(choice); } return Choice::Merge(choice); @@ -373,6 +367,87 @@ impl CompactionStrategy for Strategy { } Choice::DoNothing + + /* let view = &levels.levels; + + // TODO: look at L1+, if not disjoint + // TODO: try to repairing level by rewriting + // TODO: abort if any segment is hidden + // TODO: then make sure, non-disjoint levels cannot be used in subsequent code below + // TODO: add tests + + // L1+ compactions + for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2).rev() + { + // NOTE: Level count is 255 max + #[allow(clippy::cast_possible_truncation)] + let curr_level_index = curr_level_index as u8; + + let next_level_index = curr_level_index + 1; + + if level.is_empty() { + continue; + } + + let level_size: u64 = level + .segments + .iter() + // NOTE: Take bytes that are already being compacted into account, + // otherwise we may be overcompensating + .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .map(|x| x.metadata.file_size) + .sum(); + + let desired_bytes = self.level_target_size(curr_level_index); + + let overshoot = level_size.saturating_sub(desired_bytes); + + if overshoot > 0 { + let Some(next_level) = &view.get(next_level_index as usize) else { + break; + }; + + let Some((segment_ids, can_trivial_move)) = + pick_minimal_compaction(level, next_level, levels.hidden_set(), overshoot) + else { + break; + }; + + /* eprintln!( + "{} {} segments, L{}->L{next_level_index}: {segment_ids:?}", + if can_trivial_move { "move" } else { "merge" }, + segment_ids.len(), + next_level_index - 1, + ); */ + + let choice = CompactionInput { + segment_ids, + dest_level: next_level_index, + target_size: u64::from(self.target_size), + }; + + /*// TODO: eventually, this should happen lazily + // if a segment file lives for very long, it should get rewritten + // Rocks, by default, rewrites files that are 1 month or older + // + // TODO: 3.0.0 configuration? 
+ // NOTE: We purposefully not trivially move segments + // if we go from L1 to L2 + // https://github.com/fjall-rs/lsm-tree/issues/63 + let goes_into_cold_storage = next_level_index == 2; + + if goes_into_cold_storage { + return Choice::Merge(choice); + }*/ + + if can_trivial_move && level.is_disjoint { + return Choice::Move(choice); + } + return Choice::Merge(choice); + } + } + + */ } } /* From 502fedf0c33e71563d36dd5fa1cd6f582bfffc2d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:42:26 +0200 Subject: [PATCH 143/613] refactor: data block item count skip --- src/segment/data_block/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 8bec8083..0fa9f285 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -126,9 +126,10 @@ impl DataBlock { #[must_use] pub fn new(inner: Block) -> Self { let trailer = Trailer::new(&inner); - let mut reader = trailer.as_slice(); - let _item_count = unwrappy!(reader.read_u32::()); + // NOTE: Skip item count (u32) + let offset = std::mem::size_of::(); + let mut reader = unwrappy!(trailer.as_slice().get(offset..)); let restart_interval = unwrappy!(reader.read_u8()); From f38c9da715afc96b9f8424ccaddb084a9521379f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:45:20 +0200 Subject: [PATCH 144/613] prepare range --- src/range.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/range.rs b/src/range.rs index 94d2f299..a35d8406 100644 --- a/src/range.rs +++ b/src/range.rs @@ -3,12 +3,13 @@ // (found in the LICENSE-* files in the repository) use crate::{ - level_manifest::{level::Level, LevelManifest}, - level_reader::LevelReader, + level_manifest::LevelManifest, memtable::Memtable, multi_reader::MultiReader, + run_reader::LevelReader, segment::CachePolicy, value::{SeqNo, UserKey}, + version::Version, InternalValue, }; use self_cell::self_cell; @@ -52,12 +53,9 @@ pub struct IterState { pub(crate) sealed: Vec>, pub(crate) ephemeral: Option>, - // NOTE: Monkey patch to keep segments referenced until range read drops - // Otherwise segment files can get deleted too early - // (because once we create the range iterator, it does not hold onto segments normally) - // TODO: we need a Version system + // NOTE: Hold the version so segments cannot be unlinked #[allow(unused)] - pub(crate) levels: Vec>, + pub(crate) version: Version, } type BoxedMerge<'a> = Box> + 'a>; From 25785998f1f45d712ad4d225a58149f7ea95affa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:45:30 +0200 Subject: [PATCH 145/613] update module names in lib.rs --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 020a0e26..06bb5ca4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -146,8 +146,8 @@ mod key; #[doc(hidden)] pub mod level_manifest; -mod level_reader; -mod level_scanner; +mod run_reader; +mod run_scanner; mod manifest; mod memtable; From 0311dab9d70ddf42ba442673728b12ca1f9dc85d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:45:36 +0200 Subject: [PATCH 146/613] remove unused string constant --- src/file.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/file.rs b/src/file.rs index 7fb48722..2bdbf516 100644 --- a/src/file.rs +++ b/src/file.rs @@ -8,7 +8,6 @@ pub const MAGIC_BYTES: [u8; 4] = [b'L', b'S', b'M', 3]; pub const MANIFEST_FILE: &str = "manifest"; pub const SEGMENTS_FOLDER: &str = "segments"; 
-pub const LEVELS_MANIFEST_FILE: &str = "levels"; pub const BLOBS_FOLDER: &str = "blobs"; /// Atomically rewrites a file From 2a0323e60159edac6e6c4a4de2da692e1d071b83 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 26 May 2025 19:48:00 +0200 Subject: [PATCH 147/613] update compaction worker to use new levels structs --- src/compaction/worker.rs | 106 ++++++++++++++------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 404d3b32..9dd8e9a5 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -7,12 +7,12 @@ use crate::{ compaction::{stream::CompactionStream, Choice}, file::SEGMENTS_FOLDER, level_manifest::LevelManifest, - level_scanner::LevelScanner, merge::Merger, + run_scanner::RunScanner, segment::{multi_writer::MultiWriter, Segment}, stop_signal::StopSignal, tree::inner::TreeId, - Config, GlobalSegmentId, InternalValue, SegmentId, SeqNo, + Config, InternalValue, SegmentId, SeqNo, }; use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, @@ -50,7 +50,7 @@ impl Options { tree_id: tree.id, segment_id_generator: tree.segment_id_counter.clone(), config: tree.config.clone(), - levels: tree.levels.clone(), + levels: tree.manifest.clone(), stop_signal: tree.stop_signal.clone(), strategy, eviction_seqno: 0, @@ -79,10 +79,7 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { Choice::Drop(payload) => drop_segments( original_levels, opts, - &payload - .into_iter() - .map(|x| (opts.tree_id, x).into()) - .collect::>(), + &payload.into_iter().collect::>(), ), Choice::DoNothing => { log::trace!("Compactor chose to do nothing"); @@ -99,14 +96,15 @@ fn create_compaction_stream<'a>( let mut readers: Vec> = vec![]; let mut found = 0; - for level in &levels.levels { + for level in levels.current_version().iter_levels() { if level.is_empty() { continue; } - if level.is_disjoint && level.len() > 1 { - let Some(lo) = level - .segments + if level.is_disjoint() && level.len() > 1 { + let run = level.first().expect("run should exist"); + + let Some(lo) = run .iter() .enumerate() .filter(|(_, segment)| to_compact.contains(&segment.id())) @@ -116,8 +114,7 @@ fn create_compaction_stream<'a>( continue; }; - let Some(hi) = level - .segments + let Some(hi) = run .iter() .enumerate() .filter(|(_, segment)| to_compact.contains(&segment.id())) @@ -127,18 +124,20 @@ fn create_compaction_stream<'a>( continue; }; - readers.push(Box::new(LevelScanner::from_indexes( - level.clone(), + readers.push(Box::new(RunScanner::culled( + run.clone(), (Some(lo), Some(hi)), )?)); found += hi - lo + 1; } else { - for &id in to_compact { - if let Some(segment) = level.segments.iter().find(|x| x.id() == id) { - found += 1; - readers.push(Box::new(segment.scan()?)); - } + for segment in level + .iter() + .flat_map(|x| x.iter()) + .filter(|x| to_compact.contains(&x.metadata.id)) + { + found += 1; + readers.push(Box::new(segment.scan()?)); } } } @@ -164,18 +163,11 @@ fn move_segments( return Ok(()); } - levels.atomic_swap(|recipe| { - for segment_id in payload.segment_ids { - if let Some(segment) = recipe.iter_mut().find_map(|x| x.remove(segment_id)) { - // NOTE: Destination level should definitely exist - #[allow(clippy::expect_used)] - recipe - .get_mut(payload.dest_level as usize) - .expect("should exist") - .insert(segment); - } - } - }) + let segment_ids = payload.segment_ids.iter().copied().collect::>(); + + levels.atomic_swap(|current| current.with_moved(&segment_ids, payload.dest_level as usize))?; + 
+ Ok(()) } #[allow(clippy::too_many_lines)] @@ -201,7 +193,7 @@ fn merge_segments( let Some(segments) = payload .segment_ids .iter() - .map(|&id| levels.get_segment(id)) + .map(|&id| levels.get_segment(id).cloned()) .collect::>>() else { log::warn!( @@ -359,7 +351,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - true, // TODO: look at configuration + payload.dest_level <= 2, // TODO: look at configuration ) /* let segment_id = trailer.metadata.id; @@ -435,25 +427,12 @@ fn merge_segments( let mut levels = opts.levels.write().expect("lock is poisoned"); log::trace!("compactor: acquired levels manifest write lock"); - // IMPORTANT: Write the manifest with the removed segments first - // Otherwise the segment files are deleted, but are still referenced! - let swap_result = levels.atomic_swap(|recipe| { - for segment in created_segments.iter().cloned() { - log::trace!("Persisting segment {}", segment.id()); - - recipe - .get_mut(payload.dest_level as usize) - .expect("destination level should exist") - .insert(segment); - } - - for segment_id in &payload.segment_ids { - log::trace!("Removing segment {segment_id}"); - - for level in recipe.iter_mut() { - level.remove(*segment_id); - } - } + let swap_result = levels.atomic_swap(|current| { + current.with_merge( + &payload.segment_ids.iter().copied().collect::>(), + &created_segments, + payload.dest_level as usize, + ) }); if let Err(e) = swap_result { @@ -480,10 +459,10 @@ fn merge_segments( fn drop_segments( mut levels: RwLockWriteGuard<'_, LevelManifest>, opts: &Options, - segment_ids: &[GlobalSegmentId], + ids_to_drop: &[SegmentId], ) -> crate::Result<()> { // Fail-safe for buggy compaction strategies - if levels.should_decline_compaction(segment_ids.iter().map(GlobalSegmentId::segment_id)) { + if levels.should_decline_compaction(ids_to_drop.iter().copied()) { log::warn!( "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), @@ -491,9 +470,9 @@ fn drop_segments( return Ok(()); } - let Some(segments) = segment_ids + let Some(segments) = ids_to_drop .iter() - .map(|id| levels.get_segment(id.segment_id())) + .map(|&id| levels.get_segment(id).cloned()) .collect::>>() else { log::warn!( @@ -505,16 +484,7 @@ fn drop_segments( // IMPORTANT: Write the manifest with the removed segments first // Otherwise the segment files are deleted, but are still referenced! 
- levels.atomic_swap(|recipe| { - for key in segment_ids { - let segment_id = key.segment_id(); - log::trace!("Removing segment {segment_id}"); - - for level in recipe.iter_mut() { - level.remove(segment_id); - } - } - })?; + levels.atomic_swap(|current| current.with_dropped(ids_to_drop))?; drop(levels); @@ -525,7 +495,7 @@ fn drop_segments( segment.mark_as_deleted(); } - log::trace!("Dropped {} segments", segment_ids.len()); + log::trace!("Dropped {} segments", ids_to_drop.len()); Ok(()) } From f9a9bc12c312eb7efd24f55397d8cf0ce5b51734 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:10:21 +0200 Subject: [PATCH 148/613] fix test --- tests/tree_recover_counter.rs | 49 ++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/tests/tree_recover_counter.rs b/tests/tree_recover_counter.rs index 464fcaf3..09176a51 100644 --- a/tests/tree_recover_counter.rs +++ b/tests/tree_recover_counter.rs @@ -26,8 +26,20 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { ); { - let first_level = &tree.levels.read().expect("lock is poisoned").levels[0]; - assert_eq!(0, first_level.segments[0].id()); + assert_eq!( + 0, + tree.manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("should exist") + .first() + .expect("should have exactly 1 run") + .first() + .expect("should have one segment") + .id() + ); } tree.insert("b", "b", 0); @@ -40,10 +52,35 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { .load(std::sync::atomic::Ordering::Relaxed) ); - { - let first_level = &tree.levels.read().expect("lock is poisoned").levels[0]; - assert_eq!(1, first_level.segments[1].id()); - } + assert_eq!( + 1, + tree.manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("should exist") + .first() + .expect("should have at least 1 run") + .first() + .expect("should have one segment") + .id() + ); + + assert_eq!( + 0, + tree.manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("should exist") + .get(1) + .expect("should have at least 1 run") + .first() + .expect("should have one segment") + .id() + ); } { From 7368f3b298c8e02b097ba6232df2337af5dec79e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:13:34 +0200 Subject: [PATCH 149/613] restore segment metadata.tombstone_count --- src/segment/meta.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index d9af86f3..6bd45c18 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -40,6 +40,7 @@ pub struct ParsedMeta { pub seqnos: (SeqNo, SeqNo), pub file_size: u64, pub item_count: u64, + pub tombstone_count: u64, pub data_block_compression: CompressionType, } @@ -95,6 +96,15 @@ impl ParsedMeta { bytes.read_u64::()? }; + let tombstone_count = { + let bytes = block + .point_read(b"#tombstone_count", None) + .expect("Segment ID should exist"); + + let mut bytes = &bytes.value[..]; + bytes.read_u64::()? 
+ }; + let data_block_count = { let bytes = block .point_read(b"#data_block_count", None) @@ -170,6 +180,7 @@ impl ParsedMeta { seqnos, file_size, item_count, + tombstone_count, data_block_compression, }) } From aef9aeb9c0ea41e025a3c627f7e5c42e8b3d00bb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:13:40 +0200 Subject: [PATCH 150/613] update test --- tests/tree_flush_eviction.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index de9b883c..ade483ea 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -82,15 +82,18 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(None, None)?); assert_eq!( 1, - tree.levels + tree.manifest .read() - .unwrap() - .levels + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("should exist") .first() - .unwrap() + .expect("should have at least 1 run") .first() - .unwrap() - .tombstone_count() + .expect("should have one segment") + .metadata + .tombstone_count ); // NOTE: Should evict tombstone because last level @@ -99,15 +102,18 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(None, None)?); assert_eq!( 0, - tree.levels + tree.manifest .read() - .unwrap() - .levels - .last() - .unwrap() + .expect("lock is poisoned") + .current_version() + .level(6) + .expect("should exist") .first() - .unwrap() - .tombstone_count() + .expect("should have at least 1 run") + .first() + .expect("should have one segment") + .metadata + .tombstone_count ); Ok(()) From 492b4d096e32dec667b0d5de3710d48deb85bca9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:15:13 +0200 Subject: [PATCH 151/613] fix --- src/tree/inner.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tree/inner.rs b/src/tree/inner.rs index e8a0add4..988981f9 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -65,7 +65,8 @@ pub struct TreeInner { pub(crate) sealed_memtables: Arc>, /// Current tree version - pub(crate) manifest: Arc>, + #[doc(hidden)] + pub manifest: Arc>, /// Tree configuration pub config: Config, From 89a0b828b31f47c5459999437e7e7c13a9d21e79 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:17:20 +0200 Subject: [PATCH 152/613] wip --- tests/compaction_readers_grouping.rs | 40 ++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index f0df6fb2..33010cd2 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -35,11 +35,41 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { // breaking this tree.compact(Arc::new(lsm_tree::compaction::PullDown(2, 3)), 0)?; - eprintln!("{}", tree.levels.read().expect("asdasd")); - assert!(!tree.levels.read().expect("asdasd").levels[0].is_empty()); - assert!(tree.levels.read().expect("asdasd").levels[1].is_empty()); - assert!(tree.levels.read().expect("asdasd").levels[2].is_empty()); - assert!(!tree.levels.read().expect("asdasd").levels[3].is_empty()); + assert!(!tree + .manifest + .read() + .expect("asdasd") + .current_version() + .level(0) + .expect("level should exist") + .is_empty()); + + assert!(tree + .manifest + .read() + .expect("asdasd") + .current_version() + .level(1) + .expect("level should exist") + .is_empty()); + + assert!(tree + .manifest + .read() + .expect("asdasd") + .current_version() + 
.level(2) + .expect("level should exist") + .is_empty()); + + assert!(!tree + .manifest + .read() + .expect("asdasd") + .current_version() + .level(3) + .expect("level should exist") + .is_empty()); Ok(()) } From fe2145fe832466dcedf5443db14c600bbbb4559f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 02:20:03 +0200 Subject: [PATCH 153/613] fix test --- tests/tree_disjoint_point_read.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index ed22639f..d2ecd62b 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -84,11 +84,11 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { tree.compact(Arc::new(lsm_tree::compaction::SizeTiered::new(10, 8)), 1)?; assert_eq!( 1, - tree.levels + tree.manifest .read() .expect("asdasd") - .levels - .get(1) + .current_version() + .level(1) .unwrap() .len() ); @@ -139,11 +139,11 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { assert_eq!( 1, tree.index - .levels + .manifest .read() .expect("asdasd") - .levels - .get(1) + .current_version() + .level(1) .unwrap() .len() ); From 2f2d407af5320094a51b69df7d7767fc3da22b5d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 15:33:48 +0200 Subject: [PATCH 154/613] fix test --- tests/mvcc_slab.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/mvcc_slab.rs b/tests/mvcc_slab.rs index 079ceff5..21d2a0ef 100644 --- a/tests/mvcc_slab.rs +++ b/tests/mvcc_slab.rs @@ -21,15 +21,16 @@ fn segment_reader_mvcc_slab() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.levels.read().expect("lock is poisoned"); + let level_manifest = tree.manifest.read().expect("lock is poisoned"); let segment = level_manifest - .levels + .current_version() + .level(0) + .expect("level should exist") .first() - .expect("should exist") - .segments + .expect("run should exist") .first() - .expect("should exist"); + .expect("segment should exist"); let reader = segment.iter(); assert_eq!(reader.count(), ITEM_COUNT + 1); @@ -57,15 +58,16 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.index.levels.read().expect("lock is poisoned"); + let level_manifest = tree.index.manifest.read().expect("lock is poisoned"); let segment = level_manifest - .levels + .current_version() + .level(0) + .expect("level should exist") .first() - .expect("should exist") - .segments + .expect("run should exist") .first() - .expect("should exist"); + .expect("segment should exist"); let reader = segment.iter(); assert_eq!(reader.count(), ITEM_COUNT + 1); From 0a136c59c48d4b7e4ce8091831a829b55fd55f83 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 27 May 2025 15:34:41 +0200 Subject: [PATCH 155/613] fix test --- tests/multi_trees.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/multi_trees.rs b/tests/multi_trees.rs index ae4ddd8b..c7015ec7 100644 --- a/tests/multi_trees.rs +++ b/tests/multi_trees.rs @@ -30,7 +30,17 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { assert_eq!( 0, - tree0.levels.read().expect("lock is poisoned").levels[0].segments[0] + tree0 + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("level should exist") + .first() + .expect("run should exist") + .first() + .expect("segment should 
exist") .metadata .id ); @@ -59,7 +69,17 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { assert_eq!( 0, - tree1.levels.read().expect("lock is poisoned").levels[0].segments[0] + tree1 + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .level(0) + .expect("level should exist") + .first() + .expect("run should exist") + .first() + .expect("segment should exist") .metadata .id ); From e1a5576dc9e22b21733da603c413e17c15871b90 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 30 May 2025 18:38:28 +0200 Subject: [PATCH 156/613] add fallible clipping iter --- src/fallible_clipping_iter.rs | 326 ++++++++++++++++++++++++++++++++++ src/lib.rs | 2 + 2 files changed, 328 insertions(+) create mode 100644 src/fallible_clipping_iter.rs diff --git a/src/fallible_clipping_iter.rs b/src/fallible_clipping_iter.rs new file mode 100644 index 00000000..6c320d3a --- /dev/null +++ b/src/fallible_clipping_iter.rs @@ -0,0 +1,326 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::InternalValue; +use std::{ + marker::PhantomData, + ops::{Bound, RangeBounds}, +}; + +type Item = crate::Result; + +/// Clips an iterator to a key range +pub struct FallibleClippingIter +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator, +{ + _phantom: std::marker::PhantomData, + + inner: I, + range: R, + + has_entered_lo: bool, + has_entered_hi: bool, +} + +impl FallibleClippingIter +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator, +{ + pub fn new(iter: I, range: R) -> Self { + Self { + _phantom: PhantomData, + + inner: iter, + range, + + has_entered_lo: false, + has_entered_hi: false, + } + } +} + +impl Iterator for FallibleClippingIter +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator, +{ + type Item = Item; + + fn next(&mut self) -> Option { + loop { + let item = fail_iter!(self.inner.next()?); + + // NOTE: PERF: As soon as we enter ->[lo..] 
+ // we don't need to do key comparisons anymore which are + // more expensive than a simple flag check, especially for long keys + if !self.has_entered_lo { + match self.range.start_bound() { + Bound::Included(start) => { + if item.key.user_key < start.as_ref() { + // Before min key + continue; + } + self.has_entered_lo = true; + } + Bound::Excluded(start) => { + if item.key.user_key <= start.as_ref() { + // Before or equal min key + continue; + } + self.has_entered_lo = true; + } + Bound::Unbounded => {} + } + } + + match self.range.end_bound() { + Bound::Included(start) => { + if item.key.user_key > start.as_ref() { + // After max key + return None; + } + } + Bound::Excluded(start) => { + if item.key.user_key >= start.as_ref() { + // Reached max key + return None; + } + } + Bound::Unbounded => {} + } + + return Some(Ok(item)); + } + } +} + +impl DoubleEndedIterator for FallibleClippingIter +where + K: AsRef<[u8]>, + R: RangeBounds, + I: DoubleEndedIterator, +{ + fn next_back(&mut self) -> Option { + loop { + let item = fail_iter!(self.inner.next_back()?); + + match self.range.start_bound() { + Bound::Included(start) => { + if item.key.user_key < start.as_ref() { + // Reached min key + return None; + } + } + Bound::Excluded(start) => { + if item.key.user_key <= start.as_ref() { + // Before min key + return None; + } + } + Bound::Unbounded => {} + } + + // NOTE: PERF: As soon as we enter [..hi]<- + // we don't need to do key comparisons anymore which are + // more expensive than a simple flag check, especially for long keys + if !self.has_entered_hi { + match self.range.end_bound() { + Bound::Included(end) => { + if item.key.user_key > end.as_ref() { + // After max key + continue; + } + self.has_entered_hi = true; + } + Bound::Excluded(end) => { + if item.key.user_key >= end.as_ref() { + // After or equal max key + continue; + } + self.has_entered_hi = true; + } + Bound::Unbounded => {} + } + } + + return Some(Ok(item)); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn v3_clipping_iter_forwards() -> crate::Result<()> { + let items = [ + Ok(InternalValue::from_components( + b"a", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"b", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"c", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"d", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"e", + b"", + 0, + crate::ValueType::Value, + )), + ]; + let range = "c"..="d"; + + let mut iter = FallibleClippingIter::new(items.into_iter(), range); + assert_eq!( + Some(b"c" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert_eq!( + Some(b"d" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert!(iter.next().is_none()); + + Ok(()) + } + + #[test] + fn v3_clipping_iter_rev() -> crate::Result<()> { + let items = [ + Ok(InternalValue::from_components( + b"a", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"b", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"c", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"d", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"e", + b"", + 0, + crate::ValueType::Value, + )), + ]; + let range = "c"..="d"; + + let mut iter = FallibleClippingIter::new(items.into_iter(), 
range); + assert_eq!( + Some(b"d" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert_eq!( + Some(b"c" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert!(iter.next_back().is_none()); + + Ok(()) + } + + #[test] + fn v3_clipping_iter_ping_pong() -> crate::Result<()> { + let items = [ + Ok(InternalValue::from_components( + b"a", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"b", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"c", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"d", + b"", + 0, + crate::ValueType::Value, + )), + Ok(InternalValue::from_components( + b"e", + b"", + 0, + crate::ValueType::Value, + )), + ]; + let range = "b"..="d"; + + let mut iter = FallibleClippingIter::new(items.into_iter(), range); + assert_eq!( + Some(b"b" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert_eq!( + Some(b"d" as &[u8]), + iter.next_back() + .transpose()? + .map(|x| x.key.user_key) + .as_deref(), + ); + assert_eq!( + Some(b"c" as &[u8]), + iter.next().transpose()?.map(|x| x.key.user_key).as_deref(), + ); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 06bb5ca4..f81b81f9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,6 +138,8 @@ mod config; mod error; // mod export; +pub(crate) mod fallible_clipping_iter; + #[doc(hidden)] pub mod file; From 9fed8cf23dcc234e3453bcea14c26ce53542eede Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Tue, 3 Jun 2025 02:48:29 +0200 Subject: [PATCH 157/613] Update mod.rs --- src/segment/block/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 906208d2..8f2eca24 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -23,9 +23,9 @@ use crate::{ use std::fs::File; use xxhash_rust::xxh3::xxh3_64; -/// A block on disk. +/// A block on disk /// -/// Consists of a header and some bytes (the data/payload). +/// Consists of a fixed-size header and some bytes (the data/payload). #[derive(Clone)] pub struct Block { pub header: Header, From f791ca6511cd9b1b8a2171219ee24d2ddaf4dfe4 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Tue, 3 Jun 2025 02:49:22 +0200 Subject: [PATCH 158/613] Update offset.rs --- src/segment/block/offset.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/block/offset.rs b/src/segment/block/offset.rs index 4f023296..a69f93ac 100644 --- a/src/segment/block/offset.rs +++ b/src/segment/block/offset.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + // TODO: rename FileOffset? 
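// NOTE: A newtype over u64 (rather than a bare integer) so that block offsets
// cannot silently be mixed up with sizes or other counters at the type level.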
#[derive(Copy, Clone, Default, Debug, std::hash::Hash, PartialEq, Eq, Ord, PartialOrd)] pub struct BlockOffset(pub u64); From 083d818777f231b5a01864ebfd9e73fc76dba972 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Tue, 3 Jun 2025 02:50:39 +0200 Subject: [PATCH 159/613] Update trailer.rs --- src/segment/trailer.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs index c6504fca..15bf5287 100644 --- a/src/segment/trailer.rs +++ b/src/segment/trailer.rs @@ -16,6 +16,8 @@ const TRAILER_SIZE: usize = 32; /// The fixed-size segment trailer stores a block handle to the regions block /// +/// # Diagram +/// /// ---------------- /// | data blocks | <- implicitly start at 0 /// |--------------| From 6f0f5ed952394b98d623634c1aeb4898b95ed675 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Tue, 3 Jun 2025 02:52:13 +0200 Subject: [PATCH 160/613] Update trailer.rs --- src/segment/trailer.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs index 15bf5287..6cc0ff36 100644 --- a/src/segment/trailer.rs +++ b/src/segment/trailer.rs @@ -19,21 +19,21 @@ const TRAILER_SIZE: usize = 32; /// # Diagram /// /// ---------------- -/// | data blocks | <- implicitly start at 0 +/// | data blocks | <- implicitly start at 0 /// |--------------| -/// | tli block | +/// | tli block | /// |--------------| -/// | index block | <- may not exist (if full block index is used, TLI will be dense) +/// | index block | <- may not exist (if full block index is used, TLI will be dense) /// |--------------| /// | filter block | <- may not exist /// |--------------| /// | ... TBD ... | /// |--------------| -/// | meta block | +/// | meta block | /// |--------------| /// | region block | /// |--------------| -/// | trailer | <- fixed size +/// | trailer | <- fixed size /// |--------------| /// /// Through this indirection, we can have a variable amount of region block handles. 
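A minimal sketch of how a reader can make use of such a fixed-size trailer, assuming a
32-byte trailer whose first 16 bytes are a little-endian (offset, size) handle to the
region block; the helper name and exact byte layout here are illustrative, not the
crate's actual on-disk format:

use std::fs::File;
use std::io::{Read, Seek, SeekFrom};

const TRAILER_SIZE: i64 = 32;

// Hypothetical helper: locate the trailer at the end of the file and read the
// handle that points to the region block, which in turn lists the other regions.
fn read_region_handle(file: &mut File) -> std::io::Result<(u64, u64)> {
    // Because the trailer has a fixed size, it always occupies the last
    // TRAILER_SIZE bytes of the file, no matter how many regions precede it.
    file.seek(SeekFrom::End(-TRAILER_SIZE))?;

    let mut buf = [0u8; 16];
    file.read_exact(&mut buf)?;

    // Assumed layout: [region offset: u64 LE][region size: u64 LE][checksum/padding...]
    let offset = u64::from_le_bytes(buf[0..8].try_into().expect("slice is 8 bytes"));
    let size = u64::from_le_bytes(buf[8..16].try_into().expect("slice is 8 bytes"));

    Ok((offset, size))
}

Since only the trailer has a fixed position and size, the region block it points to can
hold a variable number of handles, as the doc comment in the patch above notes.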
From a4393c7a3b2d6cc111c4a778f1671ed42f198588 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 3 Jun 2025 18:42:21 +0200 Subject: [PATCH 161/613] use vlog git dep --- Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7757276a..53ba5d57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,8 @@ quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" tempfile = "3.12.0" -value-log = { version = "~1.9", default-features = false, features = [] } +value-log = { git = "https://github.com/fjall-rs/value-log", branch = "v2", default-features = false, features = [ +] } varint-rs = "2.2.0" xxhash-rust = { version = "0.8.12", features = ["xxh3"] } From 5c8d1edfc41a9ee9b94d44426037d908e722af33 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:22:34 +0200 Subject: [PATCH 162/613] add conversion method for InternalKey --- src/key.rs | 82 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/src/key.rs b/src/key.rs index 045756f2..df67a106 100644 --- a/src/key.rs +++ b/src/key.rs @@ -21,6 +21,18 @@ pub struct InternalKey { pub value_type: ValueType, } +impl<'a> From<&InternalKeyRef<'a>> for InternalKey { + fn from(value: &InternalKeyRef<'a>) -> Self { + Self::new(value.user_key, value.seqno, value.value_type) + } +} + +impl AsRef<[u8]> for InternalKey { + fn as_ref(&self) -> &[u8] { + &self.user_key + } +} + impl PartialEq for InternalKey { fn eq(&self, other: &Self) -> bool { self.user_key == other.user_key && self.seqno == other.seqno @@ -134,39 +146,45 @@ impl Ord for InternalKey { // } // } -// Temporary internal key without heap allocation -// #[derive(Debug, Eq)] -// pub struct InternalKeyRef<'a> { -// pub user_key: &'a [u8], -// pub seqno: SeqNo, -// pub value_type: ValueType, -// } +/* /// Temporary internal key without heap allocation +#[derive(Clone, Debug, Eq)] +pub struct InternalKeyRef<'a> { + pub user_key: &'a [u8], + pub seqno: SeqNo, + pub value_type: ValueType, +} -// impl<'a> InternalKeyRef<'a> { -// // Constructor for InternalKeyRef -// pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { -// InternalKeyRef { -// user_key, -// seqno, -// value_type, -// } -// } -// } +impl<'a> AsRef<[u8]> for InternalKeyRef<'a> { + fn as_ref(&self) -> &[u8] { + self.user_key + } +} -// impl<'a> PartialEq for InternalKeyRef<'a> { -// fn eq(&self, other: &Self) -> bool { -// self.user_key == other.user_key && self.seqno == other.seqno -// } -// } +impl<'a> InternalKeyRef<'a> { + // Constructor for InternalKeyRef + pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { + InternalKeyRef { + user_key, + seqno, + value_type, + } + } +} -// impl<'a> PartialOrd for InternalKeyRef<'a> { -// fn partial_cmp(&self, other: &Self) -> Option { -// Some(self.cmp(other)) -// } -// } +impl<'a> PartialEq for InternalKeyRef<'a> { + fn eq(&self, other: &Self) -> bool { + self.user_key == other.user_key && self.seqno == other.seqno + } +} -// impl<'a> Ord for InternalKeyRef<'a> { -// fn cmp(&self, other: &Self) -> std::cmp::Ordering { -// (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) -// } -// } +impl<'a> PartialOrd for InternalKeyRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> Ord for InternalKeyRef<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (&self.user_key, 
Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) + } +} */ From 94e82ed53ee5baef20c8a7f0f9f84e624c057623 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:24:48 +0200 Subject: [PATCH 163/613] change segment path type --- src/compaction/worker.rs | 4 +--- src/segment/inner.rs | 4 ++-- src/segment/mod.rs | 8 ++++---- src/tree/ingest.rs | 4 +--- src/tree/mod.rs | 9 +++++---- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 9dd8e9a5..c7487e82 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -344,10 +344,8 @@ fn merge_segments( let created_segments = writer_results .into_iter() .map(|segment_id| -> crate::Result { - let segment_file_path = segments_base_folder.join(segment_id.to_string()); - Segment::recover( - &segment_file_path, + segments_base_folder.join(segment_id.to_string()), opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), diff --git a/src/segment/inner.rs b/src/segment/inner.rs index 8b16085c..0438cb99 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -12,7 +12,7 @@ use std::{ }; pub struct Inner { - pub path: PathBuf, + pub path: Arc, pub(crate) tree_id: TreeId, @@ -53,7 +53,7 @@ impl Drop for Inner { if self.is_deleted.load(std::sync::atomic::Ordering::Acquire) { log::trace!("Cleanup deleted segment {global_id:?} at {:?}", self.path); - if let Err(e) = std::fs::remove_file(&self.path) { + if let Err(e) = std::fs::remove_file(&*self.path) { log::warn!( "Failed to cleanup deleted segment {global_id:?} at {:?}: {e:?}", self.path, diff --git a/src/segment/mod.rs b/src/segment/mod.rs index ca203fb4..13a45688 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -35,7 +35,7 @@ use inner::Inner; use meta::ParsedMeta; use std::{ ops::{Bound, RangeBounds}, - path::Path, + path::PathBuf, sync::{atomic::AtomicBool, Arc}, }; @@ -295,7 +295,7 @@ impl Segment { /// Tries to recover a segment from a file. 
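    ///
    /// Reads the fixed-size trailer from the end of the file first; the trailer's
    /// handles are then used to locate the remaining regions (metadata, indexes).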
pub fn recover( - file_path: &Path, + file_path: PathBuf, tree_id: TreeId, cache: Arc, descriptor_table: Arc, @@ -304,7 +304,7 @@ impl Segment { use trailer::Trailer; log::debug!("Recovering segment from file {file_path:?}"); - let mut file = std::fs::File::open(file_path)?; + let mut file = std::fs::File::open(&file_path)?; let trailer = Trailer::from_file(&mut file)?; log::trace!("Got trailer: {trailer:#?}"); @@ -370,7 +370,7 @@ impl Segment { descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); let segment = Self(Arc::new(Inner { - path: file_path.into(), + path: Arc::new(file_path), tree_id, metadata, diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 31207873..a8cfd7d0 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -78,10 +78,8 @@ impl<'a> Ingestion<'a> { let created_segments = results .into_iter() .map(|segment_id| -> crate::Result { - let segment_file_path = self.folder.join(segment_id.to_string()); - Segment::recover( - &segment_file_path, + self.folder.join(segment_id.to_string()), self.tree.id, self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 3d8f73d9..7e54c309 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -515,14 +515,14 @@ impl Tree { .insert(segment_file_path, created_segment.global_id()); */ let created_segment = Segment::recover( - &segment_file_path, + segment_file_path, self.id, self.config.cache.clone(), self.config.descriptor_table.clone(), true, // TODO: look at configuration )?; - log::debug!("Flushed segment to {segment_file_path:?}"); + log::debug!("Flushed segment to {:?}", created_segment.path); Ok(Some(created_segment)) } @@ -923,15 +923,16 @@ impl Tree { if let Some(&_level_idx) = segment_id_map.get(&segment_id) { let segment = Segment::recover( - &segment_file_path, + segment_file_path, tree_id, cache.clone(), descriptor_table.clone(), true, // TODO: look at configuration )?; + log::debug!("Recovered segment from {:?}", segment.path); + segments.push(segment); - log::debug!("Recovered segment from {segment_file_path:?}"); if idx % progress_mod == 0 { log::debug!("Recovered {idx}/{cnt} disk segments"); From 6aeb57815ebf2431b3de1e3b487356b8968877d8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:25:07 +0200 Subject: [PATCH 164/613] refactor --- src/blob_tree/gc/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs index 867b7475..1a1a8ace 100644 --- a/src/blob_tree/gc/reader.rs +++ b/src/blob_tree/gc/reader.rs @@ -33,7 +33,7 @@ impl<'a> GcReader<'a> { } } -impl<'a> value_log::IndexReader for GcReader<'a> { +impl value_log::IndexReader for GcReader<'_> { fn get(&self, key: &[u8]) -> std::io::Result> { use std::io::{Error as IoError, ErrorKind as IoErrorKind}; use MaybeInlineValue::{Indirect, Inline}; From 03ac00ec8c18a1b2ab234de04ded7857cc27dbbd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:25:27 +0200 Subject: [PATCH 165/613] adjust test --- tests/tree_recover_counter.rs | 47 ----------------------------------- 1 file changed, 47 deletions(-) diff --git a/tests/tree_recover_counter.rs b/tests/tree_recover_counter.rs index 09176a51..ac65283a 100644 --- a/tests/tree_recover_counter.rs +++ b/tests/tree_recover_counter.rs @@ -25,23 +25,6 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { .load(std::sync::atomic::Ordering::Relaxed) ); - { - assert_eq!( - 0, - tree.manifest - .read() - .expect("lock is 
poisoned") - .current_version() - .level(0) - .expect("should exist") - .first() - .expect("should have exactly 1 run") - .first() - .expect("should have one segment") - .id() - ); - } - tree.insert("b", "b", 0); tree.flush_active_memtable(0)?; @@ -51,36 +34,6 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { .segment_id_counter .load(std::sync::atomic::Ordering::Relaxed) ); - - assert_eq!( - 1, - tree.manifest - .read() - .expect("lock is poisoned") - .current_version() - .level(0) - .expect("should exist") - .first() - .expect("should have at least 1 run") - .first() - .expect("should have one segment") - .id() - ); - - assert_eq!( - 0, - tree.manifest - .read() - .expect("lock is poisoned") - .current_version() - .level(0) - .expect("should exist") - .get(1) - .expect("should have at least 1 run") - .first() - .expect("should have one segment") - .id() - ); } { From f5d330151b38eefbc8aecde8fccece17d9d8a7e6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:25:55 +0200 Subject: [PATCH 166/613] test: temp disable some tests --- tests/compaction_readers_grouping.rs | 1 + tests/tree_bulk_ingest.rs | 4 ++++ tests/tree_disjoint_point_read.rs | 2 ++ tests/tree_flush_eviction.rs | 2 ++ 4 files changed, 9 insertions(+) diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index 33010cd2..3e6ead60 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use test_log::test; #[test] +#[ignore] fn compaction_readers_grouping() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index 4714821e..b29529cb 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -4,6 +4,7 @@ use test_log::test; const ITEM_COUNT: usize = 100_000; #[test] +#[ignore] fn tree_bulk_ingest() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -29,6 +30,7 @@ fn tree_bulk_ingest() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_copy() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let src = Config::new(folder).open()?; @@ -73,6 +75,7 @@ fn tree_copy() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -101,6 +104,7 @@ fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_tree_copy() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let src = Config::new(folder) diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index d2ecd62b..14c858d9 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -61,6 +61,7 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.into_path(); @@ -115,6 +116,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.into_path(); diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index ade483ea..6d881076 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -41,6 +41,7 @@ fn tree_flush_eviction_2() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_flush_eviction_3() -> lsm_tree::Result<()> { let 
folder = tempfile::tempdir()?; let path = folder.path(); @@ -65,6 +66,7 @@ fn tree_flush_eviction_3() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_flush_eviction_4() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); From 5a820d66bd62c0d2fb7ead857a89d60d65c4ebc4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:26:48 +0200 Subject: [PATCH 167/613] add log --- src/tree/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 7e54c309..1708521f 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -207,6 +207,8 @@ impl AbstractTree for Tree { } fn register_segments(&self, segments: &[Segment]) -> crate::Result<()> { + log::trace!("Registering {} segments", segments.len()); + // NOTE: Mind lock order L -> M -> S log::trace!("register: Acquiring levels manifest write lock"); let mut manifest = self.manifest.write().expect("lock is poisoned"); From 262ba91fbc70dbe20bc5b6d01c7edb69417815d8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:27:00 +0200 Subject: [PATCH 168/613] refactor: change type signature --- src/version/run.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/version/run.rs b/src/version/run.rs index d98281ef..761eac68 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -45,7 +45,7 @@ impl std::ops::Deref for Indexed { } /// A disjoint run of disk segments -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct Run(Vec); impl std::ops::Deref for Run { @@ -108,9 +108,9 @@ impl Run { } /// Returns the indexes of the interval [min, max] of segments that overlap with a given range. - pub fn range_indexes<'a, R: RangeBounds<&'a [u8]>>( + pub fn range_indexes, R: RangeBounds>( &self, - key_range: R, + key_range: &R, ) -> Option<(usize, usize)> { let level = &self.0; @@ -242,16 +242,16 @@ mod tests { let run = Run(items); - assert_eq!(Some((0, 3)), run.range_indexes(..)); - assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"a")); - assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"b")); - assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..=b"d")); - assert_eq!(Some((0, 0)), run.range_indexes(b"a" as &[u8]..b"d")); - assert_eq!(Some((0, 1)), run.range_indexes(b"a" as &[u8]..=b"g")); - assert_eq!(Some((0, 3)), run.range_indexes(b"a" as &[u8]..=b"z")); - assert_eq!(Some((3, 3)), run.range_indexes(b"z" as &[u8]..=b"zzz")); - assert_eq!(Some((3, 3)), run.range_indexes(b"z" as &[u8]..)); - assert!(run.range_indexes(b"zzz" as &[u8]..=b"zzzzzzz").is_none()); + assert_eq!(Some((0, 3)), run.range_indexes::<&[u8], _>(&..)); + assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"a"))); + assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"b"))); + assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"d"))); + assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..b"d"))); + assert_eq!(Some((0, 1)), run.range_indexes(&(b"a" as &[u8]..=b"g"))); + assert_eq!(Some((0, 3)), run.range_indexes(&(b"a" as &[u8]..=b"z"))); + assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..=b"zzz"))); + assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..))); + assert!(run.range_indexes(&(b"zzz" as &[u8]..=b"zzzzzzz")).is_none()); } #[test] From d6e7e6f201fbcf1c6a5ac9e08c1ef254116c15ad Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:27:22 +0200 Subject: [PATCH 169/613] change segment print helper --- src/segment/mod.rs | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 13a45688..d9cce249 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -89,7 +89,7 @@ impl std::ops::Deref for Segment { impl std::fmt::Debug for Segment { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Segment:{}({})", self.id(), self.metadata.key_range) + write!(f, "Segment:{}({:?})", self.id(), self.metadata.key_range) } } From cb01faebc0b9adaf4aae53f3a2622f169d5cda13 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:28:10 +0200 Subject: [PATCH 170/613] add load_block helper fn --- src/segment/mod.rs | 42 ++++++++++++------------------------------ src/segment/util.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d9cce249..b9634c16 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -23,6 +23,7 @@ pub use id::{GlobalSegmentId, SegmentId}; pub use index_block::{BlockHandle, IndexBlock, KeyedBlockHandle}; use regions::ParsedRegions; pub use scanner::Scanner; +use util::load_block; pub use writer::Writer; use crate::{ @@ -122,33 +123,14 @@ impl Segment { handle: &BlockHandle, compression: CompressionType, ) -> crate::Result { - let id = self.global_id(); - - if let Some(block) = self.cache.get_block(id, handle.offset()) { - return Ok(block); - } - - let cached_fd = self.descriptor_table.access_for_table(&id); - let fd_cache_miss = cached_fd.is_none(); - - let fd = if let Some(fd) = cached_fd { - fd - } else { - Arc::new(std::fs::File::open(&self.path)?) - }; - - let block = Block::from_file(&fd, handle.offset(), handle.size(), compression)?; - - let id = self.global_id(); - - // Cache FD - if fd_cache_miss { - self.descriptor_table.insert_for_table(id, fd); - } - - self.cache.insert_block(id, handle.offset(), block.clone()); - - Ok(block) + load_block( + self.global_id(), + &self.path, + &self.descriptor_table, + &self.cache, + handle, + compression, + ) } fn load_data_block(&self, handle: &BlockHandle) -> crate::Result { @@ -459,7 +441,7 @@ mod tests { { let segment = Segment::recover( - &file, + file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), @@ -541,7 +523,7 @@ mod tests { { let segment = Segment::recover( - &file, + file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), @@ -633,7 +615,7 @@ mod tests { { let segment = Segment::recover( - &file, + file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), diff --git a/src/segment/util.rs b/src/segment/util.rs index 79e3a6b0..fc4d6422 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -2,6 +2,46 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use super::{Block, BlockHandle, GlobalSegmentId}; +use crate::{Cache, CompressionType, DescriptorTable}; +use std::{path::Path, sync::Arc}; + +pub fn load_block( + segment_id: GlobalSegmentId, + path: &Path, + descriptor_table: &DescriptorTable, + cache: &Cache, + handle: &BlockHandle, + compression: CompressionType, +) -> crate::Result { + log::trace!("load block {handle:?}"); + + if let Some(block) = cache.get_block(segment_id, handle.offset()) { + return Ok(block); + } + + let cached_fd = descriptor_table.access_for_table(&segment_id); + let fd_cache_miss = cached_fd.is_none(); + + let fd = if let Some(fd) = cached_fd { + fd + } 
else { + Arc::new(std::fs::File::open(path)?) + }; + + let block = Block::from_file(&fd, handle.offset(), handle.size(), compression)?; + + // Cache FD + if fd_cache_miss { + descriptor_table.insert_for_table(segment_id, fd); + } + + cache.insert_block(segment_id, handle.offset(), block.clone()); + + Ok(block) +} + +#[must_use] pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { s1.iter() .zip(s2.iter()) From 2d4c7735339f4bd1a2acfc5855d08d449086afeb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:28:23 +0200 Subject: [PATCH 171/613] refactor simplified compare_prefixed_slice --- src/segment/util.rs | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/segment/util.rs b/src/segment/util.rs index fc4d6422..423f88b0 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -50,8 +50,9 @@ pub fn longest_shared_prefix_length(s1: &[u8], s2: &[u8]) -> usize { } // TODO: Fuzz test +#[must_use] pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> std::cmp::Ordering { - use std::cmp::Ordering::{Equal, Greater, Less}; + use std::cmp::Ordering::{Equal, Greater}; if needle.is_empty() { let combined_len = prefix.len() + suffix.len(); @@ -59,35 +60,33 @@ pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> st return if combined_len > 0 { Greater } else { Equal }; } - match prefix.len().cmp(&needle.len()) { - Equal => match prefix.cmp(needle) { + let max_pfx_len = prefix.len().min(needle.len()); + + { + #[allow(unsafe_code)] + let prefix = unsafe { prefix.get_unchecked(0..max_pfx_len) }; + + #[allow(unsafe_code)] + let needle = unsafe { needle.get_unchecked(0..max_pfx_len) }; + + match prefix.cmp(needle) { Equal => {} ordering => return ordering, - }, - Greater => { - // SAFETY: We know that the prefix is longer than the needle, so we can safely - // truncate it to the needle's length - #[allow(unsafe_code)] - let prefix = unsafe { prefix.get_unchecked(0..needle.len()) }; - return prefix.cmp(needle); } - Less => { - // SAFETY: We know that the needle is longer than the prefix, so we can safely - // truncate it to the prefix's length - #[allow(unsafe_code)] - let needle = unsafe { needle.get_unchecked(0..prefix.len()) }; - - match prefix.cmp(needle) { - Equal => {} - ordering => return ordering, - } + } + + let rest_len = needle.len() - max_pfx_len; + if rest_len == 0 { + if !suffix.is_empty() { + return std::cmp::Ordering::Greater; } + return std::cmp::Ordering::Equal; } // SAFETY: We know that the prefix is definitely not longer than the needle // so we can safely truncate #[allow(unsafe_code)] - let needle = unsafe { needle.get_unchecked(prefix.len()..) }; + let needle = unsafe { needle.get_unchecked(max_pfx_len..) }; suffix.cmp(needle) } From 7977171d07304301220f1a0c42383a3a281e94ef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:28:45 +0200 Subject: [PATCH 172/613] add license header --- src/segment/block/offset.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/block/offset.rs b/src/segment/block/offset.rs index 4f023296..a69f93ac 100644 --- a/src/segment/block/offset.rs +++ b/src/segment/block/offset.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2025-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + // TODO: rename FileOffset? 
#[derive(Copy, Clone, Default, Debug, std::hash::Hash, PartialEq, Eq, Ord, PartialOrd)] pub struct BlockOffset(pub u64); From 1d7a257b35bd918c60f1c35287ee42757bdb8ee9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:29:08 +0200 Subject: [PATCH 173/613] temp disable another test --- src/compaction/worker.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index c7487e82..dfec3033 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -505,6 +505,7 @@ mod tests { use test_log::test; #[test] + #[ignore] fn compaction_drop_segments() -> crate::Result<()> { let folder = tempfile::tempdir()?; From a7b195c6aa6b7a188b3a7ea47113181a7fb0e47e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:29:56 +0200 Subject: [PATCH 174/613] fix: key range partitions --- src/version/key_range_partition.rs | 101 ++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 8 deletions(-) diff --git a/src/version/key_range_partition.rs b/src/version/key_range_partition.rs index a3a2c524..819f6cdb 100644 --- a/src/version/key_range_partition.rs +++ b/src/version/key_range_partition.rs @@ -44,12 +44,14 @@ impl> KeyRangePartitions pub fn index_segment(&mut self, segment: &T) { let key_range = &segment.key_range(); let start_key = key_range.min(); - let end_key = key_range.max(); let idx = partition_point(&self.0, |x| x.key_range.max() < start_key); if let Some(slice) = self.0.get_mut(idx..) { - for partition in slice.iter_mut().filter(|x| x.key_range.max() <= end_key) { + for partition in slice + .iter_mut() + .filter(|x| x.key_range.overlaps_with_key_range(key_range)) + { partition.segments.push_back(segment.clone()); } } @@ -59,7 +61,11 @@ impl> KeyRangePartitions let mut optimized = VecDeque::new(); let mut blacklist = HashSet::::default(); - loop { + while self + .0 + .iter() + .any(|partition| !partition.segments.is_empty()) + { let run = { let mut v: Vec = vec![]; @@ -71,6 +77,7 @@ impl> KeyRangePartitions let curr_id = front.id(); if blacklist.contains(&curr_id) { + partition.segments.pop_front().expect("front should exist"); continue; } @@ -90,17 +97,15 @@ impl> KeyRangePartitions v }; - if run.is_empty() { - break; - } - #[cfg(debug_assertions)] { let ranges = run.iter().map(Ranged::key_range).collect::>(); debug_assert!(KeyRange::is_disjoint(&ranges)); } - optimized.push_front(run); + if !run.is_empty() { + optimized.push_front(run); + } } optimized.into() @@ -130,6 +135,84 @@ mod tests { } } + #[test] + fn key_range_partition_single_key_twice() { + let a = FauxSegment { + key_range: KeyRange::new((UserKey::new(&[0; 8]), UserKey::new(&[0; 8]))), + id: 0, + }; + let b = FauxSegment { + key_range: KeyRange::new((UserKey::new(&[0; 8]), UserKey::new(&[0; 8]))), + id: 1, + }; + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(&[0; 8]), + UserKey::new(&[0; 8]), + ))); + + index.index_segment(&a); + index.index_segment(&b); + + assert_eq!( + vec![vec![b.clone()], vec![a.clone()]], + index.into_optimized_runs() + ); + } + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(&[0; 8]), + UserKey::new(&[0; 8]), + ))); + + index.index_segment(&b); + index.index_segment(&a); + + assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); + } + } + + #[test] + fn key_range_partition_single_key() { + let a = FauxSegment { + key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), + id: 0, + }; + let b = FauxSegment { + key_range: 
KeyRange::new((UserKey::new(b"a"), UserKey::new(b"a"))), + id: 1, + }; + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(b"a"), + UserKey::new(b"b"), + ))); + + index.index_segment(&a); + index.index_segment(&b); + + assert_eq!( + vec![vec![b.clone()], vec![a.clone()]], + index.into_optimized_runs() + ); + } + + { + let mut index = KeyRangePartitions::::new(std::iter::once(( + UserKey::new(b"a"), + UserKey::new(b"b"), + ))); + + index.index_segment(&b); + index.index_segment(&a); + + assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); + } + } + #[test] fn key_range_partition_one_segment() { let segment = FauxSegment { @@ -171,6 +254,8 @@ mod tests { index.index_segment(&a); index.index_segment(&b); + eprintln!("{index:#?}"); + assert_eq!( vec![vec![a.clone(), b.clone()]], index.into_optimized_runs() From 2eef118145ea13d9d53b1f4076338cc165e4147b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:30:35 +0200 Subject: [PATCH 175/613] naming --- src/segment/index_block/mod.rs | 79 +++++++++++++++++----------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 59d583ac..7b36a671 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -26,6 +26,7 @@ macro_rules! unwrappy { } /// Block that contains block handles (file offset + size) +#[derive(Clone)] pub struct IndexBlock { pub inner: Block, @@ -280,7 +281,7 @@ impl IndexBlock { let offset = self.search_lowest(&binary_index, needle)?; - // SAFETY: pos is always retrieved from the binary index, + // SAFETY: offset is always retrieved from the binary index, // which we consider to be trustworthy #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); @@ -304,7 +305,7 @@ impl IndexBlock { let offset = self.search_highest(&binary_index, needle)?; - // SAFETY: pos is always retrieved from the binary index, + // SAFETY: offset is always retrieved from the binary index, // which we consider to be trustworthy #[warn(unsafe_code)] let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) 
}); @@ -357,11 +358,11 @@ mod tests { ]; let bytes = IndexBlock::encode_items(&items, 1)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ - let data_block = IndexBlock::new(Block { + let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), @@ -371,23 +372,23 @@ mod tests { }, }); - assert_eq!(data_block.len(), items.len()); + assert_eq!(index_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), - data_block.get_lowest_possible_block(b"a") + index_block.get_lowest_possible_block(b"a") ); assert_eq!( Some(items.first().unwrap().clone()), - data_block.get_lowest_possible_block(b"b") + index_block.get_lowest_possible_block(b"b") ); assert_eq!( Some(items.get(1).unwrap().clone()), - data_block.get_lowest_possible_block(b"ba") + index_block.get_lowest_possible_block(b"ba") ); assert_eq!( Some(items.get(2).unwrap().clone()), - data_block.get_lowest_possible_block(b"d") + index_block.get_lowest_possible_block(b"d") ); // assert_eq!(None, data_block.get_lowest_possible_block(b"zzz")); @@ -409,7 +410,7 @@ mod tests { // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ - let data_block = IndexBlock::new(Block { + let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), @@ -419,19 +420,19 @@ mod tests { }, }); - assert_eq!(data_block.len(), items.len()); + assert_eq!(index_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), - data_block.get_lowest_possible_block(b"a") + index_block.get_lowest_possible_block(b"a") ); assert_eq!( Some(items.last().unwrap().clone()), - data_block.get_lowest_possible_block(b"abc") + index_block.get_lowest_possible_block(b"abc") ); assert_eq!( Some(items.last().unwrap().clone()), - data_block.get_lowest_possible_block(b"b") + index_block.get_lowest_possible_block(b"b") ); Ok(()) @@ -452,7 +453,7 @@ mod tests { // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ - let data_block = IndexBlock::new(Block { + let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), @@ -462,29 +463,29 @@ mod tests { }, }); - assert_eq!(data_block.len(), items.len()); + assert_eq!(index_block.len(), items.len()); assert_eq!( Some(items.first().unwrap().clone()), - data_block.get_highest_possible_block(b"a") + index_block.get_highest_possible_block(b"a") ); assert_eq!( Some(items.get(1).unwrap().clone()), - data_block.get_highest_possible_block(b"abc") + index_block.get_highest_possible_block(b"abc") ); assert_eq!( Some(items.last().unwrap().clone()), - data_block.get_highest_possible_block(b"c") + index_block.get_highest_possible_block(b"c") ); assert_eq!( Some(items.last().unwrap().clone()), - data_block.get_highest_possible_block(b"cef") + index_block.get_highest_possible_block(b"cef") ); assert_eq!( Some(items.last().unwrap().clone()), - data_block.get_highest_possible_block(b"d") + index_block.get_highest_possible_block(b"d") ); - assert_eq!(None, data_block.get_highest_possible_block(b"zzz")); + assert_eq!(None, index_block.get_highest_possible_block(b"zzz")); Ok(()) } @@ -498,7 +499,7 @@ mod tests { // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); 
*/ - let data_block = IndexBlock::new(Block { + let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), @@ -508,23 +509,23 @@ mod tests { }, }); - assert_eq!(data_block.len(), 1); + assert_eq!(index_block.len(), 1); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"a") + index_block.get_lowest_possible_block(b"a") ); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"asdasd") + index_block.get_lowest_possible_block(b"asdasd") ); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"b") + index_block.get_lowest_possible_block(b"b") ); - assert_eq!(Some(item), data_block.get_lowest_possible_block(b"c")); - assert_eq!(None, data_block.get_lowest_possible_block(b"d")); - assert_eq!(None, data_block.get_lowest_possible_block(b"z")); + assert_eq!(Some(item), index_block.get_lowest_possible_block(b"c")); + assert_eq!(None, index_block.get_lowest_possible_block(b"d")); + assert_eq!(None, index_block.get_lowest_possible_block(b"z")); Ok(()) } @@ -538,7 +539,7 @@ mod tests { // eprintln!("{}", String::from_utf8_lossy(&bytes)); /* eprintln!("encoded into {} bytes", bytes.len()); */ - let data_block = IndexBlock::new(Block { + let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { checksum: Checksum::from_raw(0), @@ -548,23 +549,23 @@ mod tests { }, }); - assert_eq!(data_block.len(), 1); + assert_eq!(index_block.len(), 1); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"a") + index_block.get_lowest_possible_block(b"a") ); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"asdasd") + index_block.get_lowest_possible_block(b"asdasd") ); assert_eq!( Some(item.clone()), - data_block.get_lowest_possible_block(b"b") + index_block.get_lowest_possible_block(b"b") ); - assert_eq!(Some(item), data_block.get_lowest_possible_block(b"c")); - assert_eq!(None, data_block.get_lowest_possible_block(b"d")); - assert_eq!(None, data_block.get_lowest_possible_block(b"z")); + assert_eq!(Some(item), index_block.get_lowest_possible_block(b"c")); + assert_eq!(None, index_block.get_lowest_possible_block(b"d")); + assert_eq!(None, index_block.get_lowest_possible_block(b"z")); Ok(()) } From 661b0671d3b47323a8ac8360d1e5844620f47e9b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:31:21 +0200 Subject: [PATCH 176/613] refactor: data block binary search --- src/segment/data_block/mod.rs | 160 ++++++++++++++++++++++------------ 1 file changed, 105 insertions(+), 55 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 0fa9f285..441d5312 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -15,8 +15,7 @@ use crate::clipping_iter::ClippingIter; use crate::{InternalValue, SeqNo, ValueType}; use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; -use forward_reader::ForwardReader; -use iter::{ParsedItem, ParsedSlice}; +use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; use std::io::Seek; use std::ops::RangeBounds; use std::{cmp::Reverse, io::Cursor}; @@ -308,8 +307,13 @@ impl DataBlock { &self, binary_index: &BinaryIndexReader, needle: &[u8], - seqno: Option, + seqno: SeqNo, ) -> Option { + debug_assert!( + binary_index.len() >= 1, + "binary index should never be empty", + ); + let mut left: usize = 0; let mut right = binary_index.len(); @@ -317,69 +321,57 @@ impl DataBlock { return None; } - if let Some(seqno) = seqno { - let 
seqno_cmp = Reverse(seqno - 1); - - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if self.get_key_at(offset) <= (needle, seqno_cmp) { - left = mid + 1; - } else { - right = mid; - } - } - - if left == 0 { - return Some(0); - } + let seqno_cmp = Reverse(seqno - 1); - let offset = binary_index.get(left - 1); + while left < right { + let mid = (left + right) / 2; - Some(offset) - } else if self.restart_interval == 1 { - while left < right { - let mid = (left + right) / 2; + let offset = binary_index.get(mid); - let offset = binary_index.get(mid); + let (head_key, head_seqno) = self.get_key_at(offset); - if self.get_key_at(offset).0 < needle { + match head_key.cmp(needle) { + std::cmp::Ordering::Less => { left = mid + 1; - } else { - right = mid; } - } - - Some(if left == 0 { - binary_index.get(0) - } else if left < binary_index.len() { - binary_index.get(left) - } else { - binary_index.get(binary_index.len() - 1) - }) - } else { - while left < right { - let mid = (left + right) / 2; + std::cmp::Ordering::Equal => match head_seqno.cmp(&seqno_cmp) { + std::cmp::Ordering::Less => { + left = mid + 1; + } + std::cmp::Ordering::Equal => { + left = mid; + right = mid; + } + std::cmp::Ordering::Greater => { + right = mid; + } + }, + std::cmp::Ordering::Greater => { + // NOTE: If we are at the first restart interval head + // and its key is larger than the requested key, the key cannot be possibly contained in the block + // + // Block + // [b... c... d... e...] + // + // ^ + // needle = "a" + // + if mid == 0 { + return None; + } - let offset = binary_index.get(mid); - - if self.get_key_at(offset).0 < needle { - left = mid + 1; - } else { right = mid; } } + } - Some(if left == 0 { - binary_index.get(0) - } else if left < binary_index.len() { - binary_index.get(left - 1) - } else { - binary_index.get(binary_index.len() - 1) - }) + if left == 0 { + return Some(0); } + + let offset = binary_index.get(left - 1); + + Some(offset) } fn parse_restart_item(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { @@ -476,7 +468,7 @@ impl DataBlock { } #[must_use] - pub fn point_read(&self, needle: &[u8], seqno: Option) -> Option { + pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { let mut reader = ForwardReader::new(self); reader.point_read(needle, seqno) } @@ -506,3 +498,61 @@ impl DataBlock { serializer.finish() } } + +#[cfg(test)] +mod tests { + use crate::{ + segment::{block::Header, Block, BlockOffset, DataBlock}, + Checksum, InternalValue, SeqNo, + ValueType::{Tombstone, Value}, + }; + use test_log::test; + + #[test] + fn v3_data_block_binary_search() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let binary_index = data_block.get_binary_index_reader(); + + assert!( + data_block + .binary_search_for_offset(&binary_index, b"a", SeqNo::MAX) + .is_none(), + "should return None because a is less than min key", + ); + + assert!( + data_block + .binary_search_for_offset(&binary_index, b"b", SeqNo::MAX) + 
.is_some(), + "should return Some because b exists", + ); + + assert!( + data_block + .binary_search_for_offset(&binary_index, b"z", SeqNo::MAX) + .is_some(), + "should return Some because z may be in last restart interval", + ); + + Ok(()) + } +} From 030b99dea0126810f1d9ac07fa8eecea30335d61 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:31:53 +0200 Subject: [PATCH 177/613] expose full block index' index block --- src/segment/block_index/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 4070ceb1..94c950cb 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -65,6 +65,10 @@ impl FullBlockIndex { ) -> Option + '_> { self.0.forward_reader(needle) } + + pub fn inner(&self) -> &IndexBlock { + &self.0 + } } impl BlockIndex for FullBlockIndex { From 3cdabe24534c257eac87040640159f2eb8899810 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:32:19 +0200 Subject: [PATCH 178/613] restore run reader --- src/run_reader.rs | 155 ++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 96 deletions(-) diff --git a/src/run_reader.rs b/src/run_reader.rs index 94a7ffc8..e5eb75d9 100644 --- a/src/run_reader.rs +++ b/src/run_reader.rs @@ -2,75 +2,76 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{segment::CachePolicy, version::Run, InternalValue, Segment}; -use std::{ops::Bound, sync::Arc}; +use crate::{segment::CachePolicy, version::Run, BoxedIterator, InternalValue, Segment, UserKey}; +use std::{ + ops::{Deref, RangeBounds}, + sync::Arc, +}; /// Reads through a disjoint run -pub struct LevelReader { - segments: Arc>, +pub struct RunReader { + run: Arc>, lo: usize, hi: usize, - lo_reader: Option<()>, // TODO: range - hi_reader: Option<()>, // TODO: range + lo_reader: Option>>>, + hi_reader: Option>>>, cache_policy: CachePolicy, } -impl LevelReader { +impl RunReader { #[must_use] - pub fn new( + pub fn new + Clone + 'static>( run: Arc>, - range: &(Bound<&[u8]>, Bound<&[u8]>), + range: R, cache_policy: CachePolicy, ) -> Option { assert!(!run.is_empty(), "level reader cannot read empty level"); - let (lo, hi) = run.range_indexes(*range)?; + let (lo, hi) = run.range_indexes(&range)?; Some(Self::culled(run, range, (Some(lo), Some(hi)), cache_policy)) } #[must_use] - pub fn culled( + pub fn culled + Clone + 'static>( run: Arc>, - range: &(Bound<&[u8]>, Bound<&[u8]>), + range: R, (lo, hi): (Option, Option), cache_policy: CachePolicy, ) -> Self { - todo!() - - /* let lo = lo.unwrap_or_default(); - let hi = hi.unwrap_or(level.len() - 1); + let lo = lo.unwrap_or_default(); + let hi = hi.unwrap_or(run.len() - 1); // TODO: lazily init readers? - let lo_segment = level.segments.get(lo).expect("should exist"); - let lo_reader = lo_segment.range(range.clone()).cache_policy(cache_policy); + let lo_segment = run.deref().get(lo).expect("should exist"); + let lo_reader = lo_segment.range(range.clone())/* .cache_policy(cache_policy) */; // TODO: lazily init readers? 
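        // NOTE: Only the two boundary segments need range readers: segments strictly
        // between `lo` and `hi` are fully contained in the range, so `next`/`next_back`
        // open them with a plain `iter()` once the cursors reach them.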
let hi_reader = if hi > lo { - let hi_segment = level.segments.get(hi).expect("should exist"); - Some(hi_segment.range(range.clone()).cache_policy(cache_policy)) + let hi_segment = run.deref().get(hi).expect("should exist"); + Some( + hi_segment.range(range), /* .cache_policy(cache_policy) */ + ) } else { None }; Self { - segments: level, + run, lo, hi, - lo_reader: Some(lo_reader), - hi_reader, + lo_reader: Some(Box::new(lo_reader)), + hi_reader: hi_reader.map(|x| Box::new(x) as BoxedIterator), cache_policy, - } */ + } } } -impl Iterator for LevelReader { +impl Iterator for RunReader { type Item = crate::Result; fn next(&mut self) -> Option { - todo!() - - /* loop { + loop { if let Some(lo_reader) = &mut self.lo_reader { if let Some(item) = lo_reader.next() { return Some(item); @@ -81,13 +82,9 @@ impl Iterator for LevelReader { self.lo += 1; if self.lo < self.hi { - self.lo_reader = Some( - self.segments - .get(self.lo) - .expect("should exist") - .iter() - .cache_policy(self.cache_policy), - ); + self.lo_reader = Some(Box::new( + self.run.get(self.lo).expect("should exist").iter(), + ) /* .cache_policy(self.cache_policy) */); } } else if let Some(hi_reader) = &mut self.hi_reader { // NOTE: We reached the hi marker, so consume from it instead @@ -97,15 +94,13 @@ impl Iterator for LevelReader { } else { return None; } - } */ + } } } -impl DoubleEndedIterator for LevelReader { +impl DoubleEndedIterator for RunReader { fn next_back(&mut self) -> Option { - todo!() - - /* loop { + loop { if let Some(hi_reader) = &mut self.hi_reader { if let Some(item) = hi_reader.next_back() { return Some(item); @@ -116,13 +111,9 @@ impl DoubleEndedIterator for LevelReader { self.hi -= 1; if self.lo < self.hi { - self.hi_reader = Some( - self.segments - .get(self.hi) - .expect("should exist") - .iter() - .cache_policy(self.cache_policy), - ); + self.hi_reader = Some(Box::new( + self.run.get(self.hi).expect("should exist").iter(), + ) /* .cache_policy(self.cache_policy) */); } } else if let Some(lo_reader) = &mut self.lo_reader { // NOTE: We reached the lo marker, so consume from it instead @@ -132,7 +123,7 @@ impl DoubleEndedIterator for LevelReader { } else { return None; } - } */ + } } } @@ -141,12 +132,10 @@ impl DoubleEndedIterator for LevelReader { mod tests { use super::*; use crate::{AbstractTree, Slice}; - use std::ops::Bound::{Included, Unbounded}; use test_log::test; - // TODO: restore - /* #[test] - fn level_reader_skip() -> crate::Result<()> { + #[test] + fn v3_run_reader_skip() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; @@ -165,39 +154,30 @@ mod tests { } let segments = tree - .levels + .manifest .read() .expect("lock is poisoned") .iter() .cloned() .collect::>(); - let level = Arc::new(Level { - segments, - is_disjoint: true, - }); + let level = Arc::new(Run::new(segments)); - assert!(LevelReader::new( + assert!(RunReader::new( level.clone(), - &(Included(b"y".into()), Included(b"z".into())), + UserKey::from("y")..=UserKey::from("z"), CachePolicy::Read ) .is_none()); - assert!(LevelReader::new( - level.clone(), - &(Included(b"y".into()), Unbounded), - CachePolicy::Read - ) - .is_none()); + assert!(RunReader::new(level, UserKey::from("y").., CachePolicy::Read).is_none()); Ok(()) - } */ + } - // TODO: restore - /* #[test] + #[test] #[allow(clippy::unwrap_used)] - fn level_reader_basic() -> crate::Result<()> { + fn v3_run_reader_basic() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = 
crate::Config::new(&tempdir).open()?; @@ -216,22 +196,17 @@ mod tests { } let segments = tree - .levels + .manifest .read() .expect("lock is poisoned") .iter() .cloned() .collect::>(); - let level = Arc::new(Level { - segments, - is_disjoint: true, - }); + let level = Arc::new(Run::new(segments)); { - let multi_reader = - LevelReader::new(level.clone(), &(Unbounded, Unbounded), CachePolicy::Read) - .unwrap(); + let multi_reader = RunReader::new(level.clone(), .., CachePolicy::Read).unwrap(); let mut iter = multi_reader.flatten(); @@ -250,9 +225,7 @@ mod tests { } { - let multi_reader = - LevelReader::new(level.clone(), &(Unbounded, Unbounded), CachePolicy::Read) - .unwrap(); + let multi_reader = RunReader::new(level.clone(), .., CachePolicy::Read).unwrap(); let mut iter = multi_reader.rev().flatten(); @@ -271,9 +244,7 @@ mod tests { } { - let multi_reader = - LevelReader::new(level.clone(), &(Unbounded, Unbounded), CachePolicy::Read) - .unwrap(); + let multi_reader = RunReader::new(level.clone(), .., CachePolicy::Read).unwrap(); let mut iter = multi_reader.flatten(); @@ -292,12 +263,8 @@ mod tests { } { - let multi_reader = LevelReader::new( - level.clone(), - &(Included(b"g".into()), Unbounded), - CachePolicy::Read, - ) - .unwrap(); + let multi_reader = + RunReader::new(level.clone(), UserKey::from("g").., CachePolicy::Read).unwrap(); let mut iter = multi_reader.flatten(); @@ -310,12 +277,8 @@ mod tests { } { - let multi_reader = LevelReader::new( - level, - &(Included(b"g".into()), Unbounded), - CachePolicy::Read, - ) - .unwrap(); + let multi_reader = + RunReader::new(level, UserKey::from("g").., CachePolicy::Read).unwrap(); let mut iter = multi_reader.flatten().rev(); @@ -328,5 +291,5 @@ mod tests { } Ok(()) - } */ + } } From 8cf696a2fc437fdf03d3a92824e9fed1598e35b4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:33:18 +0200 Subject: [PATCH 179/613] index block iterator --- src/segment/data_block/forward_reader.rs | 179 ++++++--- src/segment/data_block/iter.rs | 459 +++++++++++++++++++--- src/segment/index_block/forward_reader.rs | 14 +- src/segment/index_block/iter.rs | 383 ++++++++++++++++++ src/segment/index_block/mod.rs | 12 +- 5 files changed, 945 insertions(+), 102 deletions(-) create mode 100644 src/segment/index_block/iter.rs diff --git a/src/segment/data_block/forward_reader.rs b/src/segment/data_block/forward_reader.rs index 1db08462..09c9e7ad 100644 --- a/src/segment/data_block/forward_reader.rs +++ b/src/segment/data_block/forward_reader.rs @@ -2,10 +2,51 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{iter::ParsedItem, DataBlock}; -use crate::{segment::util::compare_prefixed_slice, InternalValue, SeqNo}; +use super::DataBlock; +use crate::{key::InternalKey, segment::util::compare_prefixed_slice, InternalValue, SeqNo, Slice}; use std::io::{Cursor, Seek}; +/// [start, end] slice indexes +#[derive(Debug)] +pub struct ParsedSlice(pub usize, pub usize); + +impl ParsedItem { + pub fn materialize(&self, bytes: &Slice) -> InternalValue { + // NOTE: We consider the prefix and key slice indexes to be trustworthy + #[allow(clippy::indexing_slicing)] + let key = if let Some(prefix) = &self.prefix { + let prefix_key = &bytes[prefix.0..prefix.1]; + let rest_key = &bytes[self.key.0..self.key.1]; + Slice::fused(prefix_key, rest_key) + } else { + bytes.slice(self.key.0..self.key.1) + }; + let key = InternalKey::new( + key, + self.seqno, + // NOTE: Value type is (or should be) 
checked when reading it + #[allow(clippy::expect_used)] + self.value_type.try_into().expect("should work"), + ); + + let value = self + .value + .as_ref() + .map_or_else(Slice::empty, |v| bytes.slice(v.0..v.1)); + + InternalValue { key, value } + } +} + +#[derive(Debug)] +pub struct ParsedItem { + pub value_type: u8, + pub seqno: SeqNo, + pub prefix: Option, + pub key: ParsedSlice, + pub value: Option, +} + // TODO: flatten into main struct #[derive(Default, Debug)] struct LoScanner { @@ -44,7 +85,7 @@ impl<'a> ForwardReader<'a> { /// Reads an item by key from the block, if it exists. #[must_use] - pub fn point_read(&mut self, needle: &[u8], seqno: Option) -> Option { + pub fn point_read(&mut self, needle: &[u8], seqno: SeqNo) -> Option { let may_exist = self.seek(needle, seqno); if !may_exist { @@ -65,10 +106,7 @@ impl<'a> ForwardReader<'a> { match cmp_result { std::cmp::Ordering::Equal => { - // TODO: maybe return early if past seqno - let should_skip = seqno.is_some_and(|watermark| item.seqno >= watermark); - - if !should_skip { + if item.seqno < seqno { let kv = item.materialize(&self.block.inner.data); return Some(kv); } @@ -91,7 +129,7 @@ impl<'a> ForwardReader<'a> { /// /// Returns `false` if `next()` can be safely skipped because the item definitely /// does not exist. - pub fn seek(&mut self, needle: &[u8], seqno: Option) -> bool { + pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool { let binary_index = self.block.get_binary_index_reader(); // NOTE: Try hash index if it exists @@ -118,17 +156,19 @@ impl<'a> ForwardReader<'a> { } } - let offset = self + let Some(offset) = self .block .binary_search_for_offset(&binary_index, needle, seqno) - .expect("should work"); + else { + return false; + }; self.lo_scanner.offset = offset; self.linear_probe(needle, seqno) } - fn linear_probe(&mut self, needle: &[u8], seqno: Option /* TODO: use */) -> bool { + fn linear_probe(&mut self, needle: &[u8], seqno: SeqNo /* TODO: use */) -> bool { let bytes = self.block.bytes(); // SAFETY: The cursor is advanced by read_ operations which check for EOF, @@ -286,6 +326,46 @@ mod tests { }; use test_log::test; + #[test] + fn v3_data_block_seek_too_low() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert!( + data_block.point_read(b"a", SeqNo::MAX).is_none(), + "should return None because a does not exist", + ); + + assert!( + data_block.point_read(b"b", SeqNo::MAX).is_some(), + "should return Some because b exists", + ); + + assert!( + data_block.point_read(b"z", SeqNo::MAX).is_none(), + "should return Some because z does not exist", + ); + + Ok(()) + } + #[test] fn v3_data_block_snapshot_read_first() -> crate::Result<()> { let items = [InternalValue::from_components( @@ -312,10 +392,7 @@ mod tests { assert!(!data_block.is_empty()); assert_eq!(data_block.inner.size(), serialized_len); - assert_eq!( - Some(items[0].clone()), - data_block.point_read(b"hello", Some(777)) - ); + assert_eq!(Some(items[0].clone()), data_block.point_read(b"hello", 777)); Ok(()) 
} @@ -349,11 +426,11 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None), + data_block.point_read(&needle.key.user_key, SeqNo::MAX), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -422,11 +499,11 @@ mod tests { for needle in &items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); } Ok(()) @@ -457,11 +534,11 @@ mod tests { for needle in items { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -495,11 +572,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -635,11 +712,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, None), + data_block.point_read(&needle.key.user_key, SeqNo::MAX), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -673,11 +750,11 @@ mod tests { assert_eq!( Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), ); } - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -709,13 +786,13 @@ mod tests { assert_eq!( Some(items.first().cloned().unwrap()), - data_block.point_read(b"a", None) + data_block.point_read(b"a", SeqNo::MAX) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(b"b", None) + data_block.point_read(b"b", SeqNo::MAX) ); - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -750,9 +827,9 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None) + data_block.point_read(&[233, 233], SeqNo::MAX) ); - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -795,13 +872,13 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], None) + data_block.point_read(&[233, 233], SeqNo::MAX) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], None) + data_block.point_read(&[255, 255, 0], SeqNo::MAX) ); - assert_eq!(None, data_block.point_read(b"yyy", None)); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); Ok(()) } @@ -844,13 +921,13 @@ mod tests { assert_eq!( Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], Some(SeqNo::MAX)) + data_block.point_read(&[233, 233], SeqNo::MAX) ); assert_eq!( Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], 
Some(SeqNo::MAX))
+            data_block.point_read(&[255, 255, 0], SeqNo::MAX)
         );
-        assert_eq!(None, data_block.point_read(b"yyy", None));
+        assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX));
 
         Ok(())
     }
@@ -893,13 +970,13 @@ mod tests {
 
         assert_eq!(
             Some(items.get(1).cloned().unwrap()),
-            data_block.point_read(&[233, 233], None)
+            data_block.point_read(&[233, 233], SeqNo::MAX)
         );
         assert_eq!(
             Some(items.last().cloned().unwrap()),
-            data_block.point_read(&[255, 255, 0], None)
+            data_block.point_read(&[255, 255, 0], SeqNo::MAX)
         );
-        assert_eq!(None, data_block.point_read(b"yyy", None));
+        assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX));
 
         Ok(())
     }
@@ -933,11 +1010,11 @@ mod tests {
 
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
-        assert_eq!(None, data_block.point_read(b"yyy", None));
+        assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX));
 
         Ok(())
     }
@@ -968,7 +1045,7 @@ mod tests {
         assert!(data_block.hash_bucket_count().unwrap() > 0);
 
         assert!(data_block
-            .point_read(b"pla:venus:fact", None)
+            .point_read(b"pla:venus:fact", SeqNo::MAX)
             .expect("should exist")
             .is_tombstone());
 
@@ -1008,11 +1085,11 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
            );
        }
 
-        assert_eq!(None, data_block.point_read(b"yyy", None));
+        assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX));
 
         Ok(())
     }
@@ -1328,7 +1405,7 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
@@ -1359,7 +1436,7 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
@@ -1390,7 +1467,7 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
@@ -1421,7 +1498,7 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
@@ -1452,7 +1529,7 @@ mod tests {
         for needle in items {
             assert_eq!(
                 Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)),
+                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
             );
         }
 
diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs
index fe800c9a..a0d52863 100644
--- a/src/segment/data_block/iter.rs
+++ b/src/segment/data_block/iter.rs
@@ -2,15 +2,17 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
 
-use super::{forward_reader::ForwardReader, DataBlock};
-use crate::{key::InternalKey, InternalValue, SeqNo, Slice};
+use super::{
+    forward_reader::{ForwardReader, ParsedItem},
+    DataBlock,
+};
 use std::io::Cursor;
 
 #[derive(Debug)]
 struct HiScanner {
     offset: usize,
     ptr_idx: usize,
-    stack: Vec<usize>,
+    stack: Vec<usize>, // TODO: SmallVec?
     base_key_offset: Option<usize>,
 }
 
@@ -23,47 +25,6 @@ pub struct Iter<'a> {
     hi_scanner: HiScanner,
 }
 
-/// [start, end] slice indexes
-#[derive(Debug)]
-pub struct ParsedSlice(pub usize, pub usize);
-
-#[derive(Debug)]
-pub struct ParsedItem {
-    pub value_type: u8,
-    pub seqno: SeqNo,
-    pub prefix: Option<ParsedSlice>,
-    pub key: ParsedSlice,
-    pub value: Option<ParsedSlice>,
-}
-
-impl ParsedItem {
-    pub fn materialize(&self, bytes: &Slice) -> InternalValue {
-        // NOTE: We consider the prefix and key slice indexes to be trustworthy
-        #[allow(clippy::indexing_slicing)]
-        let key = if let Some(prefix) = &self.prefix {
-            let prefix_key = &bytes[prefix.0..prefix.1];
-            let rest_key = &bytes[self.key.0..self.key.1];
-            Slice::fused(prefix_key, rest_key)
-        } else {
-            bytes.slice(self.key.0..self.key.1)
-        };
-        let key = InternalKey::new(
-            key,
-            self.seqno,
-            // NOTE: Value type is (or should be) checked when reading it
-            #[allow(clippy::expect_used)]
-            self.value_type.try_into().expect("should work"),
-        );
-
-        let value = self
-            .value
-            .as_ref()
-            .map_or_else(Slice::empty, |v| bytes.slice(v.0..v.1));
-
-        InternalValue { key, value }
-    }
-}
-
 impl<'a> Iter<'a> {
     #[must_use]
     pub fn new(block: &'a DataBlock) -> Self {
@@ -208,6 +169,12 @@ impl DoubleEndedIterator for Iter<'_> {
             return Some(top);
         }
 
+        // NOTE: If the pointer index already wrapped around in a previous call,
+        // we are at the end
+        if self.hi_scanner.ptr_idx == usize::MAX {
+            return None;
+        }
+
         self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1);
 
         // NOTE: If we wrapped, we are at the end
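The new guard depends on `usize` wrap-around acting as an end sentinel: `ptr_idx` can only become `usize::MAX` by decrementing past restart index 0, and `usize::MAX` itself can never be a valid restart index. A minimal, self-contained sketch of that invariant (illustrative, not code from this diff):

    fn main() {
        let mut ptr_idx = 0_usize;
        ptr_idx = ptr_idx.wrapping_sub(1); // decrement past the first restart index
        assert_eq!(ptr_idx, usize::MAX);   // sentinel: reverse iteration is exhausted
    }
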
@@ -392,4 +359,408 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn v3_data_block_ping_pong_exhaust() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components("a", "a", 0, Value),
+            InternalValue::from_components("b", "b", 0, Value),
+            InternalValue::from_components("c", "c", 0, Value),
+            InternalValue::from_components("d", "d", 0, Value),
+            InternalValue::from_components("e", "e", 0, Value),
+        ];
+
+        for restart_interval in 1..=u8::MAX {
+            let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?;
+
+            let data_block = DataBlock::new(Block {
+                data: bytes.into(),
+                header: Header {
+                    checksum: Checksum::from_raw(0),
+                    data_length: 0,
+                    uncompressed_length: 0,
+                    previous_block_offset: BlockOffset(0),
+                },
+            });
+
+            assert_eq!(data_block.len(), items.len());
+            assert!(data_block.hash_bucket_count().is_none());
+
+            {
+                let mut iter = data_block.iter();
+                assert_eq!(b"a", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"b", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"c", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"d", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"e", &*iter.next().unwrap().key.user_key);
+                assert!(iter.next().is_none());
+                assert!(iter.next().is_none());
+            }
+
+            {
+                let mut iter = data_block.iter();
+                assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"b", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"a", &*iter.next_back().unwrap().key.user_key);
+                assert!(iter.next_back().is_none());
+                assert!(iter.next_back().is_none());
+            }
+
+            {
+                let mut iter = data_block.iter();
+                assert_eq!(b"a", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"b", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"c", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"d", &*iter.next().unwrap().key.user_key);
+                assert_eq!(b"e", &*iter.next().unwrap().key.user_key);
+                assert!(iter.next_back().is_none());
+                assert!(iter.next_back().is_none());
+                assert!(iter.next().is_none());
+                assert!(iter.next().is_none());
+            }
+
+            {
+                let mut iter = data_block.iter();
+                assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"b", &*iter.next_back().unwrap().key.user_key);
+                assert_eq!(b"a", &*iter.next_back().unwrap().key.user_key);
+                assert!(iter.next().is_none());
+                assert!(iter.next().is_none());
+                assert!(iter.next_back().is_none());
+                assert!(iter.next_back().is_none());
+            }
+        }
+
+        Ok(())
+    }
+
+    /* #[test]
+    fn v3_data_block_ping_pongs() -> crate::Result<()> {
+        use crate::{UserKey, UserValue};
+
+        pub struct BinaryCodeIterator {
+            length: usize,
+            current_number: u128, // Use u128 to support lengths up to 128 bits
+            max_number: u128,
+        }
+
+        impl BinaryCodeIterator {
+            /// Creates a new iterator for all binary codes of a given length.
+            ///
+            /// # Panics
+            /// Panics if `length` is greater than 128, as `u128` cannot hold
+            /// numbers with more than 128 bits.
+            pub fn new(length: usize) -> Self {
+                if length > 128 {
+                    panic!("Length too large for u128 to represent all combinations.");
+                }
+                let max_number = if length == 0 {
+                    0 // Special case for length 0, only one combination (empty vector)
+                } else {
+                    (1 << length) - 1 // 2^len - 1 is the maximum value for a 'len'-bit number
+                };
+                BinaryCodeIterator {
+                    length,
+                    current_number: 0,
+                    max_number,
+                }
+            }
+        }
+
+        impl Iterator for BinaryCodeIterator {
+            // The iterator will yield Vec<u8> where each u8 is either 0 or 1.
+            type Item = Vec<u8>;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                if self.current_number > self.max_number {
+                    return None; // All codes have been generated
+                }
+
+                // Convert the current_number into a binary Vec<u8>
+                let mut code = Vec::with_capacity(self.length);
+                if self.length == 0 {
+                    // For length 0, only one item: an empty vector
+                    // We've handled max_number=0 already, so this will only run once.
+                } else {
+                    // Iterate from the least significant bit (LSB) to the most significant bit (MSB)
+                    // or from MSB to LSB depending on desired order.
+ // This implementation generates from MSB to LSB to match typical binary representation + // e.g., 0b101 -> [1, 0, 1] + for i in (0..self.length).rev() { + // Check if the i-th bit is set + if (self.current_number >> i) & 1 == 1 { + code.push(1); + } else { + code.push(0); + } + } + } + + + // Increment for the next iteration + self.current_number += 1; + + Some(code) + } + } + + let items = [ + InternalValue::from_components(UserKey::from([22, 192]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 193]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 194]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 195]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 196]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 197]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 198]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 199]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 200]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 201]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 202]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 203]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 204]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 205]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 206]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 207]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 208]), UserValue::from([]), 0, Value), + InternalValue::from_components(UserKey::from([22, 209]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 210]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 211]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 212]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 213]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 214]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 215]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 216]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 217]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 218]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 219]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 220]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 221]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 222]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 
0, 0, 0, 22, 223]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 224]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 225]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 226]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 227]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 228]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 229]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 230]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 231]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 232]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 233]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 234]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 235]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 236]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 237]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 238]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 239]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 240]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 241]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 242]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 243]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 244]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 245]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 246]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 247]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 248]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 249]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 250]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 251]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 252]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 253]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 254]), UserValue::from([]), 0, Value), + // 
InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 255]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 0]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 1]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 2]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 3]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 4]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 5]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 6]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 7]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 8]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 9]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 10]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 11]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 12]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 13]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 14]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 15]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 16]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 17]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 18]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 19]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 20]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 21]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 22]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 23]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 24]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 25]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 26]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 27]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 28]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 29]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 30]), UserValue::from([]), 0, Value), + // 
InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 31]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 32]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 33]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 34]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 35]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 36]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 37]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 38]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 39]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 40]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 41]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 42]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 43]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 44]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 45]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 46]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 47]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 48]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 49]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 50]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 51]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 52]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 53]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 54]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 55]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 56]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 57]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 58]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 59]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 60]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 61]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 62]), UserValue::from([]), 0, Value), + // 
InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 63]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 64]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 65]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 66]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 67]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 68]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 69]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 70]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 71]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 72]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 73]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 74]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 75]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 76]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 77]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 78]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 79]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 80]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 81]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 82]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 83]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 84]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 85]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 86]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 87]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 88]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 89]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 90]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 91]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 92]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 93]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 94]), UserValue::from([]), 0, Value), + // 
InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 95]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 96]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 97]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 98]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 99]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 100]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 101]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 102]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 103]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 104]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 105]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 106]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 107]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 108]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 109]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 110]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 111]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 112]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 113]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 114]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 115]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 116]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 117]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 118]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 119]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 120]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 121]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 122]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 123]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 124]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 125]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 126]), 
UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 127]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 128]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 129]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 130]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 131]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 132]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 133]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 134]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 135]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 136]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 137]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 138]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 139]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 140]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 141]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 142]), UserValue::from([]), 0, Value), + // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 23, 143]), UserValue::from([]), 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for code in BinaryCodeIterator::new(items.len()) { + let mut iter = data_block.iter(); + + for &x in &code { + log::warn!("code: {code:?}"); + + if x % 2 == 0 { + eprintln!("[{x}] next"); + + let Some(_) = iter.next() else { + break; + }; + + // count += 1; + } else { + eprintln!("[{x}] next_back"); + + let Some(_) = iter.next_back() else { + break; + }; + + // count += 1; + } + } + } + + Ok(()) + } */ } diff --git a/src/segment/index_block/forward_reader.rs b/src/segment/index_block/forward_reader.rs index 9a71c90c..caccbca6 100644 --- a/src/segment/index_block/forward_reader.rs +++ b/src/segment/index_block/forward_reader.rs @@ -3,7 +3,10 @@ // (found in the LICENSE-* files in the repository) use super::{IndexBlock, KeyedBlockHandle}; -use crate::{segment::BlockOffset, Slice}; +use crate::{ + segment::{data_block::forward_reader::ParsedSlice, BlockOffset}, + Slice, +}; use std::io::Cursor; #[derive(Default, Debug)] @@ -23,10 +26,6 @@ pub struct ForwardReader<'a> { lo_scanner: LoScanner, } -/// [start, end] slice indexes -#[derive(Debug)] -pub struct ParsedSlice(pub usize, pub usize); - #[derive(Debug)] pub struct ParsedItem { pub offset: BlockOffset, @@ -69,6 +68,11 @@ impl<'a> ForwardReader<'a> { self } + 
#[must_use]
+    pub fn offset(&self) -> usize {
+        self.lo_scanner.offset
+    }
+
     fn parse_restart_item(
         block: &IndexBlock,
         offset: &mut usize,
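Both block iterators in this patch distinguish restart items, which store their key in full, from truncated items, which store only the key bytes after the prefix shared with the preceding restart key (`base_key_offset` points at that restart key). A minimal sketch of the rebuild step, assuming `shared_len` and `rest` were already decoded from the block (illustrative helper, not an API of this crate):

    fn rebuild_key(restart_key: &[u8], shared_len: usize, rest: &[u8]) -> Vec<u8> {
        // A truncated item borrows its first `shared_len` bytes from the restart key
        let mut key = Vec::with_capacity(shared_len + rest.len());
        key.extend_from_slice(&restart_key[..shared_len]);
        key.extend_from_slice(rest);
        key
    }

`ParsedItem::materialize` in the forward reader performs the same fusing with zero-copy slices instead of a fresh allocation.
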
diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs
new file mode 100644
index 00000000..0793a4a8
--- /dev/null
+++ b/src/segment/index_block/iter.rs
@@ -0,0 +1,383 @@
+// Copyright (c) 2025-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use super::{
+    forward_reader::{ForwardReader, ParsedItem},
+    IndexBlock,
+};
+use std::io::Cursor;
+
+#[derive(Debug)]
+struct HiScanner {
+    offset: usize,
+    ptr_idx: usize,
+    stack: Vec<usize>, // TODO: SmallVec?
+    base_key_offset: Option<usize>,
+}
+
+/// Double-ended iterator over index blocks
+pub struct Iter<'a> {
+    block: &'a IndexBlock,
+    restart_interval: usize,
+
+    lo_scanner: ForwardReader<'a>,
+    hi_scanner: HiScanner,
+}
+
+impl<'a> Iter<'a> {
+    #[must_use]
+    pub fn new(block: &'a IndexBlock) -> Self {
+        let restart_interval = block.restart_interval.into();
+        let binary_index_len = block.binary_index_len as usize;
+
+        Self {
+            block,
+
+            restart_interval,
+
+            lo_scanner: ForwardReader::new(block),
+
+            /* lo_scanner: LoScanner::default(), */
+            hi_scanner: HiScanner {
+                offset: 0,
+                ptr_idx: binary_index_len,
+                stack: Vec::new(),
+                base_key_offset: None,
+            },
+        }
+    }
+
+    pub fn with_offset(mut self, offset: usize) -> Self {
+        self.lo_scanner = self.lo_scanner.with_offset(offset);
+        self
+    }
+
+    fn parse_restart_item(
+        block: &IndexBlock,
+        offset: &mut usize,
+        base_key_offset: &mut Option<usize>,
+    ) -> Option<ParsedItem> {
+        let bytes = block.bytes();
+
+        // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+        // And the cursor starts at 0 - the slice is never empty
+        #[warn(unsafe_code)]
+        let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) });
+
+        let item = IndexBlock::parse_restart_item(&mut reader, *offset)?;
+
+        *offset += reader.position() as usize;
+        *base_key_offset = Some(item.end_key.0);
+
+        Some(item)
+    }
+
+    fn parse_truncated_item(
+        block: &IndexBlock,
+        offset: &mut usize,
+        base_key_offset: usize,
+    ) -> Option<ParsedItem> {
+        let bytes = block.bytes();
+
+        // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+        // And the cursor starts at 0 - the slice is never empty
+        #[warn(unsafe_code)]
+        let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) });
+
+        let item = IndexBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?;
+
+        *offset += reader.position() as usize;
+
+        Some(item)
+    }
+
+    fn consume_stack_top(&mut self) -> Option<ParsedItem> {
+        if let Some(offset) = self.hi_scanner.stack.pop() {
+            if self.lo_scanner.offset() > 0 && offset < self.lo_scanner.offset() {
+                return None;
+            }
+
+            self.hi_scanner.offset = offset;
+
+            let is_restart = self.hi_scanner.stack.is_empty();
+
+            if is_restart {
+                Self::parse_restart_item(
+                    self.block,
+                    &mut self.hi_scanner.offset,
+                    &mut self.hi_scanner.base_key_offset,
+                )
+            } else {
+                Self::parse_truncated_item(
+                    self.block,
+                    &mut self.hi_scanner.offset,
+                    self.hi_scanner.base_key_offset.expect("should exist"),
+                )
+            }
+        } else {
+            None
+        }
+    }
+}
+
+impl Iterator for Iter<'_> {
+    type Item = ParsedItem;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.hi_scanner.base_key_offset.is_some()
+            && self.lo_scanner.offset() >= self.hi_scanner.offset
+        {
+            return None;
+        }
+
+        /* let is_restart = self.lo_scanner.remaining_in_interval == 0;
+
+        let item = if is_restart {
+            self.lo_scanner.remaining_in_interval = self.restart_interval;
+
+            Self::parse_restart_item(
+                self.block,
+                &mut self.lo_scanner.offset,
+                &mut self.lo_scanner.base_key_offset,
+            )
+        } else {
+            Self::parse_truncated_item(
+                self.block,
+                &mut self.lo_scanner.offset,
+                self.lo_scanner.base_key_offset.expect("should exist"),
+            )
+        };
+
+        self.lo_scanner.remaining_in_interval -= 1; */
+
+        let item = self.lo_scanner.next();
+
+        if self.hi_scanner.base_key_offset.is_some()
+            && self.lo_scanner.offset() >= self.hi_scanner.offset
+        {
+            return None;
+        }
+
+        item
+    }
+}
+
+impl DoubleEndedIterator for Iter<'_> {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        if let Some(top) = self.consume_stack_top() {
+            return Some(top);
+        }
+
+        // NOTE: If the pointer index already wrapped around in a previous call,
+        // we are at the end
+        if self.hi_scanner.ptr_idx == usize::MAX {
+            return None;
+        }
+
+        self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1);
+
+        // NOTE: If we wrapped, we are at the end
+        // This is safe to do, because there cannot be that many restart intervals
+        if self.hi_scanner.ptr_idx == usize::MAX {
+            return None;
+        }
+
+        let binary_index = self.block.get_binary_index_reader();
+
+        {
+            self.hi_scanner.offset = binary_index.get(self.hi_scanner.ptr_idx);
+            let offset = self.hi_scanner.offset;
+
+            if Self::parse_restart_item(
+                self.block,
+                &mut self.hi_scanner.offset,
+                &mut self.hi_scanner.base_key_offset,
+            )
+            .is_some()
+            {
+                self.hi_scanner.stack.push(offset);
+            }
+        }
+
+        for _ in 1..self.restart_interval {
+            let offset = self.hi_scanner.offset;
+
+            if Self::parse_truncated_item(
+                self.block,
+                &mut self.hi_scanner.offset,
+                self.hi_scanner.base_key_offset.expect("should exist"),
+            )
+            .is_some()
+            {
+                self.hi_scanner.stack.push(offset);
+            }
+        }
+
+        if self.hi_scanner.stack.is_empty() {
+            return None;
+        }
+
+        self.consume_stack_top()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        segment::{block::Header, Block, BlockOffset, KeyedBlockHandle},
+        Checksum,
+    };
+    use test_log::test;
+
+    #[test]
+    #[allow(clippy::unwrap_used)]
+    fn v3_index_block_iter_simple() -> crate::Result<()> {
+        let items = [
+            KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000),
+            KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000),
+            KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000),
+        ];
+
+        let bytes = IndexBlock::encode_items(&items, 1)?;
+        //
eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(index_block.len(), items.len()); + assert_eq!(index_block.iter().count(), items.len()); + assert_eq!(index_block.iter().rev().count(), items.len()); + + { + let mut iter = index_block.iter(); + + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"def", &**iter.next().unwrap().end_key()); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + { + let mut iter = index_block.iter().rev(); + + assert_eq!(b"def", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + { + let mut iter = index_block.iter(); + + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"def", &**iter.next_back().unwrap().end_key()); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } + + { + let mut iter = index_block.iter().rev(); + + assert_eq!(b"def", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"b", &**iter.next_back().unwrap().end_key()); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_index_block_iter_exhaust() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items, 1)?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + /* eprintln!("encoded into {} bytes", bytes.len()); */ + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(index_block.len(), items.len()); + assert_eq!(index_block.iter().count(), items.len()); + assert_eq!(index_block.iter().rev().count(), items.len()); + + { + let mut iter = index_block.iter(); + + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"def", &**iter.next().unwrap().end_key()); + assert!(iter.next().is_none()); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + assert!(iter.next_back().is_none()); + } + + { + let mut iter = index_block.iter().rev(); + + assert_eq!(b"def", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert!(iter.next_back().is_none()); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + assert!(iter.next().is_none()); + } + + { + let mut iter = index_block.iter(); + + assert_eq!(b"b", &**iter.next().unwrap().end_key()); + assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); + assert_eq!(b"def", 
&**iter.next_back().unwrap().end_key());
+            assert!(iter.next().is_none());
+            assert!(iter.next().is_none());
+            assert!(iter.next_back().is_none());
+            assert!(iter.next_back().is_none());
+        }
+
+        {
+            let mut iter = index_block.iter().rev();
+
+            assert_eq!(b"def", &**iter.next().unwrap().end_key());
+            assert_eq!(b"bcdef", &**iter.next().unwrap().end_key());
+            assert_eq!(b"b", &**iter.next_back().unwrap().end_key());
+            assert!(iter.next_back().is_none());
+            assert!(iter.next_back().is_none());
+            assert!(iter.next().is_none());
+            assert!(iter.next().is_none());
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs
index 7b36a671..773690f0 100644
--- a/src/segment/index_block/mod.rs
+++ b/src/segment/index_block/mod.rs
@@ -4,16 +4,18 @@
 
 mod block_handle;
 mod forward_reader;
+mod iter;
 
 pub use block_handle::{BlockHandle, KeyedBlockHandle};
+pub use iter::Iter;
 
 use super::{
     block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer},
     Block,
 };
-use crate::segment::block::TRAILER_START_MARKER;
+use crate::segment::{block::TRAILER_START_MARKER, data_block::forward_reader::ParsedSlice};
 use byteorder::{LittleEndian, ReadBytesExt};
-use forward_reader::{ForwardReader, ParsedItem, ParsedSlice};
+use forward_reader::{ForwardReader, ParsedItem};
 use std::io::{Cursor, Seek};
 use varint_rs::VarintReader;
 
@@ -176,6 +178,12 @@ impl IndexBlock {
         })
     }
 
+    #[must_use]
+    #[allow(clippy::iter_without_into_iter)]
+    pub fn iter(&self) -> impl DoubleEndedIterator<Item = KeyedBlockHandle> + '_ {
+        Iter::new(self).map(|kv| kv.materialize(&self.inner.data))
+    }
+
     fn parse_truncated_item(
         reader: &mut Cursor<&[u8]>,
         offset: usize,

From 9cb6e3415e06f8430c3581dbccfae832f3c9b684 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 4 Jun 2025 20:33:38 +0200
Subject: [PATCH 180/613] full block index iterator

---
 src/segment/block_index/iter.rs | 35 +++++++++++++++++++++++++++++++++
 src/segment/block_index/mod.rs  |  2 ++
 2 files changed, 37 insertions(+)
 create mode 100644 src/segment/block_index/iter.rs

diff --git a/src/segment/block_index/iter.rs b/src/segment/block_index/iter.rs
new file mode 100644
index 00000000..970e9428
--- /dev/null
+++ b/src/segment/block_index/iter.rs
@@ -0,0 +1,35 @@
+// Copyright (c) 2025-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::segment::{IndexBlock, KeyedBlockHandle};
+use self_cell::self_cell;
+
+type BoxedIter<'a> = Box<dyn DoubleEndedIterator<Item = KeyedBlockHandle> + 'a>;
+
+self_cell!(
+    pub struct IndexBlockConsumer {
+        owner: IndexBlock,
+
+        #[covariant]
+        dependent: BoxedIter,
+    }
+);
+
+pub fn create_index_block_reader(block: IndexBlock) -> IndexBlockConsumer {
+    IndexBlockConsumer::new(block, |block| Box::new(block.iter()))
+}
+
+impl Iterator for IndexBlockConsumer {
+    type Item = KeyedBlockHandle;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.with_dependent_mut(|_, iter| iter.next())
+    }
+}
+
+impl DoubleEndedIterator for IndexBlockConsumer {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        self.with_dependent_mut(|_, iter| iter.next_back())
+    }
+}
diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs
index 94c950cb..46edd4b4 100644
--- a/src/segment/block_index/mod.rs
+++ b/src/segment/block_index/mod.rs
@@ -2,6 +2,8 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
 
+pub(crate) mod iter;
+
 use super::{CachePolicy, IndexBlock, KeyedBlockHandle};
 
 #[enum_dispatch::enum_dispatch]
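The `self_cell` construction above ties the borrowing iterator to its owning `IndexBlock`, so the pair can be moved around as one owned value. A minimal usage sketch, assuming an `IndexBlock` has already been decoded (illustrative, not taken from the patch itself):

    let reader = create_index_block_reader(index_block);

    // An owned, double-ended stream of KeyedBlockHandle:
    for handle in reader.rev() {
        // handle.offset() / handle.size() locate the referenced data block
    }
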
From c07d248ff6887c2b31aeaccf797ff8df9036d905 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 4 Jun 2025 20:33:54 +0200
Subject: [PATCH 181/613] optimize L0 after flush

---
 src/version/mod.rs | 57 +++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/src/version/mod.rs b/src/version/mod.rs
index 37500ca5..1b938581 100644
--- a/src/version/mod.rs
+++ b/src/version/mod.rs
@@ -33,10 +33,6 @@ impl<T> GenericLevel<T> {
         Self { runs }
     }
 
-    pub fn get_runs(&self) -> Vec<Arc<Run<T>>> {
-        self.runs.clone()
-    }
-
     pub fn segment_count(&self) -> usize {
         self.iter().map(|x| x.len()).sum()
     }
@@ -53,8 +49,8 @@ impl<T> GenericLevel<T> {
         self.runs.is_empty()
     }
 
-    pub fn iter(&self) -> impl Iterator<Item = &Run<T>> {
-        self.runs.iter().map(std::ops::Deref::deref)
+    pub fn iter(&self) -> impl Iterator<Item = &Arc<Run<T>>> {
+        self.runs.iter()
     }
 
     pub fn get_for_key<'a>(&'a self, key: &'a [u8]) -> impl Iterator {
@@ -115,7 +111,7 @@ impl Level {
     pub fn aggregate_key_range(&self) -> KeyRange {
         let key_ranges = self
             .iter()
-            .map(Run::aggregate_key_range)
+            .map(|x| Run::aggregate_key_range(x))
             .collect::<Vec<_>>();
 
         KeyRange::aggregate(key_ranges.iter())
@@ -141,11 +137,12 @@ impl std::ops::Deref for Version {
     }
 }
 
+// TODO: optimize runs unit test(s)
 pub fn optimize_runs(level: Vec<Run<Segment>>) -> Vec<Run<Segment>> {
     if level.len() <= 1 {
         level
     } else {
-        let mut key_range_boundaries: BTreeSet<UserKey> = BTreeSet::<UserKey>::default();
+        let mut key_range_boundaries: BTreeSet<UserKey> = BTreeSet::<UserKey>::default();
 
         for run in &level {
             for fragment in run.iter() {
@@ -155,21 +152,20 @@ pub fn optimize_runs(level: Vec<Run<Segment>>) -> Vec<Run<Segment>> {
             }
         }
 
-        let mut index = KeyRangePartitions::new(
-            key_range_boundaries
-                .into_iter()
-                .collect::<Vec<_>>()
-                .windows(2)
-                .map(|pair| {
-                    // NOTE: We are iterating over pairs, so index 0 and 1 always exist
-                    #[allow(clippy::expect_used)]
-                    #[allow(clippy::get_first)]
-                    (
-                        pair.get(0).expect("exists").clone(),
-                        pair.get(1).expect("exists").clone(),
-                    )
-                }),
-        );
+        let range_boundaries = key_range_boundaries
+            .into_iter()
+            .flat_map(|key| vec![key.clone(), key])
+            .collect::<Vec<_>>();
+
+        let mut index = KeyRangePartitions::new(range_boundaries.windows(2).map(|pair| {
+            // NOTE: We are iterating over pairs, so index 0 and 1 always exist
+            #[allow(clippy::expect_used)]
+            #[allow(clippy::get_first)]
+            (
+                pair.get(0).expect("exists").clone(),
+                pair.get(1).expect("exists").clone(),
+            )
+        }));
 
         // IMPORTANT: Index from bottom to top
         for run in level.iter().rev() {
@@ -236,13 +232,22 @@ impl Version {
             // Copy-on-write the first level with new run at top
 
             let l0 = self.levels.first().expect("L0 should always exist");
 
-            let prev_runs = l0.get_runs();
+            let prev_runs = l0
+                .runs
+                .iter()
+                .map(|run| {
+                    let run: Run<Segment> = run.deref().clone();
+                    run
+                })
+                .collect::<Vec<_>>();
 
             let mut runs = Vec::with_capacity(prev_runs.len() + 1);
-            runs.push(Arc::new(Run::new(run.to_vec())));
+            runs.push(Run::new(run.to_vec()));
             runs.extend(prev_runs);
 
-            Level::from_runs(runs)
+            let runs = optimize_runs(runs);
+
+            Level::from_runs(runs.into_iter().map(Arc::new).collect())
         });
 
         // L1+
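With this change, a flushed run is prepended to L0 and the level is immediately re-optimized instead of growing by one run per flush: `optimize_runs` collects all fragment key range boundaries, partitions the key space along them, and re-indexes the runs from bottom to top, so runs whose key ranges do not overlap can collapse into a single run. A rough worked example of the intended effect (illustrative): if the new run covers keys a..c and the existing top run covers d..f, the two are disjoint and the optimized L0 can hold one run spanning a..f; only genuinely overlapping key ranges keep forcing separate runs.
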
From 3085640866fc2c371a01fbaa09faee69cc14f6f5 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 4 Jun 2025 20:35:00 +0200
Subject: [PATCH 182/613] segment iter WIP

---
 src/segment/iter.rs          | 209 +++++++++++++++++++++
 src/segment/meta.rs          |  30 +--
 src/segment/mod.rs           | 355 ++++++++++++++++++++++++-----------
 src/segment/regions.rs       |  10 +-
 src/tree/mod.rs              |  33 ++--
 tests/major_compaction.rs    |   8 +-
 tests/segment_point_reads.rs |   4 +-
 tests/snapshot_point_read.rs |   2 +-
 tests/tree_write_read.rs     |  14 +-
 9 files changed, 508 insertions(+), 157 deletions(-)
 create mode 100644 src/segment/iter.rs

diff --git a/src/segment/iter.rs b/src/segment/iter.rs
new file mode 100644
index 00000000..5656e172
--- /dev/null
+++ b/src/segment/iter.rs
@@ -0,0 +1,209 @@
+// Copyright (c) 2025-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use super::{BlockOffset, DataBlock, GlobalSegmentId, KeyedBlockHandle};
+use crate::{
+    segment::{util::load_block, BlockHandle},
+    Cache, CompressionType, DescriptorTable, InternalValue,
+};
+use self_cell::self_cell;
+use std::{path::PathBuf, sync::Arc};
+
+type BoxedIter<'a> = Box<dyn DoubleEndedIterator<Item = InternalValue> + 'a>;
+
+self_cell!(
+    pub struct DataBlockConsumer {
+        owner: DataBlock,
+
+        #[covariant]
+        dependent: BoxedIter,
+    }
+);
+
+pub fn create_data_block_reader(block: DataBlock) -> DataBlockConsumer {
+    DataBlockConsumer::new(block, |block| Box::new(block.iter()))
+}
+
+impl Iterator for DataBlockConsumer {
+    type Item = InternalValue;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.with_dependent_mut(|_, iter| iter.next())
+    }
+}
+
+impl DoubleEndedIterator for DataBlockConsumer {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        self.with_dependent_mut(|_, iter| iter.next_back())
+    }
+}
+
+pub struct Iter<I>
+where
+    I: DoubleEndedIterator<Item = KeyedBlockHandle>,
+{
+    segment_id: GlobalSegmentId,
+    path: Arc<PathBuf>,
+
+    #[allow(clippy::struct_field_names)]
+    index_iter: I,
+    descriptor_table: Arc<DescriptorTable>,
+    cache: Arc<Cache>,
+    compression: CompressionType,
+
+    lo_offset: BlockOffset,
+    lo_data_block: Option<DataBlockConsumer>,
+
+    hi_offset: BlockOffset,
+    hi_data_block: Option<DataBlockConsumer>,
+}
+
+impl<I> Iter<I>
+where
+    I: DoubleEndedIterator<Item = KeyedBlockHandle>,
+{
+    pub fn new(
+        segment_id: GlobalSegmentId,
+        path: Arc<PathBuf>,
+        index_iter: I,
+        descriptor_table: Arc<DescriptorTable>,
+        cache: Arc<Cache>,
+        compression: CompressionType,
+    ) -> Self {
+        Self {
+            segment_id,
+            path,
+
+            index_iter,
+            descriptor_table,
+            cache,
+            compression,
+
+            lo_offset: BlockOffset(0),
+            lo_data_block: None,
+
+            hi_offset: BlockOffset(u64::MAX),
+            hi_data_block: None,
+        }
+    }
+}
+
+impl<I> Iterator for Iter<I>
+where
+    I: DoubleEndedIterator<Item = KeyedBlockHandle>,
+{
+    type Item = crate::Result<InternalValue>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(block) = &mut self.lo_data_block {
+            if let Some(item) = block.next().map(Ok) {
+                return Some(item);
+            }
+        }
+
+        let Some(handle) = self.index_iter.next() else {
+            // NOTE: No more block handles from index,
+            // Now check hi buffer if it exists
+            if let Some(block) = &mut self.hi_data_block {
+                if let Some(item) = block.next().map(Ok) {
+                    return Some(item);
+                }
+            }
+
+            // NOTE: If there is no more item, we are done
+            self.lo_data_block = None;
+            self.hi_data_block = None;
+            return None;
+        };
+
+        // NOTE: Load next lo block
+        #[allow(clippy::single_match_else)]
+        let block = match self.cache.get_block(self.segment_id, handle.offset()) {
+            Some(block) => block,
+            None => {
+                fail_iter!(load_block(
+                    self.segment_id,
+                    &self.path,
+                    &self.descriptor_table,
+                    &self.cache,
+                    &BlockHandle::new(handle.offset(), handle.size()),
+                    self.compression
+                ))
+            }
+        };
+        let block = DataBlock::new(block);
+
+        let mut reader = create_data_block_reader(block);
+
+        let item = reader.next();
+
+        self.lo_offset = handle.offset();
+        self.lo_data_block = Some(reader);
+
+        item.map(Ok)
+    }
+}
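Forward reads fill `lo_data_block` while backward reads fill `hi_data_block`; once `index_iter` is exhausted, each direction drains the opposite buffer, so interleaved calls meet in the middle without yielding duplicates. A minimal usage sketch (illustrative wiring; the surrounding segment code supplies the real arguments):

    let mut iter = Iter::new(id, path, index_iter, descriptor_table, cache, compression);

    let first = iter.next();      // pulls via the lo buffer
    let last = iter.next_back();  // pulls via the hi buffer
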
+ // NOTE: No more block handles from index, + // Now check lo buffer if it exists + if let Some(block) = &mut self.lo_data_block { + // eprintln!("=== lo block ==="); + + // for item in block.borrow_owner().iter() { + // eprintln!( + // r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + // item.key.user_key, item.value, item.key.seqno, item.key.value_type, + // ); + // } + + if let Some(item) = block.next_back().map(Ok) { + return Some(item); + } + } + + // NOTE: If there is no more item, we are done + self.lo_data_block = None; + self.hi_data_block = None; + return None; + }; + + // NOTE: Load next hi block + #[allow(clippy::single_match_else)] + let block = match self.cache.get_block(self.segment_id, handle.offset()) { + Some(block) => block, + None => { + fail_iter!(load_block( + self.segment_id, + &self.path, + &self.descriptor_table, + &self.cache, + &BlockHandle::new(handle.offset(), handle.size()), + self.compression + )) + } + }; + let block = DataBlock::new(block); + + let mut reader = create_data_block_reader(block); + + let item = reader.next_back(); + + self.hi_offset = handle.offset(); + self.hi_data_block = Some(reader); + + item.map(Ok) + } +} diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 6bd45c18..2a9fe3d1 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -54,7 +54,7 @@ impl ParsedMeta { assert_eq!( b"xxh3", &*block - .point_read(b"#hash_type", None) + .point_read(b"#hash_type", SeqNo::MAX) .expect("Segment ID should exist") .value, "invalid hash type", @@ -63,7 +63,7 @@ impl ParsedMeta { assert_eq!( b"xxh3", &*block - .point_read(b"#checksum_type", None) + .point_read(b"#checksum_type", SeqNo::MAX) .expect("Segment ID should exist") .value, "invalid checksum type", @@ -71,7 +71,7 @@ impl ParsedMeta { let id = { let bytes = block - .point_read(b"#id", None) + .point_read(b"#id", SeqNo::MAX) .expect("Segment ID should exist"); let mut bytes = &bytes.value[..]; @@ -80,7 +80,7 @@ impl ParsedMeta { let created_at = { let bytes = block - .point_read(b"#created_at", None) + .point_read(b"#created_at", SeqNo::MAX) .expect("Segment created_at should exist"); let mut bytes = &bytes.value[..]; @@ -89,7 +89,7 @@ impl ParsedMeta { let item_count = { let bytes = block - .point_read(b"#item_count", None) + .point_read(b"#item_count", SeqNo::MAX) .expect("Segment ID should exist"); let mut bytes = &bytes.value[..]; @@ -98,7 +98,7 @@ impl ParsedMeta { let tombstone_count = { let bytes = block - .point_read(b"#tombstone_count", None) + .point_read(b"#tombstone_count", SeqNo::MAX) .expect("Segment ID should exist"); let mut bytes = &bytes.value[..]; @@ -107,7 +107,7 @@ impl ParsedMeta { let data_block_count = { let bytes = block - .point_read(b"#data_block_count", None) + .point_read(b"#data_block_count", SeqNo::MAX) .expect("data_block_count should exist"); let mut bytes = &bytes.value[..]; @@ -116,7 +116,7 @@ impl ParsedMeta { let index_block_count = { let bytes = block - .point_read(b"#index_block_count", None) + .point_read(b"#index_block_count", SeqNo::MAX) .expect("index_block_count should exist"); let mut bytes = &bytes.value[..]; @@ -125,11 +125,11 @@ impl ParsedMeta { let key_range = KeyRange::new(( block - .point_read(b"#key#min", None) + .point_read(b"#key#min", SeqNo::MAX) .expect("key min should exist") .value, block - .point_read(b"#key#max", None) + .point_read(b"#key#max", SeqNo::MAX) .expect("key max should exist") .value, )); @@ -137,7 +137,7 @@ impl ParsedMeta { let seqnos = { let min = { let bytes = block - .point_read(b"#seqno#min", 
None)
+                    .point_read(b"#seqno#min", SeqNo::MAX)
                     .expect("seqno min should exist")
                     .value;
                 let mut bytes = &bytes[..];
@@ -146,7 +146,7 @@ impl ParsedMeta {
             let max = {
                 let bytes = block
-                    .point_read(b"#seqno#max", None)
+                    .point_read(b"#seqno#max", SeqNo::MAX)
                     .expect("seqno max should exist")
                     .value;
                 let mut bytes = &bytes[..];
@@ -157,14 +157,16 @@
         };

         let file_size = {
-            let bytes = block.point_read(b"#size", None).expect("size should exist");
+            let bytes = block
+                .point_read(b"#size", SeqNo::MAX)
+                .expect("size should exist");
             let mut bytes = &bytes.value[..];
             bytes.read_u64::()?
         };

         let data_block_compression = {
             let bytes = block
-                .point_read(b"#compression#data", None)
+                .point_read(b"#compression#data", SeqNo::MAX)
                 .expect("size should exist");

             let mut bytes = &bytes.value[..];
diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index b9634c16..628cc090 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -9,12 +9,13 @@ pub mod filter;
 mod id;
 mod index_block;
 mod inner;
+mod iter;
 mod meta;
 pub(crate) mod multi_writer;
 mod regions;
 mod scanner;
 mod trailer;
-pub(crate) mod util;
+pub mod util;
 mod writer;

 pub use block::{Block, BlockOffset, Checksum};
@@ -27,12 +28,14 @@ use util::load_block;
 pub use writer::Writer;

 use crate::{
-    cache::Cache, descriptor_table::DescriptorTable, CompressionType, InternalValue, SeqNo, TreeId,
-    UserKey,
+    cache::Cache, descriptor_table::DescriptorTable, fallible_clipping_iter::FallibleClippingIter,
+    segment::block_index::iter::create_index_block_reader, CompressionType, InternalValue, SeqNo,
+    TreeId, UserKey,
 };
-use block_index::{BlockIndex, BlockIndexImpl, FullBlockIndex};
+use block_index::{BlockIndexImpl, FullBlockIndex};
 use filter::standard_bloom::{CompositeHash, StandardBloomFilterReader};
 use inner::Inner;
+use iter::Iter;
 use meta::ParsedMeta;
 use std::{
     ops::{Bound, RangeBounds},
@@ -141,13 +144,11 @@ impl Segment {
     pub fn get(
         &self,
         key: &[u8],
-        seqno: Option<SeqNo>,
+        seqno: SeqNo,
         key_hash: CompositeHash,
     ) -> crate::Result<Option<InternalValue>> {
-        if let Some(seqno) = seqno {
-            if self.metadata.seqnos.0 >= seqno {
-                return Ok(None);
-            }
+        if self.metadata.seqnos.0 >= seqno {
+            return Ok(None);
         }

         if let Some(block) = &self.pinned_filter_block {
@@ -168,48 +169,35 @@
         self.point_read(key, seqno)
     }

-    fn point_read(&self, key: &[u8], seqno: Option<SeqNo>) -> crate::Result<Option<InternalValue>> {
-        match seqno {
-            None => {
-                let Some(block_handle) = self
-                    .block_index
-                    .get_lowest_block_containing_key(key, CachePolicy::Write)?
-                else {
-                    return Ok(None);
-                };
+    // TODO: maybe we can skip Fuse costs of the user key
+    // TODO: because we just want to return the value
+    // TODO: we would need to return something like ValueType + Value
+    // TODO: so the caller can decide whether to return the value or not
+    fn point_read(&self, key: &[u8], seqno: SeqNo) -> crate::Result<Option<InternalValue>> {
+        let BlockIndexImpl::Full(block_index) = &*self.block_index else {
+            todo!();
+        };
+
+        let Some(iter) = block_index.forward_reader(key) else {
+            return Ok(None);
+        };
+
+        for block_handle in iter {
+            // TODO: can this ever happen...?
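+            // (`end_key()` is the last key covered by the block, so a handle
+            // ending before `key` cannot contain it; end keys only grow, so
+            // neither can any later handle.)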
+ if block_handle.end_key() < &key { + return Ok(None); + } - let block = self.load_data_block(block_handle.as_ref())?; + let block = self.load_data_block(block_handle.as_ref())?; - // NOTE: Fastpath for non-seqno reads - return Ok(block.point_read(key, None)); + if let Some(item) = block.point_read(key, seqno) { + return Ok(Some(item)); } - Some(seqno) => { - let BlockIndexImpl::Full(block_index) = &*self.block_index else { - todo!(); - }; - - let Some(iter) = block_index.forward_reader(key) else { - return Ok(None); - }; - - for block_handle in iter { - // TODO: can this ever happen...? - if block_handle.end_key() < &key { - return Ok(None); - } - - let block = self.load_data_block(block_handle.as_ref())?; - - if let Some(item) = block.point_read(key, Some(seqno)) { - return Ok(Some(item)); - } - - // NOTE: If the last block key is higher than ours, - // our key cannot be in the next block - if block_handle.end_key() > &key { - return Ok(None); - } - } + + // NOTE: If the last block key is higher than ours, + // our key cannot be in the next block + if block_handle.end_key() > &key { + return Ok(None); } } @@ -250,11 +238,8 @@ impl Segment { #[must_use] #[allow(clippy::iter_without_into_iter)] #[doc(hidden)] - pub fn iter(&self) -> impl DoubleEndedIterator> + '_ { - // self.range(..) - todo!(); - - std::iter::empty() + pub fn iter(&self) -> impl DoubleEndedIterator> { + self.range(..) } /// Creates a ranged iterator over the `Segment`. @@ -265,14 +250,32 @@ impl Segment { #[must_use] #[allow(clippy::iter_without_into_iter)] #[doc(hidden)] - pub fn range, R: RangeBounds>( + pub fn range>( &self, range: R, - ) -> impl DoubleEndedIterator> + '_ { - // self.range((std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)) - todo!(); + ) -> impl DoubleEndedIterator> { + let BlockIndexImpl::Full(block_index) = &*self.block_index else { + todo!(); + }; - std::iter::empty() + // TODO: range should be RangeBounds? + + // TODO: seek iter to lowest block containing lower bound + let index_iter = create_index_block_reader(block_index.inner().clone()); + + // TODO: then when we read the first data block + // (first .next(), seek inside the first data block) + + let iter = Iter::new( + self.global_id(), + self.path.clone(), + index_iter, + self.descriptor_table.clone(), + self.cache.clone(), + self.metadata.data_block_compression, + ); + + FallibleClippingIter::new(iter, range) } /// Tries to recover a segment from a file. @@ -384,10 +387,7 @@ impl Segment { } /// Checks if a key range is (partially or fully) contained in this segment. - pub(crate) fn check_key_range_overlap( - &self, - bounds: &(Bound, Bound), - ) -> bool { + pub(crate) fn check_key_range_overlap(&self, bounds: &(Bound<&[u8]>, Bound<&[u8]>)) -> bool { self.metadata.key_range.overlaps_with_bounds(bounds) } @@ -462,7 +462,7 @@ mod tests { &*segment .get( b"abc", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") )? .unwrap() @@ -474,7 +474,7 @@ mod tests { &*segment .get( b"abc", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") )? .unwrap() @@ -485,10 +485,18 @@ mod tests { None, segment.get( b"def", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"def") )? ); + assert_eq!( + None, + segment.get( + b"____", + SeqNo::MAX, + crate::segment::filter::standard_bloom::Builder::get_hash(b"____") + )? 
+ ); assert_eq!( segment.metadata.key_range, @@ -539,56 +547,191 @@ mod tests { "should use full index, so only TLI exists", ); + assert_eq!(items, &*segment.scan()?.flatten().collect::>()); + assert_eq!( - b"abc", - &*segment - .get( - b"abc", - None, - crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") - )? - .unwrap() - .key - .user_key, + segment.metadata.key_range, + crate::KeyRange::new((b"abc".into(), b"xyz".into())), ); + } + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_iter_simple() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + let items = [ + crate::InternalValue::from_components(b"abc", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"def", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"xyz", b"asdasdasd", 3, crate::ValueType::Value), + ]; + + { + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; + + for item in items.iter().cloned() { + writer.write(item)?; + } + + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + file, + 0, + Arc::new(Cache::with_capacity_bytes(1_000_000)), + Arc::new(DescriptorTable::new(10)), + true, + )?; + + assert_eq!(5, segment.id()); + assert_eq!(3, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.regions.index.is_none(), + "should use full index, so only TLI exists", + ); + + assert_eq!(items, &*segment.iter().flatten().collect::>()); assert_eq!( - b"def", - &*segment - .get( - b"def", - None, - crate::segment::filter::standard_bloom::Builder::get_hash(b"def") - )? - .unwrap() - .key - .user_key, + items.iter().rev().cloned().collect::>(), + &*segment.iter().rev().flatten().collect::>(), + ); + } + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_range_simple() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + let items = [ + crate::InternalValue::from_components(b"abc", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"def", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"xyz", b"asdasdasd", 3, crate::ValueType::Value), + ]; + + { + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; + + for item in items.iter().cloned() { + writer.write(item)?; + } + + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + file, + 0, + Arc::new(Cache::with_capacity_bytes(1_000_000)), + Arc::new(DescriptorTable::new(10)), + true, + )?; + + assert_eq!(5, segment.id()); + assert_eq!(3, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.regions.index.is_none(), + "should use full index, so only TLI exists", ); + assert_eq!( - b"xyz", + items.iter().skip(1).cloned().collect::>(), &*segment - .get( - b"xyz", - None, - crate::segment::filter::standard_bloom::Builder::get_hash(b"xyz") - )? - .unwrap() - .key - .user_key, + .range(UserKey::from("b")..) + .flatten() + .collect::>() ); + assert_eq!( - None, - segment.get( - b"____", - None, - crate::segment::filter::standard_bloom::Builder::get_hash(b"____") - )? + items.iter().skip(1).rev().cloned().collect::>(), + &*segment + .range(UserKey::from("b")..) 
+ .rev() + .flatten() + .collect::>(), ); + } - assert_eq!(items, &*segment.scan()?.flatten().collect::>()); + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_range_multiple_data_blocks() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + let items = [ + crate::InternalValue::from_components(b"a", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"b", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"c", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"d", b"asdasdasd", 3, crate::ValueType::Value), + crate::InternalValue::from_components(b"e", b"asdasdasd", 3, crate::ValueType::Value), + ]; + + { + let mut writer = crate::segment::Writer::new(file.clone(), 5)?.use_data_block_size(1); + + for item in items.iter().cloned() { + writer.write(item)?; + } + + let _trailer = writer.finish()?; + } + + { + let segment = Segment::recover( + file, + 0, + Arc::new(Cache::with_capacity_bytes(1_000_000)), + Arc::new(DescriptorTable::new(10)), + true, + )?; + + assert_eq!(5, segment.id()); + assert_eq!(5, segment.metadata.item_count); + assert_eq!(5, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.regions.index.is_none(), + "should use full index, so only TLI exists", + ); assert_eq!( - segment.metadata.key_range, - crate::KeyRange::new((b"abc".into(), b"xyz".into())), + items.iter().skip(1).take(3).cloned().collect::>(), + &*segment + .range(UserKey::from("b")..=UserKey::from("d")) + .flatten() + .collect::>() + ); + + assert_eq!( + items + .iter() + .skip(1) + .take(3) + .rev() + .cloned() + .collect::>(), + &*segment + .range(UserKey::from("b")..=UserKey::from("d")) + .rev() + .flatten() + .collect::>(), ); } @@ -636,7 +779,7 @@ mod tests { &*segment .get( b"abc", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") )? .unwrap() @@ -648,7 +791,7 @@ mod tests { &*segment .get( b"abc", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"abc") )? .unwrap() @@ -659,7 +802,7 @@ mod tests { None, segment.get( b"def", - None, + SeqNo::MAX, crate::segment::filter::standard_bloom::Builder::get_hash(b"def") )? 
); diff --git a/src/segment/regions.rs b/src/segment/regions.rs index 77bfb048..6e1563c0 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -6,7 +6,7 @@ use super::{Block, BlockHandle}; use crate::{ coding::{Decode, Encode}, segment::DataBlock, - CompressionType, InternalValue, UserValue, + CompressionType, InternalValue, SeqNo, UserValue, }; use std::fs::File; @@ -26,7 +26,7 @@ impl ParsedRegions { let tli = { let bytes = block - .point_read(b"tli", None) + .point_read(b"tli", SeqNo::MAX) .expect("TLI handle should exist"); let mut bytes = &bytes.value[..]; @@ -35,7 +35,7 @@ impl ParsedRegions { let metadata = { let bytes = block - .point_read(b"meta", None) + .point_read(b"meta", SeqNo::MAX) .expect("Metadata handle should exist"); let mut bytes = &bytes.value[..]; @@ -43,7 +43,7 @@ impl ParsedRegions { }?; let index = { - match block.point_read(b"index", None) { + match block.point_read(b"index", SeqNo::MAX) { Some(bytes) if !bytes.value.is_empty() => { let mut bytes = &bytes.value[..]; Some(BlockHandle::decode_from(&mut bytes)) @@ -54,7 +54,7 @@ impl ParsedRegions { .transpose()?; let filter = { - match block.point_read(b"filter", None) { + match block.point_read(b"filter", SeqNo::MAX) { Some(bytes) if !bytes.value.is_empty() => { let mut bytes = &bytes.value[..]; Some(BlockHandle::decode_from(&mut bytes)) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 1708521f..c5c45441 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -383,7 +383,7 @@ impl AbstractTree for Tree { .expect("lock is poisoned") .current_version() .iter_segments() - .map(|x| x.get_highest_seqno()) + .map(Segment::get_highest_seqno) .max() } @@ -399,7 +399,7 @@ impl AbstractTree for Tree { seqno: Option, ) -> crate::Result> { Ok(self - .get_internal_entry(key.as_ref(), seqno)? + .get_internal_entry(key.as_ref(), seqno.unwrap_or(SeqNo::MAX))? .map(|x| x.value)) } @@ -577,7 +577,7 @@ impl Tree { &self, memtable_lock: &Memtable, key: &[u8], - seqno: Option, + seqno: SeqNo, ) -> crate::Result> { if let Some(entry) = memtable_lock.get(key, seqno) { return Ok(ignore_tombstone_value(entry)); @@ -594,7 +594,7 @@ impl Tree { fn get_internal_entry_from_sealed_memtables( &self, key: &[u8], - seqno: Option, + seqno: SeqNo, ) -> Option { let memtable_lock = self.sealed_memtables.read().expect("lock is poisoned"); @@ -610,7 +610,7 @@ impl Tree { fn get_internal_entry_from_segments( &self, key: &[u8], - seqno: Option, + seqno: SeqNo, ) -> crate::Result> { // NOTE: Create key hash for hash sharing // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ @@ -627,19 +627,16 @@ impl Tree { return Ok(ignore_tombstone_value(item)); } } + } else { + // NOTE: Fallback to linear search + for segment in run.iter() { + if !segment.is_key_in_key_range(key) { + continue; + } - // NOTE: Go to next level - continue; - } - - // NOTE: Fallback to linear search - for segment in run.iter() { - if !segment.is_key_in_key_range(key) { - continue; - } - - if let Some(item) = segment.get(key, seqno, key_hash)? { - return Ok(ignore_tombstone_value(item)); + if let Some(item) = segment.get(key, seqno, key_hash)? 
{ + return Ok(ignore_tombstone_value(item)); + } } } } @@ -652,7 +649,7 @@ impl Tree { pub fn get_internal_entry( &self, key: &[u8], - seqno: Option, + seqno: SeqNo, ) -> crate::Result> { // TODO: consolidate memtable & sealed behind single RwLock diff --git a/tests/major_compaction.rs b/tests/major_compaction.rs index 82893e3f..02699fa8 100644 --- a/tests/major_compaction.rs +++ b/tests/major_compaction.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -20,17 +20,17 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { tree.major_compact(u64::MAX, 1_000 /* NOTE: Simulate some time passing */)?; assert_eq!(1, tree.segment_count()); - let item = tree.get_internal_entry(b"a", None)?.unwrap(); + let item = tree.get_internal_entry(b"a", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry(b"b", None)?.unwrap(); + let item = tree.get_internal_entry(b"b", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry(b"c", None)?.unwrap(); + let item = tree.get_internal_entry(b"c", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index fbeffc41..144459b2 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -47,7 +47,7 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); - let item = tree.get_internal_entry(&key, None)?.unwrap(); + let item = tree.get_internal_entry(&key, SeqNo::MAX)?.unwrap(); assert_eq!(item.key.seqno, 2); assert_eq!(&*item.value, b"2"); @@ -89,7 +89,7 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in &keys { - let item = tree.get_internal_entry(key, None)?.unwrap(); + let item = tree.get_internal_entry(key, SeqNo::MAX)?.unwrap(); assert_eq!(item.key.seqno, ITEM_COUNT as u64 - 1); } diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index 226ca79c..d6ea8b2d 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -60,7 +60,7 @@ fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { for seqno in 1..version_count { let item = tree - .get_internal_entry(key.as_bytes(), Some(seqno))? + .get_internal_entry(key.as_bytes(), seqno)? 
.expect("should exist"); assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); diff --git a/tests/tree_write_read.rs b/tests/tree_write_read.rs index be028f82..8ac55090 100644 --- a/tests/tree_write_read.rs +++ b/tests/tree_write_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -11,17 +11,17 @@ fn tree_write_and_read() -> lsm_tree::Result<()> { tree.insert("b".as_bytes(), nanoid::nanoid!().as_bytes(), 1); tree.insert("c".as_bytes(), nanoid::nanoid!().as_bytes(), 2); - let item = tree.get_internal_entry(b"a", None)?.unwrap(); + let item = tree.get_internal_entry(b"a", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry(b"b", None)?.unwrap(); + let item = tree.get_internal_entry(b"b", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry(b"c", None)?.unwrap(); + let item = tree.get_internal_entry(b"c", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); @@ -30,17 +30,17 @@ fn tree_write_and_read() -> lsm_tree::Result<()> { let tree = Config::new(folder).open()?; - let item = tree.get_internal_entry(b"a", None)?.unwrap(); + let item = tree.get_internal_entry(b"a", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry(b"b", None)?.unwrap(); + let item = tree.get_internal_entry(b"b", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry(b"c", None)?.unwrap(); + let item = tree.get_internal_entry(b"c", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); From 09e33c546b9d4b850ea49b067c2fb4740912aa70 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:35:05 +0200 Subject: [PATCH 183/613] restore tree range --- src/range.rs | 164 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 111 insertions(+), 53 deletions(-) diff --git a/src/range.rs b/src/range.rs index a35d8406..74af9a24 100644 --- a/src/range.rs +++ b/src/range.rs @@ -3,17 +3,22 @@ // (found in the LICENSE-* files in the repository) use crate::{ + key::InternalKey, level_manifest::LevelManifest, memtable::Memtable, - multi_reader::MultiReader, - run_reader::LevelReader, + merge::Merger, + mvcc_stream::MvccStream, + run_reader::RunReader, segment::CachePolicy, value::{SeqNo, UserKey}, version::Version, - InternalValue, + BoxedIterator, InternalValue, }; use self_cell::self_cell; -use std::{ops::Bound, sync::Arc}; +use std::{ + ops::{Bound, RangeBounds}, + sync::Arc, +}; #[must_use] pub fn seqno_filter(item_seqno: SeqNo, seqno: SeqNo) -> bool { @@ -60,6 +65,7 @@ pub struct IterState { type BoxedMerge<'a> = Box> + 'a>; +// TODO: maybe we can lifetime TreeIter and then use InternalKeyRef everywhere to bound lifetime of iterators (no need to construct InternalKey then, can just use range) self_cell!( pub struct TreeIter { owner: IterState, @@ -83,11 +89,13 @@ impl DoubleEndedIterator for TreeIter { } } -fn collect_disjoint_tree_with_range( +/* fn collect_disjoint_tree_with_range<'a>( 
level_manifest: &LevelManifest, bounds: &(Bound, Bound), -) -> MultiReader { - debug_assert!(level_manifest.is_disjoint()); +) -> MultiReader> { + todo!() + + /* debug_assert!(level_manifest.is_disjoint()); let mut levels = level_manifest .levels @@ -126,37 +134,35 @@ fn collect_disjoint_tree_with_range( .filter_map(|lvl| LevelReader::new(lvl, bounds, CachePolicy::Write)) .collect(); - MultiReader::new(readers) -} + MultiReader::new(readers) */ +} */ impl TreeIter { #[must_use] #[allow(clippy::too_many_lines)] - pub fn create_range( + pub fn create_range, R: RangeBounds>( guard: IterState, - bounds: (Bound, Bound), + range: R, seqno: Option, level_manifest: &LevelManifest, ) -> Self { - todo!() - - /* Self::new(guard, |lock| { - let lo = match &bounds.0 { + Self::new(guard, |lock| { + let lo = match range.start_bound() { // NOTE: See memtable.rs for range explanation Bound::Included(key) => Bound::Included(InternalKey::new( - key.clone(), + key.as_ref(), SeqNo::MAX, crate::value::ValueType::Tombstone, )), Bound::Excluded(key) => Bound::Excluded(InternalKey::new( - key.clone(), + key.as_ref(), 0, crate::value::ValueType::Tombstone, )), Bound::Unbounded => Bound::Unbounded, }; - let hi = match &bounds.1 { + let hi = match range.end_bound() { // NOTE: See memtable.rs for range explanation, this is the reverse case // where we need to go all the way to the last seqno of an item // @@ -172,12 +178,12 @@ impl TreeIter { // abcdef -> 5 // Bound::Included(key) => Bound::Included(InternalKey::new( - key.clone(), + key.as_ref(), 0, crate::value::ValueType::Value, )), Bound::Excluded(key) => Bound::Excluded(InternalKey::new( - key.clone(), + key.as_ref(), SeqNo::MAX, crate::value::ValueType::Value, )), @@ -188,7 +194,7 @@ impl TreeIter { let mut iters: Vec> = Vec::with_capacity(5); - // NOTE: Optimize disjoint trees (e.g. timeseries) to only use a single MultiReader. + /* // TODO: Optimize disjoint trees (e.g. timeseries) to only use a single MultiReader. 
if level_manifest.is_disjoint() { let reader = collect_disjoint_tree_with_range(&level_manifest, &bounds); @@ -200,46 +206,98 @@ impl TreeIter { } else { iters.push(Box::new(reader)); } - } else { - for level in &level_manifest.levels { - if level.is_disjoint { - if !level.is_empty() { - if let Some(reader) = - LevelReader::new(level.clone(), &bounds, CachePolicy::Write) - { - if let Some(seqno) = seqno { - iters.push(Box::new(reader.filter(move |item| match item { - Ok(item) => seqno_filter(item.key.seqno, seqno), - Err(_) => true, - }))); - } else { - iters.push(Box::new(reader)); - } + } else { */ + + // }; + + #[allow(clippy::needless_continue)] + for run in level_manifest + .current_version() + .iter_levels() + .flat_map(|lvl| lvl.iter()) + { + match run.len() { + 0 => continue, + 1 => { + // NOTE: We checked for length + #[allow(clippy::expect_used)] + let segment = run.first().expect("should exist"); + + if segment.check_key_range_overlap(&( + range.start_bound().map(|x| &*x.user_key), + range.end_bound().map(|x| &*x.user_key), + )) { + let reader = segment.range(( + range.start_bound().map(|x| &x.user_key).cloned(), + range.end_bound().map(|x| &x.user_key).cloned(), + )); + + if let Some(seqno) = seqno { + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); + } else { + iters.push(Box::new(reader)); } } - } else { - for segment in &level.segments { - if segment.check_key_range_overlap(&bounds) { - let reader = segment.range(bounds.clone()); - - if let Some(seqno) = seqno { - iters.push(Box::new(reader.filter(move |item| match item { - Ok(item) => seqno_filter(item.key.seqno, seqno), - Err(_) => true, - }))); - } else { - iters.push(Box::new(reader)); - } + } + _ => { + if let Some(reader) = RunReader::new( + run.clone(), + ( + range.start_bound().map(|x| &x.user_key).cloned(), + range.end_bound().map(|x| &x.user_key).cloned(), + ), + CachePolicy::Write, + ) { + if let Some(seqno) = seqno { + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); + } else { + iters.push(Box::new(reader)); } } } } - }; - drop(level_manifest); + /* if level.is_disjoint { + if !level.is_empty() { + if let Some(reader) = + LevelReader::new(level.clone(), &bounds, CachePolicy::Write) + { + if let Some(seqno) = seqno { + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); + } else { + iters.push(Box::new(reader)); + } + } + } + } else { + for segment in &level.segments { + if segment.check_key_range_overlap(&bounds) { + let reader = segment.range(bounds.clone()); + + if let Some(seqno) = seqno { + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); + } else { + iters.push(Box::new(reader)); + } + } + } + } */ + } // Sealed memtables - for memtable in lock.sealed.iter() { + for memtable in &lock.sealed { let iter = memtable.range(range.clone()); if let Some(seqno) = seqno { @@ -278,7 +336,7 @@ impl TreeIter { Ok(value) => !value.key.is_tombstone(), Err(_) => true, })) - }) */ + }) } } From cb6efb58aa91808ddab7ddb5b33944fdf401b950 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:35:16 +0200 Subject: [PATCH 184/613] adjust memtable point read --- src/memtable/mod.rs | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) 
diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index d4377e5e..6e166ff3 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -60,8 +60,8 @@ impl Memtable { /// /// The item with the highest seqno will be returned, if `seqno` is None. #[doc(hidden)] - pub fn get(&self, key: &[u8], seqno: Option) -> Option { - if seqno == Some(0) { + pub fn get(&self, key: &[u8], seqno: SeqNo) -> Option { + if seqno == 0 { return None; } @@ -81,14 +81,7 @@ impl Memtable { // abcdef -> 6 // abcdef -> 5 // - let lower_bound = InternalKey::new( - key, - match seqno { - Some(seqno) => seqno - 1, - None => SeqNo::MAX, - }, - ValueType::Value, - ); + let lower_bound = InternalKey::new(key, seqno - 1, ValueType::Value); let mut iter = self .items @@ -172,10 +165,10 @@ mod tests { ValueType::Value, )); - let item = memtable.get(b"hello-key-99999", None); + let item = memtable.get(b"hello-key-99999", SeqNo::MAX); assert_eq!(None, item); - let item = memtable.get(b"hello-key-999991", None); + let item = memtable.get(b"hello-key-999991", SeqNo::MAX); assert_eq!(*b"hello-value-999991", &*item.unwrap().value); memtable.insert(InternalValue::from_components( @@ -185,22 +178,22 @@ mod tests { ValueType::Value, )); - let item = memtable.get(b"hello-key-99999", None); + let item = memtable.get(b"hello-key-99999", SeqNo::MAX); assert_eq!(None, item); - let item = memtable.get(b"hello-key-999991", None); + let item = memtable.get(b"hello-key-999991", SeqNo::MAX); assert_eq!((*b"hello-value-999991-2"), &*item.unwrap().value); - let item = memtable.get(b"hello-key-99999", Some(1)); + let item = memtable.get(b"hello-key-99999", 1); assert_eq!(None, item); - let item = memtable.get(b"hello-key-999991", Some(1)); + let item = memtable.get(b"hello-key-999991", 1); assert_eq!((*b"hello-value-999991"), &*item.unwrap().value); - let item = memtable.get(b"hello-key-99999", Some(2)); + let item = memtable.get(b"hello-key-99999", 2); assert_eq!(None, item); - let item = memtable.get(b"hello-key-999991", Some(2)); + let item = memtable.get(b"hello-key-999991", 2); assert_eq!((*b"hello-value-999991-2"), &*item.unwrap().value); } @@ -213,7 +206,7 @@ mod tests { memtable.insert(value.clone()); - assert_eq!(Some(value), memtable.get(b"abc", None)); + assert_eq!(Some(value), memtable.get(b"abc", SeqNo::MAX)); } #[test] @@ -258,7 +251,7 @@ mod tests { 4, ValueType::Value, )), - memtable.get(b"abc", None) + memtable.get(b"abc", SeqNo::MAX) ); } @@ -286,7 +279,7 @@ mod tests { 255, ValueType::Value, )), - memtable.get(b"abc", None) + memtable.get(b"abc", SeqNo::MAX) ); assert_eq!( @@ -296,7 +289,7 @@ mod tests { 0, ValueType::Value, )), - memtable.get(b"abc0", None) + memtable.get(b"abc0", SeqNo::MAX) ); } @@ -330,7 +323,7 @@ mod tests { 255, ValueType::Value, )), - memtable.get(b"abc", None) + memtable.get(b"abc", SeqNo::MAX) ); assert_eq!( @@ -340,7 +333,7 @@ mod tests { 99, ValueType::Value, )), - memtable.get(b"abc", Some(100)) + memtable.get(b"abc", 100) ); assert_eq!( @@ -350,7 +343,7 @@ mod tests { 0, ValueType::Value, )), - memtable.get(b"abc", Some(50)) + memtable.get(b"abc", 50) ); } } From 98289ed231f337ca394c315350d6003e76075566 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:36:46 +0200 Subject: [PATCH 185/613] temporarily gut compaction strategies --- src/compaction/fifo.rs | 66 +----------------------- src/compaction/leveled.rs | 81 ------------------------------ src/compaction/maintenance.rs | 32 +----------- src/compaction/movedown.rs | 23 +-------- src/compaction/pulldown.rs | 21 +------- 
src/compaction/tiered.rs | 94 +---------------------------------- 6 files changed, 6 insertions(+), 311 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 079ef29f..39c179fe 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -45,71 +45,7 @@ impl CompactionStrategy for Strategy { } fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { - let resolved_view = levels.resolved_view(); - - // NOTE: First level always exists, trivial - #[allow(clippy::expect_used)] - let first_level = resolved_view.first().expect("L0 should always exist"); - - let mut segment_ids_to_delete = HashSet::with_hasher(xxhash_rust::xxh3::Xxh3Builder::new()); - - if let Some(ttl_seconds) = self.ttl_seconds { - if ttl_seconds > 0 { - let now = unix_timestamp().as_micros(); - - for segment in resolved_view.iter().flat_map(|lvl| &lvl.segments) { - let lifetime_us: u128 = /* now - segment.metadata.created_at */ todo!(); - let lifetime_sec = lifetime_us / 1000 / 1000; - - if lifetime_sec > ttl_seconds.into() { - log::warn!("segment is older than configured TTL: {:?}", segment.id(),); - segment_ids_to_delete.insert(segment.id()); - } - } - } - } - - let db_size = levels.size(); - - if db_size > self.limit { - let mut bytes_to_delete = db_size - self.limit; - - // NOTE: Sort the level by oldest to newest - // levels are sorted from newest to oldest, so we can just reverse - let mut first_level = first_level.clone(); - first_level.sort_by_seqno(); - first_level.segments.reverse(); - - for segment in first_level.iter() { - if bytes_to_delete == 0 { - break; - } - - bytes_to_delete = bytes_to_delete.saturating_sub(segment.metadata.file_size); - - segment_ids_to_delete.insert(segment.id()); - - log::debug!( - "dropping segment to reach configured size limit: {:?}", - segment.id(), - ); - } - } - - if segment_ids_to_delete.is_empty() { - // NOTE: Only try to merge segments if they are not disjoint - // to improve read performance - // But ideally FIFO is only used for monotonic workloads - // so there's nothing we need to do - if first_level.is_disjoint { - Choice::DoNothing - } else { - super::maintenance::Strategy.choose(levels, config) - } - } else { - let ids = segment_ids_to_delete.into_iter().collect(); - Choice::Drop(ids) - } + todo!() } } /* diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 44def4ef..2dbee8dc 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -367,87 +367,6 @@ impl CompactionStrategy for Strategy { } Choice::DoNothing - - /* let view = &levels.levels; - - // TODO: look at L1+, if not disjoint - // TODO: try to repairing level by rewriting - // TODO: abort if any segment is hidden - // TODO: then make sure, non-disjoint levels cannot be used in subsequent code below - // TODO: add tests - - // L1+ compactions - for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2).rev() - { - // NOTE: Level count is 255 max - #[allow(clippy::cast_possible_truncation)] - let curr_level_index = curr_level_index as u8; - - let next_level_index = curr_level_index + 1; - - if level.is_empty() { - continue; - } - - let level_size: u64 = level - .segments - .iter() - // NOTE: Take bytes that are already being compacted into account, - // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.id())) - .map(|x| x.metadata.file_size) - .sum(); - - let desired_bytes = self.level_target_size(curr_level_index); - - let overshoot = level_size.saturating_sub(desired_bytes); - 
- if overshoot > 0 { - let Some(next_level) = &view.get(next_level_index as usize) else { - break; - }; - - let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, levels.hidden_set(), overshoot) - else { - break; - }; - - /* eprintln!( - "{} {} segments, L{}->L{next_level_index}: {segment_ids:?}", - if can_trivial_move { "move" } else { "merge" }, - segment_ids.len(), - next_level_index - 1, - ); */ - - let choice = CompactionInput { - segment_ids, - dest_level: next_level_index, - target_size: u64::from(self.target_size), - }; - - /*// TODO: eventually, this should happen lazily - // if a segment file lives for very long, it should get rewritten - // Rocks, by default, rewrites files that are 1 month or older - // - // TODO: 3.0.0 configuration? - // NOTE: We purposefully not trivially move segments - // if we go from L1 to L2 - // https://github.com/fjall-rs/lsm-tree/issues/63 - let goes_into_cold_storage = next_level_index == 2; - - if goes_into_cold_storage { - return Choice::Merge(choice); - }*/ - - if can_trivial_move && level.is_disjoint { - return Choice::Move(choice); - } - return Choice::Merge(choice); - } - } - - */ } } /* diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 16987a47..78599575 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -3,9 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{ - config::Config, level_manifest::LevelManifest, segment::Segment, HashSet, SegmentId, -}; +use crate::{config::Config, level_manifest::LevelManifest, segment::Segment, HashSet, SegmentId}; const L0_SEGMENT_CAP: usize = 20; @@ -45,33 +43,7 @@ impl CompactionStrategy for Strategy { } fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let resolved_view = levels.resolved_view(); - - // NOTE: First level always exists, trivial - #[allow(clippy::expect_used)] - let first_level = resolved_view.first().expect("L0 should always exist"); - - if first_level.len() > L0_SEGMENT_CAP { - // NOTE: +1 because two will merge into one - // So if we have 18 segments, and merge two, we'll have 17, not 16 - let segments_to_merge = first_level.len() - L0_SEGMENT_CAP + 1; - - // NOTE: Sort the level by oldest to newest - // levels are sorted from newest to oldest, so we can just reverse - let mut first_level = first_level.clone(); - first_level.sort_by_seqno(); - first_level.segments.reverse(); - - let segment_ids = choose_least_effort_compaction(&first_level, segments_to_merge); - - Choice::Merge(super::Input { - dest_level: 0, - segment_ids, - target_size: u64::MAX, - }) - } else { - Choice::DoNothing - } + todo!() } } /* diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index f8de943e..1f0dfc28 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -15,27 +15,6 @@ impl CompactionStrategy for Strategy { #[allow(clippy::expect_used)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let resolved_view = levels.resolved_view(); - - let level = resolved_view - .get(usize::from(self.0)) - .expect("level should exist"); - - let next_level = resolved_view - .get(usize::from(self.1)) - .expect("next level should exist"); - - if next_level.is_empty() { - // TODO: list_ids() - let segment_ids: HashSet<_> = level.segments.iter().map(Segment::id).collect(); - - Choice::Move(Input { - segment_ids, - dest_level: self.1, - target_size: 64_000_000, - }) - } else { - Choice::DoNothing - 
} + todo!() } } diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index 3ca1555d..d88ffd2a 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -17,25 +17,6 @@ impl CompactionStrategy for Strategy { #[allow(clippy::expect_used)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let resolved_view = levels.resolved_view(); - - let level = resolved_view - .get(usize::from(self.0)) - .expect("level should exist"); - - let next_level = resolved_view - .get(usize::from(self.1)) - .expect("next level should exist"); - - // TODO: list_ids() - let mut segment_ids: HashSet<_> = level.segments.iter().map(Segment::id).collect(); - - segment_ids.extend(next_level.segments.iter().map(Segment::id)); - - Choice::Merge(Input { - segment_ids, - dest_level: self.1, - target_size: 64_000_000, - }) + todo!() } } diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 3af377f0..638f78b4 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -55,99 +55,7 @@ impl CompactionStrategy for Strategy { } fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { - let resolved_view = levels.resolved_view(); - - for (curr_level_index, level) in resolved_view - .iter() - .enumerate() - .take(resolved_view.len() - 1) - .rev() - { - // NOTE: Level count is 255 max - #[allow(clippy::cast_possible_truncation)] - let curr_level_index = curr_level_index as u8; - - let next_level_index = curr_level_index + 1; - - if level.is_empty() { - continue; - } - - let level_size: u64 = level - .segments - .iter() - // NOTE: Take bytes that are already being compacted into account, - // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.id())) - .map(|x| x.metadata.file_size) - .sum(); - - let desired_bytes = - desired_level_size_in_bytes(curr_level_index, self.level_ratio, self.base_size) - as u64; - - if level_size >= desired_bytes { - // NOTE: Take desired_bytes because we are in tiered mode - // We want to take N segments, not just the overshoot (like in leveled) - let mut overshoot = desired_bytes; - - let mut segments_to_compact = vec![]; - - for segment in level.iter().rev().take(self.level_ratio.into()).cloned() { - if overshoot == 0 { - break; - } - - overshoot = overshoot.saturating_sub(segment.metadata.file_size); - segments_to_compact.push(segment); - } - - let mut segment_ids: HashSet<_> = - segments_to_compact.iter().map(Segment::id).collect(); - - // NOTE: If dest level is the last level, just overwrite it - // - // If we didn't overwrite Lmax, it would end up amassing more and more - // segments - // Also, because it's the last level, the frequency of overwiting it is - // amortized because of the LSM-tree's level structure - if next_level_index == 6 { - // Wait for L6 to be non-busy - if levels.busy_levels().contains(&next_level_index) { - continue; - } - - segment_ids.extend( - levels - .levels - .last() - .expect("last level should always exist") - .list_ids(), - ); - } - - return Choice::Merge(CompactionInput { - segment_ids, - dest_level: next_level_index, - target_size: u64::MAX, - }); - } - } - - // TODO: after major compaction, SizeTiered may behave weirdly - // if major compaction is not outputting into Lmax - - // TODO: if level.size >= base_size and there are enough - // segments with size < base_size, compact them together - // no matter the amount of segments in L0 -> should reduce - // write stall chance - // - // TODO: however: force compaction if L0 becomes way too large - - 
// NOTE: Reduce L0 segments if needed - // this is probably an edge case if the `base_size` does not line up with - // the `max_memtable_size` AT ALL - super::maintenance::Strategy.choose(levels, config) + todo!() } } /* From ed26c0ab8bbbf39f2da75dcdcf601aa96369c7ee Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:36:56 +0200 Subject: [PATCH 186/613] adjust blob tree GC reader --- src/blob_tree/gc/reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs index 1a1a8ace..04d6843d 100644 --- a/src/blob_tree/gc/reader.rs +++ b/src/blob_tree/gc/reader.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{blob_tree::value::MaybeInlineValue, coding::Decode, Memtable}; +use crate::{blob_tree::value::MaybeInlineValue, coding::Decode, Memtable, SeqNo}; use std::io::Cursor; use value_log::ValueHandle; @@ -20,7 +20,7 @@ impl<'a> GcReader<'a> { fn get_internal(&self, key: &[u8]) -> crate::Result> { let Some(item) = self .tree - .get_internal_entry_with_memtable(self.memtable, key, None)? + .get_internal_entry_with_memtable(self.memtable, key, SeqNo::MAX)? .map(|x| x.value) else { return Ok(None); From 36d1f89ab20a9a686825bf39510a215d8888be25 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:38:02 +0200 Subject: [PATCH 187/613] change assertion message --- src/segment/block/encoder.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 3aa7c130..2eeead2b 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -85,12 +85,9 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { /// Toggles prefix truncation. 
pub fn use_prefix_truncation(mut self, flag: bool) -> Self { - self.use_prefix_truncation = flag; + assert!(flag, "prefix truncation is currently required to be true"); - // TODO: - if !flag { - unimplemented!() - } + self.use_prefix_truncation = flag; self } From 06c04eac5b3181b6911dfe81a2aa8a179768579e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 20:41:12 +0200 Subject: [PATCH 188/613] fmt & fix --- src/config.rs | 2 +- src/key.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/config.rs b/src/config.rs index bc3e1d6b..8f422a1d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{path::absolute_path, BlobTree, CompressionType, Cache, DescriptorTable, Tree}; +use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree}; use std::{ path::{Path, PathBuf}, sync::Arc, diff --git a/src/key.rs b/src/key.rs index df67a106..24e42032 100644 --- a/src/key.rs +++ b/src/key.rs @@ -21,11 +21,11 @@ pub struct InternalKey { pub value_type: ValueType, } -impl<'a> From<&InternalKeyRef<'a>> for InternalKey { +/* impl<'a> From<&InternalKeyRef<'a>> for InternalKey { fn from(value: &InternalKeyRef<'a>) -> Self { Self::new(value.user_key, value.seqno, value.value_type) } -} +} */ impl AsRef<[u8]> for InternalKey { fn as_ref(&self) -> &[u8] { From 6b883d51e0d27a00a4521c4d611ee9c735b15e60 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 22:33:35 +0200 Subject: [PATCH 189/613] clippy --- src/segment/index_block/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 773690f0..910616d3 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -142,6 +142,8 @@ impl IndexBlock { let item = Self::parse_restart_item(&mut cursor, offset)?; + // SAFETY: We trust the parsed restart head + #[allow(clippy::indexing_slicing)] let key = &self.inner.data[item.end_key.0..item.end_key.1]; if needle > key { @@ -210,6 +212,8 @@ impl IndexBlock { let item = Self::parse_restart_item(&mut cursor, pos).expect("should exist"); + // SAFETY: We trust the parsed restart head + #[allow(clippy::indexing_slicing)] &bytes[item.end_key.0..item.end_key.1] } @@ -296,6 +300,8 @@ impl IndexBlock { let item = Self::parse_restart_item(&mut cursor, offset)?; + // SAFETY: We trust the parsed restart head + #[allow(clippy::indexing_slicing)] let key = &self.inner.data[item.end_key.0..item.end_key.1]; if needle > key { @@ -320,6 +326,8 @@ impl IndexBlock { let item = Self::parse_restart_item(&mut cursor, offset)?; + // SAFETY: We trust the parsed restart head + #[allow(clippy::indexing_slicing)] let key = &self.inner.data[item.end_key.0..item.end_key.1]; if needle > key { From 0b0cdead225c3a8ee834be6fd6d6e1456f587d29 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 22:37:24 +0200 Subject: [PATCH 190/613] clippy --- src/blob_tree/gc/reader.rs | 4 ++-- src/blob_tree/gc/writer.rs | 2 +- src/blob_tree/mod.rs | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs index 04d6843d..bebf86cd 100644 --- a/src/blob_tree/gc/reader.rs +++ b/src/blob_tree/gc/reader.rs @@ -35,12 +35,12 @@ impl<'a> GcReader<'a> { impl value_log::IndexReader for GcReader<'_> { fn get(&self, key: &[u8]) -> std::io::Result> { - use std::io::{Error as IoError, 
ErrorKind as IoErrorKind}; + use std::io::Error as IoError; use MaybeInlineValue::{Indirect, Inline}; let Some(item) = self .get_internal(key) - .map_err(|e| IoError::new(IoErrorKind::Other, e.to_string()))? + .map_err(|e| IoError::other(e.to_string()))? else { return Ok(None); }; diff --git a/src/blob_tree/gc/writer.rs b/src/blob_tree/gc/writer.rs index ea8abe7a..f314a0e3 100644 --- a/src/blob_tree/gc/writer.rs +++ b/src/blob_tree/gc/writer.rs @@ -25,7 +25,7 @@ impl<'a> GcWriter<'a> { } } -impl<'a> value_log::IndexWriter for GcWriter<'a> { +impl value_log::IndexWriter for GcWriter<'_> { fn insert_indirect( &mut self, key: &[u8], diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index f966f90b..c5b43a13 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -144,8 +144,7 @@ impl BlobTree { .blobs .scan_for_stats(iter.filter_map(|kv| { let Ok(kv) = kv else { - return Some(Err(IoError::new( - IoErrorKind::Other, + return Some(Err(IoError::other( "Failed to load KV pair from index tree", ))); }; @@ -153,7 +152,7 @@ impl BlobTree { let mut cursor = Cursor::new(kv.value); let value = match MaybeInlineValue::decode_from(&mut cursor) { Ok(v) => v, - Err(e) => return Some(Err(IoError::new(IoErrorKind::Other, e.to_string()))), + Err(e) => return Some(Err(IoError::other(e.to_string()))), }; match value { From 6cc7a7798eff9d7db410a6b6997044d87b3a0ba5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 23:05:24 +0200 Subject: [PATCH 191/613] refactor --- src/blob_tree/mod.rs | 2 +- src/segment/block/mod.rs | 41 +++++++++++++++++++++++++--------------- src/segment/meta.rs | 2 +- src/segment/mod.rs | 6 ++---- src/segment/regions.rs | 2 +- src/segment/util.rs | 2 +- 6 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index c5b43a13..17949894 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -109,7 +109,7 @@ impl BlobTree { seqno: SeqNo, gc_watermark: SeqNo, ) -> crate::Result { - use std::io::{Error as IoError, ErrorKind as IoErrorKind}; + use std::io::Error as IoError; use MaybeInlineValue::{Indirect, Inline}; while self diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 906208d2..4023bde7 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -18,6 +18,7 @@ pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; use crate::{ coding::{Decode, Encode}, + segment::BlockHandle, CompressionType, Slice, }; use std::fs::File; @@ -88,7 +89,12 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => { - let mut data = byteview::ByteView::with_size(header.uncompressed_length as usize); + #[cfg(feature = "use_unsafe")] + let mut data = Slice::with_size_unzeroed(header.uncompressed_length as usize); + + #[cfg(not(feature = "use_unsafe"))] + let mut data = Slice::with_size(header.uncompressed_length as usize); + { // NOTE: We know that we are the owner #[allow(clippy::expect_used)] @@ -97,7 +103,8 @@ impl Block { lz4_flex::decompress_into(&raw_data, &mut mutator) .map_err(|_| crate::Error::Decompress(compression))?; } - data.into() + + data } #[cfg(feature = "miniz")] @@ -116,21 +123,19 @@ impl Block { Ok(Self { header, data }) } - // TODO: take non-keyed block handle /// Reads a block from a file without needing to seek the file. 
pub fn from_file( file: &File, - offset: BlockOffset, - size: u32, + handle: BlockHandle, compression: CompressionType, ) -> crate::Result { // TODO: toggle with use_unsafe and add bench #[cfg(feature = "use_unsafe")] - let mut buf = byteview::ByteView::with_size_unzeroed(size as usize); + let mut buf = Slice::with_size_unzeroed(handle.size() as usize); #[cfg(not(feature = "use_unsafe"))] - let mut buf = byteview::ByteView::with_size(size as usize); + let mut buf = Slice::with_size(handle.size() as usize); { let mut mutator = buf.get_mut().expect("should be the owner"); @@ -139,10 +144,11 @@ impl Block { { use std::os::unix::fs::FileExt; - let bytes_read = file.read_at(&mut mutator, *offset)?; + let bytes_read = file.read_at(&mut mutator, *handle.offset())?; + assert_eq!( bytes_read, - size as usize, + handle.size() as usize, "not enough bytes read: file has length {}", file.metadata()?.len(), ); @@ -150,8 +156,16 @@ impl Block { #[cfg(windows)] { - todo!(); - // assert_eq!(bytes_read, size as usize); + use std::os::windows::fs::FileExt; + + let bytes_read = file.seek_read(&mut mutator, *handle.offset())?; + + assert_eq!( + bytes_read, + handle.size() as usize, + "not enough bytes read: file has length {}", + file.metadata()?.len(), + ); } #[cfg(not(any(unix, windows)))] @@ -205,9 +219,6 @@ impl Block { debug_assert_eq!(header.uncompressed_length, data.len() as u32); } - Ok(Self { - header, - data: Slice::from(data), - }) + Ok(Self { header, data }) } } diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 2a9fe3d1..efe85ec2 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -48,7 +48,7 @@ pub struct ParsedMeta { impl ParsedMeta { #[allow(clippy::expect_used)] pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file(file, handle.offset(), handle.size(), CompressionType::None)?; + let block = Block::from_file(file, *handle, CompressionType::None)?; let block = DataBlock::new(block); assert_eq!( diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 628cc090..8c24cf40 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -308,8 +308,7 @@ impl Segment { let block = Block::from_file( &file, - regions.tli.offset(), - regions.tli.size(), + regions.tli, metadata.data_block_compression, // TODO: index blocks may get their own compression level )?; @@ -341,8 +340,7 @@ impl Segment { Block::from_file( &file, - filter_handle.offset(), - filter_handle.size(), + filter_handle, crate::CompressionType::None, // NOTE: We never write a filter block with compression ) }) diff --git a/src/segment/regions.rs b/src/segment/regions.rs index 6e1563c0..787232eb 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -21,7 +21,7 @@ pub struct ParsedRegions { impl ParsedRegions { pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file(file, handle.offset(), handle.size(), CompressionType::None)?; + let block = Block::from_file(file, *handle, CompressionType::None)?; let block = DataBlock::new(block); let tli = { diff --git a/src/segment/util.rs b/src/segment/util.rs index 423f88b0..c3bc13da 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -29,7 +29,7 @@ pub fn load_block( Arc::new(std::fs::File::open(path)?) 
}; - let block = Block::from_file(&fd, handle.offset(), handle.size(), compression)?; + let block = Block::from_file(&fd, *handle, compression)?; // Cache FD if fd_cache_miss { From 4e84ba199ee52d26508dfa93b830c759a0cadb25 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 23:09:52 +0200 Subject: [PATCH 192/613] fix --- src/segment/block/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 4023bde7..fbaa42fd 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -187,7 +187,12 @@ impl Block { #[allow(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; - let mut data = byteview::ByteView::with_size(header.uncompressed_length as usize); + #[cfg(feature = "use_unsafe")] + let mut data = Slice::with_size_unzeroed(header.uncompressed_length as usize); + + #[cfg(not(feature = "use_unsafe"))] + let mut data = Slice::with_size(header.uncompressed_length as usize); + { // NOTE: We know that we are the owner #[allow(clippy::expect_used)] @@ -196,6 +201,7 @@ impl Block { lz4_flex::decompress_into(raw_data, &mut mutator) .map_err(|_| crate::Error::Decompress(compression))?; } + data } From afc149b202885db9edd211ea4e625da005c9ed9d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 23:12:04 +0200 Subject: [PATCH 193/613] increase msrv idc --- .github/workflows/test.yml | 2 +- Cargo.toml | 5 ++--- README.md | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aa40f91e..f783aa97 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: matrix: rust_version: - stable - - "1.80.0" # MSRV + - "1.81.0" # MSRV os: - ubuntu-latest - windows-latest diff --git a/Cargo.toml b/Cargo.toml index 53ba5d57..2d4a4cd6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ description = "A K.I.S.S. implementation of log-structured merge trees (LSM-tree license = "MIT OR Apache-2.0" version = "3.0.0" edition = "2021" -rust-version = "1.80.0" +rust-version = "1.81.0" readme = "README.md" include = ["src/**/*", "LICENSE-APACHE", "LICENSE-MIT", "README.md"] repository = "https://github.com/fjall-rs/lsm-tree" @@ -24,14 +24,13 @@ bytes = ["value-log/bytes"] [dependencies] byteorder = "1.5.0" -byteview = "0.7.0" # TODO: remove in favor of Slice wrapper crossbeam-skiplist = "0.1.3" double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" interval-heap = "0.0.5" log = "0.4.22" lz4_flex = { version = "0.11.3", optional = true, default-features = false } -miniz_oxide = { version = "0.8.0", optional = true } +miniz_oxide = { version = "0.8.0", optional = true } # TODO: zlib-rs? 
quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" diff --git a/README.md b/README.md index 42c05254..23f0f2dd 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) [![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) -![MSRV](https://img.shields.io/badge/MSRV-1.75.0-blue) +![MSRV](https://img.shields.io/badge/MSRV-1.81.0-blue) [![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. From c608676a08375e3d6b60136857d0f39eed74d997 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Jun 2025 23:20:27 +0200 Subject: [PATCH 194/613] bump From 309ef3df900ba78c7b49eea36149210928629872 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Jun 2025 18:14:35 +0200 Subject: [PATCH 195/613] temp disable bytes feature flag --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2d4a4cd6..e7c2fcba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ path = "src/lib.rs" default = [] lz4 = ["dep:lz4_flex"] miniz = ["dep:miniz_oxide"] -bytes = ["value-log/bytes"] +bytes = [] # TODO: restore [dependencies] byteorder = "1.5.0" From 349964e9b43e4e55345d0cf8cb94a364db69dba4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Jun 2025 18:23:25 +0200 Subject: [PATCH 196/613] fix: kv example --- examples/kv/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/kv/src/main.rs b/examples/kv/src/main.rs index fbb7c685..e61cad5c 100644 --- a/examples/kv/src/main.rs +++ b/examples/kv/src/main.rs @@ -121,7 +121,7 @@ impl KvStore { Ok(()) } - pub fn maintenance(&mut self, memtable_size: u32) -> lsm_tree::Result<()> { + pub fn maintenance(&mut self, memtable_size: u64) -> lsm_tree::Result<()> { // 8 MiB limit if memtable_size > 8 * 1_024 * 1_024 { self.force_flush()?; From 91eaa954580c28f1e63615f3604b224698f83ddc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:12:12 +0200 Subject: [PATCH 197/613] fork double-ended-peekable --- src/double_ended_peekable.rs | 227 +++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 src/double_ended_peekable.rs diff --git a/src/double_ended_peekable.rs b/src/double_ended_peekable.rs new file mode 100644 index 00000000..6ed4e655 --- /dev/null +++ b/src/double_ended_peekable.rs @@ -0,0 +1,227 @@ +//! A fork of https://github.com/dodomorandi/double-ended-peekable +//! to allow accessing the inner type +//! +//! Also changes the generics a bit so it plays well with `self_cell`. + +use core::{fmt::Debug, hash::Hash, hint::unreachable_unchecked, mem}; + +/// An _extension trait_ to create [`DoubleEndedPeekable`]. +/// +/// This has a blanket implementation for all types that implement [`Iterator`]. +pub trait DoubleEndedPeekableExt> { + /// Creates an iterator which works similarly to [`Peekable`], but also provides additional + /// functions if the underlying type implements [`DoubleEndedIterator`]. + /// + /// See [`DoubleEndedPeekable`] for more information. 
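+    ///
+    /// A quick sketch of peeking both ends (illustrative):
+    ///
+    /// ```
+    /// use double_ended_peekable::DoubleEndedPeekableExt;
+    ///
+    /// let mut iter = [1, 2, 3].into_iter().double_ended_peekable();
+    ///
+    /// // Peeking either end does not advance the iterator
+    /// assert_eq!(iter.peek(), Some(&1));
+    /// assert_eq!(iter.peek_back(), Some(&3));
+    ///
+    /// assert_eq!(iter.next(), Some(1));
+    /// assert_eq!(iter.next_back(), Some(3));
+    /// ```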
+ /// + /// [`Peekable`]: core::iter::Peekable + fn double_ended_peekable(self) -> DoubleEndedPeekable; +} + +impl DoubleEndedPeekableExt for I +where + I: Iterator, +{ + #[inline] + fn double_ended_peekable(self) -> DoubleEndedPeekable { + DoubleEndedPeekable { + iter: self, + front: MaybePeeked::Unpeeked, + back: MaybePeeked::Unpeeked, + } + } +} + +/// An advanced version of [`Peekable`] that works well with double-ended iterators. +/// +/// This `struct` is created by the [`double_ended_peekable`] method on [`DoubleEndedPeekableExt`]. +/// +/// [`Peekable`]: core::iter::Peekable +/// [`double_ended_peekable`]: DoubleEndedPeekableExt::double_ended_peekable +pub struct DoubleEndedPeekable> { + iter: I, + front: MaybePeeked, + back: MaybePeeked, +} + +impl DoubleEndedPeekable +where + I: Iterator, +{ + pub fn inner(&self) -> &I { + &self.iter + } + + pub fn inner_mut(&mut self) -> &mut I { + &mut self.iter + } + + /// Returns a reference to the `next()` value without advancing the iterator. + /// + /// See [`Peekable::peek`] for more information. + /// + /// [`Peekable::peek`]: core::iter::Peekable::peek + #[inline] + pub fn peek(&mut self) -> Option<&I::Item> { + self.front + .get_peeked_or_insert_with(|| self.iter.next()) + .as_ref() + .or_else(|| self.back.peeked_value_ref()) + } +} + +impl DoubleEndedPeekable +where + I: DoubleEndedIterator, +{ + /// Returns a reference to the `next_back()` value without advancing the _back_ of the iterator. + /// + /// Like [`next_back`], if there is a value, it is wrapped in a `Some(T)`. + /// But if the iteration is over, `None` is returned. + /// + /// [`next_back`]: DoubleEndedIterator::next_back + /// + /// Because `peek_back()` returns a reference, and many iterators iterate over references, + /// there can be a possibly confusing situation where the return value is a double reference. + /// You can see this effect in the examples below. 
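+    /// (Illustrative: for an iterator over `&i32` items, `peek_back()` returns `Option<&&i32>`.)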
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use double_ended_peekable::DoubleEndedPeekableExt; + /// + /// let xs = [1, 2, 3]; + /// + /// let mut iter = xs.into_iter().double_ended_peekable(); + /// + /// // peek_back() lets us see into the past of the future + /// assert_eq!(iter.peek_back(), Some(&3)); + /// assert_eq!(iter.next_back(), Some(3)); + /// + /// assert_eq!(iter.next_back(), Some(2)); + /// + /// // The iterator does not advance even if we `peek_back` multiple times + /// assert_eq!(iter.peek_back(), Some(&1)); + /// assert_eq!(iter.peek_back(), Some(&1)); + /// + /// assert_eq!(iter.next_back(), Some(1)); + /// + /// // After the iterator is finished, so is `peek_back()` + /// assert_eq!(iter.peek_back(), None); + /// assert_eq!(iter.next_back(), None); + /// ``` + #[inline] + pub fn peek_back(&mut self) -> Option<&I::Item> { + self.back + .get_peeked_or_insert_with(|| self.iter.next_back()) + .as_ref() + .or_else(|| self.front.peeked_value_ref()) + } +} + +impl Iterator for DoubleEndedPeekable +where + I: Iterator, +{ + type Item = I::Item; + + #[inline] + fn next(&mut self) -> Option { + match self.front.take() { + MaybePeeked::Peeked(out @ Some(_)) => out, + MaybePeeked::Peeked(None) => self.back.take().into_peeked_value(), + MaybePeeked::Unpeeked => match self.iter.next() { + item @ Some(_) => item, + None => self.back.take().into_peeked_value(), + }, + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (lower, upper) = self.iter.size_hint(); + let additional = match (&self.front, &self.back) { + (MaybePeeked::Peeked(_), MaybePeeked::Peeked(_)) => 2, + (MaybePeeked::Peeked(_), _) | (_, MaybePeeked::Peeked(_)) => 1, + (MaybePeeked::Unpeeked, MaybePeeked::Unpeeked) => 0, + }; + + (lower + additional, upper.map(|upper| upper + additional)) + } +} + +impl DoubleEndedIterator for DoubleEndedPeekable +where + I: DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + match self.back.take() { + MaybePeeked::Peeked(out @ Some(_)) => out, + MaybePeeked::Peeked(None) => self.front.take().into_peeked_value(), + MaybePeeked::Unpeeked => match self.iter.next_back() { + out @ Some(_) => out, + None => self.front.take().into_peeked_value(), + }, + } + } +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +enum MaybePeeked { + #[default] + Unpeeked, + Peeked(Option), +} + +impl MaybePeeked { + fn get_peeked_or_insert_with(&mut self, f: F) -> &mut Option + where + F: FnOnce() -> Option, + { + if let MaybePeeked::Unpeeked = self { + *self = MaybePeeked::Peeked(f()); + } + + let MaybePeeked::Peeked(peeked) = self else { + // SAFETY: it cannot be `Unpeeked` because that case has been just replaced with + // `Peeked`, and we only have two possible states. 
+ #[allow(unsafe_code)] + unsafe { + unreachable_unchecked() + } + }; + peeked + } + + const fn peeked_value_ref(&self) -> Option<&T> { + match self { + MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, + MaybePeeked::Peeked(Some(peeked)) => Some(peeked), + } + } + + fn peeked_value_mut(&mut self) -> Option<&mut T> { + match self { + MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, + MaybePeeked::Peeked(Some(peeked)) => Some(peeked), + } + } + + const fn is_unpeeked(&self) -> bool { + matches!(self, MaybePeeked::Unpeeked) + } + + fn take(&mut self) -> Self { + mem::replace(self, MaybePeeked::Unpeeked) + } + + fn into_peeked_value(self) -> Option { + match self { + MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, + MaybePeeked::Peeked(Some(peeked)) => Some(peeked), + } + } +} From 159c0c1459d73ba41bc45489b2ba6614f1ea861f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:12:40 +0200 Subject: [PATCH 198/613] remove double-ended-peekable --- Cargo.toml | 1 - src/lib.rs | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e7c2fcba..013c22f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,6 @@ bytes = [] # TODO: restore [dependencies] byteorder = "1.5.0" crossbeam-skiplist = "0.1.3" -double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" interval-heap = "0.0.5" log = "0.4.22" diff --git a/src/lib.rs b/src/lib.rs index f81b81f9..a0a0087d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,6 +134,7 @@ mod clipping_iter; pub mod compaction; mod compression; mod config; +mod double_ended_peekable; mod error; // mod export; From d3e48f877cbb12ba8026567344b01d01011d8269 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:12:54 +0200 Subject: [PATCH 199/613] refactor --- src/manifest.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/manifest.rs b/src/manifest.rs index 2fea376e..c9e88fcf 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -13,7 +13,6 @@ use std::io::Write; pub struct Manifest { pub(crate) version: FormatVersion, pub(crate) tree_type: TreeType, - // pub(crate) table_type: TableType, pub(crate) level_count: u8, } @@ -21,7 +20,6 @@ impl Encode for Manifest { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { writer.write_all(&MAGIC_BYTES)?; writer.write_u8(self.tree_type.into())?; - // writer.write_u8(self.table_type.into())?; writer.write_u8(self.level_count)?; Ok(()) } @@ -41,18 +39,16 @@ impl Decode for Manifest { let version = FormatVersion::try_from(version).map_err(|()| DecodeError::InvalidVersion)?; let tree_type = reader.read_u8()?; - // let table_type = reader.read_u8()?; + let tree_type = tree_type + .try_into() + .map_err(|()| DecodeError::InvalidTag(("TreeType", tree_type)))?; + let level_count = reader.read_u8()?; Ok(Self { version, + tree_type, level_count, - tree_type: tree_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("TreeType", tree_type)))?, - // table_type: table_type - // .try_into() - // .map_err(|()| DecodeError::InvalidTag(("TableType", table_type)))?, }) } } From e793984f0d2a32eca9101c35d964221c9f92e106 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:20:29 +0200 Subject: [PATCH 200/613] block decoder --- src/segment/block/decoder.rs | 505 +++++++++++++++++++++++++++++++++++ src/segment/block/mod.rs | 2 + 2 files changed, 507 insertions(+) create mode 100644 src/segment/block/decoder.rs diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs new file 
mode 100644
index 00000000..07b7863a
--- /dev/null
+++ b/src/segment/block/decoder.rs
@@ -0,0 +1,505 @@
+// Copyright (c) 2025-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use super::{binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader};
+use crate::{
+    segment::{block::Trailer, Block},
+    unwrappy, Slice,
+};
+use byteorder::{LittleEndian, ReadBytesExt};
+use std::{io::Cursor, marker::PhantomData};
+
+pub trait ParsedItem<M> {
+    /// Returns the key as a byte slice.
+    ///
+    /// # Warning
+    ///
+    /// May only be called on a restart head, as a prefix-truncated item cannot be
+    /// represented by a single byte slice.
+    fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8];
+
+    /// Compares this item's key with a needle.
+    ///
+    /// We cannot access the key directly because it may be composed of a prefix plus suffix.
+    fn compare_key(&self, needle: &[u8], bytes: &[u8]) -> std::cmp::Ordering;
+
+    /// Returns the byte offset of the key's start position.
+    fn key_offset(&self) -> usize;
+
+    /// Converts the parsed representation to an owned value.
+    fn materialize(&self, bytes: &Slice) -> M;
+}
+
+pub trait Decodable<M> {
+    /// Parses the key of the next restart head from a reader.
+    fn parse_restart_key<'a>(
+        reader: &mut Cursor<&[u8]>,
+        offset: usize,
+        data: &'a [u8],
+    ) -> Option<&'a [u8]>;
+
+    /// Parses a restart head from a reader.
+    ///
+    /// `offset` is the position of the item to read in the block's byte slice.
+    fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option<M>;
+
+    /// Parses a (possibly) prefix-truncated item from a reader.
+    fn parse_truncated(
+        reader: &mut Cursor<&[u8]>,
+        offset: usize,
+        base_key_offset: usize,
+    ) -> Option<M>;
+}
+
+#[derive(Debug)]
+struct LoScanner {
+    offset: usize,
+    remaining_in_interval: usize,
+    base_key_offset: Option<usize>,
+}
+
+#[derive(Debug)]
+struct HiScanner {
+    offset: usize,
+    ptr_idx: usize,
+    stack: Vec<usize>, // TODO: SmallVec?
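+    // Key offset of the current restart head; prefix-truncated items in the
+    // interval are resolved against it (see `fill_stack` below)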
+    base_key_offset: Option<usize>,
+}
+
+pub struct Decoder<'a, Item: Decodable<Parsed>, Parsed: ParsedItem<Item>> {
+    block: &'a Block,
+    phantom: PhantomData<(Item, Parsed)>,
+
+    lo_scanner: LoScanner,
+    hi_scanner: HiScanner,
+
+    // Cached metadata
+    pub(crate) restart_interval: u8,
+
+    binary_index_step_size: u8,
+    binary_index_offset: u32,
+    binary_index_len: u32,
+
+    hash_index_offset: u32,
+    hash_index_len: u32,
+}
+
+impl<'a, Item: Decodable<Parsed>, Parsed: ParsedItem<Item>> Decoder<'a, Item, Parsed> {
+    #[must_use]
+    pub fn new(block: &'a Block) -> Self {
+        let trailer = Trailer::new(block);
+        let mut reader = trailer.as_slice();
+
+        let _item_count = reader.read_u32::<LittleEndian>().expect("should read");
+
+        let restart_interval = unwrappy!(reader.read_u8());
+
+        let binary_index_step_size = unwrappy!(reader.read_u8());
+
+        debug_assert!(
+            binary_index_step_size == 2 || binary_index_step_size == 4,
+            "invalid binary index step size",
+        );
+
+        let binary_index_offset = unwrappy!(reader.read_u32::<LittleEndian>());
+        let binary_index_len = unwrappy!(reader.read_u32::<LittleEndian>());
+
+        let hash_index_offset = unwrappy!(reader.read_u32::<LittleEndian>());
+        let hash_index_len = unwrappy!(reader.read_u32::<LittleEndian>());
+
+        Self {
+            block,
+            phantom: PhantomData,
+
+            lo_scanner: LoScanner {
+                offset: 0,
+                remaining_in_interval: 0,
+                base_key_offset: None,
+            },
+
+            hi_scanner: HiScanner {
+                offset: 0,
+                ptr_idx: binary_index_len as usize,
+                stack: Vec::new(),
+                base_key_offset: None,
+            },
+
+            restart_interval,
+
+            binary_index_step_size,
+            binary_index_offset,
+            binary_index_len,
+
+            hash_index_offset,
+            hash_index_len,
+        }
+    }
+
+    #[must_use]
+    pub fn block(&self) -> &Block {
+        self.block
+    }
+
+    #[must_use]
+    pub fn bytes(&self) -> &[u8] {
+        &self.block.data
+    }
+
+    /// Returns the amount of items in the block.
+    #[must_use]
+    pub fn len(&self) -> usize {
+        Trailer::new(self.block).item_count()
+    }
+
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        false
+    }
+
+    fn get_binary_index_reader(&self) -> BinaryIndexReader {
+        BinaryIndexReader::new(
+            &self.block.data,
+            self.binary_index_offset,
+            self.binary_index_len,
+            self.binary_index_step_size,
+        )
+    }
+
+    /// Returns the number of hash buckets.
+    #[must_use]
+    pub fn hash_bucket_count(&self) -> Option<u32> {
+        if self.hash_index_offset > 0 {
+            Some(self.hash_index_len)
+        } else {
+            None
+        }
+    }
+
+    fn get_hash_index_reader(&self) -> Option<HashIndexReader> {
+        self.hash_bucket_count()
+            .map(|offset| HashIndexReader::new(&self.block.data, self.hash_index_offset, offset))
+    }
+
+    fn get_key_at(&self, pos: usize) -> &[u8] {
+        let bytes = &self.block.data;
+
+        // SAFETY: pos is always retrieved from the binary index,
+        // which we consider to be trustworthy
+        #[warn(unsafe_code)]
+        let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) });
+
+        Item::parse_restart_key(&mut cursor, pos, bytes).expect("should exist")
+    }
+
+    fn partition_point(
+        &self,
+        pred: impl Fn(&[u8]) -> bool,
+    ) -> Option<(/* offset */ usize, /* idx */ usize)> {
+        let binary_index = self.get_binary_index_reader();
+
+        debug_assert!(
+            binary_index.len() >= 1,
+            "binary index should never be empty",
+        );
+
+        let mut left: usize = 0;
+        let mut right = binary_index.len();
+
+        if right == 0 {
+            return None;
+        }
+
+        while left < right {
+            let mid = (left + right) / 2;
+
+            let offset = binary_index.get(mid);
+
+            let head_key = self.get_key_at(offset);
+
+            if pred(head_key) {
+                left = mid + 1;
+            } else {
+                right = mid;
+            }
+        }
+
+        if left == 0 {
+            return Some((0, 0));
+        }
+
+        let offset = binary_index.get(left - 1);
+
+        Some((offset, left - 1))
+    }
+
+    // TODO:
+    fn partition_point_2(
+        &self,
+        pred: impl Fn(&[u8]) -> bool,
+    ) -> Option<(/* offset */ usize, /* idx */ usize)> {
+        let binary_index = self.get_binary_index_reader();
+
+        debug_assert!(
+            binary_index.len() >= 1,
+            "binary index should never be empty",
+        );
+
+        let mut left: usize = 0;
+        let mut right = binary_index.len();
+
+        if right == 0 {
+            return None;
+        }
+
+        while left < right {
+            let mid = (left + right) / 2;
+
+            let offset = binary_index.get(mid);
+
+            let head_key = self.get_key_at(offset);
+
+            if pred(head_key) {
+                left = mid + 1;
+            } else {
+                right = mid;
+            }
+        }
+
+        if left == binary_index.len() {
+            let idx = binary_index.len() - 1;
+            let offset = binary_index.get(idx);
+            return Some((offset, idx));
+        }
+
+        let offset = binary_index.get(left);
+
+        Some((offset, left))
+    }
+
+    /// Seeks using the given predicate.
+    ///
+    /// Returns `false` if the key cannot possibly exist.
+    pub fn seek(
+        &mut self,
+        needle: &[u8],
+        pred: impl Fn(&[u8]) -> bool,
+        second_partition: bool,
+    ) -> bool {
+        // Try hash index lookup
+        if let Some(hash_index) = self.get_hash_index_reader() {
+            match hash_index.get(needle) {
+                super::hash_index::Lookup::Found(idx) => {
+                    let offset = self.get_binary_index_reader().get(idx.into());
+                    self.lo_scanner.offset = offset;
+                    return true;
+                }
+                super::hash_index::Lookup::NotFound => return false,
+                super::hash_index::Lookup::Conflicted => {
+                    // Fall back to binary search
+                }
+            }
+        }
+
+        // TODO: make this nicer, maybe predicate that can affect the resulting index...?
+        let result = if second_partition {
+            self.partition_point_2(pred)
+        } else {
+            self.partition_point(pred)
+        };
+
+        // Binary index lookup
+        let Some((offset, _)) = result else {
+            return false;
+        };
+
+        self.lo_scanner.offset = offset;
+
+        true
+    }
+
+    /// Seeks the upper bound using the given predicate.
+    ///
+    /// Returns `false` if the key cannot possibly exist.
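+    ///
+    /// Illustrative call shape (an assumption; the predicate is supplied by the
+    /// concrete block readers):
+    ///
+    /// ```ignore
+    /// // Position the high scanner at the last restart head that may
+    /// // still contain keys <= needle.
+    /// let may_exist = decoder.seek_upper(|head_key| head_key <= needle, false);
+    /// ```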
+    pub fn seek_upper(&mut self, pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool {
+        let result = if second_partition {
+            self.partition_point_2(pred)
+        } else {
+            self.partition_point(pred)
+        };
+
+        // Binary index lookup
+        let Some((offset, idx)) = result else {
+            return false;
+        };
+
+        eprintln!("seeked upper to {idx}");
+        eprintln!("hi scanner offset now {offset}");
+
+        self.hi_scanner.offset = offset;
+        self.hi_scanner.ptr_idx = idx;
+        self.hi_scanner.stack.clear();
+        self.hi_scanner.base_key_offset = None;
+
+        self.fill_stack();
+
+        true
+    }
+
+    fn parse_current_item(
+        reader: &mut Cursor<&[u8]>,
+        offset: usize,
+        base_key_offset: Option<usize>,
+        is_restart: bool,
+    ) -> Option<Parsed> {
+        if is_restart {
+            Item::parse_full(reader, offset)
+        } else {
+            Item::parse_truncated(reader, offset, base_key_offset.expect("should exist"))
+        }
+    }
+
+    fn fill_stack(&mut self) {
+        let binary_index = self.get_binary_index_reader();
+
+        {
+            self.hi_scanner.offset = binary_index.get(self.hi_scanner.ptr_idx);
+
+            let offset = self.hi_scanner.offset;
+
+            // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+            // And the cursor starts at 0 - the slice is never empty
+            #[warn(unsafe_code)]
+            let mut reader = Cursor::new(unsafe { self.block.data.get_unchecked(offset..) });
+
+            if Item::parse_full(&mut reader, offset)
+                .inspect(|item| {
+                    self.hi_scanner.offset += reader.position() as usize;
+                    self.hi_scanner.base_key_offset = Some(item.key_offset());
+                })
+                .is_some()
+            {
+                self.hi_scanner.stack.push(offset);
+            }
+        }
+
+        for _ in 1..self.restart_interval {
+            let offset = self.hi_scanner.offset;
+
+            // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+            // And the cursor starts at 0 - the slice is never empty
+            #[warn(unsafe_code)]
+            let mut reader = Cursor::new(unsafe { self.block.data.get_unchecked(offset..) });
+
+            if Item::parse_truncated(
+                &mut reader,
+                offset,
+                self.hi_scanner.base_key_offset.expect("should exist"),
+            )
+            .inspect(|_| {
+                self.hi_scanner.offset += reader.position() as usize;
+            })
+            .is_some()
+            {
+                self.hi_scanner.stack.push(offset);
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn consume_stack_top(&mut self) -> Option<Parsed> {
+        let offset = self.hi_scanner.stack.pop()?;
+
+        if self.lo_scanner.offset > 0 && offset < self.lo_scanner.offset {
+            return None;
+        }
+
+        self.hi_scanner.offset = offset;
+
+        let is_restart = self.hi_scanner.stack.is_empty();
+
+        // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+        // And the cursor starts at 0 - the slice is never empty
+        #[warn(unsafe_code)]
+        let mut reader = Cursor::new(unsafe { self.block.data.get_unchecked(offset..) });
+
+        Self::parse_current_item(
+            &mut reader,
+            offset,
+            self.hi_scanner.base_key_offset,
+            is_restart,
+        )
+    }
+}
+
+impl<Item: Decodable<Parsed>, Parsed: ParsedItem<Item>> Iterator for Decoder<'_, Item, Parsed> {
+    type Item = Parsed;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.hi_scanner.base_key_offset.is_some()
+            && self.lo_scanner.offset >= self.hi_scanner.offset
+        {
+            eprintln!("damn, hi scanner is already at {}", self.hi_scanner.offset);
+            return None;
+        }
+
+        let is_restart: bool = self.lo_scanner.remaining_in_interval == 0;
+
+        // SAFETY: The cursor is advanced by read_ operations which check for EOF,
+        // And the cursor starts at 0 - the slice is never empty
+        #[warn(unsafe_code)]
+        let mut reader =
+            Cursor::new(unsafe { self.block.data.get_unchecked(self.lo_scanner.offset..) });
+
+        let item = Self::parse_current_item(
+            &mut reader,
+            self.lo_scanner.offset,
+            self.lo_scanner.base_key_offset,
+            is_restart,
+        )
+        .inspect(|item| {
+            self.lo_scanner.offset += reader.position() as usize;
+
+            if is_restart {
+                self.lo_scanner.base_key_offset = Some(item.key_offset());
+            }
+        });
+
+        if is_restart {
+            self.lo_scanner.remaining_in_interval = usize::from(self.restart_interval) - 1;
+        } else {
+            self.lo_scanner.remaining_in_interval -= 1;
+        }
+
+        item
+    }
+}
+
+impl<Item: Decodable<Parsed>, Parsed: ParsedItem<Item>> DoubleEndedIterator
+    for Decoder<'_, Item, Parsed>
+{
+    fn next_back(&mut self) -> Option<Self::Item> {
+        if let Some(top) = self.consume_stack_top() {
+            return Some(top);
+        }
+
+        // NOTE: If we wrapped, we are at the end
+        // This is safe to do, because there cannot be that many restart intervals
+        if self.hi_scanner.ptr_idx == usize::MAX {
+            return None;
+        }
+
+        self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1);
+
+        // NOTE: If we wrapped, we are at the end
+        // This is safe to do, because there cannot be that many restart intervals
+        if self.hi_scanner.ptr_idx == usize::MAX {
+            return None;
+        }
+
+        self.fill_stack();
+
+        self.consume_stack_top()
+    }
+}
diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs
index fbaa42fd..7a760630 100644
--- a/src/segment/block/mod.rs
+++ b/src/segment/block/mod.rs
@@ -4,6 +4,7 @@
 
 pub(crate) mod binary_index;
 mod checksum;
+pub mod decoder;
 mod encoder;
 pub mod hash_index;
 mod header;
@@ -11,6 +12,7 @@ mod offset;
 mod trailer;
 
 pub use checksum::Checksum;
+pub(crate) use decoder::{Decodable, Decoder, ParsedItem};
 pub(crate) use encoder::{Encodable, Encoder};
 pub use header::Header;
 pub use offset::BlockOffset;

From eefcd78e31b4e4d108bafad829d4dab033561bd8 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 26 Jun 2025 22:21:33 +0200
Subject: [PATCH 201/613] unsafe feature flag in hash index

---
 src/segment/block/hash_index/reader.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs
index 7b6df72d..54157b76 100644
--- a/src/segment/block/hash_index/reader.rs
+++ b/src/segment/block/hash_index/reader.rs
@@ -33,7 +33,7 @@ impl<'a> Reader<'a> {
         Self(&bytes[offset..end])
     }
 
-    // NOTE: Not used for performance reasons, so no need to be hyper-optimized
+    // NOTE: Only used in metrics, so no need to be hyper-optimized
     #[allow(clippy::naive_bytecount)]
     /// Returns the amount of empty slots in the hash index.
     #[must_use]
     pub fn free_count(&self) -> usize {
         self.0.iter().filter(|&&byte| byte == MARKER_FREE).count()
     }
 
-    // NOTE: Not used for performance reasons, so no need to be hyper-optimized
-    #[allow(clippy::naive_bytecount)]
+    // NOTE: Only used in metrics, so no need to be hyper-optimized
     /// Returns the amount of conflict markers in the hash index.
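+    ///
+    /// A conflicted bucket means that more than one key hashed to the same
+    /// slot, so point reads for those keys fall back to binary search
+    /// (see `Decoder::seek`).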
#[must_use] + #[allow(clippy::naive_bytecount)] pub fn conflict_count(&self) -> usize { self.0 .iter() @@ -64,8 +64,14 @@ impl<'a> Reader<'a> { // SAFETY: We use modulo in `calculate_bucket_position` #[allow(unsafe_code)] + #[cfg(feature = "use_unsafe")] let marker = unsafe { *self.0.get_unchecked(bucket_pos) }; + // SAFETY: We use modulo in `calculate_bucket_position` + #[allow(clippy::indexing_slicing)] + #[cfg(not(feature = "use_unsafe"))] + let marker = self.0[bucket_pos]; + match marker { MARKER_CONFLICT => Lookup::Conflicted, MARKER_FREE => Lookup::NotFound, From 88104d2bbdc2d1f676a7086c7e35e11b7cdbc35d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:21:52 +0200 Subject: [PATCH 202/613] use new double ended peekable --- src/mvcc_stream.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mvcc_stream.rs b/src/mvcc_stream.rs index 258d3225..b622fa53 100644 --- a/src/mvcc_stream.rs +++ b/src/mvcc_stream.rs @@ -2,15 +2,15 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use crate::double_ended_peekable::{DoubleEndedPeekable, DoubleEndedPeekableExt}; use crate::{InternalValue, UserKey}; -use double_ended_peekable::{DoubleEndedPeekable, DoubleEndedPeekableExt}; /// Consumes a stream of KVs and emits a new stream according to MVCC and tombstone rules /// /// This iterator is used for read operations. #[allow(clippy::module_name_repetitions)] pub struct MvccStream>> { - inner: DoubleEndedPeekable, + inner: DoubleEndedPeekable, I>, } impl>> MvccStream { From 12437838bc36663deb51a490c203574341d127ac Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:22:19 +0200 Subject: [PATCH 203/613] crate-level unwrap macro --- src/lib.rs | 10 ++++++++++ src/segment/block/binary_index/reader.rs | 9 +-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a0a0087d..9c1d3cc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -237,3 +237,13 @@ pub mod gc { GcReport as Report, GcStrategy as Strategy, SpaceAmpStrategy, StaleThresholdStrategy, }; } + +macro_rules! unwrappy { + ($x:expr) => { + $x.expect("should read") + + // unsafe { $x.unwrap_unchecked() } + }; +} + +pub(crate) use unwrappy; diff --git a/src/segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs index 8cbab201..a8acd484 100644 --- a/src/segment/block/binary_index/reader.rs +++ b/src/segment/block/binary_index/reader.rs @@ -2,16 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use crate::unwrappy; use byteorder::{LittleEndian, ReadBytesExt}; -macro_rules! 
unwrappy { - ($x:expr) => { - // $x.expect("should read") - - unsafe { $x.unwrap_unchecked() } - }; -} - pub struct Reader<'a> { bytes: &'a [u8], step_size: usize, From 2c0cdee04530f83a227977c8010431e80ba32368 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:22:51 +0200 Subject: [PATCH 204/613] refactor: block encoder --- src/segment/block/encoder.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 2eeead2b..d56dcd28 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -12,13 +12,13 @@ use super::{ }; use std::marker::PhantomData; -pub trait Encodable { +pub trait Encodable { fn key(&self) -> &[u8]; fn encode_full_into( &self, writer: &mut W, - state: &mut S, + state: &mut Context, ) -> crate::Result<()> where Self: Sized; @@ -26,7 +26,7 @@ pub trait Encodable { fn encode_truncated_into( &self, writer: &mut W, - state: &mut S, + state: &mut Context, shared_len: usize, ) -> crate::Result<()> where @@ -34,12 +34,15 @@ pub trait Encodable { } /// Block encoder -pub struct Encoder<'a, S: Default, T: Encodable> { - pub(crate) phantom: PhantomData<(S, T)>, +/// +/// The block encoder accepts an ascending stream of items, encodes them into +/// restart intervals and builds binary index (and optionally a hash index). +pub struct Encoder<'a, Context: Default, Item: Encodable> { + pub(crate) phantom: PhantomData<(Context, Item)>, pub(crate) writer: Vec, - pub(crate) state: S, + pub(crate) state: Context, pub(crate) item_count: usize, pub(crate) restart_count: usize, @@ -53,10 +56,13 @@ pub struct Encoder<'a, S: Default, T: Encodable> { base_key: &'a [u8], } -impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { +// TODO: support no binary index -> use in meta blocks with restart interval = 1 +// TODO: adjust test + fuzz tests to also test for no binary index + +impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> { pub fn new( item_count: usize, - restart_interval: u8, + restart_interval: u8, // TODO: should be NonZero hash_index_ratio: f32, first_key: &'a [u8], ) -> Self { @@ -68,7 +74,7 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { writer: Vec::new(), - state: S::default(), + state: Context::default(), item_count: 0, restart_count: 0, @@ -92,7 +98,7 @@ impl<'a, S: Default, T: Encodable> Encoder<'a, S, T> { self } - pub fn write(&mut self, item: &'a T) -> crate::Result<()> { + pub fn write(&mut self, item: &'a Item) -> crate::Result<()> { // NOTE: Check if we are a restart marker if self.item_count % usize::from(self.restart_interval) == 0 { self.restart_count += 1; From 761e39f5a4d6334be7494ec130ff1b5107c1345a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:24:13 +0200 Subject: [PATCH 205/613] refactor: data block iter --- src/segment/data_block/forward_reader.rs | 1538 ---------------------- src/segment/data_block/iter.rs | 1513 ++++++++++++--------- 2 files changed, 921 insertions(+), 2130 deletions(-) delete mode 100644 src/segment/data_block/forward_reader.rs diff --git a/src/segment/data_block/forward_reader.rs b/src/segment/data_block/forward_reader.rs deleted file mode 100644 index 09c9e7ad..00000000 --- a/src/segment/data_block/forward_reader.rs +++ /dev/null @@ -1,1538 +0,0 @@ -// Copyright (c) 2025-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::DataBlock; -use 
crate::{key::InternalKey, segment::util::compare_prefixed_slice, InternalValue, SeqNo, Slice}; -use std::io::{Cursor, Seek}; - -/// [start, end] slice indexes -#[derive(Debug)] -pub struct ParsedSlice(pub usize, pub usize); - -impl ParsedItem { - pub fn materialize(&self, bytes: &Slice) -> InternalValue { - // NOTE: We consider the prefix and key slice indexes to be trustworthy - #[allow(clippy::indexing_slicing)] - let key = if let Some(prefix) = &self.prefix { - let prefix_key = &bytes[prefix.0..prefix.1]; - let rest_key = &bytes[self.key.0..self.key.1]; - Slice::fused(prefix_key, rest_key) - } else { - bytes.slice(self.key.0..self.key.1) - }; - let key = InternalKey::new( - key, - self.seqno, - // NOTE: Value type is (or should be) checked when reading it - #[allow(clippy::expect_used)] - self.value_type.try_into().expect("should work"), - ); - - let value = self - .value - .as_ref() - .map_or_else(Slice::empty, |v| bytes.slice(v.0..v.1)); - - InternalValue { key, value } - } -} - -#[derive(Debug)] -pub struct ParsedItem { - pub value_type: u8, - pub seqno: SeqNo, - pub prefix: Option, - pub key: ParsedSlice, - pub value: Option, -} - -// TODO: flatten into main struct -#[derive(Default, Debug)] -struct LoScanner { - offset: usize, - remaining_in_interval: usize, - base_key_offset: Option, -} - -/// Specialized reader to scan an index block only in forwards direction -/// -/// Is less expensive than a double ended iterator. -pub struct ForwardReader<'a> { - block: &'a DataBlock, - restart_interval: usize, - lo_scanner: LoScanner, -} - -impl<'a> ForwardReader<'a> { - #[must_use] - pub fn new(block: &'a DataBlock) -> Self { - let restart_interval = block.restart_interval.into(); - - Self { - block, - - restart_interval, - - lo_scanner: LoScanner::default(), - } - } - - #[must_use] - pub fn offset(&self) -> usize { - self.lo_scanner.offset - } - - /// Reads an item by key from the block, if it exists. - #[must_use] - pub fn point_read(&mut self, needle: &[u8], seqno: SeqNo) -> Option { - let may_exist = self.seek(needle, seqno); - - if !may_exist { - return None; - } - - let bytes = self.block.bytes(); - - for item in &mut *self { - let cmp_result = if let Some(prefix) = &item.prefix { - let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; - let rest_key = unsafe { bytes.get_unchecked(item.key.0..item.key.1) }; - compare_prefixed_slice(prefix, rest_key, needle) - } else { - let key = unsafe { bytes.get_unchecked(item.key.0..item.key.1) }; - key.cmp(needle) - }; - - match cmp_result { - std::cmp::Ordering::Equal => { - if item.seqno < seqno { - let kv = item.materialize(&self.block.inner.data); - return Some(kv); - } - } - std::cmp::Ordering::Greater => { - // Already passed needle - return None; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - } - - None - } - - /// Seeks to the lowest item that is eligible based on the requested - /// needle and seqno. - /// - /// Returns `false` if `next()` can be safely skipped because the item definitely - /// does not exist. 
- pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool { - let binary_index = self.block.get_binary_index_reader(); - - // NOTE: Try hash index if it exists - if let Some(lookup) = self - .block - .get_hash_index_reader() - .map(|reader| reader.get(needle)) - { - use super::super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; - - match lookup { - Found(bucket_value) => { - let offset = binary_index.get(usize::from(bucket_value)); - self.lo_scanner.offset = offset; - self.linear_probe(needle, seqno); - return true; - } - NotFound => { - return false; - } - Conflicted => { - // NOTE: Fallback to binary search - } - } - } - - let Some(offset) = self - .block - .binary_search_for_offset(&binary_index, needle, seqno) - else { - return false; - }; - - self.lo_scanner.offset = offset; - - self.linear_probe(needle, seqno) - } - - fn linear_probe(&mut self, needle: &[u8], seqno: SeqNo /* TODO: use */) -> bool { - let bytes = self.block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(bytes); - - reader - .seek_relative(self.lo_scanner.offset as i64) - .expect("should be in bounds"); - - loop { - let Some(head) = DataBlock::parse_restart_item(&mut reader, 0) else { - return false; - }; - - let cmp_result = { - let key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; - key.cmp(needle) - }; - - match cmp_result { - std::cmp::Ordering::Equal => { - // TODO: return true - return true; - } - std::cmp::Ordering::Greater => { - // Already passed needle - - return false; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - - let base_key_offset = head.key.0; - self.lo_scanner.base_key_offset = Some(base_key_offset); - - self.lo_scanner.remaining_in_interval = self.restart_interval; - self.lo_scanner.offset = reader.position() as usize; - self.lo_scanner.remaining_in_interval -= 1; - - for _ in 0..(self.restart_interval - 1) { - let Some(head) = DataBlock::parse_truncated_item(&mut reader, 0, base_key_offset) - else { - return false; - }; - - let cmp_result = if let Some(prefix) = &head.prefix { - let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; - let rest_key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; - compare_prefixed_slice(prefix, rest_key, needle) - } else { - let key = unsafe { bytes.get_unchecked(head.key.0..head.key.1) }; - key.cmp(needle) - }; - - match cmp_result { - std::cmp::Ordering::Equal => { - return true; - } - std::cmp::Ordering::Greater => { - // Already passed needle - - return false; - } - std::cmp::Ordering::Less => { - // Continue to next KV - } - } - - self.lo_scanner.offset = reader.position() as usize; - self.lo_scanner.remaining_in_interval -= 1; - } - } - } - - fn parse_restart_item( - block: &DataBlock, - offset: &mut usize, - base_key_offset: &mut Option, - ) -> Option { - let bytes = block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) 
}); - - let item = DataBlock::parse_restart_item(&mut reader, *offset)?; - - *offset += reader.position() as usize; - *base_key_offset = Some(item.key.0); - - Some(item) - } - - fn parse_truncated_item( - block: &DataBlock, - offset: &mut usize, - base_key_offset: usize, - ) -> Option { - let bytes = block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); - - let item = DataBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; - - *offset += reader.position() as usize; - - Some(item) - } -} - -impl Iterator for ForwardReader<'_> { - type Item = ParsedItem; - - fn next(&mut self) -> Option { - let is_restart = self.lo_scanner.remaining_in_interval == 0; - - let item = if is_restart { - self.lo_scanner.remaining_in_interval = self.restart_interval; - - Self::parse_restart_item( - self.block, - &mut self.lo_scanner.offset, - &mut self.lo_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.lo_scanner.offset, - self.lo_scanner.base_key_offset.expect("should exist"), - ) - }; - - self.lo_scanner.remaining_in_interval -= 1; - - item - } -} - -#[cfg(test)] -#[allow(clippy::unwrap_used)] -mod tests { - use super::*; - use crate::{ - segment::{block::Header, Block, BlockOffset, Checksum}, - Slice, - ValueType::{Tombstone, Value}, - }; - use test_log::test; - - #[test] - fn v3_data_block_seek_too_low() -> crate::Result<()> { - let items = [ - InternalValue::from_components("b", "b", 0, Value), - InternalValue::from_components("c", "c", 0, Value), - InternalValue::from_components("d", "d", 1, Tombstone), - InternalValue::from_components("e", "e", 0, Value), - InternalValue::from_components("f", "f", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert!( - data_block.point_read(b"a", SeqNo::MAX).is_none(), - "should return None because a does not exist", - ); - - assert!( - data_block.point_read(b"b", SeqNo::MAX).is_some(), - "should return Some because b exists", - ); - - assert!( - data_block.point_read(b"z", SeqNo::MAX).is_none(), - "should return Some because z does not exist", - ); - - Ok(()) - } - - #[test] - fn v3_data_block_snapshot_read_first() -> crate::Result<()> { - let items = [InternalValue::from_components( - "hello", - "world", - 0, - crate::ValueType::Value, - )]; - - let bytes = DataBlock::encode_items(&items, 16, 0.0)?; - let serialized_len = bytes.len(); - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); - assert_eq!(data_block.inner.size(), serialized_len); - - assert_eq!(Some(items[0].clone()), data_block.point_read(b"hello", 777)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_one() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - )]; - - let bytes = DataBlock::encode_items(&items, 16, 0.0)?; - let serialized_len = bytes.len(); - - let 
data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); - assert_eq!(data_block.inner.size(), serialized_len); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, SeqNo::MAX), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - "pla:earth:fact", - "eaaaaaaaaarth", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:fact", - "Jupiter is big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:mass", - "Massive", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components( - "pla:jupiter:name", - "Jupiter", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, crate::ValueType::Value), - InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, crate::ValueType::Value), - InternalValue::from_components("pla:venus:fact", "", 1, crate::ValueType::Tombstone), - InternalValue::from_components( - "pla:venus:fact", - "Venus exists", - 0, - crate::ValueType::Value, - ), - InternalValue::from_components("pla:venus:name", "Venus", 0, crate::ValueType::Value), - ]; - - for restart_interval in 1..=20 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in &items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - } - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_1() -> crate::Result<()> { - let items = [ - InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), - InternalValue::from_components([0], b"", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_2() -> crate::Result<()> { - let items = [ - InternalValue::from_components([0], [], 5, Value), - InternalValue::from_components([0], [], 4, Tombstone), - InternalValue::from_components([0], [], 3, Value), - InternalValue::from_components([0], [], 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: 
bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_3() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - Slice::from([ - 255, 255, 255, 255, 5, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, - ]), - Slice::from([0, 0, 192]), - 18_446_744_073_701_163_007, - Tombstone, - ), - InternalValue::from_components( - Slice::from([255, 255, 255, 255, 255, 255, 0]), - Slice::from([]), - 0, - Value, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 5, 1.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_fuzz_4() -> crate::Result<()> { - let items = [ - InternalValue::from_components( - Slice::new(&[0]), - Slice::new(&[]), - 3_834_029_160_418_063_669, - Value, - ), - InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone), - InternalValue::from_components( - Slice::new(&[53, 53, 53]), - Slice::new(&[]), - 18_446_744_073_709_551_615, - Tombstone, - ), - InternalValue::from_components( - Slice::new(&[255]), - Slice::new(&[]), - 18_446_744_069_414_584_831, - Tombstone, - ), - InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 1.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for item in data_block.iter() { - eprintln!("{item:?}"); - } - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - Ok(()) - } - - #[test] - fn v3_data_block_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"b", b"b", 2, Value), - InternalValue::from_components(b"c", b"c", 1, Value), - InternalValue::from_components(b"d", b"d", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, SeqNo::MAX), - ); - } - - assert_eq!(None, 
data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_dense_mvcc_with_hash() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - InternalValue::from_components(b"b", b"b", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - InternalValue::from_components(b"b", b"b", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - Some(items.first().cloned().unwrap()), - data_block.point_read(b"a", SeqNo::MAX) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(b"b", SeqNo::MAX) - ); - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_1() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], SeqNo::MAX) - ); - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_2() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), 
Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], SeqNo::MAX) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], SeqNo::MAX) - ); - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_3() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], SeqNo::MAX) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], SeqNo::MAX) - ); - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_data_block_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), - InternalValue::from_components(Slice::from([233, 233]), 
Slice::from([]), 3, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), - InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), - InternalValue::from_components( - Slice::from([255, 255, 0]), - Slice::from([]), - 127_886_946_205_696, - Tombstone, - ), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - Some(items.get(1).cloned().unwrap()), - data_block.point_read(&[233, 233], SeqNo::MAX) - ); - assert_eq!( - Some(items.last().cloned().unwrap()), - data_block.point_read(&[255, 255, 0], SeqNo::MAX) - ); - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_dense_mvcc_no_hash() -> crate::Result<()> { - let items = [ - InternalValue::from_components(b"a", b"a", 3, Value), - InternalValue::from_components(b"a", b"a", 2, Value), - InternalValue::from_components(b"a", b"a", 1, Value), - InternalValue::from_components(b"b", b"b", 65, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - eprintln!("NEEDLE {needle:?}"); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_shadowing() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert!(data_block - .point_read(b"pla:venus:fact", SeqNo::MAX) - .expect("should exist") - .is_tombstone()); - - Ok(()) - } - - #[test] - fn v3_data_block_point_read_dense() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), - InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), - InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), - InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), - InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - 
InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward_one_time() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - Value, - )]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len() - ); - - assert_eq!(data_block.iter().collect::>(), items); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }, - items.len(), - ); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_forward_dense() -> crate::Result<()> { - let items = [InternalValue::from_components( - "pla:saturn:fact", - "Saturn is pretty big", - 0, - Value, - )]; - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - - assert_eq!(items.len(), { - #[allow(clippy::suspicious_map)] - data_block.iter().count() - }); - - assert_eq!(items, *data_block.iter().collect::>(),); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_rev() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - 
InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!(items.len(), { - #[allow(clippy::suspicious_map)] - data_block.iter().rev().count() - }); - - assert_eq!( - items.into_iter().rev().collect::>(), - data_block.iter().rev().collect::>(), - ); - - Ok(()) - } - - #[test] - fn v3_data_block_iter_ping_pong() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - { - let mut iter = data_block.iter(); - - assert_eq!(b"pla:saturn:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); - - let last = iter.next().unwrap().key; - assert_eq!(b"pla:venus:fact", &*last.user_key); - assert_eq!(Tombstone, last.value_type); - assert_eq!(1, last.seqno); - } - - { - let mut iter = data_block.iter(); - - assert_eq!(b"pla:venus:name", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:saturn:fact", - &*iter - .next() - .inspect(|v| { - eprintln!("{:?}", String::from_utf8_lossy(&v.key.user_key)); - }) - .unwrap() - .key - .user_key - ); - assert_eq!(b"pla:venus:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"pla:saturn:name", &*iter.next().unwrap().key.user_key); - - let last = iter.next_back().unwrap().key; - assert_eq!(b"pla:venus:fact", &*last.user_key); - assert_eq!(Tombstone, last.value_type); - assert_eq!(1, last.seqno); - } - - Ok(()) - } - - #[test] - fn v3_data_block_range() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - 
assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block.range(&((b"pla:venus:" as &[u8])..)).count() - }, - 3, - ); - - Ok(()) - } - - #[test] - fn v3_data_block_range_rev() -> crate::Result<()> { - let items = [ - InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), - InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), - InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), - InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), - InternalValue::from_components("pla:venus:name", "Venus", 0, Value), - ]; - - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - assert_eq!( - { - #[allow(clippy::suspicious_map)] - data_block - .range(&((b"pla:venus:" as &[u8])..)) - .rev() - .count() - }, - 3, - ); - - Ok(()) - } - - #[test] - fn v3_data_block_small_hash_ratio() -> crate::Result<()> { - let items = (0u64..254) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - // NOTE: If >0.0, buckets are at least 1 - let bytes = DataBlock::encode_items(&items, 1, 0.0001)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_just_enough_pointers_for_hash_bucket() -> crate::Result<()> { - let items = (0u64..254) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().unwrap() > 0); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - Ok(()) - } - - #[test] - fn v3_data_block_too_many_pointers_for_hash_bucket() -> crate::Result<()> { - let items = (0u64..255) - .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value)) - .collect::>(); - - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); - - for needle in items { - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), - ); - } - - Ok(()) - } - - #[test] - fn 
v3_data_block_way_too_many_pointers_for_hash_bucket() -> crate::Result<()> {
-        let items = (0u64..1_000)
-            .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value))
-            .collect::<Vec<_>>();
-
-        let bytes = DataBlock::encode_items(&items, 1, 1.33)?;
-
-        let data_block = DataBlock::new(Block {
-            data: bytes.into(),
-            header: Header {
-                checksum: Checksum::from_raw(0),
-                data_length: 0,
-                uncompressed_length: 0,
-                previous_block_offset: BlockOffset(0),
-            },
-        });
-
-        assert_eq!(data_block.len(), items.len());
-        assert!(data_block.hash_bucket_count().is_none());
-
-        for needle in items {
-            assert_eq!(
-                Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
-            );
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn v3_data_block_no_hash_index() -> crate::Result<()> {
-        let items = (0u64..1)
-            .map(|x| InternalValue::from_components(x.to_be_bytes(), x.to_be_bytes(), 0, Value))
-            .collect::<Vec<_>>();
-
-        let bytes = DataBlock::encode_items(&items, 1, 0.0)?;
-
-        let data_block = DataBlock::new(Block {
-            data: bytes.into(),
-            header: Header {
-                checksum: Checksum::from_raw(0),
-                data_length: 0,
-                uncompressed_length: 0,
-                previous_block_offset: BlockOffset(0),
-            },
-        });
-
-        assert_eq!(data_block.len(), items.len());
-        assert!(data_block.hash_bucket_count().is_none());
-
-        for needle in items {
-            assert_eq!(
-                Some(needle.clone()),
-                data_block.point_read(&needle.key.user_key, needle.key.seqno + 1),
-            );
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs
index a0d52863..d6d297da 100644
--- a/src/segment/data_block/iter.rs
+++ b/src/segment/data_block/iter.rs
@@ -2,242 +2,761 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
-use super::{
-    forward_reader::{ForwardReader, ParsedItem},
-    DataBlock,
+use crate::{
+    double_ended_peekable::{DoubleEndedPeekable, DoubleEndedPeekableExt},
+    segment::{
+        block::{Decoder, ParsedItem},
+        data_block::DataBlockParsedItem,
+    },
+    InternalValue,
 };
-use std::io::Cursor;
-
-#[derive(Debug)]
-struct HiScanner {
-    offset: usize,
-    ptr_idx: usize,
-    stack: Vec<usize>, // TODO: SmallVec?
-    base_key_offset: Option<usize>,
-}
-/// Double-ended iterator over data blocks
+// TODO: rename
 pub struct Iter<'a> {
-    block: &'a DataBlock,
-    restart_interval: usize,
-
-    lo_scanner: ForwardReader<'a>,
-    hi_scanner: HiScanner,
+    bytes: &'a [u8],
+    decoder:
+        DoubleEndedPeekable<Decoder<'a, InternalValue, DataBlockParsedItem>>,
 }
 
 impl<'a> Iter<'a> {
     #[must_use]
-    pub fn new(block: &'a DataBlock) -> Self {
-        let restart_interval = block.restart_interval.into();
-        let binary_index_len = block.binary_index_len as usize;
+    pub fn new(bytes: &'a [u8], decoder: Decoder<'a, InternalValue, DataBlockParsedItem>) -> Self {
+        let decoder = decoder.double_ended_peekable();
+        Self { bytes, decoder }
+    }
-        Self {
-            block,
+    pub fn seek(&mut self, needle: &[u8]) -> bool {
+        if !self
+            .decoder
+            .inner_mut()
+            .seek(needle, |head_key| head_key < needle, false)
+        {
+            return false;
+        }
-            restart_interval,
+        // TODO: make sure we only linear scan over the current restart interval
+        // TODO: if the scan needs more steps than that, the seek above has likely overshot
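For context on the two TODOs above: the lookup is two-phase — a binary search over the restart heads, then a short linear scan that should never need to leave the chosen restart interval. A std-only sketch of that shape, assuming full keys at restart points; `seek_two_phase`, `RESTART_INTERVAL` and the flat `entries` slice are illustrative, not the crate's API:

const RESTART_INTERVAL: usize = 4;

// Illustrative two-phase lookup: `restarts` holds the first key of each
// restart interval, `entries` holds every key in block order.
fn seek_two_phase(restarts: &[&[u8]], entries: &[&[u8]], needle: &[u8]) -> Option<usize> {
    // Phase 1: binary search for the interval whose head is the greatest key <= needle.
    let interval = restarts
        .partition_point(|head| *head <= needle)
        .checked_sub(1)?; // needle sorts before the entire block
    // Phase 2: bounded linear scan from the restart point. Because heads are
    // sorted, an exact match can only live inside this one interval.
    let start = interval * RESTART_INTERVAL;
    entries[start..]
        .iter()
        .take(RESTART_INTERVAL)
        .position(|k| *k == needle)
        .map(|i| start + i)
}

fn main() {
    let entries: [&[u8]; 6] = [b"b", b"c", b"d", b"e", b"f", b"g"];
    let restarts: Vec<&[u8]> = entries.iter().step_by(RESTART_INTERVAL).copied().collect();
    assert_eq!(Some(2), seek_two_phase(&restarts, &entries, b"d"));
    assert_eq!(Some(4), seek_two_phase(&restarts, &entries, b"f"));
    assert_eq!(None, seek_two_phase(&restarts, &entries, b"a"));
}

A scan that steps past its interval is exactly the "something has gone wrong" case: phase 1 must then have picked the wrong head.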
-            lo_scanner: ForwardReader::new(block),
+        // Linear scan
+        loop {
+            let Some(item) = self.decoder.peek() else {
+                return false;
+            };
-            /* lo_scanner: LoScanner::default(), */
-            hi_scanner: HiScanner {
-                offset: 0,
-                ptr_idx: binary_index_len,
-                stack: Vec::new(),
-                base_key_offset: None,
-            },
+            match item.compare_key(needle, self.bytes) {
+                std::cmp::Ordering::Equal => {
+                    return true;
+                }
+                std::cmp::Ordering::Greater => {
+                    return false;
+                }
+                std::cmp::Ordering::Less => {
+                    // Continue
+
+                    self.decoder.next().expect("should exist");
+                }
+            }
+        }
+    }
+
+    pub fn seek_upper(&mut self, needle: &[u8]) -> bool {
+        if !self
+            .decoder
+            .inner_mut()
+            .seek_upper(|head_key| head_key <= needle, false)
+        {
+            return false;
+        }
+
+        // Linear scan
+        loop {
+            let Some(item) = self.decoder.peek_back() else {
+                return false;
+            };
+
+            match item.compare_key(needle, self.bytes) {
+                std::cmp::Ordering::Equal => {
+                    return true;
+                }
+                std::cmp::Ordering::Less => {
+                    return false;
+                }
+                std::cmp::Ordering::Greater => {
+                    // Continue
+
+                    self.decoder.next_back().expect("should exist");
+                }
+            }
+        }
+    }
+}
+
+impl Iterator for Iter<'_> {
+    type Item = DataBlockParsedItem;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.decoder.next()
+    }
+}
+
+impl DoubleEndedIterator for Iter<'_> {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        self.decoder.next_back()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        segment::{
+            block::{Header, ParsedItem},
+            Block, BlockOffset, DataBlock,
+        },
+        Checksum, InternalValue, Slice,
+        ValueType::{Tombstone, Value},
+    };
+    use test_log::test;
+
+    #[test]
+    fn v3_data_block_iter_forward() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components("b", "b", 0, Value),
+            InternalValue::from_components("c", "c", 0, Value),
+            InternalValue::from_components("d", "d", 1, Tombstone),
+            InternalValue::from_components("e", "e", 0, Value),
+            InternalValue::from_components("f", "f", 0, Value),
+        ];
+
+        for restart_interval in 1..=16 {
+            let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?;
+
+            let data_block = DataBlock::new(Block {
+                data: bytes.into(),
+                header: Header {
+                    checksum: Checksum::from_raw(0),
+                    data_length: 0,
+                    uncompressed_length: 0,
+                    previous_block_offset: BlockOffset(0),
+                },
+            });
+
+            let iter = data_block
+                .iter()
+                .map(|item| item.materialize(&data_block.inner.data));
+
+            let real_items: Vec<_> = iter.collect();
+
+            assert_eq!(items, &*real_items);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn v3_data_block_iter_rev() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components("b", "b", 0, Value),
+            InternalValue::from_components("c", "c", 0, Value),
+            InternalValue::from_components("d", "d", 1, Tombstone),
+            InternalValue::from_components("e", "e", 0, Value),
+            InternalValue::from_components("f", "f", 0, Value),
+        ];
+
+        for restart_interval in 1..=16 {
+            let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?;
+
+            let data_block = DataBlock::new(Block {
+                data: bytes.into(),
+                header: Header {
+                    checksum: Checksum::from_raw(0),
+                    data_length: 0,
+                    uncompressed_length: 0,
+                    previous_block_offset: BlockOffset(0),
+                },
+            });
+
+            let iter = data_block
+                .iter()
+                .rev()
+                .map(|item| item.materialize(&data_block.inner.data));
+
+            let real_items: Vec<_> = iter.collect();
+
+            assert_eq!(
+                items.iter().rev().cloned().collect::<Vec<_>>(),
+                &*real_items,
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn v3_data_block_iter_rev_seek_back() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components("b", "b", 0, Value),
InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.rev().collect(); + + assert_eq!( + items.iter().rev().skip(2).cloned().collect::>(), + &*real_items, + ); } + + Ok(()) } - /* pub fn with_offset(mut self, offset: usize) -> Self { - self.lo_scanner.offset = offset; - self - } */ + #[test] + fn v3_data_block_iter_range_edges() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + { + let mut iter = data_block.iter(); + + assert!(!iter.seek(b"a"), "should not seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!(items.to_vec(), &*real_items); + } + + { + let mut iter = data_block.iter(); + + assert!(!iter.seek_upper(b"g"), "should not seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!(items.to_vec(), &*real_items); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"b"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().take(1).cloned().collect::>(), + &*real_items, + ); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek(b"f"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().rev().take(1).cloned().collect::>(), + &*real_items, + ); + } + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_range() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); + + 
assert!(iter.seek(b"c"), "should seek"); + assert!(iter.seek_upper(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().skip(1).take(2).cloned().collect::>(), + &*real_items, + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_only_first() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"b"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().take(1).cloned().collect::>(), + &*real_items, + ); + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_range_same_key() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + { + let mut iter = data_block.iter(); + + assert!(iter.seek(b"d"), "should seek"); + assert!(iter.seek_upper(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().skip(2).take(1).cloned().collect::>(), + &*real_items, + ); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!( + items.iter().skip(2).take(1).cloned().collect::>(), + &*real_items, + ); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek(b"d"), "should seek"); + assert!(iter.seek_upper(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.rev().collect(); + + assert_eq!( + items + .iter() + .rev() + .skip(2) + .take(1) + .cloned() + .collect::>(), + &*real_items, + ); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek(b"d"), "should seek"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.rev().collect(); + + assert_eq!( + items + .iter() + .rev() + .skip(2) + .take(1) + .cloned() + .collect::>(), + &*real_items, + ); + } + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_range_empty() -> 
crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + { + let mut iter = data_block.iter(); + + assert!(iter.seek(b"f"), "should seek"); + iter.seek_upper(b"e"); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + assert!(iter.next().is_none(), "iter should be empty"); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek(b"f"), "should seek"); + iter.seek_upper(b"e"); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + assert!(iter.next_back().is_none(), "iter should be empty"); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"e"), "should seek"); + iter.seek(b"f"); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + assert!(iter.next_back().is_none(), "iter should be empty"); + } + + { + let mut iter = data_block.iter(); + + assert!(iter.seek_upper(b"e"), "should seek"); + iter.seek(b"f"); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + assert!(iter.next_back().is_none(), "iter should be empty"); + } + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward_seek_restart_head() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); + + assert!(iter.seek(b"b"), "should seek correctly"); + + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); + + let real_items: Vec<_> = iter.collect(); + + assert_eq!(items, &*real_items); + } + + Ok(()) + } + + #[test] + fn v3_data_block_iter_forward_seek_in_interval() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); - fn parse_restart_item( - block: &DataBlock, - offset: &mut usize, - base_key_offset: &mut Option, - ) -> Option { - let bytes = block.bytes(); + 
assert!(iter.seek(b"d"), "should seek correctly"); - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); - let item = DataBlock::parse_restart_item(&mut reader, *offset)?; + let real_items: Vec<_> = iter.collect(); - *offset += reader.position() as usize; - *base_key_offset = Some(item.key.0); + assert_eq!( + items.iter().skip(2).cloned().collect::>(), + real_items, + ); + } - Some(item) + Ok(()) } - fn parse_truncated_item( - block: &DataBlock, - offset: &mut usize, - base_key_offset: usize, - ) -> Option { - let bytes = block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); + #[test] + fn v3_data_block_iter_forward_seek_last() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; - let item = DataBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; - *offset += reader.position() as usize; + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - Some(item) - } + let mut iter = data_block.iter(); - fn consume_stack_top(&mut self) -> Option { - if let Some(offset) = self.hi_scanner.stack.pop() { - if self.lo_scanner.offset() > 0 && offset < self.lo_scanner.offset() { - return None; - } + assert!(iter.seek(b"f"), "should seek correctly"); - self.hi_scanner.offset = offset; - - let is_restart = self.hi_scanner.stack.is_empty(); - - if is_restart { - Self::parse_restart_item( - self.block, - &mut self.hi_scanner.offset, - &mut self.hi_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.hi_scanner.offset, - self.hi_scanner.base_key_offset.expect("should exist"), - ) - } - } else { - None - } - } -} + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); -impl Iterator for Iter<'_> { - type Item = ParsedItem; + let real_items: Vec<_> = iter.collect(); - fn next(&mut self) -> Option { - if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset() >= self.hi_scanner.offset - { - return None; + assert_eq!( + items.iter().skip(4).cloned().collect::>(), + real_items, + ); } - /* let is_restart = self.lo_scanner.remaining_in_interval == 0; + Ok(()) + } - let item = if is_restart { - self.lo_scanner.remaining_in_interval = self.restart_interval; + #[test] + fn v3_data_block_iter_forward_seek_before_first() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; - Self::parse_restart_item( - 
self.block, - &mut self.lo_scanner.offset, - &mut self.lo_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.lo_scanner.offset, - self.lo_scanner.base_key_offset.expect("should exist"), - ) - }; + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; - self.lo_scanner.remaining_in_interval -= 1; */ + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - let item = self.lo_scanner.next(); + let mut iter = data_block.iter(); - if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset() >= self.hi_scanner.offset - { - return None; - } + assert!(!iter.seek(b"a"), "should not find exact match"); - item - } -} + let iter = iter.map(|item| item.materialize(&data_block.inner.data)); -impl DoubleEndedIterator for Iter<'_> { - fn next_back(&mut self) -> Option { - if let Some(top) = self.consume_stack_top() { - return Some(top); - } + let real_items: Vec<_> = iter.collect(); - // NOTE: If we wrapped, we are at the end - // This is safe to do, because there cannot be that many restart intervals - if self.hi_scanner.ptr_idx == usize::MAX { - return None; + assert_eq!(items, &*real_items); } - self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1); + Ok(()) + } - // NOTE: If we wrapped, we are at the end - // This is safe to do, because there cannot be that many restart intervals - if self.hi_scanner.ptr_idx == usize::MAX { - return None; - } + #[test] + fn v3_data_block_iter_forward_seek_after_last() -> crate::Result<()> { + let items = [ + InternalValue::from_components("b", "b", 0, Value), + InternalValue::from_components("c", "c", 0, Value), + InternalValue::from_components("d", "d", 1, Tombstone), + InternalValue::from_components("e", "e", 0, Value), + InternalValue::from_components("f", "f", 0, Value), + ]; - let binary_index = self.block.get_binary_index_reader(); + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; - { - self.hi_scanner.offset = binary_index.get(self.hi_scanner.ptr_idx); - let offset = self.hi_scanner.offset; - - if Self::parse_restart_item( - self.block, - &mut self.hi_scanner.offset, - &mut self.hi_scanner.base_key_offset, - ) - .is_some() - { - self.hi_scanner.stack.push(offset); - } - } + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - for _ in 1..self.restart_interval { - let offset = self.hi_scanner.offset; + let mut iter = data_block.iter(); - if Self::parse_truncated_item( - self.block, - &mut self.hi_scanner.offset, - self.hi_scanner.base_key_offset.expect("should exist"), - ) - .is_some() - { - self.hi_scanner.stack.push(offset); - } - } + assert!(!iter.seek(b"g"), "should not find exact match"); - if self.hi_scanner.stack.is_empty() { - return None; + assert!(iter.next().is_none(), "should not collect any items"); } - self.consume_stack_top() + Ok(()) } -} - -#[cfg(test)] -#[allow(clippy::unwrap_used, clippy::expect_used)] -mod tests { - use super::*; - use crate::{ - segment::{ - block::{BlockOffset, Checksum, Header}, - Block, - }, - InternalValue, - ValueType::Value, - }; - use test_log::test; #[test] - fn v3_data_block_consume_last_back() -> crate::Result<()> { + 
#[allow(clippy::unwrap_used)] + fn v3_data_block_iter_consume_last_back() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), @@ -246,54 +765,60 @@ mod tests { InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); - { - let mut iter = data_block.iter(); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:radius", - &*iter.next_back().unwrap().key.user_key - ); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } + { + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } - { - let mut iter = data_block.iter(); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:radius", - &*iter.next_back().unwrap().key.user_key - ); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); + { + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + + assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); + assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next_back().unwrap().key.user_key + ); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } } Ok(()) } #[test] - fn v3_data_block_consume_last_forwards() -> crate::Result<()> { + #[allow(clippy::unwrap_used)] + fn v3_data_block_iter_consume_last_forwards() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), 
InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), @@ -302,66 +827,77 @@ mod tests { InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - assert_eq!(data_block.len(), items.len()); - assert!(data_block.hash_bucket_count().is_none()); + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); - { - let mut iter = data_block.iter().rev(); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:fact", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - } + { + let mut iter = data_block + .iter() + .rev() + .map(|item| item.materialize(&data_block.inner.data)); + + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + } - { - let mut iter = data_block.iter().rev(); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); - assert_eq!( - b"pla:jupiter:fact", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!( - b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key - ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); + { + let mut iter = data_block + .iter() + .rev() + .map(|item| item.materialize(&data_block.inner.data)); + + assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next_back().unwrap().key.user_key + ); + assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert!(iter.next_back().is_none()); + assert!(iter.next().is_none()); + } } Ok(()) } #[test] - fn v3_data_block_ping_pong_exhaust() -> crate::Result<()> { + #[allow(clippy::unwrap_used)] + fn v3_data_block_iter_ping_pong_exhaust() -> crate::Result<()> { let items = [ InternalValue::from_components("a", "a", 0, Value), InternalValue::from_components("b", "b", 0, Value), @@ -387,7 +923,10 @@ mod tests { 
assert!(data_block.hash_bucket_count().is_none()); { - let mut iter = data_block.iter(); + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + assert_eq!(b"a", &*iter.next().unwrap().key.user_key); assert_eq!(b"b", &*iter.next().unwrap().key.user_key); assert_eq!(b"c", &*iter.next().unwrap().key.user_key); @@ -398,7 +937,10 @@ mod tests { } { - let mut iter = data_block.iter(); + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key); assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key); assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key); @@ -409,7 +951,10 @@ mod tests { } { - let mut iter = data_block.iter(); + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + assert_eq!(b"a", &*iter.next().unwrap().key.user_key); assert_eq!(b"b", &*iter.next().unwrap().key.user_key); assert_eq!(b"c", &*iter.next().unwrap().key.user_key); @@ -422,7 +967,10 @@ mod tests { } { - let mut iter = data_block.iter(); + let mut iter = data_block + .iter() + .map(|item| item.materialize(&data_block.inner.data)); + assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key); assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key); assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key); @@ -438,329 +986,110 @@ mod tests { Ok(()) } - /* #[test] - fn v3_data_block_ping_pongs() -> crate::Result<()>{ - use crate::{UserKey, UserValue}; - - - pub struct BinaryCodeIterator { - length: usize, - current_number: u128, // Use u128 to support lengths up to 128 bits - max_number: u128, - } - - impl BinaryCodeIterator { - /// Creates a new iterator for all binary codes of a given length. - /// - /// # Panics - /// Panics if `length` is greater than 128, as `u128` cannot hold - /// numbers with more than 128 bits. - pub fn new(length: usize) -> Self { - if length > 128 { - panic!("Length too large for u128 to represent all combinations."); - } - let max_number = if length == 0 { - 0 // Special case for length 0, only one combination (empty vector) - } else { - (1 << length) - 1 // 2^len - 1 is the maximum value for a 'len'-bit number - }; - BinaryCodeIterator { - length, - current_number: 0, - max_number, - } - } - } - - impl Iterator for BinaryCodeIterator { - // The iterator will yield Vec where each u8 is either 0 or 1. - type Item = Vec; - - fn next(&mut self) -> Option { - if self.current_number > self.max_number { - return None; // All codes have been generated - } + #[test] + fn v3_data_block_iter_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::from([ + 255, 255, 255, 255, 5, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, + ]), + Slice::from([0, 0, 192]), + 18_446_744_073_701_163_007, + Tombstone, + ), + InternalValue::from_components( + Slice::from([255, 255, 255, 255, 255, 255, 0]), + Slice::from([]), + 0, + Value, + ), + ]; - // Convert the current_number into a binary Vec - let mut code = Vec::with_capacity(self.length); - if self.length == 0 { - // For length 0, only one item: an empty vector - // We've handled max_number=0 already, so this will only run once. - } else { - // Iterate from the least significant bit (LSB) to the most significant bit (MSB) - // or from MSB to LSB depending on desired order. 
- // This implementation generates from MSB to LSB to match typical binary representation - // e.g., 0b101 -> [1, 0, 1] - for i in (0..self.length).rev() { - // Check if the i-th bit is set - if (self.current_number >> i) & 1 == 1 { - code.push(1); - } else { - code.push(0); - } - } - } + let bytes = DataBlock::encode_items(&items, 5, 1.0)?; + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - // Increment for the next iteration - self.current_number += 1; + assert_eq!(data_block.len(), items.len()); + assert!( + data_block + .hash_bucket_count() + .expect("should have built hash index") + > 0, + ); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), + ); - Some(code) - } + Ok(()) } - let items = [ - InternalValue::from_components(UserKey::from([22, 192]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 193]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 194]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 195]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 196]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 197]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 198]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 199]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 200]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 201]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 202]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 203]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 204]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 205]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 206]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 207]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 208]), UserValue::from([]), 0, Value), - InternalValue::from_components(UserKey::from([22, 209]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 210]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 211]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 212]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 213]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 214]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 215]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 216]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 217]), UserValue::from([]), 0, Value), - // InternalValue::from_components(UserKey::from([0, 0, 0, 0, 0, 0, 22, 218]), 
UserValue::from([]), 0, Value),
-    //  […] ~180 further commented-out `InternalValue::from_components` entries
-    //  (keys [0, 0, 0, 0, 0, 0, 22, 219] through [0, 0, 0, 0, 0, 0, 23, 143]) elided
-        ];
-
-        let bytes = DataBlock::encode_items(&items, 1, 0.0)?;
-
-        let data_block = DataBlock::new(Block {
-            data: bytes.into(),
-            header: Header {
-                checksum: Checksum::from_raw(0),
-                data_length: 0,
-                uncompressed_length: 0,
-                previous_block_offset: BlockOffset(0),
-            },
-        });
-
-        assert_eq!(data_block.len(), items.len());
-        assert!(data_block.hash_bucket_count().is_none());
-
-        for code in BinaryCodeIterator::new(items.len()) {
-            let mut iter = data_block.iter();
-
-            for &x in &code {
-                log::warn!("code: {code:?}");
-
-                if x % 2 == 0 {
-                    eprintln!("[{x}] next");
-
-                    let Some(_) = iter.next() else {
-                        break;
-                    };
+    #[test]
+    fn v3_data_block_iter_fuzz_4() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components(
+                Slice::new(&[0]),
+                Slice::new(&[]),
+                3_834_029_160_418_063_669,
+                Value,
+            ),
+            InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone),
+            InternalValue::from_components(
+                Slice::new(&[53, 53, 53]),
+                Slice::new(&[]),
+                18_446_744_073_709_551_615,
+                Tombstone,
+            ),
+
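+            // NOTE: the seqnos at (and just below) u64::MAX here were
+            // presumably chosen by the fuzzer to exercise the widest,
+            // 10-byte case of the varint seqno encoding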
InternalValue::from_components( + Slice::new(&[255]), + Slice::new(&[]), + 18_446_744_069_414_584_831, + Tombstone, + ), + InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), + ]; - // count += 1; - } else { - eprintln!("[{x}] next_back"); + let bytes = DataBlock::encode_items(&items, 2, 1.0)?; - let Some(_) = iter.next_back() else { - break; - }; + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - // count += 1; - } - } - } + assert_eq!(data_block.len(), items.len()); + assert!( + data_block + .hash_bucket_count() + .expect("should have built hash index") + > 0, + ); + + assert_eq!( + { + #[allow(clippy::suspicious_map)] + data_block.iter().count() + }, + items.len(), + ); - Ok(()) - } */ + Ok(()) + } } From 3ab995a1f6fee036b70ab51b5511c0910770839d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:24:29 +0200 Subject: [PATCH 206/613] refactor: data block --- src/segment/data_block/mod.rs | 1183 ++++++++++++++++++++++++--------- 1 file changed, 864 insertions(+), 319 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 441d5312..c7420d74 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -2,25 +2,138 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -pub mod forward_reader; mod iter; pub use iter::Iter; use super::block::{ - binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader, Block, - Encodable, Encoder, Trailer, TRAILER_START_MARKER, + Block, Decodable, Decoder, Encodable, Encoder, ParsedItem as Parsy, Trailer, + TRAILER_START_MARKER, }; -use crate::clipping_iter::ClippingIter; -use crate::{InternalValue, SeqNo, ValueType}; +use crate::key::InternalKey; +use crate::segment::util::{compare_prefixed_slice, SliceIndexes}; +use crate::{unwrappy, InternalValue, SeqNo, Slice, ValueType}; use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; -use forward_reader::{ForwardReader, ParsedItem, ParsedSlice}; +use std::io::Cursor; use std::io::Seek; -use std::ops::RangeBounds; -use std::{cmp::Reverse, io::Cursor}; use varint_rs::{VarintReader, VarintWriter}; +impl Decodable for InternalValue { + fn parse_restart_key<'a>( + reader: &mut Cursor<&[u8]>, + offset: usize, + data: &'a [u8], + ) -> Option<&'a [u8]> { + let value_type = unwrappy!(reader.read_u8()); + + if value_type == TRAILER_START_MARKER { + return None; + } + + let _seqno = unwrappy!(reader.read_u64_varint()); + + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(key_len as i64)); + + data.get(key_start..(key_start + key_len)) + } + + fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { + let value_type = unwrappy!(reader.read_u8()); + + if value_type == TRAILER_START_MARKER { + return None; + } + + let seqno = unwrappy!(reader.read_u64_varint()); + + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(key_len as i64)); + + let val_len: usize = if value_type == u8::from(ValueType::Value) { + unwrappy!(reader.read_u32_varint()) as usize + } else { + 0 + }; + let val_offset = offset + reader.position() as usize; + 
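+        // NOTE: for `Value` entries the cursor now sits on the first value
+        // byte; the seek below skips over it, so parsing only records offsets
+        // into the block and never copies key or value bytes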
unwrappy!(reader.seek_relative(val_len as i64)); + + Some(if value_type == u8::from(ValueType::Value) { + DataBlockParsedItem { + value_type, + seqno, + prefix: None, + key: SliceIndexes(key_start, key_start + key_len), + value: Some(SliceIndexes(val_offset, val_offset + val_len)), + } + } else { + DataBlockParsedItem { + value_type, + seqno, + prefix: None, + key: SliceIndexes(key_start, key_start + key_len), + value: None, // TODO: enum value/tombstone, so value is not Option for values + } + }) + } + + fn parse_truncated( + reader: &mut Cursor<&[u8]>, + offset: usize, + base_key_offset: usize, + ) -> Option { + let value_type = unwrappy!(reader.read_u8()); + + if value_type == TRAILER_START_MARKER { + return None; + } + + let seqno = unwrappy!(reader.read_u64_varint()); + + let shared_prefix_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let rest_key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + + let key_offset = offset + reader.position() as usize; + + unwrappy!(reader.seek_relative(rest_key_len as i64)); + + let val_len: usize = if value_type == u8::from(ValueType::Value) { + unwrappy!(reader.read_u32_varint()) as usize + } else { + 0 + }; + let val_offset = offset + reader.position() as usize; + unwrappy!(reader.seek_relative(val_len as i64)); + + Some(if value_type == u8::from(ValueType::Value) { + DataBlockParsedItem { + value_type, + seqno, + prefix: Some(SliceIndexes( + base_key_offset, + base_key_offset + shared_prefix_len, + )), + key: SliceIndexes(key_offset, key_offset + rest_key_len), + value: Some(SliceIndexes(val_offset, val_offset + val_len)), + } + } else { + DataBlockParsedItem { + value_type, + seqno, + prefix: Some(SliceIndexes( + base_key_offset, + base_key_offset + shared_prefix_len, + )), + key: SliceIndexes(key_offset, key_offset + rest_key_len), + value: None, + } + }) + } +} + impl Encodable<()> for InternalValue { fn encode_full_into( &self, @@ -93,74 +206,83 @@ impl Encodable<()> for InternalValue { } } -// TODO: allow disabling binary index (for meta block) -// -> saves space in metadata blocks -// -> point reads then need to use iter().find() to find stuff (which is fine) +#[derive(Debug)] +pub struct DataBlockParsedItem { + pub value_type: u8, + pub seqno: SeqNo, + pub prefix: Option, + pub key: SliceIndexes, + pub value: Option, +} -macro_rules! 
unwrappy { - ($x:expr) => { - $x.expect("should read") +impl Parsy for DataBlockParsedItem { + fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8] { + debug_assert!(self.prefix.is_none(), "can only get key of restart heads"); - // unsafe { $x.unwrap_unchecked() } - }; + unwrappy!(bytes.get(self.key.0..self.key.1)) + } + + fn compare_key(&self, needle: &[u8], bytes: &[u8]) -> std::cmp::Ordering { + if let Some(prefix) = &self.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(self.key.0..self.key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(self.key.0..self.key.1) }; + key.cmp(needle) + } + } + + fn key_offset(&self) -> usize { + self.key.0 + } + + fn materialize(&self, bytes: &Slice) -> InternalValue { + // NOTE: We consider the prefix and key slice indexes to be trustworthy + #[allow(clippy::indexing_slicing)] + let key = if let Some(prefix) = &self.prefix { + let prefix_key = &bytes[prefix.0..prefix.1]; + let rest_key = &bytes[self.key.0..self.key.1]; + Slice::fused(prefix_key, rest_key) + } else { + bytes.slice(self.key.0..self.key.1) + }; + let key = InternalKey::new( + key, + self.seqno, + // NOTE: Value type is (or should be) checked when reading it + #[allow(clippy::expect_used)] + self.value_type.try_into().expect("should work"), + ); + + let value = self + .value + .as_ref() + .map_or_else(Slice::empty, |v| bytes.slice(v.0..v.1)); + + InternalValue { key, value } + } } +// TODO: allow disabling binary index (for meta block) +// -> saves space in metadata blocks +// -> point reads then need to use iter().find() to find stuff (which is fine) + /// Block that contains key-value pairs (user data) #[derive(Clone)] pub struct DataBlock { pub inner: Block, - - // Cached metadata - restart_interval: u8, - - binary_index_step_size: u8, - binary_index_offset: u32, - binary_index_len: u32, - - hash_index_offset: u32, - hash_index_len: u32, } impl DataBlock { #[must_use] pub fn new(inner: Block) -> Self { - let trailer = Trailer::new(&inner); - - // NOTE: Skip item count (u32) - let offset = std::mem::size_of::(); - let mut reader = unwrappy!(trailer.as_slice().get(offset..)); - - let restart_interval = unwrappy!(reader.read_u8()); - - let binary_index_step_size = unwrappy!(reader.read_u8()); - let binary_index_offset = unwrappy!(reader.read_u32::()); - let binary_index_len = unwrappy!(reader.read_u32::()); - - let hash_index_offset = unwrappy!(reader.read_u32::()); - let hash_index_len = unwrappy!(reader.read_u32::()); - - debug_assert!( - binary_index_step_size == 2 || binary_index_step_size == 4, - "invalid binary index step size", - ); - - Self { - inner, - - restart_interval, - - binary_index_step_size, - binary_index_offset, - binary_index_len, - - hash_index_offset, - hash_index_len, - } + Self { inner } } /// Access the inner raw bytes #[must_use] - fn bytes(&self) -> &[u8] { + pub fn as_slice(&self) -> &Slice { &self.inner.data } @@ -170,53 +292,64 @@ impl DataBlock { self.inner.size() } + // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno)) #[must_use] - #[allow(clippy::iter_without_into_iter)] - pub fn iter(&self) -> impl DoubleEndedIterator + '_ { - Iter::new(self).map(|kv| kv.materialize(&self.inner.data)) - } - - #[allow(clippy::iter_without_into_iter)] - pub fn scan(&self) -> impl Iterator + '_ { - ForwardReader::new(self).map(|kv| kv.materialize(&self.inner.data)) - } - - pub fn range<'a, K: AsRef<[u8]> 
+ 'a, R: RangeBounds + 'a>( - &'a self, - range: &'a R, - ) -> impl DoubleEndedIterator + 'a { - let offset = 0; // TODO: range & seek to range start using binary index/hash index (first matching restart interval) - // TODO: and if range end, seek to range end as well (last matching restart interval) - - ClippingIter::new( - Iter::new(self) - // .with_offset(offset) // TODO: - .map(|kv| kv.materialize(&self.inner.data)), - range, - ) - } + pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { + // TODO: hash index lookup, impl in Decoder + /* + // NOTE: Try hash index if it exists + if let Some(lookup) = self + .block + .get_hash_index_reader() + .map(|reader| reader.get(needle)) + { + use super::super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; + + match lookup { + Found(bucket_value) => { + let offset = binary_index.get(usize::from(bucket_value)); + self.offset = offset; + self.linear_probe(needle, seqno); + return true; + } + NotFound => { + return false; + } + Conflicted => { + // NOTE: Fallback to binary search + } + } + } + */ - fn get_key_at(&self, pos: usize) -> (&[u8], Reverse) { - let bytes = &self.inner.data; + let mut iter = self.iter(); - // NOTE: Skip value type - let pos = pos + std::mem::size_of::(); + if !iter.seek(needle) { + return None; + } - // SAFETY: pos is always retrieved from the binary index, - // which we consider to be trustworthy - #[warn(unsafe_code)] - let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) }); + for item in iter { + if item.compare_key(needle, &self.inner.data).is_gt() { + return None; + } - let seqno = unwrappy!(cursor.read_u64_varint()); - let key_len: usize = unwrappy!(cursor.read_u16_varint()).into(); + if item.seqno >= seqno { + continue; + } - let key_start = pos + cursor.position() as usize; - let key_end = key_start + key_len; + return Some(item.materialize(&self.inner.data)); + } - #[warn(unsafe_code)] - let key = bytes.get(key_start..key_end).expect("should read"); + None + } - (key, Reverse(seqno)) + // TODO: rename iter() + #[must_use] + pub fn iter(&self) -> Iter { + Iter::new( + &self.inner.data, + Decoder::::new(&self.inner), + ) } /// Returns the binary index length (number of pointers). @@ -224,72 +357,59 @@ impl DataBlock { /// The number of pointers is equal to the number of restart intervals. #[must_use] pub fn binary_index_len(&self) -> u32 { - self.binary_index_len - } - - /// Returns the binary index offset. - #[must_use] - fn binary_index_offset(&self) -> u32 { - self.binary_index_offset - } + let trailer = Trailer::new(&self.inner); - /// Returns the binary index step size. - /// - /// The binary index can either store u16 or u32 pointers, - /// depending on the size of the data block. - /// - /// Typically blocks are < 64K, so u16 pointers reduce the index - /// size by half. - #[must_use] - fn binary_index_step_size(&self) -> u8 { - self.binary_index_step_size - } + // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8), + // and binary index offset (u32) + let offset = std::mem::size_of::() + + (2 * std::mem::size_of::()) + + std::mem::size_of::(); + let mut reader = unwrappy!(trailer.as_slice().get(offset..)); - /// Returns the hash index offset. - /// - /// If 0, the hash index does not exist. - #[must_use] - fn hash_index_offset(&self) -> u32 { - self.hash_index_offset + unwrappy!(reader.read_u32::()) } /// Returns the number of hash buckets. 
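+    /// Returns `None` if the trailer stores a hash index offset of zero,
+    /// i.e. no hash index was built for this block.
+    ///
+    /// (For orientation, the trailer fields are read back in this order:
+    /// item count `u32`, restart interval `u8`, binary index step size `u8`,
+    /// binary index offset `u32`, binary index length `u32`, hash index
+    /// offset `u32`, hash index length `u32`.)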
#[must_use] pub fn hash_bucket_count(&self) -> Option { - if self.hash_index_offset() > 0 { - Some(self.hash_index_len) + let trailer = Trailer::new(&self.inner); + + // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8), + // and binary index offset+len (2x u32) + let offset = std::mem::size_of::() + + (2 * std::mem::size_of::()) + + (2 * std::mem::size_of::()); + + let mut reader = unwrappy!(trailer.as_slice().get(offset..)); + + let hash_index_offset = unwrappy!(reader.read_u32::()); + let hash_index_len = unwrappy!(reader.read_u32::()); + + if hash_index_offset > 0 { + Some(hash_index_len) } else { None } } - fn get_binary_index_reader(&self) -> BinaryIndexReader { - BinaryIndexReader::new( - self.bytes(), - self.binary_index_offset(), - self.binary_index_len(), - self.binary_index_step_size(), - ) - } - - fn get_hash_index_reader(&self) -> Option { + /* fn get_hash_index_reader(&self) -> Option { self.hash_bucket_count() .map(|offset| HashIndexReader::new(&self.inner.data, self.hash_index_offset, offset)) - } + } */ - /// Returns the amount of conflicts in the hash buckets. + /* /// Returns the amount of conflicts in the hash buckets. #[must_use] pub fn hash_bucket_conflict_count(&self) -> Option { self.get_hash_index_reader() .map(|reader| reader.conflict_count()) - } + } */ - /// Returns the amount of empty hash buckets. + /* /// Returns the amount of empty hash buckets. #[must_use] pub fn hash_bucket_free_count(&self) -> Option { self.get_hash_index_reader() .map(|reader| reader.free_count()) - } + } */ /// Returns the amount of items in the block. #[must_use] @@ -303,222 +423,313 @@ impl DataBlock { false } - fn binary_search_for_offset( - &self, - binary_index: &BinaryIndexReader, - needle: &[u8], - seqno: SeqNo, - ) -> Option { - debug_assert!( - binary_index.len() >= 1, - "binary index should never be empty", - ); + pub fn encode_items( + items: &[InternalValue], + restart_interval: u8, + hash_index_ratio: f32, + ) -> crate::Result> { + let first_key = &items + .first() + .expect("chunk should not be empty") + .key + .user_key; - let mut left: usize = 0; - let mut right = binary_index.len(); + let mut serializer = Encoder::<'_, (), InternalValue>::new( + items.len(), + restart_interval, + hash_index_ratio, + first_key, + ); - if right == 0 { - return None; + for item in items { + serializer.write(item)?; } - let seqno_cmp = Reverse(seqno - 1); + serializer.finish() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + segment::{ + block::{Header, ParsedItem}, + Block, BlockOffset, DataBlock, + }, + Checksum, InternalValue, SeqNo, Slice, + ValueType::{Tombstone, Value}, + }; + use test_log::test; + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_ping_pong_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components( + Slice::from([111]), + Slice::from([119]), + 8_602_264_972_526_186_597, + Value, + ), + InternalValue::from_components( + Slice::from([121, 120, 99]), + Slice::from([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101]), + 11_426_548_769_907, + Value, + ), + ]; + + let ping_pong_code = [1, 0]; - while left < right { - let mid = (left + right) / 2; + let bytes: Vec = DataBlock::encode_items(&items, 1, 0.0)?; - let offset = binary_index.get(mid); + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - let (head_key, head_seqno) = 
self.get_key_at(offset);

+        let expected_ping_ponged_items = {
+            let mut iter = items.iter();
+            let mut v = vec![];

-            match head_key.cmp(needle) {
-                std::cmp::Ordering::Less => {
-                    left = mid + 1;
+            for &x in &ping_pong_code {
+                if x == 0 {
+                    v.push(iter.next().cloned().unwrap());
+                } else {
+                    v.push(iter.next_back().cloned().unwrap());
                 }
-                std::cmp::Ordering::Equal => match head_seqno.cmp(&seqno_cmp) {
-                    std::cmp::Ordering::Less => {
-                        left = mid + 1;
-                    }
-                    std::cmp::Ordering::Equal => {
-                        left = mid;
-                        right = mid;
-                    }
-                    std::cmp::Ordering::Greater => {
-                        right = mid;
-                    }
-                },
-                std::cmp::Ordering::Greater => {
-                    // NOTE: If we are at the first restart interval head
-                    // and its key is larger than the requested key,
-                    // the key cannot possibly be contained in the block
-                    //
-                    // Block
-                    // [b... c... d... e...]
-                    //
-                    //  ^
-                    //  needle = "a"
-                    //
-                    if mid == 0 {
-                        return None;
-                    }
+            }
+
+            v
+        };
+
+        let real_ping_ponged_items = {
+            let mut iter = data_block
+                .iter()
+                .map(|x| x.materialize(data_block.as_slice()));

-                    right = mid;
+            let mut v = vec![];
+
+            for &x in &ping_pong_code {
+                if x == 0 {
+                    v.push(iter.next().unwrap());
+                } else {
+                    v.push(iter.next_back().unwrap());
                 }
             }
-        }

+            v
+        };

-        if left == 0 {
-            return Some(0);
-        }
+        assert_eq!(expected_ping_ponged_items, real_ping_ponged_items);

-        let offset = binary_index.get(left - 1);
+        Ok(())
+    }

-        Some(offset)
-    }

-    fn parse_restart_item(reader: &mut Cursor<&[u8]>, offset: usize) -> Option<ParsedItem> {
-        let value_type = unwrappy!(reader.read_u8());
+    #[test]
+    fn v3_data_block_point_read_simple() -> crate::Result<()> {
+        let items = [
+            InternalValue::from_components("b", "b", 0, Value),
+            InternalValue::from_components("c", "c", 0, Value),
+            InternalValue::from_components("d", "d", 1, Tombstone),
+            InternalValue::from_components("e", "e", 0, Value),
+            InternalValue::from_components("f", "f", 0, Value),
+        ];

-        if value_type == TRAILER_START_MARKER {
-            return None;
+        for restart_interval in 1..=16 {
+            let bytes: Vec<u8> = DataBlock::encode_items(&items, restart_interval, 0.0)?;
+
+            let data_block = DataBlock::new(Block {
+                data: bytes.into(),
+                header: Header {
+                    checksum: Checksum::from_raw(0),
+                    data_length: 0,
+                    uncompressed_length: 0,
+                    previous_block_offset: BlockOffset(0),
+                },
+            });
+
+            assert!(
+                data_block.point_read(b"a", SeqNo::MAX).is_none(),
+                "should return None because a does not exist",
+            );
+
+            assert!(
+                data_block.point_read(b"b", SeqNo::MAX).is_some(),
+                "should return Some because b exists",
+            );
+
+            assert!(
+                data_block.point_read(b"z", SeqNo::MAX).is_none(),
+                "should return None because z does not exist",
+            );
         }

-        let seqno = unwrappy!(reader.read_u64_varint());
+        Ok(())
+    }

-        let key_len: usize = unwrappy!(reader.read_u16_varint()).into();
-        let key_start = offset + reader.position() as usize;
-        unwrappy!(reader.seek_relative(key_len as i64));
+    #[test]
+    fn v3_data_block_point_read_one() -> crate::Result<()> {
+        let items = [InternalValue::from_components(
+            "pla:earth:fact",
+            "eaaaaaaaaarth",
+            0,
+            crate::ValueType::Value,
+        )];

-        let val_len: usize = if value_type == u8::from(ValueType::Value) {
-            unwrappy!(reader.read_u32_varint()) as usize
-        } else {
-            0
-        };
-        let val_offset = offset + reader.position() as usize;
-        unwrappy!(reader.seek_relative(val_len as i64));
+        let bytes = DataBlock::encode_items(&items, 16, 0.0)?;
+        let serialized_len = bytes.len();

-        Some(if value_type == u8::from(ValueType::Value) {
-            ParsedItem {
-                value_type,
-                seqno,
-                prefix: None,
-                key: ParsedSlice(key_start, key_start + key_len),
-                value:
Some(ParsedSlice(val_offset, val_offset + val_len)), - } - } else { - ParsedItem { - value_type, - seqno, - prefix: None, - key: ParsedSlice(key_start, key_start + key_len), - value: None, // TODO: enum value/tombstone, so value is not Option for values - } - }) - } + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - fn parse_truncated_item( - reader: &mut Cursor<&[u8]>, - offset: usize, - base_key_offset: usize, - ) -> Option { - let value_type = unwrappy!(reader.read_u8()); + assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); + assert_eq!(data_block.inner.size(), serialized_len); + assert_eq!(1, data_block.binary_index_len()); - if value_type == TRAILER_START_MARKER { - return None; + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, SeqNo::MAX), + ); } - let seqno = unwrappy!(reader.read_u64_varint()); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); - let shared_prefix_len: usize = unwrappy!(reader.read_u16_varint()).into(); - let rest_key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + Ok(()) + } - let key_offset = offset + reader.position() as usize; + #[test] + fn v3_data_block_mvcc_read_first() -> crate::Result<()> { + let items = [InternalValue::from_components( + "hello", + "world", + 0, + crate::ValueType::Value, + )]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let serialized_len = bytes.len(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - unwrappy!(reader.seek_relative(rest_key_len as i64)); + assert_eq!(data_block.len(), items.len()); + assert!(!data_block.is_empty()); + assert_eq!(data_block.inner.size(), serialized_len); - let val_len: usize = if value_type == u8::from(ValueType::Value) { - unwrappy!(reader.read_u32_varint()) as usize - } else { - 0 - }; - let val_offset = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(val_len as i64)); + assert_eq!(Some(items[0].clone()), data_block.point_read(b"hello", 777)); + } - Some(if value_type == u8::from(ValueType::Value) { - ParsedItem { - value_type, - seqno, - prefix: Some(ParsedSlice( - base_key_offset, - base_key_offset + shared_prefix_len, - )), - key: ParsedSlice(key_offset, key_offset + rest_key_len), - value: Some(ParsedSlice(val_offset, val_offset + val_len)), - } - } else { - ParsedItem { - value_type, - seqno, - prefix: Some(ParsedSlice( - base_key_offset, - base_key_offset + shared_prefix_len, - )), - key: ParsedSlice(key_offset, key_offset + rest_key_len), - value: None, - } - }) + Ok(()) } - #[must_use] - pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { - let mut reader = ForwardReader::new(self); - reader.point_read(needle, seqno) - } + #[test] + fn v3_data_block_point_read_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), + InternalValue::from_components([0], b"", 0, Value), + ]; - pub fn encode_items( - items: &[InternalValue], - restart_interval: u8, - hash_index_ratio: f32, - ) -> crate::Result> { - let first_key = &items - .first() - .expect("chunk should not be empty") - .key - .user_key; + let 
bytes = DataBlock::encode_items(&items, 16, 1.33)?; - let mut serializer = Encoder::<'_, (), InternalValue>::new( - items.len(), - restart_interval, - hash_index_ratio, - first_key, + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!( + data_block + .hash_bucket_count() + .expect("should have built hash index") + > 0, ); - for item in items { - serializer.write(item)?; + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); } - serializer.finish() + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) } -} -#[cfg(test)] -mod tests { - use crate::{ - segment::{block::Header, Block, BlockOffset, DataBlock}, - Checksum, InternalValue, SeqNo, - ValueType::{Tombstone, Value}, - }; - use test_log::test; + #[test] + fn v3_data_block_point_read_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components([0], [], 5, Value), + InternalValue::from_components([0], [], 4, Tombstone), + InternalValue::from_components([0], [], 3, Value), + InternalValue::from_components([0], [], 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } #[test] - fn v3_data_block_binary_search() -> crate::Result<()> { + fn v3_data_block_point_read_dense() -> crate::Result<()> { let items = [ - InternalValue::from_components("b", "b", 0, Value), - InternalValue::from_components("c", "c", 0, Value), - InternalValue::from_components("d", "d", 1, Tombstone), - InternalValue::from_components("e", "e", 0, Value), - InternalValue::from_components("f", "f", 0, Value), + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"b", b"b", 2, Value), + InternalValue::from_components(b"c", b"c", 1, Value), + InternalValue::from_components(b"d", b"d", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -530,29 +741,363 @@ mod tests { }, }); - let binary_index = data_block.get_binary_index_reader(); + assert_eq!(data_block.len(), items.len()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, SeqNo::MAX), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense_mvcc_with_hash() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + let data_block = DataBlock::new(Block { + data: 
bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); assert!( data_block - .binary_search_for_offset(&binary_index, b"a", SeqNo::MAX) - .is_none(), - "should return None because a is less than min key", + .hash_bucket_count() + .expect("should have built hash index") + > 0, + ); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_point_read_mvcc_latest_fuzz_1() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], SeqNo::MAX) + ); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_point_read_mvcc_latest_fuzz_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], SeqNo::MAX) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], SeqNo::MAX) + ); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_point_read_mvcc_latest_fuzz_3() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + 
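+            // NOTE: nine versions of key [233, 233] with strictly decreasing
+            // seqnos follow; the point read below must pick the newest one
+            // (seqno 8)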
InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], SeqNo::MAX) ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], SeqNo::MAX) + ); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + #[test] + #[allow(clippy::unwrap_used)] + fn v3_data_block_point_read_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { + let items = [ + InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 7, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 6, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 5, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 4, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 3, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 2, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 1, Value), + InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), + InternalValue::from_components( + Slice::from([255, 255, 0]), + Slice::from([]), + 127_886_946_205_696, + Tombstone, + ), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + + assert_eq!( + Some(items.get(1).cloned().unwrap()), + data_block.point_read(&[233, 233], SeqNo::MAX) + ); + assert_eq!( + Some(items.last().cloned().unwrap()), + data_block.point_read(&[255, 255, 0], SeqNo::MAX) + ); + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense_mvcc_no_hash() -> crate::Result<()> { + let items = [ + InternalValue::from_components(b"a", b"a", 3, Value), + InternalValue::from_components(b"a", b"a", 2, Value), + InternalValue::from_components(b"a", b"a", 1, Value), + 
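+            // NOTE: hash_index_ratio is 0.0 here, so the point reads below
+            // have to go through the binary index alone (see the is_none()
+            // assertion on hash_bucket_count)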
InternalValue::from_components(b"b", b"b", 65, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert!(data_block.hash_bucket_count().is_none()); + + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_shadowing() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); assert!( data_block - .binary_search_for_offset(&binary_index, b"b", SeqNo::MAX) - .is_some(), - "should return Some because b exists", + .hash_bucket_count() + .expect("should have built hash index") + > 0, ); + assert!(data_block + .point_read(b"pla:venus:fact", SeqNo::MAX) + .expect("should exist") + .is_tombstone()); + + Ok(()) + } + + #[test] + fn v3_data_block_point_read_dense_2() -> crate::Result<()> { + let items = [ + InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), + InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), + InternalValue::from_components("pla:jupiter:mass", "Massive", 0, Value), + InternalValue::from_components("pla:jupiter:name", "Jupiter", 0, Value), + InternalValue::from_components("pla:jupiter:radius", "Big", 0, Value), + InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), + InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), + InternalValue::from_components("pla:venus:fact", "", 1, Tombstone), + InternalValue::from_components("pla:venus:fact", "Venus exists", 0, Value), + InternalValue::from_components("pla:venus:name", "Venus", 0, Value), + ]; + + let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); assert!( data_block - .binary_search_for_offset(&binary_index, b"z", SeqNo::MAX) - .is_some(), - "should return Some because z may be in last restart interval", + .hash_bucket_count() + .expect("should have built hash index") + > 0, ); + for needle in items { + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); + } + + assert_eq!(None, data_block.point_read(b"yyy", SeqNo::MAX)); + Ok(()) } } From 5a7e44d7d18935de11fcdac97e7072cb62006206 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:25:00 +0200 
Subject: [PATCH 207/613] refactor: index block iter --- src/segment/index_block/block_handle.rs | 70 ++- src/segment/index_block/forward_reader.rs | 241 ---------- src/segment/index_block/iter.rs | 548 +++++++++++----------- 3 files changed, 351 insertions(+), 508 deletions(-) delete mode 100644 src/segment/index_block/forward_reader.rs diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index f75f7ab0..5e9f07e6 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -4,9 +4,15 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, - segment::block::{BlockOffset, Encodable}, + segment::{ + block::{BlockOffset, Decodable, Encodable, TRAILER_START_MARKER}, + index_block::IndexBlockParsedItem, + util::SliceIndexes, + }, + unwrappy, }; -use byteorder::WriteBytesExt; +use byteorder::{ReadBytesExt, WriteBytesExt}; +use std::io::{Cursor, Seek}; use value_log::UserKey; use varint_rs::{VarintReader, VarintWriter}; @@ -96,6 +102,7 @@ impl AsRef for KeyedBlockHandle { } impl KeyedBlockHandle { + #[must_use] pub fn new(end_key: UserKey, offset: BlockOffset, size: u32) -> Self { Self { end_key, @@ -107,18 +114,22 @@ impl KeyedBlockHandle { self.inner.offset += delta; } + #[must_use] pub fn size(&self) -> u32 { self.inner.size() } + #[must_use] pub fn offset(&self) -> BlockOffset { self.inner.offset() } + #[must_use] pub fn end_key(&self) -> &UserKey { &self.end_key } + #[must_use] pub fn into_end_key(self) -> UserKey { self.end_key } @@ -200,3 +211,58 @@ impl Encodable for KeyedBlockHandle { &self.end_key } } + +impl Decodable for KeyedBlockHandle { + fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { + let marker = unwrappy!(reader.read_u8()); + + if marker == TRAILER_START_MARKER { + return None; + } + + let file_offset = unwrappy!(reader.read_u64_varint()); + let size = unwrappy!(reader.read_u32_varint()); + + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = offset + reader.position() as usize; + + unwrappy!(reader.seek_relative(key_len as i64)); + + Some(IndexBlockParsedItem { + prefix: None, + end_key: SliceIndexes(key_start, key_start + key_len), + offset: BlockOffset(file_offset), + size, + }) + } + + fn parse_restart_key<'a>( + reader: &mut Cursor<&[u8]>, + offset: usize, + data: &'a [u8], + ) -> Option<&'a [u8]> { + let marker = unwrappy!(reader.read_u8()); + + if marker == TRAILER_START_MARKER { + return None; + } + + let _file_offset = unwrappy!(reader.read_u64_varint()); + let _size = unwrappy!(reader.read_u32_varint()); + + let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_start = offset + reader.position() as usize; + + unwrappy!(reader.seek_relative(key_len as i64)); + + data.get(key_start..(key_start + key_len)) + } + + fn parse_truncated( + reader: &mut Cursor<&[u8]>, + offset: usize, + base_key_offset: usize, + ) -> Option { + todo!() + } +} diff --git a/src/segment/index_block/forward_reader.rs b/src/segment/index_block/forward_reader.rs deleted file mode 100644 index caccbca6..00000000 --- a/src/segment/index_block/forward_reader.rs +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2025-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::{IndexBlock, KeyedBlockHandle}; -use crate::{ - segment::{data_block::forward_reader::ParsedSlice, BlockOffset}, - Slice, -}; -use std::io::Cursor; - 
-#[derive(Default, Debug)] -struct LoScanner { - offset: usize, - remaining_in_interval: usize, - base_key_offset: Option, -} - -/// Specialized reader to scan an index block only in forwards direction -/// -/// Is less expensive than a double ended iterator. -pub struct ForwardReader<'a> { - block: &'a IndexBlock, - restart_interval: usize, - - lo_scanner: LoScanner, -} - -#[derive(Debug)] -pub struct ParsedItem { - pub offset: BlockOffset, - pub size: u32, - pub prefix: Option, - pub end_key: ParsedSlice, -} - -impl ParsedItem { - pub fn materialize(&self, bytes: &Slice) -> KeyedBlockHandle { - // NOTE: We consider the prefix and key slice indexes to be trustworthy - #[allow(clippy::indexing_slicing)] - let end_key = if let Some(prefix) = &self.prefix { - let prefix_key = &bytes[prefix.0..prefix.1]; - let rest_key = &bytes[self.end_key.0..self.end_key.1]; - Slice::fused(prefix_key, rest_key) - } else { - bytes.slice(self.end_key.0..self.end_key.1) - }; - - KeyedBlockHandle::new(end_key, self.offset, self.size) - } -} - -impl<'a> ForwardReader<'a> { - pub fn new(block: &'a IndexBlock) -> Self { - let restart_interval = block.restart_interval.into(); - - Self { - block, - - restart_interval, - - lo_scanner: LoScanner::default(), - } - } - - pub fn with_offset(mut self, offset: usize) -> Self { - self.lo_scanner.offset = offset; - self - } - - #[must_use] - pub fn offset(&self) -> usize { - self.lo_scanner.offset - } - - fn parse_restart_item( - block: &IndexBlock, - offset: &mut usize, - base_key_offset: &mut Option, - ) -> Option { - let bytes = block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); - - let item = IndexBlock::parse_restart_item(&mut reader, *offset)?; - - *offset += reader.position() as usize; - *base_key_offset = Some(item.end_key.0); - - Some(item) - } - - fn parse_truncated_item( - block: &IndexBlock, - offset: &mut usize, - base_key_offset: usize, - ) -> Option { - let bytes = block.bytes(); - - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) 
}); - - let item = IndexBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; - - *offset += reader.position() as usize; - - Some(item) - } -} - -impl Iterator for ForwardReader<'_> { - type Item = ParsedItem; - - fn next(&mut self) -> Option { - let is_restart = self.lo_scanner.remaining_in_interval == 0; - - let item = if is_restart { - self.lo_scanner.remaining_in_interval = self.restart_interval; - - Self::parse_restart_item( - self.block, - &mut self.lo_scanner.offset, - &mut self.lo_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.lo_scanner.offset, - self.lo_scanner.base_key_offset.expect("should exist"), - ) - }; - - self.lo_scanner.remaining_in_interval -= 1; - - item - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::segment::{block::Header, Block, Checksum}; - use test_log::test; - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_simple() -> crate::Result<()> { - let items = [ - KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), - ]; - - let bytes = IndexBlock::encode_items(&items, 1)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(block.len(), items.len()); - - let iter = block.forward_reader(b"a").unwrap(); - assert_eq!(&items, &*iter.collect::>()); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_seek() -> crate::Result<()> { - let items = [ - KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), - ]; - - let bytes = IndexBlock::encode_items(&items, 1)?; - eprintln!("{bytes:?}"); - eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(block.len(), items.len()); - - { - let iter = block.forward_reader(b"a").unwrap(); - assert_eq!(&items, &*iter.into_iter().collect::>()); - } - - { - let iter = block.forward_reader(b"b").unwrap(); - assert_eq!(&items, &*iter.into_iter().collect::>()); - } - - { - let iter = block.forward_reader(b"c").unwrap(); - assert_eq!( - items.iter().skip(2).cloned().collect::>(), - &*iter.collect::>(), - ); - } - - { - let iter = block.forward_reader(b"def").unwrap(); - assert_eq!( - items.iter().skip(2).cloned().collect::>(), - &*iter.collect::>(), - ); - } - - { - let iter = block.forward_reader(b"zzz"); - assert!(iter.is_none(), "iterator should seek past index block"); - } - - Ok(()) - } -} diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index 0793a4a8..b3acbd18 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -1,239 +1,203 @@ -// Copyright (c) 2025-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use 
super::{ - forward_reader::{ForwardReader, ParsedItem}, - IndexBlock, +use crate::{ + double_ended_peekable::{DoubleEndedPeekable, DoubleEndedPeekableExt}, + segment::{block::Decoder, index_block::IndexBlockParsedItem, KeyedBlockHandle}, }; -use std::io::Cursor; - -#[derive(Debug)] -struct HiScanner { - offset: usize, - ptr_idx: usize, - stack: Vec, // TODO: SmallVec? - base_key_offset: Option, -} -/// Double-ended iterator over index blocks pub struct Iter<'a> { - block: &'a IndexBlock, - restart_interval: usize, - - lo_scanner: ForwardReader<'a>, - hi_scanner: HiScanner, + decoder: DoubleEndedPeekable< + IndexBlockParsedItem, + Decoder<'a, KeyedBlockHandle, IndexBlockParsedItem>, + >, } impl<'a> Iter<'a> { #[must_use] - pub fn new(block: &'a IndexBlock) -> Self { - let restart_interval = block.restart_interval.into(); - let binary_index_len = block.binary_index_len as usize; + pub fn new(decoder: Decoder<'a, KeyedBlockHandle, IndexBlockParsedItem>) -> Self { + let decoder = decoder.double_ended_peekable(); + Self { decoder } + } - Self { - block, + pub fn seek(&mut self, needle: &[u8]) -> bool { + self.decoder + .inner_mut() + .seek(needle, |end_key| end_key < needle, true) + } - restart_interval, + pub fn seek_upper(&mut self, needle: &[u8]) -> bool { + self.decoder + .inner_mut() + .seek_upper(|end_key| end_key <= needle, true) + } +} - lo_scanner: ForwardReader::new(block), +impl Iterator for Iter<'_> { + type Item = IndexBlockParsedItem; - /* lo_scanner: LoScanner::default(), */ - hi_scanner: HiScanner { - offset: 0, - ptr_idx: binary_index_len, - stack: Vec::new(), - base_key_offset: None, - }, - } + fn next(&mut self) -> Option { + self.decoder.next() } +} - pub fn with_offset(mut self, offset: usize) -> Self { - self.lo_scanner = self.lo_scanner.with_offset(offset); - self +impl DoubleEndedIterator for Iter<'_> { + fn next_back(&mut self) -> Option { + self.decoder.next_back() } +} - fn parse_restart_item( - block: &IndexBlock, - offset: &mut usize, - base_key_offset: &mut Option, - ) -> Option { - let bytes = block.bytes(); +#[cfg(test)] +mod tests { + use crate::{ + segment::{ + block::{Header, ParsedItem}, + Block, BlockOffset, IndexBlock, KeyedBlockHandle, + }, + Checksum, + }; + use test_log::test; - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) 
}); + #[test] + fn v3_index_block_iter_seek_before_start() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; - let item = IndexBlock::parse_restart_item(&mut reader, *offset)?; + let bytes = IndexBlock::encode_items(&items, 1)?; - *offset += reader.position() as usize; - *base_key_offset = Some(item.end_key.0); + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - Some(item) - } + assert_eq!(index_block.len(), items.len()); - fn parse_truncated_item( - block: &IndexBlock, - offset: &mut usize, - base_key_offset: usize, - ) -> Option { - let bytes = block.bytes(); + let mut iter = index_block.iter(); + assert!(iter.seek(b"a"), "should seek"); - // SAFETY: The cursor is advanced by read_ operations which check for EOF, - // And the cursor starts at 0 - the slice is never empty - #[warn(unsafe_code)] - let mut reader = Cursor::new(unsafe { bytes.get_unchecked(*offset..) }); + let iter = index_block + .iter() + .map(|item| item.materialize(&index_block.inner.data)); - let item = IndexBlock::parse_truncated_item(&mut reader, *offset, base_key_offset)?; + let real_items: Vec<_> = iter.collect(); - *offset += reader.position() as usize; + assert_eq!(items, &*real_items); - Some(item) + Ok(()) } - fn consume_stack_top(&mut self) -> Option { - if let Some(offset) = self.hi_scanner.stack.pop() { - if self.lo_scanner.offset() > 0 && offset < self.lo_scanner.offset() { - return None; - } - - self.hi_scanner.offset = offset; - - let is_restart = self.hi_scanner.stack.is_empty(); - - if is_restart { - Self::parse_restart_item( - self.block, - &mut self.hi_scanner.offset, - &mut self.hi_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.hi_scanner.offset, - self.hi_scanner.base_key_offset.expect("should exist"), - ) - } - } else { - None - } - } -} + #[test] + fn v3_index_block_iter_seek_start() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; -impl Iterator for Iter<'_> { - type Item = ParsedItem; + let bytes = IndexBlock::encode_items(&items, 1)?; - fn next(&mut self) -> Option { - if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset() >= self.hi_scanner.offset - { - return None; - } + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - /* let is_restart = self.lo_scanner.remaining_in_interval == 0; + assert_eq!(index_block.len(), items.len()); - let item = if is_restart { - self.lo_scanner.remaining_in_interval = self.restart_interval; + let mut iter = index_block.iter(); + assert!(iter.seek(b"b"), "should seek"); - Self::parse_restart_item( - self.block, - &mut self.lo_scanner.offset, - &mut self.lo_scanner.base_key_offset, - ) - } else { - Self::parse_truncated_item( - self.block, - &mut self.lo_scanner.offset, - self.lo_scanner.base_key_offset.expect("should exist"), - ) - }; + let real_items: Vec<_> = iter + .map(|item| 
item.materialize(&index_block.inner.data)) + .collect(); - self.lo_scanner.remaining_in_interval -= 1; */ + assert_eq!(items, &*real_items); - let item = self.lo_scanner.next(); + Ok(()) + } - if self.hi_scanner.base_key_offset.is_some() - && self.lo_scanner.offset() >= self.hi_scanner.offset - { - return None; - } + #[test] + fn v3_index_block_iter_seek_middle() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; - item - } -} + let bytes = IndexBlock::encode_items(&items, 1)?; -impl DoubleEndedIterator for Iter<'_> { - fn next_back(&mut self) -> Option { - if let Some(top) = self.consume_stack_top() { - return Some(top); - } + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - // NOTE: If we wrapped, we are at the end - // This is safe to do, because there cannot be that many restart intervals - if self.hi_scanner.ptr_idx == usize::MAX { - return None; - } + assert_eq!(index_block.len(), items.len()); - self.hi_scanner.ptr_idx = self.hi_scanner.ptr_idx.wrapping_sub(1); + let mut iter = index_block.iter(); + assert!(iter.seek(b"c"), "should seek"); - // NOTE: If we wrapped, we are at the end - // This is safe to do, because there cannot be that many restart intervals - if self.hi_scanner.ptr_idx == usize::MAX { - return None; - } + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); - let binary_index = self.block.get_binary_index_reader(); + assert_eq!( + items.iter().skip(2).cloned().collect::>(), + &*real_items, + ); - { - self.hi_scanner.offset = binary_index.get(self.hi_scanner.ptr_idx); - let offset = self.hi_scanner.offset; - - if Self::parse_restart_item( - self.block, - &mut self.hi_scanner.offset, - &mut self.hi_scanner.base_key_offset, - ) - .is_some() - { - self.hi_scanner.stack.push(offset); - } - } + Ok(()) + } - for _ in 1..self.restart_interval { - let offset = self.hi_scanner.offset; - - if Self::parse_truncated_item( - self.block, - &mut self.hi_scanner.offset, - self.hi_scanner.base_key_offset.expect("should exist"), - ) - .is_some() - { - self.hi_scanner.stack.push(offset); - } - } + #[test] + fn v3_index_block_iter_rev_seek() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; - if self.hi_scanner.stack.is_empty() { - return None; - } + let bytes = IndexBlock::encode_items(&items, 1)?; - self.consume_stack_top() - } -} + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - segment::{block::Header, Block, BlockOffset, KeyedBlockHandle}, - Checksum, - }; - use test_log::test; + assert_eq!(index_block.len(), items.len()); + + let mut iter = index_block.iter(); + assert!(iter.seek_upper(b"c"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!(items.to_vec(), &*real_items); + + Ok(()) + } 
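Taken together, these tests pin down the seek semantics: `seek` drops every handle whose end key is strictly below the needle (such a block cannot contain it), while `seek_upper` keeps everything up to and including the first handle whose end key is strictly greater than the needle, because entries equal to a block's end key may continue (as older versions) into the next block. A sketch of those two cutoffs over a plain sorted slice of end keys; this models the observable behavior only, not the decoder's actual binary search over its restart index:

fn seek_lower(end_keys: &[&[u8]], needle: &[u8]) -> usize {
    // first handle whose end key is >= needle
    end_keys.partition_point(|k| *k < needle)
}

fn seek_upper(end_keys: &[&[u8]], needle: &[u8]) -> usize {
    // exclusive end index: one past the first handle whose end key is > needle
    (end_keys.partition_point(|k| *k <= needle) + 1).min(end_keys.len())
}

fn main() {
    let end_keys: &[&[u8]] = &[b"b", b"bcdef", b"def"];

    assert_eq!(0, seek_lower(end_keys, b"b")); // all three handles remain
    assert_eq!(2, seek_lower(end_keys, b"c")); // only "def" remains
    assert_eq!(3, seek_lower(end_keys, b"zzz")); // past the end: seek reports failure

    assert_eq!(2, seek_upper(end_keys, b"b")); // "b" and "bcdef"
    assert_eq!(3, seek_upper(end_keys, b"c")); // all three handles remain
}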
#[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_iter_simple() -> crate::Result<()> { + fn v3_index_block_iter_rev_seek_2() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -241,9 +205,6 @@ mod tests { ]; let bytes = IndexBlock::encode_items(&items, 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -256,55 +217,59 @@ mod tests { }); assert_eq!(index_block.len(), items.len()); - assert_eq!(index_block.iter().count(), items.len()); - assert_eq!(index_block.iter().rev().count(), items.len()); - { - let mut iter = index_block.iter(); + let mut iter = index_block.iter(); + assert!(iter.seek_upper(b"e"), "should seek"); - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - } + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); - { - let mut iter = index_block.iter().rev(); + assert_eq!(items.to_vec(), &*real_items); - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } + Ok(()) + } - { - let mut iter = index_block.iter(); + #[test] + fn v3_index_block_iter_rev_seek_3() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"def", &**iter.next_back().unwrap().end_key()); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - } + let bytes = IndexBlock::encode_items(&items, 1)?; - { - let mut iter = index_block.iter().rev(); + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"b", &**iter.next_back().unwrap().end_key()); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } + assert_eq!(index_block.len(), items.len()); + + let mut iter = index_block.iter(); + assert!(iter.seek_upper(b"b"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!( + items.iter().take(2).cloned().collect::>(), + &*real_items, + ); Ok(()) } #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_iter_exhaust() -> crate::Result<()> { + #[ignore] + fn v3_index_block_iter_too_far() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -312,9 +277,6 @@ mod tests { ]; let bytes = IndexBlock::encode_items(&items, 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", 
String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -327,55 +289,111 @@ mod tests { }); assert_eq!(index_block.len(), items.len()); - assert_eq!(index_block.iter().count(), items.len()); - assert_eq!(index_block.iter().rev().count(), items.len()); + + let mut iter = index_block.iter(); + assert!(!iter.seek(b"zzz"), "should not seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!(&[] as &[KeyedBlockHandle], &*real_items); + + Ok(()) + } + + #[test] + fn v3_index_block_iter_span() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items, 1)?; + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(index_block.len(), items.len()); { let mut iter = index_block.iter(); + assert!(iter.seek(b"a"), "should seek"); - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert!(iter.next().is_none()); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - assert!(iter.next_back().is_none()); + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!(items.to_vec(), &*real_items); } { - let mut iter = index_block.iter().rev(); - - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert!(iter.next_back().is_none()); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - assert!(iter.next().is_none()); + let mut iter = index_block.iter(); + assert!(iter.seek(b"b"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!( + items.iter().skip(2).cloned().collect::>(), + &*real_items, + ); } + Ok(()) + } + + #[test] + fn v3_index_block_iter_rev_span() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items, 1)?; + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(index_block.len(), items.len()); + { let mut iter = index_block.iter(); + assert!(iter.seek_upper(b"a"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); - assert_eq!(b"b", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"def", &**iter.next_back().unwrap().end_key()); - assert!(iter.next().is_none()); - assert!(iter.next().is_none()); - assert!(iter.next_back().is_none()); - assert!(iter.next_back().is_none()); + 
assert_eq!(items.to_vec(), &*real_items); } { - let mut iter = index_block.iter().rev(); - - assert_eq!(b"def", &**iter.next().unwrap().end_key()); - assert_eq!(b"bcdef", &**iter.next().unwrap().end_key()); - assert_eq!(b"b", &**iter.next_back().unwrap().end_key()); - assert!(iter.next_back().is_none()); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - assert!(iter.next().is_none()); + let mut iter = index_block.iter(); + assert!(iter.seek_upper(b"b"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!(items.to_vec(), &*real_items); } Ok(()) From 5a08d9b7579e525a9827663bff103971fb5b1b30 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:25:10 +0200 Subject: [PATCH 208/613] refactor: index block --- src/segment/index_block/mod.rs | 582 ++++----------------------------- 1 file changed, 57 insertions(+), 525 deletions(-) diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 910616d3..e838e5c2 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -3,72 +3,79 @@ // (found in the LICENSE-* files in the repository) mod block_handle; -mod forward_reader; mod iter; pub use block_handle::{BlockHandle, KeyedBlockHandle}; pub use iter::Iter; use super::{ - block::{binary_index::Reader as BinaryIndexReader, BlockOffset, Encoder, Trailer}, + block::{BlockOffset, Encoder, Trailer}, Block, }; -use crate::segment::{block::TRAILER_START_MARKER, data_block::forward_reader::ParsedSlice}; -use byteorder::{LittleEndian, ReadBytesExt}; -use forward_reader::{ForwardReader, ParsedItem}; -use std::io::{Cursor, Seek}; -use varint_rs::VarintReader; +use crate::Slice; +use crate::{ + segment::{ + block::{Decoder, ParsedItem as Parsy}, + util::{compare_prefixed_slice, SliceIndexes}, + }, + unwrappy, +}; + +#[derive(Debug)] +pub struct IndexBlockParsedItem { + pub offset: BlockOffset, + pub size: u32, + pub prefix: Option, + pub end_key: SliceIndexes, +} + +impl Parsy for IndexBlockParsedItem { + fn compare_key(&self, needle: &[u8], bytes: &[u8]) -> std::cmp::Ordering { + if let Some(prefix) = &self.prefix { + let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; + let rest_key = unsafe { bytes.get_unchecked(self.end_key.0..self.end_key.1) }; + compare_prefixed_slice(prefix, rest_key, needle) + } else { + let key = unsafe { bytes.get_unchecked(self.end_key.0..self.end_key.1) }; + key.cmp(needle) + } + } + + fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8] { + debug_assert!(self.prefix.is_none(), "can only get key of restart heads"); + + unwrappy!(bytes.get(self.end_key.0..self.end_key.1)) + } + + fn key_offset(&self) -> usize { + self.end_key.0 + } -macro_rules! 
unwrappy { - ($x:expr) => { - $x.expect("should read") + fn materialize(&self, bytes: &Slice) -> KeyedBlockHandle { + // NOTE: We consider the prefix and key slice indexes to be trustworthy + #[allow(clippy::indexing_slicing)] + let key = if let Some(prefix) = &self.prefix { + let prefix_key = &bytes[prefix.0..prefix.1]; + let rest_key = &bytes[self.end_key.0..self.end_key.1]; + Slice::fused(prefix_key, rest_key) + } else { + bytes.slice(self.end_key.0..self.end_key.1) + }; - // unsafe { $x.unwrap_unchecked() } - }; + KeyedBlockHandle::new(key, self.offset, self.size) + } } /// Block that contains block handles (file offset + size) #[derive(Clone)] pub struct IndexBlock { pub inner: Block, - - // Cached metadata - restart_interval: u8, - - binary_index_step_size: u8, - binary_index_offset: u32, - binary_index_len: u32, } impl IndexBlock { #[must_use] pub fn new(inner: Block) -> Self { - let trailer = Trailer::new(&inner); - let mut reader = trailer.as_slice(); - - let _item_count = reader.read_u32::().expect("should read"); - - let restart_interval = unwrappy!(reader.read_u8()); - - let binary_index_step_size = unwrappy!(reader.read_u8()); - - debug_assert!( - binary_index_step_size == 2 || binary_index_step_size == 4, - "invalid binary index step size", - ); - - let binary_index_offset = unwrappy!(reader.read_u32::()); - let binary_index_len = unwrappy!(reader.read_u32::()); - - Self { - inner, - - restart_interval, - - binary_index_step_size, - binary_index_offset, - binary_index_len, - } + Self { inner } } /// Returns the amount of items in the block. @@ -83,258 +90,12 @@ impl IndexBlock { false } - /// Access the inner raw bytes - #[must_use] - fn bytes(&self) -> &[u8] { - &self.inner.data - } - - /// Returns the binary index length (number of pointers). - /// - /// The number of pointers is equal to the number of restart intervals. - #[must_use] - pub fn binary_index_len(&self) -> u32 { - self.binary_index_len - } - - /// Returns the binary index offset. - #[must_use] - fn binary_index_offset(&self) -> u32 { - self.binary_index_offset - } - - /// Returns the binary index step size. - /// - /// The binary index can either store u16 or u32 pointers, - /// depending on the size of the data block. - /// - /// Typically blocks are < 64K, so u16 pointers reduce the index - /// size by half. - #[must_use] - fn binary_index_step_size(&self) -> u8 { - self.binary_index_step_size - } - - fn get_binary_index_reader(&self) -> BinaryIndexReader { - BinaryIndexReader::new( - self.bytes(), - self.binary_index_offset(), - self.binary_index_len(), - self.binary_index_step_size(), - ) - } - - // TODO: should not return Option<>? #[must_use] #[allow(clippy::iter_without_into_iter)] - pub fn forward_reader( - &self, - needle: &[u8], - ) -> Option + '_> { - let offset = self - .search_lowest(&self.get_binary_index_reader(), needle) - .unwrap_or_default(); - - // SAFETY: pos is always retrieved from the binary index, - // which we consider to be trustworthy - #[warn(unsafe_code)] - let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) 
}); - - let item = Self::parse_restart_item(&mut cursor, offset)?; - - // SAFETY: We trust the parsed restart head - #[allow(clippy::indexing_slicing)] - let key = &self.inner.data[item.end_key.0..item.end_key.1]; - - if needle > key { - return None; - } - - Some( - ForwardReader::new(self) - .with_offset(offset) - .map(|kv| kv.materialize(&self.inner.data)), - ) - } - - fn parse_restart_item(reader: &mut Cursor<&[u8]>, pos: usize) -> Option { - let marker = unwrappy!(reader.read_u8()); - - if marker == TRAILER_START_MARKER { - return None; - } - - let offset = unwrappy!(reader.read_u64_varint()); - let size = unwrappy!(reader.read_u32_varint()); - - let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); - let key_start = pos + reader.position() as usize; - - unwrappy!(reader.seek_relative(key_len as i64)); - - Some(ParsedItem { - prefix: None, - end_key: ParsedSlice(key_start, key_start + key_len), - offset: BlockOffset(offset), - size, - }) - } - - #[must_use] - #[allow(clippy::iter_without_into_iter)] - pub fn iter(&self) -> impl DoubleEndedIterator + '_ { - Iter::new(self).map(|kv| kv.materialize(&self.inner.data)) - } - - fn parse_truncated_item( - reader: &mut Cursor<&[u8]>, - offset: usize, - base_key_offset: usize, - ) -> Option { - let marker = unwrappy!(reader.read_u8()); - - if marker == TRAILER_START_MARKER { - return None; - } - - let size = unwrappy!(reader.read_u32_varint()); - - todo!() - } - - fn get_key_at(&self, pos: usize) -> &[u8] { - let bytes = &self.inner.data; - - // SAFETY: pos is always retrieved from the binary index, - // which we consider to be trustworthy - #[warn(unsafe_code)] - let mut cursor = Cursor::new(unsafe { bytes.get_unchecked(pos..) }); - - let item = Self::parse_restart_item(&mut cursor, pos).expect("should exist"); - - // SAFETY: We trust the parsed restart head - #[allow(clippy::indexing_slicing)] - &bytes[item.end_key.0..item.end_key.1] - } - - /// Search for the lowest block that may possibly contain the needle. - fn search_lowest(&self, binary_index: &BinaryIndexReader, needle: &[u8]) -> Option { - let mut left: usize = 0; - let mut right = binary_index.len(); - - if right == 0 { - return None; - } - - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if self.get_key_at(offset) < needle { - left = mid + 1; - } else { - right = mid; - } - } - - Some(if left < binary_index.len() { - binary_index.get(left) - } else { - binary_index.get(binary_index.len() - 1) - }) - } - - /// Search for the last block that may possibly contain the needle. 
- fn search_highest(&self, binary_index: &BinaryIndexReader, needle: &[u8]) -> Option { - let mut left: usize = 0; - let mut right = binary_index.len(); - - if right == 0 { - return None; - } - - while left < right { - let mid = (left + right) / 2; - - let offset = binary_index.get(mid); - - if self.get_key_at(offset) <= needle { - left = mid + 1; - } else { - right = mid; - } - } - - if left == 0 { - Some(binary_index.get(0)) - } else if left == binary_index.len() { - Some(binary_index.get(binary_index.len() - 1)) - } else { - Some(binary_index.get(left)) - } - } - - #[must_use] - pub fn get_lowest_possible_block(&self, needle: &[u8]) -> Option { - let binary_index = self.get_binary_index_reader(); - - /* - // NOTE: Currently, the hash index is never initialized for index blocks - /* // NOTE: Try hash index if it exists - if let Some(bucket_value) = self - .get_hash_index_reader() - .and_then(|reader| reader.get(key)) - { - let restart_entry_pos = binary_index.get(usize::from(bucket_value)); - return self.walk(key, seqno, restart_entry_pos, self.restart_interval.into()); - } */ - ) */ - - let offset = self.search_lowest(&binary_index, needle)?; - - // SAFETY: offset is always retrieved from the binary index, - // which we consider to be trustworthy - #[warn(unsafe_code)] - let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) }); - - let item = Self::parse_restart_item(&mut cursor, offset)?; - - // SAFETY: We trust the parsed restart head - #[allow(clippy::indexing_slicing)] - let key = &self.inner.data[item.end_key.0..item.end_key.1]; - - if needle > key { - return None; - } - - // TODO: 3.0.0 scan(), delta encoding etc., add test with restart interval > 1 - - Some(item.materialize(&self.inner.data)) - } - - #[must_use] - pub fn get_highest_possible_block(&self, needle: &[u8]) -> Option { - let binary_index = self.get_binary_index_reader(); - - let offset = self.search_highest(&binary_index, needle)?; - - // SAFETY: offset is always retrieved from the binary index, - // which we consider to be trustworthy - #[warn(unsafe_code)] - let mut cursor = Cursor::new(unsafe { self.inner.data.get_unchecked(offset..) 
}); - - let item = Self::parse_restart_item(&mut cursor, offset)?; - - // SAFETY: We trust the parsed restart head - #[allow(clippy::indexing_slicing)] - let key = &self.inner.data[item.end_key.0..item.end_key.1]; - - if needle > key { - return None; - } - - Some(item.materialize(&self.inner.data)) + pub fn iter(&self) -> Iter { + Iter::new(Decoder::::new( + &self.inner, + )) } pub fn encode_items( @@ -346,7 +107,7 @@ impl IndexBlock { let mut serializer = Encoder::<'_, BlockOffset, KeyedBlockHandle>::new( items.len(), restart_interval, - 0.0, // TODO: hard-coded for now + 0.0, // NOTE: Index blocks do not support hash index first_key, ); @@ -357,232 +118,3 @@ impl IndexBlock { serializer.finish() } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::segment::block::{Checksum, Header}; - use test_log::test; - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_simple() -> crate::Result<()> { - let items = [ - KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), - KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), - ]; - - let bytes = IndexBlock::encode_items(&items, 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let index_block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(index_block.len(), items.len()); - - assert_eq!( - Some(items.first().unwrap().clone()), - index_block.get_lowest_possible_block(b"a") - ); - assert_eq!( - Some(items.first().unwrap().clone()), - index_block.get_lowest_possible_block(b"b") - ); - assert_eq!( - Some(items.get(1).unwrap().clone()), - index_block.get_lowest_possible_block(b"ba") - ); - assert_eq!( - Some(items.get(2).unwrap().clone()), - index_block.get_lowest_possible_block(b"d") - ); - - // assert_eq!(None, data_block.get_lowest_possible_block(b"zzz")); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_span() -> crate::Result<()> { - let items = [ - KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), - KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), - ]; - - let bytes = IndexBlock::encode_items(&items, 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let index_block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(index_block.len(), items.len()); - - assert_eq!( - Some(items.first().unwrap().clone()), - index_block.get_lowest_possible_block(b"a") - ); - assert_eq!( - Some(items.last().unwrap().clone()), - index_block.get_lowest_possible_block(b"abc") - ); - assert_eq!( - Some(items.last().unwrap().clone()), - index_block.get_lowest_possible_block(b"b") - ); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn v3_index_block_span_highest() -> crate::Result<()> { - let items = [ - KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000), - KeyedBlockHandle::new(b"c".into(), BlockOffset(6_000), 7_000), - KeyedBlockHandle::new(b"d".into(), 
BlockOffset(13_000), 5_000), - ]; - - let bytes = IndexBlock::encode_items(&items, 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let index_block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(index_block.len(), items.len()); - - assert_eq!( - Some(items.first().unwrap().clone()), - index_block.get_highest_possible_block(b"a") - ); - assert_eq!( - Some(items.get(1).unwrap().clone()), - index_block.get_highest_possible_block(b"abc") - ); - assert_eq!( - Some(items.last().unwrap().clone()), - index_block.get_highest_possible_block(b"c") - ); - assert_eq!( - Some(items.last().unwrap().clone()), - index_block.get_highest_possible_block(b"cef") - ); - assert_eq!( - Some(items.last().unwrap().clone()), - index_block.get_highest_possible_block(b"d") - ); - assert_eq!(None, index_block.get_highest_possible_block(b"zzz")); - - Ok(()) - } - - #[test] - fn v3_index_block_one() -> crate::Result<()> { - let item = KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); - - let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let index_block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(index_block.len(), 1); - - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"a") - ); - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"asdasd") - ); - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"b") - ); - assert_eq!(Some(item), index_block.get_lowest_possible_block(b"c")); - assert_eq!(None, index_block.get_lowest_possible_block(b"d")); - assert_eq!(None, index_block.get_lowest_possible_block(b"z")); - - Ok(()) - } - - #[test] - fn v3_index_block_one_highest() -> crate::Result<()> { - let item = KeyedBlockHandle::new(b"c".into(), BlockOffset(0), 6_000); - - let bytes = IndexBlock::encode_items(&[item.clone()], 1)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - /* eprintln!("encoded into {} bytes", bytes.len()); */ - - let index_block = IndexBlock::new(Block { - data: bytes.into(), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(index_block.len(), 1); - - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"a") - ); - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"asdasd") - ); - assert_eq!( - Some(item.clone()), - index_block.get_lowest_possible_block(b"b") - ); - assert_eq!(Some(item), index_block.get_lowest_possible_block(b"c")); - assert_eq!(None, index_block.get_lowest_possible_block(b"d")); - assert_eq!(None, index_block.get_lowest_possible_block(b"z")); - - Ok(()) - } -} From a6100e28f5404cbb9010f0169ac179f72380a8c0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:25:46 +0200 Subject: [PATCH 209/613] adjust block index --- src/segment/block_index/iter.rs | 37 +++++++++++++++++++++++---------- src/segment/block_index/mod.rs | 
15 +++++++++---- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/segment/block_index/iter.rs b/src/segment/block_index/iter.rs index 970e9428..6cfdbf81 100644 --- a/src/segment/block_index/iter.rs +++ b/src/segment/block_index/iter.rs @@ -2,34 +2,49 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::segment::{IndexBlock, KeyedBlockHandle}; +use crate::segment::{ + block::ParsedItem, index_block::Iter as IndexBlockIter, IndexBlock, KeyedBlockHandle, +}; use self_cell::self_cell; -type BoxedIter<'a> = Box + 'a>; - self_cell!( - pub struct IndexBlockConsumer { + pub struct OwnedIndexBlockIter { owner: IndexBlock, #[covariant] - dependent: BoxedIter, + dependent: IndexBlockIter, } ); -pub fn create_index_block_reader(block: IndexBlock) -> IndexBlockConsumer { - IndexBlockConsumer::new(block, |block| Box::new(block.iter())) +impl OwnedIndexBlockIter { + pub fn seek_lower(&mut self, needle: &[u8]) -> bool { + self.with_dependent_mut(|_, m| m.seek(needle /* TODO: , seqno */)) + } + + pub fn seek_upper(&mut self, needle: &[u8]) -> bool { + self.with_dependent_mut(|_, m| m.seek_upper(needle /* TODO: , seqno */)) + } } -impl Iterator for IndexBlockConsumer { +impl Iterator for OwnedIndexBlockIter { type Item = KeyedBlockHandle; fn next(&mut self) -> Option { - self.with_dependent_mut(|_, iter| iter.next()) + self.with_dependent_mut(|block, iter| { + iter.next().map(|item| item.materialize(&block.inner.data)) + }) } } -impl DoubleEndedIterator for IndexBlockConsumer { +impl DoubleEndedIterator for OwnedIndexBlockIter { fn next_back(&mut self) -> Option { - self.with_dependent_mut(|_, iter| iter.next_back()) + self.with_dependent_mut(|block, iter| { + iter.next_back() + .map(|item| item.materialize(&block.inner.data)) + }) } } + +pub fn create_index_block_reader(block: IndexBlock) -> OwnedIndexBlockIter { + OwnedIndexBlockIter::new(block, IndexBlock::iter) +} diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 46edd4b4..1605b6d6 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -5,6 +5,7 @@ pub(crate) mod iter; use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; +use crate::segment::block::ParsedItem; #[enum_dispatch::enum_dispatch] pub trait BlockIndex { @@ -44,7 +45,7 @@ pub trait BlockIndex { /// found by finding the highest block that has a lower or equal end key than the searched key (by performing in-memory binary search). /// In the diagram above, searching for 'J' yields the block starting with 'G'. /// 'J' must be in that block, because the next block starts with 'M'). 
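The lookup the comment above describes can be made concrete: following its diagram, the candidate is the last handle whose *first* key is less than or equal to the needle (equivalently, with end-key handles as in the v3 format, the first handle whose end key is greater than or equal to the needle). A sketch of the comment's 'J' example, illustrative only:

fn candidate_block(start_keys: &[&str], needle: &str) -> Option<usize> {
    // last handle whose start key is <= needle
    start_keys.partition_point(|k| *k <= needle).checked_sub(1)
}

fn main() {
    // blocks spanning [A..F], [G..L], [M..R]
    let start_keys = &["A", "G", "M"];

    assert_eq!(Some(1), candidate_block(start_keys, "J")); // the block starting with 'G'
    assert_eq!(Some(2), candidate_block(start_keys, "M"));
    assert_eq!(None, candidate_block(start_keys, "0")); // sorts before every block
}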
-#[enum_dispatch::enum_dispatch(BlockIndex)] +// #[enum_dispatch::enum_dispatch(BlockIndex)] #[allow(clippy::module_name_repetitions)] pub enum BlockIndexImpl { Full(FullBlockIndex), @@ -65,7 +66,13 @@ impl FullBlockIndex { &self, needle: &[u8], ) -> Option + '_> { - self.0.forward_reader(needle) + let mut iter = self.0.iter(); + + if iter.seek(needle) { + Some(iter.map(|x| x.materialize(&self.0.inner.data))) + } else { + None + } } pub fn inner(&self) -> &IndexBlock { @@ -73,7 +80,7 @@ impl FullBlockIndex { } } -impl BlockIndex for FullBlockIndex { +/* impl BlockIndex for FullBlockIndex { fn get_last_block_containing_key( &self, key: &[u8], @@ -93,7 +100,7 @@ impl BlockIndex for FullBlockIndex { fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { todo!() } -} +} */ /* impl std::ops::Deref for FullBlockIndex { type Target = Box<[KeyedBlockHandle]>; From ae7313897d88e38ec96cb16815ec0be4f9aa75b8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:26:17 +0200 Subject: [PATCH 210/613] adjust segment iter --- src/segment/iter.rs | 51 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 5656e172..1c20d70c 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -2,43 +2,60 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{BlockOffset, DataBlock, GlobalSegmentId, KeyedBlockHandle}; +use super::{ + data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId, KeyedBlockHandle, +}; use crate::{ - segment::{util::load_block, BlockHandle}, - Cache, CompressionType, DescriptorTable, InternalValue, + segment::{block::ParsedItem, util::load_block, BlockHandle}, + Cache, CompressionType, DescriptorTable, InternalValue, SeqNo, }; use self_cell::self_cell; use std::{path::PathBuf, sync::Arc}; -type BoxedIter<'a> = Box + 'a>; +type InnerIter<'a> = DataBlockIter<'a>; self_cell!( - pub struct DataBlockConsumer { + pub struct OwnedDataBlockIter { owner: DataBlock, #[covariant] - dependent: BoxedIter, + dependent: InnerIter, } ); -pub fn create_data_block_reader(block: DataBlock) -> DataBlockConsumer { - DataBlockConsumer::new(block, |block| Box::new(block.iter())) +impl OwnedDataBlockIter { + pub fn seek_lower(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek(needle /* TODO: , seqno */)) + } + + pub fn seek_upper(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek_upper(needle /* TODO: , seqno */)) + } } -impl Iterator for DataBlockConsumer { +impl Iterator for OwnedDataBlockIter { type Item = InternalValue; fn next(&mut self) -> Option { - self.with_dependent_mut(|_, iter| iter.next()) + self.with_dependent_mut(|block, iter| { + iter.next().map(|item| item.materialize(&block.inner.data)) + }) } } -impl DoubleEndedIterator for DataBlockConsumer { +impl DoubleEndedIterator for OwnedDataBlockIter { fn next_back(&mut self) -> Option { - self.with_dependent_mut(|_, iter| iter.next_back()) + self.with_dependent_mut(|block, iter| { + iter.next_back() + .map(|item| item.materialize(&block.inner.data)) + }) } } +pub fn create_data_block_reader(block: DataBlock) -> OwnedDataBlockIter { + OwnedDataBlockIter::new(block, super::data_block::DataBlock::iter) +} + pub struct Iter where I: DoubleEndedIterator, @@ -53,10 +70,10 @@ where compression: CompressionType, lo_offset: BlockOffset, - lo_data_block: 
Option, + lo_data_block: Option, hi_offset: BlockOffset, - hi_data_block: Option, + hi_data_block: Option, } impl Iter @@ -136,6 +153,12 @@ where let mut reader = create_data_block_reader(block); + // TODO: + /* // NOTE: This is the first block, seek it + if self.lo_data_block.is_none() { + reader.seek_lower(self.); + } */ + let item = reader.next(); self.lo_offset = handle.offset(); From 961181902b55b55c374e5c020d0a9c5ea2abe263 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:26:33 +0200 Subject: [PATCH 211/613] refactor: segment scanner --- src/segment/scanner.rs | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs index aaf4697f..68c2480b 100644 --- a/src/segment/scanner.rs +++ b/src/segment/scanner.rs @@ -3,25 +3,13 @@ // (found in the LICENSE-* files in the repository) use super::{Block, DataBlock}; -use crate::{CompressionType, InternalValue}; -use self_cell::self_cell; +use crate::{segment::iter::OwnedDataBlockIter, CompressionType, InternalValue}; use std::{fs::File, io::BufReader, path::Path}; -type BlockIter<'a> = Box + 'a>; - -self_cell!( - pub struct Iter { - owner: DataBlock, - - #[covariant] - dependent: BlockIter, - } -); - /// Segment reader that is optimized for consuming an entire segment pub struct Scanner { reader: BufReader, - iter: Iter, + iter: OwnedDataBlockIter, compression: CompressionType, block_count: usize, @@ -38,7 +26,7 @@ impl Scanner { let mut reader = BufReader::with_capacity(8 * 4_096, File::open(path)?); let block = Self::fetch_next_block(&mut reader, compression)?; - let iter = Iter::new(block, |block| Box::new(block.scan())); + let iter = OwnedDataBlockIter::new(block, DataBlock::iter); Ok(Self { reader, @@ -63,7 +51,7 @@ impl Iterator for Scanner { fn next(&mut self) -> Option { loop { - if let Some(item) = self.iter.with_dependent_mut(|_, iter| iter.next()) { + if let Some(item) = self.iter.next() { return Some(Ok(item)); } @@ -73,7 +61,7 @@ impl Iterator for Scanner { // Init new block let block = fail_iter!(Self::fetch_next_block(&mut self.reader, self.compression)); - self.iter = Iter::new(block, |block| Box::new(block.scan())); + self.iter = OwnedDataBlockIter::new(block, DataBlock::iter); self.read_count += 1; } From ed5d05b982ffdc7aa3d9cc8ab3669cff864e3a82 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:26:43 +0200 Subject: [PATCH 212/613] SliceIndexes struct --- src/segment/util.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/util.rs b/src/segment/util.rs index c3bc13da..79e29d71 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -6,6 +6,10 @@ use super::{Block, BlockHandle, GlobalSegmentId}; use crate::{Cache, CompressionType, DescriptorTable}; use std::{path::Path, sync::Arc}; +/// [start, end] slice indexes +#[derive(Debug)] +pub struct SliceIndexes(pub usize, pub usize); + pub fn load_block( segment_id: GlobalSegmentId, path: &Path, From e03248c0eff3b230488de6f541f3f99899b83adb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:27:37 +0200 Subject: [PATCH 213/613] seek index in segment range read ops --- src/segment/mod.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 8c24cf40..7b55b281 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -258,13 +258,19 @@ impl Segment { todo!(); }; - // TODO: range should be RangeBounds? 
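What the added lines below do, in isolation: before any data block is touched, both ends of the index iterator are clamped to the query range. Included and Excluded bounds deliberately seek to the same handle — the block-level cut is coarse, and exact per-entry trimming happens later, in the clipping iterator the range read is wrapped in. A self-contained sketch of that bound-unpacking pattern, with the two seek operations modeled as callbacks rather than the crate's actual iterator type:

use std::ops::{Bound, RangeBounds};

fn apply_bounds<R: RangeBounds<Vec<u8>>>(
    range: &R,
    mut seek_lower: impl FnMut(&[u8]),
    mut seek_upper: impl FnMut(&[u8]),
) {
    if let Bound::Included(key) | Bound::Excluded(key) = range.start_bound() {
        seek_lower(key.as_slice());
    }
    if let Bound::Included(key) | Bound::Excluded(key) = range.end_bound() {
        seek_upper(key.as_slice());
    }
}

fn main() {
    let (lo, hi) = (b"bb".to_vec(), b"dd".to_vec());

    apply_bounds(
        &(lo..=hi),
        |key| println!("seek index to first block that may contain {key:?}"),
        |key| println!("seek index back to last block that may contain {key:?}"),
    );
}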
+ let mut index_iter = create_index_block_reader(block_index.inner().clone()); - // TODO: seek iter to lowest block containing lower bound - let index_iter = create_index_block_reader(block_index.inner().clone()); + // TODO: this should probably happen lazily on first read + if let Bound::Excluded(key) | Bound::Included(key) = range.start_bound() { + index_iter.seek_lower(key); + } + + // TODO: this should probably happen lazily on first read + if let Bound::Excluded(key) | Bound::Included(key) = range.end_bound() { + index_iter.seek_upper(key); + } - // TODO: then when we read the first data block - // (first .next(), seek inside the first data block) + // TODO: need a Range struct that wraps Iter, so we can seek in first and last DATA blocks let iter = Iter::new( self.global_id(), From ff36871da49aa8e5dc556ad87efb548814d43a3c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:27:54 +0200 Subject: [PATCH 214/613] wip --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 9c1d3cc7..2db7f8e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,6 +100,7 @@ #![warn(clippy::multiple_crate_versions)] #![allow(clippy::option_if_let_else)] #![warn(clippy::needless_lifetimes)] +#![warn(clippy::redundant_feature_names)] pub(crate) type HashMap = std::collections::HashMap; pub(crate) type HashSet = std::collections::HashSet; @@ -137,7 +138,6 @@ mod config; mod double_ended_peekable; mod error; -// mod export; pub(crate) mod fallible_clipping_iter; From 36d4976ef2a286c13022897388afab28a9674c1a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:31:45 +0200 Subject: [PATCH 215/613] fix: clippy --- src/blob_tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 17949894..354e6097 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -377,7 +377,7 @@ impl AbstractTree for BlobTree { let lsm_segment_folder = self.index.config.path.join(SEGMENTS_FOLDER); log::debug!("flushing memtable & performing key-value separation"); - log::debug!("=> to LSM segments in {:?}", lsm_segment_folder); + log::debug!("=> to LSM segments in {lsm_segment_folder:?}"); log::debug!("=> to blob segment at {:?}", self.blobs.path); let mut segment_writer = SegmentWriter::new( From c0db1380efe27a6cc6a07ca28a54c699d216a979 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 22:35:13 +0200 Subject: [PATCH 216/613] fix: clippy --- src/double_ended_peekable.rs | 74 +++++------------------------------- 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/src/double_ended_peekable.rs b/src/double_ended_peekable.rs index 6ed4e655..45b256f1 100644 --- a/src/double_ended_peekable.rs +++ b/src/double_ended_peekable.rs @@ -1,4 +1,4 @@ -//! A fork of https://github.com/dodomorandi/double-ended-peekable +//! A fork of //! to allow accessing the inner type //! //! Also changes the generics a bit so it plays well with `self_cell`. @@ -48,10 +48,6 @@ impl DoubleEndedPeekable where I: Iterator, { - pub fn inner(&self) -> &I { - &self.iter - } - pub fn inner_mut(&mut self) -> &mut I { &mut self.iter } @@ -83,35 +79,6 @@ where /// /// Because `peek_back()` returns a reference, and many iterators iterate over references, /// there can be a possibly confusing situation where the return value is a double reference. - /// You can see this effect in the examples below. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use double_ended_peekable::DoubleEndedPeekableExt; - /// - /// let xs = [1, 2, 3]; - /// - /// let mut iter = xs.into_iter().double_ended_peekable(); - /// - /// // peek_back() lets us see into the past of the future - /// assert_eq!(iter.peek_back(), Some(&3)); - /// assert_eq!(iter.next_back(), Some(3)); - /// - /// assert_eq!(iter.next_back(), Some(2)); - /// - /// // The iterator does not advance even if we `peek_back` multiple times - /// assert_eq!(iter.peek_back(), Some(&1)); - /// assert_eq!(iter.peek_back(), Some(&1)); - /// - /// assert_eq!(iter.next_back(), Some(1)); - /// - /// // After the iterator is finished, so is `peek_back()` - /// assert_eq!(iter.peek_back(), None); - /// assert_eq!(iter.next_back(), None); - /// ``` #[inline] pub fn peek_back(&mut self) -> Option<&I::Item> { self.back @@ -138,18 +105,6 @@ where }, } } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (lower, upper) = self.iter.size_hint(); - let additional = match (&self.front, &self.back) { - (MaybePeeked::Peeked(_), MaybePeeked::Peeked(_)) => 2, - (MaybePeeked::Peeked(_), _) | (_, MaybePeeked::Peeked(_)) => 1, - (MaybePeeked::Unpeeked, MaybePeeked::Unpeeked) => 0, - }; - - (lower + additional, upper.map(|upper| upper + additional)) - } } impl DoubleEndedIterator for DoubleEndedPeekable @@ -181,11 +136,11 @@ impl MaybePeeked { where F: FnOnce() -> Option, { - if let MaybePeeked::Unpeeked = self { - *self = MaybePeeked::Peeked(f()); + if matches!(self, Self::Unpeeked) { + *self = Self::Peeked(f()); } - let MaybePeeked::Peeked(peeked) = self else { + let Self::Peeked(peeked) = self else { // SAFETY: it cannot be `Unpeeked` because that case has been just replaced with // `Peeked`, and we only have two possible states. 
#[allow(unsafe_code)] @@ -198,30 +153,19 @@ impl MaybePeeked { const fn peeked_value_ref(&self) -> Option<&T> { match self { - MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, - MaybePeeked::Peeked(Some(peeked)) => Some(peeked), + Self::Unpeeked | Self::Peeked(None) => None, + Self::Peeked(Some(peeked)) => Some(peeked), } } - fn peeked_value_mut(&mut self) -> Option<&mut T> { - match self { - MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, - MaybePeeked::Peeked(Some(peeked)) => Some(peeked), - } - } - - const fn is_unpeeked(&self) -> bool { - matches!(self, MaybePeeked::Unpeeked) - } - fn take(&mut self) -> Self { - mem::replace(self, MaybePeeked::Unpeeked) + mem::replace(self, Self::Unpeeked) } fn into_peeked_value(self) -> Option { match self { - MaybePeeked::Unpeeked | MaybePeeked::Peeked(None) => None, - MaybePeeked::Peeked(Some(peeked)) => Some(peeked), + Self::Unpeeked | Self::Peeked(None) => None, + Self::Peeked(Some(peeked)) => Some(peeked), } } } From 73f4c5c1d65d7f16f588a17c4f47b2893ebd3bc7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 26 Jun 2025 23:44:39 +0200 Subject: [PATCH 217/613] remove some logging --- src/segment/block/decoder.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 07b7863a..5ea911df 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -333,9 +333,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa return false; }; - eprintln!("seeked upper to {idx}"); - eprintln!("hi scanner offset now {offset}"); - self.hi_scanner.offset = offset; self.hi_scanner.ptr_idx = idx; self.hi_scanner.stack.clear(); @@ -440,7 +437,6 @@ impl, Parsed: ParsedItem> Iterator for Decoder<'_, if self.hi_scanner.base_key_offset.is_some() && self.lo_scanner.offset >= self.hi_scanner.offset { - eprintln!("damn, hi scanner is already at {}", self.hi_scanner.offset); return None; } From ab9baf74754b9c2f980ca25ca7da83d9a6513973 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 27 Jun 2025 01:03:51 +0200 Subject: [PATCH 218/613] refactor --- src/segment/block/decoder.rs | 6 +----- src/segment/data_block/mod.rs | 9 +-------- src/segment/index_block/mod.rs | 7 +------ 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 5ea911df..f1b0f6ca 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -148,15 +148,11 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Returns the amount of items in the block. #[must_use] + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { Trailer::new(self.block).item_count() } - #[must_use] - pub fn is_empty(&self) -> bool { - false - } - fn get_binary_index_reader(&self) -> BinaryIndexReader { BinaryIndexReader::new( &self.block.data, diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index c7420d74..13ce240c 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -413,16 +413,11 @@ impl DataBlock { /// Returns the amount of items in the block. #[must_use] + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { Trailer::new(&self.inner).item_count() } - /// Always returns false: a block is never empty. 
- #[must_use] - pub fn is_empty(&self) -> bool { - false - } - pub fn encode_items( items: &[InternalValue], restart_interval: u8, @@ -596,7 +591,6 @@ mod tests { }); assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); assert_eq!(data_block.inner.size(), serialized_len); assert_eq!(1, data_block.binary_index_len()); @@ -636,7 +630,6 @@ mod tests { }); assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); assert_eq!(data_block.inner.size(), serialized_len); assert_eq!(Some(items[0].clone()), data_block.point_read(b"hello", 777)); diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index e838e5c2..9fa7aa92 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -80,16 +80,11 @@ impl IndexBlock { /// Returns the amount of items in the block. #[must_use] + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { Trailer::new(&self.inner).item_count() } - /// Always returns false: a block is never empty. - #[must_use] - pub fn is_empty(&self) -> bool { - false - } - #[must_use] #[allow(clippy::iter_without_into_iter)] pub fn iter(&self) -> Iter { From a26c8cd7decbd09a9c1b80359a319dcb3540c345 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 27 Jun 2025 22:45:10 +0200 Subject: [PATCH 219/613] wip --- .gitignore | 1 + README.md | 6 +++--- src/double_ended_peekable.rs | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f136cb3f..8db24cc8 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ Cargo.lock .bench mutants* +profile.json diff --git a/README.md b/README.md index 7182ccac..1989767f 100644 --- a/README.md +++ b/README.md @@ -19,15 +19,15 @@ A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rus This is the most feature-rich LSM-tree implementation in Rust! 
It features: -- Thread-safe BTreeMap-like API +- Thread-safe `BTreeMap`-like API - Mostly [safe](./UNSAFE.md) & 100% stable Rust - Block-based tables with compression support & prefix truncation - Optional block hash indexes in blocks for faster point lookups [[3]](#footnotes) - Per-level filter/index block pinning configuration - Range & prefix searching with forward and reverse iteration - Block caching to keep hot data in memory -- AMQ filters (currently Bloom filters) to increase point lookup performance -- Snapshots (MVCC) +- *AMQ* filters (currently Bloom filters) to improve point lookup performance +- Snapshots (*MVCC*) - Optionally partitioned block index & filters for better cache efficiency [[1]](#footnotes) - Size-tiered, (concurrent) Leveled and FIFO compaction - Multi-threaded flushing (immutable/sealed memtables) diff --git a/src/double_ended_peekable.rs b/src/double_ended_peekable.rs index 45b256f1..9269954a 100644 --- a/src/double_ended_peekable.rs +++ b/src/double_ended_peekable.rs @@ -148,6 +148,7 @@ impl MaybePeeked { unreachable_unchecked() } }; + peeked } From ec223a5e5bdc63cd3c8bcd117499614b009b6e20 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 27 Jun 2025 22:45:26 +0200 Subject: [PATCH 220/613] refactor --- src/tree/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 5fb84614..5a56afb5 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -111,6 +111,8 @@ impl AbstractTree for Tree { } fn size_of>(&self, key: K, seqno: Option) -> crate::Result> { + // NOTE: We know that values are u32 max + #[allow(clippy::cast_possible_truncation)] Ok(self.get(key, seqno)?.map(|x| x.len() as u32)) } @@ -120,7 +122,7 @@ impl AbstractTree for Tree { .expect("lock is poisoned") .current_version() .iter_segments() - .map(|x| x.pinned_bloom_filter_size()) + .map(Segment::pinned_bloom_filter_size) .sum() } From 72ad26a7ec4a1ae99bc397b6b06de2fe220f42bd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 27 Jun 2025 22:46:07 +0200 Subject: [PATCH 221/613] refactor --- src/segment/data_block/mod.rs | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 13ce240c..18d1277d 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -295,33 +295,6 @@ impl DataBlock { // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno)) #[must_use] pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { - // TODO: hash index lookup, impl in Decoder - /* - // NOTE: Try hash index if it exists - if let Some(lookup) = self - .block - .get_hash_index_reader() - .map(|reader| reader.get(needle)) - { - use super::super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; - - match lookup { - Found(bucket_value) => { - let offset = binary_index.get(usize::from(bucket_value)); - self.offset = offset; - self.linear_probe(needle, seqno); - return true; - } - NotFound => { - return false; - } - Conflicted => { - // NOTE: Fallback to binary search - } - } - } - */ - let mut iter = self.iter(); if !iter.seek(needle) { @@ -343,8 +316,8 @@ impl DataBlock { None } - // TODO: rename iter() #[must_use] + #[allow(clippy::iter_without_into_iter)] pub fn iter(&self) -> Iter { Iter::new( &self.inner.data, From f7e09d92114ba9bb3844401e922c4b8a85fef8a8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 29 Jun 2025 19:50:54 +0200 Subject: [PATCH 222/613] disable monkey 
temporarily

---
 src/compaction/worker.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs
index dfec3033..f264b616 100644
--- a/src/compaction/worker.rs
+++ b/src/compaction/worker.rs
@@ -265,17 +265,19 @@ fn merge_segments(
         use crate::segment::filter::BloomConstructionPolicy;
 
         if opts.config.bloom_bits_per_key >= 0 {
+            // TODO:
             // NOTE: Apply some MONKEY to have very low FPR on small levels
            // because it's cheap
            //
            // See https://nivdayan.github.io/monkeykeyvaluestore.pdf
-            match payload.dest_level {
+            /* match payload.dest_level {
                 0 => BloomConstructionPolicy::FpRate(0.00001),
                 1 => BloomConstructionPolicy::FpRate(0.0005),
                 _ => BloomConstructionPolicy::BitsPerKey(
                     opts.config.bloom_bits_per_key.unsigned_abs(),
                 ),
-            }
+            } */
+
+            BloomConstructionPolicy::BitsPerKey(opts.config.bloom_bits_per_key.unsigned_abs())
         } else {
             BloomConstructionPolicy::BitsPerKey(0)
         }

From 6d09a66073c4a85b51f270e4a1ab273bd895d858 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 29 Jun 2025 19:51:26 +0200
Subject: [PATCH 223/613] perf: correctly seek index on range read

---
 src/segment/mod.rs | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 7b55b281..84f3bf83 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -258,21 +258,9 @@ impl Segment {
             todo!();
         };
 
-        let mut index_iter = create_index_block_reader(block_index.inner().clone());
+        let index_iter = create_index_block_reader(block_index.inner().clone());
 
-        // TODO: this should probably happen lazily on first read
-        if let Bound::Excluded(key) | Bound::Included(key) = range.start_bound() {
-            index_iter.seek_lower(key);
-        }
-
-        // TODO: this should probably happen lazily on first read
-        if let Bound::Excluded(key) | Bound::Included(key) = range.end_bound() {
-            index_iter.seek_upper(key);
-        }
-
-        // TODO: need a Range struct that wraps Iter, so we can seek in first and last DATA blocks
-
-        let iter = Iter::new(
+        let mut iter = Iter::new(
             self.global_id(),
             self.path.clone(),
             index_iter,
@@ -280,6 +268,20 @@
             self.cache.clone(),
             self.metadata.data_block_compression,
         );
+
+        match range.start_bound() {
+            Bound::Excluded(key) | Bound::Included(key) => {
+                iter.set_lower_bound(key.clone());
+            }
+            Bound::Unbounded => {}
+        }
+
+        match range.end_bound() {
+            Bound::Excluded(key) | Bound::Included(key) => {
+                iter.set_upper_bound(key.clone());
+            }
+            Bound::Unbounded => {}
+        }
 
         FallibleClippingIter::new(iter, range)
     }

From f01057e029b6c7403ff5f1893bc6283e52f9c22b Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Tue, 1 Jul 2025 19:15:11 +0200
Subject: [PATCH 224/613] bump msrv to 1.82

---
 .github/workflows/test.yml | 2 +-
 Cargo.toml                 | 2 +-
 README.md                  | 2 +-
 src/segment/writer/mod.rs  | 5 ++---
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f783aa97..9b46ee32 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         rust_version:
           - stable
-          - "1.81.0" # MSRV
+          - "1.82.0" # MSRV
         os:
           - ubuntu-latest
           - windows-latest
diff --git a/Cargo.toml b/Cargo.toml
index 013c22f3..6c033a1e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,7 @@
-description = "A K.I.S.S.
implementation of log-structured merge trees (LSM-tree license = "MIT OR Apache-2.0" version = "3.0.0" edition = "2021" -rust-version = "1.81.0" +rust-version = "1.82.0" readme = "README.md" include = ["src/**/*", "LICENSE-APACHE", "LICENSE-MIT", "README.md"] repository = "https://github.com/fjall-rs/lsm-tree" diff --git a/README.md b/README.md index 1989767f..76ad6608 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) [![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) -![MSRV](https://img.shields.io/badge/MSRV-1.81.0-blue) +![MSRV](https://img.shields.io/badge/MSRV-1.82.0-blue) [![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 8c85651f..12fd1979 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -338,11 +338,10 @@ impl Writer { #[cfg(debug_assertions)] { - let mut sorted_copy = meta_items.clone(); - sorted_copy.sort(); + let is_sorted = meta_items.iter().is_sorted_by_key(|kv| &kv.key); // Just to make sure the items are definitely sorted - assert_eq!(meta_items, sorted_copy, "meta items not sorted correctly"); + assert!(is_sorted, "meta items not sorted correctly"); } log::trace!("Encoding metadata block: {meta_items:#?}"); From 6428135ef9753b30a17250c4cc9440b213998363 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 1 Jul 2025 19:40:43 +0200 Subject: [PATCH 225/613] wip --- src/segment/meta.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index efe85ec2..9bee9672 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -46,7 +46,7 @@ pub struct ParsedMeta { } impl ParsedMeta { - #[allow(clippy::expect_used)] + #[allow(clippy::expect_used, clippy::too_many_lines)] pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { let block = Block::from_file(file, *handle, CompressionType::None)?; let block = DataBlock::new(block); From 363e6479a1dd8242c4fcc5cc0805274b930966b9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 1 Jul 2025 19:41:18 +0200 Subject: [PATCH 226/613] doc --- src/segment/block/encoder.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index d56dcd28..033c1835 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -37,6 +37,27 @@ pub trait Encodable { /// /// The block encoder accepts an ascending stream of items, encodes them into /// restart intervals and builds binary index (and optionally a hash index). +/// +/// # Example +/// +/// A block with `restart_interval=4` +/// +/// ``` +/// _______________ +/// __________|__________ | +/// v v | | +/// [h][t][t][t][h][t][t][t][h][t][t][t][h][t][t][t][h][t][t][t][0,1,2,3,4][0 C F F 3][ptr][ptr] +/// ^ ^ ^ ^ ^ ^ ^ +/// 0 1 2 3 4 bin index hash index +/// +/// h = restart head +/// t = truncated item +/// ``` +/// +/// The binary index holds pointers to all restart heads. +/// Because restart heads hold a full key, they can be used to compare to a needle key. 
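+///
+/// A rough sketch of how a point read can use this layout (hypothetical
+/// helper names, not the real decoder API):
+///
+/// ```ignore
+/// // Binary-search the restart heads for the last head <= needle, then
+/// // linearly scan at most `restart_interval` truncated items from there,
+/// // re-assembling each key as shared prefix + suffix before comparing.
+/// let i = heads.partition_point(|h| h.key() <= needle).saturating_sub(1);
+/// items_from(heads[i]).take(restart_interval).find(|it| it.key() == needle);
+/// ```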
+/// +/// For explanation of hash index, see `hash_index/mod.rs`. pub struct Encoder<'a, Context: Default, Item: Encodable> { pub(crate) phantom: PhantomData<(Context, Item)>, From edf9a902a3ae9639e8da6987ef8f4f86119bca55 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 18:55:54 +0200 Subject: [PATCH 227/613] perf: optimize aggregate_key_range --- src/version/mod.rs | 19 ++++++++++++++----- src/version/run.rs | 4 ++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/version/mod.rs b/src/version/mod.rs index 1b938581..34c4e20b 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -109,12 +109,21 @@ impl Level { } pub fn aggregate_key_range(&self) -> KeyRange { - let key_ranges = self - .iter() - .map(|x| Run::aggregate_key_range(x)) - .collect::>(); + if self.run_count() == 1 { + // NOTE: We check for run_count, so the first run must exist + #[allow(clippy::expect_used)] + self.runs + .first() + .expect("should exist") + .aggregate_key_range() + } else { + let key_ranges = self + .iter() + .map(|x| Run::aggregate_key_range(x)) + .collect::>(); - KeyRange::aggregate(key_ranges.iter()) + KeyRange::aggregate(key_ranges.iter()) + } } } diff --git a/src/version/run.rs b/src/version/run.rs index 761eac68..6a39249b 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -88,8 +88,8 @@ impl Run { } pub fn aggregate_key_range(&self) -> KeyRange { - let lo = self.first().expect("runs are never empty"); - let hi = self.last().expect("runs are never empty"); + let lo = self.first().expect("run should never be empty"); + let hi = self.last().expect("run should never be empty"); KeyRange::new((lo.key_range().min().clone(), hi.key_range().max().clone())) } From 0fe25be6d006e5ca6630ffa5a365131b8980c484 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 18:56:22 +0200 Subject: [PATCH 228/613] rename macro --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2db7f8e2..0e2c9663 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -238,7 +238,7 @@ pub mod gc { }; } -macro_rules! unwrappy { +macro_rules! unwrap { ($x:expr) => { $x.expect("should read") @@ -246,4 +246,4 @@ macro_rules! 
unwrappy { }; } -pub(crate) use unwrappy; +pub(crate) use unwrap; From bc8dc6d3c803cc1af67fcdda59b0cc96c3255071 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 18:57:08 +0200 Subject: [PATCH 229/613] update import --- src/segment/block/binary_index/reader.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs index a8acd484..9354a87e 100644 --- a/src/segment/block/binary_index/reader.rs +++ b/src/segment/block/binary_index/reader.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::unwrappy; +use crate::unwrap; use byteorder::{LittleEndian, ReadBytesExt}; pub struct Reader<'a> { @@ -38,9 +38,9 @@ impl<'a> Reader<'a> { let mut bytes = &self.bytes[offset..]; if self.step_size == 2 { - unwrappy!(bytes.read_u16::()).into() + unwrap!(bytes.read_u16::()).into() } else { - unwrappy!(bytes.read_u32::()) as usize + unwrap!(bytes.read_u32::()) as usize } } From 508eb21c281112f59e7c9293402e542cd4934328 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 18:59:24 +0200 Subject: [PATCH 230/613] wip --- src/compaction/leveled.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 2dbee8dc..5c1b9cdb 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -7,15 +7,15 @@ use crate::{ config::Config, level_manifest::{hidden_set::HiddenSet, LevelManifest}, segment::Segment, - version::Run, + version::{run::Ranged, Run}, windows::{GrowingWindowsExt, ShrinkingWindowsExt}, HashSet, KeyRange, SegmentId, }; -// TODO: for a disjoint set of segments, we could just take the first and last segment and use their first and last key respectively -/// Aggregates the key range of a list of segments. 
-fn aggregate_key_range(segments: &[Segment]) -> KeyRange { - KeyRange::aggregate(segments.iter().map(|x| &x.metadata.key_range)) +pub fn aggregate_run_key_range(segments: &[Segment]) -> KeyRange { + let lo = segments.first().expect("run should never be empty"); + let hi = segments.last().expect("run should never be empty"); + KeyRange::new((lo.key_range().min().clone(), hi.key_range().max().clone())) } /// Tries to find the most optimal compaction set from @@ -72,7 +72,7 @@ fn pick_minimal_compaction( continue; } - let key_range = aggregate_key_range(window); + let key_range = aggregate_run_key_range(window); // Pull in all segments in current level into compaction let curr_level_pull_in: Vec<_> = curr_run.get_contained(&key_range).collect(); @@ -109,7 +109,7 @@ fn pick_minimal_compaction( // NOTE: Find largest trivial move (if it exists) for window in curr_run.shrinking_windows() { - let key_range = aggregate_key_range(window); + let key_range = aggregate_run_key_range(window); if let Some(next_run) = &next_run { if next_run.get_overlapping(&key_range).next().is_none() { From 6d21314181b1aa0e396237047247a2b90b87ea7b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:00:31 +0200 Subject: [PATCH 231/613] remove unnecessary assertions --- src/compaction/leveled.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 5c1b9cdb..625f5281 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -26,9 +26,6 @@ fn pick_minimal_compaction( hidden_set: &HiddenSet, overshoot: u64, ) -> Option<(HashSet, bool)> { - // assert!(curr_level.is_disjoint, "Lx is not disjoint"); - // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); - struct Choice { write_amp: f32, segment_ids: HashSet, From 1178fc530d77adc5ff7272429e6a5b15639238f7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:00:38 +0200 Subject: [PATCH 232/613] add debug asserts --- src/compaction/leveled.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 625f5281..41084ffe 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -251,6 +251,9 @@ impl CompactionStrategy for Strategy { break; }; + debug_assert!(level.is_disjoint(), "level should be disjoint"); + debug_assert!(next_level.is_disjoint(), "next level should be disjoint"); + let Some((segment_ids, can_trivial_move)) = pick_minimal_compaction( level.first_run().expect("should have exactly one run"), next_level.first_run().map(std::ops::Deref::deref), From 60bb3a173a6561edcf46f0b9c016ba7e6781749f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:01:05 +0200 Subject: [PATCH 233/613] comment out debug logs --- src/compaction/leveled.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 41084ffe..d14bfb06 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -269,13 +269,13 @@ impl CompactionStrategy for Strategy { target_size: u64::from(self.target_size), }; - eprintln!( + /* eprintln!( "{} {} segments, L{}->L{next_level_index}: {:?}", if can_trivial_move { "move" } else { "merge" }, choice.segment_ids.len(), next_level_index - 1, choice.segment_ids, - ); + ); */ if can_trivial_move && level.is_disjoint() { return Choice::Move(choice); @@ -352,11 +352,11 @@ impl CompactionStrategy for Strategy { target_size: u64::from(self.target_size), }; - eprintln!( + /* eprintln!( "merge {} 
segments, L0->L1: {:?}", choice.segment_ids.len(), choice.segment_ids, - ); + ); */ if next_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { return Choice::Move(choice); From 1b345723c07439acd59037e8b72565a5f984409e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:01:36 +0200 Subject: [PATCH 234/613] block decoder docs --- src/segment/block/decoder.rs | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index f1b0f6ca..46dfcfa9 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -5,20 +5,15 @@ use super::{binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader}; use crate::{ segment::{block::Trailer, Block}, - unwrappy, Slice, + unwrap, Slice, }; use byteorder::{LittleEndian, ReadBytesExt}; use std::{io::Cursor, marker::PhantomData}; +/// Represents an object that was parsed from a byte array +/// +/// Parsed items only hold references to their keys and values, use `materialize` to create an owned value. pub trait ParsedItem { - /// Returns the key as byte slice. - /// - /// # Warning - /// - /// May only be called on a restart head as a prefix-truncated item cannot be - /// represented by a single byte slice. - fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8]; - /// Compares this item's key with a needle. /// /// We can not access the key directly because it may be comprised of prefix + suffix. @@ -31,8 +26,11 @@ pub trait ParsedItem { fn materialize(&self, bytes: &Slice) -> M; } +/// Describes an object that can be parsed from a block, either a full item (restart head), or a truncated item pub trait Decodable { /// Parses the key of the next restart head from a reader. + /// + /// This is used for the binary search index. fn parse_restart_key<'a>( reader: &mut Cursor<&[u8]>, offset: usize, @@ -67,6 +65,9 @@ struct HiScanner { base_key_offset: Option, } +/// Generic block decoder for RocksDB-style blocks +/// +/// Supports prefix truncation, binary search index (through restart intervals) and optionally hash indexes. 
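+///
+/// Iteration is double-ended: a `lo_scanner` walks the block forwards while a
+/// `hi_scanner` walks it backwards, and iteration stops once the two offsets
+/// meet, so `next()` and `next_back()` never yield the same item twice.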
pub struct Decoder<'a, Item: Decodable, Parsed: ParsedItem> { block: &'a Block, phantom: PhantomData<(Item, Parsed)>, @@ -93,20 +94,20 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa let _item_count = reader.read_u32::().expect("should read"); - let restart_interval = unwrappy!(reader.read_u8()); + let restart_interval = unwrap!(reader.read_u8()); - let binary_index_step_size = unwrappy!(reader.read_u8()); + let binary_index_step_size = unwrap!(reader.read_u8()); debug_assert!( binary_index_step_size == 2 || binary_index_step_size == 4, "invalid binary index step size", ); - let binary_index_offset = unwrappy!(reader.read_u32::()); - let binary_index_len = unwrappy!(reader.read_u32::()); + let binary_index_offset = unwrap!(reader.read_u32::()); + let binary_index_len = unwrap!(reader.read_u32::()); - let hash_index_offset = unwrappy!(reader.read_u32::()); - let hash_index_len = unwrappy!(reader.read_u32::()); + let hash_index_offset = unwrap!(reader.read_u32::()); + let hash_index_len = unwrap!(reader.read_u32::()); Self { block, From ac64c9c69150b809792e4415ef2e4c6127f34d9e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:01:53 +0200 Subject: [PATCH 235/613] wip --- src/segment/data_block/mod.rs | 57 +++++++++++++++-------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 18d1277d..4d1b557b 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -7,12 +7,11 @@ mod iter; pub use iter::Iter; use super::block::{ - Block, Decodable, Decoder, Encodable, Encoder, ParsedItem as Parsy, Trailer, - TRAILER_START_MARKER, + Block, Decodable, Decoder, Encodable, Encoder, ParsedItem, Trailer, TRAILER_START_MARKER, }; use crate::key::InternalKey; use crate::segment::util::{compare_prefixed_slice, SliceIndexes}; -use crate::{unwrappy, InternalValue, SeqNo, Slice, ValueType}; +use crate::{unwrap, InternalValue, SeqNo, Slice, ValueType}; use byteorder::WriteBytesExt; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::Cursor; @@ -25,41 +24,41 @@ impl Decodable for InternalValue { offset: usize, data: &'a [u8], ) -> Option<&'a [u8]> { - let value_type = unwrappy!(reader.read_u8()); + let value_type = unwrap!(reader.read_u8()); if value_type == TRAILER_START_MARKER { return None; } - let _seqno = unwrappy!(reader.read_u64_varint()); + let _seqno = unwrap!(reader.read_u64_varint()); - let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_len: usize = unwrap!(reader.read_u16_varint()).into(); let key_start = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(key_len as i64)); + unwrap!(reader.seek_relative(key_len as i64)); data.get(key_start..(key_start + key_len)) } fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { - let value_type = unwrappy!(reader.read_u8()); + let value_type = unwrap!(reader.read_u8()); if value_type == TRAILER_START_MARKER { return None; } - let seqno = unwrappy!(reader.read_u64_varint()); + let seqno = unwrap!(reader.read_u64_varint()); - let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_len: usize = unwrap!(reader.read_u16_varint()).into(); let key_start = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(key_len as i64)); + unwrap!(reader.seek_relative(key_len as i64)); let val_len: usize = if value_type == u8::from(ValueType::Value) { - unwrappy!(reader.read_u32_varint()) as usize + unwrap!(reader.read_u32_varint()) 
as usize } else { 0 }; let val_offset = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(val_len as i64)); + unwrap!(reader.seek_relative(val_len as i64)); Some(if value_type == u8::from(ValueType::Value) { DataBlockParsedItem { @@ -85,28 +84,28 @@ impl Decodable for InternalValue { offset: usize, base_key_offset: usize, ) -> Option { - let value_type = unwrappy!(reader.read_u8()); + let value_type = unwrap!(reader.read_u8()); if value_type == TRAILER_START_MARKER { return None; } - let seqno = unwrappy!(reader.read_u64_varint()); + let seqno = unwrap!(reader.read_u64_varint()); - let shared_prefix_len: usize = unwrappy!(reader.read_u16_varint()).into(); - let rest_key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let shared_prefix_len: usize = unwrap!(reader.read_u16_varint()).into(); + let rest_key_len: usize = unwrap!(reader.read_u16_varint()).into(); let key_offset = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(rest_key_len as i64)); + unwrap!(reader.seek_relative(rest_key_len as i64)); let val_len: usize = if value_type == u8::from(ValueType::Value) { - unwrappy!(reader.read_u32_varint()) as usize + unwrap!(reader.read_u32_varint()) as usize } else { 0 }; let val_offset = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(val_len as i64)); + unwrap!(reader.seek_relative(val_len as i64)); Some(if value_type == u8::from(ValueType::Value) { DataBlockParsedItem { @@ -215,13 +214,7 @@ pub struct DataBlockParsedItem { pub value: Option, } -impl Parsy for DataBlockParsedItem { - fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8] { - debug_assert!(self.prefix.is_none(), "can only get key of restart heads"); - - unwrappy!(bytes.get(self.key.0..self.key.1)) - } - +impl ParsedItem for DataBlockParsedItem { fn compare_key(&self, needle: &[u8], bytes: &[u8]) -> std::cmp::Ordering { if let Some(prefix) = &self.prefix { let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; @@ -337,9 +330,9 @@ impl DataBlock { let offset = std::mem::size_of::() + (2 * std::mem::size_of::()) + std::mem::size_of::(); - let mut reader = unwrappy!(trailer.as_slice().get(offset..)); + let mut reader = unwrap!(trailer.as_slice().get(offset..)); - unwrappy!(reader.read_u32::()) + unwrap!(reader.read_u32::()) } /// Returns the number of hash buckets. 
@@ -353,10 +346,10 @@ impl DataBlock { + (2 * std::mem::size_of::()) + (2 * std::mem::size_of::()); - let mut reader = unwrappy!(trailer.as_slice().get(offset..)); + let mut reader = unwrap!(trailer.as_slice().get(offset..)); - let hash_index_offset = unwrappy!(reader.read_u32::()); - let hash_index_len = unwrappy!(reader.read_u32::()); + let hash_index_offset = unwrap!(reader.read_u32::()); + let hash_index_len = unwrap!(reader.read_u32::()); if hash_index_offset > 0 { Some(hash_index_len) From c220b34918a6914c240163cbf0c065b8b2e823ee Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:02:01 +0200 Subject: [PATCH 236/613] wip --- src/segment/index_block/block_handle.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 5e9f07e6..5ced9049 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -9,7 +9,7 @@ use crate::{ index_block::IndexBlockParsedItem, util::SliceIndexes, }, - unwrappy, + unwrap, }; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::{Cursor, Seek}; @@ -214,19 +214,19 @@ impl Encodable for KeyedBlockHandle { impl Decodable for KeyedBlockHandle { fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { - let marker = unwrappy!(reader.read_u8()); + let marker = unwrap!(reader.read_u8()); if marker == TRAILER_START_MARKER { return None; } - let file_offset = unwrappy!(reader.read_u64_varint()); - let size = unwrappy!(reader.read_u32_varint()); + let file_offset = unwrap!(reader.read_u64_varint()); + let size = unwrap!(reader.read_u32_varint()); - let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_len: usize = unwrap!(reader.read_u16_varint()).into(); let key_start = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(key_len as i64)); + unwrap!(reader.seek_relative(key_len as i64)); Some(IndexBlockParsedItem { prefix: None, @@ -241,19 +241,19 @@ impl Decodable for KeyedBlockHandle { offset: usize, data: &'a [u8], ) -> Option<&'a [u8]> { - let marker = unwrappy!(reader.read_u8()); + let marker = unwrap!(reader.read_u8()); if marker == TRAILER_START_MARKER { return None; } - let _file_offset = unwrappy!(reader.read_u64_varint()); - let _size = unwrappy!(reader.read_u32_varint()); + let _file_offset = unwrap!(reader.read_u64_varint()); + let _size = unwrap!(reader.read_u32_varint()); - let key_len: usize = unwrappy!(reader.read_u16_varint()).into(); + let key_len: usize = unwrap!(reader.read_u16_varint()).into(); let key_start = offset + reader.position() as usize; - unwrappy!(reader.seek_relative(key_len as i64)); + unwrap!(reader.seek_relative(key_len as i64)); data.get(key_start..(key_start + key_len)) } From cb045132986e822382628801acbde718811c797c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:02:29 +0200 Subject: [PATCH 237/613] disallow index blocks with restart interval > 1 for now --- src/segment/index_block/iter.rs | 18 +++++++++--------- src/segment/index_block/mod.rs | 23 +++++++---------------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index b3acbd18..9fcdcfd6 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -63,7 +63,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = 
IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -99,7 +99,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -133,7 +133,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -170,7 +170,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -204,7 +204,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -238,7 +238,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -276,7 +276,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -310,7 +310,7 @@ mod tests { KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -360,7 +360,7 @@ mod tests { KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items, 1)?; + let bytes = IndexBlock::encode_items(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 9fa7aa92..c8f8b220 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -12,14 +12,11 @@ use super::{ block::{BlockOffset, Encoder, Trailer}, Block, }; -use crate::Slice; -use crate::{ - segment::{ - block::{Decoder, ParsedItem as Parsy}, - util::{compare_prefixed_slice, SliceIndexes}, - }, - unwrappy, +use crate::segment::{ + block::{Decoder, ParsedItem}, + util::{compare_prefixed_slice, SliceIndexes}, }; +use crate::Slice; #[derive(Debug)] pub struct IndexBlockParsedItem { @@ -29,7 +26,7 @@ pub struct IndexBlockParsedItem { pub end_key: SliceIndexes, } -impl Parsy for IndexBlockParsedItem { +impl ParsedItem for IndexBlockParsedItem { fn compare_key(&self, needle: &[u8], bytes: &[u8]) -> std::cmp::Ordering { if let Some(prefix) = &self.prefix { let prefix = unsafe { bytes.get_unchecked(prefix.0..prefix.1) }; @@ -41,12 +38,6 @@ impl Parsy for IndexBlockParsedItem { } } - fn key<'a>(&self, bytes: &'a [u8]) -> &'a [u8] { - debug_assert!(self.prefix.is_none(), "can only get key of restart heads"); - - unwrappy!(bytes.get(self.end_key.0..self.end_key.1)) - } - fn key_offset(&self) -> usize { self.end_key.0 } @@ -95,13 +86,13 @@ impl IndexBlock { pub fn encode_items( items: &[KeyedBlockHandle], - restart_interval: u8, + // 
restart_interval: u8, // TODO: support prefix truncation + delta encoding ) -> crate::Result> { let first_key = items.first().expect("chunk should not be empty").end_key(); let mut serializer = Encoder::<'_, BlockOffset, KeyedBlockHandle>::new( items.len(), - restart_interval, + 1, // TODO: hard coded for now 0.0, // NOTE: Index blocks do not support hash index first_key, ); From 474e7f51310a4b8c22056e7326a600dc7253f03e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:02:32 +0200 Subject: [PATCH 238/613] wip --- src/segment/writer/index.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index 898c8ffb..150d180f 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -70,8 +70,7 @@ impl BlockIndexWriter for FullIndexWriter ) -> crate::Result<(BlockHandle, Option)> { let tli_ptr = BlockOffset(block_file_writer.stream_position()?); - let bytes = - IndexBlock::encode_items(&self.block_handles, 1 /* TODO: hard coded for now */)?; + let bytes = IndexBlock::encode_items(&self.block_handles)?; let header = Block::to_writer(block_file_writer, &bytes, self.compression)?; From 4a41c9c2bdca94f697875429e31929f848ca8f76 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:02:51 +0200 Subject: [PATCH 239/613] add more metadata props --- src/segment/writer/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 12fd1979..3561859a 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -299,6 +299,7 @@ impl Writer { let meta_items = [ meta("#checksum_type", b"xxh3"), meta("#compression#data", &self.compression.encode_into_vec()), + meta("#compression#index", &self.compression.encode_into_vec()), meta("#created_at", &unix_timestamp().as_nanos().to_le_bytes()), meta( "#data_block_count", @@ -320,6 +321,8 @@ impl Writer { self.meta.first_key.as_ref().expect("should exist"), ), meta("#key_count", &(self.meta.key_count as u64).to_le_bytes()), + meta("#prefix_truncation#data", &[1]), + meta("#prefix_truncation#index", &[0]), meta("#seqno#max", &self.meta.highest_seqno.to_le_bytes()), meta("#seqno#min", &self.meta.lowest_seqno.to_le_bytes()), meta("#size", &self.meta.file_pos.to_le_bytes()), @@ -332,15 +335,14 @@ impl Writer { &self.meta.uncompressed_size.to_le_bytes(), ), meta("v#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), - meta("v#table", b"3.0"), + meta("v#table", b"3"), // TODO: tli_handle_count ]; + // NOTE: Just to make sure the items are definitely sorted #[cfg(debug_assertions)] { let is_sorted = meta_items.iter().is_sorted_by_key(|kv| &kv.key); - - // Just to make sure the items are definitely sorted assert!(is_sorted, "meta items not sorted correctly"); } From 802fc55d37cef116163d90e28ce68ff340b48c61 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:03:00 +0200 Subject: [PATCH 240/613] wip --- src/tree/inner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/inner.rs b/src/tree/inner.rs index 988981f9..fe55d191 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -43,7 +43,7 @@ impl SealedMemtables { } } -/// Hands out a unique (monotonically increasing) tree ID +/// Hands out a unique (monotonically increasing) tree ID. 
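+///
+/// e.g. the first tree opened in a process receives ID 0, the next one ID 1,
+/// and so on, via the process-wide atomic counter below.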
pub fn get_next_tree_id() -> TreeId { static TREE_ID_COUNTER: AtomicU64 = AtomicU64::new(0); TREE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) From 173344751bae5d7f15f00d9b1ac8d9468e51bee9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:10:06 +0200 Subject: [PATCH 241/613] perf: correctly seek segment iter --- src/segment/iter.rs | 77 +++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 1c20d70c..387c0b14 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -4,10 +4,13 @@ use super::{ data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId, KeyedBlockHandle, -}; + +use super::{data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId}; use crate::{ - segment::{block::ParsedItem, util::load_block, BlockHandle}, - Cache, CompressionType, DescriptorTable, InternalValue, SeqNo, + segment::{ + block::ParsedItem, block_index::iter::OwnedIndexBlockIter, util::load_block, BlockHandle, + }, + Cache, CompressionType, DescriptorTable, InternalValue, SeqNo, UserKey, }; use self_cell::self_cell; use std::{path::PathBuf, sync::Arc}; @@ -56,15 +59,12 @@ pub fn create_data_block_reader(block: DataBlock) -> OwnedDataBlockIter { OwnedDataBlockIter::new(block, super::data_block::DataBlock::iter) } -pub struct Iter -where - I: DoubleEndedIterator, -{ +pub struct Iter { segment_id: GlobalSegmentId, path: Arc, #[allow(clippy::struct_field_names)] - index_iter: I, + index_iter: OwnedIndexBlockIter, descriptor_table: Arc, cache: Arc, compression: CompressionType, @@ -74,16 +74,15 @@ where hi_offset: BlockOffset, hi_data_block: Option, + + range: (Option, Option), } -impl Iter -where - I: DoubleEndedIterator, -{ +impl Iter { pub fn new( segment_id: GlobalSegmentId, path: Arc, - index_iter: I, + index_iter: OwnedIndexBlockIter, descriptor_table: Arc, cache: Arc, compression: CompressionType, @@ -102,14 +101,21 @@ where hi_offset: BlockOffset(u64::MAX), hi_data_block: None, + + range: (None, None), } } + + pub fn set_lower_bound(&mut self, key: UserKey) { + self.range.0 = Some(key); + } + + pub fn set_upper_bound(&mut self, key: UserKey) { + self.range.1 = Some(key); + } } -impl Iterator for Iter -where - I: DoubleEndedIterator, -{ +impl Iterator for Iter { type Item = crate::Result; fn next(&mut self) -> Option { @@ -119,6 +125,12 @@ where } } + if self.lo_data_block.is_none() { + if let Some(key) = &self.range.0 { + self.index_iter.seek_lower(key); + } + } + let Some(handle) = self.index_iter.next() else { // NOTE: No more block handles from index, // Now check hi buffer if it exists @@ -145,7 +157,7 @@ where &self.descriptor_table, &self.cache, &BlockHandle::new(handle.offset(), handle.size()), - self.compression + self.compression, )) } }; @@ -153,11 +165,12 @@ where let mut reader = create_data_block_reader(block); - // TODO: - /* // NOTE: This is the first block, seek it + // NOTE: This is the first block, seek in it if self.lo_data_block.is_none() { - reader.seek_lower(self.); - } */ + if let Some(key) = &self.range.0 { + reader.seek_lower(key, SeqNo::MAX); + } + } let item = reader.next(); @@ -168,10 +181,7 @@ where } } -impl DoubleEndedIterator for Iter -where - I: DoubleEndedIterator, -{ +impl DoubleEndedIterator for Iter { fn next_back(&mut self) -> Option { if let Some(block) = &mut self.hi_data_block { if let Some(item) = block.next_back().map(Ok) { @@ -179,6 +189,12 @@ where } } + if self.hi_data_block.is_none() { + if 
let Some(key) = &self.range.1 {
+                self.index_iter.seek_upper(key);
+            }
+        }
+
         let Some(handle) = self.index_iter.next_back() else {
             // NOTE: No more block handles from index,
             // Now check lo buffer if it exists
@@ -214,7 +230,7 @@
                     &self.descriptor_table,
                     &self.cache,
                     &BlockHandle::new(handle.offset(), handle.size()),
-                    self.compression
+                    self.compression,
                 ))
             }
         };
@@ -222,6 +238,13 @@
 
         let mut reader = create_data_block_reader(block);
 
+        // NOTE: This is the first block, seek in it
+        if self.hi_data_block.is_none() {
+            if let Some(key) = &self.range.1 {
+                reader.seek_upper(key, SeqNo::MAX);
+            }
+        }
+
         let item = reader.next_back();
 
         self.hi_offset = handle.offset();

From 27d7ecea5838f6cc789833a9a49fbf2ca94b25eb Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 14 Jul 2025 19:12:17 +0200
Subject: [PATCH 242/613] update ingestion

---
 src/tree/ingest.rs | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs
index a8cfd7d0..f2feb091 100644
--- a/src/tree/ingest.rs
+++ b/src/tree/ingest.rs
@@ -4,9 +4,6 @@
 use super::Tree;
-use crate::{
-    segment::{multi_writer::MultiWriter, Segment},
-    AbstractTree, UserKey, UserValue, ValueType,
-};
-use std::{path::PathBuf, sync::Arc};
+use crate::{segment::multi_writer::MultiWriter, AbstractTree, UserKey, UserValue};
+use std::path::PathBuf;
 
 pub struct Ingestion<'a> {
     folder: PathBuf,
@@ -24,34 +24,16 @@ impl<'a> Ingestion<'a> {
             "can only perform bulk_ingest on empty trees",
         );
 
-        let folder = tree.config.path.join(SEGMENTS_FOLDER);
+        let folder = tree.config.path.join(crate::file::SEGMENTS_FOLDER);
         log::debug!("Ingesting into disk segments in {folder:?}");
 
         let writer = MultiWriter::new(
             folder.clone(),
             tree.segment_id_counter.clone(),
-            128 * 1_024 * 1_024,
-            /* crate::segment::writer::Options {
-                folder: folder.clone(),
-                data_block_size: tree.config.data_block_size,
-                index_block_size: tree.config.index_block_size,
-                segment_id: 0, /* TODO: unused */
-            }, */
+            64 * 1_024 * 1_024, // TODO: look at tree configuration
        )?
.use_compression(tree.config.compression); - /* { - use crate::segment::writer::BloomConstructionPolicy; - - if tree.config.bloom_bits_per_key >= 0 { - writer = writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey( - tree.config.bloom_bits_per_key.unsigned_abs(), - )); - } else { - writer = writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); - } - } */ - Ok(Self { folder, tree, @@ -64,7 +46,7 @@ impl<'a> Ingestion<'a> { key, value, 0, - ValueType::Value, + crate::ValueType::Value, )) } @@ -128,16 +110,7 @@ impl<'a> Ingestion<'a> { self.tree.register_segments(&created_segments)?; - self.tree.compact(Arc::new(MoveDown(0, 6)), 0)?; - - /* for segment in &created_segments { - let segment_file_path = self.folder.join(segment.id().to_string()); - - self.tree - .config - .descriptor_table - .insert(&segment_file_path, segment.global_id()); - } */ + self.tree.compact(Arc::new(MoveDown(0, 2)), 0)?; Ok(()) } From ac9378ac6d69e0a8d68adad10a69f3784fe06d03 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:12:31 +0200 Subject: [PATCH 243/613] cap leveled compaction to 100 segments for now --- src/compaction/leveled.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index d14bfb06..5239d168 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -44,16 +44,7 @@ fn pick_minimal_compaction( } else { // TODO: this should not consider the number of segments, but the amount of rewritten data // which corresponds to the amount of temporary space amp - - // NOTE: Keep compactions with N or less segments - // to make compactions not too large - // - // This value is currently manually fine-tuned based on benchmarks - // with 50%/50% read-write workload - // - // Making compactions too granular heavily increases read tail latencies - choice.segment_ids.len() < 100 - // true + choice.segment_ids.len() <= 100 }; if valid_choice { From 88a1958b63b8b3afb79b80812f79286568883344 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:13:00 +0200 Subject: [PATCH 244/613] reimplement movedown "compaction" strategy --- src/compaction/movedown.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index 1f0dfc28..f96f4995 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; +use crate::{level_manifest::LevelManifest, segment::Segment, Config}; /// Moves down a level into the destination level. 
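 ///
 /// `Strategy(src, dest)` moves all segments of level `src` into level `dest`.
 /// A minimal usage sketch, mirroring what `Ingestion::finish` does after bulk
 /// ingestion:
 ///
 /// ```ignore
 /// // Move everything from L0 straight down to L2 without rewriting data:
 /// tree.compact(Arc::new(MoveDown(0, 2)), 0)?;
 /// ```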
pub struct Strategy(pub u8, pub u8); @@ -15,6 +15,24 @@ impl CompactionStrategy for Strategy { #[allow(clippy::expect_used)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - todo!() + if levels.busy_levels().contains(&self.0) { + return Choice::DoNothing; + } + + let Some(level) = levels.as_slice().get(self.0 as usize) else { + return Choice::DoNothing; + }; + + let segment_ids = level + .iter() + .flat_map(|run| run.iter()) + .map(Segment::id) + .collect(); + + Choice::Move(Input { + segment_ids, + dest_level: self.1, + target_size: u64::MAX, + }) } } From e900848a49723aa687579d98d43c822fed15fe23 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:13:12 +0200 Subject: [PATCH 245/613] add metrics struct --- src/lib.rs | 3 +++ src/metrics.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 src/metrics.rs diff --git a/src/lib.rs b/src/lib.rs index 0e2c9663..760fb155 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -164,6 +164,9 @@ mod descriptor_table; #[doc(hidden)] pub mod merge; +#[cfg(feature = "metrics")] +pub(crate) mod metrics; + mod multi_reader; #[doc(hidden)] diff --git a/src/metrics.rs b/src/metrics.rs new file mode 100644 index 00000000..5c56fc90 --- /dev/null +++ b/src/metrics.rs @@ -0,0 +1,48 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering::Relaxed; + +#[derive(Debug, Default)] +pub struct Metrics { + /// Number of blocks that were actually read from disk + pub(crate) block_load_io: AtomicUsize, + + /// Number of blocks that were read from block cache + pub(crate) block_load_cached: AtomicUsize, + + /// Number of bloom filter queries that were performed + pub(crate) bloom_filter_queries: AtomicUsize, + + /// Number of IOs that were skipped due to bloom filter hits + pub(crate) bloom_filter_hits: AtomicUsize, +} + +#[allow(clippy::cast_precision_loss)] +impl Metrics { + /// Number of blocks that were read from disk. + pub fn block_loads_io(&self) -> usize { + self.block_load_io.load(Relaxed) + } + + /// Number of blocks that were accessed. + pub fn block_loads(&self) -> usize { + self.block_load_cached.load(Relaxed) + self.block_load_io.load(Relaxed) + } + + /// Block cache efficiency in percent (0.0 - 1.0). + pub fn block_cache_efficiency(&self) -> f64 { + let queries = self.block_loads() as f64; + let hits = self.block_load_cached.load(Relaxed) as f64; + hits / queries + } + + /// Filter efficiency in percent (0.0 - 1.0). + pub fn bloom_filter_efficiency(&self) -> f64 { + let queries = self.bloom_filter_queries.load(Relaxed) as f64; + let hits = self.bloom_filter_hits.load(Relaxed) as f64; + hits / queries + } +} From 6b5841610cc7913e9ef2a072068a6fa5d262bf03 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:23:23 +0200 Subject: [PATCH 246/613] add pinned index blocks stat --- src/abstract.rs | 3 +++ src/blob_tree/mod.rs | 6 ++++++ src/segment/mod.rs | 9 +++++++++ src/tree/mod.rs | 10 ++++++++++ 4 files changed, 28 insertions(+) diff --git a/src/abstract.rs b/src/abstract.rs index 4dbfb7c2..8a9af265 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -45,6 +45,9 @@ pub trait AbstractTree { /// Gets the memory usage of all pinned bloom filters in the tree. fn pinned_bloom_filter_size(&self) -> usize; + /// Gets the memory usage of all pinned index blocks in the tree. 
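+    ///
+    /// (This sums `Segment::pinned_block_index_size` over all live segments.)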
+ fn pinned_block_index_size(&self) -> usize; + // TODO:? /* #[doc(hidden)] fn verify(&self) -> crate::Result; */ diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 354e6097..07d5f573 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -322,6 +322,8 @@ impl AbstractTree for BlobTree { let vhandle = self.index.get_vhandle(key.as_ref(), seqno)?; Ok(vhandle.map(|x| match x { + // NOTE: Values are u32 length max + #[allow(clippy::cast_possible_truncation)] MaybeInlineValue::Inline(v) => v.len() as u32, // NOTE: We skip reading from the value log @@ -334,6 +336,10 @@ impl AbstractTree for BlobTree { self.index.pinned_bloom_filter_size() } + fn pinned_block_index_size(&self) -> usize { + self.index.pinned_block_index_size() + } + fn sealed_memtable_count(&self) -> usize { self.index.sealed_memtable_count() } diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 84f3bf83..d86c3749 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -112,6 +112,15 @@ impl Segment { .unwrap_or_default() } + #[must_use] + pub fn pinned_block_index_size(&self) -> usize { + if let BlockIndexImpl::Full(full_block_index) = &*self.block_index { + full_block_index.inner().inner.size() + } else { + unimplemented!(); + } + } + /// Gets the segment ID. /// /// The segment ID is unique for this tree, but not diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 5a56afb5..178622a0 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -126,6 +126,16 @@ impl AbstractTree for Tree { .sum() } + fn pinned_block_index_size(&self) -> usize { + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(Segment::pinned_block_index_size) + .sum() + } + fn sealed_memtable_count(&self) -> usize { self.sealed_memtables .read() From 6d20df20af4cfff39c1f4cd20b37ece137b12bee Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:25:03 +0200 Subject: [PATCH 247/613] add block IO and bloom filter metrics --- src/compaction/worker.rs | 10 +++++ src/segment/inner.rs | 6 +++ src/segment/iter.rs | 15 +++++++- src/segment/mod.rs | 82 ++++++++++++++++++++++++++++++++++------ src/segment/util.rs | 13 +++++++ src/tree/ingest.rs | 58 +++++++--------------------- src/tree/inner.rs | 9 +++++ src/tree/mod.rs | 15 ++++++++ 8 files changed, 149 insertions(+), 59 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index f264b616..21e11d31 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -2,6 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use super::{CompactionStrategy, Input as CompactionPayload}; use crate::{ compaction::{stream::CompactionStream, Choice}, @@ -42,6 +45,9 @@ pub struct Options { /// Evicts items that are older than this seqno (MVCC GC). 
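     ///
     /// Roughly: during compaction, an old version of a key whose seqno lies
     /// below this threshold can be dropped once a newer version shadows it.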
pub eviction_seqno: u64, + + #[cfg(feature = "metrics")] + pub metrics: Arc, } impl Options { @@ -54,6 +60,8 @@ impl Options { stop_signal: tree.stop_signal.clone(), strategy, eviction_seqno: 0, + #[cfg(feature = "metrics")] + metrics: tree.metrics.clone(), } } } @@ -352,6 +360,8 @@ fn merge_segments( opts.config.cache.clone(), opts.config.descriptor_table.clone(), payload.dest_level <= 2, // TODO: look at configuration + #[cfg(feature = "metrics")] + opts.metrics.clone(), ) /* let segment_id = trailer.metadata.id; diff --git a/src/segment/inner.rs b/src/segment/inner.rs index 0438cb99..a41db211 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -2,6 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use super::{block_index::BlockIndexImpl, meta::ParsedMeta, regions::ParsedRegions, Block}; use crate::{ cache::Cache, descriptor_table::DescriptorTable, tree::inner::TreeId, GlobalSegmentId, @@ -44,6 +47,9 @@ pub struct Inner { // #[doc(hidden)] // pub bloom_filter: Option, pub is_deleted: AtomicBool, + + #[cfg(feature = "metrics")] + pub(crate) metrics: Arc, } impl Drop for Inner { diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 387c0b14..6579f57c 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -2,8 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{ - data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId, KeyedBlockHandle, +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; use super::{data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId}; use crate::{ @@ -76,6 +76,9 @@ pub struct Iter { hi_data_block: Option, range: (Option, Option), + + #[cfg(feature = "metrics")] + metrics: Arc, } impl Iter { @@ -86,6 +89,7 @@ impl Iter { descriptor_table: Arc, cache: Arc, compression: CompressionType, + #[cfg(feature = "metrics")] metrics: Arc, ) -> Self { Self { segment_id, @@ -103,6 +107,9 @@ impl Iter { hi_data_block: None, range: (None, None), + + #[cfg(feature = "metrics")] + metrics, } } @@ -158,6 +165,8 @@ impl Iterator for Iter { &self.cache, &BlockHandle::new(handle.offset(), handle.size()), self.compression, + #[cfg(feature = "metrics")] + &self.metrics, )) } }; @@ -231,6 +240,8 @@ impl DoubleEndedIterator for Iter { &self.cache, &BlockHandle::new(handle.offset(), handle.size()), self.compression, + #[cfg(feature = "metrics")] + &self.metrics, )) } }; diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d86c3749..132ed27f 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -22,28 +22,26 @@ pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; pub use id::{GlobalSegmentId, SegmentId}; pub use index_block::{BlockHandle, IndexBlock, KeyedBlockHandle}; -use regions::ParsedRegions; pub use scanner::Scanner; -use util::load_block; pub use writer::Writer; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use crate::{ - cache::Cache, descriptor_table::DescriptorTable, fallible_clipping_iter::FallibleClippingIter, - segment::block_index::iter::create_index_block_reader, CompressionType, InternalValue, SeqNo, - TreeId, UserKey, + cache::Cache, descriptor_table::DescriptorTable, CompressionType, InternalValue, SeqNo, TreeId, + UserKey, }; -use block_index::{BlockIndexImpl, FullBlockIndex}; -use filter::standard_bloom::{CompositeHash, 
StandardBloomFilterReader}; +use block_index::BlockIndexImpl; +use filter::standard_bloom::CompositeHash; use inner::Inner; use iter::Iter; -use meta::ParsedMeta; use std::{ ops::{Bound, RangeBounds}, path::PathBuf, - sync::{atomic::AtomicBool, Arc}, + sync::Arc, }; - -// todo +use util::load_block; // TODO: segment iter: // TODO: we only need to truncate items from blocks that are not the first and last block @@ -142,6 +140,8 @@ impl Segment { &self.cache, handle, compression, + #[cfg(feature = "metrics")] + &self.metrics, ) } @@ -156,6 +156,10 @@ impl Segment { seqno: SeqNo, key_hash: CompositeHash, ) -> crate::Result> { + use filter::standard_bloom::StandardBloomFilterReader; + #[cfg(feature = "metrics")] + use std::sync::atomic::Ordering::Relaxed; + if self.metadata.seqnos.0 >= seqno { return Ok(None); } @@ -163,14 +167,26 @@ impl Segment { if let Some(block) = &self.pinned_filter_block { let filter = StandardBloomFilterReader::new(&block.data)?; + #[cfg(feature = "metrics")] + self.metrics.bloom_filter_queries.fetch_add(1, Relaxed); + if !filter.contains_hash(key_hash) { + #[cfg(feature = "metrics")] + self.metrics.bloom_filter_hits.fetch_add(1, Relaxed); + return Ok(None); } } else if let Some(filter_block_handle) = &self.regions.filter { let block = self.load_block(filter_block_handle, CompressionType::None)?; let filter = StandardBloomFilterReader::new(&block.data)?; + #[cfg(feature = "metrics")] + self.metrics.bloom_filter_queries.fetch_add(1, Relaxed); + if !filter.contains_hash(key_hash) { + #[cfg(feature = "metrics")] + self.metrics.bloom_filter_hits.fetch_add(1, Relaxed); + return Ok(None); } } @@ -263,6 +279,9 @@ impl Segment { &self, range: R, ) -> impl DoubleEndedIterator> { + use crate::fallible_clipping_iter::FallibleClippingIter; + use block_index::iter::create_index_block_reader; + let BlockIndexImpl::Full(block_index) = &*self.block_index else { todo!(); }; @@ -276,7 +295,8 @@ impl Segment { self.descriptor_table.clone(), self.cache.clone(), self.metadata.data_block_compression, - ); + #[cfg(feature = "metrics")] + self.metrics.clone(), ); match range.start_bound() { @@ -303,7 +323,12 @@ impl Segment { cache: Arc, descriptor_table: Arc, pin_filter: bool, + #[cfg(feature = "metrics")] metrics: Arc, ) -> crate::Result { + use block_index::FullBlockIndex; + use meta::ParsedMeta; + use regions::ParsedRegions; + use std::sync::atomic::AtomicBool; use trailer::Trailer; log::debug!("Recovering segment from file {file_path:?}"); @@ -386,6 +411,9 @@ impl Segment { pinned_filter_block, is_deleted: AtomicBool::default(), + + #[cfg(feature = "metrics")] + metrics, })); Ok(segment) @@ -456,12 +484,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + #[cfg(feature = "metrics")] + metrics, )?; assert_eq!(5, segment.id()); @@ -546,12 +579,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + #[cfg(feature = "metrics")] + metrics, )?; assert_eq!(5, segment.id()); @@ -597,12 +635,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + #[cfg(feature = 
"metrics")] + metrics, )?; assert_eq!(5, segment.id()); @@ -647,12 +690,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + #[cfg(feature = "metrics")] + metrics, )?; assert_eq!(5, segment.id()); @@ -710,12 +758,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + #[cfg(feature = "metrics")] + metrics, )?; assert_eq!(5, segment.id()); @@ -773,12 +826,17 @@ mod tests { } { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let segment = Segment::recover( file, 0, Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), false, + #[cfg(feature = "metrics")] + metrics, )?; assert_eq!(5, segment.id()); diff --git a/src/segment/util.rs b/src/segment/util.rs index 79e29d71..42525733 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -2,6 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use super::{Block, BlockHandle, GlobalSegmentId}; use crate::{Cache, CompressionType, DescriptorTable}; use std::{path::Path, sync::Arc}; @@ -17,10 +20,17 @@ pub fn load_block( cache: &Cache, handle: &BlockHandle, compression: CompressionType, + #[cfg(feature = "metrics")] metrics: &Metrics, ) -> crate::Result { + #[cfg(feature = "metrics")] + use std::sync::atomic::Ordering::Relaxed; + log::trace!("load block {handle:?}"); if let Some(block) = cache.get_block(segment_id, handle.offset()) { + #[cfg(feature = "metrics")] + metrics.block_load_cached.fetch_add(1, Relaxed); + return Ok(block); } @@ -35,6 +45,9 @@ pub fn load_block( let block = Block::from_file(&fd, *handle, compression)?; + #[cfg(feature = "metrics")] + metrics.block_load_io.fetch_add(1, Relaxed); + // Cache FD if fd_cache_miss { descriptor_table.insert_for_table(segment_id, fd); diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index f2feb091..58448186 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -3,12 +3,8 @@ // (found in the LICENSE-* files in the repository) use super::Tree; -use crate::{ +use crate::{segment::multi_writer::MultiWriter, AbstractTree, UserKey, UserValue}; use std::path::PathBuf; - segment::{multi_writer::MultiWriter, Segment}, - AbstractTree, UserKey, UserValue, ValueType, -}; -use std::{path::PathBuf, sync::Arc}; pub struct Ingestion<'a> { folder: PathBuf, @@ -51,7 +47,8 @@ impl<'a> Ingestion<'a> { } pub fn finish(self) -> crate::Result<()> { - use crate::compaction::MoveDown; + use crate::{compaction::MoveDown, Segment}; + use std::sync::Arc; let results = self.writer.finish()?; @@ -60,51 +57,22 @@ impl<'a> Ingestion<'a> { let created_segments = results .into_iter() .map(|segment_id| -> crate::Result { + // TODO: look at tree configuration + + // TODO: segment recoverer struct w/ builder pattern + // Segment::recover() + // .pin_filters(true) + // .with_metrics(metrics) + // .run(path, tree_id, cache, descriptor_table); Segment::recover( self.folder.join(segment_id.to_string()), self.tree.id, self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), true, - ) // TODO: look at configuration - - // todo!() - - /* let segment_id = 
trailer.metadata.id; - let segment_file_path = self.folder.join(segment_id.to_string()); - - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (self.tree.id, segment_id).into(), - self.tree.config.descriptor_table.clone(), - self.tree.config.cache.clone(), - )?; - let block_index = BlockIndexImpl::TwoLevel(block_index); - let block_index = Arc::new(block_index); - - Ok(SegmentInner { - tree_id: self.tree.id, - - descriptor_table: self.tree.config.descriptor_table.clone(), - cache: self.tree.config.cache.clone(), - - metadata: trailer.metadata, - offsets: trailer.offsets, - - #[allow(clippy::needless_borrows_for_generic_args)] - block_index, - - bloom_filter: Segment::load_bloom( - &segment_file_path, - trailer.offsets.bloom_ptr, - )?, - - path: segment_file_path, - is_deleted: AtomicBool::default(), - } - .into()) */ + #[cfg(feature = "metrics")] + self.tree.metrics.clone(), + ) }) .collect::>>()?; diff --git a/src/tree/inner.rs b/src/tree/inner.rs index fe55d191..d23e0773 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -2,6 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use crate::{ config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal, SegmentId, @@ -76,6 +79,10 @@ pub struct TreeInner { pub(crate) stop_signal: StopSignal, pub(crate) major_compaction_lock: RwLock<()>, + + #[doc(hidden)] + #[cfg(feature = "metrics")] + pub metrics: Arc, } impl TreeInner { @@ -91,6 +98,8 @@ impl TreeInner { manifest: Arc::new(RwLock::new(manifest)), stop_signal: StopSignal::default(), major_compaction_lock: RwLock::default(), + #[cfg(feature = "metrics")] + metrics: Metrics::default().into(), }) } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 178622a0..9bda654e 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -5,6 +5,9 @@ pub mod ingest; pub mod inner; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + use crate::{ coding::{Decode, Encode}, compaction::CompactionStrategy, @@ -534,6 +537,8 @@ impl Tree { self.config.cache.clone(), self.config.descriptor_table.clone(), true, // TODO: look at configuration + #[cfg(feature = "metrics")] + self.metrics.clone(), )?; log::debug!("Flushed segment to {:?}", created_segment.path); @@ -816,11 +821,16 @@ impl Tree { let tree_id = get_next_tree_id(); + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + let levels = Self::recover_levels( &config.path, tree_id, &config.cache, &config.descriptor_table, + #[cfg(feature = "metrics")] + &metrics, )?; let highest_segment_id = levels.iter().map(Segment::id).max().unwrap_or_default(); @@ -834,6 +844,8 @@ impl Tree { stop_signal: StopSignal::default(), config, major_compaction_lock: RwLock::default(), + #[cfg(feature = "metrics")] + metrics, }; Ok(Self(Arc::new(inner))) @@ -881,6 +893,7 @@ impl Tree { tree_id: TreeId, cache: &Arc, descriptor_table: &Arc, + #[cfg(feature = "metrics")] metrics: &Arc, ) -> crate::Result { use crate::{file::fsync_directory, file::SEGMENTS_FOLDER, SegmentId}; @@ -939,6 +952,8 @@ impl Tree { cache.clone(), descriptor_table.clone(), true, // TODO: look at configuration + #[cfg(feature = "metrics")] + metrics.clone(), )?; log::debug!("Recovered segment from {:?}", segment.path); From b43673d6756fd377da1f82ab37ea5d15fb7d1449 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 19:44:50 
+0200 Subject: [PATCH 248/613] fix: segment range edge case --- src/segment/iter.rs | 190 +++++++++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 91 deletions(-) diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 6579f57c..cf8df88c 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -138,55 +138,59 @@ impl Iterator for Iter { } } - let Some(handle) = self.index_iter.next() else { - // NOTE: No more block handles from index, - // Now check hi buffer if it exists - if let Some(block) = &mut self.hi_data_block { - if let Some(item) = block.next().map(Ok) { - return Some(item); + loop { + let Some(handle) = self.index_iter.next() else { + // NOTE: No more block handles from index, + // Now check hi buffer if it exists + if let Some(block) = &mut self.hi_data_block { + if let Some(item) = block.next().map(Ok) { + return Some(item); + } } - } - // NOTE: If there is no more item, we are done - self.lo_data_block = None; - self.hi_data_block = None; - return None; - }; - - // NOTE: Load next lo block - #[allow(clippy::single_match_else)] - let block = match self.cache.get_block(self.segment_id, handle.offset()) { - Some(block) => block, - None => { - fail_iter!(load_block( - self.segment_id, - &self.path, - &self.descriptor_table, - &self.cache, - &BlockHandle::new(handle.offset(), handle.size()), - self.compression, - #[cfg(feature = "metrics")] - &self.metrics, - )) - } - }; - let block = DataBlock::new(block); + // NOTE: If there is no more item, we are done + self.lo_data_block = None; + self.hi_data_block = None; + return None; + }; + + // NOTE: Load next lo block + #[allow(clippy::single_match_else)] + let block = match self.cache.get_block(self.segment_id, handle.offset()) { + Some(block) => block, + None => { + fail_iter!(load_block( + self.segment_id, + &self.path, + &self.descriptor_table, + &self.cache, + &BlockHandle::new(handle.offset(), handle.size()), + self.compression, + #[cfg(feature = "metrics")] + &self.metrics, + )) + } + }; + let block = DataBlock::new(block); - let mut reader = create_data_block_reader(block); + let mut reader = create_data_block_reader(block); - // NOTE: This is the first block, seek in it - if self.lo_data_block.is_none() { - if let Some(key) = &self.range.0 { - reader.seek_lower(key, SeqNo::MAX); + // NOTE: This is the first block, seek in it + if self.lo_data_block.is_none() { + if let Some(key) = &self.range.0 { + reader.seek_lower(key, SeqNo::MAX); + } } - } - let item = reader.next(); + let item = reader.next(); - self.lo_offset = handle.offset(); - self.lo_data_block = Some(reader); + self.lo_offset = handle.offset(); + self.lo_data_block = Some(reader); - item.map(Ok) + if let Some(item) = item { + return Some(Ok(item)); + } + } } } @@ -204,63 +208,67 @@ impl DoubleEndedIterator for Iter { } } - let Some(handle) = self.index_iter.next_back() else { - // NOTE: No more block handles from index, - // Now check lo buffer if it exists - if let Some(block) = &mut self.lo_data_block { - // eprintln!("=== lo block ==="); - - // for item in block.borrow_owner().iter() { - // eprintln!( - // r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, - // item.key.user_key, item.value, item.key.seqno, item.key.value_type, - // ); - // } - - if let Some(item) = block.next_back().map(Ok) { - return Some(item); + loop { + let Some(handle) = self.index_iter.next_back() else { + // NOTE: No more block handles from index, + // Now check lo buffer if it exists + if let Some(block) = &mut self.lo_data_block { + // 
eprintln!("=== lo block ==="); + + // for item in block.borrow_owner().iter() { + // eprintln!( + // r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + // item.key.user_key, item.value, item.key.seqno, item.key.value_type, + // ); + // } + + if let Some(item) = block.next_back().map(Ok) { + return Some(item); + } } - } - // NOTE: If there is no more item, we are done - self.lo_data_block = None; - self.hi_data_block = None; - return None; - }; - - // NOTE: Load next hi block - #[allow(clippy::single_match_else)] - let block = match self.cache.get_block(self.segment_id, handle.offset()) { - Some(block) => block, - None => { - fail_iter!(load_block( - self.segment_id, - &self.path, - &self.descriptor_table, - &self.cache, - &BlockHandle::new(handle.offset(), handle.size()), - self.compression, - #[cfg(feature = "metrics")] - &self.metrics, - )) - } - }; - let block = DataBlock::new(block); + // NOTE: If there is no more item, we are done + self.lo_data_block = None; + self.hi_data_block = None; + return None; + }; + + // NOTE: Load next hi block + #[allow(clippy::single_match_else)] + let block = match self.cache.get_block(self.segment_id, handle.offset()) { + Some(block) => block, + None => { + fail_iter!(load_block( + self.segment_id, + &self.path, + &self.descriptor_table, + &self.cache, + &BlockHandle::new(handle.offset(), handle.size()), + self.compression, + #[cfg(feature = "metrics")] + &self.metrics, + )) + } + }; + let block = DataBlock::new(block); - let mut reader = create_data_block_reader(block); + let mut reader = create_data_block_reader(block); - // NOTE: This is the first block, seek in it - if self.hi_data_block.is_none() { - if let Some(key) = &self.range.1 { - reader.seek_upper(key, SeqNo::MAX); + // NOTE: This is the first block, seek in it + if self.hi_data_block.is_none() { + if let Some(key) = &self.range.1 { + reader.seek_upper(key, SeqNo::MAX); + } } - } - let item = reader.next_back(); + let item = reader.next_back(); - self.hi_offset = handle.offset(); - self.hi_data_block = Some(reader); + self.hi_offset = handle.offset(); + self.hi_data_block = Some(reader); - item.map(Ok) + if let Some(item) = item { + return Some(Ok(item)); + } + } } } From fdc10772db961b16a447203e986b00c3686ea4f2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 14 Jul 2025 20:25:20 +0200 Subject: [PATCH 249/613] add feature flags --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 6c033a1e..f6e8b50b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,8 @@ default = [] lz4 = ["dep:lz4_flex"] miniz = ["dep:miniz_oxide"] bytes = [] # TODO: restore +use_unsafe = [] +metrics = [] [dependencies] byteorder = "1.5.0" From 8642e31cad8d57730310e7473cf641a3afedc543 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 15 Jul 2025 17:40:54 +0200 Subject: [PATCH 250/613] add debug derive --- src/segment/meta.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 9bee9672..0def02a9 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -8,7 +8,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use std::{fs::File, ops::Deref}; /// Nanosecond timestamp. 
-#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] pub struct Timestamp(u128); impl Deref for Timestamp { @@ -31,6 +31,7 @@ impl From for Timestamp { } } +#[derive(Debug)] pub struct ParsedMeta { pub id: SegmentId, pub created_at: Timestamp, From 5f37a93a1ad68a898f8aae53892c8e96594facba Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 15 Jul 2025 17:42:52 +0200 Subject: [PATCH 251/613] fix block iter --- src/segment/block/decoder.rs | 15 ------------ src/segment/iter.rs | 45 +++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 46dfcfa9..cb18d086 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -283,21 +283,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa pred: impl Fn(&[u8]) -> bool, second_partition: bool, ) -> bool { - // Try hash index lookup - if let Some(hash_index) = self.get_hash_index_reader() { - match hash_index.get(needle) { - super::hash_index::Lookup::Found(idx) => { - let offset = self.get_binary_index_reader().get(idx.into()); - self.lo_scanner.offset = offset; - return true; - } - super::hash_index::Lookup::NotFound => return false, - super::hash_index::Lookup::Conflicted => { - // Fall back to binary search - } - } - } - // TODO: make this nicer, maybe predicate that can affect the resulting index...? let result = if second_partition { self.partition_point_2(pred) diff --git a/src/segment/iter.rs b/src/segment/iter.rs index cf8df88c..72f687bd 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -69,6 +69,8 @@ pub struct Iter { cache: Arc, compression: CompressionType, + index_initialized: bool, + lo_offset: BlockOffset, lo_data_block: Option, @@ -100,6 +102,8 @@ impl Iter { cache, compression, + index_initialized: false, + lo_offset: BlockOffset(0), lo_data_block: None, @@ -132,10 +136,14 @@ impl Iterator for Iter { } } - if self.lo_data_block.is_none() { + if !self.index_initialized { if let Some(key) = &self.range.0 { self.index_iter.seek_lower(key); } + if let Some(key) = &self.range.1 { + self.index_iter.seek_upper(key); + } + self.index_initialized = true; } loop { @@ -175,11 +183,11 @@ impl Iterator for Iter { let mut reader = create_data_block_reader(block); - // NOTE: This is the first block, seek in it - if self.lo_data_block.is_none() { - if let Some(key) = &self.range.0 { - reader.seek_lower(key, SeqNo::MAX); - } + if let Some(key) = &self.range.0 { + reader.seek_lower(key, SeqNo::MAX); + } + if let Some(key) = &self.range.1 { + reader.seek_upper(key, SeqNo::MAX); } let item = reader.next(); @@ -202,10 +210,14 @@ impl DoubleEndedIterator for Iter { } } - if self.hi_data_block.is_none() { + if !self.index_initialized { + if let Some(key) = &self.range.0 { + self.index_iter.seek_lower(key); + } if let Some(key) = &self.range.1 { self.index_iter.seek_upper(key); } + self.index_initialized = true; } loop { @@ -213,15 +225,6 @@ impl DoubleEndedIterator for Iter { // NOTE: No more block handles from index, // Now check lo buffer if it exists if let Some(block) = &mut self.lo_data_block { - // eprintln!("=== lo block ==="); - - // for item in block.borrow_owner().iter() { - // eprintln!( - // r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, - // item.key.user_key, item.value, item.key.seqno, item.key.value_type, - // ); - // } - if let Some(item) = block.next_back().map(Ok) { return Some(item); } @@ -254,11 
+257,11 @@ impl DoubleEndedIterator for Iter { let mut reader = create_data_block_reader(block); - // NOTE: This is the first block, seek in it - if self.hi_data_block.is_none() { - if let Some(key) = &self.range.1 { - reader.seek_upper(key, SeqNo::MAX); - } + if let Some(key) = &self.range.0 { + reader.seek_lower(key, SeqNo::MAX); + } + if let Some(key) = &self.range.1 { + reader.seek_upper(key, SeqNo::MAX); } let item = reader.next_back(); From 80b7d5215b2b54fe0a37a1b475340d012b7fd9c0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 15 Jul 2025 17:43:41 +0200 Subject: [PATCH 252/613] restore hash index --- src/segment/block/decoder.rs | 30 +----- src/segment/block/hash_index/reader.rs | 6 ++ src/segment/data_block/iter.rs | 25 +++++ src/segment/data_block/mod.rs | 134 ++++++++++++++++--------- 4 files changed, 125 insertions(+), 70 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index cb18d086..4a77a949 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -67,7 +67,7 @@ struct HiScanner { /// Generic block decoder for RocksDB-style blocks /// -/// Supports prefix truncation, binary search index (through restart intervals) and optionally hash indexes. +/// Supports prefix truncation and binary search index (through restart intervals). pub struct Decoder<'a, Item: Decodable, Parsed: ParsedItem> { block: &'a Block, phantom: PhantomData<(Item, Parsed)>, @@ -81,9 +81,6 @@ pub struct Decoder<'a, Item: Decodable, Parsed: ParsedItem> { binary_index_step_size: u8, binary_index_offset: u32, binary_index_len: u32, - - hash_index_offset: u32, - hash_index_len: u32, } impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Parsed> { @@ -106,9 +103,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa let binary_index_offset = unwrap!(reader.read_u32::()); let binary_index_len = unwrap!(reader.read_u32::()); - let hash_index_offset = unwrap!(reader.read_u32::()); - let hash_index_len = unwrap!(reader.read_u32::()); - Self { block, phantom: PhantomData, @@ -131,9 +125,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa binary_index_step_size, binary_index_offset, binary_index_len, - - hash_index_offset, - hash_index_len, } } @@ -163,21 +154,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa ) } - /// Returns the number of hash buckets. - #[must_use] - pub fn hash_bucket_count(&self) -> Option { - if self.hash_index_offset > 0 { - Some(self.hash_index_len) - } else { - None - } - } - - fn get_hash_index_reader(&self) -> Option { - self.hash_bucket_count() - .map(|offset| HashIndexReader::new(&self.block.data, self.hash_index_offset, offset)) - } - fn get_key_at(&self, pos: usize) -> &[u8] { let bytes = &self.block.data; @@ -274,6 +250,10 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa Some((offset, left)) } + pub fn set_lo_offset(&mut self, offset: usize) { + self.lo_scanner.offset = offset; + } + /// Seeks using the given predicate. /// /// Returns `false` if the key does not possible exist. diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs index 54157b76..cf4592b6 100644 --- a/src/segment/block/hash_index/reader.rs +++ b/src/segment/block/hash_index/reader.rs @@ -33,6 +33,12 @@ impl<'a> Reader<'a> { Self(&bytes[offset..end]) } + /// Returns the number of buckets. 
+ #[must_use] + pub fn bucket_count(&self) -> usize { + self.0.len() + } + // NOTE: Only used in metrics, so no need to be hyper-optimized #[allow(clippy::naive_bytecount)] /// Returns the amount of empty slots in the hash index. diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index d6d297da..c9d4685f 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -25,6 +25,31 @@ impl<'a> Iter<'a> { Self { bytes, decoder } } + pub fn seek_to_offset(&mut self, offset: usize, needle: &[u8]) -> bool { + self.decoder.inner_mut().set_lo_offset(offset); + + // Linear scan + loop { + let Some(item) = self.decoder.peek() else { + return false; + }; + + match item.compare_key(needle, self.bytes) { + std::cmp::Ordering::Equal => { + return true; + } + std::cmp::Ordering::Greater => { + return false; + } + std::cmp::Ordering::Less => { + // Continue + + self.decoder.next().expect("should exist"); + } + } + } + } + pub fn seek(&mut self, needle: &[u8]) -> bool { if !self .decoder diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 4d1b557b..1ca78ce8 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -7,7 +7,8 @@ mod iter; pub use iter::Iter; use super::block::{ - Block, Decodable, Decoder, Encodable, Encoder, ParsedItem, Trailer, TRAILER_START_MARKER, + binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader, Block, + Decodable, Decoder, Encodable, Encoder, ParsedItem, Trailer, TRAILER_START_MARKER, }; use crate::key::InternalKey; use crate::segment::util::{compare_prefixed_slice, SliceIndexes}; @@ -240,6 +241,7 @@ impl ParsedItem for DataBlockParsedItem { } else { bytes.slice(self.key.0..self.key.1) }; + let key = InternalKey::new( key, self.seqno, @@ -285,13 +287,97 @@ impl DataBlock { self.inner.size() } + fn get_binary_index_reader(&self) -> BinaryIndexReader { + let trailer = Trailer::new(&self.inner); + let mut reader = trailer.as_slice(); + + let _item_count = reader.read_u32::().expect("should read"); + + let _restart_interval = unwrap!(reader.read_u8()); + + let binary_index_step_size = unwrap!(reader.read_u8()); + + debug_assert!( + binary_index_step_size == 2 || binary_index_step_size == 4, + "invalid binary index step size", + ); + + let binary_index_offset = unwrap!(reader.read_u32::()); + let binary_index_len = unwrap!(reader.read_u32::()); + + BinaryIndexReader::new( + &self.inner.data, + binary_index_offset, + binary_index_len, + binary_index_step_size, + ) + } + + fn get_hash_index_reader(&self) -> Option { + let trailer = Trailer::new(&self.inner); + + // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) + // and binary stuff (2x u32) + let offset = std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::(); + + let mut reader = unwrap!(trailer.as_slice().get(offset..)); + + let hash_index_offset = unwrap!(reader.read_u32::()); + let hash_index_len = unwrap!(reader.read_u32::()); + + if hash_index_len > 0 { + Some(HashIndexReader::new( + &self.inner.data, + hash_index_offset, + hash_index_len, + )) + } else { + None + } + } + + /// Returns the number of hash buckets. 
+ #[must_use] + pub fn hash_bucket_count(&self) -> Option { + self.get_hash_index_reader() + .map(|reader| reader.bucket_count()) + } + // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno)) #[must_use] pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { let mut iter = self.iter(); - if !iter.seek(needle) { - return None; + if let Some(hash_index_reader) = self.get_hash_index_reader() { + use super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; + + match hash_index_reader.get(needle) { + Found(idx) => { + let offset: usize = self.get_binary_index_reader().get(usize::from(idx)); + + if !iter.seek_to_offset(offset, needle) { + return None; + } + } + NotFound => { + return None; + } + Conflicted => { + // NOTE: Fallback to binary search + if !iter.seek(needle) { + return None; + } + } + } + } else { + // NOTE: Fallback to binary search + if !iter.seek(needle) { + return None; + } } for item in iter { @@ -335,48 +421,6 @@ impl DataBlock { unwrap!(reader.read_u32::()) } - /// Returns the number of hash buckets. - #[must_use] - pub fn hash_bucket_count(&self) -> Option { - let trailer = Trailer::new(&self.inner); - - // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8), - // and binary index offset+len (2x u32) - let offset = std::mem::size_of::() - + (2 * std::mem::size_of::()) - + (2 * std::mem::size_of::()); - - let mut reader = unwrap!(trailer.as_slice().get(offset..)); - - let hash_index_offset = unwrap!(reader.read_u32::()); - let hash_index_len = unwrap!(reader.read_u32::()); - - if hash_index_offset > 0 { - Some(hash_index_len) - } else { - None - } - } - - /* fn get_hash_index_reader(&self) -> Option { - self.hash_bucket_count() - .map(|offset| HashIndexReader::new(&self.inner.data, self.hash_index_offset, offset)) - } */ - - /* /// Returns the amount of conflicts in the hash buckets. - #[must_use] - pub fn hash_bucket_conflict_count(&self) -> Option { - self.get_hash_index_reader() - .map(|reader| reader.conflict_count()) - } */ - - /* /// Returns the amount of empty hash buckets. - #[must_use] - pub fn hash_bucket_free_count(&self) -> Option { - self.get_hash_index_reader() - .map(|reader| reader.free_count()) - } */ - /// Returns the amount of items in the block. 
#[must_use] #[allow(clippy::len_without_is_empty)] From 3c58f6bb083e20367f297fac52fc3a2fe2bb70c6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 15 Jul 2025 17:44:10 +0200 Subject: [PATCH 253/613] add some test cases --- src/segment/block/mod.rs | 3 +- src/segment/data_block/iter.rs | 236 ++++++++++++++++++++++++++++++++ src/segment/index_block/iter.rs | 63 +++++++++ src/segment/mod.rs | 72 ++++++++++ 4 files changed, 373 insertions(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index ddb59fe4..2811db39 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -70,9 +70,10 @@ impl Block { writer.write_all(data)?; log::trace!( - "Writing block with size {}B (compressed: {}B)", + "Writing block with size {}B (compressed: {}B) (excluding header of {}B)", header.uncompressed_length, header.data_length, + Header::serialized_len(), ); Ok(header) diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index c9d4685f..7551dc0b 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -142,6 +142,242 @@ mod tests { }; use test_log::test; + #[test] + fn v3_data_block_wtf() -> crate::Result<()> { + let keys = [ + [0, 0, 0, 0, 0, 0, 0, 108], + [0, 0, 0, 0, 0, 0, 0, 109], + [0, 0, 0, 0, 0, 0, 0, 110], + [0, 0, 0, 0, 0, 0, 0, 111], + [0, 0, 0, 0, 0, 0, 0, 112], + [0, 0, 0, 0, 0, 0, 0, 113], + [0, 0, 0, 0, 0, 0, 0, 114], + [0, 0, 0, 0, 0, 0, 0, 115], + [0, 0, 0, 0, 0, 0, 0, 116], + [0, 0, 0, 0, 0, 0, 0, 117], + [0, 0, 0, 0, 0, 0, 0, 118], + [0, 0, 0, 0, 0, 0, 0, 119], + [0, 0, 0, 0, 0, 0, 0, 120], + [0, 0, 0, 0, 0, 0, 0, 121], + [0, 0, 0, 0, 0, 0, 0, 122], + [0, 0, 0, 0, 0, 0, 0, 123], + [0, 0, 0, 0, 0, 0, 0, 124], + [0, 0, 0, 0, 0, 0, 0, 125], + [0, 0, 0, 0, 0, 0, 0, 126], + [0, 0, 0, 0, 0, 0, 0, 127], + [0, 0, 0, 0, 0, 0, 0, 128], + [0, 0, 0, 0, 0, 0, 0, 129], + [0, 0, 0, 0, 0, 0, 0, 130], + [0, 0, 0, 0, 0, 0, 0, 131], + [0, 0, 0, 0, 0, 0, 0, 132], + [0, 0, 0, 0, 0, 0, 0, 133], + [0, 0, 0, 0, 0, 0, 0, 134], + [0, 0, 0, 0, 0, 0, 0, 135], + [0, 0, 0, 0, 0, 0, 0, 136], + [0, 0, 0, 0, 0, 0, 0, 137], + [0, 0, 0, 0, 0, 0, 0, 138], + [0, 0, 0, 0, 0, 0, 0, 139], + [0, 0, 0, 0, 0, 0, 0, 140], + [0, 0, 0, 0, 0, 0, 0, 141], + [0, 0, 0, 0, 0, 0, 0, 142], + [0, 0, 0, 0, 0, 0, 0, 143], + ]; + + let items = keys + .into_iter() + .map(|key| InternalValue::from_components(key, "", 0, Value)) + .collect::>(); + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + { + let mut iter = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&110u64.to_be_bytes()); + let iter = iter.map(|x| x.materialize(data_block.as_slice())); + + assert_eq!( + items.iter().take(3).cloned().collect::>(), + iter.collect::>(), + ); + } + + { + let mut iter: crate::segment::data_block::Iter<'_> = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&110u64.to_be_bytes()); + let iter = iter.map(|x| x.materialize(data_block.as_slice())); + + assert_eq!( + items.iter().take(3).rev().cloned().collect::>(), + iter.rev().collect::>(), + ); + } + + { + let mut iter = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&110u64.to_be_bytes()); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + let 
mut count = 0; + + for x in 0.. { + if x % 2 == 0 { + let Some(_) = iter.next() else { + break; + }; + + count += 1; + } else { + let Some(_) = iter.next_back() else { + break; + }; + + count += 1; + } + } + + assert_eq!(3, count); + } + } + + Ok(()) + } + + #[test] + fn v3_data_block_range() -> crate::Result<()> { + let items = (100u64..110) + .map(|i| InternalValue::from_components(i.to_be_bytes(), "", 0, Value)) + .collect::>(); + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + { + let mut iter = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&109u64.to_be_bytes()); + let iter = iter.map(|x| x.materialize(data_block.as_slice())); + + assert_eq!( + items.iter().take(10).cloned().collect::>(), + iter.collect::>(), + ); + } + + { + let mut iter: crate::segment::data_block::Iter<'_> = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&109u64.to_be_bytes()); + let iter = iter.map(|x| x.materialize(data_block.as_slice())); + + assert_eq!( + items.iter().take(10).rev().cloned().collect::>(), + iter.rev().collect::>(), + ); + } + + { + let mut iter = data_block.iter(); + iter.seek(&10u64.to_be_bytes()); + iter.seek_upper(&109u64.to_be_bytes()); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + let mut count = 0; + + for x in 0.. { + if x % 2 == 0 { + let Some(_) = iter.next() else { + break; + }; + + count += 1; + } else { + let Some(_) = iter.next_back() else { + break; + }; + + count += 1; + } + } + + assert_eq!(10, count); + } + } + + Ok(()) + } + + #[test] + fn v3_data_block_range_ping_pong() -> crate::Result<()> { + let items = (0u64..100) + .map(|i| InternalValue::from_components(i.to_be_bytes(), "", 0, Value)) + .collect::>(); + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + let mut iter = data_block.iter(); + iter.seek(&5u64.to_be_bytes()); + iter.seek_upper(&9u64.to_be_bytes()); + + let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); + let mut count = 0; + + for x in 0.. 
{ + if x % 2 == 0 { + let Some(_) = iter.next() else { + break; + }; + + count += 1; + } else { + let Some(_) = iter.next_back() else { + break; + }; + + count += 1; + } + } + + assert_eq!(5, count); + } + + Ok(()) + } + #[test] fn v3_data_block_iter_forward() -> crate::Result<()> { let items = [ diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index 9fcdcfd6..3720b444 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -398,4 +398,67 @@ mod tests { Ok(()) } + + #[test] + fn v3_index_block_iter_range_1() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"c".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"d".into(), BlockOffset(13_000), 5_000), + KeyedBlockHandle::new(b"e".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_items(&items)?; + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(index_block.len(), items.len()); + + { + let mut iter = index_block.iter(); + assert!(iter.seek(b"b"), "should seek"); + assert!(iter.seek_upper(b"c"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!( + items.iter().skip(1).take(3).cloned().collect::>(), + &*real_items, + ); + } + + { + let mut iter = index_block.iter(); + assert!(iter.seek(b"b"), "should seek"); + assert!(iter.seek_upper(b"c"), "should seek"); + + let real_items: Vec<_> = iter + .map(|item| item.materialize(&index_block.inner.data)) + .collect(); + + assert_eq!( + items + .iter() + .skip(1) + .take(3) + .rev() + .cloned() + .collect::>(), + &*real_items, + ); + } + + Ok(()) + } } diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 132ed27f..ec9b5a07 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -733,6 +733,78 @@ mod tests { Ok(()) } + #[test] + #[allow(clippy::unwrap_used)] + fn v3_segment_range_ping_pong() -> crate::Result<()> { + let dir = tempdir()?; + let file = dir.path().join("segment"); + + let items = (0u64..10) + .map(|i| { + InternalValue::from_components(i.to_be_bytes(), "", 0, crate::ValueType::Value) + }) + .collect::>(); + + { + let mut writer = crate::segment::Writer::new(file.clone(), 5)?; + + for item in items.iter().cloned() { + writer.write(item)?; + } + + let _trailer = writer.finish()?; + } + + { + #[cfg(feature = "metrics")] + let metrics = Arc::new(Metrics::default()); + + let segment = Segment::recover( + file, + 0, + Arc::new(Cache::with_capacity_bytes(1_000_000)), + Arc::new(DescriptorTable::new(10)), + true, + #[cfg(feature = "metrics")] + metrics, + )?; + + assert_eq!(5, segment.id()); + assert_eq!(10, segment.metadata.item_count); + assert_eq!(1, segment.metadata.data_block_count); + assert_eq!(1, segment.metadata.index_block_count); // 1 because we use a full index + assert!( + segment.regions.index.is_none(), + "should use full index, so only TLI exists", + ); + + let mut iter = segment + .range(UserKey::from(5u64.to_be_bytes())..UserKey::from(10u64.to_be_bytes())); + + let mut count = 0; + + for x in 0.. 
{ + if x % 2 == 0 { + let Some(_) = iter.next() else { + break; + }; + + count += 1; + } else { + let Some(_) = iter.next_back() else { + break; + }; + + count += 1; + } + } + + assert_eq!(5, count); + } + + Ok(()) + } + #[test] #[allow(clippy::unwrap_used)] fn v3_segment_range_multiple_data_blocks() -> crate::Result<()> { From ffd507897e6a911ea5705cc1667bc6fc00e38884 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 15 Jul 2025 17:47:37 +0200 Subject: [PATCH 254/613] fix: doctest --- src/segment/block/encoder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 033c1835..7aa6ba54 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -42,7 +42,7 @@ pub trait Encodable { /// /// A block with `restart_interval=4` /// -/// ``` +/// ```js /// _______________ /// __________|__________ | /// v v | | From 4dc0618d86d921a7dab299ff0335834476fe2c36 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 19 Jul 2025 19:11:28 +0200 Subject: [PATCH 255/613] feat: add per-block read checksum check --- src/segment/block/mod.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 2811db39..303092ab 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -123,6 +123,16 @@ impl Block { } }); + let checksum = Checksum::from_raw(xxh3_64(&data)); + if checksum != header.checksum { + log::error!( + "Checksum mismatch for block, got={}, expected={}", + *checksum, + *header.checksum, + ); + return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + } + Ok(Self { header, data }) } @@ -132,8 +142,6 @@ impl Block { handle: BlockHandle, compression: CompressionType, ) -> crate::Result { - // TODO: toggle with use_unsafe and add bench - #[cfg(feature = "use_unsafe")] let mut buf = Slice::with_size_unzeroed(handle.size() as usize); @@ -221,13 +229,21 @@ impl Block { } }; - // TODO: check checksum - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] { debug_assert_eq!(header.uncompressed_length, data.len() as u32); } + let checksum = Checksum::from_raw(xxh3_64(&data)); + if checksum != header.checksum { + log::error!( + "Checksum mismatch for block, got={}, expected={}", + *checksum, + *header.checksum, + ); + return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + } + Ok(Self { header, data }) } } From ded34569d86bae14c662d5bede6ca8976e60b88f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:32:23 +0200 Subject: [PATCH 256/613] wip --- src/tree/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 2d5a0dc3..2f43f71a 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -88,6 +88,8 @@ impl AbstractTree for Tree { Ok(()) } + // TODO: clear() with Nuke compaction strategy (write lock) + #[doc(hidden)] fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()> { let strategy = Arc::new(crate::compaction::major::Strategy::new(target_size)); @@ -945,13 +947,13 @@ impl Tree { crate::Error::Unrecoverable })?; - if let Some(&_level_idx) = segment_id_map.get(&segment_id) { + if let Some(&level_idx) = segment_id_map.get(&segment_id) { let segment = Segment::recover( segment_file_path, tree_id, cache.clone(), descriptor_table.clone(), - true, // TODO: look at configuration + level_idx <= 1, // TODO: look at configuration #[cfg(feature = "metrics")] 
metrics.clone(), )?; From 89e7b7157d195a77e6d68d243451ba5137b08773 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:32:51 +0200 Subject: [PATCH 257/613] allow block encoder to write into external buffer --- src/segment/block/encoder.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 7aa6ba54..ff62db2a 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -61,7 +61,7 @@ pub trait Encodable { pub struct Encoder<'a, Context: Default, Item: Encodable> { pub(crate) phantom: PhantomData<(Context, Item)>, - pub(crate) writer: Vec, + pub(crate) writer: &'a mut Vec, pub(crate) state: Context, @@ -82,6 +82,7 @@ pub struct Encoder<'a, Context: Default, Item: Encodable> { impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> { pub fn new( + writer: &'a mut Vec, item_count: usize, restart_interval: u8, // TODO: should be NonZero hash_index_ratio: f32, @@ -93,7 +94,7 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> Self { phantom: PhantomData, - writer: Vec::new(), + writer, state: Context::default(), @@ -130,7 +131,7 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> self.binary_index_builder.insert(self.writer.len() as u32); } - item.encode_full_into(&mut self.writer, &mut self.state)?; + item.encode_full_into(&mut *self.writer, &mut self.state)?; self.base_key = item.key(); } else { @@ -138,7 +139,7 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> #[allow(clippy::cast_possible_truncation)] let shared_prefix_len = longest_shared_prefix_length(self.base_key, item.key()); - item.encode_truncated_into(&mut self.writer, &mut self.state, shared_prefix_len)?; + item.encode_truncated_into(&mut *self.writer, &mut self.state, shared_prefix_len)?; } let restart_idx = self.restart_count - 1; @@ -154,7 +155,7 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> Ok(()) } - pub fn finish(self) -> crate::Result> { + pub fn finish(self) -> crate::Result<()> { Trailer::write(self) } } From 9328d25b62a153f0866e6250f65a85e7ca089bdb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:33:04 +0200 Subject: [PATCH 258/613] print block handle in checksum --- src/segment/block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 303092ab..99bea472 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -237,7 +237,7 @@ impl Block { let checksum = Checksum::from_raw(xxh3_64(&data)); if checksum != header.checksum { log::error!( - "Checksum mismatch for block, got={}, expected={}", + "Checksum mismatch for block {handle:?}, got={}, expected={}", *checksum, *header.checksum, ); From c0e4abb691945fd89cec4b4f2a02aa0a3c477414 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:35:43 +0200 Subject: [PATCH 259/613] data block encoder allow buffer reuse --- src/segment/data_block/mod.rs | 46 +++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 1ca78ce8..f01d6e25 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -428,11 +428,24 @@ impl DataBlock { Trailer::new(&self.inner).item_count() } - pub fn encode_items( + pub fn encode_into_vec( items: &[InternalValue], restart_interval: u8, hash_index_ratio: f32, ) -> 
crate::Result> { + let mut buf = vec![]; + + Self::encode_into(&mut buf, items, restart_interval, hash_index_ratio)?; + + Ok(buf) + } + + pub fn encode_into( + writer: &mut Vec, + items: &[InternalValue], + restart_interval: u8, + hash_index_ratio: f32, + ) -> crate::Result<()> { let first_key = &items .first() .expect("chunk should not be empty") @@ -440,6 +453,7 @@ impl DataBlock { .user_key; let mut serializer = Encoder::<'_, (), InternalValue>::new( + writer, items.len(), restart_interval, hash_index_ratio, @@ -486,7 +500,7 @@ mod tests { let ping_pong_code = [1, 0]; - let bytes: Vec = DataBlock::encode_items(&items, 1, 0.0)?; + let bytes: Vec = DataBlock::encode_into_vec(&items, 1, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -547,7 +561,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes: Vec = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes: Vec = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -587,7 +601,7 @@ mod tests { crate::ValueType::Value, )]; - let bytes = DataBlock::encode_items(&items, 16, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 16, 0.0)?; let serialized_len = bytes.len(); let data_block = DataBlock::new(Block { @@ -626,7 +640,7 @@ mod tests { )]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let serialized_len = bytes.len(); let data_block = DataBlock::new(Block { @@ -655,7 +669,7 @@ mod tests { InternalValue::from_components([0], b"", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -696,7 +710,7 @@ mod tests { InternalValue::from_components([0], [], 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 2, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -732,7 +746,7 @@ mod tests { InternalValue::from_components(b"d", b"d", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 1, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -767,7 +781,7 @@ mod tests { InternalValue::from_components(b"b", b"b", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -813,7 +827,7 @@ mod tests { ), ]; - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 2, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -858,7 +872,7 @@ mod tests { ), ]; - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 2, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -907,7 +921,7 @@ mod tests { ), ]; - let bytes = DataBlock::encode_items(&items, 2, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 2, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -956,7 +970,7 @@ mod tests { ), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 1, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -992,7 +1006,7 @@ mod tests { 
InternalValue::from_components(b"b", b"b", 65, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, 1, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1029,7 +1043,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 16, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, 16, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1072,7 +1086,7 @@ mod tests { InternalValue::from_components("pla:venus:name", "Venus", 0, Value), ]; - let bytes = DataBlock::encode_items(&items, 1, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, 1, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), From b51412c7f32a8fae70118f51a4e7431d7cb0e4d1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:35:56 +0200 Subject: [PATCH 260/613] index block encode allow buffer reuse --- src/segment/index_block/mod.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index c8f8b220..5d261cbc 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -84,13 +84,24 @@ impl IndexBlock { )) } - pub fn encode_items( + #[cfg(test)] + pub fn encode_into_vec(items: &[KeyedBlockHandle]) -> crate::Result> { + let mut buf = vec![]; + + Self::encode_into(&mut buf, items)?; + + Ok(buf) + } + + pub fn encode_into( + writer: &mut Vec, items: &[KeyedBlockHandle], // restart_interval: u8, // TODO: support prefix truncation + delta encoding - ) -> crate::Result> { + ) -> crate::Result<()> { let first_key = items.first().expect("chunk should not be empty").end_key(); let mut serializer = Encoder::<'_, BlockOffset, KeyedBlockHandle>::new( + writer, items.len(), 1, // TODO: hard coded for now 0.0, // NOTE: Index blocks do not support hash index From 9cefbb2b2916769bc4a714060c6596a9be8284c7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:37:46 +0200 Subject: [PATCH 261/613] rename --- src/segment/block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 99bea472..65590c47 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -43,7 +43,7 @@ impl Block { } /// Encodes a block into a writer. 
- pub fn to_writer( + pub fn write_into( mut writer: &mut W, data: &[u8], compression: CompressionType, From 955ddb53213dd111b4766e17c4abd3c232b61562 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 20 Jul 2025 14:38:17 +0200 Subject: [PATCH 262/613] reuse block encode buffer in segment construction --- src/segment/block/trailer.rs | 6 ++-- src/segment/data_block/iter.rs | 42 ++++++++++++------------ src/segment/index_block/iter.rs | 20 ++++++------ src/segment/regions.rs | 2 +- src/segment/writer/index.rs | 5 +-- src/segment/writer/mod.rs | 57 +++++++++++++++++++++++++-------- 6 files changed, 80 insertions(+), 52 deletions(-) diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 8707df42..1ac0cae5 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -59,9 +59,7 @@ impl<'a> Trailer<'a> { } } - pub fn write>( - mut encoder: Encoder<'_, S, T>, - ) -> crate::Result> { + pub fn write>(mut encoder: Encoder<'_, S, T>) -> crate::Result<()> { // IMPORTANT: Terminator marker encoder.writer.write_u8(TRAILER_START_MARKER)?; @@ -140,6 +138,6 @@ impl<'a> Trailer<'a> { "trailer size does not match", ); - Ok(encoder.writer) + Ok(()) } } diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 7551dc0b..df3266e9 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -189,7 +189,7 @@ mod tests { .collect::>(); for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -263,7 +263,7 @@ mod tests { .collect::>(); for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -337,7 +337,7 @@ mod tests { .collect::>(); for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -389,7 +389,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -424,7 +424,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -463,7 +463,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -503,7 +503,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -584,7 +584,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let 
data_block = DataBlock::new(Block { data: bytes.into(), @@ -625,7 +625,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -665,7 +665,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -768,7 +768,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -839,7 +839,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -876,7 +876,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -916,7 +916,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -956,7 +956,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -993,7 +993,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 1.33)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 1.33)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1027,7 +1027,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1089,7 +1089,7 @@ mod tests { ]; for restart_interval in 1..=16 { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1168,7 +1168,7 @@ mod tests { ]; for restart_interval in 1..=u8::MAX { - let bytes = DataBlock::encode_items(&items, restart_interval, 0.0)?; + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1267,7 +1267,7 @@ mod tests { ), ]; - let bytes = DataBlock::encode_items(&items, 5, 1.0)?; + let bytes = DataBlock::encode_into_vec(&items, 5, 1.0)?; let data_block = DataBlock::new(Block { data: bytes.into(), @@ -1323,7 +1323,7 @@ mod tests { InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), ]; - let bytes = DataBlock::encode_items(&items, 2, 1.0)?; + let bytes = DataBlock::encode_into_vec(&items, 2, 1.0)?; let data_block = 
DataBlock::new(Block { data: bytes.into(), diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index 3720b444..4cbca790 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -63,7 +63,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -99,7 +99,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -133,7 +133,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -170,7 +170,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -204,7 +204,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -238,7 +238,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -276,7 +276,7 @@ mod tests { KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -310,7 +310,7 @@ mod tests { KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -360,7 +360,7 @@ mod tests { KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -409,7 +409,7 @@ mod tests { KeyedBlockHandle::new(b"e".into(), BlockOffset(13_000), 5_000), ]; - let bytes = IndexBlock::encode_items(&items)?; + let bytes = IndexBlock::encode_into_vec(&items)?; let index_block = IndexBlock::new(Block { data: bytes.into(), diff --git a/src/segment/regions.rs b/src/segment/regions.rs index 787232eb..e6a4943c 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -106,6 +106,6 @@ impl ParsedRegions { } // TODO: no binary index - DataBlock::encode_items(&items, 1, 0.0) + DataBlock::encode_into_vec(&items, 1, 0.0) } } diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index 150d180f..7881031c 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -70,9 +70,10 @@ impl BlockIndexWriter for FullIndexWriter ) -> crate::Result<(BlockHandle, Option)> { let tli_ptr = BlockOffset(block_file_writer.stream_position()?); - let bytes = 
IndexBlock::encode_items(&self.block_handles)?; + let mut bytes = vec![]; + IndexBlock::encode_into(&mut bytes, &self.block_handles)?; - let header = Block::to_writer(block_file_writer, &bytes, self.compression)?; + let header = Block::write_into(block_file_writer, &bytes, self.compression)?; // NOTE: We know that blocks never even approach u32 size #[allow(clippy::cast_possible_truncation)] diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 3561859a..76daa199 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -32,6 +32,9 @@ pub struct Writer { /// Compression to use compression: CompressionType, + /// Buffer to serialize blocks into + block_buffer: Vec, + /// Writer of data blocks #[allow(clippy::struct_field_names)] block_writer: BufWriter, @@ -55,7 +58,7 @@ pub struct Writer { /// Hashes for bloom filter /// /// using enhanced double hashing, so we got two u64s - bloom_hash_buffer: Vec<(u64, u64)>, + pub bloom_hash_buffer: Vec<(u64, u64)>, } impl Writer { @@ -77,6 +80,7 @@ impl Writer { index_writer: Box::new(FullIndexWriter::new()), + block_buffer: Vec::new(), block_writer, chunk: Vec::new(), @@ -93,7 +97,7 @@ impl Writer { } #[must_use] - pub(crate) fn use_data_block_size(mut self, size: u32) -> Self { + pub fn use_data_block_size(mut self, size: u32) -> Self { assert!( size <= 4 * 1_024 * 1_024, "data block size must be <= 4 MiB", @@ -103,14 +107,14 @@ impl Writer { } #[must_use] - pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { + pub fn use_compression(mut self, compression: CompressionType) -> Self { self.compression = compression; self.index_writer.use_compression(compression); self } #[must_use] - pub(crate) fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { + pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { self.bloom_policy = bloom_policy; self } @@ -171,10 +175,25 @@ impl Writer { return Ok(()); }; - let bytes = DataBlock::encode_items(&self.chunk, 16, 1.33)?; + self.block_buffer.clear(); + + DataBlock::encode_into( + &mut self.block_buffer, + &self.chunk, + 16, // TODO: config + 1.33, // TODO: config + )?; + + // log::warn!("encoding {:?}", self.chunk); + // log::warn!( + // "encoded 0x{:#X?} -> {:?}", + // self.meta.file_pos, + // self.block_buffer + // ); // TODO: prev block offset - let header = Block::to_writer(&mut self.block_writer, &bytes, self.compression)?; + let header = + Block::write_into(&mut self.block_writer, &self.block_buffer, self.compression)?; self.meta.uncompressed_size += u64::from(header.uncompressed_length); @@ -253,7 +272,7 @@ impl Writer { let filter_bytes = { let mut builder = self.bloom_policy.init(n); - for hash in std::mem::take(&mut self.bloom_hash_buffer) { + for hash in self.bloom_hash_buffer { builder.set_with_hash(hash); } @@ -266,10 +285,13 @@ impl Writer { start.elapsed(), ); - let block = - Block::to_writer(&mut self.block_writer, &filter_bytes, CompressionType::None)?; + let header = Block::write_into( + &mut self.block_writer, + &filter_bytes, + CompressionType::None, + )?; - let bytes_written = (BlockHeader::serialized_len() as u32) + block.data_length; + let bytes_written = (BlockHeader::serialized_len() as u32) + header.data_length; Some(BlockHandle::new(BlockOffset(filter_ptr), bytes_written)) } @@ -348,13 +370,20 @@ impl Writer { log::trace!("Encoding metadata block: {meta_items:#?}"); + self.block_buffer.clear(); + // TODO: no binary index - let bytes = DataBlock::encode_items(&meta_items, 1, 
0.0)?; - let header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + DataBlock::encode_into(&mut self.block_buffer, &meta_items, 1, 0.0)?; + + let header = Block::write_into( + &mut self.block_writer, + &self.block_buffer, + CompressionType::None, + )?; let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - BlockHandle::new(metadata_start, bytes_written as u32) + BlockHandle::new(metadata_start, bytes_written) }; // Write regions block @@ -371,7 +400,7 @@ impl Writer { log::trace!("Encoding regions: {regions:#?}"); let bytes = regions.encode_into_vec()?; - let header = Block::to_writer(&mut self.block_writer, &bytes, CompressionType::None)?; + let header = Block::write_into(&mut self.block_writer, &bytes, CompressionType::None)?; let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; From 8d18063b759fd6c2f306038a45547c47e4e614ce Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 22 Jul 2025 16:17:24 +0200 Subject: [PATCH 263/613] version iter double ended --- src/version/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version/mod.rs b/src/version/mod.rs index 34c4e20b..2e1f92b7 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -49,7 +49,7 @@ impl GenericLevel { self.runs.is_empty() } - pub fn iter(&self) -> impl Iterator>> { + pub fn iter(&self) -> impl DoubleEndedIterator>> { self.runs.iter() } From f2c9a52dac16bcb17653f9e462edb9af724972a7 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Wed, 23 Jul 2025 02:41:50 +0200 Subject: [PATCH 264/613] Update mod.rs --- src/segment/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index ec9b5a07..acfb43c9 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -392,8 +392,14 @@ impl Segment { None }; + // TODO: Maybe only in L0/L1 + // For larger levels, this will + // cache possibly many FDs + // causing kick-out of other + // FDs in the cache + // // NOTE: We already have a file descriptor open, so let's just cache it immediately - descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); + // descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); let segment = Self(Arc::new(Inner { path: Arc::new(file_path), From b706f55952febf22ed030961d4035171a81a684a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 23 Jul 2025 21:32:45 +0200 Subject: [PATCH 265/613] wip --- src/level_manifest/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index efca42b3..e5b576ce 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -51,9 +51,7 @@ impl std::fmt::Display for LevelManifest { for run in level.iter() { write!(f, " ")?; - if run.is_empty() { - writeln!(f, "")?; - } else if run.len() >= 30 { + if run.len() >= 30 { #[allow(clippy::indexing_slicing)] for segment in run.iter().take(2) { let id = segment.id(); From 23d3964a9b184aca2d2e8fd5e558964ce88a30a1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 23 Jul 2025 21:37:27 +0200 Subject: [PATCH 266/613] 128-bit checksum, block type in block header --- src/merge.rs | 58 +++++++++++++++++++++++++++++++++ src/segment/block/checksum.rs | 8 ++--- src/segment/block/header.rs | 56 ++++++++++++++++++++++++++++--- src/segment/block/mod.rs | 44 +++++++++++++++++++++---- src/segment/data_block/iter.rs | 23 ++++++++++++- 
 src/segment/data_block/mod.rs   | 17 +++++++++-
 src/segment/index_block/iter.rs | 12 ++++++-
 src/segment/iter.rs             |  2 ++
 src/segment/meta.rs             |  7 +++-
 src/segment/mod.rs              | 24 +++++++++++---
 src/segment/regions.rs          |  7 +++-
 src/segment/scanner.rs          |  3 +-
 src/segment/util.rs             |  5 +--
 src/segment/writer/index.rs     |  7 +++-
 src/segment/writer/mod.rs       | 17 ++++++++--
 src/version/run.rs              |  2 +-
 16 files changed, 259 insertions(+), 33 deletions(-)

diff --git a/src/merge.rs b/src/merge.rs
index a42bdac2..ca4bf122 100644
--- a/src/merge.rs
+++ b/src/merge.rs
@@ -117,3 +117,61 @@ impl> DoubleEndedIterator for Merger
         Some(Ok(max_item.1))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ValueType::Value;
+    use test_log::test;
+
+    #[test]
+    #[allow(clippy::unwrap_used)]
+    fn merge_simple() -> crate::Result<()> {
+        #[rustfmt::skip]
+        let a = vec![
+            Ok(InternalValue::from_components("a", b"", 0, Value)),
+        ];
+        #[rustfmt::skip]
+        let b = vec![
+            Ok(InternalValue::from_components("b", b"", 0, Value)),
+        ];
+
+        let mut iter = Merger::new(vec![a.into_iter(), b.into_iter()]);
+
+        assert_eq!(
+            iter.next().unwrap()?,
+            InternalValue::from_components("a", b"", 0, Value),
+        );
+        assert_eq!(
+            iter.next().unwrap()?,
+            InternalValue::from_components("b", b"", 0, Value),
+        );
+        assert!(iter.next().is_none(), "iter should be closed");
+
+        Ok(())
+    }
+
+    #[test]
+    #[ignore]
+    #[allow(clippy::unwrap_used)]
+    fn merge_dup() -> crate::Result<()> {
+        #[rustfmt::skip]
+        let a = vec![
+            Ok(InternalValue::from_components("a", b"", 0, Value)),
+        ];
+        #[rustfmt::skip]
+        let b = vec![
+            Ok(InternalValue::from_components("a", b"", 0, Value)),
+        ];
+
+        let mut iter = Merger::new(vec![a.into_iter(), b.into_iter()]);
+
+        assert_eq!(
+            iter.next().unwrap()?,
+            InternalValue::from_components("a", b"", 0, Value),
+        );
+        assert!(iter.next().is_none(), "iter should be closed");
+
+        Ok(())
+    }
+}
diff --git a/src/segment/block/checksum.rs b/src/segment/block/checksum.rs
index 882de155..76ba835e 100644
--- a/src/segment/block/checksum.rs
+++ b/src/segment/block/checksum.rs
@@ -2,12 +2,12 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
 
-/// An 64-bit checksum
+/// A 128-bit checksum
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub struct Checksum(u64);
+pub struct Checksum(u128);
 
 impl std::ops::Deref for Checksum {
-    type Target = u64;
+    type Target = u128;
 
     fn deref(&self) -> &Self::Target {
         &self.0
@@ -16,7 +16,7 @@ impl Checksum {
     #[must_use]
-    pub fn from_raw(value: u64) -> Self {
+    pub fn from_raw(value: u128) -> Self {
         Self(value)
     }
 }
diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs
index b359a5b3..a13813b0 100644
--- a/src/segment/block/header.rs
+++ b/src/segment/block/header.rs
@@ -10,14 +10,52 @@ use byteorder::LittleEndian;
 use byteorder::{ReadBytesExt, WriteBytesExt};
 use std::io::{Read, Write};
 
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum BlockType {
+    Data,
+    Index,
+    Filter,
+    Meta,
+    Regions,
+}
+
+impl From<BlockType> for u8 {
+    fn from(val: BlockType) -> Self {
+        match val {
+            BlockType::Data => 0,
+            BlockType::Index => 1,
+            BlockType::Filter => 2,
+            BlockType::Meta => 3,
+            BlockType::Regions => 4,
+        }
+    }
+}
+
+impl TryFrom<u8> for BlockType {
+    type Error = DecodeError;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        match value {
+            0 => Ok(Self::Data),
+            1 => Ok(Self::Index),
+            2 => Ok(Self::Filter),
+            3 => Ok(Self::Meta),
+            4 => Ok(Self::Regions),
+            _ =>
Err(DecodeError::InvalidTag(("BlockType", value))), + } + } +} + /// Header of a disk-based block #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct Header { + pub block_type: BlockType, + /// Checksum value to verify integrity of data pub checksum: Checksum, /// File offset of previous block - only used for data blocks - pub previous_block_offset: BlockOffset, + pub previous_block_offset: BlockOffset, // TODO: remove? /// On-disk size of data segment pub data_length: u32, @@ -29,7 +67,9 @@ pub struct Header { impl Header { #[must_use] pub const fn serialized_len() -> usize { - MAGIC_BYTES.len() + MAGIC_BYTES.len() + // Block type + + std::mem::size_of::() // Checksum + std::mem::size_of::() // Backlink @@ -46,8 +86,11 @@ impl Encode for Header { // Write header writer.write_all(&MAGIC_BYTES)?; + // Write block type + writer.write_u8(self.block_type .into())?; + // Write checksum - writer.write_u64::(*self.checksum)?; + writer.write_u128::(*self.checksum)?; // Write prev offset writer.write_u64::(*self.previous_block_offset)?; @@ -72,8 +115,12 @@ impl Decode for Header { return Err(DecodeError::InvalidHeader("Block")); } + // Read block type + let block_type = reader.read_u8()?; + let block_type = BlockType::try_from(block_type)?; + // Read checksum - let checksum = reader.read_u64::()?; + let checksum = reader.read_u128::()?; // Read prev offset let previous_block_offset = reader.read_u64::()?; @@ -85,6 +132,7 @@ impl Decode for Header { let uncompressed_length = reader.read_u32::()?; Ok(Self { + block_type, checksum: Checksum::from_raw(checksum), previous_block_offset: BlockOffset(previous_block_offset), data_length, diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 65590c47..a7e296e7 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -14,17 +14,17 @@ mod trailer; pub use checksum::Checksum; pub(crate) use decoder::{Decodable, Decoder, ParsedItem}; pub(crate) use encoder::{Encodable, Encoder}; -pub use header::Header; +pub use header::{BlockType, Header}; pub use offset::BlockOffset; pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; use crate::{ coding::{Decode, Encode}, - segment::BlockHandle, + segment::{BlockHandle, DataBlock}, CompressionType, Slice, }; use std::fs::File; -use xxhash_rust::xxh3::xxh3_64; +use xxhash_rust::xxh3::{xxh3_128, xxh3_64}; /// A block on disk /// @@ -46,10 +46,12 @@ impl Block { pub fn write_into( mut writer: &mut W, data: &[u8], + block_type: BlockType, compression: CompressionType, ) -> crate::Result
{ let mut header = Header { - checksum: Checksum::from_raw(xxh3_64(data)), + block_type, + checksum: Checksum::from_raw(xxh3_128(data)), data_length: 0, // <-- NOTE: Is set later on uncompressed_length: data.len() as u32, previous_block_offset: BlockOffset(0), // <-- TODO: @@ -82,6 +84,7 @@ impl Block { /// Reads a block from a reader. pub fn from_reader( reader: &mut R, + block_type: BlockType, compression: CompressionType, ) -> crate::Result { let header = Header::decode_from(reader)?; @@ -123,7 +126,20 @@ impl Block { } }); - let checksum = Checksum::from_raw(xxh3_64(&data)); + if header.block_type != block_type { + log::error!( + "Block type mismatch, got={:?}, expected={:?}", + header.block_type, + block_type, + ); + + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + header.block_type.into(), + )))); + } + + let checksum = Checksum::from_raw(xxh3_128(&data)); if checksum != header.checksum { log::error!( "Checksum mismatch for block, got={}, expected={}", @@ -136,10 +152,11 @@ impl Block { Ok(Self { header, data }) } - /// Reads a block from a file without needing to seek the file. + /// Reads a block from a file. pub fn from_file( file: &File, handle: BlockHandle, + block_type: BlockType, compression: CompressionType, ) -> crate::Result { #[cfg(feature = "use_unsafe")] @@ -234,7 +251,20 @@ impl Block { debug_assert_eq!(header.uncompressed_length, data.len() as u32); } - let checksum = Checksum::from_raw(xxh3_64(&data)); + if header.block_type != block_type { + log::error!( + "Block type mismatch, got={:?}, expected={:?}", + header.block_type, + block_type, + ); + + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + header.block_type.into(), + )))); + } + + let checksum = Checksum::from_raw(xxh3_128(&data)); if checksum != header.checksum { log::error!( "Checksum mismatch for block {handle:?}, got={}, expected={}", diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index df3266e9..13fb87c2 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -134,7 +134,7 @@ impl DoubleEndedIterator for Iter<'_> { mod tests { use crate::{ segment::{ - block::{Header, ParsedItem}, + block::{BlockType, Header, ParsedItem}, Block, BlockOffset, DataBlock, }, Checksum, InternalValue, Slice, @@ -194,6 +194,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -268,6 +269,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -342,6 +344,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -394,6 +397,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -429,6 +433,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -468,6 +473,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: 
Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -508,6 +514,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -589,6 +596,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -630,6 +638,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -670,6 +679,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -773,6 +783,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -844,6 +855,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -881,6 +893,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -921,6 +934,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -961,6 +975,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -998,6 +1013,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1032,6 +1048,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1094,6 +1111,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1173,6 +1191,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1272,6 +1291,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1328,6 +1348,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index f01d6e25..433904cb 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -472,7 +472,7 @@ impl DataBlock { mod tests { use crate::{ segment::{ - block::{Header, ParsedItem}, + 
block::{BlockType, Header, ParsedItem}, Block, BlockOffset, DataBlock, }, Checksum, InternalValue, SeqNo, Slice, @@ -505,6 +505,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -566,6 +567,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -607,6 +609,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -646,6 +649,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -674,6 +678,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -715,6 +720,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -751,6 +757,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -786,6 +793,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -832,6 +840,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -877,6 +886,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -926,6 +936,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -975,6 +986,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1011,6 +1023,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1048,6 +1061,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -1091,6 +1105,7 @@ mod tests { let data_block = DataBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Data, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index 4cbca790..7a973aad 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -48,7 +48,7 @@ impl DoubleEndedIterator for 
Iter<'_> { mod tests { use crate::{ segment::{ - block::{Header, ParsedItem}, + block::{BlockType, Header, ParsedItem}, Block, BlockOffset, IndexBlock, KeyedBlockHandle, }, Checksum, @@ -68,6 +68,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -104,6 +105,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -138,6 +140,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -175,6 +178,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -209,6 +213,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -243,6 +248,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -281,6 +287,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -315,6 +322,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -365,6 +373,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, @@ -414,6 +423,7 @@ mod tests { let index_block = IndexBlock::new(Block { data: bytes.into(), header: Header { + block_type: BlockType::Index, checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 72f687bd..8e7b2c75 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -173,6 +173,7 @@ impl Iterator for Iter { &self.descriptor_table, &self.cache, &BlockHandle::new(handle.offset(), handle.size()), + crate::segment::block::BlockType::Data, self.compression, #[cfg(feature = "metrics")] &self.metrics, @@ -247,6 +248,7 @@ impl DoubleEndedIterator for Iter { &self.descriptor_table, &self.cache, &BlockHandle::new(handle.offset(), handle.size()), + crate::segment::block::BlockType::Data, self.compression, #[cfg(feature = "metrics")] &self.metrics, diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 0def02a9..c2ef1109 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -49,7 +49,12 @@ pub struct ParsedMeta { impl ParsedMeta { #[allow(clippy::expect_used, clippy::too_many_lines)] pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file(file, *handle, CompressionType::None)?; + let block = Block::from_file( + file, + *handle, + crate::segment::block::BlockType::Meta, + CompressionType::None, + )?; let block = DataBlock::new(block); assert_eq!( diff --git 
a/src/segment/mod.rs b/src/segment/mod.rs index ec9b5a07..3db38e75 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -29,8 +29,10 @@ pub use writer::Writer; use crate::metrics::Metrics; use crate::{ - cache::Cache, descriptor_table::DescriptorTable, CompressionType, InternalValue, SeqNo, TreeId, - UserKey, + cache::Cache, + descriptor_table::DescriptorTable, + segment::block::{BlockType, ParsedItem}, + CompressionType, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; use filter::standard_bloom::CompositeHash; @@ -131,6 +133,7 @@ impl Segment { fn load_block( &self, handle: &BlockHandle, + block_type: BlockType, compression: CompressionType, ) -> crate::Result { load_block( @@ -139,6 +142,7 @@ impl Segment { &self.descriptor_table, &self.cache, handle, + block_type, compression, #[cfg(feature = "metrics")] &self.metrics, @@ -146,8 +150,12 @@ impl Segment { } fn load_data_block(&self, handle: &BlockHandle) -> crate::Result { - self.load_block(handle, self.metadata.data_block_compression) - .map(DataBlock::new) + self.load_block( + handle, + BlockType::Data, + self.metadata.data_block_compression, + ) + .map(DataBlock::new) } pub fn get( @@ -177,7 +185,11 @@ impl Segment { return Ok(None); } } else if let Some(filter_block_handle) = &self.regions.filter { - let block = self.load_block(filter_block_handle, CompressionType::None)?; + let block = self.load_block( + filter_block_handle, + BlockType::Filter, + CompressionType::None, + )?; let filter = StandardBloomFilterReader::new(&block.data)?; #[cfg(feature = "metrics")] @@ -352,6 +364,7 @@ impl Segment { let block = Block::from_file( &file, regions.tli, + crate::segment::block::BlockType::Index, metadata.data_block_compression, // TODO: index blocks may get their own compression level )?; @@ -384,6 +397,7 @@ impl Segment { Block::from_file( &file, filter_handle, + crate::segment::block::BlockType::Filter, crate::CompressionType::None, // NOTE: We never write a filter block with compression ) }) diff --git a/src/segment/regions.rs b/src/segment/regions.rs index e6a4943c..91c4938d 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -21,7 +21,12 @@ pub struct ParsedRegions { impl ParsedRegions { pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file(file, *handle, CompressionType::None)?; + let block = Block::from_file( + file, + *handle, + crate::segment::block::BlockType::Regions, + CompressionType::None, + )?; let block = DataBlock::new(block); let tli = { diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs index 68c2480b..01df9b7f 100644 --- a/src/segment/scanner.rs +++ b/src/segment/scanner.rs @@ -42,7 +42,8 @@ impl Scanner { reader: &mut BufReader, compression: CompressionType, ) -> crate::Result { - Block::from_reader(reader, compression).map(DataBlock::new) + Block::from_reader(reader, crate::segment::block::BlockType::Data, compression) + .map(DataBlock::new) } } diff --git a/src/segment/util.rs b/src/segment/util.rs index 42525733..4d732051 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -6,7 +6,7 @@ use crate::metrics::Metrics; use super::{Block, BlockHandle, GlobalSegmentId}; -use crate::{Cache, CompressionType, DescriptorTable}; +use crate::{segment::block::BlockType, Cache, CompressionType, DescriptorTable}; use std::{path::Path, sync::Arc}; /// [start, end] slice indexes @@ -19,6 +19,7 @@ pub fn load_block( descriptor_table: &DescriptorTable, cache: &Cache, handle: &BlockHandle, + block_type: BlockType, 
compression: CompressionType, #[cfg(feature = "metrics")] metrics: &Metrics, ) -> crate::Result { @@ -43,7 +44,7 @@ pub fn load_block( Arc::new(std::fs::File::open(path)?) }; - let block = Block::from_file(&fd, *handle, compression)?; + let block = Block::from_file(&fd, *handle, block_type, compression)?; #[cfg(feature = "metrics")] metrics.block_load_io.fetch_add(1, Relaxed); diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index 7881031c..10757218 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -73,7 +73,12 @@ impl BlockIndexWriter for FullIndexWriter let mut bytes = vec![]; IndexBlock::encode_into(&mut bytes, &self.block_handles)?; - let header = Block::write_into(block_file_writer, &bytes, self.compression)?; + let header = Block::write_into( + block_file_writer, + &bytes, + crate::segment::block::BlockType::Index, + self.compression, + )?; // NOTE: We know that blocks never even approach u32 size #[allow(clippy::cast_possible_truncation)] diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 76daa199..93187d69 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -192,8 +192,12 @@ impl Writer { // ); // TODO: prev block offset - let header = - Block::write_into(&mut self.block_writer, &self.block_buffer, self.compression)?; + let header = Block::write_into( + &mut self.block_writer, + &self.block_buffer, + super::block::BlockType::Data, + self.compression, + )?; self.meta.uncompressed_size += u64::from(header.uncompressed_length); @@ -288,6 +292,7 @@ impl Writer { let header = Block::write_into( &mut self.block_writer, &filter_bytes, + crate::segment::block::BlockType::Filter, CompressionType::None, )?; @@ -378,6 +383,7 @@ impl Writer { let header = Block::write_into( &mut self.block_writer, &self.block_buffer, + crate::segment::block::BlockType::Meta, CompressionType::None, )?; @@ -400,7 +406,12 @@ impl Writer { log::trace!("Encoding regions: {regions:#?}"); let bytes = regions.encode_into_vec()?; - let header = Block::write_into(&mut self.block_writer, &bytes, CompressionType::None)?; + let header = Block::write_into( + &mut self.block_writer, + &bytes, + crate::segment::block::BlockType::Regions, + CompressionType::None, + )?; let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; diff --git a/src/version/run.rs b/src/version/run.rs index 6a39249b..ff8f6c73 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -45,7 +45,7 @@ impl std::ops::Deref for Indexed { } /// A disjoint run of disk segments -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub struct Run(Vec); impl std::ops::Deref for Run { From 807d88538ca3bf0694cb9cc8e8524e293f6e754d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 23 Jul 2025 21:43:18 +0200 Subject: [PATCH 267/613] fmt --- src/segment/block/header.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index a13813b0..6db1042e 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -67,7 +67,7 @@ pub struct Header { impl Header { #[must_use] pub const fn serialized_len() -> usize { - MAGIC_BYTES.len() + MAGIC_BYTES.len() // Block type + std::mem::size_of::() // Checksum @@ -87,7 +87,7 @@ impl Encode for Header { writer.write_all(&MAGIC_BYTES)?; // Write block type - writer.write_u8(self.block_type .into())?; + writer.write_u8(self.block_type.into())?; // Write checksum writer.write_u128::(*self.checksum)?; From 
f9f6364de0c40907db33be8a0f452edf2bc97965 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 27 Jul 2025 03:37:20 +0200
Subject: [PATCH 268/613] license

---
 src/double_ended_peekable.rs             | 4 ++++
 src/segment/filter/bit_array/mod.rs      | 4 ++++
 src/segment/filter/standard_bloom/mod.rs | 4 ++++
 src/segment/index_block/iter.rs          | 4 ++++
 4 files changed, 16 insertions(+)

diff --git a/src/double_ended_peekable.rs b/src/double_ended_peekable.rs
index 9269954a..3532de5d 100644
--- a/src/double_ended_peekable.rs
+++ b/src/double_ended_peekable.rs
@@ -1,3 +1,7 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
 //! A fork of
 //! to allow accessing the inner type
 //!
diff --git a/src/segment/filter/bit_array/mod.rs b/src/segment/filter/bit_array/mod.rs
index 2078a6f9..061a2cd3 100644
--- a/src/segment/filter/bit_array/mod.rs
+++ b/src/segment/filter/bit_array/mod.rs
@@ -1,3 +1,7 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
 mod builder;
 mod reader;
 
diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs
index b430dd5f..780ce8e5 100644
--- a/src/segment/filter/standard_bloom/mod.rs
+++ b/src/segment/filter/standard_bloom/mod.rs
@@ -1,3 +1,7 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
 mod builder;
 
 pub use builder::{Builder, CompositeHash};
diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs
index 7a973aad..d412945a 100644
--- a/src/segment/index_block/iter.rs
+++ b/src/segment/index_block/iter.rs
@@ -1,3 +1,7 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
 use crate::{
     double_ended_peekable::{DoubleEndedPeekable, DoubleEndedPeekableExt},
     segment::{block::Decoder, index_block::IndexBlockParsedItem, KeyedBlockHandle},

From 39706227aac1e3f0311e942474a2169ece232f00 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 27 Jul 2025 03:37:35 +0200
Subject: [PATCH 269/613] test: block header serde roundtrip

---
 src/segment/block/header.rs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs
index 6db1042e..8be30b11 100644
--- a/src/segment/block/header.rs
+++ b/src/segment/block/header.rs
@@ -140,3 +140,27 @@ impl Decode for Header {
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use test_log::test;
+
+    #[test]
+    fn v3_block_header_serde_roundtrip() -> crate::Result<()> {
+        let header = Header {
+            block_type: BlockType::Data,
+            checksum: Checksum::from_raw(5),
+            data_length: 252_356,
+            previous_block_offset: BlockOffset(35),
+            uncompressed_length: 124_124_124,
+        };
+
+        let bytes = header.encode_into_vec();
+
+        assert_eq!(bytes.len(), Header::serialized_len());
+        assert_eq!(header, Header::decode_from(&mut &bytes[..])?);
+
+        Ok(())
+    }
+}

From c168872fd7f8f8b55342635d2fa0e1dd719580aa Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 27 Jul 2025 03:38:18 +0200
Subject: [PATCH 270/613] fix: optimize_runs

---
 src/version/key_range_partition.rs | 351 -----------------------------
 src/version/mod.rs                 |  59 +----
 src/version/optimize.rs            | 149 ++++++++++++
src/version/run.rs | 7 + 4 files changed, 164 insertions(+), 402 deletions(-) delete mode 100644 src/version/key_range_partition.rs create mode 100644 src/version/optimize.rs diff --git a/src/version/key_range_partition.rs b/src/version/key_range_partition.rs deleted file mode 100644 index 819f6cdb..00000000 --- a/src/version/key_range_partition.rs +++ /dev/null @@ -1,351 +0,0 @@ -use crate::{ - binary_search::partition_point, version::run::Ranged, KeyRange, Segment, SegmentId, UserKey, -}; -use std::{ - collections::{HashSet, VecDeque}, - fmt::Debug, -}; - -pub trait Identifiable { - fn id(&self) -> Id; -} - -impl Identifiable for Segment { - fn id(&self) -> SegmentId { - self.id() - } -} - -#[derive(Clone, Debug)] -pub struct Partition> { - key_range: KeyRange, - segments: VecDeque, -} - -#[derive(Clone, Debug, Default)] -pub struct KeyRangePartitions>( - Vec>, -); - -impl> KeyRangePartitions { - pub fn new(pairs: impl Iterator) -> Self { - let mut partitions = vec![]; - - for (start_key, end_key) in pairs { - partitions.push(Partition { - key_range: KeyRange::new((start_key, end_key)), - segments: VecDeque::new(), - }); - } - - Self(partitions) - } - - pub fn index_segment(&mut self, segment: &T) { - let key_range = &segment.key_range(); - let start_key = key_range.min(); - - let idx = partition_point(&self.0, |x| x.key_range.max() < start_key); - - if let Some(slice) = self.0.get_mut(idx..) { - for partition in slice - .iter_mut() - .filter(|x| x.key_range.overlaps_with_key_range(key_range)) - { - partition.segments.push_back(segment.clone()); - } - } - } - - pub fn into_optimized_runs(mut self) -> Vec> { - let mut optimized = VecDeque::new(); - let mut blacklist = HashSet::::default(); - - while self - .0 - .iter() - .any(|partition| !partition.segments.is_empty()) - { - let run = { - let mut v: Vec = vec![]; - - for partition in &mut self.0 { - let Some(front) = partition.segments.front() else { - continue; - }; - - let curr_id = front.id(); - - if blacklist.contains(&curr_id) { - partition.segments.pop_front().expect("front should exist"); - continue; - } - - if v.iter() - .any(|x| x.key_range().overlaps_with_key_range(front.key_range())) - { - continue; - } - - // NOTE: We just got front previously - #[allow(clippy::expect_used)] - v.push(partition.segments.pop_front().expect("front should exist")); - - blacklist.insert(curr_id); - } - - v - }; - - #[cfg(debug_assertions)] - { - let ranges = run.iter().map(Ranged::key_range).collect::>(); - debug_assert!(KeyRange::is_disjoint(&ranges)); - } - - if !run.is_empty() { - optimized.push_front(run); - } - } - - optimized.into() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[derive(Clone, Debug, PartialEq, Eq)] - struct FauxSegment { - key_range: KeyRange, - id: SegmentId, - } - - impl Identifiable for FauxSegment { - fn id(&self) -> SegmentId { - self.id - } - } - - impl Ranged for FauxSegment { - fn key_range(&self) -> &KeyRange { - &self.key_range - } - } - - #[test] - fn key_range_partition_single_key_twice() { - let a = FauxSegment { - key_range: KeyRange::new((UserKey::new(&[0; 8]), UserKey::new(&[0; 8]))), - id: 0, - }; - let b = FauxSegment { - key_range: KeyRange::new((UserKey::new(&[0; 8]), UserKey::new(&[0; 8]))), - id: 1, - }; - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(&[0; 8]), - UserKey::new(&[0; 8]), - ))); - - index.index_segment(&a); - index.index_segment(&b); - - assert_eq!( - vec![vec![b.clone()], vec![a.clone()]], - index.into_optimized_runs() - 
); - } - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(&[0; 8]), - UserKey::new(&[0; 8]), - ))); - - index.index_segment(&b); - index.index_segment(&a); - - assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); - } - } - - #[test] - fn key_range_partition_single_key() { - let a = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), - id: 0, - }; - let b = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"a"))), - id: 1, - }; - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(b"a"), - UserKey::new(b"b"), - ))); - - index.index_segment(&a); - index.index_segment(&b); - - assert_eq!( - vec![vec![b.clone()], vec![a.clone()]], - index.into_optimized_runs() - ); - } - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(b"a"), - UserKey::new(b"b"), - ))); - - index.index_segment(&b); - index.index_segment(&a); - - assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); - } - } - - #[test] - fn key_range_partition_one_segment() { - let segment = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), - id: 0, - }; - - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(b"a"), - UserKey::new(b"b"), - ))); - - index.index_segment(&segment); - - assert_eq!(vec![vec![segment]], index.into_optimized_runs()); - } - - #[test] - fn key_range_partition_two_to_one() { - let a = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"b"))), - id: 0, - }; - let b = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"c"), UserKey::new(b"d"))), - id: 1, - }; - - { - let mut index = KeyRangePartitions::::new( - [ - (UserKey::new(b"a"), UserKey::new(b"b")), - (UserKey::new(b"b"), UserKey::new(b"c")), - (UserKey::new(b"c"), UserKey::new(b"d")), - ] - .into_iter(), - ); - - index.index_segment(&a); - index.index_segment(&b); - - eprintln!("{index:#?}"); - - assert_eq!( - vec![vec![a.clone(), b.clone()]], - index.into_optimized_runs() - ); - } - - { - let mut index = KeyRangePartitions::::new( - [ - (UserKey::new(b"a"), UserKey::new(b"b")), - (UserKey::new(b"b"), UserKey::new(b"c")), - (UserKey::new(b"c"), UserKey::new(b"d")), - ] - .into_iter(), - ); - - index.index_segment(&b); - index.index_segment(&a); - - assert_eq!(vec![vec![a, b]], index.into_optimized_runs()); - } - } - - #[test] - fn key_range_partition_full_overlap() { - let a = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"z"))), - id: 0, - }; - let b = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"z"))), - id: 1, - }; - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(b"a"), - UserKey::new(b"z"), - ))); - - index.index_segment(&a); - index.index_segment(&b); - - assert_eq!( - vec![vec![b.clone()], vec![a.clone()]], - index.into_optimized_runs() - ); - } - - { - let mut index = KeyRangePartitions::::new(std::iter::once(( - UserKey::new(b"a"), - UserKey::new(b"z"), - ))); - - index.index_segment(&b); - index.index_segment(&a); - - assert_eq!(vec![vec![a], vec![b]], index.into_optimized_runs()); - } - } - - #[test] - fn key_range_partition_partial_overlap() { - let a = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"a"), UserKey::new(b"k"))), - id: 0, - }; - let b = FauxSegment { - key_range: KeyRange::new((UserKey::new(b"c"), UserKey::new(b"z"))), - id: 1, - }; - - { - let mut index = 
KeyRangePartitions::::new( - [ - (UserKey::new(b"a"), UserKey::new(b"c")), - (UserKey::new(b"c"), UserKey::new(b"k")), - (UserKey::new(b"k"), UserKey::new(b"z")), - ] - .into_iter(), - ); - - index.index_segment(&a); - index.index_segment(&b); - - assert_eq!( - vec![vec![b.clone()], vec![a.clone()]], - index.into_optimized_runs() - ); - } - } -} diff --git a/src/version/mod.rs b/src/version/mod.rs index 2e1f92b7..49075f76 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -1,12 +1,16 @@ -pub mod key_range_partition; +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +mod optimize; pub mod run; pub use run::Run; -use crate::{HashSet, KeyRange, Segment, SegmentId, UserKey}; -use key_range_partition::KeyRangePartitions; +use crate::{HashSet, KeyRange, Segment, SegmentId}; +use optimize::optimize_runs; use run::Ranged; -use std::{collections::BTreeSet, ops::Deref, sync::Arc}; +use std::{ops::Deref, sync::Arc}; pub type VersionId = u64; @@ -94,8 +98,6 @@ impl Level { } pub fn first_run(&self) -> Option<&Arc>> { - assert!(self.runs.len() <= 1, "should have at most one run"); - self.runs.first() } @@ -146,51 +148,6 @@ impl std::ops::Deref for Version { } } -// TODO: optimize runs unit test(s) -pub fn optimize_runs(level: Vec>) -> Vec> { - if level.len() <= 1 { - level - } else { - let mut key_range_boundaries: BTreeSet = BTreeSet::::default(); - - for run in &level { - for fragment in run.iter() { - let key_range = &fragment.metadata.key_range; - key_range_boundaries.insert(key_range.min().clone()); - key_range_boundaries.insert(key_range.max().clone()); - } - } - - let range_boundaries = key_range_boundaries - .into_iter() - .flat_map(|key| vec![key.clone(), key]) - .collect::>(); - - let mut index = KeyRangePartitions::new(range_boundaries.windows(2).map(|pair| { - // NOTE: We are iterating over pairs, so index 0 and 1 always exist - #[allow(clippy::expect_used)] - #[allow(clippy::get_first)] - ( - pair.get(0).expect("exists").clone(), - pair.get(1).expect("exists").clone(), - ) - })); - - // IMPORTANT: Index from bottom to top - for run in level.iter().rev() { - for segment in run.iter() { - index.index_segment(segment); - } - } - - index - .into_optimized_runs() - .into_iter() - .map(Run::new) - .collect() - } -} - // TODO: impl using generics so we can easily unit test Version transformation functions impl Version { pub fn id(&self) -> VersionId { diff --git a/src/version/optimize.rs b/src/version/optimize.rs new file mode 100644 index 00000000..2ab53184 --- /dev/null +++ b/src/version/optimize.rs @@ -0,0 +1,149 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::run::Ranged; +use crate::version::Run; +use std::fmt::Debug; + +pub fn optimize_runs(runs: Vec>) -> Vec> { + if runs.len() <= 1 { + runs + } else { + let mut new_runs: Vec> = Vec::new(); + + for run in runs.iter().rev() { + 'run: for segment in run.iter().rev() { + for existing_run in new_runs.iter_mut().rev() { + if existing_run + .iter() + .all(|x| !segment.key_range().overlaps_with_key_range(x.key_range())) + { + existing_run.push(segment.clone()); + continue 'run; + } + } + + new_runs.insert(0, Run::new(vec![segment.clone()])); + } + } + + new_runs + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::KeyRange; + use test_log::test; + + #[derive(Clone, Debug, 
Eq, PartialEq)] + struct FakeSegment { + id: u64, + key_range: KeyRange, + } + + impl Ranged for FakeSegment { + fn key_range(&self) -> &KeyRange { + &self.key_range + } + } + + fn s(id: u64, min: &str, max: &str) -> FakeSegment { + FakeSegment { + id, + key_range: KeyRange::new((min.as_bytes().into(), max.as_bytes().into())), + } + } + + #[test] + fn optimize_runs_empty() { + let runs = vec![]; + let runs = optimize_runs::(runs); + + assert_eq!(Vec::>::new(), &*runs); + } + + #[test] + fn optimize_runs_one() { + let runs = vec![Run::new(vec![s(0, "a", "b")])]; + let runs = optimize_runs::(runs); + + assert_eq!(vec![Run::new(vec![s(0, "a", "b")])], &*runs); + } + + #[test] + fn optimize_runs_two_overlap() { + let runs = vec![ + Run::new(vec![s(0, "a", "b")]), + Run::new(vec![s(1, "a", "b")]), + ]; + let runs = optimize_runs::(runs); + + assert_eq!( + vec![ + Run::new(vec![s(0, "a", "b")]), + Run::new(vec![s(1, "a", "b")]) + ], + &*runs + ); + } + + #[test] + #[ignore = "fix!!!"] + fn optimize_runs_two_overlap_2() { + let runs = vec![ + Run::new(vec![s(0, "a", "z")]), + Run::new(vec![s(1, "c", "f")]), + ]; + let runs = optimize_runs::(runs); + + assert_eq!( + vec![ + Run::new(vec![s(0, "a", "z")]), + Run::new(vec![s(1, "c", "f")]) + ], + &*runs + ); + } + + #[test] + fn optimize_runs_two_overlap_3() { + let runs = vec![ + Run::new(vec![s(0, "c", "f")]), + Run::new(vec![s(1, "a", "z")]), + ]; + let runs = optimize_runs::(runs); + + assert_eq!( + vec![ + Run::new(vec![s(0, "c", "f")]), + Run::new(vec![s(1, "a", "z")]) + ], + &*runs + ); + } + + #[test] + fn optimize_runs_two_disjoint() { + let runs = vec![ + Run::new(vec![s(0, "a", "c")]), + Run::new(vec![s(1, "d", "f")]), + ]; + let runs = optimize_runs::(runs); + + assert_eq!(vec![Run::new(vec![s(0, "a", "c"), s(1, "d", "f")])], &*runs); + } + + #[test] + fn optimize_runs_two_disjoint_2() { + let runs = vec![ + Run::new(vec![s(1, "d", "f")]), + Run::new(vec![s(0, "a", "c")]), + ]; + let runs = optimize_runs::(runs); + + assert_eq!(vec![Run::new(vec![s(0, "a", "c"), s(1, "d", "f")])], &*runs); + } +} diff --git a/src/version/run.rs b/src/version/run.rs index ff8f6c73..071fd394 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use crate::{binary_search::partition_point, KeyRange}; use std::ops::{Bound, RangeBounds}; @@ -63,6 +67,9 @@ impl Run { pub fn push(&mut self, item: T) { self.0.push(item); + + self.0 + .sort_by(|a, b| a.key_range().min().cmp(b.key_range().min())); } pub fn extend(&mut self, items: Vec) { From a1f41dc9ff6d88b3b34b9b78cc8077305a509a87 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 27 Jul 2025 13:41:04 +0200 Subject: [PATCH 271/613] gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8db24cc8..5bd01e14 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ Cargo.lock mutants* profile.json +fuzz*/**/out* From 075233872475a8f845a562ae6b37fec75c7c87c9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 27 Jul 2025 14:55:47 +0200 Subject: [PATCH 272/613] wip: bloom --- benches/bloom.rs | 126 +++++++-- src/segment/filter/blocked_bloom/builder.rs | 29 +- src/segment/filter/blocked_bloom/mod.rs | 291 ++++++++++++-------- src/segment/filter/mod.rs | 63 +---- 4 files changed, 292 insertions(+), 217 deletions(-) diff --git a/benches/bloom.rs b/benches/bloom.rs index 
c6700ad9..3260a9a4 100644 --- a/benches/bloom.rs +++ b/benches/bloom.rs @@ -1,27 +1,99 @@ use criterion::{criterion_group, criterion_main, Criterion}; +use rand::{Rng, RngCore}; + +// Not really worth it anymore on new CPUs...? +fn fast_block_index(c: &mut Criterion) { + pub fn fast_impl(h: u64, num_blocks: usize) -> usize { + // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + (((h >> 32).wrapping_mul(num_blocks as u64)) >> 32) as usize + } + + let mut rng = rand::rng(); + let num_blocks = 100_000; + + c.bench_function("block index - mod", |b| { + b.iter(|| { + let h: u64 = rng.random(); + criterion::black_box(h % (num_blocks as u64)) + }); + }); + + c.bench_function("block index - fast", |b| { + b.iter(|| { + let h: u64 = rng.random(); + criterion::black_box(fast_impl(h, num_blocks)) + }); + }); +} fn standard_filter_construction(c: &mut Criterion) { use lsm_tree::segment::filter::standard_bloom::Builder; - let mut filter = Builder::with_fp_rate(500_000_000, 0.01); + let mut rng = rand::rng(); + + c.bench_function("standard bloom filter add key, 1M", |b| { + let mut filter = Builder::with_fp_rate(1_000_000, 0.01); + + b.iter(|| { + let mut key = [0; 16]; + rng.fill_bytes(&mut key); + + filter.set_with_hash(Builder::get_hash(&key)); + }); + }); + + c.bench_function("standard bloom filter add key, 10M", |b| { + let mut filter = Builder::with_fp_rate(10_000_000, 0.01); - c.bench_function("standard bloom filter add key", |b| { b.iter(|| { - let key = nanoid::nanoid!(); - filter.set_with_hash(Builder::get_hash(key.as_bytes())); + let mut key = [0; 16]; + rng.fill_bytes(&mut key); + + filter.set_with_hash(Builder::get_hash(&key)); + }); + }); +} + +fn blocked_filter_construction(c: &mut Criterion) { + use lsm_tree::segment::filter::blocked_bloom::Builder; + + let mut rng = rand::rng(); + + c.bench_function("blocked bloom filter add key, 1M", |b| { + let mut filter = Builder::with_fp_rate(1_000_000, 0.01); + + b.iter(|| { + let mut key = [0; 16]; + rng.fill_bytes(&mut key); + + filter.set_with_hash(Builder::get_hash(&key)); + }); + }); + + c.bench_function("blocked bloom filter add key, 10M", |b| { + let mut filter = Builder::with_fp_rate(10_000_000, 0.01); + + b.iter(|| { + let mut key = [0; 16]; + rng.fill_bytes(&mut key); + + filter.set_with_hash(Builder::get_hash(&key)); }); }); } fn standard_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::{standard_bloom::Builder, AMQ}; + use lsm_tree::segment::filter::standard_bloom::Builder; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) .collect::>(); - for fpr in [0.01, 0.001, 0.0001, 0.00001] { - let mut filter = Builder::with_fp_rate(100_000_000, fpr); + for fpr in [0.1, 0.01, 0.001, 0.0001, 0.00001] { + // NOTE: Purposefully bloat bloom filter size to run into more CPU cache misses + let n = 100_000_000; + + let mut filter = Builder::with_fp_rate(n, fpr); for key in &keys { filter.set_with_hash(Builder::get_hash(key)); @@ -29,7 +101,7 @@ fn standard_filter_contains(c: &mut Criterion) { let mut rng = rand::rng(); - let filter = filter.build(); + let filter_bytes = filter.build(); c.bench_function( &format!( @@ -39,6 +111,11 @@ fn standard_filter_contains(c: &mut Criterion) { |b| { b.iter(|| { use rand::seq::IndexedRandom; + use lsm_tree::segment::filter::standard_bloom::StandardBloomFilterReader as Reader; + + // NOTE: To make the costs more realistic, we + // pretend we are reading the filter straight from the block + let filter = Reader::new(&filter_bytes).unwrap(); let sample = 
keys.choose(&mut rng).unwrap(); let hash = Builder::get_hash(sample); @@ -49,28 +126,18 @@ fn standard_filter_contains(c: &mut Criterion) { } } -fn blocked_filter_construction(c: &mut Criterion) { - use lsm_tree::segment::filter::blocked_bloom::Builder; - - let mut filter = Builder::with_fp_rate(500_000_000, 0.01); - - c.bench_function("blocked bloom filter add key", |b| { - b.iter(|| { - let key = nanoid::nanoid!(); - filter.set_with_hash(Builder::get_hash(key.as_bytes())); - }); - }); -} - fn blocked_filter_contains(c: &mut Criterion) { - use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQ}; + use lsm_tree::segment::filter::blocked_bloom::Builder; let keys = (0..100_000u128) .map(|x| x.to_be_bytes().to_vec()) .collect::>(); - for fpr in [0.01, 0.001, 0.0001, 0.00001] { - let mut filter = Builder::with_fp_rate(100_000_000, fpr); + for fpr in [0.1, 0.01, 0.001, 0.0001, 0.00001] { + // NOTE: Purposefully bloat bloom filter size to run into more CPU cache misses + let n = 100_000_000; + + let mut filter = Builder::with_fp_rate(n, fpr); for key in &keys { filter.set_with_hash(Builder::get_hash(key)); @@ -78,7 +145,7 @@ fn blocked_filter_contains(c: &mut Criterion) { let mut rng = rand::rng(); - let filter = filter.build(); + let filter_bytes = filter.build(); c.bench_function( &format!( @@ -88,6 +155,11 @@ fn blocked_filter_contains(c: &mut Criterion) { |b| { b.iter(|| { use rand::seq::IndexedRandom; + use lsm_tree::segment::filter::blocked_bloom::BlockedBloomFilterReader as Reader; + + // NOTE: To make the costs more realistic, we + // pretend we are reading the filter straight from the block + let filter = Reader::new(&filter_bytes).unwrap(); let sample = keys.choose(&mut rng).unwrap(); let hash = Builder::get_hash(sample); @@ -97,11 +169,13 @@ fn blocked_filter_contains(c: &mut Criterion) { ); } } + criterion_group!( benches, + fast_block_index, standard_filter_construction, - standard_filter_contains, blocked_filter_construction, + standard_filter_contains, blocked_filter_contains, ); criterion_main!(benches); diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index fa053c23..88e178eb 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -3,7 +3,9 @@ // (found in the LICENSE-* files in the repository) use super::super::bit_array::Builder as BitArrayBuilder; -use crate::segment::filter::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; +use crate::{file::MAGIC_BYTES, segment::filter::CACHE_LINE_BYTES}; +use byteorder::{LittleEndian, WriteBytesExt}; +use std::io::Write; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); @@ -24,12 +26,25 @@ pub struct Builder { #[allow(clippy::len_without_is_empty)] impl Builder { #[must_use] - pub fn build(self) -> BlockedBloomFilter { - BlockedBloomFilter { - inner: BitArrayReader::new(self.inner.bytes().into()), - k: self.k, - num_blocks: self.num_blocks, - } + pub fn build(&self) -> Vec { + let mut v = vec![]; + + // Write header + v.write_all(&MAGIC_BYTES).expect("should not fail"); + + // NOTE: Filter type (unused) + v.write_u8(0).expect("should not fail"); + + // NOTE: Hash type (unused) + v.write_u8(0).expect("should not fail"); + + v.write_u64::(self.num_blocks as u64) + .expect("should not fail"); + v.write_u64::(self.k as u64) + .expect("should not fail"); + v.write_all(self.inner.bytes()).expect("should not fail"); + + v } /// Constructs a bloom filter that can hold `n` items diff --git 
a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 81eba094..a0739c0d 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -3,18 +3,25 @@ // (found in the LICENSE-* files in the repository) mod builder; -use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; -use crate::{ - coding::{DecodeError, Encode, EncodeError}, - file::MAGIC_BYTES, -}; + pub use builder::Builder; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use std::io::{Read, Write}; + +use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; +use crate::file::MAGIC_BYTES; +use byteorder::{LittleEndian, ReadBytesExt}; +use std::io::{Cursor, Read}; /// Two hashes that are used for double hashing pub type CompositeHash = (u64, u64); +/// A blocked bloom filter +/// +/// Allows buffering the key hashes before actual filter construction +/// which is needed to properly calculate the filter size, as the number of items +/// is unknown during segment construction. +/// +/// The filter uses double hashing instead of `k` hash functions, see: +/// #[derive(Debug, PartialEq)] pub struct BlockedBloomFilterReader<'a> { /// Raw bytes exposed as bit array @@ -27,48 +34,93 @@ pub struct BlockedBloomFilterReader<'a> { num_blocks: usize, } -// impl<'a> BlockedBloomFilterReader<'a> { -// fn bytes(&self) -> &[u8] { -// self.inner.bytes() -// } +impl<'a> BlockedBloomFilterReader<'a> { + pub fn new(slice: &'a [u8]) -> crate::Result<Self> { + let mut reader = Cursor::new(slice); -// /// Size of bloom filter in bytes -// #[must_use] -// fn len(&self) -> usize { -// self.inner.bytes().len() -// } + // Check header + let mut magic = [0u8; MAGIC_BYTES.len()]; + reader.read_exact(&mut magic)?; + + if magic != MAGIC_BYTES { + return Err(crate::Error::Decode(crate::DecodeError::InvalidHeader( + "BloomFilter", + ))); + } -// /// Returns `true` if the hash may be contained. -// /// -// /// Will never have a false negative. -// #[must_use] -// fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { -// let block_idx = h1 % (self.num_blocks as u64); + // NOTE: Filter type (unused) + let filter_type = reader.read_u8()?; + assert_eq!(0, filter_type, "Invalid filter type"); -// for i in 1..(self.k as u64) { -// let bit_idx = h1 % (CACHE_LINE_BYTES as u64 * 8); + // NOTE: Hash type (unused) + let hash_type = reader.read_u8()?; + assert_eq!(0, hash_type, "Invalid bloom hash type"); -// // NOTE: should be in bounds because of modulo -// #[allow(clippy::expect_used, clippy::cast_possible_truncation)] -// if !self.has_bit(block_idx as usize, bit_idx as usize) { -// return false; -// } + let num_blocks = reader.read_u64::<LittleEndian>()? as usize; + let k = reader.read_u64::<LittleEndian>()? as usize; -// h1 = h1.wrapping_add(h2); -// h2 = h2.wrapping_mul(i); -// } + let offset = reader.position() as usize; -// true -// } + #[allow(clippy::indexing_slicing)] + Ok(Self { + k, + num_blocks, + inner: BitArrayReader::new(slice.get(offset..).expect("should be in bounds")), + }) + } -// /// Returns `true` if the item may be contained. -// /// -// /// Will never have a false negative. -// #[must_use] -// fn contains(&self, key: &[u8]) -> bool { -// self.contains_hash(Self::get_hash(key)) -// } -// } + fn bytes(&self) -> &[u8] { + self.inner.bytes() + } + + /// Size of bloom filter in bytes + #[must_use] + fn len(&self) -> usize { + self.inner.bytes().len() + } + + /// Returns `true` if the hash may be contained. + /// + /// Will never have a false negative.
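+ /// A `true` result may still be a false positive, so the caller has to verify the hit against the actual data.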
+ #[must_use] + pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { + let block_idx = h1 % (self.num_blocks as u64); + + for i in 1..(self.k as u64) { + let bit_idx = h1 % (CACHE_LINE_BYTES as u64 * 8); + + // NOTE: should be in bounds because of modulo + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + if !self.has_bit(block_idx as usize, bit_idx as usize) { + return false; + } + + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_mul(i); + } + + true + } + + /// Returns `true` if the bit at `idx` is `1`. + fn has_bit(&self, block_idx: usize, idx_in_block: usize) -> bool { + self.inner + .get(Builder::get_bit_idx(block_idx, idx_in_block)) + } + + /// Gets the hash of a key. + pub fn get_hash(key: &[u8]) -> CompositeHash { + Builder::get_hash(key) + } + + /// Returns `true` if the item may be contained. + /// + /// Will never have a false negative. + #[must_use] + pub fn contains(&self, key: &[u8]) -> bool { + self.contains_hash(Self::get_hash(key)) + } +} // impl<'a> Encode for BlockedBloomFilter<'a> { // fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> { @@ -115,11 +167,6 @@ pub struct BlockedBloomFilterReader<'a> { // num_blocks, // } // } -// /// Returns `true` if the bit at `idx` is `1`. -// fn has_bit(&self, block_idx: usize, idx_in_block: usize) -> bool { -// self.inner -// .get(Builder::get_bit_idx(block_idx, idx_in_block)) -// } // /// Gets the hash of a key. // pub fn get_hash(key: &[u8]) -> CompositeHash { @@ -133,79 +180,79 @@ mod tests { use std::fs::File; use test_log::test; - #[test] - fn blocked_bloom_serde_round_trip() -> crate::Result<()> { - let dir = tempfile::tempdir()?; - - let path = dir.path().join("bf"); - let mut file = File::create(&path)?; - - let mut filter = Builder::with_fp_rate(10, 0.0001); - - let keys = &[ - b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", - b"item8", b"item9", - ]; - - for key in keys { - filter.set_with_hash(BlockedBloomFilter::get_hash(*key)); - } - - let filter = filter.build(); - - for key in keys { - assert!(filter.contains(&**key)); - } - assert!(!filter.contains(b"asdasads")); - assert!(!filter.contains(b"item10")); - assert!(!filter.contains(b"cxycxycxy")); - - filter.encode_into(&mut file)?; - file.sync_all()?; - drop(file); - - let mut file = File::open(&path)?; - let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; - - assert_eq!(filter.inner.bytes(), filter_copy.bytes()); - assert!(matches!(filter_copy, AMQFilter::BlockedBloom(_))); - - for key in keys { - assert!(filter.contains(&**key)); - } - assert!(!filter_copy.contains(b"asdasads")); - assert!(!filter_copy.contains(b"item10")); - assert!(!filter_copy.contains(b"cxycxycxy")); - - Ok(()) - } - - #[test] - fn blocked_bloom_basic() { - let mut filter = Builder::with_fp_rate(10, 0.0001); - let keys = [ - b"item0" as &[u8], - b"item1", - b"item2", - b"item3", - b"item4", - b"item5", - b"item6", - b"item7", - b"item8", - b"item9", - ]; - - for key in &keys { - filter.set_with_hash(Builder::get_hash(key)); - } - - let filter = filter.build(); - - for key in &keys { - assert!(filter.contains(key)); - } - - assert!(!filter.contains(b"asdasdasdasdasdasdasd")); - } + // #[test] + // fn blocked_bloom_serde_round_trip() -> crate::Result<()> { + // let dir = tempfile::tempdir()?; + + // let path = dir.path().join("bf"); + // let mut file = File::create(&path)?; + + // let mut filter = Builder::with_fp_rate(10, 0.0001); + + // let keys = &[ + // b"item0", b"item1", b"item2", b"item3", b"item4",
b"item5", b"item6", b"item7", + // b"item8", b"item9", + // ]; + + // for key in keys { + // filter.set_with_hash(BlockedBloomFilter::get_hash(*key)); + // } + + // let filter = filter.build(); + + // for key in keys { + // assert!(filter.contains(&**key)); + // } + // assert!(!filter.contains(b"asdasads")); + // assert!(!filter.contains(b"item10")); + // assert!(!filter.contains(b"cxycxycxy")); + + // filter.encode_into(&mut file)?; + // file.sync_all()?; + // drop(file); + + // let mut file = File::open(&path)?; + // let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; + + // assert_eq!(filter.inner.bytes(), filter_copy.bytes()); + // assert!(matches!(filter_copy, AMQFilter::BlockedBloom(_))); + + // for key in keys { + // assert!(filter.contains(&**key)); + // } + // assert!(!filter_copy.contains(b"asdasads")); + // assert!(!filter_copy.contains(b"item10")); + // assert!(!filter_copy.contains(b"cxycxycxy")); + + // Ok(()) + // } + + // #[test] + // fn blocked_bloom_basic() { + // let mut filter = Builder::with_fp_rate(10, 0.0001); + // let keys = [ + // b"item0" as &[u8], + // b"item1", + // b"item2", + // b"item3", + // b"item4", + // b"item5", + // b"item6", + // b"item7", + // b"item8", + // b"item9", + // ]; + + // for key in &keys { + // filter.set_with_hash(Builder::get_hash(key)); + // } + + // let filter = filter.build(); + + // for key in &keys { + // assert!(filter.contains(key)); + // } + + // assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + // } } diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index cd915b32..86c7ad89 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -3,14 +3,10 @@ // (found in the LICENSE-* files in the repository) pub mod bit_array; -// pub mod blocked_bloom; +pub mod blocked_bloom; pub mod standard_bloom; use standard_bloom::Builder as StandardBloomFilterBuilder; -// use crate::{coding::DecodeError, file::MAGIC_BYTES}; -// use blocked_bloom::BlockedBloomFilter; -// use byteorder::ReadBytesExt; -// use std::io::Read; const CACHE_LINE_BYTES: usize = 64; @@ -45,60 +41,3 @@ impl BloomConstructionPolicy { } } } - -// #[enum_dispatch::enum_dispatch] -// pub trait AMQ { -// fn bytes(&self) -> &[u8]; -// fn len(&self) -> usize; -// fn contains(&self, item: &[u8]) -> bool; -// fn contains_hash(&self, hash: (u64, u64)) -> bool; -// } - -// #[enum_dispatch::enum_dispatch(AMQ)] -// #[derive(PartialEq, Debug)] -// pub enum AMQFilter { -// StandardBloom(StandardBloomFilter), -// BlockedBloom(BlockedBloomFilter), -// } - -// pub enum BloomFilterType { -// StandardBloom = 0, -// BlockedBloom = 1, -// } - -// impl TryFrom for BloomFilterType { -// type Error = (); -// fn try_from(value: u8) -> Result { -// match value { -// 0 => Ok(Self::StandardBloom), -// 1 => Ok(Self::BlockedBloom), -// _ => Err(()), -// } -// } -// } - -// pub struct AMQFilterBuilder {} - -// impl AMQFilterBuilder { -// pub fn decode_from(reader: &mut R) -> Result { -// // Check header -// let mut magic = [0u8; MAGIC_BYTES.len()]; -// reader.read_exact(&mut magic)?; - -// if magic != MAGIC_BYTES { -// return Err(DecodeError::InvalidHeader("BloomFilter")); -// } - -// let filter_type = reader.read_u8()?; - -// match BloomFilterType::try_from(filter_type) { -// Ok(BloomFilterType::StandardBloom) => { -// StandardBloomFilter::decode_from(reader).map_err(|e| DecodeError::from(e)) -// } -// Ok(BloomFilterType::BlockedBloom) => { -// BlockedBloomFilter::decode_from(reader).map_err(|e| DecodeError::from(e)) -// } -// _ => 
Err(DecodeError::InvalidHeader("Unknown filter type")), -// } -// } -// } From d376265ad7e764a9c2d28f500f6d4d9f093c80ef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 27 Jul 2025 15:26:43 +0200 Subject: [PATCH 273/613] wip --- src/segment/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d55700c7..3b2eafac 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -29,10 +29,8 @@ pub use writer::Writer; use crate::metrics::Metrics; use crate::{ - cache::Cache, - descriptor_table::DescriptorTable, - segment::block::{BlockType, ParsedItem}, - CompressionType, InternalValue, SeqNo, TreeId, UserKey, + cache::Cache, descriptor_table::DescriptorTable, segment::block::BlockType, CompressionType, + InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; use filter::standard_bloom::CompositeHash; From 0a7b8271445b52999a52d155c6934ee08a7bf584 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 30 Jul 2025 15:25:57 +0200 Subject: [PATCH 274/613] bloom filters adjustments --- src/segment/filter/bit_array/reader.rs | 7 ++++- src/segment/filter/blocked_bloom/builder.rs | 26 ++++++++++++----- src/segment/filter/blocked_bloom/mod.rs | 14 +++++---- src/segment/filter/mod.rs | 30 ++++++++++++++++++++ src/segment/filter/standard_bloom/builder.rs | 9 +++--- src/segment/filter/standard_bloom/mod.rs | 14 +++++---- 6 files changed, 78 insertions(+), 22 deletions(-) diff --git a/src/segment/filter/bit_array/reader.rs b/src/segment/filter/bit_array/reader.rs index ea8c2fb9..b602dacb 100644 --- a/src/segment/filter/bit_array/reader.rs +++ b/src/segment/filter/bit_array/reader.rs @@ -27,12 +27,17 @@ impl<'a> BitArrayReader<'a> { self.0 } - /// Gets the i-th bit + /// Gets the i-th bit. 
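+ /// With the `bloom_use_unsafe` feature enabled, bounds are not checked, so an out-of-bounds `idx` is undefined behavior.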
#[must_use] pub fn get(&self, idx: usize) -> bool { let byte_idx = idx / 8; + + #[cfg(not(feature = "bloom_use_unsafe"))] let byte = self.0.get(byte_idx).expect("should be in bounds"); + #[cfg(feature = "bloom_use_unsafe")] + let byte = unsafe { self.0.get_unchecked(byte_idx) }; + let bit_idx = idx % 8; get_bit(*byte, bit_idx) } diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 88e178eb..a703a748 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -3,7 +3,10 @@ // (found in the LICENSE-* files in the repository) use super::super::bit_array::Builder as BitArrayBuilder; -use crate::{file::MAGIC_BYTES, segment::filter::CACHE_LINE_BYTES}; +use crate::{ + file::MAGIC_BYTES, + segment::filter::{FilterType, CACHE_LINE_BYTES}, +}; use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; @@ -32,8 +35,9 @@ impl Builder { // Write header v.write_all(&MAGIC_BYTES).expect("should not fail"); - // NOTE: Filter type (unused) - v.write_u8(0).expect("should not fail"); + // NOTE: Filter type + v.write_u8(FilterType::BlockedBloom.into()) + .expect("should not fail"); // NOTE: Hash type (unused) v.write_u8(0).expect("should not fail"); @@ -56,10 +60,18 @@ impl Builder { assert!(n > 0); // NOTE: Some sensible minimum - let fpr = fpr.max(0.000_001); - - // TODO: m and k is still calculated by traditional standard bloom filter formula - let m = Self::calculate_m(n, fpr); + let fpr = fpr.max(0.000_000_1); + + // NOTE: We add ~5-25% more bits to account for blocked bloom filters being a bit less accurate + // See https://dl.acm.org/doi/10.1145/1498698.1594230 + let bonus = match fpr { + _ if fpr <= 0.001 => 1.25, + _ if fpr <= 0.01 => 1.2, + _ if fpr <= 0.1 => 1.1, + _ => 1.05, + }; + + let m = ((Self::calculate_m(n, fpr)) as f32 * bonus) as usize; let bpk = m / n; let k = (((bpk as f32) * LN_2) as usize).max(1); diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index a0739c0d..bafc12aa 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -7,7 +7,7 @@ mod builder; pub use builder::Builder; use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; -use crate::file::MAGIC_BYTES; +use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Read}; @@ -48,9 +48,15 @@ impl<'a> BlockedBloomFilterReader<'a> { ))); } - // NOTE: Filter type (unused) + // NOTE: Filter type let filter_type = reader.read_u8()?; - assert_eq!(0, filter_type, "Invalid filter type"); + let filter_type = FilterType::try_from(filter_type)?; + assert_eq!( + FilterType::BlockedBloom, + filter_type, + "Invalid filter type, got={filter_type:?}, expected={:?}", + FilterType::BlockedBloom, + ); // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; @@ -89,8 +95,6 @@ impl<'a> BlockedBloomFilterReader<'a> { for i in 1..(self.k as u64) { let bit_idx = h1 % (CACHE_LINE_BYTES as u64 * 8); - // NOTE: should be in bounds because of modulo - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] if !self.has_bit(block_idx as usize, bit_idx as usize) { return false; } diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 86c7ad89..58925be4 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -41,3 +41,33 @@ impl BloomConstructionPolicy { } } } + +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +enum FilterType { + 
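+ // NOTE: These values are persisted on disk as a tag byte (0 = standard, 1 = blocked, see the `From`/`TryFrom` impls below), so they must not be changed.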
StandardBloom, + BlockedBloom, +} + +impl TryFrom<u8> for FilterType { + type Error = crate::Error; + + fn try_from(value: u8) -> Result<Self, Self::Error> { + match value { + 0 => Ok(Self::StandardBloom), + 1 => Ok(Self::BlockedBloom), + _ => Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "FilterType", + value, + )))), + } + } +} + +impl From<FilterType> for u8 { + fn from(value: FilterType) -> Self { + match value { + FilterType::StandardBloom => 0, + FilterType::BlockedBloom => 1, + } + } +} diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index 649f77a7..855533a4 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::super::bit_array::Builder as BitArrayBuilder; -use crate::file::MAGIC_BYTES; +use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; @@ -32,8 +32,9 @@ impl Builder { // Write header v.write_all(&MAGIC_BYTES).expect("should not fail"); - // NOTE: Filter type (unused) - v.write_u8(0).expect("should not fail"); + // NOTE: Filter type + v.write_u8(FilterType::StandardBloom.into()) + .expect("should not fail"); // NOTE: Hash type (unused) v.write_u8(0).expect("should not fail"); @@ -56,7 +57,7 @@ impl Builder { assert!(n > 0); // NOTE: Some sensible minimum - let fpr = fpr.max(0.000_001); + let fpr = fpr.max(0.000_000_1); let m = Self::calculate_m(n, fpr); let bpk = m / n; diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 0711ca41..ec59e4e6 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -7,7 +7,7 @@ mod builder; pub use builder::{Builder, CompositeHash}; use super::bit_array::BitArrayReader; -use crate::file::MAGIC_BYTES; +use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Read}; @@ -44,9 +44,15 @@ impl<'a> StandardBloomFilterReader<'a> { ))); } - // NOTE: Filter type (unused) + // NOTE: Filter type let filter_type = reader.read_u8()?; - assert_eq!(0, filter_type, "Invalid filter type"); + let filter_type = FilterType::try_from(filter_type)?; + assert_eq!( + FilterType::StandardBloom, + filter_type, + "Invalid filter type, got={filter_type:?}, expected={:?}", + FilterType::StandardBloom + ); // NOTE: Hash type (unused) let hash_type = reader.read_u8()?; @@ -82,8 +88,6 @@ impl<'a> StandardBloomFilterReader<'a> { for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); - // NOTE: should be in bounds because of modulo - #[allow(clippy::expect_used)] if !self.has_bit(idx as usize) { return false; } From 16e787721e4fba218c43fed33938f2390eb56441 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 30 Jul 2025 15:26:17 +0200 Subject: [PATCH 275/613] wip --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index f3c371c8..2d3f122d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ lz4 = ["dep:lz4_flex"] miniz = ["dep:miniz_oxide"] bytes = [] # TODO: restore use_unsafe = [] +bloom_use_unsafe = [] metrics = [] [dependencies] From 5f5394a4401509cd9ddf1e2be892b63802c7cdf5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 31 Jul 2025 14:16:43 +0200 Subject: [PATCH 276/613] microbench: bloom speed --- microbench/bloom_speed/Cargo.toml | 12 +++ microbench/bloom_speed/run.nu | 4 + microbench/bloom_speed/src/main.rs | 113
+++++++++++++++++++++ microbench/bloom_speed/template.py | 58 +++++++++++++++ 4 files changed, 187 insertions(+) create mode 100644 microbench/bloom_speed/Cargo.toml create mode 100644 microbench/bloom_speed/run.nu create mode 100644 microbench/bloom_speed/src/main.rs create mode 100644 microbench/bloom_speed/template.py diff --git a/microbench/bloom_speed/Cargo.toml b/microbench/bloom_speed/Cargo.toml new file mode 100644 index 00000000..1fc03e0c --- /dev/null +++ b/microbench/bloom_speed/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "bloom_bench" +version = "0.1.0" +edition = "2024" + +[features] +default = [] +use_unsafe = ["lsm-tree/bloom_use_unsafe"] + +[dependencies] +lsm-tree = { path = "../..", features = ["lz4"] } +rand = "0.9.0" diff --git a/microbench/bloom_speed/run.nu b/microbench/bloom_speed/run.nu new file mode 100644 index 00000000..36f75fca --- /dev/null +++ b/microbench/bloom_speed/run.nu @@ -0,0 +1,4 @@ +rm -f data.jsonl +cargo run -r | save data.jsonl --append +cargo run -r --features use_unsafe | save data.jsonl --append +python3 template.py diff --git a/microbench/bloom_speed/src/main.rs b/microbench/bloom_speed/src/main.rs new file mode 100644 index 00000000..442a4a58 --- /dev/null +++ b/microbench/bloom_speed/src/main.rs @@ -0,0 +1,113 @@ +use rand::{Rng, RngCore}; +use std::time::Instant; + +const NUM_READS: usize = 100_000_000; + +pub fn main() { + let mut rng = rand::rng(); + + let keys = (0..100_000_000u128) + .map(|x| x.to_be_bytes()) + .collect::<Vec<_>>(); + + for fpr in [0.25, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001] { + let n = keys.len(); + + { + use lsm_tree::segment::filter::standard_bloom::Builder; + use lsm_tree::segment::filter::standard_bloom::StandardBloomFilterReader as Reader; + + let mut filter = Builder::with_fp_rate(n, fpr); + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let filter_bytes = filter.build(); + let filter = Reader::new(&filter_bytes).unwrap(); + + eprintln!("-- standard n={n} e={fpr} --"); + + { + let start = Instant::now(); + + for _ in 0..NUM_READS { + use rand::seq::IndexedRandom; + + // let sample = keys.choose(&mut rng).unwrap(); + + let mut sample = [0; 8]; + rng.fill(&mut sample); + + let hash = Builder::get_hash(&sample); + filter.contains_hash(hash); + // assert!(filter.contains_hash(hash)); + } + + let ns = start.elapsed().as_nanos(); + let per_read = ns / NUM_READS as u128; + eprintln!(" true positive in {per_read}ns"); + + #[cfg(feature = "use_unsafe")] + let use_unsafe = true; + + #[cfg(not(feature = "use_unsafe"))] + let use_unsafe = false; + + let filter_size_bytes = filter_bytes.len(); + println!( + r#"{{"key_count":{n},"fpr":{fpr},"impl":"standard","ns":{per_read},"bytes":{filter_size_bytes},"unsafe":{use_unsafe}}}"# + ); + } + } + + { + use lsm_tree::segment::filter::blocked_bloom::BlockedBloomFilterReader as Reader; + use lsm_tree::segment::filter::blocked_bloom::Builder; + + let mut filter = Builder::with_fp_rate(n, fpr); + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let filter_bytes = filter.build(); + let filter = Reader::new(&filter_bytes).unwrap(); + + eprintln!("-- blocked n={n} e={fpr} --"); + + { + let start = Instant::now(); + + for _ in 0..NUM_READS { + use rand::seq::IndexedRandom; + + // let sample = keys.choose(&mut rng).unwrap(); + + let mut sample = [0; 8]; + rng.fill(&mut sample); + + let hash = Builder::get_hash(&sample); + filter.contains_hash(hash); + + // assert!(filter.contains_hash(hash)); + } + + let ns =
start.elapsed().as_nanos(); + let per_read = ns / NUM_READS as u128; + eprintln!(" true positive in {per_read}ns"); + + #[cfg(feature = "use_unsafe")] + let use_unsafe = true; + + #[cfg(not(feature = "use_unsafe"))] + let use_unsafe = false; + + let filter_size_bytes = filter_bytes.len(); + println!( + r#"{{"key_count":{n},"fpr":{fpr},"impl":"blocked","ns":{per_read},"bytes":{filter_size_bytes},"unsafe":{use_unsafe}}}"# + ); + } + } + } +} diff --git a/microbench/bloom_speed/template.py b/microbench/bloom_speed/template.py new file mode 100644 index 00000000..05d03057 --- /dev/null +++ b/microbench/bloom_speed/template.py @@ -0,0 +1,58 @@ +import json +import matplotlib.pyplot as plt +from collections import defaultdict +from pathlib import Path +from palettable.tableau import BlueRed_6 + +colors = BlueRed_6.mpl_colors + +# Path to the JSONL file +jsonl_path = Path('data.jsonl') + +# Data structure: {(impl, unsafe): [(fpr, ns), ...]} +data = defaultdict(list) + +# Read the JSONL file +for line in jsonl_path.read_text().splitlines(): + obj = json.loads(line) + key = (obj['impl'], obj['unsafe']) + data[key].append((obj['fpr'], obj['ns'])) + +plt.rcParams.update({ + 'axes.labelsize': 8, + 'font.size': 8, + 'legend.fontsize': 10, + 'xtick.labelsize': 10, + 'ytick.labelsize': 10, + 'text.usetex': False, + 'figure.figsize': [4.5, 4.5] +}) + +# Plotting +plt.figure(figsize=(6, 4)) + +i = 0 + +for (impl, unsafe), values in data.items(): + # Sort by FPR for consistent line plots + values.sort() + fprs = [fpr for fpr, ns in values] + ns_vals = [ns for fpr, ns in values] + safe_label = "unsafe" if unsafe else "safe" + label = f"{impl}, {safe_label}" + stroke = "-." if unsafe else "-" + marker = "v" if impl == "blocked" else "o" + plt.plot(fprs, ns_vals, marker=marker, label=label, color=colors[i], linestyle=stroke) + i += 1 + +plt.xscale("log") +plt.ylim(bottom=0) +plt.xlabel("False positive rate") +plt.ylabel("Latency [ns]") +# plt.title("Read Performance vs False Positive Rate") +plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2) +plt.grid(color="0.9", linestyle='--', linewidth=1) +plt.tight_layout() +# plt.show() +plt.savefig("bloom_speed.svg") + From f16167fa1417cd6d194ea514a5287787b08221cd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 1 Aug 2025 03:13:27 +0200 Subject: [PATCH 277/613] use only 1 hash in bloom construction --- src/segment/filter/blocked_bloom/builder.rs | 17 +++++++---------- src/segment/filter/blocked_bloom/mod.rs | 16 ++++++++++------ src/segment/filter/mod.rs | 2 -- src/segment/filter/standard_bloom/builder.rs | 17 +++++++++-------- src/segment/filter/standard_bloom/mod.rs | 15 +++++++++------ src/segment/mod.rs | 3 +-- src/segment/writer/mod.rs | 2 +- 7 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index a703a748..b9a133dc 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -3,16 +3,14 @@ // (found in the LICENSE-* files in the repository) use super::super::bit_array::Builder as BitArrayBuilder; +use super::super::standard_bloom::builder::secondary_hash; use crate::{ file::MAGIC_BYTES, - segment::filter::{FilterType, CACHE_LINE_BYTES}, + segment::filter::{blocked_bloom::CACHE_LINE_BYTES, FilterType}, }; use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; -/// Two hashes that are used for double hashing -pub type CompositeHash = (u64, u64); - 
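+// NOTE: Hashing is now shared with the standard bloom filter; only a single 64-bit hash is computed per key, and the second probe hash is derived from it.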
#[derive(Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] pub struct Builder { @@ -123,7 +121,9 @@ impl Builder { } /// Adds the key to the filter. - pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { + pub fn set_with_hash(&mut self, mut h1: u64) { + let mut h2 = secondary_hash(h1); + let block_idx = h1 % (self.num_blocks as u64); for i in 1..(self.k as u64) { @@ -143,11 +143,8 @@ impl Builder { /// Gets the hash of a key. #[must_use] - pub fn get_hash(key: &[u8]) -> CompositeHash { - let h0 = xxhash_rust::xxh3::xxh3_128(key); - let h1 = (h0 >> 64) as u64; - let h2 = h0 as u64; - (h1, h2) + pub fn get_hash(key: &[u8]) -> u64 { + super::super::standard_bloom::Builder::get_hash(key) } } diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index bafc12aa..6debf2da 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -6,13 +6,15 @@ mod builder; pub use builder::Builder; -use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES}; -use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; +use super::bit_array::BitArrayReader; +use crate::{ + file::MAGIC_BYTES, + segment::filter::{standard_bloom::builder::secondary_hash, FilterType}, +}; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Read}; -/// Two hashes that are used for double hashing -pub type CompositeHash = (u64, u64); +const CACHE_LINE_BYTES: usize = 64; /// A blocked bloom filter /// @@ -89,7 +91,9 @@ impl<'a> BlockedBloomFilterReader<'a> { /// /// Will never have a false negative. #[must_use] - pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { + pub fn contains_hash(&self, mut h1: u64) -> bool { + let mut h2 = secondary_hash(h1); + let block_idx = h1 % (self.num_blocks as u64); for i in 1..(self.k as u64) { @@ -113,7 +117,7 @@ impl<'a> BlockedBloomFilterReader<'a> { } /// Gets the hash of a key. - pub fn get_hash(key: &[u8]) -> CompositeHash { + pub fn get_hash(key: &[u8]) -> u64 { Builder::get_hash(key) } diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 58925be4..641b558a 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -8,8 +8,6 @@ pub mod standard_bloom; use standard_bloom::Builder as StandardBloomFilterBuilder; -const CACHE_LINE_BYTES: usize = 64; - #[derive(Copy, Clone, Debug)] pub enum BloomConstructionPolicy { BitsPerKey(u8), diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index 855533a4..a0deba96 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -7,8 +7,10 @@ use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; -/// Two hashes that are used for double hashing -pub type CompositeHash = (u64, u64); +pub fn secondary_hash(h1: u64) -> u64 { + // Taken from https://github.com/tomtomwombat/fastbloom + h1.wrapping_shr(32).wrapping_mul(0x51_7c_c1_b7_27_22_0a_95) +} #[derive(Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] @@ -110,7 +112,9 @@ impl Builder { } /// Adds the key to the filter. - pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { + pub fn set_with_hash(&mut self, mut h1: u64) { + let mut h2 = secondary_hash(h1); + for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); @@ -123,11 +127,8 @@ impl Builder { /// Gets the hash of a key. 
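/// Note: a single 64-bit hash per key; the second hash for double hashing is derived via `secondary_hash`.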
#[must_use] - pub fn get_hash(key: &[u8]) -> CompositeHash { - let h0 = xxhash_rust::xxh3::xxh3_128(key); - let h1 = (h0 >> 64) as u64; - let h2 = h0 as u64; - (h1, h2) + pub fn get_hash(key: &[u8]) -> u64 { + xxhash_rust::xxh3::xxh3_64(key) } } diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index ec59e4e6..b505a949 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -2,12 +2,15 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -mod builder; +pub(crate) mod builder; -pub use builder::{Builder, CompositeHash}; +pub use builder::Builder; use super::bit_array::BitArrayReader; -use crate::{file::MAGIC_BYTES, segment::filter::FilterType}; +use crate::{ + file::MAGIC_BYTES, + segment::filter::{standard_bloom::builder::secondary_hash, FilterType}, +}; use byteorder::{LittleEndian, ReadBytesExt}; use std::io::{Cursor, Read}; @@ -82,8 +85,8 @@ impl<'a> StandardBloomFilterReader<'a> { /// /// Will never have a false negative. #[must_use] - pub fn contains_hash(&self, hash: CompositeHash) -> bool { - let (mut h1, mut h2) = hash; + pub fn contains_hash(&self, mut h1: u64) -> bool { + let mut h2 = secondary_hash(h1); for i in 1..=(self.k as u64) { let idx = h1 % (self.m as u64); @@ -113,7 +116,7 @@ impl<'a> StandardBloomFilterReader<'a> { } /// Gets the hash of a key. - fn get_hash(key: &[u8]) -> CompositeHash { + fn get_hash(key: &[u8]) -> u64 { Builder::get_hash(key) } } diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 3b2eafac..3ab8202d 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -33,7 +33,6 @@ use crate::{ InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; -use filter::standard_bloom::CompositeHash; use inner::Inner; use iter::Iter; use std::{ @@ -160,7 +159,7 @@ impl Segment { &self, key: &[u8], seqno: SeqNo, - key_hash: CompositeHash, + key_hash: u64, ) -> crate::Result<Option<InternalValue>> { use filter::standard_bloom::StandardBloomFilterReader; #[cfg(feature = "metrics")] diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 93187d69..aa285530 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -58,7 +58,7 @@ pub struct Writer { /// Hashes for bloom filter /// /// using enhanced double hashing, so we got two u64s - pub bloom_hash_buffer: Vec<(u64, u64)>, + pub bloom_hash_buffer: Vec<u64>, } impl Writer { From acf7cc320bf1e1b871e69e7469f6ce5edb660ea9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 8 Aug 2025 20:53:00 +0200 Subject: [PATCH 278/613] perf: zero seqnos if below GC watermark --- src/compaction/stream.rs | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index 4314009d..e8edecff 100644 --- a/src/compaction/stream.rs +++ b/src/compaction/stream.rs @@ -59,7 +59,7 @@ impl<I: Iterator<Item = crate::Result<InternalValue>>> Iterator for CompactionSt fn next(&mut self) -> Option<Self::Item> { loop { - let head = fail_iter!(self.inner.next()?); + let mut head = fail_iter!(self.inner.next()?); if let Some(peeked) = self.inner.peek() { let Ok(peeked) = peeked else { @@ -72,12 +72,9 @@ impl<I: Iterator<Item = crate::Result<InternalValue>>> Iterator for CompactionSt .expect_err("should be error"))); }; - // NOTE: Only item of this key and thus latest version, so return it no matter what if peeked.key.user_key > head.key.user_key { - return Some(Ok(head)); - } - - if peeked.key.seqno < self.gc_seqno_threshold { + // NOTE: Only item of this key and
thus latest version, so return it no matter what + } else if peeked.key.seqno < self.gc_seqno_threshold { // NOTE: If next item is an actual value, and current value is weak tombstone, // drop the tombstone let drop_weak_tombstone = peeked.key.value_type == ValueType::Value @@ -93,6 +90,13 @@ impl<I: Iterator<Item = crate::Result<InternalValue>>> Iterator for CompactionSt } } + // NOTE: Convert sequence number to zero if it is below the snapshot watermark + // + // This can save a lot of space, because "0" only takes 1 byte. + if head.key.seqno < self.gc_seqno_threshold { + head.key.seqno = 0; + } + return Some(Ok(head)); } } @@ -137,6 +141,28 @@ mod tests { }; } + #[test] + #[allow(clippy::unwrap_used)] + fn compaction_stream_seqno_zeroing_1() -> crate::Result<()> { + #[rustfmt::skip] + let vec = stream![ + "a", "", "T", + "a", "", "T", + "a", "", "T", + ]; + + let iter = vec.iter().cloned().map(Ok); + let mut iter = CompactionStream::new(iter, 1_000); + + assert_eq!( + InternalValue::from_components(*b"a", *b"", 0, ValueType::Tombstone), + iter.next().unwrap()?, + ); + iter_closed!(iter); + + Ok(()) + } + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_queue_weak_tombstones() { @@ -168,18 +194,18 @@ ]; let iter = vec.iter().cloned().map(Ok); - let mut iter = CompactionStream::new(iter, SeqNo::MAX); + let mut iter = CompactionStream::new(iter, 1_000_000); assert_eq!( - InternalValue::from_components(*b"a", *b"", 999, ValueType::Tombstone), + InternalValue::from_components(*b"a", *b"", 0, ValueType::Tombstone), iter.next().unwrap()?, ); assert_eq!( - InternalValue::from_components(*b"b", *b"", 999, ValueType::Tombstone), + InternalValue::from_components(*b"b", *b"", 0, ValueType::Tombstone), iter.next().unwrap()?, ); assert_eq!( - InternalValue::from_components(*b"c", *b"", 999, ValueType::Tombstone), + InternalValue::from_components(*b"c", *b"", 0, ValueType::Tombstone), iter.next().unwrap()?, ); iter_closed!(iter); From ea573b077e668c6196df53ef6a75b98047ae4d1e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 8 Aug 2025 22:10:15 +0200 Subject: [PATCH 279/613] fix: lint --- src/blob_tree/mod.rs | 2 +- src/compaction/worker.rs | 2 +- src/merge.rs | 2 +- src/segment/index_block/block_handle.rs | 4 ++-- src/tree/mod.rs | 2 +- src/value.rs | 15 ++++++--------- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 07d5f573..8e2110db 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -230,7 +230,7 @@ impl BlobTree { else { return Ok(None); }; - self.register_segments(&[segment.clone()])?; + self.register_segments(std::slice::from_ref(&segment))?; Ok(Some(segment)) } diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 21e11d31..3996bdc6 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -359,7 +359,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.dest_level <= 2, // TODO: look at configuration + false, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) diff --git a/src/merge.rs b/src/merge.rs index ca4bf122..0b97e8a7 100644 --- a/src/merge.rs +++ b/src/merge.rs @@ -26,7 +26,7 @@ impl Ord for HeapItem { impl PartialOrd for HeapItem { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.1.key.cmp(&other.1.key)) + Some(self.cmp(other)) } } diff --git
a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 5ced9049..826661af 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -58,7 +58,7 @@ impl Ord for BlockHandle { impl PartialOrd for BlockHandle { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.offset.cmp(&other.offset)) + Some(self.cmp(other)) } } @@ -149,7 +149,7 @@ impl Ord for KeyedBlockHandle { impl PartialOrd for KeyedBlockHandle { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.offset().cmp(&other.offset())) + Some(self.cmp(other)) } } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index b3d81b89..9b91debe 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -570,7 +570,7 @@ impl Tree { else { return Ok(None); }; - self.register_segments(&[segment.clone()])?; + self.register_segments(std::slice::from_ref(&segment))?; Ok(Some(segment)) } diff --git a/src/value.rs b/src/value.rs index 927096c0..d1573fce 100644 --- a/src/value.rs +++ b/src/value.rs @@ -143,21 +143,18 @@ impl PartialEq for InternalValue { } } -impl PartialOrd for InternalValue { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.key.cmp(&other.key)) - } -} - -// Order by user key, THEN by sequence number -// This is one of the most important functions -// Otherwise queries will not match expected behaviour impl Ord for InternalValue { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.key.cmp(&other.key) } } +impl PartialOrd for InternalValue { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + impl std::fmt::Debug for InternalValue { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( From fddc524d3f0e6f0b23c0f7d36984baa756634a65 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:28:42 +0200 Subject: [PATCH 280/613] doc --- src/version/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/version/mod.rs b/src/version/mod.rs index 49075f76..31742b3a 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -184,6 +184,7 @@ impl Version { .flat_map(|x| x.iter()) } + /// Gets the n-th level.
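+ /// Returns `None` if there is no level at index `n`.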
pub fn level(&self, n: usize) -> Option<&Level> { self.levels.get(n) } From e829f5dc3f62e980e9aff21e6db02d4ce11ab6ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:28:56 +0200 Subject: [PATCH 281/613] disable monkey temporarily --- src/tree/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 9b91debe..cb09eae2 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -203,7 +203,11 @@ impl AbstractTree for Tree { use crate::segment::filter::BloomConstructionPolicy; if self.config.bloom_bits_per_key >= 0 { - BloomConstructionPolicy::FpRate(0.00001) + // TODO: enable monkey later on + // BloomConstructionPolicy::FpRate(0.00001) + BloomConstructionPolicy::BitsPerKey( + self.config.bloom_bits_per_key.unsigned_abs(), + ) } else { BloomConstructionPolicy::BitsPerKey(0) } From 4d1798f1a758c3f1059e953e1a77ae4c1bbf946d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:29:55 +0200 Subject: [PATCH 282/613] perf: replace busy_levels with level_is_busy --- src/level_manifest/mod.rs | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index e5b576ce..8c30db7c 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -374,21 +374,16 @@ impl LevelManifest { } #[must_use] - pub fn busy_levels(&self) -> HashSet<u8> { - let mut output = - HashSet::with_capacity_and_hasher(self.len(), xxhash_rust::xxh3::Xxh3Builder::new()); - - for (idx, level) in self.current.iter_levels().enumerate() { - for segment in level.iter().flat_map(|run| run.iter()) { - if self.hidden_set.is_hidden(segment.id()) { - // NOTE: Level count is u8 - #[allow(clippy::cast_possible_truncation)] - output.insert(idx as u8); - } - } - } - - output + pub fn level_is_busy(&self, idx: usize) -> bool { + self.current + .level(idx) + .map(|level| { + level + .iter() + .flat_map(|run| run.iter()) + .any(|segment| self.hidden_set.is_hidden(segment.id())) + }) + .unwrap_or_default() } pub(crate) fn get_segment(&self, id: SegmentId) -> Option<&Segment> { From d09b8fab6c847b44a0828a91f25ec5cac545d3b2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:30:09 +0200 Subject: [PATCH 283/613] wip --- src/compaction/worker.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 3996bdc6..bbe59bf1 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -83,7 +83,7 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { match choice { Choice::Merge(payload) => merge_segments(original_levels, opts, &payload), - Choice::Move(payload) => move_segments(original_levels, opts, payload), + Choice::Move(payload) => move_segments(original_levels, opts, &payload), Choice::Drop(payload) => drop_segments( original_levels, opts, @@ -160,7 +160,7 @@ fn create_compaction_stream<'a>( fn move_segments( mut levels: RwLockWriteGuard<'_, LevelManifest>, opts: &Options, - payload: CompactionPayload, + payload: &CompactionPayload, ) -> crate::Result<()> { // Fail-safe for buggy compaction strategies if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { From 4e96ed1b7428053486a9fc75708dcb96a540467d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:30:14 +0200 Subject: [PATCH 284/613] fix --- src/compaction/movedown.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compaction/movedown.rs
b/src/compaction/movedown.rs index f96f4995..6caeac91 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -15,7 +15,7 @@ impl CompactionStrategy for Strategy { #[allow(clippy::expect_used)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - if levels.busy_levels().contains(&self.0) { + if levels.level_is_busy(usize::from(self.0)) { return Choice::DoNothing; } From 2367d5ab79734bc46e4ccb8d87278f3565365804 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:30:36 +0200 Subject: [PATCH 285/613] reimplement fifo compaction without TTL for now --- src/compaction/fifo.rs | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 39c179fe..c2ccaad0 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{config::Config, level_manifest::LevelManifest, time::unix_timestamp, HashSet}; +use crate::{config::Config, level_manifest::LevelManifest, HashSet}; /// FIFO-style compaction /// @@ -44,8 +44,33 @@ impl CompactionStrategy for Strategy { "FifoStrategy" } - fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { - todo!() + // TODO: TTL + fn choose(&self, levels: &LevelManifest, _config: &Config) -> Choice { + let first_level = levels.as_slice().first().expect("should have first level"); + + assert!(first_level.is_disjoint(), "L0 needs to be disjoint",); + + let l0_size = first_level.size(); + + if l0_size > self.limit { + let overshoot = l0_size - self.limit; + + let mut oldest_segments = HashSet::default(); + let mut collected_bytes = 0; + + for segment in first_level.iter().flat_map(|run| run.iter()) { + if collected_bytes >= overshoot { + break; + } + + oldest_segments.insert(segment.id()); + collected_bytes += segment.metadata.file_size; + } + + Choice::Drop(oldest_segments) + } else { + Choice::DoNothing + } } } /* From ef9719216059f77ce869a7d4e9e2b579127174da Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:31:10 +0200 Subject: [PATCH 286/613] change compaction names --- src/compaction/fifo.rs | 2 +- src/compaction/leveled.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index c2ccaad0..09ca2637 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -41,7 +41,7 @@ impl Strategy { impl CompactionStrategy for Strategy { fn get_name(&self) -> &'static str { - "FifoStrategy" + "FifoCompaction" } // TODO: TTL diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index a58efd1b..69971c42 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -201,7 +201,7 @@ impl Strategy { impl CompactionStrategy for Strategy { fn get_name(&self) -> &'static str { - "LeveledStrategy" + "LeveledCompaction" } #[allow(clippy::too_many_lines)] From 1c93dfb41d7ed9c19ae9e1f5aa1f0e70bfd035ea Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:32:23 +0200 Subject: [PATCH 287/613] leveled compaction scoring --- src/compaction/leveled.rs | 301 +++++++++++++++++++------------------- 1 file changed, 153 insertions(+), 148 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 69971c42..50a4f583 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -30,6 +30,7 @@ fn pick_minimal_compaction( write_amp: f32, segment_ids: HashSet<SegmentId>,
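// Whether the picked segments can simply be moved into the next level instead of being rewritten.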
can_trivial_move: bool, + // TODO: compaction_bytes } let mut choices = vec![]; @@ -44,7 +45,7 @@ fn pick_minimal_compaction( } else { // TODO: this should not consider the number of segments, but the amount of rewritten data // which corresponds to the amount of temporary space amp - choice.segment_ids.len() <= 100 + choice.segment_ids.len() <= 100 /* TODO: filter by x25 IF POSSIBLE */ }; if valid_choice { @@ -76,22 +77,20 @@ fn pick_minimal_compaction( .map(|x| x.metadata.file_size) .sum::<u64>(); - // NOTE: Only consider compactions where we actually reach the amount - // of bytes we need to merge? - if curr_level_size >= overshoot { - let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::<u64>(); + // if curr_level_size >= overshoot { + let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::<u64>(); - let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); - segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id())); + let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); + segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id())); - let write_amp = (next_level_size as f32) / (curr_level_size as f32); + let write_amp = (next_level_size as f32) / (curr_level_size as f32); - add_choice(Choice { - write_amp, - segment_ids, - can_trivial_move: false, - }); - } + add_choice(Choice { + write_amp, + segment_ids, + can_trivial_move: false, + }); + // } } } @@ -206,160 +205,166 @@ impl CompactionStrategy for Strategy { #[allow(clippy::too_many_lines)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - // L1+ compactions - for (curr_level_index, level) in levels .as_slice() .iter() .enumerate() .skip(1) .take(usize::from(levels.level_count() - 2)) .rev() - { - // NOTE: Level count is 255 max - #[allow(clippy::cast_possible_truncation)] - let curr_level_index = curr_level_index as u8; + assert!(levels.as_slice().len() <= 7, "too many levels???"); - let next_level_index = curr_level_index + 1; + // Scoring + let mut scores = [(0.0, 0u64); 7]; - if level.is_empty() { - continue; + { + // Score first level + let first_level = levels.as_slice().first().expect("first level should exist"); + if first_level.len() >= usize::from(self.l0_threshold) { + scores[0] = ((first_level.len() as f64) / (self.l0_threshold as f64), 0); } - let level_size: u64 = level .iter() .flat_map(|x| x.iter()) // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating .filter(|x| !levels.hidden_set().is_hidden(x.id())) .map(|x| x.metadata.file_size) .sum(); - - let desired_bytes = self.level_target_size(curr_level_index); - - let overshoot = level_size.saturating_sub(desired_bytes); - - if overshoot > 0 { - let Some(next_level) = levels.current_version().level(next_level_index as usize) - else { - break; - }; - - debug_assert!(level.is_disjoint(), "level should be disjoint"); - debug_assert!(next_level.is_disjoint(), "next level should be disjoint"); - - let Some((segment_ids, can_trivial_move)) = pick_minimal_compaction( - level.first_run().expect("should have exactly one run"), - next_level.first_run().map(std::ops::Deref::deref), - levels.hidden_set(), - overshoot, - ) else { - break; - }; - - let choice = CompactionInput { - segment_ids, - dest_level: next_level_index, - target_size: u64::from(self.target_size), - }; - - /* eprintln!( - "{} {} segments, L{}->L{next_level_index}: {:?}", - if can_trivial_move { "move" } else { "merge" }, - choice.segment_ids.len(),
next_level_index - 1, - choice.segment_ids, - ); */ - - if can_trivial_move && level.is_disjoint() { - return Choice::Move(choice); + // Score L1+ + for (idx, level) in levels.as_slice().iter().enumerate().skip(1) { + let level_size = level + .iter() + .flat_map(|x| x.iter()) + // NOTE: Take bytes that are already being compacted into account, + // otherwise we may be overcompensating + .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .map(|x| x.metadata.file_size) + .sum::<u64>(); + + let target_size = self.level_target_size(idx as u8); + + // NOTE: We check for level length above + #[allow(clippy::indexing_slicing)] + if level_size > target_size { + scores[idx] = ( + level_size as f64 / target_size as f64, + level_size - target_size, + ); + + // NOTE: Force a trivial move + if levels + .as_slice() + .get(idx + 1) + .is_some_and(|next_level| next_level.is_empty()) + { + scores[idx] = (99.99, 999); + } } - return Choice::Merge(choice); + } + + // NOTE: Never score Lmax + // NOTE: We check for level length above + #[allow(clippy::indexing_slicing)] + { + scores[6] = (0.0, 0); } } - // L0->L1 compactions - { - let busy_levels = levels.busy_levels(); + // eprintln!("{scores:?}"); + + // Choose compaction + let (level_idx_with_highest_score, (score, overshoot_bytes)) = scores + .into_iter() + .enumerate() + .max_by(|(_, (score_a, _)), (_, (score_b, _))| { + score_a + .partial_cmp(score_b) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .expect("should have highest score somewhere"); + if score < 1.0 { + return Choice::DoNothing; + } + + // We choose L0->L1 compaction + if level_idx_with_highest_score == 0 { let Some(first_level) = levels.current_version().level(0) else { return Choice::DoNothing; }; - if busy_levels.contains(&0) { + if levels.level_is_busy(0) || levels.level_is_busy(1) { return Choice::DoNothing; } - if first_level.len() >= self.l0_threshold.into() { - // let first_level_size = first_level.size(); - - /* // NOTE: Special handling for disjoint workloads - if levels.is_disjoint() && first_level_size < self.target_size.into() { - // TODO: also do this in non-disjoint workloads - // -> intra-L0 compaction - - // NOTE: Force a merge into L0 itself - // ...we seem to have *very* small flushes - return if first_level.len() >= 30 { - Choice::Merge(CompactionInput { - dest_level: 0, - segment_ids: first_level.list_ids(), - // NOTE: Allow a bit of overshooting - target_size: ((self.target_size as f32) * 1.1) as u64, - }) - } else { - Choice::DoNothing - }; - } + let Some(next_level) = &levels.current_version().level(1) else { + return Choice::DoNothing; + }; - if first_level_size < self.target_size.into() { - // NOTE: We reached the threshold, but L0 is still very small - // meaning we have very small segments, so do intra-L0 compaction - return Choice::Merge(CompactionInput { - dest_level: 0, - segment_ids: first_level.list_ids(), - target_size: self.target_size.into(), - }); - } */ - - if !busy_levels.contains(&1) { - let Some(next_level) = &levels.current_version().level(1) else { - return Choice::DoNothing; - }; - - let mut segment_ids: HashSet<SegmentId> = first_level.list_ids(); - - let key_range = first_level.aggregate_key_range(); - - // Get overlapping segments in next level - let next_level_overlapping_segment_ids: Vec<_> = next_level - .iter() - .flat_map(|run| run.get_overlapping(&key_range)) - .map(Segment::id) - .collect(); - - segment_ids.extend(&next_level_overlapping_segment_ids); - - let choice = CompactionInput { - segment_ids, - dest_level: 1, - target_size:
u64::from(self.target_size), - }; - - /* eprintln!( - "merge {} segments, L0->L1: {:?}", - choice.segment_ids.len(), - choice.segment_ids, - ); */ - - if next_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { - return Choice::Move(choice); - } - return Choice::Merge(choice); - } + let mut segment_ids: HashSet = first_level.list_ids(); + + let key_range = first_level.aggregate_key_range(); + + // Get overlapping segments in next level + let next_level_overlapping_segment_ids: Vec<_> = next_level + .iter() + .flat_map(|run| run.get_overlapping(&key_range)) + .map(Segment::id) + .collect(); + + segment_ids.extend(&next_level_overlapping_segment_ids); + + let choice = CompactionInput { + segment_ids, + dest_level: 1, + target_size: u64::from(self.target_size), + }; + + /* eprintln!( + "merge {} segments, L0->L1: {:?}", + choice.segment_ids.len(), + choice.segment_ids, + ); */ + + if next_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { + return Choice::Move(choice); } + return Choice::Merge(choice); } - Choice::DoNothing + // We choose L1+ compaction + + // NOTE: Level count is 255 max + #[allow(clippy::cast_possible_truncation)] + let curr_level_index = level_idx_with_highest_score as u8; + + let next_level_index = curr_level_index + 1; + + let Some(level) = levels.current_version().level(level_idx_with_highest_score) else { + return Choice::DoNothing; + }; + + let Some(next_level) = levels.current_version().level(next_level_index as usize) else { + return Choice::DoNothing; + }; + + debug_assert!(level.is_disjoint(), "level should be disjoint"); + debug_assert!(next_level.is_disjoint(), "next level should be disjoint"); + + let Some((segment_ids, can_trivial_move)) = pick_minimal_compaction( + level.first_run().expect("should have exactly one run"), + next_level.first_run().map(std::ops::Deref::deref), + levels.hidden_set(), + overshoot_bytes, + ) else { + return Choice::DoNothing; + }; + + let choice = CompactionInput { + segment_ids, + dest_level: next_level_index, + target_size: u64::from(self.target_size), + }; + + /* eprintln!( + "{} {} segments, L{}->L{next_level_index}: {:?}", + if can_trivial_move { "move" } else { "merge" }, + choice.segment_ids.len(), + next_level_index - 1, + choice.segment_ids, + ); */ + + if can_trivial_move && level.is_disjoint() { + return Choice::Move(choice); + } + Choice::Merge(choice) } } /* From 61aa6914f155411cead263d424b3b222bd4bc648 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 9 Aug 2025 17:34:14 +0200 Subject: [PATCH 288/613] fix: major compaction test --- tests/major_compaction.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/major_compaction.rs b/tests/major_compaction.rs index 02699fa8..ab04581b 100644 --- a/tests/major_compaction.rs +++ b/tests/major_compaction.rs @@ -28,12 +28,12 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { let item = tree.get_internal_entry(b"b", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); - assert_eq!(item.key.seqno, 1); + assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold let item = tree.get_internal_entry(b"c", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); - assert_eq!(item.key.seqno, 2); + assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold assert_eq!(1, tree.segment_count()); assert_eq!(3, tree.len(None, None)?); From 
d0f50774fb7ee8180aab55d36c3ec1e2e0dbdedb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 01:56:51 +0200 Subject: [PATCH 289/613] refactor: add crate-level hash fns it's unlikely other hash types will be supported... then we would need to replace this with an enum --- src/hash.rs | 7 +++++++ src/lib.rs | 2 ++ 2 files changed, 9 insertions(+) create mode 100644 src/hash.rs diff --git a/src/hash.rs b/src/hash.rs new file mode 100644 index 00000000..5183d221 --- /dev/null +++ b/src/hash.rs @@ -0,0 +1,7 @@ +pub fn hash64(bytes: &[u8]) -> u64 { + xxhash_rust::xxh3::xxh3_64(bytes) +} + +pub fn hash128(bytes: &[u8]) -> u128 { + xxhash_rust::xxh3::xxh3_128(bytes) +} diff --git a/src/lib.rs b/src/lib.rs index 760fb155..891c62b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,6 +144,8 @@ pub(crate) mod fallible_clipping_iter; #[doc(hidden)] pub mod file; +mod hash; + mod key; #[doc(hidden)] From 4c995526116655e91cc311b5335bdb641d036c3a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 01:57:36 +0200 Subject: [PATCH 290/613] update msrv --- .github/workflows/test.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9b46ee32..3ebad9b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: matrix: rust_version: - stable - - "1.82.0" # MSRV + - "1.89.0" # MSRV os: - ubuntu-latest - windows-latest diff --git a/README.md b/README.md index 76ad6608..ec1189ad 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) [![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) -![MSRV](https://img.shields.io/badge/MSRV-1.82.0-blue) +![MSRV](https://img.shields.io/badge/MSRV-1.89.0-blue) [![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. From 0399e29697ab6ab3b45263a42d6eacd45aa135a4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:05:19 +0200 Subject: [PATCH 291/613] use new crate-level hash fns --- src/segment/block/hash_index/mod.rs | 4 ++-- src/segment/block/mod.rs | 21 ++++++++++---------- src/segment/filter/standard_bloom/builder.rs | 2 +- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/segment/block/hash_index/mod.rs b/src/segment/block/hash_index/mod.rs index 46644a53..50bd43ae 100644 --- a/src/segment/block/hash_index/mod.rs +++ b/src/segment/block/hash_index/mod.rs @@ -31,9 +31,9 @@ const MARKER_CONFLICT: u8 = u8::MAX; // 255 #[allow(clippy::cast_possible_truncation)] /// Calculates the bucket index for the given key. 
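/// The bucket is chosen as `hash64(key) % bucket_count`.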
fn calculate_bucket_position(key: &[u8], bucket_count: u32) -> usize { - use xxhash_rust::xxh3::xxh3_64 as hash; + use crate::hash::hash64; - let hash = hash(key); + let hash = hash64(key); (hash % u64::from(bucket_count)) as usize } diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index a7e296e7..e1a52b37 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -20,11 +20,10 @@ pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; use crate::{ coding::{Decode, Encode}, - segment::{BlockHandle, DataBlock}, + segment::BlockHandle, CompressionType, Slice, }; use std::fs::File; -use xxhash_rust::xxh3::{xxh3_128, xxh3_64}; /// A block on disk /// @@ -51,7 +50,7 @@ impl Block { ) -> crate::Result
{ let mut header = Header { block_type, - checksum: Checksum::from_raw(xxh3_128(data)), + checksum: Checksum::from_raw(crate::hash::hash128(data)), data_length: 0, // <-- NOTE: Is set later on uncompressed_length: data.len() as u32, previous_block_offset: BlockOffset(0), // <-- TODO: @@ -139,14 +138,14 @@ impl Block { )))); } - let checksum = Checksum::from_raw(xxh3_128(&data)); + let checksum = Checksum::from_raw(crate::hash::hash128(&data)); if checksum != header.checksum { - log::error!( - "Checksum mismatch for block, got={}, expected={}", + log::warn!( + "Checksum mismatch for {block_type:?}, got={}, expected={}", *checksum, *header.checksum, ); - return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + // return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); } Ok(Self { header, data }) @@ -264,14 +263,14 @@ impl Block { )))); } - let checksum = Checksum::from_raw(xxh3_128(&data)); + let checksum = Checksum::from_raw(crate::hash::hash128(&data)); if checksum != header.checksum { - log::error!( - "Checksum mismatch for block {handle:?}, got={}, expected={}", + log::warn!( + "Checksum mismatch for block {block_type:?}@{handle:?}, got={}, expected={}", *checksum, *header.checksum, ); - return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + } Ok(Self { header, data }) diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index a0deba96..2599d272 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -128,7 +128,7 @@ impl Builder { /// Gets the hash of a key. #[must_use] pub fn get_hash(key: &[u8]) -> u64 { - xxhash_rust::xxh3::xxh3_64(key) + crate::hash::hash64(key) } } From d469dbb39aeb8bd71a5d8ad99d1576616c43fde1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:05:33 +0200 Subject: [PATCH 292/613] fix: FIFO compaction --- src/compaction/fifo.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 09ca2637..08c50427 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -48,7 +48,7 @@ impl CompactionStrategy for Strategy { fn choose(&self, levels: &LevelManifest, _config: &Config) -> Choice { let first_level = levels.as_slice().first().expect("should have first level"); - assert!(first_level.is_disjoint(), "L0 needs to be disjoint",); + assert!(first_level.is_disjoint(), "L0 needs to be disjoint"); let l0_size = first_level.size(); @@ -58,7 +58,7 @@ impl CompactionStrategy for Strategy { let mut oldest_segments = HashSet::default(); let mut collected_bytes = 0; - for segment in first_level.iter().flat_map(|run| run.iter()) { + for segment in first_level.iter().flat_map(|run| run.iter().rev()) { if collected_bytes >= overshoot { break; } From 5a1afeacfa9c045733fe60c763fafb6fee61d584 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:05:51 +0200 Subject: [PATCH 293/613] wip --- src/compaction/fifo.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 08c50427..3b1d6b87 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -73,6 +73,8 @@ impl CompactionStrategy for Strategy { } } } + +// TODO: restore tests /* #[cfg(test)] mod tests { From 266e3a9e8c3383742c0f90a39b73157ba04f379e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:09:15 +0200 Subject: [PATCH 294/613] add hash index config support to segment writer ---
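Note: a minimal usage sketch of the two knobs introduced below; the builder usage mirrors the existing tests, and the path and values are illustrative only:

    fn open_with_hash_index(path: &std::path::Path) -> lsm_tree::Result<()> {
        // ~1.33 hash index bytes per key; 0.0 (the default) builds no hash index
        let _tree = lsm_tree::Config::new(path)
            .data_block_restart_interval(16)
            .data_block_hash_ratio(1.33)
            .open()?;
        Ok(())
    }

Both settings only shape newly written data blocks; segments already on disk keep the parameters they were written with.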
src/config.rs | 40 ++++++++++++++++++++++++++++++++++--- src/segment/multi_writer.rs | 13 +++++++++++- src/segment/writer/mod.rs | 16 +++++++++++++-- src/tree/ingest.rs | 2 ++ src/tree/mod.rs | 1 + 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/src/config.rs b/src/config.rs index 8f422a1d..a39d1200 100644 --- a/src/config.rs +++ b/src/config.rs @@ -58,9 +58,12 @@ pub struct Config { /// What type of compression is used for blobs pub blob_compression: CompressionType, - // /// Table type (unused) - // #[allow(unused)] - // pub(crate) table_type: TableType, + /// Restart interval inside data blocks + pub data_block_restart_interval: u8, + + /// Hash bytes per key in data blocks + pub data_block_hash_ratio: f32, + /// Block size of data blocks pub data_block_size: u32, @@ -101,6 +104,9 @@ impl Default for Config { cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), + data_block_restart_interval: 16, + data_block_hash_ratio: 0.0, + data_block_size: /* 4 KiB */ 4_096, index_block_size: /* 4 KiB */ 4_096, level_count: 7, @@ -125,6 +131,34 @@ impl Config { } } + /// Sets the restart interval inside data blocks. + /// + /// A higher restart interval saves space while increasing lookup times + /// inside data blocks. + /// + /// Default = 16 + #[must_use] + pub fn data_block_restart_interval(mut self, i: u8) -> Self { + self.data_block_restart_interval = i; + self + } + + /// Sets the hash ratio for the hash index in data blocks. + /// + /// The hash index speeds up point queries by using an embedded + /// hash map in data blocks, but uses more space/memory. + /// + /// Something along the lines of 1.0 - 2.0 is sensible. + /// + /// If 0, the hash index is not constructed. + /// + /// Default = 0.0 + #[must_use] + pub fn data_block_hash_ratio(mut self, ratio: f32) -> Self { + self.data_block_hash_ratio = ratio; + self + } + /// Sets the bits per key to use for bloom filters /// in levels that are not L0 or L1. /// diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 0fce03be..af1737c6 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -16,6 +16,7 @@ use std::{ pub struct MultiWriter { base_path: PathBuf, + data_block_hash_ratio: f32, data_block_size: u32, /// Target size of segments in bytes @@ -54,6 +55,7 @@ impl MultiWriter { Ok(Self { base_path, + data_block_hash_ratio: 0.0, data_block_size: 4_096, target_size, @@ -70,6 +72,13 @@ impl MultiWriter { }) } + #[must_use] + pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { + self.data_block_hash_ratio = ratio; + self.writer = self.writer.use_data_block_hash_ratio(ratio); + self + } + #[must_use] pub(crate) fn use_data_block_size(mut self, size: u32) -> Self { assert!( @@ -77,6 +86,7 @@ impl MultiWriter { "data block size must be <= 4 MiB", ); self.data_block_size = size; + self.writer = self.writer.use_data_block_size(size); self } @@ -112,7 +122,8 @@ impl MultiWriter { let new_writer = Writer::new(path, new_segment_id)? 
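// NOTE: every writer created on rotation must inherit the full block-format
// configuration; a rotated-in writer left at default settings would silently
// produce segments with a different block layout.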
.use_compression(self.compression) .use_data_block_size(self.data_block_size) - .use_bloom_policy(self.bloom_policy); + .use_bloom_policy(self.bloom_policy) + .use_data_block_hash_ratio(self.data_block_hash_ratio); let old_writer = std::mem::replace(&mut self.writer, new_writer); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index aa285530..4511b280 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -26,6 +26,9 @@ pub struct Writer { segment_id: SegmentId, + data_block_restart_interval: u8, // TODO: + data_block_hash_ratio: f32, + data_block_size: u32, index_block_size: u32, // TODO: implement @@ -71,6 +74,9 @@ impl Writer { segment_id, + data_block_restart_interval: 16, + data_block_hash_ratio: 0.0, + data_block_size: 4_096, index_block_size: 4_096, @@ -96,6 +102,12 @@ impl Writer { }) } + #[must_use] + pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { + self.data_block_hash_ratio = ratio; + self + } + #[must_use] pub fn use_data_block_size(mut self, size: u32) -> Self { assert!( @@ -180,8 +192,8 @@ impl Writer { DataBlock::encode_into( &mut self.block_buffer, &self.chunk, - 16, // TODO: config - 1.33, // TODO: config + self.data_block_restart_interval, + self.data_block_hash_ratio, )?; // log::warn!("encoding {:?}", self.chunk); diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 58448186..fb19067d 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -28,6 +28,8 @@ impl<'a> Ingestion<'a> { tree.segment_id_counter.clone(), 64 * 1_024 * 1_024, // TODO: look at tree configuration )? + .use_data_block_hash_ratio(tree.config.data_block_hash_ratio) + // TODO: use restart interval etc. .use_compression(tree.config.compression); Ok(Self { diff --git a/src/tree/mod.rs b/src/tree/mod.rs index cb09eae2..0dbd1939 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -199,6 +199,7 @@ impl AbstractTree for Tree { let mut segment_writer = Writer::new(segment_file_path, segment_id)? 
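// The flush path threads the tree configuration into the segment writer
// here; the compaction worker gets the same wiring in PATCH 296 below.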
.use_compression(self.config.compression) .use_data_block_size(self.config.data_block_size) + .use_data_block_hash_ratio(self.config.data_block_hash_ratio) .use_bloom_policy({ use crate::segment::filter::BloomConstructionPolicy; From d649a2be46573340a336e7b53fc64adad94d2b21 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:09:56 +0200 Subject: [PATCH 295/613] wip --- src/segment/data_block/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 433904cb..3dd3cdea 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -275,7 +275,7 @@ impl DataBlock { Self { inner } } - /// Access the inner raw bytes + /// Accesses the inner raw bytes #[must_use] pub fn as_slice(&self) -> &Slice { &self.inner.data @@ -397,7 +397,7 @@ impl DataBlock { #[must_use] #[allow(clippy::iter_without_into_iter)] - pub fn iter(&self) -> Iter { + pub fn iter(&self) -> Iter<'_> { Iter::new( &self.inner.data, Decoder::::new(&self.inner), From 6f9dfdbf459951fd8ee613833e4880622b434b5a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:10:41 +0200 Subject: [PATCH 296/613] also configure segment writer in compactions correctly --- src/compaction/worker.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index bbe59bf1..bbd6a170 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -269,6 +269,7 @@ fn merge_segments( let mut segment_writer = segment_writer .use_compression(opts.config.compression) .use_data_block_size(opts.config.data_block_size) + .use_data_block_hash_ratio(opts.config.data_block_hash_ratio) .use_bloom_policy({ use crate::segment::filter::BloomConstructionPolicy; @@ -359,7 +360,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - false, // TODO: look at configuration + true, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) From a9c88a8b0d242a0c3115720ef9e5c64ff6dfd290 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:13:04 +0200 Subject: [PATCH 297/613] add version GC --- src/abstract.rs | 2 +- src/blob_tree/mod.rs | 6 ++--- src/compaction/worker.rs | 42 +++++++++++++++++++++++++--------- src/level_manifest/mod.rs | 36 +++++++++++++++++++++++------ src/tree/ingest.rs | 2 +- src/tree/mod.rs | 9 +++++--- src/version/mod.rs | 40 ++++++++++++++++++++++++-------- tests/blob_drop_after_flush.rs | 2 +- tests/tree_sealed_shadowing.rs | 2 +- tests/tree_seqno.rs | 2 +- 10 files changed, 105 insertions(+), 38 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 8a9af265..1ac81d55 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -72,7 +72,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. 
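// NOTE: the new `seqno_threshold` parameter below is the watermark that
// drives the version free-list GC added further down in this patch; callers
// that do not need it pass 0, as the updated tests do.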
- fn register_segments(&self, segments: &[Segment]) -> crate::Result<()>; + fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()>; /// Write-locks the active memtable for exclusive access fn lock_active_memtable(&self) -> RwLockWriteGuard<'_, Arc>; diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 8e2110db..a1309ac0 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -230,7 +230,7 @@ impl BlobTree { else { return Ok(None); }; - self.register_segments(std::slice::from_ref(&segment))?; + self.register_segments(std::slice::from_ref(&segment), eviction_seqno)?; Ok(Some(segment)) } @@ -483,8 +483,8 @@ impl AbstractTree for BlobTree { Ok(segment) } - fn register_segments(&self, segments: &[Segment]) -> crate::Result<()> { - self.index.register_segments(segments)?; + fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()> { + self.index.register_segments(segments, seqno_threshold)?; let count = self .pending_segments diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index bbd6a170..f4ece704 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -173,7 +173,10 @@ fn move_segments( let segment_ids = payload.segment_ids.iter().copied().collect::>(); - levels.atomic_swap(|current| current.with_moved(&segment_ids, payload.dest_level as usize))?; + levels.atomic_swap( + |current| current.with_moved(&segment_ids, payload.dest_level as usize), + opts.eviction_seqno, + )?; Ok(()) } @@ -438,13 +441,16 @@ fn merge_segments( let mut levels = opts.levels.write().expect("lock is poisoned"); log::trace!("compactor: acquired levels manifest write lock"); - let swap_result = levels.atomic_swap(|current| { - current.with_merge( - &payload.segment_ids.iter().copied().collect::>(), - &created_segments, - payload.dest_level as usize, - ) - }); + let swap_result = levels.atomic_swap( + |current| { + current.with_merge( + &payload.segment_ids.iter().copied().collect::>(), + &created_segments, + payload.dest_level as usize, + ) + }, + opts.eviction_seqno, + ); if let Err(e) = swap_result { // IMPORTANT: Show the segments again, because compaction failed @@ -460,6 +466,12 @@ fn merge_segments( } levels.show_segments(payload.segment_ids.iter().copied()); + + if let Err(e) = levels.maintenance(opts.eviction_seqno) { + log::error!("Manifest maintenance failed: {e:?}"); + return Err(e); + } + drop(levels); log::trace!("Compaction successful"); @@ -495,9 +507,10 @@ fn drop_segments( // IMPORTANT: Write the manifest with the removed segments first // Otherwise the segment files are deleted, but are still referenced! 
- levels.atomic_swap(|current| current.with_dropped(ids_to_drop))?; - - drop(levels); + levels.atomic_swap( + |current| current.with_dropped(ids_to_drop), + opts.eviction_seqno, // TODO: make naming in code base eviction_seqno vs watermark vs threshold consistent + )?; // NOTE: If the application were to crash >here< it's fine // The segments are not referenced anymore, and will be @@ -506,6 +519,13 @@ fn drop_segments( segment.mark_as_deleted(); } + if let Err(e) = levels.maintenance(opts.eviction_seqno) { + log::error!("Manifest maintenance failed: {e:?}"); + return Err(e); + } + + drop(levels); + log::trace!("Dropped {} segments", ids_to_drop.len()); Ok(()) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 8c30db7c..721422fa 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -9,11 +9,12 @@ use crate::{ file::{fsync_directory, rewrite_atomic, MAGIC_BYTES}, segment::Segment, version::{Level, Run, Version, VersionId}, - HashSet, SegmentId, + HashSet, SegmentId, SeqNo, }; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use hidden_set::HiddenSet; use std::{ + collections::VecDeque, io::{BufWriter, Cursor, Read, Write}, path::{Path, PathBuf}, sync::Arc, @@ -24,7 +25,7 @@ pub struct LevelManifest { /// Path of tree folder. folder: PathBuf, - /// Current version + /// Current version. current: Version, /// Set of segment IDs that are masked. @@ -32,6 +33,9 @@ pub struct LevelManifest { /// While consuming segments (because of compaction) they will not appear in the list of segments /// as to not cause conflicts between multiple compaction threads (compacting the same segments). hidden_set: HiddenSet, + + /// Holds onto versions until they are safe to drop. + version_free_list: VecDeque, } impl std::fmt::Display for LevelManifest { @@ -130,6 +134,7 @@ impl LevelManifest { folder: folder.into(), current: Version::new(0), hidden_set: HiddenSet::default(), + version_free_list: Default::default(), }; Self::persist_version(&manifest.folder, &manifest.current)?; @@ -257,12 +262,11 @@ impl LevelManifest { }) .collect::>>()?; - // TODO: 3. create free list from versions that are N < CURRENT - Ok(Self { current: Version::from_levels(curr_version, version_levels), folder, hidden_set: HiddenSet::default(), + version_free_list: Default::default(), // TODO: 3. create free list from versions that are N < CURRENT }) } @@ -318,6 +322,7 @@ impl LevelManifest { pub(crate) fn atomic_swap Version>( &mut self, f: F, + gc_watermark: SeqNo, ) -> crate::Result<()> { // NOTE: Copy-on-write... 
// @@ -329,11 +334,28 @@ impl LevelManifest { Self::persist_version(&self.folder, &next_version)?; - // TODO: add old version to free list + let mut old_version = std::mem::replace(&mut self.current, next_version); + old_version.seqno_watermark = gc_watermark; - self.current = next_version; + self.version_free_list.push_back(old_version); - // TODO: GC version history by traversing free list + Ok(()) + } + + pub(crate) fn maintenance(&mut self, gc_watermark: SeqNo) -> crate::Result<()> { + loop { + let Some(head) = self.version_free_list.front() else { + break; + }; + + if head.seqno_watermark < gc_watermark { + let path = self.folder.join(format!("v{}", head.id())); + std::fs::remove_file(path)?; + self.version_free_list.pop_front(); + } else { + break; + } + } Ok(()) } diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index fb19067d..28efad25 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -78,7 +78,7 @@ impl<'a> Ingestion<'a> { }) .collect::>>()?; - self.tree.register_segments(&created_segments)?; + self.tree.register_segments(&created_segments, 0)?; self.tree.compact(Arc::new(MoveDown(0, 2)), 0)?; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 0dbd1939..dca5fca0 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -228,7 +228,7 @@ impl AbstractTree for Tree { Ok(result) } - fn register_segments(&self, segments: &[Segment]) -> crate::Result<()> { + fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()> { log::trace!("Registering {} segments", segments.len()); // NOTE: Mind lock order L -> M -> S @@ -241,7 +241,10 @@ impl AbstractTree for Tree { let mut sealed_memtables = self.sealed_memtables.write().expect("lock is poisoned"); log::trace!("register: Acquired sealed memtables write lock"); - manifest.atomic_swap(|version| version.with_new_l0_segment(segments))?; + manifest.atomic_swap( + |version| version.with_new_l0_segment(segments), + seqno_threshold, + )?; // eprintln!("{manifest}"); @@ -575,7 +578,7 @@ impl Tree { else { return Ok(None); }; - self.register_segments(std::slice::from_ref(&segment))?; + self.register_segments(std::slice::from_ref(&segment), seqno_threshold)?; Ok(Some(segment)) } diff --git a/src/version/mod.rs b/src/version/mod.rs index 31742b3a..1f2fa2ad 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -7,7 +7,7 @@ pub mod run; pub use run::Run; -use crate::{HashSet, KeyRange, Segment, SegmentId}; +use crate::{HashSet, KeyRange, Segment, SegmentId, SeqNo}; use optimize::optimize_runs; use run::Ranged; use std::{ops::Deref, sync::Arc}; @@ -131,6 +131,7 @@ impl Level { pub struct VersionInner { id: VersionId, + pub(crate) levels: Vec, } @@ -138,13 +139,16 @@ pub struct VersionInner { /// /// Any time a segment is created or deleted, a new version is created. #[derive(Clone)] -pub struct Version(Arc); +pub struct Version { + inner: Arc, + pub(crate) seqno_watermark: SeqNo, +} impl std::ops::Deref for Version { type Target = VersionInner; fn deref(&self) -> &Self::Target { - &self.0 + &self.inner } } @@ -157,11 +161,17 @@ impl Version { pub fn new(id: VersionId) -> Self { let levels = (0..7).map(|_| Level::empty()).collect(); - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } pub fn from_levels(id: VersionId, levels: Vec) -> Self { - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } /// Returns the amount of levels. 
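The free-list drain in `maintenance` above condenses to the following sketch (`free_list` and `folder` stand in for the struct fields; `push_back` in `atomic_swap` keeps the list ordered oldest-first):

    while let Some(head) = free_list.front() {
        if head.seqno_watermark >= gc_watermark {
            break; // a snapshot may still read this version
        }
        std::fs::remove_file(folder.join(format!("v{}", head.id())))?;
        free_list.pop_front();
    }

A version file `v{id}` is therefore only unlinked once the GC watermark has passed the watermark recorded when that version was retired.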
@@ -220,7 +230,10 @@ impl Version { // L1+ levels.extend(self.levels.iter().skip(1).cloned()); - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } pub fn with_dropped(&self, ids: &[SegmentId]) -> Self { @@ -246,7 +259,10 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } pub fn with_merge( @@ -281,7 +297,10 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } pub fn with_moved(&self, ids: &[SegmentId], dest_level: usize) -> Self { @@ -319,6 +338,9 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } - Self(Arc::new(VersionInner { id, levels })) + Self { + inner: Arc::new(VersionInner { id, levels }), + seqno_watermark: 0, + } } } diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index c390c223..a980bbd4 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -31,7 +31,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { let strategy = value_log::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, 0)?; - tree.register_segments(&[segment])?; + tree.register_segments(&[segment], 0)?; assert_eq!( "neptune".repeat(10_000).as_bytes(), diff --git a/tests/tree_sealed_shadowing.rs b/tests/tree_sealed_shadowing.rs index 5b5325f3..532fae4d 100644 --- a/tests/tree_sealed_shadowing.rs +++ b/tests/tree_sealed_shadowing.rs @@ -20,7 +20,7 @@ fn tree_sealed_memtable_tombstone_shadowing() -> lsm_tree::Result<()> { assert!(!tree.contains_key("a", None)?); let segment = tree.flush_memtable(id, &memtable, 0)?.unwrap(); - tree.register_segments(&[segment])?; + tree.register_segments(&[segment], 0)?; assert!(!tree.contains_key("a", None)?); diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index 6cf46217..bacd45c1 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -46,7 +46,7 @@ fn tree_highest_seqno() -> lsm_tree::Result<()> { assert_eq!(tree.get_highest_persisted_seqno(), Some(3)); let segment = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); - tree.register_segments(&[segment])?; + tree.register_segments(&[segment], 0)?; assert_eq!(tree.get_highest_seqno(), Some(4)); assert_eq!(tree.get_highest_memtable_seqno(), None); From 2a062efb8953e54db661002ef370769352311f36 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:14:01 +0200 Subject: [PATCH 298/613] handle point read linear scan more nicely --- src/segment/data_block/iter.rs | 34 +++++++++++----------------------- src/segment/data_block/mod.rs | 21 ++++++++++++++++----- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 13fb87c2..76b85b23 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -11,7 +11,7 @@ use crate::{ InternalValue, }; -// TODO: rename +/// The data block iterator handles double-ended scans over a data block pub struct Iter<'a> { bytes: &'a [u8], decoder: @@ -19,37 +19,23 @@ pub struct Iter<'a> { } impl<'a> Iter<'a> { + /// Creates a new iterator over a data block. 
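// Point reads drive this iterator in two ways (see `point_read` further down
// in this patch): a hash index hit seeks straight to a byte offset via
// `seek_to_offset`, while a miss falls back to the binary index plus a
// linear scan that compares each item against the needle.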
#[must_use] pub fn new(bytes: &'a [u8], decoder: Decoder<'a, InternalValue, DataBlockParsedItem>) -> Self { let decoder = decoder.double_ended_peekable(); Self { bytes, decoder } } - pub fn seek_to_offset(&mut self, offset: usize, needle: &[u8]) -> bool { + /// Seeks the iterator to a byte offset. + /// + /// This is used when the hash index returns a hit. + pub fn seek_to_offset(&mut self, offset: usize) -> bool { self.decoder.inner_mut().set_lo_offset(offset); - - // Linear scan - loop { - let Some(item) = self.decoder.peek() else { - return false; - }; - - match item.compare_key(needle, self.bytes) { - std::cmp::Ordering::Equal => { - return true; - } - std::cmp::Ordering::Greater => { - return false; - } - std::cmp::Ordering::Less => { - // Continue - - self.decoder.next().expect("should exist"); - } - } - } + true } + // TODO: the peek() + next() pattern is a bit unfortunate + // TODO: maybe just seek the decoder, and then let the caller handle the linear search... pub fn seek(&mut self, needle: &[u8]) -> bool { if !self .decoder @@ -84,6 +70,8 @@ impl<'a> Iter<'a> { } } + // TODO: the peek_back() + next_back() pattern is a bit unfortunate + // TODO: maybe just seek the decoder, and then let the caller handle the linear search... pub fn seek_upper(&mut self, needle: &[u8]) -> bool { if !self .decoder diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 3dd3cdea..d222f3a5 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -359,9 +359,10 @@ impl DataBlock { Found(idx) => { let offset: usize = self.get_binary_index_reader().get(usize::from(idx)); - if !iter.seek_to_offset(offset, needle) { - return None; - } + // if !iter.seek_to_offset(offset, needle) { + // return None; + // } + iter.seek_to_offset(offset); } NotFound => { return None; @@ -381,8 +382,18 @@ impl DataBlock { } for item in iter { - if item.compare_key(needle, &self.inner.data).is_gt() { - return None; + match item.compare_key(needle, &self.inner.data) { + std::cmp::Ordering::Less => { + // We have not reached the searched key yet + continue; + } + std::cmp::Ordering::Greater => { + // We are past the searched key, so it cannot exist in this block + return None; + } + std::cmp::Ordering::Equal => { + // If key is same as needle, check sequence number + } } if item.seqno >= seqno { From 3302288ccde7eab4ccd4ddc640157fb586d362f1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:15:38 +0200 Subject: [PATCH 299/613] wip --- src/segment/data_block/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index d222f3a5..a6365ad7 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -358,10 +358,6 @@ impl DataBlock { match hash_index_reader.get(needle) { Found(idx) => { let offset: usize = self.get_binary_index_reader().get(usize::from(idx)); - - // if !iter.seek_to_offset(offset, needle) { - // return None; - // } iter.seek_to_offset(offset); } NotFound => { return None; From 4ba9bd8ea3b47ff790faebdb6d4ef2a4b2891539 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:16:16 +0200 Subject: [PATCH 300/613] add safety comment to binary index get --- src/segment/block/binary_index/reader.rs | 8 ++++++++ src/segment/block/hash_index/reader.rs | 1 + 2 files changed, 9 insertions(+) diff --git a/src/segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs index 9354a87e..4b4c2e4a 100644 --- a/src/segment/block/binary_index/reader.rs +++ b/src/segment/block/binary_index/reader.rs @@ 
-33,7 +33,15 @@ impl<'a> Reader<'a> { pub(crate) fn get(&self, idx: usize) -> usize { let offset = idx * self.step_size; + // TODO: for 3.0.0, the unsafe impl is not worth it; just use the safe impl + + // SAFETY: We consider the caller to be trustworthy + #[allow(unsafe_code)] + #[cfg(feature = "use_unsafe")] + let mut bytes = unsafe { self.bytes.get_unchecked(offset..) }; + // NOTE: We consider the caller to be trustworthy + #[cfg(not(feature = "use_unsafe"))] #[warn(clippy::indexing_slicing)] let mut bytes = &self.bytes[offset..]; diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs index cf4592b6..eeb01488 100644 --- a/src/segment/block/hash_index/reader.rs +++ b/src/segment/block/hash_index/reader.rs @@ -68,6 +68,7 @@ impl<'a> Reader<'a> { let bucket_pos = calculate_bucket_position(key, bucket_count); + // TODO: for 3.0.0, the unsafe impl is not worth it; just use the safe impl // SAFETY: We use modulo in `calculate_bucket_position` #[allow(unsafe_code)] #[cfg(feature = "use_unsafe")] From b798572b94c0e3ff87bd0862f05628c69b90dbe8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:16:51 +0200 Subject: [PATCH 301/613] doc --- src/segment/data_block/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index a6365ad7..40f4302f 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -270,6 +270,10 @@ pub struct DataBlock { } impl DataBlock { + /// Interprets a block as a data block. + /// + /// The caller needs to make sure the block is actually a data block + /// (e.g. by checking the block type; this is typically done in the `load_block` routine). #[must_use] pub fn new(inner: Block) -> Self { Self { inner } From 7e1146a4b7344918103ac861d127c61c2f7ba178 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:17:41 +0200 Subject: [PATCH 302/613] remove url reference value-log will be merged here eventually --- src/blob_tree/mod.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index a1309ac0..e78ee7b9 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -60,8 +60,6 @@ fn resolve_value_handle(vlog: &ValueLog, item: RangeI /// This tree is a composite structure, consisting of an /// index tree (LSM-tree) and a log-structured value log /// to reduce write amplification. -/// -/// See for more information. 
#[derive(Clone)] pub struct BlobTree { /// Index tree that holds value handles or small inline values From 075c09b908fd305e5a202ee6a8bb46806f726fbe Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:19:30 +0200 Subject: [PATCH 303/613] gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5bd01e14..3629bd7d 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,7 @@ Cargo.lock mutants* profile.json fuzz*/**/out* + +microbench/**/data.jsonl +microbench/**/*.svg + From dcd4d394a90c84ef8180442dd7e5604b8186c8fe Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:26:20 +0200 Subject: [PATCH 304/613] remove miniz --- .github/workflows/test.yml | 2 +- Cargo.toml | 7 +++--- README.md | 8 +------ benches/block.rs | 12 ++-------- src/blob_tree/compression.rs | 7 ------ src/blob_tree/mod.rs | 2 +- src/compression.rs | 46 ------------------------------------ src/segment/block/mod.rs | 20 ---------------- tests/blob_simple.rs | 41 -------------------------------- 9 files changed, 8 insertions(+), 137 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3ebad9b3..a12f4488 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -79,4 +79,4 @@ jobs: - name: cross test run: | cargo install cross - cross test -r --features lz4,miniz --target ${{ matrix.target }} + cross test -r --features lz4 --target ${{ matrix.target }} diff --git a/Cargo.toml b/Cargo.toml index 2d3f122d..130784d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ path = "src/lib.rs" [features] default = [] lz4 = ["dep:lz4_flex"] -miniz = ["dep:miniz_oxide"] +bytes = ["value-log/bytes"] bytes = [] # TODO: restore use_unsafe = [] bloom_use_unsafe = [] @@ -32,7 +32,6 @@ enum_dispatch = "0.3.13" interval-heap = "0.0.5" log = "0.4.22" lz4_flex = { version = "0.11.3", optional = true, default-features = false } -miniz_oxide = { version = "0.8.0", optional = true } # TODO: zlib-rs? quick_cache = { version = "0.6.13", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" @@ -80,13 +79,13 @@ required-features = [] name = "block" harness = false path = "benches/block.rs" -required-features = ["lz4", "miniz"] +required-features = ["lz4"] [[bench]] name = "tree" harness = false path = "benches/tree.rs" -required-features = ["lz4", "miniz"] +required-features = ["lz4"] [[bench]] name = "level_manifest" diff --git a/README.md b/README.md index ec1189ad..483f1050 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ Allows using `LZ4` compression, powered by [`lz4_flex`](https://github.com/PSeit *Disabled by default.* -### miniz - -Allows using `DEFLATE/zlib` compression, powered by [`miniz_oxide`](https://github.com/Frommi/miniz_oxide). - -*Disabled by default.* - ### bytes Uses [`bytes`](https://github.com/tokio-rs/bytes) as the underlying `Slice` type. 
@@ -68,7 +62,7 @@ Future breaking changes will result in a major version bump and a migration path ## Run unit benchmarks ```bash -cargo bench --features lz4,miniz +cargo bench --features lz4 ``` ## License diff --git a/benches/block.rs b/benches/block.rs index fe203275..edd61681 100644 --- a/benches/block.rs +++ b/benches/block.rs @@ -102,11 +102,7 @@ fn value_block_find(c: &mut Criterion) { fn encode_block(c: &mut Criterion) { let mut group = c.benchmark_group("Encode block"); - for comp_type in [ - CompressionType::None, - CompressionType::Lz4, - CompressionType::Miniz(3), - ] { + for comp_type in [CompressionType::None, CompressionType::Lz4] { for block_size in [4, 8, 16, 32, 64, 128] { let block_size = block_size * 1_024; @@ -145,11 +141,7 @@ fn encode_block(c: &mut Criterion) { fn load_value_block_from_disk(c: &mut Criterion) { let mut group = c.benchmark_group("Load block from disk"); - for comp_type in [ - CompressionType::None, - CompressionType::Lz4, - CompressionType::Miniz(3), - ] { + for comp_type in [CompressionType::None, CompressionType::Lz4] { for block_size in [4, 8, 16, 32, 64, 128] { let block_size = block_size * 1_024; diff --git a/src/blob_tree/compression.rs b/src/blob_tree/compression.rs index 5efdf037..4e6f3016 100644 --- a/src/blob_tree/compression.rs +++ b/src/blob_tree/compression.rs @@ -21,9 +21,6 @@ impl Compressor for MyCompressor { #[cfg(feature = "lz4")] CompressionType::Lz4 => lz4_flex::compress_prepend_size(bytes), - - #[cfg(feature = "miniz")] - CompressionType::Miniz(lvl) => miniz_oxide::deflate::compress_to_vec(bytes, lvl), }) } @@ -35,10 +32,6 @@ impl Compressor for MyCompressor { CompressionType::Lz4 => { lz4_flex::decompress_size_prepended(bytes).map_err(|_| value_log::Error::Decompress) } - - #[cfg(feature = "miniz")] - CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(bytes) - .map_err(|_| value_log::Error::Decompress), } } } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index e78ee7b9..8099a05d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -86,7 +86,7 @@ impl BlobTree { .compression(match config.blob_compression { crate::CompressionType::None => None, - #[cfg(any(feature = "lz4", feature = "miniz"))] + #[cfg(feature = "lz4")] c => Some(MyCompressor(c)), }); diff --git a/src/compression.rs b/src/compression.rs index 09a8237b..f9830b46 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -21,18 +21,6 @@ pub enum CompressionType { /// on speed over compression ratio. #[cfg(feature = "lz4")] Lz4, - - /// zlib/DEFLATE compression - /// - /// Compression level (0-10) can be adjusted. 
- /// - /// - 0 disables compression - /// - 1 optimizes for speed - /// - 6 compromises between speed and space, good default - /// - 9 optimizes for space - /// - 10 may save even more space than 9, but the speed trade off may not be worth it - #[cfg(feature = "miniz")] - Miniz(u8), } impl Encode for CompressionType { @@ -48,14 +36,6 @@ impl Encode for CompressionType { writer.write_u8(1)?; writer.write_u8(0)?; // NOTE: Pad to 2 bytes } - - #[cfg(feature = "miniz")] - Self::Miniz(level) => { - assert!(*level <= 10, "invalid miniz compression level"); - - writer.write_u8(2)?; - writer.write_u8(*level)?; - } } Ok(()) @@ -78,15 +58,6 @@ impl Decode for CompressionType { Ok(Self::Lz4) } - #[cfg(feature = "miniz")] - 2 => { - let level = reader.read_u8()?; - - assert!(level <= 10, "invalid miniz compression level"); - - Ok(Self::Miniz(level)) - } - tag => Err(DecodeError::InvalidTag(("CompressionType", tag))), } } @@ -102,9 +73,6 @@ impl std::fmt::Display for CompressionType { #[cfg(feature = "lz4")] Self::Lz4 => "lz4", - - #[cfg(feature = "miniz")] - Self::Miniz(_) => "miniz", } ) } @@ -132,18 +100,4 @@ mod tests { assert_eq!(2, serialized.len()); } } - - #[cfg(feature = "miniz")] - mod miniz { - use super::*; - use test_log::test; - - #[test] - fn compression_serialize_none() { - for lvl in 0..10 { - let serialized = CompressionType::Miniz(lvl).encode_into_vec(); - assert_eq!(2, serialized.len()); - } - } - } } diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index e1a52b37..ba5b350c 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -61,9 +61,6 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => &lz4_flex::compress(data), - - #[cfg(feature = "miniz")] - CompressionType::Miniz(level) => &miniz_oxide::deflate::compress_to_vec(data, level), }; header.data_length = data.len() as u32; @@ -111,11 +108,6 @@ impl Block { data } - - #[cfg(feature = "miniz")] - CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(&raw_data) - .map_err(|_| crate::Error::Decompress(compression))? - .into(), }; debug_assert_eq!(header.uncompressed_length, { @@ -231,18 +223,6 @@ impl Block { data } - - #[cfg(feature = "miniz")] - CompressionType::Miniz(_) => { - // NOTE: We know that a header always exists and data is never empty - // So the slice is fine - #[allow(clippy::indexing_slicing)] - let raw_data = &buf[Header::serialized_len()..]; - - miniz_oxide::inflate::decompress_to_vec(raw_data) - .map_err(|_| crate::Error::Decompress(compression))? 
- .into() - } }; #[allow(clippy::expect_used, clippy::cast_possible_truncation)] diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index ab1b6dfe..2797ddc0 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -91,44 +91,3 @@ fn blob_tree_simple_compressed() -> lsm_tree::Result<()> { Ok(()) } - -#[cfg(feature = "miniz")] -#[test] -fn blob_tree_simple_compressed_2() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - let path = folder.path(); - - let tree = lsm_tree::Config::new(path) - .compression(lsm_tree::CompressionType::Miniz(10)) - .open_as_blob_tree()?; - - let big_value = b"neptune!".repeat(128_000); - - assert!(tree.get("big", None)?.is_none()); - tree.insert("big", &big_value, 0); - tree.insert("smol", "small value", 0); - - let value = tree.get("big", None)?.expect("should exist"); - assert_eq!(&*value, big_value); - - tree.flush_active_memtable(0)?; - - let value = tree.get("big", None)?.expect("should exist"); - assert_eq!(&*value, big_value); - - let value = tree.get("smol", None)?.expect("should exist"); - assert_eq!(&*value, b"small value"); - - let new_big_value = b"winter!".repeat(128_000); - tree.insert("big", &new_big_value, 1); - - let value = tree.get("big", None)?.expect("should exist"); - assert_eq!(&*value, new_big_value); - - tree.flush_active_memtable(0)?; - - let value = tree.get("big", None)?.expect("should exist"); - assert_eq!(&*value, new_big_value); - - Ok(()) -} From 6d986b1b88559794a409320504d00581594ebf5d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:30:18 +0200 Subject: [PATCH 305/613] update CompressionType serde --- src/compression.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/compression.rs b/src/compression.rs index f9830b46..1e8ff579 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -28,13 +28,11 @@ impl Encode for CompressionType { match self { Self::None => { writer.write_u8(0)?; - writer.write_u8(0)?; // NOTE: Pad to 2 bytes } #[cfg(feature = "lz4")] Self::Lz4 => { writer.write_u8(1)?; - writer.write_u8(0)?; // NOTE: Pad to 2 bytes } } @@ -47,16 +45,10 @@ impl Decode for CompressionType { let tag = reader.read_u8()?; match tag { - 0 => { - assert_eq!(0, reader.read_u8()?, "Invalid compression"); - Ok(Self::None) - } + 0 => Ok(Self::None), #[cfg(feature = "lz4")] - 1 => { - assert_eq!(0, reader.read_u8()?, "Invalid compression"); - Ok(Self::Lz4) - } + 1 => Ok(Self::Lz4), tag => Err(DecodeError::InvalidTag(("CompressionType", tag))), } @@ -86,7 +78,7 @@ mod tests { #[test] fn compression_serialize_none() { let serialized = CompressionType::None.encode_into_vec(); - assert_eq!(2, serialized.len()); + assert_eq!(1, serialized.len()); } #[cfg(feature = "lz4")] @@ -97,7 +89,7 @@ mod tests { #[test] fn compression_serialize_none() { let serialized = CompressionType::Lz4.encode_into_vec(); - assert_eq!(2, serialized.len()); + assert_eq!(1, serialized.len()); } } } From 9b6aa44f5a2e72e505ae890340ec7040a894ee78 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:31:42 +0200 Subject: [PATCH 306/613] fix --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 130784d6..49c51da8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ path = "src/lib.rs" default = [] lz4 = ["dep:lz4_flex"] bytes = ["value-log/bytes"] -bytes = [] # TODO: restore use_unsafe = [] bloom_use_unsafe = [] metrics = [] From 180209cc83796d397e992dbb2e677dab17c3b840 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 
2025 02:36:41 +0200 Subject: [PATCH 307/613] update deps --- Cargo.toml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 49c51da8..58fb009a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,23 +29,23 @@ byteorder = "1.5.0" crossbeam-skiplist = "0.1.3" enum_dispatch = "0.3.13" interval-heap = "0.0.5" -log = "0.4.22" -lz4_flex = { version = "0.11.3", optional = true, default-features = false } -quick_cache = { version = "0.6.13", default-features = false, features = [] } -rustc-hash = "2.0.0" -self_cell = "1.0.4" -tempfile = "3.12.0" +log = "0.4.27" +lz4_flex = { version = "0.11.5", optional = true, default-features = false } +quick_cache = { version = "0.6.16", default-features = false, features = [] } +rustc-hash = "2.1.1" +self_cell = "1.2.0" +tempfile = "3.20.0" value-log = { git = "https://github.com/fjall-rs/value-log", branch = "v2", default-features = false, features = [ ] } varint-rs = "2.2.0" -xxhash-rust = { version = "0.8.12", features = ["xxh3"] } +xxhash-rust = { version = "0.8.15", features = ["xxh3"] } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } fs_extra = "1.3.0" nanoid = "0.4.0" -rand = "0.9.0" -test-log = "0.2.16" +rand = "0.9.2" +test-log = "0.2.18" [package.metadata.cargo-all-features] denylist = [] From 7ee5518a800b150b712b1ca2b4850d0e5d36aa71 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:40:22 +0200 Subject: [PATCH 308/613] wip --- src/segment/block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index ba5b350c..3245b7ac 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -250,7 +250,7 @@ impl Block { *checksum, *header.checksum, ); - + // return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); } Ok(Self { header, data }) From b2d8207e1cff8a937936216c5eb3912a08046b6e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:43:02 +0200 Subject: [PATCH 309/613] visibility --- src/segment/data_block/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 40f4302f..c84472bd 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -291,7 +291,7 @@ impl DataBlock { self.inner.size() } - fn get_binary_index_reader(&self) -> BinaryIndexReader { + pub(crate) fn get_binary_index_reader(&self) -> BinaryIndexReader { let trailer = Trailer::new(&self.inner); let mut reader = trailer.as_slice(); @@ -317,7 +317,8 @@ impl DataBlock { ) } - fn get_hash_index_reader(&self) -> Option { + #[must_use] + pub fn get_hash_index_reader(&self) -> Option> { let trailer = Trailer::new(&self.inner); // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) From 26e315de3e65a401b88529acd2b7cbe61d0db706 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 Aug 2025 02:43:37 +0200 Subject: [PATCH 310/613] line --- microbench/bloom_speed/run.nu | 1 + 1 file changed, 1 insertion(+) diff --git a/microbench/bloom_speed/run.nu b/microbench/bloom_speed/run.nu index 36f75fca..92a918e1 100644 --- a/microbench/bloom_speed/run.nu +++ b/microbench/bloom_speed/run.nu @@ -2,3 +2,4 @@ rm -f data.jsonl cargo run -r | save data.jsonl --append cargo run -r --features use_unsafe | save data.jsonl --append python3 template.py + From faecf9f7f1cad70ba9525cc2a761bd313f46d473 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 13 
Aug 2025 02:44:27 +0200 Subject: [PATCH 311/613] fix temporary --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 58fb009a..d4327013 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ path = "src/lib.rs" [features] default = [] lz4 = ["dep:lz4_flex"] -bytes = ["value-log/bytes"] +# bytes = ["value-log/bytes"] # TODO: restore use_unsafe = [] bloom_use_unsafe = [] metrics = [] From 6d512e7923cfe020917149375f44a6868df20bc1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 14 Aug 2025 21:37:54 +0200 Subject: [PATCH 312/613] wip --- src/tree/ingest.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 28efad25..6ade9211 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -21,7 +21,7 @@ impl<'a> Ingestion<'a> { ); let folder = tree.config.path.join(crate::file::SEGMENTS_FOLDER); - log::debug!("Ingesting into disk segments in {folder:?}"); + log::debug!("Ingesting into disk segments in {:?}", folder.display()); let writer = MultiWriter::new( folder.clone(), From 6d6036a93bf5c00ec097ae1eed7dc8aa631a1e86 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 14 Aug 2025 23:49:14 +0200 Subject: [PATCH 313/613] refactor: rename file --- src/lib.rs | 2 +- src/{windows.rs => slice_windows.rs} | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) rename src/{windows.rs => slice_windows.rs} (94%) diff --git a/src/lib.rs b/src/lib.rs index 891c62b3..d0408644 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -180,8 +180,8 @@ mod path; pub mod range; mod seqno; +mod slice_windows; mod snapshot; -mod windows; #[doc(hidden)] pub mod stop_signal; diff --git a/src/windows.rs b/src/slice_windows.rs similarity index 94% rename from src/windows.rs rename to src/slice_windows.rs index 9f2403b8..a069c6f9 100644 --- a/src/windows.rs +++ b/src/slice_windows.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + pub trait GrowingWindowsExt { fn growing_windows<'a>(&'a self) -> impl Iterator where From 0a52bb6c4714e948e9e02f0f071ad69bf2c1f103 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 14 Aug 2025 23:49:20 +0200 Subject: [PATCH 314/613] refactor: remove old file --- src/level_manifest/level.rs | 535 ------------------------------------ 1 file changed, 535 deletions(-) delete mode 100644 src/level_manifest/level.rs diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs deleted file mode 100644 index 1a81bcc2..00000000 --- a/src/level_manifest/level.rs +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{ - binary_search::partition_point, segment::Segment, HashSet, KeyRange, SegmentId, UserKey, -}; -use std::ops::Bound; - -/// Level of an LSM-tree -#[derive(Clone, Debug)] -pub struct Level { - /// List of segments - #[doc(hidden)] - pub segments: Vec, - - /// If the level is disjoint - /// - /// is only recomputed when the level is changed - /// to avoid unnecessary CPU work - pub is_disjoint: bool, - // pub key_range: KeyRange, -} - -impl std::fmt::Display for Level { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for segment in self.segments.iter().rev().take(2).rev() { - write!(f, "[{}]", segment.id())?; - } - Ok(()) - } -} - -impl 
std::ops::Deref for Level { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.segments - } -} - -impl Default for Level { - fn default() -> Self { - Self { - is_disjoint: true, - segments: Vec::new(), - // key_range: KeyRange::empty(), - } - } -} - -impl Level { - pub fn list_ids(&self) -> HashSet { - self.segments.iter().map(Segment::id).collect() - } - - pub fn update_metadata(&mut self) { - self.set_disjoint_flag(); - self.sort(); - } - - pub fn insert(&mut self, segment: Segment) { - self.segments.push(segment); - self.update_metadata(); - } - - pub fn remove(&mut self, segment_id: SegmentId) -> Option { - if let Some(idx) = self.segments.iter().position(|x| x.id() == segment_id) { - let segment = self.segments.remove(idx); - self.update_metadata(); - Some(segment) - } else { - None - } - } - - pub(crate) fn sort(&mut self) { - if self.is_disjoint { - self.sort_by_key_range(); - } else { - self.sort_by_seqno(); - } - } - - /// Sorts the level by key range ascending. - /// - /// segment 1 segment 2 segment 3 - /// [key:a] [key:c] [key:z] - pub(crate) fn sort_by_key_range(&mut self) { - self.segments - .sort_by(|a, b| a.metadata.key_range.min().cmp(b.metadata.key_range.min())); - } - - /// Sorts the level from newest to oldest. - /// - /// This will make segments with highest seqno get checked first, - /// so if there are two versions of an item, the fresher one is seen first: - /// - /// segment 1 segment 2 - /// [key:asd:2] [key:asd:1] - /// - /// point read -----------> - pub(crate) fn sort_by_seqno(&mut self) { - self.segments - .sort_by(|a, b| b.metadata.seqnos.1.cmp(&a.metadata.seqnos.1)); - } - - /// Returns an iterator over the level's segment IDs. - pub fn ids(&self) -> impl Iterator + '_ { - self.segments.iter().map(Segment::id) - } - - /// Returns `true` if the level contains no segments. - pub fn is_empty(&self) -> bool { - self.segments.is_empty() - } - - /// Returns the number of segments. - pub fn len(&self) -> usize { - self.segments.len() - } - - /// Returns the level size in bytes. - pub fn size(&self) -> u64 { - self.segments.iter().map(|x| x.metadata.file_size).sum() - } - - pub(crate) fn compute_is_disjoint(&self) -> bool { - let ranges = self - .segments - .iter() - .map(|x| &x.metadata.key_range) - .collect::>(); - - KeyRange::is_disjoint(&ranges) - } - - /// Checks if the level is disjoint and caches the result in `is_disjoint`. - fn set_disjoint_flag(&mut self) { - self.is_disjoint = self.compute_is_disjoint(); - } - - /// Returns an iterator over segments in the level that have a key range - /// overlapping the input key range. - pub fn overlapping_segments<'a>( - &'a self, - key_range: &'a KeyRange, - ) -> impl Iterator { - self.segments - .iter() - .filter(|x| x.metadata.key_range.overlaps_with_key_range(key_range)) - } - - /// Returns an iterator over segments in the level that have a key range - /// fully contained in the input key range. - pub fn contained_segments<'a>( - &'a self, - key_range: &'a KeyRange, - ) -> impl Iterator { - self.segments - .iter() - .filter(|x| key_range.contains_range(&x.metadata.key_range)) - } - - pub fn as_disjoint(&self) -> Option> { - if self.is_disjoint { - Some(DisjointLevel(self)) - } else { - None - } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct DisjointLevel<'a>(&'a Level); - -impl<'a> DisjointLevel<'a> { - /// Returns the segment that possibly contains the key. 
- pub fn get_segment_containing_key(&self, key: &[u8]) -> Option { - let idx = partition_point(&self.0.segments, |segment| { - segment.metadata.key_range.max() < &key - }); - - self.0 - .segments - .get(idx) - .filter(|x| x.metadata.key_range.min() <= &key) - .cloned() - } - - pub fn range_indexes( - &'a self, - key_range: &'a (Bound, Bound), - ) -> Option<(usize, usize)> { - let level = &self.0; - - let lo = match &key_range.0 { - Bound::Unbounded => 0, - Bound::Included(start_key) => partition_point(level, |segment| { - segment.metadata.key_range.max() < start_key - }), - Bound::Excluded(start_key) => partition_point(level, |segment| { - segment.metadata.key_range.max() <= start_key - }), - }; - - if lo >= level.len() { - return None; - } - - let hi = match &key_range.1 { - Bound::Unbounded => level.len() - 1, - Bound::Included(end_key) => { - let idx = - partition_point(level, |segment| segment.metadata.key_range.min() <= end_key); - - if idx == 0 { - return None; - } - - idx.saturating_sub(1) // To avoid underflow - } - Bound::Excluded(end_key) => { - let idx = - partition_point(level, |segment| segment.metadata.key_range.min() < end_key); - - if idx == 0 { - return None; - } - - idx.saturating_sub(1) // To avoid underflow - } - }; - - if lo > hi { - return None; - } - - Some((lo, hi)) - } -} -/* -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use super::*; - use crate::{ - cache::Cache, - descriptor_table::FileDescriptorTable, - segment::{ - block::offset::BlockOffset, - block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, - file_offsets::FileOffsets, - meta::{Metadata, SegmentId}, - SegmentInner, - }, - super_segment::Segment, - AbstractTree, KeyRange, Slice, - }; - use std::sync::{atomic::AtomicBool, Arc}; - use test_log::test; - - #[allow(clippy::expect_used)] - fn fixture_segment(id: SegmentId, key_range: KeyRange) -> Segment { - todo!() - /* let cache = Arc::new(Cache::with_capacity_bytes(10 * 1_024 * 1_024)); - - let block_index = TwoLevelBlockIndex::new((0, id).into(), cache.clone()); - let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); - - SegmentInner { - tree_id: 0, - descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index, - - offsets: FileOffsets { - bloom_ptr: BlockOffset(0), - range_filter_ptr: BlockOffset(0), - index_block_ptr: BlockOffset(0), - metadata_ptr: BlockOffset(0), - range_tombstones_ptr: BlockOffset(0), - tli_ptr: BlockOffset(0), - pfx_ptr: BlockOffset(0), - }, - - metadata: Metadata { - data_block_count: 0, - index_block_count: 0, - data_block_size: 4_096, - index_block_size: 4_096, - created_at: 0, - id, - file_size: 0, - compression: crate::segment::meta::CompressionType::None, - table_type: crate::segment::meta::TableType::Block, - item_count: 0, - key_count: 0, - key_range, - tombstone_count: 0, - range_tombstone_count: 0, - uncompressed_size: 0, - seqnos: (0, 0), - }, - cache, - - bloom_filter: Some(crate::bloom::BloomFilter::with_fp_rate(1, 0.1)), - - path: "a".into(), - is_deleted: AtomicBool::default(), - } - .into() */ - } - - #[test] - #[allow(clippy::unwrap_used)] - fn level_disjoint_cull() { - let level = Level { - is_disjoint: true, - // key_range: KeyRange::empty(), - segments: vec![ - fixture_segment(0, KeyRange::new((Slice::from("a"), Slice::from("c")))), - fixture_segment(1, KeyRange::new((Slice::from("d"), Slice::from("g")))), - fixture_segment(2, KeyRange::new((Slice::from("h"), Slice::from("k")))), - ], - }; - let level = level.as_disjoint().unwrap(); - - { - let range = 
(Bound::Unbounded, Bound::Included(Slice::from("0"))); - let indexes = level.range_indexes(&range); - assert_eq!(None, indexes); - } - - { - let range = (Bound::Included(Slice::from("l")), Bound::Unbounded); - let indexes = level.range_indexes(&range); - assert_eq!(None, indexes); - } - - { - let range = ( - Bound::Included(Slice::from("d")), - Bound::Included(Slice::from("g")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((1, 1)), indexes); - } - - { - let range = ( - Bound::Excluded(Slice::from("d")), - Bound::Included(Slice::from("g")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((1, 1)), indexes); - } - - { - let range = ( - Bound::Included(Slice::from("d")), - Bound::Excluded(Slice::from("h")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((1, 1)), indexes); - } - - { - let range = ( - Bound::Included(Slice::from("d")), - Bound::Included(Slice::from("h")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((1, 2)), indexes); - } - - { - let range = (Bound::Included(Slice::from("d")), Bound::Unbounded); - let indexes = level.range_indexes(&range); - assert_eq!(Some((1, 2)), indexes); - } - - { - let range = ( - Bound::Included(Slice::from("a")), - Bound::Included(Slice::from("d")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((0, 1)), indexes); - } - - { - let range = ( - Bound::Included(Slice::from("a")), - Bound::Excluded(Slice::from("d")), - ); - let indexes = level.range_indexes(&range); - assert_eq!(Some((0, 0)), indexes); - } - - { - let range = (Bound::Unbounded, Bound::Unbounded); - let indexes = level.range_indexes(&range); - assert_eq!(Some((0, 2)), indexes); - } - } - - #[test] - fn level_disjoint() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = crate::Config::new(&folder).open()?; - - let mut x = 0_u64; - - for _ in 0..10 { - for _ in 0..10 { - let key = x.to_be_bytes(); - x += 1; - tree.insert(key, key, 0); - } - tree.flush_active_memtable(0).expect("should flush"); - } - - assert!( - tree.levels - .read() - .expect("lock is poisoned") - .levels - .first() - .expect("should exist") - .is_disjoint - ); - - Ok(()) - } - - #[test] - #[allow(clippy::unwrap_used)] - fn level_disjoint_containing_key() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = crate::Config::new(&folder).open()?; - - for k in 'c'..'k' { - tree.insert([k as u8], "", 0); - tree.flush_active_memtable(0).expect("should flush"); - } - - let first = tree - .levels - .read() - .expect("lock is poisoned") - .levels - .first() - .expect("should exist") - .clone(); - - let dis = first.as_disjoint().unwrap(); - assert!(dis.get_segment_containing_key(b"a").is_none()); - assert!(dis.get_segment_containing_key(b"b").is_none()); - for k in 'c'..'k' { - assert!(dis.get_segment_containing_key(&[k as u8]).is_some()); - } - assert!(dis.get_segment_containing_key(b"l").is_none()); - - Ok(()) - } - - #[test] - fn level_not_disjoint() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = crate::Config::new(&folder).open()?; - - for i in 0..10 { - tree.insert("a", "", i); - tree.insert("z", "", i); - tree.flush_active_memtable(0).expect("should flush"); - } - - assert!( - !tree - .levels - .read() - .expect("lock is poisoned") - .levels - .first() - .expect("should exist") - .is_disjoint - ); - - Ok(()) - } - - #[test] - fn level_overlaps() { - let seg0 = fixture_segment( - 1, - KeyRange::new((b"c".to_vec().into(), b"k".to_vec().into())), - ); - 
let seg1 = fixture_segment(
-            2,
-            KeyRange::new((b"l".to_vec().into(), b"z".to_vec().into())),
-        );
-
-        let mut level = Level::default();
-        level.insert(seg0);
-        level.insert(seg1);
-
-        assert_eq!(
-            Vec::<SegmentId>::new(),
-            level
-                .overlapping_segments(&KeyRange::new((b"a".to_vec().into(), b"b".to_vec().into())))
-                .map(Segment::id)
-                .collect::<Vec<_>>(),
-        );
-
-        assert_eq!(
-            vec![1],
-            level
-                .overlapping_segments(&KeyRange::new((b"d".to_vec().into(), b"k".to_vec().into())))
-                .map(Segment::id)
-                .collect::<Vec<_>>(),
-        );
-
-        assert_eq!(
-            vec![1, 2],
-            level
-                .overlapping_segments(&KeyRange::new((b"f".to_vec().into(), b"x".to_vec().into())))
-                .map(Segment::id)
-                .collect::<Vec<_>>(),
-        );
-    }
-}
- */

From ad3bcb1977df35a772bd932cdf28472b8cfe672b Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:49:28 +0200
Subject: [PATCH 315/613] refactor: clippy

---
 src/file.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/file.rs b/src/file.rs
index 2bdbf516..4e8a7c54 100644
--- a/src/file.rs
+++ b/src/file.rs
@@ -28,6 +28,8 @@ pub fn rewrite_atomic(path: &Path, content: &[u8]) -> std::io::Result<()> {
         let file = std::fs::File::open(path)?;
         file.sync_all()?;
 
+        // NOTE: Files should always have a parent directory
+        #[allow(clippy::expect_used)]
         let folder = path.parent().expect("should have parent folder");
         fsync_directory(folder)?;
     }

From b50d7faec25ed3668fa3aba99d0d95084707e52d Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:49:32 +0200
Subject: [PATCH 316/613] refactor: clippy

---
 src/tree/ingest.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs
index 6ade9211..0bf19be2 100644
--- a/src/tree/ingest.rs
+++ b/src/tree/ingest.rs
@@ -21,7 +21,7 @@ impl<'a> Ingestion<'a> {
         );
 
         let folder = tree.config.path.join(crate::file::SEGMENTS_FOLDER);
-        log::debug!("Ingesting into disk segments in {:?}", folder.display());
+        log::debug!("Ingesting into disk segments in {}", folder.display());
 
         let writer = MultiWriter::new(
             folder.clone(),

From 3859c0ece37bb674eb7aae298cfa5c41ae1dda36 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:49:38 +0200
Subject: [PATCH 317/613] refactor: clippy

---
 src/segment/regions.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/segment/regions.rs b/src/segment/regions.rs
index 91c4938d..e5b052c1 100644
--- a/src/segment/regions.rs
+++ b/src/segment/regions.rs
@@ -30,6 +30,8 @@ impl ParsedRegions {
         let block = DataBlock::new(block);
 
         let tli = {
+            // NOTE: Top-level index block is always written
+            #[allow(clippy::expect_used)]
             let bytes = block
                 .point_read(b"tli", SeqNo::MAX)
                 .expect("TLI handle should exist");
@@ -39,6 +41,8 @@ impl ParsedRegions {
         }?;
 
         let metadata = {
+            // NOTE: Metadata block is always written
+            #[allow(clippy::expect_used)]
             let bytes = block
                 .point_read(b"meta", SeqNo::MAX)
                 .expect("Metadata handle should exist");

From ae8cadae6c42e8bd7955ea81ff61e09c7c050d46 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:49:42 +0200
Subject: [PATCH 318/613] refactor: clippy

---
 src/segment/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 3ab8202d..11b0b822 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -340,7 +340,7 @@ impl Segment {
         use std::sync::atomic::AtomicBool;
         use trailer::Trailer;
 
-        log::debug!("Recovering segment from file {file_path:?}");
+        log::debug!("Recovering segment from file {}", file_path.display());
 
         let mut file =
std::fs::File::open(&file_path)?; let trailer = Trailer::from_file(&mut file)?; From 43d7561cce7a6e0f3ede12e71f385749377a79e4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 14 Aug 2025 23:49:54 +0200 Subject: [PATCH 319/613] refactor: clippy --- src/segment/filter/bit_array/builder.rs | 2 +- src/segment/filter/bit_array/reader.rs | 2 +- src/segment/filter/blocked_bloom/builder.rs | 3 ++- src/segment/filter/blocked_bloom/mod.rs | 2 +- src/segment/filter/standard_bloom/builder.rs | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/segment/filter/bit_array/builder.rs b/src/segment/filter/bit_array/builder.rs index a0b4bcf6..1065f8df 100644 --- a/src/segment/filter/bit_array/builder.rs +++ b/src/segment/filter/bit_array/builder.rs @@ -12,7 +12,7 @@ pub fn enable_bit(byte: u8, idx: usize) -> u8 { } /// Fixed-size bit array -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub struct Builder(Box<[u8]>); impl Builder { diff --git a/src/segment/filter/bit_array/reader.rs b/src/segment/filter/bit_array/reader.rs index b602dacb..07dfcf8e 100644 --- a/src/segment/filter/bit_array/reader.rs +++ b/src/segment/filter/bit_array/reader.rs @@ -13,7 +13,7 @@ fn get_bit(byte: u8, idx: usize) -> bool { } /// Fixed-size bit array reader -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub struct BitArrayReader<'a>(&'a [u8]); impl<'a> BitArrayReader<'a> { diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index b9a133dc..6fd83945 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -11,7 +11,7 @@ use crate::{ use byteorder::{LittleEndian, WriteBytesExt}; use std::io::Write; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] #[allow(clippy::module_name_repetitions)] pub struct Builder { /// Raw bytes exposed as bit array @@ -137,6 +137,7 @@ impl Builder { } } + #[must_use] pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize { block_idx * CACHE_LINE_BYTES * 8 + idx_in_block } diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 6debf2da..70d50234 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -24,7 +24,7 @@ const CACHE_LINE_BYTES: usize = 64; /// /// The filter uses double hashing instead of `k` hash functions, see: /// -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub struct BlockedBloomFilterReader<'a> { /// Raw bytes exposed as bit array inner: BitArrayReader<'a>, diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index 2599d272..b7337379 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -12,7 +12,7 @@ pub fn secondary_hash(h1: u64) -> u64 { h1.wrapping_shr(32).wrapping_mul(0x51_7c_c1_b7_27_22_0a_95) } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] #[allow(clippy::module_name_repetitions)] pub struct Builder { /// Raw bytes exposed as bit array From e9bef550ec0929e834c81301ab2e31a4812ea8cd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 14 Aug 2025 23:49:57 +0200 Subject: [PATCH 320/613] refactor: clippy --- src/memtable/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index 6e166ff3..c54d53bd 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -114,10 +114,10 @@ impl Memtable { /// Inserts an item into the memtable #[doc(hidden)] pub fn insert(&self, 
item: InternalValue) -> (u64, u64) {
-        // NOTE: We know values are limited to 32-bit length
-        #[allow(clippy::cast_possible_truncation)]
+        // NOTE: We know keys are limited to 16-bit length + values are limited to 32-bit length
+        #[allow(clippy::cast_possible_truncation, clippy::expect_used)]
         let item_size =
-            { item.key.user_key.len() + item.value.len() + std::mem::size_of::<InternalValue>() }
+            (item.key.user_key.len() + item.value.len() + std::mem::size_of::<InternalValue>())
                 .try_into()
                 .expect("should fit into u64");

From 0cca26a1e67105e24270d2dcf3583c3e07f3893a Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:50:15 +0200
Subject: [PATCH 321/613] hide unused module

---
 src/compaction/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs
index 0cf79ef0..13e214f7 100644
--- a/src/compaction/mod.rs
+++ b/src/compaction/mod.rs
@@ -6,7 +6,7 @@
 pub(crate) mod fifo;
 pub(crate) mod leveled;
-pub(crate) mod maintenance;
+// pub(crate) mod maintenance;
 pub(crate) mod major;
 pub(crate) mod movedown;
 pub(crate) mod pulldown;

From 5be69ffd397a9a4ff97c0b736d5f07d5402a5b27 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:50:23 +0200
Subject: [PATCH 322/613] refactor: clippy

---
 src/compaction/leveled.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs
index 50a4f583..a923b332 100644
--- a/src/compaction/leveled.rs
+++ b/src/compaction/leveled.rs
@@ -7,8 +7,8 @@ use crate::{
     config::Config,
     level_manifest::{hidden_set::HiddenSet, LevelManifest},
     segment::Segment,
+    slice_windows::{GrowingWindowsExt, ShrinkingWindowsExt},
     version::{run::Ranged, Run},
-    windows::{GrowingWindowsExt, ShrinkingWindowsExt},
     HashSet, KeyRange, SegmentId,
 };
 
@@ -212,6 +212,9 @@ impl CompactionStrategy for Strategy {
         {
             // Score first level
+
+            // NOTE: We always have at least one level
+            #[allow(clippy::expect_used)]
             let first_level = levels.as_slice().first().expect("first level should exist");
             if first_level.len() >= usize::from(self.l0_threshold) {
                 scores[0] = ((first_level.len() as f64) / (self.l0_threshold as f64), 0);
             }
 
             // Score L1+
                 // NOTE: Take bytes that are already being compacted into account,
                 // otherwise we may be overcompensating
                 .filter(|x| !levels.hidden_set().is_hidden(x.id()))
                 .map(|x| x.metadata.file_size)
                 .sum::<u64>();
 
             let target_size = self.level_target_size(idx as u8);
@@ -257,8 +260,6 @@ impl CompactionStrategy for Strategy {
             }
         }
 
-        // eprintln!("{scores:?}");
-
         // Choose compaction
         let (level_idx_with_highest_score, (score, overshoot_bytes)) = scores
             .into_iter()

From 45ac853192847492c2cfbf57a697f73ad2b37718 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:50:26 +0200
Subject: [PATCH 323/613] refactor: clippy

---
 src/compaction/fifo.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs
index 3b1d6b87..b02b94ee 100644
--- a/src/compaction/fifo.rs
+++ b/src/compaction/fifo.rs
@@ -46,6 +46,8 @@ impl CompactionStrategy for Strategy {
     // TODO: TTL
     fn choose(&self, levels: &LevelManifest, _config: &Config) -> Choice {
+        // NOTE: We always have at least one level
+        #[allow(clippy::expect_used)]
         let first_level = levels.as_slice().first().expect("should have first level");
         assert!(first_level.is_disjoint(), "L0 needs to be disjoint");
 

From d81abcfae8cc106c9b061907231a6fa2566dc55f Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:50:32 +0200
Subject: [PATCH 324/613] wip

---
 src/cache.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cache.rs b/src/cache.rs
index 6bdd042d..d3407c45 100644
--- a/src/cache.rs
+++ b/src/cache.rs
@@ -60,8 +60,8 @@ impl Weighter for BlockWeighter {
 /// # use lsm_tree::{Tree, Config, Cache};
 /// # use std::sync::Arc;
 /// #
-/// // Provide 40 MB of cache capacity
-/// let cache = Arc::new(Cache::with_capacity_bytes(40 * 1_000 * 1_000));
+/// // Provide 64 MB of cache capacity
+/// let cache = Arc::new(Cache::with_capacity_bytes(64 * 1_000 * 1_000));
 ///
 /// # let folder = tempfile::tempdir()?;
 /// let tree1 = Config::new(folder).use_cache(cache.clone()).open()?;

From 70953faf996b13e3121f0929860bfeb823b40ad2 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:50:56 +0200
Subject: [PATCH 325/613] refactor: clippy

---
 src/version/run.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/version/run.rs b/src/version/run.rs
index 071fd394..33065498 100644
--- a/src/version/run.rs
+++ b/src/version/run.rs
@@ -94,9 +94,16 @@ impl<T: Ranged> Run<T> {
         self.0.get(idx).filter(|x| x.key_range().min() <= &key)
     }
 
+    /// Returns the run's key range.
     pub fn aggregate_key_range(&self) -> KeyRange {
+        // NOTE: Run invariant
+        #[allow(clippy::expect_used)]
         let lo = self.first().expect("run should never be empty");
+
+        // NOTE: Run invariant
+        #[allow(clippy::expect_used)]
         let hi = self.last().expect("run should never be empty");
+
         KeyRange::new((lo.key_range().min().clone(), hi.key_range().max().clone()))
     }

From 1f61440b596d116c606e0a20e4cbc3b1b894d6f3 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 14 Aug 2025 23:51:14 +0200
Subject: [PATCH 326/613] refactor: clippy

---
 src/level_manifest/mod.rs | 36 ++++++++++++++++++------------------
 src/version/mod.rs        |  7 ++++++-
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs
index 721422fa..19fe9264 100644
--- a/src/level_manifest/mod.rs
+++ b/src/level_manifest/mod.rs
@@ -8,8 +8,8 @@ use crate::{
     coding::DecodeError,
     file::{fsync_directory, rewrite_atomic, MAGIC_BYTES},
     segment::Segment,
-    version::{Level, Run, Version, VersionId},
-    HashSet, SegmentId, SeqNo,
+    version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT},
+    SegmentId, SeqNo,
 };
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use hidden_set::HiddenSet;
@@ -134,7 +134,7 @@ impl LevelManifest {
             folder: folder.into(),
             current: Version::new(0),
             hidden_set: HiddenSet::default(),
-            version_free_list: Default::default(),
+            version_free_list: VecDeque::default(),
         };
 
         Self::persist_version(&manifest.folder, &manifest.current)?;
@@ -231,7 +231,7 @@ impl LevelManifest {
         let version_file = std::path::Path::new(&version_file_path);
 
         if !version_file.try_exists()? {
-            log::error!("Cannot find version file {version_file_path:?}");
+            log::error!("Cannot find version file {}", version_file_path.display());
             return Err(crate::Error::Unrecoverable);
         }
 
@@ -266,12 +266,16 @@ impl LevelManifest {
             current: Version::from_levels(curr_version, version_levels),
             folder,
             hidden_set: HiddenSet::default(),
-            version_free_list: Default::default(), // TODO: 3. create free list from versions that are N < CURRENT
+            version_free_list: VecDeque::default(), // TODO: 3. create free list from versions that are N < CURRENT
         })
     }
 
     fn persist_version(folder: &Path, version: &Version) -> crate::Result<()> {
-        log::trace!("Persisting version {} in {folder:?}", version.id());
+        log::trace!(
+            "Persisting version {} in {}",
+            version.id(),
+            folder.display(),
+        );
 
         let file = std::fs::File::create(folder.join(format!("v{}", version.id())))?;
         let mut writer = BufWriter::new(file);
@@ -376,11 +380,10 @@ impl LevelManifest {
         }
     }
 
-    /// Returns the amount of levels in the tree
+    /// Returns the amount of levels in the tree.
     #[must_use]
     pub fn last_level_index(&self) -> u8 {
-        // NOTE: Currently hard coded to 7 - 1
-        6
+        DEFAULT_LEVEL_COUNT - 1
     }
 
     /// Returns the amount of segments, summed over all levels
@@ -397,15 +400,12 @@ impl LevelManifest {
     #[must_use]
     pub fn level_is_busy(&self, idx: usize) -> bool {
-        self.current
-            .level(idx)
-            .map(|level| {
-                level
-                    .iter()
-                    .flat_map(|run| run.iter())
-                    .any(|segment| self.hidden_set.is_hidden(segment.id()))
-            })
-            .unwrap_or_default()
+        self.current.level(idx).is_some_and(|level| {
+            level
+                .iter()
+                .flat_map(|run| run.iter())
+                .any(|segment| self.hidden_set.is_hidden(segment.id()))
+        })
     }
 
     pub(crate) fn get_segment(&self, id: SegmentId) -> Option<&Segment> {
diff --git a/src/version/mod.rs b/src/version/mod.rs
index 1f2fa2ad..13f12745 100644
--- a/src/version/mod.rs
+++ b/src/version/mod.rs
@@ -12,6 +12,8 @@ use optimize::optimize_runs;
 use run::Ranged;
 use std::{ops::Deref, sync::Arc};
 
+pub const DEFAULT_LEVEL_COUNT: u8 = 7;
+
 pub type VersionId = u64;
 
 impl Ranged for Segment {
@@ -159,7 +161,7 @@ impl Version {
     }
 
     pub fn new(id: VersionId) -> Self {
-        let levels = (0..7).map(|_| Level::empty()).collect();
+        let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect();
 
         Self {
             inner: Arc::new(VersionInner { id, levels }),
@@ -207,6 +209,9 @@ impl Version {
         // L0
         levels.push({
             // Copy-on-write the first level with new run at top
+
+            // NOTE: We always have at least one level
+            #[allow(clippy::expect_used)]
             let l0 = self.levels.first().expect("L0 should always exist");
 
             let prev_runs = l0

From a6d3149f45c6c3aedafedf8e4fdeec5c182c3810 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 18 Aug 2025 15:15:54 +0200
Subject: [PATCH 327/613] perf: use binary search in run overlaps/contains

---
 src/version/run.rs | 132 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 118 insertions(+), 14 deletions(-)

diff --git a/src/version/run.rs b/src/version/run.rs
index 33065498..c41d3467 100644
--- a/src/version/run.rs
+++ b/src/version/run.rs
@@ -107,18 +107,43 @@ impl<T: Ranged> Run<T> {
-    /// Returns an iterator over segments in the level that have a key range
-    /// overlapping the input key range.
-    pub fn get_overlapping<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator<Item = &'a T> {
-        self.iter()
-            .filter(|x| x.key_range().overlaps_with_key_range(key_range))
+    /// Returns the sub slice of segments in the run that have
+    /// a key range overlapping the input key range.
+    pub fn get_overlapping<'a>(&'a self, key_range: &'a KeyRange) -> &'a [T] {
+        let range = key_range.min()..=key_range.max();
+
+        let Some((lo, hi)) = self.range_indexes::<UserKey, _>(&range) else {
+            return &[];
+        };
+
+        self.get(lo..=hi).unwrap_or_default()
     }
 
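+    // NOTE (sketch): for a disjoint run [a-d], [e-j], [k-o], [p-z], the key
+    // range (f, l) maps to the sub slice [e-j], [k-o]; both endpoints are
+    // found with one binary search each (see `range_indexes` below), instead
+    // of filtering the whole run linearly as before.
+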
-    /// Returns an iterator over segments in the level that have a key range
-    /// fully contained in the input key range.
-    pub fn get_contained<'a>(&'a self, key_range: &'a KeyRange) -> impl Iterator<Item = &'a T> {
-        self.iter()
-            .filter(|x| key_range.contains_range(x.key_range()))
+    /// Returns the sub slice of segments in the run that have
+    /// a key range fully contained in the input key range.
+    pub fn get_contained<'a>(&'a self, key_range: &'a KeyRange) -> &'a [T] {
+        fn trim_slice<T, F>(s: &[T], pred: F) -> &[T]
+        where
+            F: Fn(&T) -> bool,
+        {
+            // find first index where pred holds
+            let start = s.iter().position(&pred).unwrap_or(s.len());
+
+            // find last index where pred holds
+            let end = s.iter().rposition(&pred).map_or(start, |i| i + 1);
+
+            s.get(start..end).expect("should be in range")
+        }
+
+        let range = key_range.min()..=key_range.max();
+
+        let Some((lo, hi)) = self.range_indexes::<UserKey, _>(&range) else {
+            return &[];
+        };
+
+        self.get(lo..=hi)
+            .map(|slice| trim_slice(slice, |x| key_range.contains_range(x.key_range())))
+            .unwrap_or_default()
     }
 
     /// Returns the indexes of the interval [min, max] of segments that overlap with a given range.
@@ -211,7 +236,6 @@ mod tests {
             s(2, "k", "o"),
             s(3, "p", "z"),
         ];
-
         let run = Run(items);
 
         assert_eq!(
@@ -228,7 +252,6 @@ mod tests {
             s(2, "k", "o"),
             s(3, "p", "z"),
         ];
-
         let run = Run(items);
 
         assert_eq!(0, run.get_for_key(b"a").unwrap().id);
@@ -253,15 +276,16 @@ mod tests {
             s(2, "k", "o"),
             s(3, "p", "z"),
         ];
-
         let run = Run(items);
 
         assert_eq!(Some((0, 3)), run.range_indexes::<&[u8], _>(&..));
         assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"a")));
         assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"b")));
         assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"d")));
+        assert_eq!(Some((0, 0)), run.range_indexes(&(b"d" as &[u8]..=b"d")));
         assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..b"d")));
         assert_eq!(Some((0, 1)), run.range_indexes(&(b"a" as &[u8]..=b"g")));
+        assert_eq!(Some((1, 1)), run.range_indexes(&(b"j" as &[u8]..=b"j")));
         assert_eq!(Some((0, 3)), run.range_indexes(&(b"a" as &[u8]..=b"z")));
         assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..=b"zzz")));
         assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..)));
     }
 
     #[test]
+    fn run_range_contained() {
+        use crate::SegmentId;
+
+        let items = vec![
+            s(0, "a", "d"),
+            s(1, "e", "j"),
+            s(2, "k", "o"),
+            s(3, "p", "z"),
+        ];
+        let run = Run(items);
+
+        assert_eq!(
+            &[] as &[SegmentId],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"a".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
+        assert_eq!(
+            &[0],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"d".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
+        assert_eq!(
+            &[0, 1],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"j".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
+        assert_eq!(
+            &[0, 1],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"k".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
+        assert_eq!(
+            &[0, 1],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"l".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
+        assert_eq!(
+            &[0, 1, 2, 3],
+            &*run
+                .get_contained(&KeyRange::new((b"a".into(), b"z".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+    }
+
+    #[test]
     fn run_range_overlaps() {
         let items = vec![
             s(0, "a", "d"),
             s(1, "e", "j"),
             s(2, "k", "o"),
             s(3, "p", "z"),
         ];
-
         let run = Run(items);
 
+        assert_eq!(
+            &[0],
+            &*run
+                .get_overlapping(&KeyRange::new((b"a".into(), b"a".into())))
+                .iter()
+                .map(|x| x.id)
+                .collect::<Vec<_>>(),
+        );
+
         assert_eq!(
             &[0],
             &*run
                 .get_overlapping(&KeyRange::new((b"d".into(), b"d".into())))
+                .iter()
                 .map(|x| x.id)
                 .collect::<Vec<_>>(),
         );
 
         assert_eq!(
             &[0],
             &*run
                 .get_overlapping(&KeyRange::new((b"a".into(), b"d".into())))
+                .iter()
                 .map(|x| x.id)
                .collect::<Vec<_>>(),
         );
 
         assert_eq!(
             &[0, 1],
             &*run
                 .get_overlapping(&KeyRange::new((b"a".into(), b"f".into())))
+                .iter()
                 .map(|x| x.id)
                 .collect::<Vec<_>>(),
         );
 
         assert_eq!(
             &[0, 1, 2, 3],
             &*run
                 .get_overlapping(&KeyRange::new((b"a".into(), b"zzz".into())))
+                .iter()
                 .map(|x| x.id)
                 .collect::<Vec<_>>(),
         );
 
         assert_eq!(
             &[] as &[u64],
             &*run
                 .get_overlapping(&KeyRange::new((b"zzz".into(), b"zzzz".into())))
+                .iter()
                 .map(|x| x.id)
                 .collect::<Vec<_>>(),
         );
     }
 }

From 014069d5e58a11c7fe081c40308cf3ddf21178c4 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 18 Aug 2025 15:17:07 +0200
Subject: [PATCH 328/613] wip

---
 src/compaction/fifo.rs           |  2 +-
 src/compaction/leveled.rs        |  4 ++--
 src/compaction/maintenance.rs    |  2 +-
 src/level_manifest/hidden_set.rs | 14 +++-----------
 src/level_manifest/mod.rs        |  6 +++---
 src/lib.rs                       |  5 ++---
 src/segment/mod.rs               |  5 +++++
 src/version/mod.rs               |  2 +-
 8 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs
index b02b94ee..1ac62e38 100644
--- a/src/compaction/fifo.rs
+++ b/src/compaction/fifo.rs
@@ -66,7 +66,7 @@ impl CompactionStrategy for Strategy {
                 }
 
                 oldest_segments.insert(segment.id());
-                collected_bytes += segment.metadata.file_size;
+                collected_bytes += segment.file_size();
             }
 
             Choice::Drop(oldest_segments)
diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs
index a923b332..d7f73274 100644
--- a/src/compaction/leveled.rs
+++ b/src/compaction/leveled.rs
@@ -217,7 +217,7 @@ impl CompactionStrategy for Strategy {
             #[allow(clippy::expect_used)]
             let first_level = levels.as_slice().first().expect("first level should exist");
             if first_level.len() >= usize::from(self.l0_threshold) {
-                scores[0] = ((first_level.len() as f64) / (self.l0_threshold as f64), 0);
+                scores[0] = ((first_level.len() as f64) / f64::from(self.l0_threshold), 0);
             }
 
             // Score L1+
@@ -228,7 +228,7 @@ impl CompactionStrategy for Strategy {
                 // NOTE: Take bytes that are already being compacted into account,
                 // otherwise we may be overcompensating
                 .filter(|x| !levels.hidden_set().is_hidden(x.id()))
-                .map(|x| x.metadata.file_size)
+                .map(Segment::file_size)
                 .sum::<u64>();
 
             let target_size = self.level_target_size(idx as u8);
diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs
index 78599575..aa0974c7 100644
--- a/src/compaction/maintenance.rs
+++ b/src/compaction/maintenance.rs
@@ -31,7 +31,7 @@ pub fn choose_least_effort_compaction(segments: &[Segment], n: usize) -> HashSet<SegmentId> {
     let windows = segments.windows(n);
 
     let window = windows
-        .min_by_key(|window| window.iter().map(|s| s.metadata.file_size).sum::<u64>())
+        .min_by_key(|window| window.iter().map(|s| s.file_size()).sum::<u64>())
         .expect("should have at least one window");
 
     window.iter().map(Segment::id).collect()
diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs
index b84a349e..90615e7a 100644
--- a/src/level_manifest/hidden_set.rs
+++ b/src/level_manifest/hidden_set.rs
@@ -1,4 +1,4 @@
-use crate::{HashSet, SegmentId};
+use crate::SegmentId;
 
 /// The hidden set keeps track of which segments are currently being compacted
 ///
 /// A compaction task can only be run if it doesn't conflict with another ongoing compaction's
 segment, or it will be declined to be run.
 ///
 /// If a compaction task fails, the segments are shown again (removed from the hidden set).
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct HiddenSet {
-    pub(crate) set: HashSet<SegmentId>,
-}
-
-impl Default for HiddenSet {
-    fn default() -> Self {
-        Self {
-            set: HashSet::with_capacity_and_hasher(10, xxhash_rust::xxh3::Xxh3Builder::new()),
-        }
-    }
+    pub(crate) set: crate::HashSet<SegmentId>,
 }
 
 impl HiddenSet {
diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs
index 19fe9264..eacf56dd 100644
--- a/src/level_manifest/mod.rs
+++ b/src/level_manifest/mod.rs
@@ -87,7 +87,7 @@ impl std::fmt::Display for LevelManifest {
                         f,
                         " | # = {}, {} MiB",
                         run.len(),
-                        run.iter().map(|x| x.metadata.file_size).sum::<u64>() / 1_024 / 1_024,
+                        run.iter().map(Segment::file_size).sum::<u64>() / 1_024 / 1_024,
                     )?;
                 } else {
                     for segment in run.iter() {
@@ -106,7 +106,7 @@ impl std::fmt::Display for LevelManifest {
                         f,
                         " | # = {}, {} MiB",
                         run.len(),
-                        run.iter().map(|x| x.metadata.file_size).sum::<u64>() / 1_024 / 1_024,
+                        run.iter().map(Segment::file_size).sum::<u64>() / 1_024 / 1_024,
                     )?;
                 }
             }
@@ -395,7 +395,7 @@ impl LevelManifest {
     /// Returns the (compressed) size of all segments
     #[must_use]
     pub fn size(&self) -> u64 {
-        self.iter().map(|s| s.metadata.file_size).sum()
+        self.iter().map(Segment::file_size).sum()
     }
 
     #[must_use]
diff --git a/src/lib.rs b/src/lib.rs
index d0408644..5d2cbbce 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -99,11 +99,10 @@
 #![allow(clippy::missing_const_for_fn)]
 #![warn(clippy::multiple_crate_versions)]
 #![allow(clippy::option_if_let_else)]
-#![warn(clippy::needless_lifetimes)]
 #![warn(clippy::redundant_feature_names)]
 
-pub(crate) type HashMap = std::collections::HashMap;
-pub(crate) type HashSet = std::collections::HashSet;
+pub(crate) type HashMap = std::collections::HashMap;
+pub(crate) type HashSet = std::collections::HashSet;
 
 #[allow(unused)]
 macro_rules! set {
diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 11b0b822..4d65064a 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -155,6 +155,11 @@ impl Segment {
             .map(DataBlock::new)
     }
 
+    /// Returns the (possibly compressed) file size.
+    pub(crate) fn file_size(&self) -> u64 {
+        self.metadata.file_size
+    }
+
     pub fn get(
         &self,
         key: &[u8],
diff --git a/src/version/mod.rs b/src/version/mod.rs
index 13f12745..b2047736 100644
--- a/src/version/mod.rs
+++ b/src/version/mod.rs
@@ -108,7 +108,7 @@ impl Level {
         self.0
             .iter()
             .flat_map(|x| x.iter())
-            .map(|x| x.metadata.file_size)
+            .map(Segment::file_size)
             .sum()
     }
 

From 47384e1af9160e238c476d407918bbdac01b47df Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 18 Aug 2025 15:56:26 +0200
Subject: [PATCH 329/613] perf: leveled compaction lazy evaluation

---
 src/compaction/leveled.rs | 148 ++++++++++++++++++++++++--------------
 1 file changed, 63 insertions(+), 85 deletions(-)

diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs
index d7f73274..a34e06c5 100644
--- a/src/compaction/leveled.rs
+++ b/src/compaction/leveled.rs
@@ -18,112 +18,89 @@ pub fn aggregate_run_key_range(segments: &[Segment]) -> KeyRange {
     KeyRange::new((lo.key_range().min().clone(), hi.key_range().max().clone()))
 }
 
-/// Tries to find the most optimal compaction set from
-/// one level into the other.
+/// Tries to find the most optimal compaction set from one level into the other.
 fn pick_minimal_compaction(
     curr_run: &Run<Segment>,
     next_run: Option<&Run<Segment>>,
     hidden_set: &HiddenSet,
     overshoot: u64,
+    segment_base_size: u64,
 ) -> Option<(HashSet<SegmentId>, bool)> {
-    struct Choice {
-        write_amp: f32,
-        segment_ids: HashSet<SegmentId>,
-        can_trivial_move: bool,
-        // TODO: compaction_bytes
-    }
-
-    let mut choices = vec![];
-
-    let mut add_choice = |choice: Choice| {
-        let valid_choice = if hidden_set.is_blocked(choice.segment_ids.iter().copied()) {
-            // IMPORTANT: Compaction is blocked because of other
-            // on-going compaction
-            false
-        } else if choice.can_trivial_move {
-            true
+    // NOTE: Find largest trivial move (if it exists)
+    if let Some(window) = curr_run.shrinking_windows().find(|window| {
+        let key_range = aggregate_run_key_range(window);
+
+        if let Some(next_run) = &next_run {
+            if next_run.get_overlapping(&key_range).is_empty() {
+                return true;
+            }
         } else {
-            // TODO: this should not consider the number of segments, but the amount of rewritten data
-            // which corresponds to the amount of temporary space amp
-            choice.segment_ids.len() <= 100 /* TODO: filter by x25 IF POSSIBLE */
-        };
-
-        if valid_choice {
-            choices.push(choice);
+            return true;
         }
-    };
 
-    if let Some(next_run) = &next_run {
-        for window in next_run.growing_windows() {
-            if hidden_set.is_blocked(window.iter().map(Segment::id)) {
-                // IMPORTANT: Compaction is blocked because of other
-                // on-going compaction
-                continue;
-            }
+        false
+    }) {
+        let ids = window.iter().map(Segment::id).collect();
+        return Some((ids, true));
+    }
 
-            let key_range = aggregate_run_key_range(window);
+    // NOTE: Look for merges
+    if let Some(next_run) = &next_run {
+        next_run
+            .growing_windows()
+            .take_while(|window| {
+                // Cap at 50x segments per compaction for now
+                //
+                // At this point, all compactions are too large anyway
+                // so we can escape early
+                let next_level_size = window.iter().map(Segment::file_size).sum::<u64>();
+                next_level_size <= (50 * segment_base_size)
+            })
+            .filter_map(|window| {
+                if hidden_set.is_blocked(window.iter().map(Segment::id)) {
+                    // IMPORTANT: Compaction is blocked because of other
+                    // on-going compaction
+                    return None;
+                }
 
-            // Pull in all segments in current level into compaction
-            let curr_level_pull_in: Vec<_> = curr_run.get_contained(&key_range).collect();
+                let key_range = aggregate_run_key_range(window);
 
-            if hidden_set.is_blocked(curr_level_pull_in.iter().map(|x| x.id())) {
-                // IMPORTANT: Compaction is blocked because of other
-                // on-going compaction
-                continue;
-            }
+                // Pull in all contained segments in current level into compaction
+                let curr_level_pull_in = curr_run.get_contained(&key_range);
 
-            let curr_level_size = curr_level_pull_in
-                .iter()
-                .map(|x| x.metadata.file_size)
-                .sum::<u64>();
+                let curr_level_size = curr_level_pull_in
+                    .iter()
+                    .map(Segment::file_size)
+                    .sum::<u64>();
 
-            // if curr_level_size >= overshoot {
-            let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::<u64>();
+                // if curr_level_size < overshoot {
+                //     return None;
+                // }
 
-            let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect();
-            segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id()));
+                if hidden_set.is_blocked(curr_level_pull_in.iter().map(Segment::id)) {
+                    // IMPORTANT: Compaction is blocked because of other
+                    // on-going compaction
+                    return None;
+                }
 
-            let write_amp = (next_level_size as f32) / (curr_level_size as f32);
+                let next_level_size = window.iter().map(Segment::file_size).sum::<u64>();
 
+                // let compaction_bytes = curr_level_size + next_level_size;
+
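+                // Rough write amplification estimate: bytes that would have to
+                // be rewritten in the next level per byte that is moved down
+                // from the current level (a trivial move, handled above, is 0).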
+                #[allow(clippy::cast_precision_loss)]
+                let write_amp = (next_level_size as f32) / (curr_level_size as f32);
 
-            add_choice(Choice {
-                write_amp,
-                segment_ids,
-                can_trivial_move: false,
-            });
-            // }
-        }
-    }
+                Some((window, curr_level_pull_in, write_amp))
+            })
+            .min_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
+            .map(|(window, curr_level_pull_in, _)| {
+                let mut ids: HashSet<_> = window.iter().map(Segment::id).collect();
+                ids.extend(curr_level_pull_in.iter().map(Segment::id));
+                (ids, false)
+            })
+    } else {
+        None
     }
-
-    // NOTE: Find largest trivial move (if it exists)
-    for window in curr_run.shrinking_windows() {
-        let key_range = aggregate_run_key_range(window);
-
-        if let Some(next_run) = &next_run {
-            if next_run.get_overlapping(&key_range).next().is_none() {
-                add_choice(Choice {
-                    write_amp: 0.0,
-                    segment_ids: window.iter().map(Segment::id).collect(),
-                    can_trivial_move: true,
-                });
-                break;
-            }
-        } else {
-            add_choice(Choice {
-                write_amp: 0.0,
-                segment_ids: window.iter().map(Segment::id).collect(),
-                can_trivial_move: true,
-            });
-            break;
-        }
-    }
-
-    let minimum_effort_choice = choices.into_iter().min_by(|a, b| {
-        a.write_amp
-            .partial_cmp(&b.write_amp)
-            .unwrap_or(std::cmp::Ordering::Equal)
-    });
-
-    minimum_effort_choice.map(|c| (c.segment_ids, c.can_trivial_move))
 }

From 2cd07bf92321f568d2b1f31afd6d93aed9e0ac5b Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 18 Aug 2025 16:02:51 +0200
Subject: [PATCH 330/613] wip

---
 Cargo.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2fabcefa..d4327013 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,7 +47,6 @@ nanoid = "0.4.0"
 rand = "0.9.2"
 test-log = "0.2.18"
 
-
 [package.metadata.cargo-all-features]
 denylist = []

From 8795cb70171e1d60ac1a84113d215aa978bfe51d Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 18 Aug 2025 16:03:22 +0200
Subject: [PATCH 331/613] fix: lifetime

---
 src/version/run.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/version/run.rs b/src/version/run.rs
index c41d3467..322534a4 100644
--- a/src/version/run.rs
+++ b/src/version/run.rs
@@ -121,7 +121,7 @@ impl<T: Ranged> Run<T> {
     /// Returns the sub slice of segments in the run that have
     /// a key range fully contained in the input key range.
- pub fn get_contained<'a>(&'a self, key_range: &'a KeyRange) -> &'a [T] { + pub fn get_contained<'a>(&'a self, key_range: &KeyRange) -> &'a [T] { fn trim_slice(s: &[T], pred: F) -> &[T] where F: Fn(&T) -> bool, From 78ae7ce867d00fa7535371edd3f0a2a32aac7d19 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:41:07 +0200 Subject: [PATCH 332/613] adjust bloom_speed bench --- microbench/bloom_speed/Cargo.toml | 2 +- microbench/bloom_speed/run.nu | 1 - microbench/bloom_speed/template.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/microbench/bloom_speed/Cargo.toml b/microbench/bloom_speed/Cargo.toml index 1fc03e0c..ba88f204 100644 --- a/microbench/bloom_speed/Cargo.toml +++ b/microbench/bloom_speed/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "bloom_bench" +name = "bloom_speed" version = "0.1.0" edition = "2024" diff --git a/microbench/bloom_speed/run.nu b/microbench/bloom_speed/run.nu index 92a918e1..36f75fca 100644 --- a/microbench/bloom_speed/run.nu +++ b/microbench/bloom_speed/run.nu @@ -2,4 +2,3 @@ rm -f data.jsonl cargo run -r | save data.jsonl --append cargo run -r --features use_unsafe | save data.jsonl --append python3 template.py - diff --git a/microbench/bloom_speed/template.py b/microbench/bloom_speed/template.py index 05d03057..83b995f8 100644 --- a/microbench/bloom_speed/template.py +++ b/microbench/bloom_speed/template.py @@ -2,9 +2,9 @@ import matplotlib.pyplot as plt from collections import defaultdict from pathlib import Path -from palettable.tableau import BlueRed_6 +from palettable.tableau import PurpleGray_6 -colors = BlueRed_6.mpl_colors +colors = PurpleGray_6.mpl_colors # Path to the JSONL file jsonl_path = Path('data.jsonl') From dc444b32f34f1e399cadfd237122e241205192e1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:41:36 +0200 Subject: [PATCH 333/613] microbench: hash fns --- microbench/hash_fns/.gitignore | 1 + microbench/hash_fns/Cargo.toml | 21 +++++ microbench/hash_fns/run.nu | 2 + microbench/hash_fns/src/lib.rs | 150 ++++++++++++++++++++++++++++++++ microbench/hash_fns/src/main.rs | 102 ++++++++++++++++++++++ microbench/hash_fns/template.py | 60 +++++++++++++ 6 files changed, 336 insertions(+) create mode 100644 microbench/hash_fns/.gitignore create mode 100644 microbench/hash_fns/Cargo.toml create mode 100644 microbench/hash_fns/run.nu create mode 100644 microbench/hash_fns/src/lib.rs create mode 100644 microbench/hash_fns/src/main.rs create mode 100644 microbench/hash_fns/template.py diff --git a/microbench/hash_fns/.gitignore b/microbench/hash_fns/.gitignore new file mode 100644 index 00000000..291a5fe2 --- /dev/null +++ b/microbench/hash_fns/.gitignore @@ -0,0 +1 @@ +data.jsonl diff --git a/microbench/hash_fns/Cargo.toml b/microbench/hash_fns/Cargo.toml new file mode 100644 index 00000000..1b9c3654 --- /dev/null +++ b/microbench/hash_fns/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "hash_bench" +version = "0.1.0" +edition = "2024" + +[lib] + +[dependencies] +cityhasher = "0.1.0" +fnv = "1.0.7" +foldhash = "0.1.5" +fxhash = "0.2.1" +gxhash = "3.5.0" +metrohash = "1.0.7" +rand = "0.9.1" +rapidhash = "3.0.0" +rustc-hash = "2.1.1" +seahash = "4.1.0" +wyhash = "0.5.0" +xxhash-rust = { version = "0.8.15", features = ["xxh3", "xxh64"] } +twox-hash = { version = "2.1.0" } diff --git a/microbench/hash_fns/run.nu b/microbench/hash_fns/run.nu new file mode 100644 index 00000000..c45a4c16 --- /dev/null +++ b/microbench/hash_fns/run.nu @@ -0,0 +1,2 @@ +RUSTFLAGS="-C target-cpu=native" cargo run -r + diff 
--git a/microbench/hash_fns/src/lib.rs b/microbench/hash_fns/src/lib.rs new file mode 100644 index 00000000..38dceb15 --- /dev/null +++ b/microbench/hash_fns/src/lib.rs @@ -0,0 +1,150 @@ +use std::hash::{BuildHasher, Hasher}; + +/// Calculates a 64-bit hash from a byte slice. +pub trait Hash64 { + /// Gets the readable hash function name (e.g. "metrohash") + fn name(&self) -> &'static str; + + /// Hashes a byte slice to a 64-bit digest + fn hash64(&self, bytes: &[u8]) -> u64; +} + +pub struct Fnv; +impl Hash64 for Fnv { + fn name(&self) -> &'static str { + "FNV" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + let mut hasher = fnv::FnvHasher::default(); + hasher.write(bytes); + hasher.finish() + } +} + +pub struct Xxh64; +impl Hash64 for Xxh64 { + fn name(&self) -> &'static str { + "XXH64" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + let mut hasher = xxhash_rust::xxh64::Xxh64::default(); + hasher.write(bytes); + hasher.finish() + } +} + +pub struct Xxh3; +impl Hash64 for Xxh3 { + fn name(&self) -> &'static str { + "XXH3" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + xxhash_rust::xxh3::xxh3_64(bytes) + } +} + +pub struct Xxh3_B; +impl Hash64 for Xxh3_B { + fn name(&self) -> &'static str { + "XXH3_B" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + twox_hash::XxHash3_64::oneshot(bytes) + } +} + +pub struct CityHash; +impl Hash64 for CityHash { + fn name(&self) -> &'static str { + "CityHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + cityhasher::hash(bytes) + } +} + +pub struct MetroHash; +impl Hash64 for MetroHash { + fn name(&self) -> &'static str { + "MetroHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + let mut hasher = metrohash::MetroHash64::default(); + hasher.write(bytes); + hasher.finish() + } +} + +pub struct WyHash; +impl Hash64 for WyHash { + fn name(&self) -> &'static str { + "WyHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + wyhash::wyhash(bytes, 0) + } +} + +pub struct RapidHash; +impl Hash64 for RapidHash { + fn name(&self) -> &'static str { + "RapidHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + let mut hasher = rapidhash::fast::RapidHasher::default(); + hasher.write(bytes); + hasher.finish() + } +} + +pub struct SeaHash; +impl Hash64 for SeaHash { + fn name(&self) -> &'static str { + "SeaHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + seahash::hash(bytes) + } +} + +pub struct RustcHash; +impl Hash64 for RustcHash { + fn name(&self) -> &'static str { + "RustcHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + rustc_hash::FxBuildHasher::default().hash_one(bytes) + } +} + +pub struct FxHash; +impl Hash64 for FxHash { + fn name(&self) -> &'static str { + "FxHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + fxhash::hash64(bytes) + } +} + +pub struct GxHash; +impl Hash64 for GxHash { + fn name(&self) -> &'static str { + "GxHash" + } + + fn hash64(&self, bytes: &[u8]) -> u64 { + gxhash::gxhash64(bytes, 123) + } +} diff --git a/microbench/hash_fns/src/main.rs b/microbench/hash_fns/src/main.rs new file mode 100644 index 00000000..55d88742 --- /dev/null +++ b/microbench/hash_fns/src/main.rs @@ -0,0 +1,102 @@ +use hash_bench::*; +use rand::RngCore; +use std::{path::Path, time::Instant}; + +fn main() { + let hashers: &[&dyn Hash64] = &[ + /* NOTE: GxHash needs AES instructions and a manual build flag, so a bit annoying to compile + but it's very fast: + RUSTFLAGS="-C target-cpu=native" cargo run -r + */ + // &GxHash, + &Xxh64, + &Xxh3, + // &Xxh3_B, // NOTE: twox-hash is slower than xxhash-rust + &RapidHash, &CityHash, 
&MetroHash, + &WyHash, + // &Fnv, + // &RustcHash, // NOTE: rustc_hash is supposedly stable, but stability is a non-goal: https://github.com/rust-lang/rustc-hash/pull/56#issuecomment-2667670854 + // &FxHash, // NOTE: ^ same for fxhash + // &SeaHash, // NOTE: seahash is pretty slow + ]; + + let mut rng = rand::rng(); + + let mut output = Vec::with_capacity(hashers.len()); + + for hasher in hashers { + for (byte_len, invocations) in [ + (4, 1_000_000_000), + (8, 1_000_000_000), + (16, 1_000_000_000), + (32, 1_000_000_000), + (64, 1_000_000_000), + (128, 500_000_000), + (256, 250_000_000), + (512, 125_000_000), + (1_024, 64_000_000), + (4 * 1_024, 16_000_000), + (8 * 1_024, 8_000_000), + (16 * 1_024, 4_000_000), + (32 * 1_024, 2_000_000), + (64 * 1_024, 1_000_000), + ] { + let invocations = if hasher.name() == "FNV" { + invocations / 4 / 10 + } else { + invocations / 4 + }; + + let mut bytes = vec![0; byte_len]; + rng.fill_bytes(&mut bytes); + eprint!("{} ({} bytes): ", hasher.name(), bytes.len()); + + let start = Instant::now(); + for _ in 0..invocations { + std::hint::black_box(hasher.hash64(&bytes)); + } + let elapsed = start.elapsed(); + let ns = elapsed.as_nanos(); + let per_call = ns as f64 / invocations as f64; + + eprintln!("{elapsed:?} - {per_call}ns per invocation"); + + output.push(format!( + "{{\"hash\": {:?},\"byte_len\": {byte_len}, \"ns\": {per_call}}}", + hasher.name(), + )); + } + + eprintln!(); + } + + { + let output = Path::new("hash.png"); + + if output.exists() { + std::fs::remove_file(&output).unwrap(); + } + } + + { + let data = output.join("\n"); + + let template = std::fs::read_to_string("template.py").unwrap(); + let template = template.replace("% data %", &data); + std::fs::write("tmp.py", &template).unwrap(); + std::fs::write("data.jsonl", &data).unwrap(); + } + + { + let status = std::process::Command::new("python3") + .arg("tmp.py") + .status() + .unwrap(); + + std::fs::remove_file("tmp.py").unwrap(); + + assert!(status.success(), "python failed"); + } + + // TODO: bench conflicts 1B keys - if >0, create matplotlib python image as well, also save JSON +} diff --git a/microbench/hash_fns/template.py b/microbench/hash_fns/template.py new file mode 100644 index 00000000..b042dc3d --- /dev/null +++ b/microbench/hash_fns/template.py @@ -0,0 +1,60 @@ +import matplotlib.pyplot as plt +import json +from palettable.tableau import BlueRed_6 +from pathlib import Path + +colors = BlueRed_6.mpl_colors + +data = Path('data.jsonl').read_text() + +# Parse the data +data_list = [json.loads(line) for line in data.strip().split('\n')] + +# Calculate throughput (hashes per second) +for entry in data_list: + # Convert ns to seconds and calculate throughput (1 hash per measurement) + time_in_seconds = entry["ns"] / 1e9 + entry["throughput"] = 1 / time_in_seconds # 1 hash / time in seconds + +# Group data by hash type +grouped_data = {} +for entry in data_list: + hash_type = entry["hash"] + if hash_type not in grouped_data: + grouped_data[hash_type] = {"byte_len": [], "throughput": []} + grouped_data[hash_type]["byte_len"].append(entry["byte_len"]) + grouped_data[hash_type]["throughput"].append(entry["throughput"]) + +plt.rcParams.update({ + 'axes.labelsize': 8, + 'font.size': 8, + 'legend.fontsize': 10, + 'xtick.labelsize': 10, + 'ytick.labelsize': 10, + 'text.usetex': False, + 'figure.figsize': [4.5, 4.5] +}) + +# Create the plot +plt.figure(figsize=(6, 4)) + +i = 0 +markers = ["*", "o", "d", ".", "v", "^"] + +for hash_type, values in grouped_data.items(): + 
plt.plot(values["byte_len"], values["throughput"], marker=markers[i], + linestyle='-', label=hash_type, color=colors[i]) + i += 1 + +plt.xlabel("Input length [bytes]") +plt.ylabel("Throughput [op/s]") + +plt.xscale('log') +plt.yscale('log') + +plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=3) +plt.grid(color="0.9", linestyle='--', linewidth=1) +plt.tight_layout() + +# Save the plot to a file +plt.savefig("hash_fns.svg") From e986eb4cc73f4a39fd760d96c3b7954fc1c10b84 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:41:54 +0200 Subject: [PATCH 334/613] microbench: fractional cascading in segment indexing --- microbench/fractional_cascading/Cargo.toml | 14 ++ microbench/fractional_cascading/run.nu | 8 + microbench/fractional_cascading/run.py | 69 +++++++ microbench/fractional_cascading/src/main.rs | 190 ++++++++++++++++++++ microbench/fractional_cascading/template.py | 46 +++++ 5 files changed, 327 insertions(+) create mode 100644 microbench/fractional_cascading/Cargo.toml create mode 100644 microbench/fractional_cascading/run.nu create mode 100644 microbench/fractional_cascading/run.py create mode 100644 microbench/fractional_cascading/src/main.rs create mode 100644 microbench/fractional_cascading/template.py diff --git a/microbench/fractional_cascading/Cargo.toml b/microbench/fractional_cascading/Cargo.toml new file mode 100644 index 00000000..67da0adc --- /dev/null +++ b/microbench/fractional_cascading/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "fractional_bench" +version = "0.1.0" +edition = "2024" + +[features] +default = ["fast_partition_point"] +cascading = [] +fast_partition_point = [] +use_unsafe = ["lsm-tree/use_unsafe"] + +[dependencies] +lsm-tree = { path = "../.." } +rand = "0.9.1" diff --git a/microbench/fractional_cascading/run.nu b/microbench/fractional_cascading/run.nu new file mode 100644 index 00000000..f57c5e71 --- /dev/null +++ b/microbench/fractional_cascading/run.nu @@ -0,0 +1,8 @@ +#/bin/nu + +rm -f data.jsonl +cargo run -r | save data.jsonl --append +cargo run -r --features use_unsafe | save data.jsonl --append +cargo run -r --features cascading | save data.jsonl --append +cargo run -r --features cascading,use_unsafe | save data.jsonl --append +python3 template.py diff --git a/microbench/fractional_cascading/run.py b/microbench/fractional_cascading/run.py new file mode 100644 index 00000000..97b25ad7 --- /dev/null +++ b/microbench/fractional_cascading/run.py @@ -0,0 +1,69 @@ +import itertools +import subprocess + +features_list = [ + "cascading", + # "fast_partition_point", + "use_unsafe" +] + +def run_with_features(features): + """ + Constructs the cargo command and runs it with the specified features. + Prints the command being run and its output. 
+ """ + if not features: + # Handle the case of no features (though technically a combination of size 0) + # If you want to run with no features, you might adjust this + features_arg = "" + command = ["cargo", "run", "-r"] + # print("--- Running command: cargo run -r (no features) ---") + else: + features_arg = ",".join(features) + command = ["cargo", "run", "-r", "--features", features_arg] + # print(f"--- Running command: {' '.join(command)} ---") + + try: + # Run the command and capture output + result = subprocess.run( + command, + capture_output=True, + text=True, # Capture output as text + check=True # Raise an exception if the command fails + ) + # print("--- Output ---") + print(result.stdout.strip()) + # if result.stderr: + # print("--- Stderr ---") + # print(result.stderr) + + except subprocess.CalledProcessError as e: + print(f"--- Command failed with error: {e} ---") + print(f"--- Stderr ---") + print(e.stderr) + except FileNotFoundError: + print("--- Error: 'cargo' command not found. Is Cargo installed and in your PATH? ---") + except Exception as e: + print(f"--- An unexpected error occurred: {e} ---") + +if __name__ == "__main__": + # Generate combinations of different lengths (from 1 to the total number of features) + all_combinations = [] + + for i in range(1, len(features_list) + 1): + combinations_of_length_i = itertools.combinations(features_list, i) + all_combinations.extend(list(combinations_of_length_i)) + + all_combinations.append(tuple()) + all_combinations.sort(key=len) + + # Include the case with no features (an empty combination) if desired + # all_combinations.append(tuple()) # Uncomment this line to include running with no features + + # Loop over each combination + for combination in all_combinations: + # Run the cargo command with the current combination's features + # Convert the tuple to a list for the join operation + run_with_features(list(combination)) + # print("\n" + "="*50 + "\n") # Separator for clarity + diff --git a/microbench/fractional_cascading/src/main.rs b/microbench/fractional_cascading/src/main.rs new file mode 100644 index 00000000..b5ce1ba9 --- /dev/null +++ b/microbench/fractional_cascading/src/main.rs @@ -0,0 +1,190 @@ +use lsm_tree::{KeyRange, UserKey}; +use rand::Rng; +use std::{sync::Arc, time::Instant}; + +#[cfg(feature = "fast_partition_point")] +pub fn partition_point(slice: &[T], pred: F) -> usize +where + F: Fn(&T) -> bool, +{ + let mut left = 0; + let mut right = slice.len(); + + if right == 0 { + return 0; + } + + while left < right { + let mid = (left + right) / 2; + + // SAFETY: See https://github.com/rust-lang/rust/blob/ebf0cf75d368c035f4c7e7246d203bd469ee4a51/library/core/src/slice/mod.rs#L2834-L2836 + #[warn(unsafe_code)] + #[cfg(feature = "use_unsafe")] + let item = unsafe { slice.get_unchecked(mid) }; + + #[cfg(not(feature = "use_unsafe"))] + let item = slice.get(mid).unwrap(); + + if pred(item) { + left = mid + 1; + } else { + right = mid; + } + } + + left +} + +pub fn get_segment_containing_key(segments: &[Arc], key: &[u8]) -> Option> { + #[cfg(feature = "fast_partition_point")] + let idx = partition_point(segments, |segment| segment.key_range.max() < &key); + + #[cfg(not(feature = "fast_partition_point"))] + let idx = segments.partition_point(|segment| segment.key_range.max() < &key); + + segments + .get(idx) + .filter(|x| x.key_range.min() <= &key) + .cloned() +} + +#[derive(Clone, Debug)] +struct Segment { + // id: String, + is_lmax: bool, + key_range: KeyRange, + next: (u32, u32), +} + +fn run(num_sst: 
+fn run(num_sst: usize) {
+    eprintln!("Benchmarking {num_sst} SSTs");
+
+    let keys = (0..num_sst * 2)
+        .map(|x| x.to_be_bytes().to_vec())
+        .collect::<Vec<_>>();
+
+    let lowest_level = keys
+        .chunks(2)
+        .map(|x| KeyRange::new((UserKey::new(&x[0]), UserKey::new(&x[1]))))
+        .enumerate()
+        .map(|(idx, key_range)| {
+            Arc::new(Segment {
+                // id: format!("Lmax-{idx}"),
+                is_lmax: true,
+                key_range,
+                next: (u32::MAX, u32::MAX),
+            })
+        })
+        .collect::<Vec<_>>();
+
+    let mut levels = vec![lowest_level];
+
+    for _ in 0..10 {
+        let next_level = &levels[0];
+
+        if next_level.len() <= 10 {
+            break;
+        }
+
+        let new_upper_level = next_level
+            .chunks(10)
+            .enumerate()
+            .map(|(idx, x)| {
+                let idx = idx as u32;
+                let key_range = KeyRange::aggregate(x.iter().map(|x| &x.key_range));
+                Arc::new(Segment {
+                    // id: format!("L3-{idx}"),
+                    is_lmax: false,
+                    key_range,
+                    next: (idx * 10, idx * 10 + 9),
+                })
+            })
+            .collect::<Vec<_>>();
+
+        levels.insert(0, new_upper_level);
+    }
+
+    for (idx, level) in levels.iter().enumerate() {
+        eprintln!("L{:?} = {}", idx + 1, level.len());
+    }
+
+    let mut rng = rand::rng();
+
+    const RUNS: usize = 20_000_000;
+
+    let start = Instant::now();
+
+    for _ in 0..RUNS {
+        let idx = rng.random_range(0..keys.len());
+        let key = &keys[idx];
+
+        // NOTE: Naive search
+        #[cfg(not(feature = "cascading"))]
+        {
+            for (_idx, level) in levels.iter().enumerate() {
+                let _segment = get_segment_containing_key(&level, &*key).unwrap();
+                // eprintln!("found {segment:?} in L{}", idx + 1);
+            }
+        }
+
+        // NOTE: Search with fractional cascading
+        #[cfg(feature = "cascading")]
+        {
+            let mut bounds: (u32, u32) = (u32::MAX, u32::MAX);
+
+            for (idx, level) in levels.iter().enumerate() {
+                let segment = if idx == 0 {
+                    get_segment_containing_key(&level, &*key).expect("should find segment")
+                } else {
+                    let (lo, hi) = bounds;
+                    let lo = lo as usize;
+                    let hi = hi as usize;
+
+                    #[cfg(feature = "use_unsafe")]
+                    let slice = unsafe { level.get_unchecked(lo..=hi) };
+
+                    #[cfg(not(feature = "use_unsafe"))]
+                    let slice = level.get(lo..=hi).unwrap();
+
+                    get_segment_containing_key(slice, &*key).expect("should find segment")
+                };
+                // eprintln!("found {segment:?} in L{}", idx + 1);
+
+                bounds = segment.next;
+            }
+        }
+    }
+
+    let elapsed = start.elapsed();
+    let ns = elapsed.as_nanos();
+    let per_run = ns / RUNS as u128;
+
+    #[cfg(feature = "cascading")]
+    let cascading = true;
+
+    #[cfg(not(feature = "cascading"))]
+    let cascading = false;
+
+    #[cfg(feature = "fast_partition_point")]
+    let fast_partition_point = true;
+
+    #[cfg(not(feature = "fast_partition_point"))]
+    let fast_partition_point = false;
+
+    #[cfg(feature = "use_unsafe")]
+    let used_unsafe = true;
+
+    #[cfg(not(feature = "use_unsafe"))]
+    let used_unsafe = false;
+
+    println!(
+        "{{\"lmax_ssts\": {num_sst}, \"ns\":{per_run}, \"unsafe\":{used_unsafe}, \"std_partition_point\":{}, \"cascading\":{cascading} }}",
+        !fast_partition_point,
+    );
+}
+
+fn main() {
+    for lmax_sst_count in [100, 500, 1_000, 2_000, 4_000, 10_000] {
+        run(lmax_sst_count);
+    }
+}
diff --git a/microbench/fractional_cascading/template.py b/microbench/fractional_cascading/template.py
new file mode 100644
index 00000000..cf600454
--- /dev/null
+++ b/microbench/fractional_cascading/template.py
@@ -0,0 +1,46 @@
+import matplotlib.pyplot as plt
+import json
+from palettable.tableau import PurpleGray_6
+from pathlib import Path
+
+colors = PurpleGray_6.mpl_colors
+
+data = Path("data.jsonl").read_text()
+
+# Parse the data
+data_list = [json.loads(line) for line in data.strip().split('\n')]
+
+# Organize data by boolean key
+from collections import defaultdict
+
+grouped = defaultdict(list)
+for entry in data_list:
+    key = (entry["unsafe"], entry["std_partition_point"], entry["cascading"])
+    grouped[key].append((entry["lmax_ssts"], entry["ns"]))
+
+# Plot
+plt.figure(figsize=(6, 4))
+
+markers = ["*", "o", "d", ".", "v", "^"]
+i = 0
+
+for key, values in grouped.items():
+    values.sort()
+    x = [v[0] for v in values]
+    y = [v[1] for v in values]
+    label = "Cascading" if key[2] else "No cascading"
+    label += " unsafe" if key[0] else ""
+    plt.plot(x, y, label=label, color=colors[i], marker=markers[i])
+    i += 1
+
+plt.xscale("log")
+
+plt.xlabel("Segments in last level")
+plt.ylabel("lookup latency [ns]")
+
+plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.10), shadow=True, ncol=2)
+plt.grid(color="0.9", linestyle='--', linewidth=1)
+plt.tight_layout()
+
+plt.savefig("segment_indexing.svg")

From 7262200f4bd340280dd54a33162fa46fd09fc85e Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 24 Aug 2025 18:42:09 +0200
Subject: [PATCH 335/613] microbench: bloom fpr

---
 microbench/bloom_fpr/Cargo.toml  | 11 ++++
 microbench/bloom_fpr/run.nu      |  3 ++
 microbench/bloom_fpr/src/main.rs | 90 +++++++++++++++++++++++++++++++
 microbench/bloom_fpr/template.py | 93 ++++++++++++++++++++++++++++++++
 4 files changed, 197 insertions(+)
 create mode 100644 microbench/bloom_fpr/Cargo.toml
 create mode 100644 microbench/bloom_fpr/run.nu
 create mode 100644 microbench/bloom_fpr/src/main.rs
 create mode 100644 microbench/bloom_fpr/template.py

diff --git a/microbench/bloom_fpr/Cargo.toml b/microbench/bloom_fpr/Cargo.toml
new file mode 100644
index 00000000..a5a99caa
--- /dev/null
+++ b/microbench/bloom_fpr/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "bloom_fpr"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+default = []
+
+[dependencies]
+lsm-tree = { path = "../..", features = ["lz4"] }
+rand = "0.9.0"
diff --git a/microbench/bloom_fpr/run.nu b/microbench/bloom_fpr/run.nu
new file mode 100644
index 00000000..7ee48ce9
--- /dev/null
+++ b/microbench/bloom_fpr/run.nu
@@ -0,0 +1,3 @@
+rm -f data.jsonl
+cargo run -r | save data.jsonl --append
+python3 template.py
diff --git a/microbench/bloom_fpr/src/main.rs b/microbench/bloom_fpr/src/main.rs
new file mode 100644
index 00000000..97b7cd42
--- /dev/null
+++ b/microbench/bloom_fpr/src/main.rs
@@ -0,0 +1,90 @@
+use rand::RngCore;
+use std::time::Instant;
+
+const NUM_READS: usize = 100_000_000;
+
+pub fn main() {
+    let mut rng = rand::rng();
+
+    let keys = (0..100_000_000u64)
+        .map(|x| x.to_be_bytes())
+        .collect::<Vec<_>>();
+
+    for fpr in [0.25, 0.1, 0.01, 0.001, 0.000_1, 0.000_01, 0.000_001] {
+        let n = keys.len();
+
+        {
+            use lsm_tree::segment::filter::standard_bloom::Builder;
+            use lsm_tree::segment::filter::standard_bloom::StandardBloomFilterReader as Reader;
+
+            let mut filter = Builder::with_fp_rate(n, fpr);
+
+            for key in &keys {
+                filter.set_with_hash(Builder::get_hash(key));
+            }
+
+            let filter_bytes = filter.build();
+            let filter = Reader::new(&filter_bytes).unwrap();
+
+            eprintln!("-- standard n={n} e={fpr} --");
+
+            {
+                let mut hits = 0;
+
+                for _ in 0..NUM_READS {
+                    let mut key = [0; 16];
+                    rng.fill_bytes(&mut key);
+                    let hash = Builder::get_hash(&key);
+
+                    if filter.contains_hash(hash) {
+                        hits += 1;
+                    }
+                }
+
+                let real_fpr = hits as f64 / NUM_READS as f64;
+
+                let filter_size_bytes = filter_bytes.len();
+                println!(
+                    r#"{{"real_fpr":{real_fpr},"key_count":{n},"target_fpr":{fpr},"impl":"standard","false_hits":{hits},"bytes":{filter_size_bytes}}}"#
+                );
+            }
+        }
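+
+        // NOTE: The blocked variant below confines all probes for a key to a
+        // single 64-byte block (one cache line), which should cost fewer cache
+        // misses at the price of a slightly higher false positive rate and
+        // some extra space; this run measures the FPR difference empirically.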
+ { + use lsm_tree::segment::filter::blocked_bloom::Builder; + use lsm_tree::segment::filter::blocked_bloom::BlockedBloomFilterReader as Reader; + + let mut filter = Builder::with_fp_rate(n, fpr); + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let filter_bytes = filter.build(); + let filter = Reader::new(&filter_bytes).unwrap(); + + eprintln!("-- blocked n={n} e={fpr} --"); + + { + let mut hits = 0; + + for _ in 0..NUM_READS { + let mut key = [0; 16]; + rng.fill_bytes(&mut key); + let hash = Builder::get_hash(&key); + + if filter.contains_hash(hash) { + hits += 1; + } + } + + let real_fpr = hits as f64 / NUM_READS as f64; + + let filter_size_bytes = filter_bytes.len(); + println!( + r#"{{"real_fpr":{real_fpr},"key_count":{n},"target_fpr":{fpr},"impl":"blocked","false_hits":{hits},"bytes":{filter_size_bytes}}}"# + ); + } + } + } +} diff --git a/microbench/bloom_fpr/template.py b/microbench/bloom_fpr/template.py new file mode 100644 index 00000000..11e84d70 --- /dev/null +++ b/microbench/bloom_fpr/template.py @@ -0,0 +1,93 @@ +import json +import matplotlib.pyplot as plt +from collections import defaultdict +from pathlib import Path +from palettable.tableau import PurpleGray_6 + +colors = PurpleGray_6.mpl_colors + +jsonl_path = Path('data.jsonl') + +fpr_data = defaultdict(list) +size_data = defaultdict(list) + +for line in jsonl_path.read_text().splitlines(): + obj = json.loads(line) + impl = obj['impl'] + fpr_data[impl].append((obj['target_fpr'], obj['real_fpr'])) + size_data[impl].append((obj['target_fpr'], obj['bytes'])) + +plt.rcParams.update({ + 'axes.labelsize': 8, + 'font.size': 8, + 'legend.fontsize': 10, + 'xtick.labelsize': 10, + 'ytick.labelsize': 10, + 'text.usetex': False, + 'figure.figsize': [4.5, 4.5] +}) + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) + +# --- Plot 1: Real FPR vs Target FPR --- +i = 0 + +for impl, values in fpr_data.items(): + values.sort() + x_vals = [x for x, y in values] + y_vals = [y for x, y in values] + marker = "v" if impl == "blocked" else "o" + label = impl + ax1.plot(x_vals, y_vals, marker=marker, label=label, color=colors[i], linestyle="-") + i += 1 + +# --- Plot 2: Filter Size vs Target FPR --- +i = 0 +for impl, values in size_data.items(): + values.sort() + x_vals = [x for x, y in values] + y_vals = [y / 1_024 / 1_024 for x, y in values] + marker = "v" if impl == "blocked" else "o" + ax2.plot(x_vals, y_vals, marker=marker, label=impl, color=colors[i], linestyle="-") + i += 1 + +# --- Secondary Y-axis: Size difference --- +ax2b = ax2.twinx() + +# Compute difference (impls[1] - impls[0]) assuming same target_fpr +impl1_vals = sorted(size_data["standard"]) +impl2_vals = sorted(size_data["blocked"]) + +# Make sure lengths and x-values match +percent_diff_x = [] +percent_diff_y = [] +for (x1, y1), (x2, y2) in zip(impl1_vals, impl2_vals): + percent_diff_x.append(x1) + percent_diff_y.append(100.0 * (y2 - y1) / y1) + +ax2b.plot(percent_diff_x, percent_diff_y, color='#a0a0a0', linestyle='dotted', marker='x', label="Diff") +ax2b.set_ylabel("Size difference [%]") +ax2b.invert_yaxis() +ax2b.set_ylim(top=0, bottom=33) +ax2b.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{int(x)}")) + +ax1.set_title("A", loc='left') +ax1.set_xscale("log") +ax1.set_yscale("log") +ax1.set_xlabel("Target false positive rate") +ax1.set_ylabel("Real false positive rate") +ax1.grid(color="0.9", linestyle='--', linewidth=1) +ax1.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2) + 
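+# Panel B plots the absolute filter size per target FPR; the dotted overlay on
+# the secondary axis is the relative size difference between the two variants,
+# computed above as 100 * (blocked - standard) / standard per target FPR.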
+ax2.set_title("B", loc='left') +ax2.set_xscale("log") +ax2.set_ylim(bottom=0) +ax2.set_xlabel("Target false positive rate") +ax2.set_ylabel("Filter size [MiB]") +ax2.grid(color="0.9", linestyle='--', linewidth=1) +lines1, labels1 = ax2.get_legend_handles_labels() +lines2, labels2 = ax2b.get_legend_handles_labels() +ax2b.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2) + +plt.tight_layout() +plt.savefig("bloom_fpr.svg") From 2f1d05723af26055fe2b6174955862d55374f0db Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:42:20 +0200 Subject: [PATCH 336/613] microbench: block loading --- microbench/block_load/.gitignore | 1 + microbench/block_load/Cargo.toml | 19 ++++++++ microbench/block_load/run.nu | 4 ++ microbench/block_load/src/main.rs | 75 +++++++++++++++++++++++++++++++ microbench/block_load/template.py | 68 ++++++++++++++++++++++++++++ 5 files changed, 167 insertions(+) create mode 100644 microbench/block_load/.gitignore create mode 100644 microbench/block_load/Cargo.toml create mode 100644 microbench/block_load/run.nu create mode 100644 microbench/block_load/src/main.rs create mode 100644 microbench/block_load/template.py diff --git a/microbench/block_load/.gitignore b/microbench/block_load/.gitignore new file mode 100644 index 00000000..48cf2ad1 --- /dev/null +++ b/microbench/block_load/.gitignore @@ -0,0 +1 @@ +block diff --git a/microbench/block_load/Cargo.toml b/microbench/block_load/Cargo.toml new file mode 100644 index 00000000..e16002a0 --- /dev/null +++ b/microbench/block_load/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "block_load_bench" +version = "1.0.0" +edition = "2021" +publish = false + +[profile.release] +debug = true + +[features] +default = [] +use_unsafe = ["lsm-tree/use_unsafe"] + +[dependencies] +env_logger = "0.11.8" +lsm-tree = { path = "../..", features = ["lz4"] } +lz4_flex = "0.11.3" +rand = "0.9" +serde_json = "1.0.140" diff --git a/microbench/block_load/run.nu b/microbench/block_load/run.nu new file mode 100644 index 00000000..11240364 --- /dev/null +++ b/microbench/block_load/run.nu @@ -0,0 +1,4 @@ +rm -f data.jsonl +cargo run -r --features use_unsafe | save data.jsonl --append +cargo run -r --no-default-features | save data.jsonl --append +python3 template.py diff --git a/microbench/block_load/src/main.rs b/microbench/block_load/src/main.rs new file mode 100644 index 00000000..0a8acb63 --- /dev/null +++ b/microbench/block_load/src/main.rs @@ -0,0 +1,75 @@ +use lsm_tree::{ + segment::{ + block::{Block, BlockType, Header as BlockHeader}, + BlockHandle, BlockOffset, DataBlock, + }, + CompressionType, InternalValue, +}; +use std::time::Instant; + +pub fn main() -> lsm_tree::Result<()> { + env_logger::Builder::from_default_env().init(); + + #[cfg(feature = "use_unsafe")] + let used_unsafe = true; + + #[cfg(not(feature = "use_unsafe"))] + let used_unsafe = false; + + for item_count in [100, 200, 400, 1_000, 2_000] { + let mut items = vec![]; + + for item in 0u64..item_count { + items.push(InternalValue::from_components( + item.to_be_bytes(), + b"1asdabawerbwqerbqwr", + 0, + lsm_tree::ValueType::Value, + )); + } + + let mut file = std::fs::File::create("block")?; + + let bytes = DataBlock::encode_into_vec(&items, 16, 1.33)?; + let header = Block::write_into(&mut file, &bytes, BlockType::Data, CompressionType::None)?; + let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; + + file.sync_all()?; + drop(file); + + { + let file = 
std::fs::File::open("block")?; + + { + const NUM_RUNS: u128 = 5_000_000; + + let start = Instant::now(); + for _ in 0..NUM_RUNS { + let _block = lsm_tree::segment::Block::from_file( + &file, + BlockHandle::new(BlockOffset(0), bytes_written as u32), + BlockType::Data, + CompressionType::None, + )?; + } + + let rps_ns = { + let ns = start.elapsed().as_nanos(); + ns / NUM_RUNS + }; + + println!( + "{}", + serde_json::json!({ + "block_size": bytes.len(), + "rps_ns": rps_ns, + "unsafe": used_unsafe, + }) + .to_string(), + ); + } + } + } + + Ok(()) +} diff --git a/microbench/block_load/template.py b/microbench/block_load/template.py new file mode 100644 index 00000000..a0ffb812 --- /dev/null +++ b/microbench/block_load/template.py @@ -0,0 +1,68 @@ +import matplotlib.pyplot as plt +import json +from palettable.tableau import PurpleGray_6 +from pathlib import Path + +colors = PurpleGray_6.mpl_colors + +data = Path('data.jsonl').read_text() + +# Parse the data +data_list = [json.loads(line) for line in data.strip().split('\n')] + +# Separate data based on the 'unsafe' field +safe_data = [item for item in data_list if not item["unsafe"]] +unsafe_data = [item for item in data_list if item["unsafe"]] + +# Extract x and y values for each category +safe_block_sizes = [item["block_size"] for item in safe_data] +safe_latencies = [item["rps_ns"] for item in safe_data] + +unsafe_block_sizes = [item["block_size"] for item in unsafe_data] +unsafe_latencies = [item["rps_ns"] for item in unsafe_data] + +plt.rcParams.update({ + 'axes.labelsize': 8, + 'font.size': 8, + 'legend.fontsize': 10, + 'xtick.labelsize': 10, + 'ytick.labelsize': 10, + 'text.usetex': False, + 'figure.figsize': [4.5, 4.5] +}) + +# Create the plot +plt.figure(figsize=(6, 4)) + +# Plot the data for 'unsafe' = False +plt.plot( + safe_block_sizes, + safe_latencies, + marker="o", + linestyle="-", + label="safe", + color=colors[0], +) + +# Plot the data for 'unsafe' = True +plt.plot( + unsafe_block_sizes, + unsafe_latencies, + marker="s", + linestyle="--", + label="unsafe", + color=colors[1], +) + +# Add labels and title +plt.xscale("log") +plt.yscale("log") +# plt.ylim(bottom=0) +plt.xlabel("Block size [bytes]") +plt.ylabel("Read latency [ns/op]") +plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.05), shadow=True, ncol=2) +plt.grid(color="0.9", linestyle='--', linewidth=1) +plt.tight_layout() + +# Show the plot +plt.savefig("block_load.svg") From 9551bb8e6e1a117e8040d31c6bb679edb48c88c2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:42:33 +0200 Subject: [PATCH 337/613] microbench: block hash index --- microbench/block_hash_index/Cargo.toml | 19 ++++ microbench/block_hash_index/run.nu | 5 + microbench/block_hash_index/src/main.rs | 143 ++++++++++++++++++++++++ microbench/block_hash_index/template.py | 64 +++++++++++ 4 files changed, 231 insertions(+) create mode 100644 microbench/block_hash_index/Cargo.toml create mode 100644 microbench/block_hash_index/run.nu create mode 100644 microbench/block_hash_index/src/main.rs create mode 100644 microbench/block_hash_index/template.py diff --git a/microbench/block_hash_index/Cargo.toml b/microbench/block_hash_index/Cargo.toml new file mode 100644 index 00000000..20976422 --- /dev/null +++ b/microbench/block_hash_index/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "block_hash_index_bench" +version = "1.0.0" +edition = "2021" +publish = false + +[profile.release] +debug = true + +[features] +default = [] +use_unsafe = ["lsm-tree/use_unsafe"] + +[dependencies] 
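+# NOTE: `use_unsafe` (declared above) forwards to `lsm-tree/use_unsafe`, which
+# lets run.nu build and compare the safe and unsafe variants of this benchmark.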
+env_logger = "0.11.8" +lsm-tree = { path = "../..", features = ["lz4"] } +lz4_flex = "0.11.3" +rand = "0.9" +serde_json = "1.0.140" diff --git a/microbench/block_hash_index/run.nu b/microbench/block_hash_index/run.nu new file mode 100644 index 00000000..92a918e1 --- /dev/null +++ b/microbench/block_hash_index/run.nu @@ -0,0 +1,5 @@ +rm -f data.jsonl +cargo run -r | save data.jsonl --append +cargo run -r --features use_unsafe | save data.jsonl --append +python3 template.py + diff --git a/microbench/block_hash_index/src/main.rs b/microbench/block_hash_index/src/main.rs new file mode 100644 index 00000000..4bf76eb3 --- /dev/null +++ b/microbench/block_hash_index/src/main.rs @@ -0,0 +1,143 @@ +use lsm_tree::segment::DataBlock; +use lsm_tree::{coding::Encode, InternalValue, ValueType}; +use rand::Rng; +use std::io::Write; +use std::time::Instant; + +pub fn main() -> lsm_tree::Result<()> { + env_logger::Builder::from_default_env().init(); + + #[cfg(feature = "use_unsafe")] + let use_unsafe = true; + + #[cfg(not(feature = "use_unsafe"))] + let use_unsafe = false; + + let mut rng = rand::rng(); + + let mut items = vec![]; + let item_count = 500; + + for item in 0u128..item_count { + items.push(InternalValue::from_components( + item.to_be_bytes(), + b"asevrasevfbss4b4n6tuziwernwawrbg", + 0, + lsm_tree::ValueType::Value, + )); + } + + for hash_ratio in [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0] { + // eprintln!("hash_ratio={hash_ratio}"); + + use lsm_tree::segment::{ + block::{BlockType, Header}, + BlockOffset, Checksum, DataBlock, + }; + + let bytes = DataBlock::encode_into_vec(&items, 16, hash_ratio)?; + // eprintln!("{bytes:?}"); + // eprintln!("{}", String::from_utf8_lossy(&bytes)); + // eprintln!("encoded into {} bytes", bytes.len()); + + { + use lsm_tree::segment::Block; + + let block = DataBlock::new(Block { + data: lsm_tree::Slice::new(&bytes), + header: Header { + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + block_type: BlockType::Data, + }, + }); + + /* eprintln!( + "hash index conflicts: {:?} / {:?}", + block.hash_bucket_conflict_count(), + block.hash_bucket_count(), + ); + eprintln!( + "hash index free slots: {:?} / {:?}", + block.hash_bucket_free_count(), + block.hash_bucket_count(), + ); */ + + { + const NUM_RUNS: u128 = 25_000_000; + + let start = Instant::now(); + for _ in 0..NUM_RUNS { + let needle = rng.random_range(0..item_count).to_be_bytes(); + block.point_read(&needle, u64::MAX).unwrap(); + } + + let rps_ns = { + let ns = start.elapsed().as_nanos(); + ns / NUM_RUNS + }; + + /* eprintln!("one read took {:?}ns",); */ + + println!( + "{}", + serde_json::json!({ + "block_size": bytes.len(), + "hash_ratio": format!("{hash_ratio:.1?}"), + "rps_ns": rps_ns, + "conflicts": block.get_hash_index_reader().map(|x| x.conflict_count()).unwrap_or_default(), + "free": block.get_hash_index_reader().map(|x| x.free_count()).unwrap_or_default(), + "use_unsafe": use_unsafe, + }) + .to_string(), + ); + } + + /* { + let start = Instant::now(); + for _ in 0..25_000 { + assert_eq!(items.len(), block.iter().count()); + } + + eprintln!("one iter() took {:?}ns", { + let ns = start.elapsed().as_nanos() as usize; + ns / 25_000 / items.len() + }); + } */ + + /* { + let start = Instant::now(); + for _ in 0..25_000 { + assert_eq!(items.len(), block.iter().rev().count()); + } + + eprintln!("one iter().rev() took {:?}ns", { + let ns = start.elapsed().as_nanos() as usize; + ns / 25_000 / items.len() + }); + } */ + } + + /* { 
+            let mut writer = vec![];
+            header.encode_into(&mut writer)?;
+            writer.write_all(&bytes)?;
+
+            eprintln!("V3 format (uncompressed): {}B", writer.len());
+        }
+
+        {
+            let mut writer = vec![];
+            header.encode_into(&mut writer)?;
+
+            let bytes = lz4_flex::compress_prepend_size(&bytes);
+            writer.write_all(&bytes)?;
+
+            eprintln!("V3 format (LZ4): {}B", writer.len());
+        } */
+    }
+
+    Ok(())
+}
diff --git a/microbench/block_hash_index/template.py b/microbench/block_hash_index/template.py
new file mode 100644
index 00000000..ae8796b5
--- /dev/null
+++ b/microbench/block_hash_index/template.py
@@ -0,0 +1,64 @@
+import json
+from pathlib import Path
+import matplotlib.pyplot as plt
+from palettable.tableau import PurpleGray_6
+
+colors = PurpleGray_6.mpl_colors
+
+# Path to your data file
+data_path = Path("data.jsonl")
+
+# Read and parse the data
+safe_data = []
+unsafe_data = []
+
+with data_path.open() as f:
+    for line in f:
+        if not line.strip():
+            continue
+
+        entry = json.loads(line)
+        if entry.get("use_unsafe") is False:
+            safe_data.append(entry)
+        else:
+            unsafe_data.append(entry)
+
+# Sort by hash_ratio to ensure smooth lines
+safe_data.sort(key=lambda x: x["hash_ratio"])
+unsafe_data.sort(key=lambda x: x["hash_ratio"])
+
+# Extract data for plotting
+hash_ratio_safe = [d["hash_ratio"] for d in safe_data]
+rps_ns_safe = [d["rps_ns"] for d in safe_data]
+block_size = [d["block_size"] for d in safe_data]
+
+hash_ratio_unsafe = [d["hash_ratio"] for d in unsafe_data]
+rps_ns_unsafe = [d["rps_ns"] for d in unsafe_data]
+
+# Create figure and first Y-axis
+fig, ax1 = plt.subplots(figsize=(6, 4))
+
+# Plot rps_ns (left Y-axis)
+ax1.plot(hash_ratio_safe, rps_ns_safe, label='Read latency (safe)', marker='o', color = colors[0])
+ax1.plot(hash_ratio_unsafe, rps_ns_unsafe, label='Read latency (unsafe)', marker='x', color = colors[1])
+ax1.set_xlabel('Hash ratio [bytes per KV]')
+ax1.set_ylabel('Point read latency [ns]')
+ax1.tick_params(axis='y')
+
+# Create second Y-axis for block size
+ax2 = ax1.twinx()
+ax2.plot(hash_ratio_safe, block_size, label='Block size', linestyle='--', marker='d', color = colors[2])
+ax2.set_ylabel('Block size [bytes]')
+ax2.tick_params(axis='y')
+
+# Combine legends from both axes
+lines1, labels1 = ax1.get_legend_handles_labels()
+lines2, labels2 = ax2.get_legend_handles_labels()
+ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=2)
+
+# Grid and title
+ax1.grid(color="0.9", linestyle='--', linewidth=1)
+# plt.title('Safe vs Unsafe: rps_ns and Block Size vs Hash Ratio')
+plt.tight_layout()
+
+plt.savefig("block_hash_index.svg")

From 59234872f6605e06f867709fe0cfb2a02e93bfc9 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 24 Aug 2025 18:42:44 +0200
Subject: [PATCH 338/613] microbench: block binary index

---
 microbench/block_bin_index/Cargo.toml         |  20 +++
 microbench/block_bin_index/run.nu             |   6 +
 microbench/block_bin_index/src/main.rs        | 169 ++++++++++++++++++
 microbench/block_bin_index/template.py        |  68 +++++++
 .../block_bin_index/template3d_space.py       |  37 ++++
 .../block_bin_index/template3d_speed.py       |  36 ++++
 6 files changed, 336 insertions(+)
 create mode 100644 microbench/block_bin_index/Cargo.toml
 create mode 100644 microbench/block_bin_index/run.nu
 create mode 100644 microbench/block_bin_index/src/main.rs
 create mode 100644 microbench/block_bin_index/template.py
 create mode 100644 microbench/block_bin_index/template3d_space.py
 create mode 100644 microbench/block_bin_index/template3d_speed.py

diff --git a/microbench/block_bin_index/Cargo.toml b/microbench/block_bin_index/Cargo.toml
new file mode 100644
index 00000000..b026ed17
--- /dev/null
+++ b/microbench/block_bin_index/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "block_bin_index_bench"
+version = "1.0.0"
+edition = "2021"
+publish = false
+
+[profile.release]
+debug = true
+
+[features]
+default = []
+use_unsafe = ["lsm-tree/use_unsafe"]
+
+[dependencies]
+env_logger = "0.11.8"
+lsm-tree = { path = "../..", features = ["lz4"] }
+lz4_flex = "0.11.3"
+rand = "0.9"
+serde_json = "1.0.140"
+scru128 = "3.1.0"
diff --git a/microbench/block_bin_index/run.nu b/microbench/block_bin_index/run.nu
new file mode 100644
index 00000000..ada20241
--- /dev/null
+++ b/microbench/block_bin_index/run.nu
@@ -0,0 +1,6 @@
+rm -f data.jsonl
+cargo run -r | save --append data.jsonl
+cargo run -r --features use_unsafe | save --append data.jsonl
+python3 template3d_speed.py
+python3 template3d_space.py
+python3 template.py
diff --git a/microbench/block_bin_index/src/main.rs b/microbench/block_bin_index/src/main.rs
new file mode 100644
index 00000000..81c1507c
--- /dev/null
+++ b/microbench/block_bin_index/src/main.rs
@@ -0,0 +1,169 @@
+use lsm_tree::InternalValue;
+use rand::{Rng, RngCore};
+
+// NOTE: both arguments are currently ignored; every call returns a fresh,
+// roughly time-ordered random scru128 ID as the key
+fn generate_key(_primary_key: u64, _secondary_key: u64) -> [u8; 16] {
+    scru128::new().into()
+}
+
+pub fn main() -> lsm_tree::Result<()> {
+    env_logger::Builder::from_default_env().init();
+
+    let mut rng = rand::rng();
+
+    #[cfg(feature = "use_unsafe")]
+    let used_unsafe = true;
+
+    #[cfg(not(feature = "use_unsafe"))]
+    let used_unsafe = false;
+
+    for item_count in [10, 50, 100, 250, 500, 1_000, 2_000, 4_000] {
+        let mut items = vec![];
+
+        {
+            let mut buf = [0u8; 16];
+
+            for item in 0u64..item_count {
+                let key = generate_key(item, 0);
+                rng.fill_bytes(&mut buf);
+
+                items.push(InternalValue::from_components(
+                    &key,
+                    &buf,
+                    0,
+                    lsm_tree::ValueType::Value,
+                ));
+            }
+        }
+
+        let intervals: &[u8] = if std::env::var("DEFAULT_RESTART_INTERVAL_ONLY").is_ok() {
+            &[16]
+        } else {
+            &[
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+            ]
+        };
+
+        for &restart_interval in intervals {
+            // eprintln!("restart_interval={restart_interval}");
+
+            use lsm_tree::segment::{
+                block::{BlockType, Header},
+                BlockOffset, Checksum, DataBlock,
+            };
+
+            let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
+            // eprintln!("{bytes:?}");
+            // eprintln!("{}", String::from_utf8_lossy(&bytes));
+            // eprintln!("encoded into {} bytes", bytes.len());
+
+            {
+                use lsm_tree::segment::Block;
+                use std::time::Instant;
+
+                let block = DataBlock::new(Block {
+                    data: lsm_tree::Slice::new(&bytes),
+                    header: Header {
+                        checksum: Checksum::from_raw(0),
+                        data_length: 0,
+                        uncompressed_length: 0,
+                        previous_block_offset: BlockOffset(0),
+                        block_type: BlockType::Data,
+                    },
+                });
+
+                /* eprintln!(
+                    "hash index conflicts: {:?} / {:?}",
+                    block.hash_bucket_conflict_count(),
+                    block.hash_bucket_count(),
+                );
+                eprintln!(
+                    "hash index free slots: {:?} / {:?}",
+                    block.hash_bucket_free_count(),
+                    block.hash_bucket_count(),
+                ); */
+
+                {
+                    const NUM_RUNS: u128 = 2_500_000;
+
+                    let start = Instant::now();
+                    for _ in 0..NUM_RUNS {
+                        let needle = rng.random_range(0..item_count as usize);
+                        let needle = &items[needle].key.user_key;
+
+                        let mut iter = block.iter();
+
+                        assert!(
+                            iter.seek(&needle /* TODO: , SeqNo::MAX */),
+                            "did not find key",
+                        );
+                        // block.point_read(&needle, None);
+                    }
+
+                    let rps_ns = {
+                        let ns = start.elapsed().as_nanos();
+                        ns / NUM_RUNS
+                    };
+
+                    /* eprintln!("one read took {:?}ns",); */
+
+                    println!(
+                        "{}",
+                        serde_json::json!({
+                            "block_size": bytes.len(),
+                            "restart_interval": restart_interval,
+                            "rps_ns": rps_ns,
+                            "item_count": item_count,
+                            "unsafe": used_unsafe,
+                        })
+                        .to_string(),
+                    );
+                }
+
+                /* {
+                    let start = Instant::now();
+                    for _ in 0..25_000 {
+                        assert_eq!(items.len(), block.iter().count());
+                    }
+
+                    eprintln!("one iter() took {:?}ns", {
+                        let ns = start.elapsed().as_nanos() as usize;
+                        ns / 25_000 / items.len()
+                    });
+                } */
+
+                /* {
+                    let start = Instant::now();
+                    for _ in 0..25_000 {
+                        assert_eq!(items.len(), block.iter().rev().count());
+                    }
+
+                    eprintln!("one iter().rev() took {:?}ns", {
+                        let ns = start.elapsed().as_nanos() as usize;
+                        ns / 25_000 / items.len()
+                    });
+                } */
+            }
+
+            /* {
+                let mut writer = vec![];
+                header.encode_into(&mut writer)?;
+                writer.write_all(&bytes)?;
+
+                eprintln!("V3 format (uncompressed): {}B", writer.len());
+            }
+
+            {
+                let mut writer = vec![];
+                header.encode_into(&mut writer)?;
+
+                let bytes = lz4_flex::compress_prepend_size(&bytes);
+                writer.write_all(&bytes)?;
+
+                eprintln!("V3 format (LZ4): {}B", writer.len());
+            } */
+        }
+    }
+
+    Ok(())
+}
diff --git a/microbench/block_bin_index/template.py b/microbench/block_bin_index/template.py
new file mode 100644
index 00000000..533936d0
--- /dev/null
+++ b/microbench/block_bin_index/template.py
@@ -0,0 +1,68 @@
+import json
+from pathlib import Path
+import matplotlib.pyplot as plt
+from palettable.tableau import PurpleGray_6
+
+colors = PurpleGray_6.mpl_colors
+
+# Path to your data file
+data_path = Path("data.jsonl")
+
+# Read and parse the data
+safe_data = []
+unsafe_data = []
+
+with data_path.open() as f:
+    for line in f:
+        if not line.strip():
+            continue
+
+        entry = json.loads(line)
+
+        if entry.get("item_count") != 1000:
+            continue
+
+        if entry.get("unsafe") is False:
+            safe_data.append(entry)
+        else:
+            unsafe_data.append(entry)
+
+# Sort by restart_interval to ensure smooth lines
+safe_data.sort(key=lambda x: x["restart_interval"])
+unsafe_data.sort(key=lambda x: x["restart_interval"])
+
+# Extract data for plotting
+restart_interval_safe = [d["restart_interval"] for d in safe_data]
+rps_ns_safe = [d["rps_ns"] for d in safe_data]
+block_size = [d["block_size"] for d in safe_data]
+
+restart_interval_unsafe = [d["restart_interval"] for d in unsafe_data]
+rps_ns_unsafe = [d["rps_ns"] for d in unsafe_data]
+
+# Create figure and first Y-axis
+fig, ax1 = plt.subplots(figsize=(6, 4))
+
+# Plot rps_ns (left Y-axis)
+ax1.plot(restart_interval_safe, rps_ns_safe, label='Read latency (safe)', marker='o', color = colors[0])
+ax1.plot(restart_interval_unsafe, rps_ns_unsafe, label='Read latency (unsafe)', marker='x', color = colors[1])
+ax1.set_xlabel('Restart interval')
+ax1.set_ylabel('Point read latency [ns]')
+ax1.tick_params(axis='y')
+
+# Create second Y-axis for block size
+ax2 = ax1.twinx()
+ax2.plot(restart_interval_safe, block_size, label='Block size', linestyle='--', marker='d', color = colors[2])
+ax2.set_ylabel('Block size [bytes]')
+ax2.tick_params(axis='y')
+
+# Combine legends from both axes
+lines1, labels1 = ax1.get_legend_handles_labels()
+lines2, labels2 = ax2.get_legend_handles_labels()
+ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=2)
+
+# Grid and title
+ax1.grid(color="0.9", linestyle='--', linewidth=1)
+# plt.title('Safe vs Unsafe: rps_ns and Block Size vs Hash Ratio')
+plt.tight_layout()
+
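+# NOTE: only entries with item_count == 1000 are plotted (filtered above), so
+# this figure shows the restart-interval trade-off at a fixed number of KV
+# tuples; the 3D templates sweep the full item_count range.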
+plt.savefig("block_binary_index.svg") diff --git a/microbench/block_bin_index/template3d_space.py b/microbench/block_bin_index/template3d_space.py new file mode 100644 index 00000000..dd18dac0 --- /dev/null +++ b/microbench/block_bin_index/template3d_space.py @@ -0,0 +1,37 @@ +from pathlib import Path +import json +import matplotlib.pyplot as plt + +# Read JSONL file using Path API +data_file = Path("data.jsonl") +lines = [line for line in data_file.read_text().splitlines() if line.strip()] +data_points = [json.loads(line) for line in lines] +filtered_data = [point for point in data_points if not point.get("unsafe", False)] + +# Extract the axes +x_vals = [point["item_count"] for point in filtered_data] +y_vals = [point["restart_interval"] for point in filtered_data] +z_vals = [point["block_size"] / 1024 for point in filtered_data] + +# Plotting +fig = plt.figure(figsize=(6, 4)) +ax = fig.add_subplot(111, projection='3d') + +trisurf = ax.plot_trisurf(x_vals, y_vals, z_vals, cmap='viridis', edgecolor='none', alpha=0.8) + +cbar = fig.colorbar(trisurf, ax=ax, pad=0.1, shrink=0.8, aspect=15) +cbar.set_label("", labelpad=10) + +ax.set_xlabel("# KV tuples") +ax.set_ylabel("Restart interval") +ax.set_zlabel("Block size [KiB]") + +ax.set_zlim(bottom=0) + +ax.invert_xaxis() +ax.invert_yaxis() + +fig.subplots_adjust(left=-0.3, right=0.99, top=0.99, bottom=0.08) + +# plt.tight_layout() +plt.savefig("binary_index_3d_space.svg") diff --git a/microbench/block_bin_index/template3d_speed.py b/microbench/block_bin_index/template3d_speed.py new file mode 100644 index 00000000..969f675e --- /dev/null +++ b/microbench/block_bin_index/template3d_speed.py @@ -0,0 +1,36 @@ +from pathlib import Path +import json +import matplotlib.pyplot as plt + +# Read JSONL file using Path API +data_file = Path("data.jsonl") +lines = [line for line in data_file.read_text().splitlines() if line.strip()] +data_points = [json.loads(line) for line in lines] +filtered_data = [point for point in data_points if not point.get("unsafe", False)] + +# Extract the axes +x_vals = [point["item_count"] for point in filtered_data] +y_vals = [point["restart_interval"] for point in filtered_data] +z_vals = [point["rps_ns"] for point in filtered_data] + +# Plotting +fig = plt.figure(figsize=(6, 4)) +ax = fig.add_subplot(111, projection='3d') + +trisurf = ax.plot_trisurf(x_vals, y_vals, z_vals, cmap='viridis', edgecolor='none', alpha=0.8) + +cbar = fig.colorbar(trisurf, ax=ax, pad=0.1, shrink=0.8, aspect=15) +cbar.set_label("", labelpad=10) + +ax.set_xlabel("# KV tuples") +ax.set_ylabel("Restart interval") +ax.set_zlabel("Read latency [ns]") + +ax.set_zlim(bottom=0) + +ax.invert_xaxis() + +fig.subplots_adjust(left=-0.3, right=0.99, top=0.99, bottom=0.08) + +# plt.tight_layout() +plt.savefig("binary_index_3d_speed.svg") From 6a3df3f6295d5d72ec0a7471f3154cfa81349853 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:42:50 +0200 Subject: [PATCH 339/613] remove old test case --- tests/bloom_fpr.rs | 112 --------------------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 tests/bloom_fpr.rs diff --git a/tests/bloom_fpr.rs b/tests/bloom_fpr.rs deleted file mode 100644 index da040140..00000000 --- a/tests/bloom_fpr.rs +++ /dev/null @@ -1,112 +0,0 @@ -// use lsm_tree::{ -// segment::filter::{ -// blocked_bloom::Builder as BlockedBloomBuilder, -// standard_bloom::Builder as StandardBloomBuilder, AMQ, -// }, -// Result, -// }; - -// // [Theoretical] FPR: 1.0000%, [Empirical] Standard Bloom FPR: 0.0002, Blocked 
Bloom FPR: 0.0313% -// // [Theoretical] FPR: 0.1000%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0303% -// // [Theoretical] FPR: 0.0100%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0287% -// // [Theoretical] FPR: 0.0010%, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0257% -// #[test] -// fn measure_bloom_fpr_with_fp_rate() -> Result<()> { -// let keys = (0..1_000_000u128) -// .map(|x| x.to_be_bytes().to_vec()) -// .collect::>(); - -// let non_existent_keys = (1_000_000..2_000_000u128) -// .map(|x| x.to_be_bytes().to_vec()) -// .collect::>(); - -// let n: usize = 5_000_000; - -// for fpr in [0.01, 0.001, 0.0001, 0.00001] { -// let mut blocked_builder = BlockedBloomBuilder::with_fp_rate(n, fpr); -// let mut standard_builder = StandardBloomBuilder::with_fp_rate(n, fpr); - -// for key in &keys { -// blocked_builder.set_with_hash(BlockedBloomBuilder::get_hash(key.as_slice())); -// standard_builder.set_with_hash(StandardBloomBuilder::get_hash(key.as_slice())); -// } - -// let blocked_filter = blocked_builder.build(); -// let standard_filter = standard_builder.build(); - -// let mut blocked_fp = 0; -// let mut standard_fp = 0; -// for non_existent_key in &non_existent_keys { -// if blocked_filter -// .contains_hash(BlockedBloomBuilder::get_hash(non_existent_key.as_slice())) -// { -// blocked_fp += 1; -// } -// if standard_filter -// .contains_hash(StandardBloomBuilder::get_hash(non_existent_key.as_slice())) -// { -// standard_fp += 1; -// } -// } - -// println!( -// "[Theoretical] FPR: {:.4}%, [Empirical] Standard Bloom FPR: {:.4}, Blocked Bloom FPR: {:.4}%", -// fpr * 100.0, -// (standard_fp as f64 / non_existent_keys.len() as f64) * 100.0, -// (blocked_fp as f64 / non_existent_keys.len() as f64) * 100.0 -// ); -// } - -// Ok(()) -// } - -// // n = 5000000, [Empirical] Standard Bloom FPR: 0.0006, Blocked Bloom FPR: 0.0276% -// // n = 10000000, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0108% -// // n = 15000000, [Empirical] Standard Bloom FPR: 0.0000, Blocked Bloom FPR: 0.0086% -// #[test] -// fn measure_bloom_fpr_with_bpk() -> Result<()> { -// let keys = (0..1_000_000u128) -// .map(|x| x.to_be_bytes().to_vec()) -// .collect::>(); - -// let non_existent_keys = (1_000_000..2_000_000u128) -// .map(|x| x.to_be_bytes().to_vec()) -// .collect::>(); - -// for n in [5_000_000, 10_000_000, 15_000_000] { -// let mut blocked_builder = BlockedBloomBuilder::with_bpk(n, 10); -// let mut standard_builder = StandardBloomBuilder::with_bpk(n, 10); - -// for key in &keys { -// blocked_builder.set_with_hash(BlockedBloomBuilder::get_hash(key.as_slice())); -// standard_builder.set_with_hash(StandardBloomBuilder::get_hash(key.as_slice())); -// } - -// let blocked_filter = blocked_builder.build(); -// let standard_filter = standard_builder.build(); - -// let mut blocked_fp = 0; -// let mut standard_fp = 0; -// for non_existent_key in &non_existent_keys { -// if blocked_filter -// .contains_hash(BlockedBloomBuilder::get_hash(non_existent_key.as_slice())) -// { -// blocked_fp += 1; -// } -// if standard_filter -// .contains_hash(StandardBloomBuilder::get_hash(non_existent_key.as_slice())) -// { -// standard_fp += 1; -// } -// } - -// println!( -// "n = {}, [Empirical] Standard Bloom FPR: {:.4}, Blocked Bloom FPR: {:.4}%", -// n, -// (standard_fp as f64 / non_existent_keys.len() as f64) * 100.0, -// (blocked_fp as f64 / non_existent_keys.len() as f64) * 100.0 -// ); -// } - -// Ok(()) -// } From d86c351cf4f27f471b67466537c6511c3f9def23 Mon Sep 17 
00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 18:59:55 +0200 Subject: [PATCH 340/613] doc --- src/tree/mod.rs | 5 +---- src/version/mod.rs | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index dca5fca0..0f00ff00 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -241,10 +241,7 @@ impl AbstractTree for Tree { let mut sealed_memtables = self.sealed_memtables.write().expect("lock is poisoned"); log::trace!("register: Acquired sealed memtables write lock"); - manifest.atomic_swap( - |version| version.with_new_l0_segment(segments), - seqno_threshold, - )?; + manifest.atomic_swap(|version| version.with_new_l0_run(segments), seqno_threshold)?; // eprintln!("{manifest}"); diff --git a/src/version/mod.rs b/src/version/mod.rs index b2047736..a22f4f54 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -14,6 +14,7 @@ use std::{ops::Deref, sync::Arc}; pub const DEFAULT_LEVEL_COUNT: u8 = 7; +/// Monotonically increasing ID of a version. pub type VersionId = u64; impl Ranged for Segment { @@ -156,10 +157,12 @@ impl std::ops::Deref for Version { // TODO: impl using generics so we can easily unit test Version transformation functions impl Version { + /// Returns the version ID. pub fn id(&self) -> VersionId { self.id } + /// Creates a new empty version. pub fn new(id: VersionId) -> Self { let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect(); @@ -169,6 +172,7 @@ impl Version { } } + /// Creates a new pre-populated version. pub fn from_levels(id: VersionId, levels: Vec) -> Self { Self { inner: Arc::new(VersionInner { id, levels }), @@ -176,19 +180,22 @@ impl Version { } } - /// Returns the amount of levels. + /// Returns the number of levels. pub fn level_count(&self) -> usize { self.levels.len() } + /// Returns an iterator through all levels. pub fn iter_levels(&self) -> impl Iterator { self.levels.iter() } + /// Returns the number of segments in all levels. pub fn segment_count(&self) -> usize { self.iter_levels().map(|x| x.segment_count()).sum() } + /// Returns an iterator over all segments. pub fn iter_segments(&self) -> impl Iterator { self.levels .iter() @@ -201,7 +208,8 @@ impl Version { self.levels.get(n) } - pub fn with_new_l0_segment(&self, run: &[Segment]) -> Self { + /// Creates a new version with the additional run added to the "top" of L0. + pub fn with_new_l0_run(&self, run: &[Segment]) -> Self { let id = self.id + 1; let mut levels = vec![]; @@ -241,6 +249,9 @@ impl Version { } } + /// Returns a new version with a list of segments removed. + /// + /// The segment files are not immediately deleted, this is handled in the compaction worker. 
pub fn with_dropped(&self, ids: &[SegmentId]) -> Self { let id = self.id + 1; From de4cb4b809e0824c7835f43e85f4f079c11cb0a1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 20:41:36 +0200 Subject: [PATCH 341/613] use File::create_new wherever possible --- src/compaction/worker.rs | 2 +- src/level_manifest/mod.rs | 2 +- src/segment/writer/mod.rs | 2 +- src/tree/mod.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index f4ece704..39f6e4a8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -363,7 +363,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - true, // TODO: look at configuration + payload.dest_level >= 3, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index eacf56dd..3a9de09e 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -277,7 +277,7 @@ impl LevelManifest { folder.display(), ); - let file = std::fs::File::create(folder.join(format!("v{}", version.id())))?; + let file = std::fs::File::create_new(folder.join(format!("v{}", version.id())))?; let mut writer = BufWriter::new(file); // Magic diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 4511b280..56a4386a 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -66,7 +66,7 @@ pub struct Writer { impl Writer { pub fn new(path: PathBuf, segment_id: SegmentId) -> crate::Result { - let block_writer = File::create(&path)?; + let block_writer = File::create_new(&path)?; let block_writer = BufWriter::with_capacity(u16::MAX.into(), block_writer); Ok(Self { diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 0f00ff00..8f08b340 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -876,7 +876,7 @@ impl Tree { // NOTE: Lastly, fsync version marker, which contains the version // -> the LSM is fully initialized - let mut file = File::create(manifest_path)?; + let mut file = File::create_new(manifest_path)?; Manifest { version: FormatVersion::V3, level_count: config.level_count, From 3171c668339bd09a7ce343900dfda4bc0b3a1431 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 24 Aug 2025 21:24:43 +0200 Subject: [PATCH 342/613] fix: 32-bit overflow in leveled compaction --- src/compaction/leveled.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index a34e06c5..852d7c81 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -165,9 +165,9 @@ impl Strategy { fn level_target_size(&self, level_idx: u8) -> u64 { assert!(level_idx >= 1, "level_target_size does not apply to L0"); - let power = (self.level_ratio as usize).pow(u32::from(level_idx) - 1); + let power = (self.level_ratio as usize).pow(u32::from(level_idx) - 1) as u64; - (power * (self.level_base_size() as usize)) as u64 + power * self.level_base_size() } fn level_base_size(&self) -> u64 { From 6bba3f4392cafdc29a544a32893f455a110be113 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 02:13:11 +0200 Subject: [PATCH 343/613] add microbench script --- microbench/run.nu | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 microbench/run.nu diff --git a/microbench/run.nu b/microbench/run.nu new file mode 100644 index 00000000..c54af228 --- /dev/null +++ b/microbench/run.nu @@ -0,0 +1,18 @@ +let benchmarks = [ + "block_bin_index", + 
"block_hash_index", + "block_load", + "bloom_fpr", + "bloom_speed", + "fractional_cascading", + "hash_fns", +] + +print "===== Running all benchmarks, this will take a while =====" + +for bench in $benchmarks { + print $"=== Running ($bench) function benchmark ===" + cd $bench + nu run.nu + cd .. +} From 7ead9436a20e5b697deeb11be06d6b9b49039782 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:04:37 +0200 Subject: [PATCH 344/613] feat: drop range, #148 --- src/blob_tree/mod.rs | 4 +++ src/compaction/drop_range.rs | 54 ++++++++++++++++++++++++++++++++++++ src/compaction/mod.rs | 1 + src/tree/mod.rs | 14 ++++++++++ tests/tree_drop_range.rs | 32 +++++++++++++++++++++ 5 files changed, 105 insertions(+) create mode 100644 src/compaction/drop_range.rs create mode 100644 tests/tree_drop_range.rs diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 8099a05d..1b97e50a 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -235,6 +235,10 @@ impl BlobTree { } impl AbstractTree for BlobTree { + fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { + self.index.drop_range(key_range) + } + fn ingest(&self, iter: impl Iterator) -> crate::Result<()> { use crate::tree::ingest::Ingestion; use std::time::Instant; diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs new file mode 100644 index 00000000..a15c793a --- /dev/null +++ b/src/compaction/drop_range.rs @@ -0,0 +1,54 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{Choice, CompactionStrategy}; +use crate::{ + config::Config, level_manifest::LevelManifest, segment::Segment, version::run::Ranged, HashSet, + KeyRange, +}; + +/// Drops all segments that are **contained** in a key range +pub struct Strategy { + key_range: KeyRange, +} + +impl Strategy { + /// Configures a new `DropRange` compaction strategy. + /// + /// # Panics + /// + /// Panics, if `target_size` is below 1024 bytes. + #[must_use] + #[allow(dead_code)] + pub fn new(key_range: KeyRange) -> Self { + Self { key_range } + } +} + +impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "DropRangeCompaction" + } + + fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { + let segment_ids: HashSet<_> = levels + .iter() + .filter(|segment| self.key_range.contains_range(segment.key_range())) + .map(Segment::id) + .collect(); + + // NOTE: This should generally not occur because of the + // tree-level major compaction lock + // But just as a fail-safe... 
+ let some_hidden = segment_ids + .iter() + .any(|&id| levels.hidden_set().is_hidden(id)); + + if some_hidden { + Choice::DoNothing + } else { + Choice::Drop(segment_ids) + } + } +} diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 13e214f7..77cf1f69 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod fifo; pub(crate) mod leveled; // pub(crate) mod maintenance; +pub(crate) mod drop_range; pub(crate) mod major; pub(crate) mod movedown; pub(crate) mod pulldown; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 8f08b340..7d7de0dc 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -90,6 +90,20 @@ impl AbstractTree for Tree { // TODO: clear() with Nuke compaction strategy (write lock) + fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { + let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(key_range)); + + // IMPORTANT: Write lock so we can be the only compaction going on + let _lock = self + .0 + .major_compaction_lock + .write() + .expect("lock is poisoned"); + + log::info!("Starting drop_range compaction"); + self.inner_compact(strategy, 0) + } + #[doc(hidden)] fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()> { let strategy = Arc::new(crate::compaction::major::Strategy::new(target_size)); diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs new file mode 100644 index 00000000..bd568c65 --- /dev/null +++ b/tests/tree_drop_range.rs @@ -0,0 +1,32 @@ +use lsm_tree::{AbstractTree, Config, KeyRange, UserKey}; +use test_log::test; + +#[test] +fn tree_drop_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + { + let tree = Config::new(&folder).open()?; + + for key in 'a'..='e' { + tree.insert([key as u8], "", 0); + tree.flush_active_memtable(0)?; + } + + assert_eq!(1, tree.l0_run_count()); + assert_eq!(5, tree.segment_count()); + + tree.drop_range(KeyRange::new((UserKey::from("a"), UserKey::from("c"))))?; + + assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("b", None)?); + assert!(!tree.contains_key("c", None)?); + assert!(tree.contains_key("d", None)?); + assert!(tree.contains_key("e", None)?); + + assert_eq!(1, tree.l0_run_count()); + assert_eq!(2, tree.segment_count()); + } + + Ok(()) +} From 0e0f65b7201556937f8b6af8d58bcc60f3c173c3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:04:51 +0200 Subject: [PATCH 345/613] clippy --- src/tree/mod.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 7d7de0dc..c6210291 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -208,7 +208,7 @@ impl AbstractTree for Tree { let folder = self.config.path.join(SEGMENTS_FOLDER); let segment_file_path = folder.join(segment_id.to_string()); - log::debug!("writing segment to {segment_file_path:?}"); + log::debug!("writing segment to {}", segment_file_path.display()); let mut segment_writer = Writer::new(segment_file_path, segment_id)? .use_compression(self.config.compression) @@ -494,7 +494,7 @@ impl Tree { pub(crate) fn open(config: Config) -> crate::Result { use crate::file::MANIFEST_FILE; - log::debug!("Opening LSM-tree at {:?}", config.path); + log::debug!("Opening LSM-tree at {}", config.path.display()); // Check for old version if config.path.join("version").try_exists()? 
{ @@ -878,7 +878,7 @@ impl Tree { use std::fs::{create_dir_all, File}; let path = config.path.clone(); - log::trace!("Creating LSM-tree at {path:?}"); + log::trace!("Creating LSM-tree at {}", path.display()); create_dir_all(&path)?; @@ -920,12 +920,15 @@ impl Tree { let tree_path = tree_path.as_ref(); - log::info!("Recovering manifest at {tree_path:?}"); + log::info!("Recovering manifest at {}", tree_path.display()); let segment_id_map = LevelManifest::recover_ids(tree_path)?; let cnt = segment_id_map.len(); - log::debug!("Recovering {cnt} disk segments from {tree_path:?}"); + log::debug!( + "Recovering {cnt} disk segments from {}", + tree_path.display(), + ); let progress_mod = match cnt { _ if cnt <= 20 => 1, @@ -965,7 +968,7 @@ impl Tree { let segment_file_path = dirent.path(); assert!(!segment_file_path.is_dir()); - log::debug!("Recovering segment from {segment_file_path:?}"); + log::debug!("Recovering segment from {}", segment_file_path.display()); let segment_id = segment_file_name.parse::().map_err(|e| { log::error!("invalid segment file name {segment_file_name:?}: {e:?}"); @@ -991,7 +994,10 @@ impl Tree { log::debug!("Recovered {idx}/{cnt} disk segments"); } } else { - log::debug!("Deleting unfinished segment: {segment_file_path:?}",); + log::debug!( + "Deleting unfinished segment: {}", + segment_file_path.display(), + ); std::fs::remove_file(&segment_file_path)?; } } From d7e0d815068a901d400bc468b4dcc46de86bddfa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:05:03 +0200 Subject: [PATCH 346/613] refactor: seqno generator, information hiding --- src/seqno.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/seqno.rs b/src/seqno.rs index 05e53df7..7aefc091 100644 --- a/src/seqno.rs +++ b/src/seqno.rs @@ -6,7 +6,7 @@ use crate::SeqNo; use std::sync::{ atomic::{ AtomicU64, - Ordering::{Acquire, Release}, + Ordering::{AcqRel, Acquire, Release}, }, Arc, }; @@ -43,14 +43,6 @@ use std::sync::{ #[derive(Clone, Default, Debug)] pub struct SequenceNumberCounter(Arc); -impl std::ops::Deref for SequenceNumberCounter { - type Target = Arc; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - impl SequenceNumberCounter { /// Creates a new counter, setting it to some previous value #[must_use] @@ -63,12 +55,22 @@ impl SequenceNumberCounter { /// This should only be used when creating a snapshot. #[must_use] pub fn get(&self) -> SeqNo { - self.load(Acquire) + self.0.load(Acquire) } /// Gets the next sequence number. #[must_use] pub fn next(&self) -> SeqNo { - self.fetch_add(1, Release) + self.0.fetch_add(1, Release) + } + + /// Sets the sequence number. + pub fn set(&self, seqno: SeqNo) { + self.0.store(seqno, Release); + } + + /// Maximizes the sequence number. + pub fn fetch_max(&self, seqno: SeqNo) { + self.0.fetch_max(seqno, AcqRel); } } From de4b8b813f362b2f957ade0d30c97b2961c4cb59 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:05:13 +0200 Subject: [PATCH 347/613] impl Segment::tombstone_count --- src/segment/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 4d65064a..47915708 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -467,9 +467,7 @@ impl Segment { #[must_use] #[doc(hidden)] pub fn tombstone_count(&self) -> u64 { - todo!() - - // self.metadata.tombstone_count + self.metadata.tombstone_count } /// Returns the ratio of tombstone markers in the `Segment`. 
From dff4401142cdc10179abd284a895ee2d4505b06d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:05:25 +0200 Subject: [PATCH 348/613] fix major compaction docs --- src/compaction/major.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 1aa7b5ba..0179eca2 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -5,15 +5,13 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{config::Config, level_manifest::LevelManifest, segment::Segment, HashSet}; -/// Major compaction -/// -/// Compacts all segments into the last level. +/// Compacts all segments into the last level pub struct Strategy { target_size: u64, } impl Strategy { - /// Configures a new `SizeTiered` compaction strategy. + /// Configures a new `Major` compaction strategy. /// /// # Panics /// From 7cdadf2082e4f7eaa0380e52b97873e8e126c99b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 25 Aug 2025 03:09:35 +0200 Subject: [PATCH 349/613] fix --- src/abstract.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/abstract.rs b/src/abstract.rs index 1ac81d55..91843460 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -35,6 +35,13 @@ pub trait AbstractTree { #[doc(hidden)] fn ingest(&self, iter: impl Iterator) -> crate::Result<()>; + /// Drops segments that are fully contained in a given range. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()>; + /// Performs major compaction, blocking the caller until it's done. /// /// # Errors From 461a2e24f3704ad602868d09182e52633d3a9635 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 27 Aug 2025 20:31:43 +0200 Subject: [PATCH 350/613] adjust microbenches --- microbench/block_bin_index/src/main.rs | 2 +- microbench/block_hash_index/src/main.rs | 2 +- microbench/block_load/src/main.rs | 2 +- microbench/bloom_speed/src/main.rs | 2 +- microbench/fractional_cascading/src/main.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/microbench/block_bin_index/src/main.rs b/microbench/block_bin_index/src/main.rs index 81c1507c..5807ba07 100644 --- a/microbench/block_bin_index/src/main.rs +++ b/microbench/block_bin_index/src/main.rs @@ -84,7 +84,7 @@ pub fn main() -> lsm_tree::Result<()> { ); */ { - const NUM_RUNS: u128 = 2_500_000; + const NUM_RUNS: u128 = 10_000_000; let start = Instant::now(); for _ in 0..NUM_RUNS { diff --git a/microbench/block_hash_index/src/main.rs b/microbench/block_hash_index/src/main.rs index 4bf76eb3..51c4498e 100644 --- a/microbench/block_hash_index/src/main.rs +++ b/microbench/block_hash_index/src/main.rs @@ -66,7 +66,7 @@ pub fn main() -> lsm_tree::Result<()> { ); */ { - const NUM_RUNS: u128 = 25_000_000; + const NUM_RUNS: u128 = 50_000_000; let start = Instant::now(); for _ in 0..NUM_RUNS { diff --git a/microbench/block_load/src/main.rs b/microbench/block_load/src/main.rs index 0a8acb63..04e7265c 100644 --- a/microbench/block_load/src/main.rs +++ b/microbench/block_load/src/main.rs @@ -41,7 +41,7 @@ pub fn main() -> lsm_tree::Result<()> { let file = std::fs::File::open("block")?; { - const NUM_RUNS: u128 = 5_000_000; + const NUM_RUNS: u128 = 10_000_000; let start = Instant::now(); for _ in 0..NUM_RUNS { diff --git a/microbench/bloom_speed/src/main.rs b/microbench/bloom_speed/src/main.rs index 442a4a58..55de258e 100644 --- a/microbench/bloom_speed/src/main.rs +++ b/microbench/bloom_speed/src/main.rs @@ -1,7 
+1,7 @@ use rand::{Rng, RngCore}; use std::time::Instant; -const NUM_READS: usize = 100_000_000; +const NUM_READS: usize = 200_000_000; pub fn main() { let mut rng = rand::rng(); diff --git a/microbench/fractional_cascading/src/main.rs b/microbench/fractional_cascading/src/main.rs index b5ce1ba9..3b00fc3b 100644 --- a/microbench/fractional_cascading/src/main.rs +++ b/microbench/fractional_cascading/src/main.rs @@ -110,7 +110,7 @@ fn run(num_sst: usize) { let mut rng = rand::rng(); - const RUNS: usize = 20_000_000; + const RUNS: usize = 25_000_000; let start = Instant::now(); From 82fdbf7acd40f99f5e2076eb0f92da3cd2d0b407 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Aug 2025 13:53:24 +0200 Subject: [PATCH 351/613] unpinned full index blocks --- src/compaction/worker.rs | 1 + src/segment/block_index/mod.rs | 1 + src/segment/mod.rs | 92 +++++++++++++++++++++++++--------- src/tree/ingest.rs | 1 + src/tree/mod.rs | 2 + 5 files changed, 74 insertions(+), 23 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 39f6e4a8..013d2bf3 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -364,6 +364,7 @@ fn merge_segments( opts.config.cache.clone(), opts.config.descriptor_table.clone(), payload.dest_level >= 3, // TODO: look at configuration + payload.dest_level >= 3, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 1605b6d6..99ade210 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -49,6 +49,7 @@ pub trait BlockIndex { #[allow(clippy::module_name_repetitions)] pub enum BlockIndexImpl { Full(FullBlockIndex), + VolatileFull, // TwoLevel(TwoLevelBlockIndex), } diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 47915708..6845285e 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -29,8 +29,10 @@ pub use writer::Writer; use crate::metrics::Metrics; use crate::{ - cache::Cache, descriptor_table::DescriptorTable, segment::block::BlockType, CompressionType, - InternalValue, SeqNo, TreeId, UserKey, + cache::Cache, + descriptor_table::DescriptorTable, + segment::block::{BlockType, ParsedItem}, + CompressionType, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; use inner::Inner; @@ -213,11 +215,29 @@ impl Segment { // TODO: we would need to return something like ValueType + Value // TODO: so the caller can decide whether to return the value or not fn point_read(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { - let BlockIndexImpl::Full(block_index) = &*self.block_index else { - todo!(); + // TODO: enum_dispatch BlockIndex::iter + let index_block = match &*self.block_index { + BlockIndexImpl::Full(index) => index.inner(), + BlockIndexImpl::VolatileFull => { + &IndexBlock::new(self.load_block( + &self.regions.tli, + BlockType::Index, + self.metadata.data_block_compression, // TODO: maybe index compression + )?) 
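+                // NOTE: unlike the pinned (`Full`) case above, this volatile
+                // path re-reads the top-level index block via `load_block` on
+                // every point read (possibly served from the block cache)
+                // instead of keeping it resident, trading read latency for a
+                // smaller memory footprint.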
+ } }; - let Some(iter) = block_index.forward_reader(key) else { + let iter = { + let mut iter = index_block.iter(); + + if iter.seek(key) { + Some(iter.map(|x| x.materialize(&index_block.inner.data))) + } else { + None + } + }; + + let Some(iter) = iter else { return Ok(None); }; @@ -296,11 +316,23 @@ impl Segment { use crate::fallible_clipping_iter::FallibleClippingIter; use block_index::iter::create_index_block_reader; - let BlockIndexImpl::Full(block_index) = &*self.block_index else { - todo!(); + // TODO: enum_dispatch BlockIndex::iter + let index_block = match &*self.block_index { + BlockIndexImpl::Full(idx) => idx.inner(), + BlockIndexImpl::VolatileFull => { + &IndexBlock::new( + // TODO: handle error + self.load_block( + &self.regions.tli, + BlockType::Index, + self.metadata.data_block_compression, // TODO: maybe index compression + ) + .expect("should load block"), + ) + } }; - let index_iter = create_index_block_reader(block_index.inner().clone()); + let index_iter = create_index_block_reader(index_block.clone()); let mut iter = Iter::new( self.global_id(), @@ -337,6 +369,7 @@ impl Segment { cache: Arc, descriptor_table: Arc, pin_filter: bool, + pin_index: bool, #[cfg(feature = "metrics")] metrics: Arc, ) -> crate::Result { use block_index::FullBlockIndex; @@ -360,19 +393,6 @@ impl Segment { log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); let metadata = ParsedMeta::load_with_handle(&file, ®ions.metadata)?; - let tli_block = { - log::debug!("Reading TLI block, with tli_ptr={:?}", regions.tli); - - let block = Block::from_file( - &file, - regions.tli, - crate::segment::block::BlockType::Index, - metadata.data_block_compression, // TODO: index blocks may get their own compression level - )?; - - IndexBlock::new(block) - }; - let block_index = if let Some(index_block_handle) = regions.index { log::debug!( "Creating partitioned block index, with tli_ptr={:?}, index_block_ptr={index_block_handle:?}", @@ -382,9 +402,28 @@ impl Segment { unimplemented!("partitioned index is not supported yet"); // BlockIndexImpl::TwoLevel(tli_block, todo!()) - } else { - log::debug!("Creating full block index, with tli_ptr={:?}", regions.tli); + } else if pin_index { + let tli_block = { + log::debug!("Reading TLI block, with tli_ptr={:?}", regions.tli); + + let block = Block::from_file( + &file, + regions.tli, + crate::segment::block::BlockType::Index, + metadata.data_block_compression, // TODO: index blocks may get their own compression level + )?; + + IndexBlock::new(block) + }; + + log::debug!( + "Creating pinned block index, with tli_ptr={:?}", + regions.tli, + ); BlockIndexImpl::Full(FullBlockIndex::new(tli_block)) + } else { + log::debug!("Creating volatile block index"); + BlockIndexImpl::VolatileFull }; // TODO: load FilterBlock @@ -513,6 +552,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -608,6 +648,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -664,6 +705,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -719,6 +761,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -785,6 +828,7 @@ mod tests { 
Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -859,6 +903,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), true, + true, #[cfg(feature = "metrics")] metrics, )?; @@ -927,6 +972,7 @@ mod tests { Arc::new(Cache::with_capacity_bytes(1_000_000)), Arc::new(DescriptorTable::new(10)), false, + true, #[cfg(feature = "metrics")] metrics, )?; diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 0bf19be2..a5d334a0 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -72,6 +72,7 @@ impl<'a> Ingestion<'a> { self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), true, + true, #[cfg(feature = "metrics")] self.tree.metrics.clone(), ) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index c6210291..96885e59 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -558,6 +558,7 @@ impl Tree { self.config.cache.clone(), self.config.descriptor_table.clone(), true, // TODO: look at configuration + true, // TODO: look at configuration #[cfg(feature = "metrics")] self.metrics.clone(), )?; @@ -982,6 +983,7 @@ impl Tree { cache.clone(), descriptor_table.clone(), level_idx <= 1, // TODO: look at configuration + level_idx <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] metrics.clone(), )?; From 0e71baf465cc942aa64793943247a12d5d653860 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Thu, 28 Aug 2025 13:57:56 +0200 Subject: [PATCH 352/613] Update Cargo.toml --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d4327013..93248ac4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ quick_cache = { version = "0.6.16", default-features = false, features = [] } rustc-hash = "2.1.1" self_cell = "1.2.0" tempfile = "3.20.0" -value-log = { git = "https://github.com/fjall-rs/value-log", branch = "v2", default-features = false, features = [ +value-log = { git = "https://github.com/fjall-rs/value-log", rev = "1075697727579e5a885b9b88533dc9128d79780e", default-features = false, features = [ ] } varint-rs = "2.2.0" xxhash-rust = { version = "0.8.15", features = ["xxh3"] } From 38f8bf1cf90dbe2d42f00f9f1c40c24dc673d12b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Aug 2025 14:06:44 +0200 Subject: [PATCH 353/613] fix --- src/segment/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 6845285e..ad335037 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -113,10 +113,9 @@ impl Segment { #[must_use] pub fn pinned_block_index_size(&self) -> usize { - if let BlockIndexImpl::Full(full_block_index) = &*self.block_index { - full_block_index.inner().inner.size() - } else { - unimplemented!(); + match &*self.block_index { + BlockIndexImpl::Full(full_block_index) => full_block_index.inner().inner.size(), + BlockIndexImpl::VolatileFull => 0, } } From fa4bca1cf4c7008ff4613e838f17836d35750166 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Aug 2025 19:29:14 +0200 Subject: [PATCH 354/613] fix: pinning after compactions noooooooooooooooooooo --- src/abstract.rs | 2 ++ src/compaction/worker.rs | 4 ++-- src/merge.rs | 2 +- src/tree/mod.rs | 4 +--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 91843460..ccdcf5e2 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -35,6 +35,8 @@ pub 
trait AbstractTree { #[doc(hidden)] fn ingest(&self, iter: impl Iterator) -> crate::Result<()>; + // TODO: clear() with Nuke compaction strategy (write lock) -> drop_range(..) + /// Drops segments that are fully contained in a given range. /// /// # Errors diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 013d2bf3..0f2a115b 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -363,8 +363,8 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.dest_level >= 3, // TODO: look at configuration - payload.dest_level >= 3, // TODO: look at configuration + payload.dest_level <= 3, // TODO: look at configuration + payload.dest_level <= 3, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) diff --git a/src/merge.rs b/src/merge.rs index 0b97e8a7..a54bea44 100644 --- a/src/merge.rs +++ b/src/merge.rs @@ -152,7 +152,7 @@ mod tests { } #[test] - #[ignore] + #[ignore = "maybe not needed"] #[allow(clippy::unwrap_used)] fn merge_dup() -> crate::Result<()> { #[rustfmt::skip] diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 96885e59..0f5c678c 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -88,8 +88,6 @@ impl AbstractTree for Tree { Ok(()) } - // TODO: clear() with Nuke compaction strategy (write lock) - fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(key_range)); @@ -982,7 +980,7 @@ impl Tree { tree_id, cache.clone(), descriptor_table.clone(), - level_idx <= 1, // TODO: look at configuration + level_idx <= 2, // TODO: look at configuration level_idx <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] metrics.clone(), From 9ddc0cd4ad0a6a9a8e24ecbeab38849ef8934d62 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 29 Aug 2025 18:27:57 +0200 Subject: [PATCH 355/613] change default pinning max level after compaction --- src/compaction/worker.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 0f2a115b..6d859e96 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -363,8 +363,8 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.dest_level <= 3, // TODO: look at configuration - payload.dest_level <= 3, // TODO: look at configuration + payload.dest_level <= 2, // TODO: look at configuration + payload.dest_level <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) From 68280cc943ac227eb7e9f14ad06ca75deca4210b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 3 Sep 2025 18:21:53 +0200 Subject: [PATCH 356/613] adjust tests --- tests/blob_drop_after_flush.rs | 2 +- tests/blob_gc.rs | 8 ++++---- tests/blob_gc_watermark.rs | 2 +- tests/blob_simple.rs | 2 ++ tests/blob_tombstone.rs | 2 +- tests/blob_tree_flush.rs | 2 +- tests/blob_tree_reload_blob.rs | 1 - tests/recovery_mac_ds_store.rs | 2 +- tests/recovery_mac_underscore.rs | 2 +- tests/segment_point_reads.rs | 8 ++++---- tests/segment_range.rs | 6 +++--- tests/segment_range_oob.rs | 4 ++-- tests/segment_remove_weak.rs | 2 +- tests/tree_disjoint_point_read.rs | 8 ++++---- tests/tree_mvcc_simple.rs | 2 +- tests/tree_non_disjoint_point_read.rs | 2 +- tests/tree_seqno.rs | 2 +- tests/tree_shadowing.rs | 16 ++++++++-------- tests/tree_write_read.rs | 2 +- 19 files changed, 38 insertions(+), 37 deletions(-) diff --git 
a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index a980bbd4..3ff20d5a 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -28,7 +28,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { std::thread::sleep(Duration::from_secs(1)); - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, 0)?; tree.register_segments(&[segment], 0)?; diff --git a/tests/blob_gc.rs b/tests/blob_gc.rs index a1fa196b..18f15d3f 100644 --- a/tests/blob_gc.rs +++ b/tests/blob_gc.rs @@ -72,7 +72,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { tree.gc_scan_stats(seqno.get(), 1_000)?; assert_eq!(3.0, tree.blobs.space_amp()); - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; assert_eq!(&*tree.get("a", None)?.unwrap(), b"a"); @@ -88,7 +88,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { tree.gc_scan_stats(seqno.get(), 1_000)?; - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; assert_eq!(0, tree.blobs.segment_count()); @@ -123,7 +123,7 @@ fn blob_gc_3() -> lsm_tree::Result<()> { tree.gc_scan_stats(seqno.get(), 1_000)?; assert_eq!(3.0, tree.blobs.space_amp()); - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; assert!(tree.get("a", None)?.is_none()); @@ -140,7 +140,7 @@ fn blob_gc_3() -> lsm_tree::Result<()> { tree.gc_scan_stats(seqno.get(), 1_000)?; - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; assert_eq!(0, tree.blobs.segment_count()); diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index 6f0ae505..346b484a 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -31,7 +31,7 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; assert_eq!(2, report.stale_blobs); - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, 0)?; // IMPORTANT: We cannot drop any blobs yet diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 2797ddc0..ecf317ff 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -2,6 +2,7 @@ use lsm_tree::AbstractTree; use test_log::test; #[test] +#[ignore = "wip"] fn blob_tree_simple() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); @@ -53,6 +54,7 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { #[cfg(feature = "lz4")] #[test] +#[ignore = "wip"] fn blob_tree_simple_compressed() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/blob_tombstone.rs b/tests/blob_tombstone.rs index c7b0e310..c8bf1944 100644 --- a/tests/blob_tombstone.rs +++ b/tests/blob_tombstone.rs @@ -31,7 +31,7 @@ fn blob_tree_tombstone() -> lsm_tree::Result<()> { tree.gc_scan_stats(2, 0)?; - let strategy = value_log::StaleThresholdStrategy::new(0.01); + let strategy = lsm_tree::gc::StaleThresholdStrategy::new(0.01); tree.apply_gc_strategy(&strategy, 2)?; assert_eq!(2, tree.len(None, None)?); diff --git a/tests/blob_tree_flush.rs 
b/tests/blob_tree_flush.rs index a3270cc8..1bc3a4a6 100644 --- a/tests/blob_tree_flush.rs +++ b/tests/blob_tree_flush.rs @@ -19,7 +19,7 @@ fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; assert_eq!(2.0, tree.blobs.space_amp()); - let strategy = value_log::SpaceAmpStrategy::new(1.0); + let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; assert_eq!(1, tree.blobs.segment_count()); diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 953ca860..b4423965 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -1,5 +1,4 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter, TreeType}; -use std::fs::File; use test_log::test; const ITEM_COUNT: usize = 10_000; diff --git a/tests/recovery_mac_ds_store.rs b/tests/recovery_mac_ds_store.rs index eadf26d4..7f3b67ae 100644 --- a/tests/recovery_mac_ds_store.rs +++ b/tests/recovery_mac_ds_store.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn recovery_mac_ds_store() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); { let tree = Config::new(&folder).open()?; diff --git a/tests/recovery_mac_underscore.rs b/tests/recovery_mac_underscore.rs index 0d5d5156..ee6b6652 100644 --- a/tests/recovery_mac_underscore.rs +++ b/tests/recovery_mac_underscore.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn recovery_mac_underscore_file() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); { let tree = Config::new(&folder).open()?; diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index 144459b2..6da86fab 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -5,7 +5,7 @@ const ITEM_COUNT: usize = 1_000; #[test] fn segment_point_reads() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -29,7 +29,7 @@ fn segment_point_reads() -> lsm_tree::Result<()> { #[test] fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -69,7 +69,7 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { #[test] fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -113,7 +113,7 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { #[test] fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/segment_range.rs b/tests/segment_range.rs index d240d738..0a6ab804 100644 --- a/tests/segment_range.rs +++ b/tests/segment_range.rs @@ -5,7 +5,7 @@ const ITEM_COUNT: usize = 1_000_000; #[test] fn segment_ranges() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -51,7 +51,7 @@ fn segment_ranges() -> lsm_tree::Result<()> { #[test] fn segment_range_last_back() -> lsm_tree::Result<()> { 
- let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -86,7 +86,7 @@ fn segment_range_last_back() -> lsm_tree::Result<()> { #[test] fn segment_range_last_back_2() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/segment_range_oob.rs b/tests/segment_range_oob.rs index f4d2cb27..0139d55c 100644 --- a/tests/segment_range_oob.rs +++ b/tests/segment_range_oob.rs @@ -5,7 +5,7 @@ const ITEM_COUNT: usize = 100; #[test] fn segment_range_out_of_bounds_lo() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -29,7 +29,7 @@ fn segment_range_out_of_bounds_lo() -> lsm_tree::Result<()> { #[test] fn segment_range_out_of_bounds_hi() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/segment_remove_weak.rs b/tests/segment_remove_weak.rs index e834e46f..e9415607 100644 --- a/tests/segment_remove_weak.rs +++ b/tests/segment_remove_weak.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn segment_remove_weak_simple() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index 14c858d9..5ffe3cf5 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -4,7 +4,7 @@ use test_log::test; #[test] fn tree_disjoint_point_read() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -33,7 +33,7 @@ fn tree_disjoint_point_read() -> lsm_tree::Result<()> { #[test] fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -63,7 +63,7 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { #[test] #[ignore] fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) @@ -118,7 +118,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { #[test] #[ignore] fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/tree_mvcc_simple.rs b/tests/tree_mvcc_simple.rs index e1ca3b91..ea935db6 100644 --- a/tests/tree_mvcc_simple.rs +++ b/tests/tree_mvcc_simple.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn tree_read_mvcc() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open()?; diff --git a/tests/tree_non_disjoint_point_read.rs b/tests/tree_non_disjoint_point_read.rs index f8ebb115..41a17e31 100644 --- a/tests/tree_non_disjoint_point_read.rs +++ 
b/tests/tree_non_disjoint_point_read.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn tree_non_disjoint_point_read() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) .data_block_size(1_024) diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index bacd45c1..6ebda8d1 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn tree_highest_seqno() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open()?; assert_eq!(tree.get_highest_seqno(), None); diff --git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index 2b0fc3d7..58a37894 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn tree_shadowing_upsert() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open()?; @@ -34,7 +34,7 @@ fn tree_shadowing_upsert() -> lsm_tree::Result<()> { #[test] fn tree_shadowing_upsert_blob() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open_as_blob_tree()?; @@ -65,7 +65,7 @@ fn tree_shadowing_upsert_blob() -> lsm_tree::Result<()> { #[test] fn tree_shadowing_delete() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open().unwrap(); @@ -94,7 +94,7 @@ fn tree_shadowing_delete() -> lsm_tree::Result<()> { #[test] fn tree_shadowing_delete_blob() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open_as_blob_tree().unwrap(); @@ -125,7 +125,7 @@ fn tree_shadowing_delete_blob() -> lsm_tree::Result<()> { fn tree_shadowing_range() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open()?; @@ -169,7 +169,7 @@ fn tree_shadowing_range() -> lsm_tree::Result<()> { fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open_as_blob_tree()?; @@ -213,7 +213,7 @@ fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { fn tree_shadowing_prefix() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open()?; @@ -285,7 +285,7 @@ fn tree_shadowing_prefix() -> lsm_tree::Result<()> { fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder).open_as_blob_tree()?; diff --git a/tests/tree_write_read.rs b/tests/tree_write_read.rs index 8ac55090..6cf866b3 100644 --- a/tests/tree_write_read.rs +++ b/tests/tree_write_read.rs @@ -3,7 +3,7 @@ use test_log::test; #[test] fn tree_write_and_read() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?.into_path(); + let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder.clone()).open()?; From 
93a76702c745061ca508f8da1268884852a07598 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 3 Sep 2025 18:23:07 +0200 Subject: [PATCH 357/613] adjust tests --- tests/tree_iter_lifetime.rs | 5 ++--- tests/tree_reload.rs | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/tree_iter_lifetime.rs b/tests/tree_iter_lifetime.rs index 2fd663ba..655de9d3 100644 --- a/tests/tree_iter_lifetime.rs +++ b/tests/tree_iter_lifetime.rs @@ -15,10 +15,9 @@ fn iterrr( Ok(tree.iter(None, None)) } -// TODO: 3.0.0 compiler error -/* #[test] +#[test] fn tree_iter_lifetime() -> lsm_tree::Result<()> { let folder = tempfile::tempdir().unwrap(); assert_eq!(100, iterrr(folder.path())?.count()); Ok(()) -} */ +} diff --git a/tests/tree_reload.rs b/tests/tree_reload.rs index a2808fea..690f3efa 100644 --- a/tests/tree_reload.rs +++ b/tests/tree_reload.rs @@ -1,5 +1,4 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter, TreeType}; -use std::fs::File; use test_log::test; const ITEM_COUNT: usize = 10_000; From 4250c77e327d890aab45761e2a7583e4152c8cc5 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Mon, 8 Sep 2025 17:09:04 +0200 Subject: [PATCH 358/613] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 483f1050..c953a3a7 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This is the most feature-rich LSM-tree implementation in Rust! It features: - Thread-safe `BTreeMap`-like API - Mostly [safe](./UNSAFE.md) & 100% stable Rust - Block-based tables with compression support & prefix truncation - - Optional block hash indexes in blocks for faster point lookups [[3]](#footnotes) + - Optional block hash indexes in data blocks for faster point lookups [[3]](#footnotes) - Per-level filter/index block pinning configuration - Range & prefix searching with forward and reverse iteration - Block caching to keep hot data in memory From 7bd7666904956c8a536c58d2ea380c9b33fd5c14 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:04:39 +0200 Subject: [PATCH 359/613] too much man --- .github/workflows/release.yml | 2 +- .github/workflows/test.yml | 2 +- src/abstract.rs | 275 +++----- src/blob_tree/cache.rs | 25 - src/blob_tree/compression.rs | 11 +- src/blob_tree/gc/reader.rs | 4 +- src/blob_tree/gc/writer.rs | 4 +- src/blob_tree/index.rs | 2 +- src/blob_tree/mod.rs | 220 +++--- src/blob_tree/value.rs | 3 +- src/cache.rs | 12 +- src/coding.rs | 116 ++++ src/compaction/leveled.rs | 28 +- src/compaction/worker.rs | 8 +- src/descriptor_table.rs | 3 +- src/error.rs | 10 - src/file.rs | 2 +- src/iter_guard.rs | 47 ++ src/key_range.rs | 321 +++++++++ src/lib.rs | 64 +- src/range.rs | 50 +- src/segment/block/binary_index/reader.rs | 12 +- src/segment/block/decoder.rs | 2 +- src/segment/block/hash_index/reader.rs | 7 - src/segment/block/mod.rs | 63 +- src/segment/block_index/mod.rs | 3 +- src/segment/data_block/iter.rs | 8 +- src/segment/data_block/mod.rs | 4 + src/segment/filter/bit_array/reader.rs | 4 - src/segment/index_block/block_handle.rs | 2 +- src/segment/multi_writer.rs | 10 +- src/seqno.rs | 34 +- src/slice/default/mod.rs | 76 +++ src/slice/mod.rs | 242 +++++++ src/slice/slice_bytes.rs | 127 ++++ src/tree/ingest.rs | 4 +- src/tree/mod.rs | 120 ++-- src/vlog/blob_file/gc_stats.rs | 33 + src/vlog/blob_file/merge.rs | 121 ++++ src/vlog/blob_file/meta.rs | 67 ++ src/vlog/blob_file/mod.rs | 75 +++ src/vlog/blob_file/multi_writer.rs | 146 ++++ src/vlog/blob_file/reader.rs | 
119 ++++ src/vlog/blob_file/trailer.rs | 75 +++ src/vlog/blob_file/writer.rs | 202 ++++++ src/vlog/compression.rs | 20 + src/vlog/config.rs | 66 ++ src/vlog/gc/mod.rs | 123 ++++ src/vlog/gc/report.rs | 78 +++ src/vlog/handle.rs | 44 ++ src/vlog/index.rs | 47 ++ src/vlog/manifest.rs | 445 ++++++++++++ src/vlog/mod.rs | 33 + src/vlog/scanner.rs | 66 ++ src/vlog/value_log.rs | 669 +++++++++++++++++++ tests/blob_drop_after_flush.rs | 5 +- tests/blob_gc.rs | 38 +- tests/blob_gc_watermark.rs | 41 +- tests/blob_sep_threshold.rs | 8 +- tests/blob_simple.rs | 18 +- tests/blob_tombstone.rs | 24 +- tests/blob_tree_flush.rs | 4 +- tests/blob_tree_reload_blob.rs | 84 ++- tests/compaction_readers_grouping.rs | 8 +- tests/experimental_blob_tree_guarded_size.rs | 21 + tests/experimental_tree_guarded_range.rs | 62 ++ tests/major_compaction.rs | 4 +- tests/open_files.rs | 4 +- tests/segment_point_reads.rs | 84 +-- tests/segment_range.rs | 68 +- tests/segment_range_oob.rs | 36 +- tests/segment_remove_weak.rs | 4 +- tests/snapshot_compact.rs | 45 +- tests/snapshot_len.rs | 47 +- tests/snapshot_point_read.rs | 239 +++---- tests/snapshot_zombie.rs | 134 ++-- tests/tree_approx_len.rs | 106 +-- tests/tree_bulk_ingest.rs | 86 ++- tests/tree_count.rs | 38 +- tests/tree_delete_loop.rs | 56 +- tests/tree_different_block_size.rs | 10 +- tests/tree_disjoint_iter.rs | 44 +- tests/tree_disjoint_point_read.rs | 10 +- tests/tree_disjoint_prefix.rs | 26 +- tests/tree_disjoint_range.rs | 68 +- tests/tree_drop_range.rs | 12 +- tests/tree_flush_eviction.rs | 22 +- tests/tree_iter_lifetime.rs | 23 - tests/tree_kv.rs | 44 +- tests/tree_l0_point_read.rs | 16 +- tests/tree_l0_range.rs | 16 +- tests/tree_mvcc_simple.rs | 170 ++--- tests/tree_non_disjoint_point_read.rs | 14 +- tests/tree_range.rs | 31 +- tests/tree_range_memtable_only.rs | 125 +--- tests/tree_recover_large_value.rs | 4 +- tests/tree_reload.rs | 66 +- tests/tree_reload_pwd.rs | 6 +- tests/tree_sealed_shadowing.rs | 12 +- tests/tree_shadowing.rs | 170 ++--- tests/tree_weak_delete.rs | 48 +- tests/tree_weak_delete_eviction.rs | 2 +- tests/tree_weak_delete_queue.rs | 56 +- 103 files changed, 5033 insertions(+), 1582 deletions(-) delete mode 100644 src/blob_tree/cache.rs create mode 100644 src/coding.rs create mode 100644 src/iter_guard.rs create mode 100644 src/key_range.rs create mode 100644 src/slice/default/mod.rs create mode 100644 src/slice/mod.rs create mode 100644 src/slice/slice_bytes.rs create mode 100644 src/vlog/blob_file/gc_stats.rs create mode 100644 src/vlog/blob_file/merge.rs create mode 100644 src/vlog/blob_file/meta.rs create mode 100644 src/vlog/blob_file/mod.rs create mode 100644 src/vlog/blob_file/multi_writer.rs create mode 100644 src/vlog/blob_file/reader.rs create mode 100644 src/vlog/blob_file/trailer.rs create mode 100644 src/vlog/blob_file/writer.rs create mode 100644 src/vlog/compression.rs create mode 100644 src/vlog/config.rs create mode 100644 src/vlog/gc/mod.rs create mode 100644 src/vlog/gc/report.rs create mode 100644 src/vlog/handle.rs create mode 100644 src/vlog/index.rs create mode 100644 src/vlog/manifest.rs create mode 100644 src/vlog/mod.rs create mode 100644 src/vlog/scanner.rs create mode 100644 src/vlog/value_log.rs create mode 100644 tests/experimental_blob_tree_guarded_size.rs create mode 100644 tests/experimental_tree_guarded_range.rs delete mode 100644 tests/tree_iter_lifetime.rs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cc449d78..846f9636 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml
@@ -8,7 +8,7 @@ jobs:
   publish:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - uses: katyo/publish-crates@v2
         with:
           registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a12f4488..85eb9d09 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - uses: dtolnay/rust-toolchain@stable
         with:
           toolchain: ${{ matrix.rust_version }}
diff --git a/src/abstract.rs b/src/abstract.rs
index 91843460..8b35c36e 100644
--- a/src/abstract.rs
+++ b/src/abstract.rs
@@ -3,9 +3,9 @@
 // (found in the LICENSE-* files in the repository)
 
 use crate::{
-    compaction::CompactionStrategy, config::TreeType, segment::Segment, tree::inner::MemtableId,
-    AnyTree, BlobTree, Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, Tree, UserKey,
-    UserValue,
+    compaction::CompactionStrategy, config::TreeType, iter_guard::IterGuardImpl, segment::Segment,
+    tree::inner::MemtableId, AnyTree, BlobTree, Config, Guard, KvPair, Memtable, SegmentId, SeqNo,
+    Tree, UserKey, UserValue,
 };
 use enum_dispatch::enum_dispatch;
 use std::{
@@ -19,6 +19,37 @@ pub type RangeItem = crate::Result<KvPair>;
 #[allow(clippy::module_name_repetitions)]
 #[enum_dispatch]
 pub trait AbstractTree {
+    /// Returns an iterator that scans through the entire tree.
+    ///
+    /// Avoid using this function, or limit it, as otherwise it may scan a lot of items.
+    fn iter(
+        &self,
+        seqno: SeqNo,
+        index: Option<Arc<Memtable>>,
+    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_> {
+        self.range::<&[u8], _>(.., seqno, index)
+    }
+
+    /// Returns an iterator over a prefixed set of items.
+    ///
+    /// Avoid using an empty prefix as it may scan a lot of items (unless limited).
+    fn prefix<K: AsRef<[u8]>>(
+        &self,
+        prefix: K,
+        seqno: SeqNo,
+        index: Option<Arc<Memtable>>,
+    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_>;
+
+    /// Returns an iterator over a range of items.
+    ///
+    /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited).
+    fn range<K: AsRef<[u8]>, R: RangeBounds<K>>(
+        &self,
+        range: R,
+        seqno: SeqNo,
+        index: Option<Arc<Memtable>>,
+    ) -> Box<dyn DoubleEndedIterator<Item = IterGuardImpl<'_>> + '_>;
+
     /// Ingests a sorted stream of key-value pairs into the tree.
     ///
     /// Can only be called on a fresh, empty tree.
@@ -35,6 +66,9 @@ pub trait AbstractTree {
     #[doc(hidden)]
     fn ingest(&self, iter: impl Iterator<Item = (UserKey, UserValue)>) -> crate::Result<()>;
 
+    /// Returns the approximate number of tombstones in the tree.
+    fn tombstone_count(&self) -> u64;
+
     // TODO: clear() with Nuke compaction strategy (write lock) -> drop_range(..)
 
     /// Drops segments that are fully contained in a given range.
     ///
     /// # Errors
@@ -71,7 +105,7 @@ pub trait AbstractTree {
     /// Will return `Err` if an IO error occurs.
     fn flush_memtable(
         &self,
-        segment_id: SegmentId,
+        segment_id: SegmentId, // TODO: remove?
         memtable: &Arc<Memtable>,
         seqno_threshold: SeqNo,
     ) -> crate::Result<Option<Segment>>;
@@ -138,25 +172,25 @@ pub trait AbstractTree {
     /// Seals the active memtable, and returns a reference to it.
     fn rotate_memtable(&self) -> Option<(MemtableId, Arc<Memtable>)>;
 
-    /// Returns the amount of disk segments currently in the tree.
+    /// Returns the number of disk segments currently in the tree.
     fn segment_count(&self) -> usize;
 
-    /// Returns the amount of segments in levels[idx].
+    /// Returns the number of segments in levels[idx].
     ///
     /// Returns `None` if the level does not exist (if idx >= 7).
     fn level_segment_count(&self, idx: usize) -> Option<usize>;
 
-    /// Returns the amount of disjoint runs in L0.
+    /// Returns the number of disjoint runs in L0.
     ///
     /// Can be used to determine whether to write stall.
     fn l0_run_count(&self) -> usize;
 
-    /// Returns the amount of blob files currently in the tree.
+    /// Returns the number of blob files currently in the tree.
     fn blob_file_count(&self) -> usize {
         0
     }
 
-    /// Approximates the amount of items in the tree.
+    /// Approximates the number of items in the tree.
     fn approximate_len(&self) -> usize;
 
     /// Returns the disk space usage.
@@ -168,7 +202,7 @@ pub trait AbstractTree {
     /// Returns the highest sequence number that is flushed to disk.
     fn get_highest_persisted_seqno(&self) -> Option<SeqNo>;
 
-    /// Scans the entire tree, returning the amount of items.
+    /// Scans the entire tree, returning the number of items.
     ///
     /// ###### Caution
     ///
@@ -186,11 +220,11 @@ pub trait AbstractTree {
     /// let folder = tempfile::tempdir()?;
     /// let tree = Config::new(folder).open()?;
     ///
-    /// assert_eq!(tree.len(None, None)?, 0);
+    /// assert_eq!(tree.len(0, None)?, 0);
     /// tree.insert("1", "abc", 0);
     /// tree.insert("3", "abc", 1);
    /// tree.insert("5", "abc", 2);
-    /// assert_eq!(tree.len(None, None)?, 3);
+    /// assert_eq!(tree.len(3, None)?, 3);
     /// #
     /// # Ok::<(), TreeError>(())
     /// ```
@@ -198,11 +232,11 @@ pub trait AbstractTree {
     /// # Errors
     ///
     /// Will return `Err` if an IO error occurs.
-    fn len(&self, seqno: Option<SeqNo>, index: Option<Arc<Memtable>>) -> crate::Result<usize> {
+    fn len(&self, seqno: SeqNo, index: Option<Arc<Memtable>>) -> crate::Result<usize> {
         let mut count = 0;
 
         for item in self.iter(seqno, index) {
-            let _ = item?;
+            let _ = item.key()?;
             count += 1;
         }
@@ -220,10 +254,10 @@ pub trait AbstractTree {
     /// use lsm_tree::{AbstractTree, Config, Tree};
     ///
     /// let tree = Config::new(folder).open()?;
-    /// assert!(tree.is_empty(None, None)?);
+    /// assert!(tree.is_empty(0, None)?);
     ///
     /// tree.insert("a", "abc", 0);
-    /// assert!(!tree.is_empty(None, None)?);
+    /// assert!(!tree.is_empty(1, None)?);
     /// #
     /// # Ok::<(), lsm_tree::Error>(())
     /// ```
     ///
     /// # Errors
     ///
     /// Will return `Err` if an IO error occurs.
-    fn is_empty(&self, seqno: Option<SeqNo>, index: Option<Arc<Memtable>>) -> crate::Result<bool> {
+    fn is_empty(&self, seqno: SeqNo, index: Option<Arc<Memtable>>) -> crate::Result<bool> {
         self.first_key_value(seqno, index).map(|x| x.is_none())
     }
@@ -251,7 +285,7 @@ pub trait AbstractTree {
     /// tree.insert("3", "abc", 1);
     /// tree.insert("5", "abc", 2);
     ///
-    /// let (key, _) = tree.first_key_value(None, None)?.expect("item should exist");
+    /// let (key, _) = tree.first_key_value(3, None)?.expect("item should exist");
     /// assert_eq!(&*key, "1".as_bytes());
     /// #
     /// # Ok::<(), TreeError>(())
@@ -262,10 +296,13 @@ pub trait AbstractTree {
     /// Will return `Err` if an IO error occurs.
     fn first_key_value(
         &self,
-        seqno: Option<SeqNo>,
+        seqno: SeqNo,
         index: Option<Arc<Memtable>>,
     ) -> crate::Result<Option<KvPair>> {
-        self.iter(seqno, index).next().transpose()
+        self.iter(seqno, index)
+            .next()
+            .map(Guard::into_inner)
+            .transpose()
     }
 
     /// Returns the last key-value pair in the tree.
@@ -284,7 +321,7 @@ pub trait AbstractTree {
     /// tree.insert("3", "abc", 1);
     /// tree.insert("5", "abc", 2);
     ///
-    /// let (key, _) = tree.last_key_value(None, None)?.expect("item should exist");
+    /// let (key, _) = tree.last_key_value(3, None)?.expect("item should exist");
     /// assert_eq!(&*key, "5".as_bytes());
     /// #
     /// # Ok::<(), TreeError>(())
@@ -295,141 +332,15 @@ pub trait AbstractTree {
     /// Will return `Err` if an IO error occurs.
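     // NOTE: As the default implementation below shows, this just takes the last
     // item of `iter(seqno, index)` by scanning from the back via `next_back()`.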
fn last_key_value( &self, - seqno: Option, + seqno: SeqNo, index: Option>, ) -> crate::Result> { - self.iter(seqno, index).next_back().transpose() + self.iter(seqno, index) + .next_back() + .map(Guard::into_inner) + .transpose() } - /// Returns an iterator that scans through the entire tree. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// tree.insert("g", "abc", 2); - /// assert_eq!(3, tree.iter(None, None).count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn iter( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - self.range::<&[u8], _>(.., seqno, index) - } - - /// Returns an iterator that scans through the entire tree, returning keys only. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// tree.insert("g", "abc", 2); - /// assert_eq!(3, tree.keys(None, None).count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn keys( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static>; - - /// Returns an iterator that scans through the entire tree, returning values only. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// tree.insert("g", "abc", 2); - /// assert_eq!(3, tree.values(None, None).count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn values( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static>; - - /// Returns an iterator over a range of items. - /// - /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited). - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// tree.insert("g", "abc", 2); - /// assert_eq!(2, tree.range("a"..="f", None, None).into_iter().count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn range, R: RangeBounds>( - &self, - range: R, - seqno: Option, - index: Option>, - ) -> Box> + 'static>; - - /// Returns an iterator over a prefixed set of items. - /// - /// Avoid using an empty prefix as it may scan a lot of items (unless limited). 
- /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("ab", "abc", 1); - /// tree.insert("abc", "abc", 2); - /// assert_eq!(2, tree.prefix("ab", None, None).count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn prefix>( - &self, - prefix: K, - seqno: Option, - index: Option>, - ) -> Box> + 'static>; - /// Returns the size of a value if it exists. /// /// # Examples @@ -441,10 +352,10 @@ pub trait AbstractTree { /// let tree = Config::new(folder).open()?; /// tree.insert("a", "my_value", 0); /// - /// let size = tree.size_of("a", None)?.unwrap_or_default(); + /// let size = tree.size_of("a", 1)?.unwrap_or_default(); /// assert_eq!("my_value".len() as u32, size); /// - /// let size = tree.size_of("b", None)?.unwrap_or_default(); + /// let size = tree.size_of("b", 1)?.unwrap_or_default(); /// assert_eq!(0, size); /// # /// # Ok::<(), lsm_tree::Error>(()) @@ -453,7 +364,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn size_of>(&self, key: K, seqno: Option) -> crate::Result>; + fn size_of>(&self, key: K, seqno: SeqNo) -> crate::Result>; /// Retrieves an item from the tree. /// @@ -466,7 +377,7 @@ pub trait AbstractTree { /// let tree = Config::new(folder).open()?; /// tree.insert("a", "my_value", 0); /// - /// let item = tree.get("a", None)?; + /// let item = tree.get("a", 1)?; /// assert_eq!(Some("my_value".as_bytes().into()), item); /// # /// # Ok::<(), lsm_tree::Error>(()) @@ -475,42 +386,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn get>(&self, key: K, seqno: Option) - -> crate::Result>; - - /// Opens a read-only point-in-time snapshot of the tree - /// - /// Dropping the snapshot will close the snapshot - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// - /// let snapshot = tree.snapshot(1); - /// assert_eq!(snapshot.len()?, tree.len(None, None)?); - /// - /// tree.insert("b", "abc", 1); - /// - /// assert_eq!(2, tree.len(None, None)?); - /// assert_eq!(1, snapshot.len()?); - /// - /// assert!(snapshot.contains_key("a")?); - /// assert!(!snapshot.contains_key("b")?); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - fn snapshot(&self, seqno: SeqNo) -> Snapshot; - - /// Opens a snapshot of this partition with a given sequence number - fn snapshot_at(&self, seqno: SeqNo) -> Snapshot { - self.snapshot(seqno) - } + fn get>(&self, key: K, seqno: SeqNo) -> crate::Result>; /// Returns `true` if the tree contains the specified key. /// @@ -521,10 +397,10 @@ pub trait AbstractTree { /// # use lsm_tree::{AbstractTree, Config, Tree}; /// # /// let tree = Config::new(folder).open()?; - /// assert!(!tree.contains_key("a", None)?); + /// assert!(!tree.contains_key("a", 0)?); /// /// tree.insert("a", "abc", 0); - /// assert!(tree.contains_key("a", None)?); + /// assert!(tree.contains_key("a", 1)?); /// # /// # Ok::<(), lsm_tree::Error>(()) /// ``` @@ -532,7 +408,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. 
- fn contains_key>(&self, key: K, seqno: Option) -> crate::Result { + fn contains_key>(&self, key: K, seqno: SeqNo) -> crate::Result { self.get(key, seqno).map(|x| x.is_some()) } @@ -577,12 +453,12 @@ pub trait AbstractTree { /// # let tree = Config::new(folder).open()?; /// tree.insert("a", "abc", 0); /// - /// let item = tree.get("a", None)?.expect("should have item"); + /// let item = tree.get("a", 1)?.expect("should have item"); /// assert_eq!("abc".as_bytes(), &*item); /// /// tree.remove("a", 1); /// - /// let item = tree.get("a", None)?; + /// let item = tree.get("a", 2)?; /// assert_eq!(None, item); /// # /// # Ok::<(), lsm_tree::Error>(()) @@ -611,12 +487,12 @@ pub trait AbstractTree { /// # let tree = Config::new(folder).open()?; /// tree.insert("a", "abc", 0); /// - /// let item = tree.get("a", None)?.expect("should have item"); + /// let item = tree.get("a", 1)?.expect("should have item"); /// assert_eq!("abc".as_bytes(), &*item); /// /// tree.remove_weak("a", 1); /// - /// let item = tree.get("a", None)?; + /// let item = tree.get("a", 2)?; /// assert_eq!(None, item); /// # /// # Ok::<(), lsm_tree::Error>(()) @@ -625,5 +501,6 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. + #[doc(hidden)] fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u64, u64); } diff --git a/src/blob_tree/cache.rs b/src/blob_tree/cache.rs deleted file mode 100644 index 2d1946ff..00000000 --- a/src/blob_tree/cache.rs +++ /dev/null @@ -1,25 +0,0 @@ -use crate::Cache; -use std::sync::Arc; -use value_log::BlobCache; - -#[derive(Clone)] -pub struct MyBlobCache(pub(crate) Arc); - -impl BlobCache for MyBlobCache { - fn get( - &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, - ) -> Option { - self.0.get_blob(vlog_id, vhandle) - } - - fn insert( - &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, - value: value_log::UserValue, - ) { - self.0.insert_blob(vlog_id, vhandle, value); - } -} diff --git a/src/blob_tree/compression.rs b/src/blob_tree/compression.rs index 4e6f3016..3ab425e7 100644 --- a/src/blob_tree/compression.rs +++ b/src/blob_tree/compression.rs @@ -2,8 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use crate::vlog::Compressor; use crate::CompressionType; -use value_log::Compressor; #[derive(Copy, Clone, Debug)] pub struct MyCompressor(pub(crate) CompressionType); @@ -15,7 +15,7 @@ impl Default for MyCompressor { } impl Compressor for MyCompressor { - fn compress(&self, bytes: &[u8]) -> value_log::Result> { + fn compress(&self, bytes: &[u8]) -> crate::Result> { Ok(match self.0 { CompressionType::None => bytes.into(), @@ -24,14 +24,13 @@ impl Compressor for MyCompressor { }) } - fn decompress(&self, bytes: &[u8]) -> value_log::Result> { + fn decompress(&self, bytes: &[u8]) -> crate::Result> { match self.0 { CompressionType::None => Ok(bytes.into()), #[cfg(feature = "lz4")] - CompressionType::Lz4 => { - lz4_flex::decompress_size_prepended(bytes).map_err(|_| value_log::Error::Decompress) - } + CompressionType::Lz4 => lz4_flex::decompress_size_prepended(bytes) + .map_err(|_| crate::Error::Decompress(self.0)), } } } diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs index bebf86cd..25e4f612 100644 --- a/src/blob_tree/gc/reader.rs +++ b/src/blob_tree/gc/reader.rs @@ -2,9 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use 
crate::vlog::ValueHandle; use crate::{blob_tree::value::MaybeInlineValue, coding::Decode, Memtable, SeqNo}; use std::io::Cursor; -use value_log::ValueHandle; #[allow(clippy::module_name_repetitions)] pub struct GcReader<'a> { @@ -33,7 +33,7 @@ impl<'a> GcReader<'a> { } } -impl value_log::IndexReader for GcReader<'_> { +impl crate::vlog::IndexReader for GcReader<'_> { fn get(&self, key: &[u8]) -> std::io::Result> { use std::io::Error as IoError; use MaybeInlineValue::{Indirect, Inline}; diff --git a/src/blob_tree/gc/writer.rs b/src/blob_tree/gc/writer.rs index f314a0e3..6d04e5b0 100644 --- a/src/blob_tree/gc/writer.rs +++ b/src/blob_tree/gc/writer.rs @@ -2,11 +2,11 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use crate::vlog::ValueHandle; use crate::{ blob_tree::value::MaybeInlineValue, coding::Encode, value::InternalValue, Memtable, SeqNo, UserKey, }; -use value_log::ValueHandle; #[allow(clippy::module_name_repetitions)] pub struct GcWriter<'a> { @@ -25,7 +25,7 @@ impl<'a> GcWriter<'a> { } } -impl value_log::IndexWriter for GcWriter<'_> { +impl crate::vlog::IndexWriter for GcWriter<'_> { fn insert_indirect( &mut self, key: &[u8], diff --git a/src/blob_tree/index.rs b/src/blob_tree/index.rs index 59785e1c..117ec144 100644 --- a/src/blob_tree/index.rs +++ b/src/blob_tree/index.rs @@ -21,7 +21,7 @@ impl IndexTree { pub(crate) fn get_vhandle( &self, key: &[u8], - seqno: Option, + seqno: SeqNo, ) -> crate::Result> { let Some(item) = self.get(key, seqno)? else { return Ok(None); diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 1b97e50a..a42c7cca 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -2,7 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -mod cache; mod compression; mod gc; pub mod index; @@ -12,13 +11,14 @@ use crate::{ coding::{Decode, Encode}, compaction::stream::CompactionStream, file::BLOBS_FOLDER, + iter_guard::{IterGuard, IterGuardImpl}, r#abstract::{AbstractTree, RangeItem}, segment::Segment, tree::inner::MemtableId, value::InternalValue, - Config, KvPair, Memtable, SegmentId, SeqNo, Snapshot, UserKey, UserValue, + vlog::ValueLog, + Config, Memtable, SegmentId, SeqNo, UserKey, UserValue, }; -use cache::MyBlobCache; use compression::MyCompressor; use gc::{reader::GcReader, writer::GcWriter}; use index::IndexTree; @@ -28,9 +28,39 @@ use std::{ sync::{atomic::AtomicUsize, Arc}, }; use value::MaybeInlineValue; -use value_log::ValueLog; -fn resolve_value_handle(vlog: &ValueLog, item: RangeItem) -> RangeItem { +pub struct Guard<'a>( + &'a ValueLog, + crate::Result<(UserKey, UserValue)>, +); + +impl IterGuard for Guard<'_> { + fn key(self) -> crate::Result { + self.1.map(|(k, _)| k) + } + + fn size(self) -> crate::Result { + use MaybeInlineValue::{Indirect, Inline}; + + let value = self.1?.1; + let mut cursor = Cursor::new(value); + + Ok(match MaybeInlineValue::decode_from(&mut cursor)? { + // NOTE: We know LSM-tree values are 32 bits in length max + #[allow(clippy::cast_possible_truncation)] + Inline(bytes) => bytes.len() as u32, + + // NOTE: No need to resolve vHandle, because the size is already stored + Indirect { size, .. 
} => size, + }) + } + + fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { + resolve_value_handle(self.0, self.1) + } +} + +fn resolve_value_handle(vlog: &crate::vlog::ValueLog, item: RangeItem) -> RangeItem { use MaybeInlineValue::{Indirect, Inline}; match item { @@ -43,7 +73,7 @@ fn resolve_value_handle(vlog: &ValueLog, item: RangeI // Resolve indirection using value log match vlog.get(&vhandle) { Ok(Some(bytes)) => Ok((key, bytes)), - Err(e) => Err(e.into()), + Err(e) => Err(e), _ => { panic!("value handle ({:?} => {vhandle:?}) did not match any blob - this is a bug", String::from_utf8_lossy(&key)) } @@ -68,7 +98,7 @@ pub struct BlobTree { /// Log-structured value-log that stores large values #[doc(hidden)] - pub blobs: ValueLog, + pub blobs: crate::vlog::ValueLog, // TODO: maybe replace this with a nonce system #[doc(hidden)] @@ -80,15 +110,17 @@ impl BlobTree { let path = &config.path; let vlog_path = path.join(BLOBS_FOLDER); - let vlog_cfg = - value_log::Config::::new(MyBlobCache(config.cache.clone())) - .segment_size_bytes(config.blob_file_target_size) - .compression(match config.blob_compression { - crate::CompressionType::None => None, + let vlog_cfg = crate::vlog::Config::::new( + config.cache.clone(), + config.descriptor_table.clone(), + ) + .blob_file_size_bytes(config.blob_file_target_size) + .compression(match config.blob_compression { + crate::CompressionType::None => None, - #[cfg(feature = "lz4")] - c => Some(MyCompressor(c)), - }); + #[cfg(feature = "lz4")] + c => Some(MyCompressor(c)), + }); let index: IndexTree = config.open()?.into(); @@ -133,44 +165,41 @@ impl BlobTree { let iter = self .index - .create_internal_range::<&[u8], RangeFull>(&.., Some(seqno), None); + .create_internal_range::<&[u8], RangeFull>(&.., seqno, None); // Stores the max seqno of every blob file let mut seqno_map = crate::HashMap::::default(); - let result = self - .blobs - .scan_for_stats(iter.filter_map(|kv| { - let Ok(kv) = kv else { - return Some(Err(IoError::other( - "Failed to load KV pair from index tree", - ))); - }; + let result = self.blobs.scan_for_stats(iter.filter_map(|kv| { + let Ok(kv) = kv else { + return Some(Err(IoError::other( + "Failed to load KV pair from index tree", + ))); + }; - let mut cursor = Cursor::new(kv.value); - let value = match MaybeInlineValue::decode_from(&mut cursor) { - Ok(v) => v, - Err(e) => return Some(Err(IoError::other(e.to_string()))), - }; + let mut cursor = Cursor::new(kv.value); + let value = match MaybeInlineValue::decode_from(&mut cursor) { + Ok(v) => v, + Err(e) => return Some(Err(IoError::other(e.to_string()))), + }; - match value { - Indirect { vhandle, size } => { - seqno_map - .entry(vhandle.segment_id) - .and_modify(|x| *x = (*x).max(kv.key.seqno)) - .or_insert(kv.key.seqno); + match value { + Indirect { vhandle, size } => { + seqno_map + .entry(vhandle.blob_file_id) + .and_modify(|x| *x = (*x).max(kv.key.seqno)) + .or_insert(kv.key.seqno); - Some(Ok((vhandle, size))) - } - Inline(_) => None, + Some(Ok((vhandle, size))) } - })) - .map_err(Into::into); + Inline(_) => None, + } + })); let mut lock = self .blobs .manifest - .segments + .blob_files .write() .expect("lock is poisoned"); @@ -193,7 +222,7 @@ impl BlobTree { pub fn apply_gc_strategy( &self, - strategy: &impl value_log::GcStrategy, + strategy: &impl crate::vlog::GcStrategy, seqno: SeqNo, ) -> crate::Result { // IMPORTANT: Write lock memtable to avoid read skew @@ -206,7 +235,7 @@ impl BlobTree { )?; // NOTE: We still have the memtable lock, can't use gc_drop_stale because 
recursive locking - self.blobs.drop_stale_segments().map_err(Into::into) + self.blobs.drop_stale_blob_files() } /// Drops all stale blob segment files @@ -215,7 +244,7 @@ impl BlobTree { // IMPORTANT: Write lock memtable to avoid read skew let _lock = self.index.lock_active_memtable(); - self.blobs.drop_stale_segments().map_err(Into::into) + self.blobs.drop_stale_blob_files() } #[doc(hidden)] @@ -235,6 +264,38 @@ impl BlobTree { } impl AbstractTree for BlobTree { + fn prefix>( + &self, + prefix: K, + seqno: SeqNo, + index: Option>, + ) -> Box> + '_> { + Box::new( + self.index + .0 + .create_prefix(&prefix, seqno, index) + .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), + ) + } + + fn range, R: RangeBounds>( + &self, + range: R, + seqno: SeqNo, + index: Option>, + ) -> Box> + '_> { + Box::new( + self.index + .0 + .create_range(&range, seqno, index) + .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), + ) + } + + fn tombstone_count(&self) -> u64 { + self.index.tombstone_count() + } + fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { self.index.drop_range(key_range) } @@ -315,12 +376,12 @@ impl AbstractTree for BlobTree { } fn blob_file_count(&self) -> usize { - self.blobs.segment_count() + self.blobs.blob_file_count() } // NOTE: We skip reading from the value log // because the vHandles already store the value size - fn size_of>(&self, key: K, seqno: Option) -> crate::Result> { + fn size_of>(&self, key: K, seqno: SeqNo) -> crate::Result> { let vhandle = self.index.get_vhandle(key.as_ref(), seqno)?; Ok(vhandle.map(|x| match x { @@ -353,22 +414,6 @@ impl AbstractTree for BlobTree { Ok(index_tree_sum + vlog_sum) } */ - fn keys( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - self.index.keys(seqno, index) - } - - fn values( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - Box::new(self.iter(seqno, index).map(|x| x.map(|(_, v)| v))) - } - fn flush_memtable( &self, segment_id: SegmentId, @@ -561,13 +606,13 @@ impl AbstractTree for BlobTree { // NOTE: Override the default implementation to not fetch // data from the value log, so we get much faster key reads - fn contains_key>(&self, key: K, seqno: Option) -> crate::Result { + fn contains_key>(&self, key: K, seqno: SeqNo) -> crate::Result { self.index.contains_key(key, seqno) } // NOTE: Override the default implementation to not fetch // data from the value log, so we get much faster scans - fn len(&self, seqno: Option, index: Option>) -> crate::Result { + fn len(&self, seqno: SeqNo, index: Option>) -> crate::Result { self.index.len(seqno, index) } @@ -583,42 +628,6 @@ impl AbstractTree for BlobTree { self.index.get_highest_persisted_seqno() } - fn snapshot(&self, seqno: SeqNo) -> Snapshot { - use crate::AnyTree::Blob; - - Snapshot::new(Blob(self.clone()), seqno) - } - - fn range, R: RangeBounds>( - &self, - range: R, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - let vlog = self.blobs.clone(); - Box::new( - self.index - .0 - .create_range(&range, seqno, index) - .map(move |item| resolve_value_handle(&vlog, item)), - ) - } - - fn prefix>( - &self, - prefix: K, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - let vlog = self.blobs.clone(); - Box::new( - self.index - .0 - .create_prefix(prefix, seqno, index) - .map(move |item| resolve_value_handle(&vlog, item)), - ) - } - fn insert, V: Into>( &self, key: K, @@ -627,6 +636,11 @@ impl AbstractTree for BlobTree { ) -> (u64, u64) { use value::MaybeInlineValue; + // TODO: let's store a 
struct in memtables instead + // TODO: that stores slice + is_user_value + // TODO: then we can avoid alloc + memcpy here + // TODO: benchmark for very large values + // NOTE: Initially, we always write an inline value // On memtable flush, depending on the values' sizes, they will be separated // into inline or indirect values @@ -637,11 +651,7 @@ impl AbstractTree for BlobTree { self.index.insert(key, value, seqno) } - fn get>( - &self, - key: K, - seqno: Option, - ) -> crate::Result> { + fn get>(&self, key: K, seqno: SeqNo) -> crate::Result> { use value::MaybeInlineValue::{Indirect, Inline}; let key = key.as_ref(); diff --git a/src/blob_tree/value.rs b/src/blob_tree/value.rs index a97cec99..1c6ab3a7 100644 --- a/src/blob_tree/value.rs +++ b/src/blob_tree/value.rs @@ -3,9 +3,10 @@ // (found in the LICENSE-* files in the repository) use crate::coding::{Decode, DecodeError, Encode, EncodeError}; +use crate::vlog::ValueHandle; +use crate::{Slice, UserValue}; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::{Cursor, Read, Write}; -use value_log::{Slice, UserValue, ValueHandle}; use varint_rs::{VarintReader, VarintWriter}; /// A value which may or may not be inlined into an index tree diff --git a/src/cache.rs b/src/cache.rs index d3407c45..9b2d9d64 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -146,12 +146,12 @@ impl Cache { #[doc(hidden)] pub fn insert_blob( &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, + vlog_id: crate::vlog::ValueLogId, + vhandle: &crate::vlog::ValueHandle, value: UserValue, ) { self.data.insert( - (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(), + (TAG_BLOB, vlog_id, vhandle.blob_file_id, vhandle.offset).into(), Item::Blob(value), ); } @@ -160,10 +160,10 @@ impl Cache { #[must_use] pub fn get_blob( &self, - vlog_id: value_log::ValueLogId, - vhandle: &value_log::ValueHandle, + vlog_id: crate::vlog::ValueLogId, + vhandle: &crate::vlog::ValueHandle, ) -> Option { - let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.segment_id, vhandle.offset).into(); + let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.blob_file_id, vhandle.offset).into(); Some(match self.data.get(&key)? 
{ Item::Blob(blob) => blob, diff --git a/src/coding.rs b/src/coding.rs new file mode 100644 index 00000000..03a33bb6 --- /dev/null +++ b/src/coding.rs @@ -0,0 +1,116 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use std::io::{Read, Write}; + +/// Error during serialization +#[derive(Debug)] +pub enum EncodeError { + /// I/O error + Io(std::io::Error), +} + +impl std::fmt::Display for EncodeError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EncodeError({})", + match self { + Self::Io(e) => e.to_string(), + } + ) + } +} + +impl From for EncodeError { + fn from(value: std::io::Error) -> Self { + Self::Io(value) + } +} + +impl std::error::Error for EncodeError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io(e) => Some(e), + } + } +} + +/// Error during deserialization +#[derive(Debug)] +pub enum DecodeError { + /// I/O error + Io(std::io::Error), + + /// Unsupported/outdated disk version + InvalidVersion, + + /// Invalid enum tag + InvalidTag((&'static str, u8)), + + /// Invalid block trailer + InvalidTrailer, + + /// Invalid block header + InvalidHeader(&'static str), + + /// UTF-8 error + Utf8(std::str::Utf8Error), +} + +impl std::fmt::Display for DecodeError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DecodeError({})", + match self { + Self::Io(e) => e.to_string(), + e => format!("{e:?}"), + } + ) + } +} + +impl From for DecodeError { + fn from(value: std::str::Utf8Error) -> Self { + Self::Utf8(value) + } +} + +impl From for DecodeError { + fn from(value: std::io::Error) -> Self { + Self::Io(value) + } +} + +impl std::error::Error for DecodeError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io(e) => Some(e), + _ => None, + } + } +} + +/// Trait to serialize stuff +pub trait Encode { + /// Serializes into writer. + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError>; + + /// Serializes into vector. + #[allow(unused)] + fn encode_into_vec(&self) -> Vec { + let mut v = vec![]; + self.encode_into(&mut v).expect("cannot fail"); + v + } +} + +/// Trait to deserialize stuff +pub trait Decode { + /// Deserializes from reader. 
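// A minimal sketch of implementing both traits for a hypothetical newtype,
// assuming the elided generic bounds are `W: Write` / `R: Read` as used
// throughout this file:
//
//   struct Timestamp(u64);
//
//   impl Encode for Timestamp {
//       fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
//           use byteorder::{BigEndian, WriteBytesExt};
//           writer.write_u64::<BigEndian>(self.0)?; // io::Error converts via From
//           Ok(())
//       }
//   }
//
//   impl Decode for Timestamp {
//       fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError> {
//           use byteorder::{BigEndian, ReadBytesExt};
//           Ok(Self(reader.read_u64::<BigEndian>()?))
//       }
//   }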
+ fn decode_from(reader: &mut R) -> Result + where + Self: Sized; +} diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 852d7c81..02713eaa 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -28,17 +28,20 @@ fn pick_minimal_compaction( ) -> Option<(HashSet, bool)> { // NOTE: Find largest trivial move (if it exists) if let Some(window) = curr_run.shrinking_windows().find(|window| { - let key_range = aggregate_run_key_range(window); + if hidden_set.is_blocked(window.iter().map(Segment::id)) { + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + return false; + } - if let Some(next_run) = &next_run { - if next_run.get_overlapping(&key_range).is_empty() { - return true; - } - } else { + let Some(next_run) = &next_run else { + // No run in next level, so we can trivially move return true; - } + }; + + let key_range = aggregate_run_key_range(window); - false + next_run.get_overlapping(&key_range).is_empty() }) { let ids = window.iter().map(Segment::id).collect(); return Some((ids, true)); @@ -92,6 +95,7 @@ fn pick_minimal_compaction( Some((window, curr_level_pull_in, write_amp)) }) + // Find the compaction with the smallest write amplification factor .min_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal)) .map(|(window, curr_level_pull_in, _)| { let mut ids: HashSet<_> = window.iter().map(Segment::id).collect(); @@ -107,7 +111,7 @@ fn pick_minimal_compaction( /// /// When a level reaches some threshold size, parts of it are merged into overlapping segments in the next level. /// -/// Each level Ln for n >= 2 can have up to `level_base_size * ratio^n` segments. +/// Each level Ln for n >= 2 can have up to `level_base_size * ratio^(n - 1)` segments. /// /// LCS suffers from comparatively high write amplification, but has decent read amplification and great space amplification (~1.1x). /// @@ -161,6 +165,7 @@ impl Strategy { /// L2 = `level_base_size * ratio` /// /// L3 = `level_base_size * ratio * ratio` + /// /// ... fn level_target_size(&self, level_idx: u8) -> u64 { assert!(level_idx >= 1, "level_target_size does not apply to L0"); @@ -182,7 +187,7 @@ impl CompactionStrategy for Strategy { #[allow(clippy::too_many_lines)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - assert!(levels.as_slice().len() <= 7, "too many levels???"); + assert!(levels.as_slice().len() == 7, "should have exactly 7 levels"); // Scoring let mut scores = [(0.0, 0u64); 7]; @@ -230,6 +235,7 @@ impl CompactionStrategy for Strategy { } // NOTE: Never score Lmax + // // NOTE: We check for level length above #[allow(clippy::indexing_slicing)] { @@ -297,7 +303,7 @@ impl CompactionStrategy for Strategy { return Choice::Merge(choice); } - // We choose L1+ compaction + // We choose L1+ compaction instead // NOTE: Level count is 255 max #[allow(clippy::cast_possible_truncation)] diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 6d859e96..bdf1e9a4 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -70,16 +70,16 @@ impl Options { /// /// This will block until the compactor is fully finished. 
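// Worked example for `level_target_size` above (illustrative numbers,
// assuming the target is a byte size with level_base_size = 64 MiB and
// ratio = 8):
//
//   L1 = 64 MiB
//   L2 = 64 MiB * 8     = 512 MiB
//   L3 = 64 MiB * 8 * 8 = 4 GiB
//
// i.e. Ln = level_base_size * ratio^(n - 1), matching the corrected doc
// comment in the leveled strategy.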
pub fn do_compaction(opts: &Options) -> crate::Result<()> { - log::trace!("compactor: acquiring levels manifest lock"); + log::trace!("Acquiring levels manifest lock"); let original_levels = opts.levels.write().expect("lock is poisoned"); log::trace!( - "compactor: consulting compaction strategy {:?}", + "Consulting compaction strategy {:?}", opts.strategy.get_name(), ); let choice = opts.strategy.choose(&original_levels, &opts.config); - log::debug!("compactor: choice: {choice:?}"); + log::debug!("Compaction choice: {choice:?} in {:?}", start.elapsed()); match choice { Choice::Merge(payload) => merge_segments(original_levels, opts, &payload), @@ -363,7 +363,7 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.dest_level <= 2, // TODO: look at configuration + payload.dest_level <= 1, // TODO: look at configuration payload.dest_level <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), diff --git a/src/descriptor_table.rs b/src/descriptor_table.rs index ffa372f1..62380280 100644 --- a/src/descriptor_table.rs +++ b/src/descriptor_table.rs @@ -14,6 +14,7 @@ type Item = Arc; #[derive(Eq, std::hash::Hash, PartialEq)] struct CacheKey(u8, u64, u64); +/// Caches file descriptors to disk segments and blob files pub struct DescriptorTable { inner: QuickCache, } @@ -25,7 +26,7 @@ impl DescriptorTable { #[allow(clippy::default_trait_access)] let quick_cache = QuickCache::with( - 100_000, + 1_000, capacity as u64, UnitWeighter, Default::default(), diff --git a/src/error.rs b/src/error.rs index 14ab2372..39e17ab9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -32,9 +32,6 @@ pub enum Error { /// Invalid checksum value (got, expected) InvalidChecksum((Checksum, Checksum)), - - /// Value log errors - ValueLog(value_log::Error), } impl std::fmt::Display for Error { @@ -49,7 +46,6 @@ impl std::error::Error for Error { Self::Io(e) => Some(e), Self::Encode(e) => Some(e), Self::Decode(e) => Some(e), - Self::ValueLog(e) => Some(e), Self::Decompress(_) | Self::InvalidVersion(_) | Self::Unrecoverable @@ -76,11 +72,5 @@ impl From for Error { } } -impl From for Error { - fn from(value: value_log::Error) -> Self { - Self::ValueLog(value) - } -} - /// Tree result pub type Result = std::result::Result; diff --git a/src/file.rs b/src/file.rs index 4e8a7c54..38abfe37 100644 --- a/src/file.rs +++ b/src/file.rs @@ -10,7 +10,7 @@ pub const MANIFEST_FILE: &str = "manifest"; pub const SEGMENTS_FOLDER: &str = "segments"; pub const BLOBS_FOLDER: &str = "blobs"; -/// Atomically rewrites a file +/// Atomically rewrites a file. pub fn rewrite_atomic(path: &Path, content: &[u8]) -> std::io::Result<()> { // NOTE: Nothing we can do #[allow(clippy::expect_used)] diff --git a/src/iter_guard.rs b/src/iter_guard.rs new file mode 100644 index 00000000..5655dbd6 --- /dev/null +++ b/src/iter_guard.rs @@ -0,0 +1,47 @@ +use crate::{ + blob_tree::Guard as BlobGuard, tree::Guard as StandardGuard, KvPair, UserKey, UserValue, +}; +use enum_dispatch::enum_dispatch; + +/// An iterator item +#[enum_dispatch] +pub trait IterGuard { + /// Accesses the key-value tuple. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn into_inner(self) -> crate::Result; + + /// Accesses the key. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn key(self) -> crate::Result; + + /// Returns the value size. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
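// Consumption sketch for the guard API (assuming a `tree` in scope and the
// seqno-based iterators introduced in this patch):
//
//   for guard in tree.prefix("user:", SeqNo::MAX, None) {
//       let (key, value) = guard.into_inner()?;
//       // ...
//   }
//
// Calling `key()` or `size()` instead of `into_inner()` lets a blob tree
// answer from the index alone, without reading the value log.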
+ fn size(self) -> crate::Result; + + /// Accesses the value. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn value(self) -> crate::Result + where + Self: Sized, + { + self.into_inner().map(|(_, v)| v) + } +} + +#[enum_dispatch(IterGuard)] +pub enum IterGuardImpl<'a> { + Standard(StandardGuard), + Blob(BlobGuard<'a>), +} diff --git a/src/key_range.rs b/src/key_range.rs new file mode 100644 index 00000000..d1e1b52a --- /dev/null +++ b/src/key_range.rs @@ -0,0 +1,321 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + coding::{Decode, DecodeError, Encode, EncodeError}, + Slice, UserKey, +}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use std::{ + io::{Read, Write}, + ops::Bound, +}; + +/// A key range in the format of [min, max] (inclusive on both sides) +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct KeyRange(UserKey, UserKey); + +impl std::fmt::Display for KeyRange { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "[{}<=>{}]", + String::from_utf8_lossy(self.min()), + String::from_utf8_lossy(self.max()) + ) + } +} + +impl KeyRange { + /// Creates a new key range. + #[must_use] + pub fn new((min, max): (UserKey, UserKey)) -> Self { + Self(min, max) + } + + /// Creates an empty key range. + #[must_use] + pub fn empty() -> Self { + Self(Slice::empty(), Slice::empty()) + } + + /// Returns the lower bound. + #[must_use] + pub fn min(&self) -> &UserKey { + &self.0 + } + + /// Returns the upper bound. + #[must_use] + pub fn max(&self) -> &UserKey { + &self.1 + } + + fn as_tuple(&self) -> (&UserKey, &UserKey) { + (self.min(), self.max()) + } + + /// Returns `true` if the list of key ranges is disjoint + #[must_use] + pub fn is_disjoint(ranges: &[&Self]) -> bool { + for (idx, a) in ranges.iter().enumerate() { + for b in ranges.iter().skip(idx + 1) { + if a.overlaps_with_key_range(b) { + return false; + } + } + } + + true + } + + /// Returns `true` if the key falls within this key range. + #[must_use] + pub fn contains_key(&self, key: &[u8]) -> bool { + let (start, end) = self.as_tuple(); + key >= *start && key <= *end + } + + /// Returns `true` if the `other` is fully contained in this range. + #[must_use] + pub fn contains_range(&self, other: &Self) -> bool { + let (start1, end1) = self.as_tuple(); + let (start2, end2) = other.as_tuple(); + start1 <= start2 && end1 >= end2 + } + + /// Returns `true` if the `other` overlaps at least partially with this range. + #[must_use] + pub fn overlaps_with_key_range(&self, other: &Self) -> bool { + let (start1, end1) = self.as_tuple(); + let (start2, end2) = other.as_tuple(); + end1 >= start2 && start1 <= end2 + } + + /// Returns `true` if the ranges overlap partially or fully. 
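// The closed-interval test above reduces to `end1 >= start2 && start1 <= end2`;
// e.g. ["a","f"] and ["f","t"] overlap (they share "f"), while ["a","f"] and
// ["g","t"] are disjoint. The bounds-based variant below generalizes the same
// check to excluded and unbounded endpoints.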
+ #[must_use] + pub fn overlaps_with_bounds(&self, bounds: &(Bound<&[u8]>, Bound<&[u8]>)) -> bool { + let (lo, hi) = bounds; + let (my_lo, my_hi) = self.as_tuple(); + + if *lo == Bound::Unbounded && *hi == Bound::Unbounded { + return true; + } + + if *hi == Bound::Unbounded { + return match lo { + Bound::Included(key) => key <= my_hi, + Bound::Excluded(key) => key < my_hi, + Bound::Unbounded => unreachable!(), + }; + } + + if *lo == Bound::Unbounded { + return match hi { + Bound::Included(key) => key >= my_lo, + Bound::Excluded(key) => key > my_lo, + Bound::Unbounded => unreachable!(), + }; + } + + let lo_included = match lo { + Bound::Included(key) => key <= my_hi, + Bound::Excluded(key) => key < my_hi, + Bound::Unbounded => unreachable!(), + }; + + let hi_included = match hi { + Bound::Included(key) => key >= my_lo, + Bound::Excluded(key) => key > my_lo, + Bound::Unbounded => unreachable!(), + }; + + lo_included && hi_included + } + + /// Aggregates a key range. + pub fn aggregate<'a>(mut iter: impl Iterator) -> Self { + let Some(first) = iter.next() else { + return Self::empty(); + }; + + let mut min = first.min(); + let mut max = first.max(); + + for other in iter { + let x = other.min(); + if x < min { + min = x; + } + + let x = other.max(); + if x > max { + max = x; + } + } + + Self(min.clone(), max.clone()) + } +} + +impl Encode for KeyRange { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + let min = self.min(); + let max = self.max(); + + // NOTE: Max key size = u16 + #[allow(clippy::cast_possible_truncation)] + writer.write_u16::(min.len() as u16)?; + writer.write_all(min)?; + + // NOTE: Max key size = u16 + #[allow(clippy::cast_possible_truncation)] + writer.write_u16::(max.len() as u16)?; + writer.write_all(max)?; + + Ok(()) + } +} + +impl Decode for KeyRange { + fn decode_from(reader: &mut R) -> Result { + let key_min_len = reader.read_u16::()?; + let key_min: UserKey = Slice::from_reader(reader, key_min_len.into())?; + + let key_max_len = reader.read_u16::()?; + let key_max: UserKey = Slice::from_reader(reader, key_max_len.into())?; + + Ok(Self::new((key_min, key_max))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + fn int_key_range(a: u64, b: u64) -> KeyRange { + KeyRange::new((a.to_be_bytes().into(), b.to_be_bytes().into())) + } + + fn string_key_range(a: &str, b: &str) -> KeyRange { + KeyRange::new((a.as_bytes().into(), b.as_bytes().into())) + } + + #[test] + fn key_range_aggregate_1() { + let ranges = [ + int_key_range(2, 4), + int_key_range(0, 4), + int_key_range(7, 10), + ]; + let aggregated = KeyRange::aggregate(ranges.iter()); + let (min, max) = aggregated.as_tuple(); + assert_eq!([0, 0, 0, 0, 0, 0, 0, 0], &**min); + assert_eq!([0, 0, 0, 0, 0, 0, 0, 10], &**max); + } + + #[test] + fn key_range_aggregate_2() { + let ranges = [ + int_key_range(6, 7), + int_key_range(0, 2), + int_key_range(0, 10), + ]; + let aggregated = KeyRange::aggregate(ranges.iter()); + let (min, max) = aggregated.as_tuple(); + assert_eq!([0, 0, 0, 0, 0, 0, 0, 0], &**min); + assert_eq!([0, 0, 0, 0, 0, 0, 0, 10], &**max); + } + + mod is_disjoint { + use super::*; + use test_log::test; + + #[test] + fn key_range_number() { + let ranges = [&int_key_range(0, 4), &int_key_range(0, 4)]; + assert!(!KeyRange::is_disjoint(&ranges)); + } + + #[test] + fn key_range_string() { + let ranges = [&string_key_range("a", "d"), &string_key_range("g", "z")]; + assert!(KeyRange::is_disjoint(&ranges)); + } + + #[test] + fn key_range_not_disjoint() { + let ranges = 
[&string_key_range("a", "f"), &string_key_range("b", "h")]; + assert!(!KeyRange::is_disjoint(&ranges)); + + let ranges = [ + &string_key_range("a", "d"), + &string_key_range("d", "e"), + &string_key_range("f", "z"), + ]; + assert!(!KeyRange::is_disjoint(&ranges)); + } + } + + mod overflap_key_range { + use super::*; + use test_log::test; + + #[test] + fn key_range_overlap() { + let a = string_key_range("a", "f"); + let b = string_key_range("b", "h"); + assert!(a.overlaps_with_key_range(&b)); + } + + #[test] + fn key_range_overlap_edge() { + let a = string_key_range("a", "f"); + let b = string_key_range("f", "t"); + assert!(a.overlaps_with_key_range(&b)); + } + + #[test] + fn key_range_no_overlap() { + let a = string_key_range("a", "f"); + let b = string_key_range("g", "t"); + assert!(!a.overlaps_with_key_range(&b)); + } + } + + mod overlaps_with_bounds { + use super::*; + use std::ops::Bound::{Excluded, Included, Unbounded}; + use test_log::test; + + #[test] + fn inclusive() { + let key_range = KeyRange(UserKey::from("key1"), UserKey::from("key5")); + let bounds = (Included(b"key1" as &[u8]), Included(b"key5" as &[u8])); + assert!(key_range.overlaps_with_bounds(&bounds)); + } + + #[test] + fn exclusive() { + let key_range = KeyRange(UserKey::from("key1"), UserKey::from("key5")); + let bounds = (Excluded(b"key0" as &[u8]), Excluded(b"key6" as &[u8])); + assert!(key_range.overlaps_with_bounds(&bounds)); + } + + #[test] + fn no_overlap() { + let key_range = KeyRange(UserKey::from("key1"), UserKey::from("key5")); + let bounds = (Excluded(b"key5" as &[u8]), Excluded(b"key6" as &[u8])); + assert!(!key_range.overlaps_with_bounds(&bounds)); + } + + #[test] + fn unbounded() { + let key_range = KeyRange(UserKey::from("key1"), UserKey::from("key5")); + let bounds = (Unbounded, Unbounded); + assert!(key_range.overlaps_with_bounds(&bounds)); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 5d2cbbce..0411c38a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,21 +47,21 @@ //! // So you can handle I/O errors if they occur //! tree.insert("my_key", "my_value", /* sequence number */ 0); //! -//! let item = tree.get("my_key", None)?; +//! let item = tree.get("my_key", 1)?; //! assert_eq!(Some("my_value".as_bytes().into()), item); //! //! // Search by prefix -//! for item in tree.prefix("prefix", None, None) { +//! for item in tree.prefix("prefix", 1, None) { //! // ... //! } //! //! // Search by range -//! for item in tree.range("a"..="z", None, None) { +//! for item in tree.range("a"..="z", 1, None) { //! // ... //! } //! //! // Iterators implement DoubleEndedIterator, so you can search backwards, too! -//! for item in tree.prefix("prefix", None, None).rev() { +//! for item in tree.prefix("prefix", 1, None).rev() { //! // ... //! } //! 
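With snapshots gone, a point-in-time view is now expressed as an explicit
sequence number. A minimal sketch, assuming `seqno.get()` returns the next
unassigned number as the doc example above implies:

    let seqno = SequenceNumberCounter::default();

    tree.insert("a", "v1", seqno.next()); // written at seqno 0
    let snapshot = seqno.get();           // remember the watermark (= 1)
    tree.insert("a", "v2", seqno.next()); // written at seqno 1

    // A read sees only writes strictly below its seqno
    assert_eq!(Some("v1".as_bytes().into()), tree.get("a", snapshot)?);

    // SeqNo::MAX always reads the latest state
    assert_eq!(Some("v2".as_bytes().into()), tree.get("a", SeqNo::MAX)?);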
@@ -90,7 +90,6 @@ #![doc(html_logo_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")] #![doc(html_favicon_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")] -#![warn(unsafe_code)] #![deny(clippy::all, missing_docs, clippy::cargo)] #![deny(clippy::unwrap_used)] #![deny(clippy::indexing_slicing)] @@ -100,6 +99,9 @@ #![warn(clippy::multiple_crate_versions)] #![allow(clippy::option_if_let_else)] #![warn(clippy::redundant_feature_names)] +// the bytes feature uses unsafe to improve from_reader performance; so we need to relax this lint +// #![cfg_attr(feature = "bytes", deny(unsafe_code))] +// #![cfg_attr(not(feature = "bytes"), forbid(unsafe_code))] pub(crate) type HashMap = std::collections::HashMap; pub(crate) type HashSet = std::collections::HashSet; @@ -130,7 +132,12 @@ pub mod binary_search; #[doc(hidden)] pub mod blob_tree; -mod clipping_iter; +#[doc(hidden)] +mod cache; + +#[doc(hidden)] +pub mod coding; + pub mod compaction; mod compression; mod config; @@ -145,7 +152,10 @@ pub mod file; mod hash; +mod iter_guard; + mod key; +mod key_range; #[doc(hidden)] pub mod level_manifest; @@ -156,9 +166,6 @@ mod run_scanner; mod manifest; mod memtable; -#[doc(hidden)] -mod cache; - #[doc(hidden)] mod descriptor_table; @@ -179,8 +186,8 @@ mod path; pub mod range; mod seqno; +mod slice; mod slice_windows; -mod snapshot; #[doc(hidden)] pub mod stop_signal; @@ -190,6 +197,7 @@ mod time; mod tree; mod value; mod version; +mod vlog; #[doc(hidden)] pub mod segment; @@ -198,12 +206,7 @@ pub mod segment; pub type KvPair = (UserKey, UserValue); #[doc(hidden)] -pub use value_log::KeyRange; - -#[doc(hidden)] -pub mod coding { - pub use value_log::coding::{Decode, DecodeError, Encode, EncodeError}; -} +pub use key_range::KeyRange; #[doc(hidden)] pub use { @@ -221,33 +224,46 @@ pub use { descriptor_table::DescriptorTable, error::{Error, Result}, format_version::FormatVersion, + iter_guard::IterGuard as Guard, memtable::Memtable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, - snapshot::Snapshot, tree::Tree, - value::{SeqNo, UserKey, UserValue, ValueType}, + value::{SeqNo, ValueType}, }; pub use any_tree::AnyTree; pub use blob_tree::BlobTree; -pub use value_log::{BlobCache, Slice}; +pub use slice::Slice; + +/// User defined key +pub type UserKey = Slice; + +/// User defined data (byte array) +pub type UserValue = Slice; /// Blob garbage collection utilities pub mod gc { - pub use value_log::{ + pub use super::vlog::{ GcReport as Report, GcStrategy as Strategy, SpaceAmpStrategy, StaleThresholdStrategy, }; } +// TODO: investigate perf macro_rules! 
unwrap { - ($x:expr) => { - $x.expect("should read") + ($x:expr) => {{ + #[cfg(not(feature = "use_unsafe"))] + { + $x.expect("should read") + } - // unsafe { $x.unwrap_unchecked() } - }; + #[cfg(feature = "use_unsafe")] + { + unsafe { $x.unwrap_unchecked() } + } + }}; } pub(crate) use unwrap; diff --git a/src/range.rs b/src/range.rs index 74af9a24..3449234a 100644 --- a/src/range.rs +++ b/src/range.rs @@ -143,7 +143,7 @@ impl TreeIter { pub fn create_range, R: RangeBounds>( guard: IterState, range: R, - seqno: Option, + seqno: SeqNo, level_manifest: &LevelManifest, ) -> Self { Self::new(guard, |lock| { @@ -232,14 +232,10 @@ impl TreeIter { range.end_bound().map(|x| &x.user_key).cloned(), )); - if let Some(seqno) = seqno { - iters.push(Box::new(reader.filter(move |item| match item { - Ok(item) => seqno_filter(item.key.seqno, seqno), - Err(_) => true, - }))); - } else { - iters.push(Box::new(reader)); - } + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); } } _ => { @@ -251,14 +247,10 @@ impl TreeIter { ), CachePolicy::Write, ) { - if let Some(seqno) = seqno { - iters.push(Box::new(reader.filter(move |item| match item { - Ok(item) => seqno_filter(item.key.seqno, seqno), - Err(_) => true, - }))); - } else { - iters.push(Box::new(reader)); - } + iters.push(Box::new(reader.filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }))); } } } @@ -300,28 +292,20 @@ impl TreeIter { for memtable in &lock.sealed { let iter = memtable.range(range.clone()); - if let Some(seqno) = seqno { - iters.push(Box::new( - iter.filter(move |item| seqno_filter(item.key.seqno, seqno)) - .map(Ok), - )); - } else { - iters.push(Box::new(iter.map(Ok))); - } + iters.push(Box::new( + iter.filter(move |item| seqno_filter(item.key.seqno, seqno)) + .map(Ok), + )); } // Active memtable { let iter = lock.active.range(range.clone()); - if let Some(seqno) = seqno { - iters.push(Box::new( - iter.filter(move |item| seqno_filter(item.key.seqno, seqno)) - .map(Ok), - )); - } else { - iters.push(Box::new(iter.map(Ok))); - } + iters.push(Box::new( + iter.filter(move |item| seqno_filter(item.key.seqno, seqno)) + .map(Ok), + )); } if let Some(index) = &lock.ephemeral { diff --git a/src/segment/block/binary_index/reader.rs b/src/segment/block/binary_index/reader.rs index 4b4c2e4a..2e1805dd 100644 --- a/src/segment/block/binary_index/reader.rs +++ b/src/segment/block/binary_index/reader.rs @@ -20,7 +20,7 @@ impl<'a> Reader<'a> { Self { // NOTE: We consider the caller to be trustworthy - #[warn(clippy::indexing_slicing)] + #[allow(clippy::indexing_slicing)] bytes: &bytes[offset..end], step_size, } @@ -33,16 +33,8 @@ impl<'a> Reader<'a> { pub(crate) fn get(&self, idx: usize) -> usize { let offset = idx * self.step_size; - // TODO: 3.0.0 is not worth it, just use safe impl - - // SAFETY: We consider the caller to be trustworthy - #[allow(unsafe_code)] - #[cfg(feature = "use_unsafe")] - let mut bytes = unsafe { self.bytes.get_unchecked(offset..) 
}; - // NOTE: We consider the caller to be trustworthy - #[cfg(not(feature = "use_unsafe"))] - #[warn(clippy::indexing_slicing)] + #[allow(clippy::indexing_slicing)] let mut bytes = &self.bytes[offset..]; if self.step_size == 2 { diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 4a77a949..1a324c40 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -145,7 +145,7 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa Trailer::new(self.block).item_count() } - fn get_binary_index_reader(&self) -> BinaryIndexReader { + fn get_binary_index_reader(&self) -> BinaryIndexReader<'_> { BinaryIndexReader::new( &self.block.data, self.binary_index_offset, diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs index eeb01488..f661d693 100644 --- a/src/segment/block/hash_index/reader.rs +++ b/src/segment/block/hash_index/reader.rs @@ -68,15 +68,8 @@ impl<'a> Reader<'a> { let bucket_pos = calculate_bucket_position(key, bucket_count); - // TODO: 3.0.0 is not worth it, just use safe impl - // SAFETY: We use modulo in `calculate_bucket_position` - #[allow(unsafe_code)] - #[cfg(feature = "use_unsafe")] - let marker = unsafe { *self.0.get_unchecked(bucket_pos) }; - // SAFETY: We use modulo in `calculate_bucket_position` #[allow(clippy::indexing_slicing)] - #[cfg(not(feature = "use_unsafe"))] let marker = self.0[bucket_pos]; match marker { diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 3245b7ac..1f1bf9a5 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -20,7 +20,7 @@ pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; use crate::{ coding::{Decode, Encode}, - segment::BlockHandle, + segment::{BlockHandle, DataBlock}, CompressionType, Slice, }; use std::fs::File; @@ -91,22 +91,15 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => { - #[cfg(feature = "use_unsafe")] - let mut data = Slice::with_size_unzeroed(header.uncompressed_length as usize); - - #[cfg(not(feature = "use_unsafe"))] - let mut data = Slice::with_size(header.uncompressed_length as usize); + let mut builder = + unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; { - // NOTE: We know that we are the owner - #[allow(clippy::expect_used)] - let mut mutator = data.get_mut().expect("should be the owner"); - - lz4_flex::decompress_into(&raw_data, &mut mutator) + lz4_flex::decompress_into(&raw_data, &mut builder) .map_err(|_| crate::Error::Decompress(compression))?; } - data + builder.freeze().into() } }; @@ -137,7 +130,7 @@ impl Block { *checksum, *header.checksum, ); - // return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); } Ok(Self { header, data }) @@ -150,20 +143,15 @@ impl Block { block_type: BlockType, compression: CompressionType, ) -> crate::Result { - #[cfg(feature = "use_unsafe")] - let mut buf = Slice::with_size_unzeroed(handle.size() as usize); - - #[cfg(not(feature = "use_unsafe"))] - let mut buf = Slice::with_size(handle.size() as usize); + #[warn(unsafe_code)] + let mut builder = unsafe { Slice::builder_unzeroed(handle.size() as usize) }; { - let mut mutator = buf.get_mut().expect("should be the owner"); - #[cfg(unix)] { use std::os::unix::fs::FileExt; - let bytes_read = file.read_at(&mut mutator, *handle.offset())?; + let bytes_read = file.read_at(&mut builder, *handle.offset())?; assert_eq!( bytes_read, @@ -177,7 +165,7 @@ impl Block { { use 
std::os::windows::fs::FileExt; - let bytes_read = file.seek_read(&mut mutator, *handle.offset())?; + let bytes_read = file.seek_read(&mut builder, *handle.offset())?; assert_eq!( bytes_read, @@ -194,9 +182,10 @@ impl Block { } } + let buf = builder.freeze(); let header = Header::decode_from(&mut &buf[..])?; - let data = match compression { + let buf = match compression { CompressionType::None => buf.slice(Header::serialized_len()..), #[cfg(feature = "lz4")] @@ -206,28 +195,22 @@ impl Block { #[allow(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; - #[cfg(feature = "use_unsafe")] - let mut data = Slice::with_size_unzeroed(header.uncompressed_length as usize); - - #[cfg(not(feature = "use_unsafe"))] - let mut data = Slice::with_size(header.uncompressed_length as usize); + #[warn(unsafe_code)] + let mut builder = + unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; { - // NOTE: We know that we are the owner - #[allow(clippy::expect_used)] - let mut mutator = data.get_mut().expect("should be the owner"); - - lz4_flex::decompress_into(raw_data, &mut mutator) + lz4_flex::decompress_into(raw_data, &mut builder) .map_err(|_| crate::Error::Decompress(compression))?; } - data + builder.freeze().into() } }; #[allow(clippy::expect_used, clippy::cast_possible_truncation)] { - debug_assert_eq!(header.uncompressed_length, data.len() as u32); + debug_assert_eq!(header.uncompressed_length, buf.len() as u32); } if header.block_type != block_type { @@ -243,16 +226,20 @@ impl Block { )))); } - let checksum = Checksum::from_raw(crate::hash::hash128(&data)); + let checksum = Checksum::from_raw(crate::hash::hash128(&buf)); if checksum != header.checksum { log::warn!( "Checksum mismatch for block {block_type:?}@{handle:?}, got={}, expected={}", *checksum, *header.checksum, ); - // return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + + return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); } - Ok(Self { header, data }) + Ok(Self { + header, + data: Slice::from(buf), + }) } } diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 99ade210..013837a9 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -5,7 +5,8 @@ pub(crate) mod iter; use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; -use crate::segment::block::ParsedItem; +use crate::{segment::block::ParsedItem, Cache}; +use std::sync::Arc; #[enum_dispatch::enum_dispatch] pub trait BlockIndex { diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 76b85b23..26781be4 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -1311,24 +1311,24 @@ mod tests { let items = [ InternalValue::from_components( Slice::new(&[0]), - Slice::new(&[]), + Slice::empty(), 3_834_029_160_418_063_669, Value, ), InternalValue::from_components(Slice::new(&[0]), Slice::new(&[]), 127, Tombstone), InternalValue::from_components( Slice::new(&[53, 53, 53]), - Slice::new(&[]), + Slice::empty(), 18_446_744_073_709_551_615, Tombstone, ), InternalValue::from_components( Slice::new(&[255]), - Slice::new(&[]), + Slice::empty(), 18_446_744_069_414_584_831, Tombstone, ), - InternalValue::from_components(Slice::new(&[255, 255]), Slice::new(&[]), 47, Value), + InternalValue::from_components(Slice::new(&[255, 255]), Slice::empty(), 47, Value), ]; let bytes = DataBlock::encode_into_vec(&items, 2, 1.0)?; diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 
c84472bd..49141308 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -270,6 +270,10 @@ pub struct DataBlock { } impl DataBlock { + // TODO: maybe make the constructor check the block type, so we don't have to do it in the + // block loading routine... + // TODO: for index block etc. too + /// Interprets a block as a data block. /// /// The caller needs to make sure the block is actually a data block diff --git a/src/segment/filter/bit_array/reader.rs b/src/segment/filter/bit_array/reader.rs index 07dfcf8e..10fec004 100644 --- a/src/segment/filter/bit_array/reader.rs +++ b/src/segment/filter/bit_array/reader.rs @@ -32,12 +32,8 @@ impl<'a> BitArrayReader<'a> { pub fn get(&self, idx: usize) -> bool { let byte_idx = idx / 8; - #[cfg(not(feature = "bloom_use_unsafe"))] let byte = self.0.get(byte_idx).expect("should be in bounds"); - #[cfg(feature = "bloom_use_unsafe")] - let byte = unsafe { self.0.get_unchecked(byte_idx) }; - let bit_idx = idx % 8; get_bit(*byte, bit_idx) } diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 826661af..c0999d9c 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -2,6 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use crate::UserKey; use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, segment::{ @@ -13,7 +14,6 @@ use crate::{ }; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::{Cursor, Seek}; -use value_log::UserKey; use varint_rs::{VarintReader, VarintWriter}; /// Points to a block on file diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index af1737c6..86246b3c 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -165,7 +165,7 @@ impl MultiWriter { #[cfg(test)] mod tests { - use crate::{AbstractTree, Config}; + use crate::{AbstractTree, Config, SeqNo}; use test_log::test; // NOTE: Tests that versions of the same key stay @@ -184,11 +184,11 @@ mod tests { tree.insert("a", "a5".repeat(4_000), 4); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); tree.major_compact(1_024, 0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -207,11 +207,11 @@ mod tests { tree.insert("c", "a1".repeat(4_000), 1); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); tree.major_compact(1_024, 0)?; assert_eq!(3, tree.segment_count()); - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); Ok(()) } diff --git a/src/seqno.rs b/src/seqno.rs index 7aefc091..a719f9f9 100644 --- a/src/seqno.rs +++ b/src/seqno.rs @@ -28,16 +28,13 @@ use std::sync::{ /// tree.insert("b".as_bytes(), "abc", seqno.next()); /// tree.insert("c".as_bytes(), "abc", seqno.next()); /// -/// // Maybe create a snapshot -/// let snapshot = tree.snapshot(seqno.get()); -/// /// // Create a batch /// let batch_seqno = seqno.next(); /// tree.remove("a".as_bytes(), batch_seqno); /// tree.remove("b".as_bytes(), batch_seqno); /// tree.remove("c".as_bytes(), batch_seqno); /// # -/// # assert!(tree.is_empty(None, None)?); +/// # assert!(tree.is_empty(batch_seqno + 1, None)?); /// # Ok::<(), lsm_tree::Error>(()) /// ``` 
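// Back-of-the-envelope for the 63-bit space enforced in `next()` below:
// reserving the MSB for transactions leaves 2^63 (about 9.2 * 10^18)
// sequence numbers; at a sustained 10 million writes per second, the
// "Ran out of sequence numbers" assert could not fire for roughly
// 29,000 years.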
#[derive(Clone, Default, Debug)] @@ -61,7 +58,14 @@ impl SequenceNumberCounter { /// Gets the next sequence number. #[must_use] pub fn next(&self) -> SeqNo { - self.0.fetch_add(1, Release) + let seqno = self.0.fetch_add(1, Release); + + // The MSB is reserved for transactions. + // + // This gives us 63-bit sequence numbers technically. + assert!(seqno < 0x8000_0000_0000_0000, "Ran out of sequence numbers"); + + seqno } /// Sets the sequence number. @@ -74,3 +78,23 @@ impl SequenceNumberCounter { self.0.fetch_max(seqno, AcqRel); } } + +#[cfg(test)] +mod tests { + use test_log::test; + + #[test] + fn not_max_seqno() { + let counter = super::SequenceNumberCounter::default(); + counter.set(0x7FFF_FFFF_FFFF_FFFF); + let _ = counter.next(); + } + + #[test] + #[should_panic = "Ran out of sequence numbers"] + fn max_seqno() { + let counter = super::SequenceNumberCounter::default(); + counter.set(0x8000_0000_0000_0000); + let _ = counter.next(); + } +} diff --git a/src/slice/default/mod.rs b/src/slice/default/mod.rs new file mode 100644 index 00000000..2b99e747 --- /dev/null +++ b/src/slice/default/mod.rs @@ -0,0 +1,76 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use byteview::ByteView; + +/// An immutable byte slice that can be cloned without additional heap allocation +/// +/// There is no guarantee of any sort of alignment for zero-copy (de)serialization. +#[derive(Debug, Clone, Eq, Hash, Ord)] +pub struct Slice(pub(super) ByteView); + +impl Slice { + /// Construct a [`Slice`] from a byte slice. + #[must_use] + pub fn new(bytes: &[u8]) -> Self { + Self(bytes.into()) + } + + pub(crate) fn empty() -> Self { + Self(ByteView::new(&[])) + } + + pub(crate) unsafe fn builder_unzeroed(len: usize) -> byteview::Builder { + ByteView::builder_unzeroed(len) + } + + // pub(crate) fn with_size(len: usize) -> Self { + // Self(ByteView::with_size(len)) + // } + + // pub(crate) fn with_size_unzeroed(len: usize) -> Self { + // Self(ByteView::with_size_unzeroed(len)) + // } + + pub(crate) fn slice(&self, range: impl std::ops::RangeBounds) -> Self { + Self(self.0.slice(range)) + } + + pub(crate) fn fused(left: &[u8], right: &[u8]) -> Self { + Self(ByteView::fused(left, right)) + } + + pub(crate) fn from_reader( + reader: &mut R, + len: usize, + ) -> std::io::Result { + ByteView::from_reader(reader, len).map(Self) + } +} + +// Arc::from> is specialized +impl From> for Slice { + fn from(value: Vec) -> Self { + Self(ByteView::from(value)) + } +} + +// Arc::from> is specialized +impl From for Slice { + fn from(value: String) -> Self { + Self(ByteView::from(value.into_bytes())) + } +} + +impl From for Slice { + fn from(value: ByteView) -> Self { + Self(value) + } +} + +impl From for ByteView { + fn from(value: Slice) -> Self { + value.0 + } +} diff --git a/src/slice/mod.rs b/src/slice/mod.rs new file mode 100644 index 00000000..1ff87424 --- /dev/null +++ b/src/slice/mod.rs @@ -0,0 +1,242 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +#[cfg(not(feature = "bytes_1"))] +mod default; + +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; + +// #[cfg(feature = "bytes")] +// mod slice_bytes; + +// #[cfg(feature = "bytes")] +// pub use slice_bytes::Slice; + +#[cfg(not(feature = "bytes_1"))] +pub use default::Slice; + +impl AsRef<[u8]> for Slice { + fn as_ref(&self) -> 
&[u8] { + &self.0 + } +} + +impl From<&[u8]> for Slice { + fn from(value: &[u8]) -> Self { + #[cfg(not(feature = "bytes_1"))] + { + Self(byteview::ByteView::new(value)) + } + + // #[cfg(feature = "bytes")] + // { + // Self(bytes::Bytes::from(value.to_vec())) + // } + } +} + +impl From> for Slice { + fn from(value: Arc<[u8]>) -> Self { + Self::from(&*value) + } +} + +impl From<&Vec> for Slice { + fn from(value: &Vec) -> Self { + Self::from(value.as_slice()) + } +} + +impl From<&str> for Slice { + fn from(value: &str) -> Self { + Self::from(value.as_bytes()) + } +} + +impl From<&String> for Slice { + fn from(value: &String) -> Self { + Self::from(value.as_str()) + } +} + +impl From<&Path> for Slice { + fn from(value: &Path) -> Self { + Self::from(value.as_os_str().as_encoded_bytes()) + } +} + +impl From for Slice { + fn from(value: PathBuf) -> Self { + Self::from(value.as_os_str().as_encoded_bytes()) + } +} + +impl From> for Slice { + fn from(value: Arc) -> Self { + Self::from(&*value) + } +} + +impl From<[u8; N]> for Slice { + fn from(value: [u8; N]) -> Self { + Self::from(value.as_slice()) + } +} + +impl From<&[u8; N]> for Slice { + fn from(value: &[u8; N]) -> Self { + Self::from(value.as_slice()) + } +} + +impl FromIterator for Slice { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + Vec::from_iter(iter).into() + } +} + +impl std::ops::Deref for Slice { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_ref() + } +} + +impl std::borrow::Borrow<[u8]> for Slice { + fn borrow(&self) -> &[u8] { + self + } +} + +impl PartialEq for Slice +where + T: AsRef<[u8]>, +{ + fn eq(&self, other: &T) -> bool { + self.as_ref() == other.as_ref() + } +} + +impl PartialEq for &[u8] { + fn eq(&self, other: &Slice) -> bool { + *self == other.as_ref() + } +} + +impl PartialOrd for Slice +where + T: AsRef<[u8]>, +{ + fn partial_cmp(&self, other: &T) -> Option { + self.as_ref().partial_cmp(other.as_ref()) + } +} + +impl PartialOrd for &[u8] { + fn partial_cmp(&self, other: &Slice) -> Option { + (*self).partial_cmp(other.as_ref()) + } +} + +#[cfg(test)] +#[allow(clippy::expect_used)] +mod tests { + use super::Slice; + use std::{fmt::Debug, sync::Arc}; + use test_log::test; + + fn assert_slice_handles(v: T) + where + T: Clone + Debug, + Slice: From + PartialEq + PartialOrd, + { + // verify slice arc roundtrips + let slice: Slice = v.clone().into(); + assert_eq!(slice, v, "slice_arc: {slice:?}, v: {v:?}"); + assert!(slice >= v, "slice_arc: {slice:?}, v: {v:?}"); + } + + #[test] + fn slice_empty() { + assert_eq!(Slice::empty(), []); + } + + #[test] + fn slice_fuse_empty() { + let bytes = Slice::fused(&[], &[]); + assert_eq!(&*bytes, &[] as &[u8]); + } + + #[test] + fn slice_fuse_one() { + let bytes = Slice::fused(b"abc", &[]); + assert_eq!(&*bytes, b"abc"); + } + + #[test] + fn slice_fuse_two() { + let bytes = Slice::fused(b"abc", b"def"); + assert_eq!(&*bytes, b"abcdef"); + } + + #[test] + #[allow(unsafe_code)] + fn slice_with_size() { + assert_eq!( + &*unsafe { + let mut b = Slice::builder_unzeroed(5); + b.fill(0); + b.freeze() + }, + [0; 5], + ); + assert_eq!( + &*unsafe { + let mut b = Slice::builder_unzeroed(50); + b.fill(0); + b.freeze() + }, + [0; 50], + ); + } + + /// This test verifies that we can create a `Slice` from various types and compare a `Slice` with them. 
+ #[test] + fn test_slice_instantiation() { + // - &[u8] + assert_slice_handles::<&[u8]>(&[1, 2, 3, 4]); + // - Arc + assert_slice_handles::>(Arc::new([1, 2, 3, 4])); + // - Vec + assert_slice_handles::>(vec![1, 2, 3, 4]); + // - &str + assert_slice_handles::<&str>("hello"); + // - String + assert_slice_handles::("hello".to_string()); + // - [u8; N] + assert_slice_handles::<[u8; 4]>([1, 2, 3, 4]); + + // Special case for these types + // - Iterator + let slice = Slice::from_iter(vec![1, 2, 3, 4]); + assert_eq!(slice, vec![1, 2, 3, 4]); + + // - Arc + let arc_str: Arc = Arc::from("hello"); + let slice = Slice::from(arc_str.clone()); + assert_eq!(slice.as_ref(), arc_str.as_bytes()); + + // - io::Read + let mut reader = std::io::Cursor::new(vec![1, 2, 3, 4]); + let slice = Slice::from_reader(&mut reader, 4).expect("read"); + assert_eq!(slice, vec![1, 2, 3, 4]); + } +} diff --git a/src/slice/slice_bytes.rs b/src/slice/slice_bytes.rs new file mode 100644 index 00000000..d8854d61 --- /dev/null +++ b/src/slice/slice_bytes.rs @@ -0,0 +1,127 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use bytes::{Bytes, BytesMut}; + +/// An immutable byte slice that can be cloned without additional heap allocation +/// +/// There is no guarantee of any sort of alignment for zero-copy (de)serialization. +#[derive(Debug, Clone, Eq, Hash, Ord)] +pub struct Slice(pub(super) Bytes); + +impl Slice { + /// Construct a [`Slice`] from a byte slice. + #[must_use] + pub fn new(bytes: &[u8]) -> Self { + Self(Bytes::copy_from_slice(bytes)) + } + + #[doc(hidden)] + #[must_use] + pub fn empty() -> Self { + Self(Bytes::from_static(&[])) + } + + #[must_use] + #[doc(hidden)] + pub fn with_size(len: usize) -> Self { + let bytes = vec![0; len]; + Self(Bytes::from(bytes)) + } + + #[must_use] + #[doc(hidden)] + pub fn with_size_unzeroed(len: usize) -> Self { + Self(Self::get_unzeroed_builder(len).freeze()) + } + + fn get_unzeroed_builder(len: usize) -> BytesMut { + // Use `with_capacity` & `set_len`` to avoid zeroing the buffer + let mut builder = BytesMut::with_capacity(len); + + // SAFETY: we just allocated `len` bytes, and `read_exact` will fail if + // it doesn't fill the buffer, subsequently dropping the uninitialized + // BytesMut object + #[allow(unsafe_code)] + unsafe { + builder.set_len(len); + } + + builder + } + + #[doc(hidden)] + #[must_use] + pub fn slice(&self, range: impl std::ops::RangeBounds) -> Self { + Self(self.0.slice(range)) + } + + #[doc(hidden)] + #[must_use] + pub fn fused(left: &[u8], right: &[u8]) -> Self { + use std::io::Write; + + let len = left.len() + right.len(); + let mut builder = Self::get_unzeroed_builder(len); + { + let mut writer = &mut builder[..]; + + writer.write_all(left).expect("should write"); + writer.write_all(right).expect("should write"); + } + + Self(builder.freeze()) + } + + #[must_use] + #[doc(hidden)] + pub fn get_mut(&mut self) -> Option + '_> { + todo!(); + + Option::<&mut [u8]>::None + } + + /// Constructs a [`Slice`] from an I/O reader by pulling in `len` bytes. + /// + /// The reader may not read the existing buffer. + #[doc(hidden)] + pub fn from_reader(reader: &mut R, len: usize) -> std::io::Result { + let mut builder = Self::get_unzeroed_builder(len); + + // SAFETY: Normally, read_exact over an uninitialized buffer is UB, + // however we know that in lsm-tree etc. 
only I/O readers or cursors over Vecs are used + // so it's safe + // + // The safe API is unstable: https://github.com/rust-lang/rust/issues/78485 + reader.read_exact(&mut builder)?; + + Ok(Self(builder.freeze())) + } +} + +impl From for Slice { + fn from(value: Bytes) -> Self { + Self(value) + } +} + +impl From for Bytes { + fn from(value: Slice) -> Self { + value.0 + } +} + +// Bytes::from> is zero-copy optimized +impl From> for Slice { + fn from(value: Vec) -> Self { + Self(Bytes::from(value)) + } +} + +// Bytes::from is zero-copy optimized +impl From for Slice { + fn from(value: String) -> Self { + Self(Bytes::from(value)) + } +} diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index a5d334a0..63ec5932 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -71,8 +71,8 @@ impl<'a> Ingestion<'a> { self.tree.id, self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), - true, - true, + false, + false, #[cfg(feature = "metrics")] self.tree.metrics.clone(), ) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 0f5c678c..a96026a2 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -13,13 +13,13 @@ use crate::{ compaction::CompactionStrategy, config::Config, format_version::FormatVersion, + iter_guard::{IterGuard, IterGuardImpl}, level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, segment::Segment, value::InternalValue, - AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, Snapshot, UserKey, UserValue, - ValueType, + AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, UserKey, UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ @@ -29,6 +29,24 @@ use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; +pub struct Guard(crate::Result<(UserKey, UserValue)>); + +impl IterGuard for Guard { + fn key(self) -> crate::Result { + self.0.map(|(k, _)| k) + } + + fn size(self) -> crate::Result { + // NOTE: We know LSM-tree values are 32 bits in length max + #[allow(clippy::cast_possible_truncation)] + self.into_inner().map(|(_, v)| v.len() as u32) + } + + fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { + self.0 + } +} + fn ignore_tombstone_value(item: InternalValue) -> Option { if item.is_tombstone() { None @@ -50,6 +68,41 @@ impl std::ops::Deref for Tree { } impl AbstractTree for Tree { + fn prefix>( + &self, + prefix: K, + seqno: SeqNo, + index: Option>, + ) -> Box> + '_> { + Box::new( + self.create_prefix(&prefix, seqno, index) + .map(|kv| IterGuardImpl::Standard(Guard(kv))), + ) + } + + fn range, R: RangeBounds>( + &self, + range: R, + seqno: SeqNo, + index: Option>, + ) -> Box> + '_> { + Box::new( + self.create_range(&range, seqno, index) + .map(|kv| IterGuardImpl::Standard(Guard(kv))), + ) + } + + // TODO: doctest + fn tombstone_count(&self) -> u64 { + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(Segment::tombstone_count) + .sum() + } + fn ingest(&self, iter: impl Iterator) -> crate::Result<()> { use crate::tree::ingest::Ingestion; use std::time::Instant; @@ -127,7 +180,7 @@ impl AbstractTree for Tree { .unwrap_or_default() } - fn size_of>(&self, key: K, seqno: Option) -> crate::Result> { + fn size_of>(&self, key: K, seqno: SeqNo) -> crate::Result> { // NOTE: We know that values are u32 max #[allow(clippy::cast_possible_truncation)] Ok(self.get(key, seqno)?.map(|x| x.len() as u32)) @@ -177,22 +230,6 @@ impl AbstractTree for Tree { Ok(sum) } */ - fn keys( - &self, - seqno: 
Option, - index: Option>, - ) -> Box> + 'static> { - Box::new(self.create_iter(seqno, index).map(|x| x.map(|(k, _)| k))) - } - - fn values( - &self, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - Box::new(self.create_iter(seqno, index).map(|x| x.map(|(_, v)| v))) - } - fn flush_memtable( &self, segment_id: SegmentId, @@ -421,40 +458,12 @@ impl AbstractTree for Tree { .max() } - fn snapshot(&self, seqno: SeqNo) -> Snapshot { - use crate::AnyTree::Standard; - - Snapshot::new(Standard(self.clone()), seqno) - } - - fn get>( - &self, - key: K, - seqno: Option, - ) -> crate::Result> { + fn get>(&self, key: K, seqno: SeqNo) -> crate::Result> { Ok(self - .get_internal_entry(key.as_ref(), seqno.unwrap_or(SeqNo::MAX))? + .get_internal_entry(key.as_ref(), seqno)? .map(|x| x.value)) } - fn range, R: RangeBounds>( - &self, - range: R, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - Box::new(self.create_range(&range, seqno, index)) - } - - fn prefix>( - &self, - prefix: K, - seqno: Option, - index: Option>, - ) -> Box> + 'static> { - Box::new(self.create_prefix(prefix, seqno, index)) - } - fn insert, V: Into>( &self, key: K, @@ -728,7 +737,7 @@ impl Tree { #[must_use] pub fn create_iter( &self, - seqno: Option, + seqno: SeqNo, ephemeral: Option>, ) -> impl DoubleEndedIterator> + 'static { self.create_range::(&.., seqno, ephemeral) @@ -738,7 +747,7 @@ impl Tree { pub fn create_internal_range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( &'a self, range: &'a R, - seqno: Option, + seqno: SeqNo, ephemeral: Option>, ) -> impl DoubleEndedIterator> + 'static { use crate::range::{IterState, TreeIter}; @@ -780,9 +789,9 @@ impl Tree { #[doc(hidden)] pub fn create_range<'a, K: AsRef<[u8]> + 'a, R: RangeBounds + 'a>( - &'a self, + &self, range: &'a R, - seqno: Option, + seqno: SeqNo, ephemeral: Option>, ) -> impl DoubleEndedIterator> + 'static { self.create_internal_range(range, seqno, ephemeral) @@ -794,9 +803,9 @@ impl Tree { #[doc(hidden)] pub fn create_prefix<'a, K: AsRef<[u8]> + 'a>( - &'a self, + &self, prefix: K, - seqno: Option, + seqno: SeqNo, ephemeral: Option>, ) -> impl DoubleEndedIterator> + 'static { use crate::range::prefix_to_range; @@ -946,7 +955,6 @@ impl Tree { for (idx, dirent) in std::fs::read_dir(&segment_base_folder)?.enumerate() { let dirent = dirent?; - let file_name = dirent.file_name(); // https://en.wikipedia.org/wiki/.DS_Store diff --git a/src/vlog/blob_file/gc_stats.rs b/src/vlog/blob_file/gc_stats.rs new file mode 100644 index 00000000..bfaf5093 --- /dev/null +++ b/src/vlog/blob_file/gc_stats.rs @@ -0,0 +1,33 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use std::sync::atomic::AtomicU64; + +#[derive(Debug, Default)] +pub struct GcStats { + pub(crate) stale_items: AtomicU64, + pub(crate) stale_bytes: AtomicU64, +} + +impl GcStats { + pub fn set_stale_items(&self, x: u64) { + self.stale_items + .store(x, std::sync::atomic::Ordering::Release); + } + + pub fn set_stale_bytes(&self, x: u64) { + self.stale_bytes + .store(x, std::sync::atomic::Ordering::Release); + } + + /// Returns the amount of dead items in the blob file. + pub fn stale_items(&self) -> u64 { + self.stale_items.load(std::sync::atomic::Ordering::Acquire) + } + + /// Returns the amount of dead bytes in the blob file. 
+ pub fn stale_bytes(&self) -> u64 { + self.stale_bytes.load(std::sync::atomic::Ordering::Acquire) + } +} diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs new file mode 100644 index 00000000..994586e6 --- /dev/null +++ b/src/vlog/blob_file/merge.rs @@ -0,0 +1,121 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + vlog::{BlobFileId, BlobFileReader, Compressor}, + UserKey, UserValue, +}; +use interval_heap::IntervalHeap; +use std::cmp::Reverse; + +macro_rules! fail_iter { + ($e:expr) => { + match $e { + Ok(v) => v, + Err(e) => return Some(Err(e.into())), + } + }; +} + +type IteratorIndex = usize; + +#[derive(Debug)] +struct IteratorValue { + index: IteratorIndex, + key: UserKey, + value: UserValue, + blob_file_id: BlobFileId, + checksum: u64, +} + +impl PartialEq for IteratorValue { + fn eq(&self, other: &Self) -> bool { + self.key == other.key + } +} +impl Eq for IteratorValue {} + +impl PartialOrd for IteratorValue { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for IteratorValue { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (&self.key, Reverse(&self.blob_file_id)).cmp(&(&other.key, Reverse(&other.blob_file_id))) + } +} + +/// Interleaves multiple blob file readers into a single, sorted stream +#[allow(clippy::module_name_repetitions)] +pub struct MergeReader { + readers: Vec>, + heap: IntervalHeap, +} + +impl MergeReader { + /// Initializes a new merging reader + pub fn new(readers: Vec>) -> Self { + let heap = IntervalHeap::with_capacity(readers.len()); + Self { readers, heap } + } + + fn advance_reader(&mut self, idx: usize) -> crate::Result<()> { + let reader = self.readers.get_mut(idx).expect("iter should exist"); + + if let Some(value) = reader.next() { + let (k, v, checksum) = value?; + let blob_file_id = reader.blob_file_id; + + self.heap.push(IteratorValue { + index: idx, + key: k, + value: v, + blob_file_id, + checksum, + }); + } + + Ok(()) + } + + fn push_next(&mut self) -> crate::Result<()> { + for idx in 0..self.readers.len() { + self.advance_reader(idx)?; + } + + Ok(()) + } +} + +impl Iterator for MergeReader { + type Item = crate::Result<(UserKey, UserValue, BlobFileId, u64)>; + + fn next(&mut self) -> Option { + if self.heap.is_empty() { + fail_iter!(self.push_next()); + } + + if let Some(head) = self.heap.pop_min() { + fail_iter!(self.advance_reader(head.index)); + + // Discard old items + while let Some(next) = self.heap.pop_min() { + if next.key == head.key { + fail_iter!(self.advance_reader(next.index)); + } else { + // Reached next user key now + // Push back non-conflicting item and exit + self.heap.push(next); + break; + } + } + + return Some(Ok((head.key, head.value, head.blob_file_id, head.checksum))); + } + + None + } +} diff --git a/src/vlog/blob_file/meta.rs b/src/vlog/blob_file/meta.rs new file mode 100644 index 00000000..e72c191e --- /dev/null +++ b/src/vlog/blob_file/meta.rs @@ -0,0 +1,67 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + coding::{Decode, DecodeError, Encode, EncodeError}, + KeyRange, +}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use std::io::{Read, Write}; + +pub const METADATA_HEADER_MAGIC: &[u8] = &[b'V', b'L', b'O', b'G', b'S', b'M', b'D', 1]; + +#[derive(Debug)] +pub 
struct Metadata { + /// Number of KV-pairs in the blob file + pub item_count: u64, + + /// compressed size in bytes (on disk) (without the fixed size trailer) + pub compressed_bytes: u64, + + /// true size in bytes (if no compression were used) + pub total_uncompressed_bytes: u64, + + /// Key range + pub key_range: KeyRange, +} + +impl Encode for Metadata { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + // Write header + writer.write_all(METADATA_HEADER_MAGIC)?; + + writer.write_u64::(self.item_count)?; + writer.write_u64::(self.compressed_bytes)?; + writer.write_u64::(self.total_uncompressed_bytes)?; + + self.key_range.encode_into(writer)?; + + Ok(()) + } +} + +impl Decode for Metadata { + fn decode_from(reader: &mut R) -> Result { + // Check header + let mut magic = [0u8; METADATA_HEADER_MAGIC.len()]; + reader.read_exact(&mut magic)?; + + if magic != METADATA_HEADER_MAGIC { + return Err(DecodeError::InvalidHeader("BlobFileMeta")); + } + + let item_count = reader.read_u64::()?; + let compressed_bytes = reader.read_u64::()?; + let total_uncompressed_bytes = reader.read_u64::()?; + + let key_range = KeyRange::decode_from(reader)?; + + Ok(Self { + item_count, + compressed_bytes, + total_uncompressed_bytes, + key_range, + }) + } +} diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs new file mode 100644 index 00000000..8762e853 --- /dev/null +++ b/src/vlog/blob_file/mod.rs @@ -0,0 +1,75 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +pub mod gc_stats; +pub mod merge; +pub mod meta; +pub mod multi_writer; +pub mod reader; +pub mod trailer; +pub mod writer; + +use crate::vlog::{BlobFileId, Compressor}; +use gc_stats::GcStats; +use meta::Metadata; +use std::{marker::PhantomData, path::PathBuf}; + +/// A blob file is an immutable, sorted, contiguous file that contains large key-value pairs (blobs) +#[derive(Debug)] +pub struct BlobFile { + /// Blob file ID + pub id: BlobFileId, + + /// File path + pub path: PathBuf, + + /// Statistics + pub meta: Metadata, + + /// Runtime stats for garbage collection + pub gc_stats: GcStats, + + pub(crate) _phantom: PhantomData, +} + +impl BlobFile { + /// Returns a scanner that can iterate through the blob file. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn scan(&self) -> crate::Result> { + reader::Reader::new(&self.path, self.id) + } + + /// Returns the amount of items in the blob file. + pub fn len(&self) -> u64 { + self.meta.item_count + } + + /// Marks the blob file as fully stale. + pub(crate) fn mark_as_stale(&self) { + self.gc_stats.set_stale_items(self.meta.item_count); + + self.gc_stats + .set_stale_bytes(self.meta.total_uncompressed_bytes); + } + + /// Returns `true` if the blob file is fully stale. + pub fn is_stale(&self) -> bool { + self.gc_stats.stale_items() == self.meta.item_count + } + + /// Returns the percent of dead items in the blob file. 
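// e.g. item_count = 100 with 40 stale items yields stale_ratio() == 0.4;
// a fully-stale file (stale_items == item_count) yields 1.0, the same
// condition `is_stale` checks above.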
+ // NOTE: Precision is not important here + #[allow(clippy::cast_precision_loss)] + pub fn stale_ratio(&self) -> f32 { + let dead = self.gc_stats.stale_items() as f32; + if dead == 0.0 { + return 0.0; + } + + dead / self.meta.item_count as f32 + } +} diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs new file mode 100644 index 00000000..e423d661 --- /dev/null +++ b/src/vlog/blob_file/multi_writer.rs @@ -0,0 +1,146 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::writer::Writer; +use crate::vlog::{compression::Compressor, value_log::IdGenerator, BlobFileId, ValueHandle}; +use std::path::{Path, PathBuf}; + +/// Blob file writer, may write multiple blob files +pub struct MultiWriter { + folder: PathBuf, + target_size: u64, + + writers: Vec>, + + id_generator: IdGenerator, + + compression: Option, +} + +impl MultiWriter { + /// Initializes a new blob file writer. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + pub fn new>( + id_generator: IdGenerator, + target_size: u64, + folder: P, + ) -> std::io::Result { + let folder = folder.as_ref(); + + let blob_file_id = id_generator.next(); + let blob_file_path = folder.join(blob_file_id.to_string()); + + Ok(Self { + id_generator, + folder: folder.into(), + target_size, + + writers: vec![Writer::new(blob_file_path, blob_file_id)?], + + compression: None, + }) + } + + /// Sets the compression method + #[must_use] + #[doc(hidden)] + pub fn use_compression(mut self, compressor: Option) -> Self { + self.compression.clone_from(&compressor); + self.get_active_writer_mut().compression = compressor; + self + } + + #[doc(hidden)] + #[must_use] + pub fn get_active_writer(&self) -> &Writer { + // NOTE: initialized in constructor + #[allow(clippy::expect_used)] + self.writers.last().expect("should exist") + } + + fn get_active_writer_mut(&mut self) -> &mut Writer { + // NOTE: initialized in constructor + #[allow(clippy::expect_used)] + self.writers.last_mut().expect("should exist") + } + + /// Returns the [`ValueHandle`] for the next written blob. + /// + /// This can be used to index an item into an external `Index`. + #[must_use] + pub fn get_next_value_handle(&self) -> ValueHandle { + ValueHandle { + offset: self.offset(), + blob_file_id: self.blob_file_id(), + } + } + + #[doc(hidden)] + #[must_use] + pub fn offset(&self) -> u64 { + self.get_active_writer().offset() + } + + #[must_use] + fn blob_file_id(&self) -> BlobFileId { + self.get_active_writer().blob_file_id() + } + + /// Sets up a new writer for the next blob file. + fn rotate(&mut self) -> crate::Result<()> { + log::debug!("Rotating blob file writer"); + + let new_blob_file_id = self.id_generator.next(); + let blob_file_path = self.folder.join(new_blob_file_id.to_string()); + + let new_writer = Writer::new(blob_file_path, new_blob_file_id)? + .use_compression(self.compression.clone()); + + self.writers.push(new_writer); + + Ok(()) + } + + /// Writes an item. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
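+    ///
+    /// A sketch of the intended call pattern (target size and keys are illustrative):
+    /// ```ignore
+    /// let mut writer = MultiWriter::new(id_generator, 64 * 1_024 * 1_024, folder)?;
+    /// writer.write(b"a", b"some large value")?; // may rotate to a new blob file afterwards
+    /// let blob_files = writer.finish()?;
+    /// ```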
+ pub fn write, V: AsRef<[u8]>>( + &mut self, + key: K, + value: V, + ) -> crate::Result { + let key = key.as_ref(); + let value = value.as_ref(); + + let target_size = self.target_size; + + // Write actual value into blob file + let writer = self.get_active_writer_mut(); + let bytes_written = writer.write(key, value)?; + + // Check for blob file size target, maybe rotate to next writer + if writer.offset() >= target_size { + writer.flush()?; + self.rotate()?; + } + + Ok(bytes_written) + } + + pub(crate) fn finish(mut self) -> crate::Result>> { + let writer = self.get_active_writer_mut(); + + if writer.item_count > 0 { + writer.flush()?; + } + + Ok(self.writers) + } +} diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs new file mode 100644 index 00000000..28e38927 --- /dev/null +++ b/src/vlog/blob_file/reader.rs @@ -0,0 +1,119 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC}; +use crate::{ + coding::DecodeError, + vlog::{BlobFileId, Compressor}, + UserKey, UserValue, +}; +use byteorder::{BigEndian, ReadBytesExt}; +use std::{ + fs::File, + io::{BufReader, Read, Seek}, + path::Path, +}; + +macro_rules! fail_iter { + ($e:expr) => { + match $e { + Ok(v) => v, + Err(e) => return Some(Err(e.into())), + } + }; +} + +// TODO: pread + +/// Reads through a blob file in order. +pub struct Reader { + pub(crate) blob_file_id: BlobFileId, + inner: BufReader, + is_terminated: bool, + compression: Option, +} + +impl Reader { + /// Initializes a new blob file reader. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn new>(path: P, blob_file_id: BlobFileId) -> crate::Result { + let file_reader = BufReader::new(File::open(path)?); + Ok(Self::with_reader(blob_file_id, file_reader)) + } + + pub(crate) fn get_offset(&mut self) -> std::io::Result { + self.inner.stream_position() + } + + /// Initializes a new blob file reader. 
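+    ///
+    /// A sketch, assuming an already-opened file (`MyCompressor` is a hypothetical
+    /// [`Compressor`] implementor, the blob file ID is illustrative):
+    /// ```ignore
+    /// let reader = Reader::<MyCompressor>::with_reader(0, BufReader::new(file));
+    /// for item in reader {
+    ///     let (key, value, checksum) = item?;
+    /// }
+    /// ```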
+ #[must_use] + pub fn with_reader(blob_file_id: BlobFileId, file_reader: BufReader) -> Self { + Self { + blob_file_id, + inner: file_reader, + is_terminated: false, + compression: None, + } + } + + pub(crate) fn use_compression(mut self, compressor: Option) -> Self { + self.compression = compressor; + self + } + + pub(crate) fn into_inner(self) -> BufReader { + self.inner + } +} + +impl Iterator for Reader { + type Item = crate::Result<(UserKey, UserValue, u64)>; + + fn next(&mut self) -> Option { + if self.is_terminated { + return None; + } + + { + let mut buf = [0; BLOB_HEADER_MAGIC.len()]; + fail_iter!(self.inner.read_exact(&mut buf)); + + if buf == METADATA_HEADER_MAGIC { + self.is_terminated = true; + return None; + } + + if buf != BLOB_HEADER_MAGIC { + return Some(Err(crate::Error::Decode(DecodeError::InvalidHeader( + "Blob", + )))); + } + } + + let checksum = fail_iter!(self.inner.read_u64::()); + + let key_len = fail_iter!(self.inner.read_u16::()); + let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize)); + + let val_len = fail_iter!(self.inner.read_u32::()); + let val = match &self.compression { + Some(compressor) => { + // TODO: https://github.com/PSeitz/lz4_flex/issues/166 + let mut val = vec![0; val_len as usize]; + fail_iter!(self.inner.read_exact(&mut val)); + UserValue::from(fail_iter!(compressor.decompress(&val))) + } + None => { + // NOTE: When not using compression, we can skip + // the intermediary heap allocation and read directly into a Slice + fail_iter!(UserValue::from_reader(&mut self.inner, val_len as usize)) + } + }; + + Some(Ok((key, val, checksum))) + } +} diff --git a/src/vlog/blob_file/trailer.rs b/src/vlog/blob_file/trailer.rs new file mode 100644 index 00000000..9d3af34d --- /dev/null +++ b/src/vlog/blob_file/trailer.rs @@ -0,0 +1,75 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::meta::Metadata; +use crate::coding::{Decode, DecodeError, Encode, EncodeError}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use std::{ + fs::File, + io::{BufReader, Read, Seek, Write}, + path::Path, +}; + +pub const TRAILER_MAGIC: &[u8] = &[b'V', b'L', b'O', b'G', b'T', b'R', b'L', 1]; +pub const TRAILER_SIZE: usize = 256; + +#[derive(Debug)] +#[allow(clippy::module_name_repetitions)] +pub struct Trailer { + pub metadata: Metadata, + pub metadata_ptr: u64, +} + +impl Trailer { + pub fn from_file>(path: P) -> crate::Result { + let file = File::open(path)?; + let mut reader = BufReader::new(file); + reader.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; + + // Get metadata ptr + let metadata_ptr = reader.read_u64::()?; + + // IMPORTANT: Subtract sizeof(meta_ptr) ------v + let remaining_padding = TRAILER_SIZE - std::mem::size_of::() - TRAILER_MAGIC.len(); + reader.seek_relative(remaining_padding as i64)?; + + // Check trailer magic + let mut magic = [0u8; TRAILER_MAGIC.len()]; + reader.read_exact(&mut magic)?; + + if magic != TRAILER_MAGIC { + return Err(crate::Error::Decode(DecodeError::InvalidHeader( + "BlobFileTrailer", + ))); + } + + // Jump to metadata and parse + reader.seek(std::io::SeekFrom::Start(metadata_ptr))?; + let metadata = Metadata::decode_from(&mut reader)?; + + Ok(Self { + metadata, + metadata_ptr, + }) + } +} + +impl Encode for Trailer { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + let mut v = Vec::with_capacity(TRAILER_SIZE); + + 
v.write_u64::(self.metadata_ptr)?; + + // Pad with remaining bytes + v.resize(TRAILER_SIZE - TRAILER_MAGIC.len(), 0); + + v.write_all(TRAILER_MAGIC)?; + + assert_eq!(v.len(), TRAILER_SIZE, "blob file trailer has invalid size"); + + writer.write_all(&v)?; + + Ok(()) + } +} diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs new file mode 100644 index 00000000..f380dd10 --- /dev/null +++ b/src/vlog/blob_file/writer.rs @@ -0,0 +1,202 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{meta::Metadata, trailer::Trailer}; +use crate::{ + coding::Encode, + vlog::{compression::Compressor, BlobFileId}, + KeyRange, UserKey, +}; +use byteorder::{BigEndian, WriteBytesExt}; +use std::{ + fs::File, + io::{BufWriter, Seek, Write}, + path::{Path, PathBuf}, +}; + +pub const BLOB_HEADER_MAGIC: &[u8] = &[b'V', b'L', b'G', b'B', b'L', b'O', b'B', 1]; + +/// Blob file writer +pub struct Writer { + pub path: PathBuf, + pub(crate) blob_file_id: BlobFileId, + + #[allow(clippy::struct_field_names)] + active_writer: BufWriter, + + offset: u64, + + pub(crate) item_count: u64, + pub(crate) written_blob_bytes: u64, + pub(crate) uncompressed_bytes: u64, + + pub(crate) first_key: Option, + pub(crate) last_key: Option, + + pub(crate) compression: Option, +} + +impl Writer { + /// Initializes a new blob file writer. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + pub fn new>(path: P, blob_file_id: BlobFileId) -> std::io::Result { + let path = path.as_ref(); + + let file = File::create(path)?; + + Ok(Self { + path: path.into(), + blob_file_id, + + active_writer: BufWriter::new(file), + + offset: 0, + item_count: 0, + written_blob_bytes: 0, + uncompressed_bytes: 0, + + first_key: None, + last_key: None, + + compression: None, + }) + } + + pub fn use_compression(mut self, compressor: Option) -> Self { + self.compression = compressor; + self + } + + /// Returns the current offset in the file. + /// + /// This can be used to index an item into an external `Index`. + #[must_use] + pub(crate) fn offset(&self) -> u64 { + self.offset + } + + /// Returns the blob file ID. + #[must_use] + pub(crate) fn blob_file_id(&self) -> BlobFileId { + self.blob_file_id + } + + /// Writes an item into the file. + /// + /// Items need to be written in key order. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + /// + /// # Panics + /// + /// Panics if the key length is empty or greater than 2^16, or the value length is greater than 2^32. + pub fn write(&mut self, key: &[u8], value: &[u8]) -> crate::Result { + assert!(!key.is_empty()); + assert!(key.len() <= u16::MAX.into()); + assert!(u32::try_from(value.len()).is_ok()); + + if self.first_key.is_none() { + self.first_key = Some(key.into()); + } + self.last_key = Some(key.into()); + + self.uncompressed_bytes += value.len() as u64; + + let value = match &self.compression { + Some(compressor) => compressor.compress(value)?, + None => value.to_vec(), + }; + + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + hasher.update(key); + hasher.update(&value); + let checksum = hasher.digest(); + + // TODO: 2.0.0 formalize blob header + // into struct... 
store uncompressed len as well + // so we can optimize rollover by avoiding + // repeated compression & decompression + + // Write header + self.active_writer.write_all(BLOB_HEADER_MAGIC)?; + + // Write checksum + self.active_writer.write_u64::(checksum)?; + + // Write key + + // NOTE: Truncation is okay and actually needed + #[allow(clippy::cast_possible_truncation)] + self.active_writer + .write_u16::(key.len() as u16)?; + self.active_writer.write_all(key)?; + + // Write value + + // NOTE: Truncation is okay and actually needed + #[allow(clippy::cast_possible_truncation)] + self.active_writer + .write_u32::(value.len() as u32)?; + self.active_writer.write_all(&value)?; + + // Header + self.offset += BLOB_HEADER_MAGIC.len() as u64; + + // Checksum + self.offset += std::mem::size_of::() as u64; + + // Key + self.offset += std::mem::size_of::() as u64; + self.offset += key.len() as u64; + + // Value + self.offset += std::mem::size_of::() as u64; + self.offset += value.len() as u64; + + // Update metadata + self.written_blob_bytes += value.len() as u64; + self.item_count += 1; + + // NOTE: Truncation is okay + #[allow(clippy::cast_possible_truncation)] + Ok(value.len() as u32) + } + + pub(crate) fn flush(&mut self) -> crate::Result<()> { + let metadata_ptr = self.active_writer.stream_position()?; + + // Write metadata + let metadata = Metadata { + item_count: self.item_count, + compressed_bytes: self.written_blob_bytes, + total_uncompressed_bytes: self.uncompressed_bytes, + key_range: KeyRange::new(( + self.first_key + .clone() + .expect("should have written at least 1 item"), + self.last_key + .clone() + .expect("should have written at least 1 item"), + )), + }; + metadata.encode_into(&mut self.active_writer)?; + + Trailer { + metadata, + metadata_ptr, + } + .encode_into(&mut self.active_writer)?; + + self.active_writer.flush()?; + self.active_writer.get_mut().sync_all()?; + + Ok(()) + } +} diff --git a/src/vlog/compression.rs b/src/vlog/compression.rs new file mode 100644 index 00000000..84ccbd6f --- /dev/null +++ b/src/vlog/compression.rs @@ -0,0 +1,20 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +/// Generic compression trait +pub trait Compressor { + /// Compresses a value + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn compress(&self, bytes: &[u8]) -> crate::Result>; + + /// Decompresses a value + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn decompress(&self, bytes: &[u8]) -> crate::Result>; +} diff --git a/src/vlog/config.rs b/src/vlog/config.rs new file mode 100644 index 00000000..1277ab78 --- /dev/null +++ b/src/vlog/config.rs @@ -0,0 +1,66 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{vlog::compression::Compressor, Cache, DescriptorTable}; +use std::sync::Arc; + +/// Value log configuration +pub struct Config { + /// Target size of vLog blob files + pub(crate) blob_file_size_bytes: u64, + + /// Blob cache to use + pub(crate) blob_cache: Arc, + + /// File descriptor cache to use + pub(crate) fd_cache: Arc, + + /// Compression to use + pub(crate) compression: Option, +} + +impl Config { + /// Creates a new configuration builder. 
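+    ///
+    /// A construction sketch (`MyCompressor` is a hypothetical [`Compressor`] implementor):
+    /// ```ignore
+    /// let config = Config::<MyCompressor>::new(blob_cache, fd_cache)
+    ///     .blob_file_size_bytes(64 * 1_024 * 1_024);
+    /// ```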
+    pub fn new(blob_cache: Arc<Cache>, fd_cache: Arc<DescriptorTable>) -> Self {
+        Self {
+            blob_cache,
+            fd_cache,
+            compression: None,
+            blob_file_size_bytes: 128 * 1_024 * 1_024,
+        }
+    }
+
+    /// Sets the compression & decompression scheme.
+    #[must_use]
+    pub fn compression(mut self, compressor: Option<C>) -> Self {
+        self.compression = compressor;
+        self
+    }
+
+    /// Sets the blob cache.
+    ///
+    /// You can create a global [`Cache`] and share it between multiple
+    /// value logs to cap global cache memory usage.
+    #[must_use]
+    pub fn blob_cache(mut self, blob_cache: Arc<Cache>) -> Self {
+        self.blob_cache = blob_cache;
+        self
+    }
+
+    /// Sets the maximum size of value log blob files.
+    ///
+    /// This influences space amplification, as
+    /// space reclamation works on a per-file basis.
+    ///
+    /// Larger files result in fewer files on disk and thus fewer file descriptors that may have to be obtained or cached.
+    ///
+    /// Like `blob_file_size` in `RocksDB`.
+    ///
+    /// Default = 128 MiB
+    #[must_use]
+    pub fn blob_file_size_bytes(mut self, bytes: u64) -> Self {
+        self.blob_file_size_bytes = bytes;
+        self
+    }
+}
diff --git a/src/vlog/gc/mod.rs b/src/vlog/gc/mod.rs
new file mode 100644
index 00000000..f6465752
--- /dev/null
+++ b/src/vlog/gc/mod.rs
@@ -0,0 +1,123 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+pub mod report;
+
+use crate::vlog::{BlobFileId, Compressor, ValueLog};
+
+/// GC strategy
+#[allow(clippy::module_name_repetitions)]
+pub trait GcStrategy<C: Compressor> {
+    /// Picks blob files based on a predicate.
+    fn pick(&self, value_log: &ValueLog<C>) -> Vec<BlobFileId>;
+}
+
+/// Picks blob files whose ratio of stale blobs exceeds a given threshold
+pub struct StaleThresholdStrategy(f32);
+
+impl StaleThresholdStrategy {
+    /// Creates a new strategy with the given threshold.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the ratio is invalid.
+    #[must_use]
+    pub fn new(ratio: f32) -> Self {
+        assert!(
+            ratio.is_finite() && ratio.is_sign_positive(),
+            "invalid stale ratio"
+        );
+        Self(ratio.min(1.0))
+    }
+}
+
+impl<C: Compressor> GcStrategy<C> for StaleThresholdStrategy {
+    fn pick(&self, value_log: &ValueLog<C>) -> Vec<BlobFileId> {
+        value_log
+            .manifest
+            .blob_files
+            .read()
+            .expect("lock is poisoned")
+            .values()
+            .filter(|x| x.stale_ratio() > self.0)
+            .map(|x| x.id)
+            .collect::<Vec<_>>()
+    }
+}
+
+/// Tries to find a least-effort selection of blob files to merge to reach a certain space amplification
+pub struct SpaceAmpStrategy(f32);
+
+impl SpaceAmpStrategy {
+    /// Creates a new strategy with the given space amp factor.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the space amp factor is < 1.0.
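+    ///
+    /// For example, `SpaceAmpStrategy::new(1.5)` keeps picking the most-stale
+    /// blob files until the projected space amplification drops to 1.5 or below.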
+    #[must_use]
+    pub fn new(ratio: f32) -> Self {
+        assert!(ratio >= 1.0, "invalid space amp ratio");
+        Self(ratio)
+    }
+}
+
+impl<C: Compressor> GcStrategy<C> for SpaceAmpStrategy {
+    #[allow(clippy::cast_precision_loss, clippy::significant_drop_tightening)]
+    fn pick(&self, value_log: &ValueLog<C>) -> Vec<BlobFileId> {
+        let space_amp_target = self.0;
+        let current_space_amp = value_log.space_amp();
+
+        if current_space_amp < space_amp_target {
+            log::trace!("Space amp is below target {space_amp_target}, nothing to do");
+            vec![]
+        } else {
+            log::debug!("Selecting blob files to GC, space_amp_target={space_amp_target}");
+
+            let lock = value_log
+                .manifest
+                .blob_files
+                .read()
+                .expect("lock is poisoned");
+
+            let mut blob_files = lock
+                .values()
+                .filter(|x| x.stale_ratio() > 0.0)
+                .collect::<Vec<_>>();
+
+            // Sort by stale ratio, descending
+            blob_files.sort_by(|a, b| {
+                b.stale_ratio()
+                    .partial_cmp(&a.stale_ratio())
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            });
+
+            let mut selection = vec![];
+
+            let mut total_bytes = value_log.manifest.total_bytes();
+            let mut stale_bytes = value_log.manifest.stale_bytes();
+
+            for blob_file in blob_files {
+                let blob_file_stale_bytes = blob_file.gc_stats.stale_bytes();
+                stale_bytes -= blob_file_stale_bytes;
+                total_bytes -= blob_file_stale_bytes;
+
+                selection.push(blob_file.id);
+
+                let space_amp_after_gc =
+                    total_bytes as f32 / (total_bytes as f32 - stale_bytes as f32);
+
+                log::debug!(
+                    "Selected blob file #{} for GC: will reduce space amp to {space_amp_after_gc}",
+                    blob_file.id,
+                );
+
+                if space_amp_after_gc <= space_amp_target {
+                    break;
+                }
+            }
+
+            selection
+        }
+    }
+}
diff --git a/src/vlog/gc/report.rs b/src/vlog/gc/report.rs
new file mode 100644
index 00000000..0b8a9b7f
--- /dev/null
+++ b/src/vlog/gc/report.rs
@@ -0,0 +1,78 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use std::path::PathBuf;
+
+/// Statistics report for garbage collection
+#[derive(Debug)]
+#[allow(clippy::module_name_repetitions)]
+pub struct GcReport {
+    /// Path of value log
+    pub path: PathBuf,
+
+    /// Blob file count
+    pub blob_file_count: usize,
+
+    /// Blob files that have 100% stale blobs
+    pub stale_blob_file_count: usize,
+
+    /// Amount of stored bytes
+    pub total_bytes: u64,
+
+    /// Amount of bytes that could be freed
+    pub stale_bytes: u64,
+
+    /// Amount of stored blobs
+    pub total_blobs: u64,
+
+    /// Amount of blobs that could be freed
+    pub stale_blobs: u64,
+}
+
+impl std::fmt::Display for GcReport {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "--- GC report for vLog @ {} ---", self.path.display())?;
+        writeln!(f, "# files    : {}", self.blob_file_count)?;
+        writeln!(f, "# stale    : {}", self.stale_blob_file_count)?;
+        writeln!(f, "Total bytes: {}", self.total_bytes)?;
+        writeln!(f, "Stale bytes: {}", self.stale_bytes)?;
+        writeln!(f, "Total blobs: {}", self.total_blobs)?;
+        writeln!(f, "Stale blobs: {}", self.stale_blobs)?;
+        writeln!(f, "Stale ratio: {}", self.stale_ratio())?;
+        writeln!(f, "Space amp  : {}", self.space_amp())?;
+        writeln!(f, "--- GC report done ---")?;
+        Ok(())
+    }
+}
+
+impl GcReport {
+    /// Calculates the space amplification factor.
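+    ///
+    /// For example, with `total_bytes = 100` and `stale_bytes = 50`, the
+    /// factor is `100 / (100 - 50) = 2.0`.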
+    #[must_use]
+    pub fn space_amp(&self) -> f32 {
+        if self.total_bytes == 0 {
+            return 0.0;
+        }
+
+        let alive_bytes = self.total_bytes - self.stale_bytes;
+        if alive_bytes == 0 {
+            return 0.0;
+        }
+
+        self.total_bytes as f32 / alive_bytes as f32
+    }
+
+    /// Calculates the stale ratio (fraction of stale bytes, 0.0 to 1.0).
+    #[must_use]
+    pub fn stale_ratio(&self) -> f32 {
+        if self.total_bytes == 0 {
+            return 0.0;
+        }
+
+        if self.stale_bytes == 0 {
+            return 0.0;
+        }
+
+        self.stale_bytes as f32 / self.total_bytes as f32
+    }
+}
diff --git a/src/vlog/handle.rs b/src/vlog/handle.rs
new file mode 100644
index 00000000..7d7c0837
--- /dev/null
+++ b/src/vlog/handle.rs
@@ -0,0 +1,44 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::{
+    coding::{Decode, DecodeError, Encode, EncodeError},
+    vlog::BlobFileId,
+};
+use std::{
+    hash::Hash,
+    io::{Read, Write},
+};
+use varint_rs::{VarintReader, VarintWriter};
+
+/// A value handle points into the value log
+#[allow(clippy::module_name_repetitions)]
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct ValueHandle {
+    /// Blob file ID
+    pub blob_file_id: BlobFileId,
+
+    /// Offset in file
+    pub offset: u64,
+}
+
+impl Encode for ValueHandle {
+    fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> {
+        writer.write_u64_varint(self.offset)?;
+        writer.write_u64_varint(self.blob_file_id)?;
+        Ok(())
+    }
+}
+
+impl Decode for ValueHandle {
+    fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError> {
+        let offset = reader.read_u64_varint()?;
+        let blob_file_id = reader.read_u64_varint()?;
+
+        Ok(Self {
+            blob_file_id,
+            offset,
+        })
+    }
+}
diff --git a/src/vlog/index.rs b/src/vlog/index.rs
new file mode 100644
index 00000000..4f3c3c15
--- /dev/null
+++ b/src/vlog/index.rs
@@ -0,0 +1,47 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::vlog::ValueHandle;
+
+/// Trait that allows reading from an external index
+///
+/// An index should point into the value log using [`ValueHandle`].
+#[allow(clippy::module_name_repetitions)]
+pub trait Reader {
+    /// Returns a value handle for a given key.
+    ///
+    /// This method is used to query the index to check for
+    /// stale values when scanning through the value log's blob files.
+    ///
+    /// # Errors
+    ///
+    /// Will return `Err` if an IO error occurs.
+    fn get(&self, key: &[u8]) -> std::io::Result<Option<ValueHandle>>;
+}
+
+/// Trait that allows writing into an external index
+///
+/// The write process should be atomic, meaning that until `finish` is called,
+/// no written value handles should be handed out by the index.
+/// When `finish` fails, no value handles should be written into the index.
+pub trait Writer {
+    /// Inserts a value handle into the index write batch.
+    ///
+    /// # Errors
+    ///
+    /// Will return `Err` if an IO error occurs.
+    fn insert_indirect(
+        &mut self,
+        key: &[u8],
+        vhandle: ValueHandle,
+        size: u32,
+    ) -> std::io::Result<()>;
+
+    /// Finishes the write batch.
+    ///
+    /// # Errors
+    ///
+    /// Will return `Err` if an IO error occurs.
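+    ///
+    /// A sketch of the intended write protocol (`index` is a hypothetical implementor):
+    /// ```ignore
+    /// index.insert_indirect(b"key", vhandle, value_size)?;
+    /// index.finish()?; // handles become visible only now
+    /// ```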
+ fn finish(&mut self) -> std::io::Result<()>; +} diff --git a/src/vlog/manifest.rs b/src/vlog/manifest.rs new file mode 100644 index 00000000..92eca898 --- /dev/null +++ b/src/vlog/manifest.rs @@ -0,0 +1,445 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + file::rewrite_atomic, + vlog::{ + blob_file::{gc_stats::GcStats, meta::Metadata, trailer::Trailer}, + BlobFile, BlobFileId, BlobFileWriter as MultiWriter, Compressor, + }, + HashMap, KeyRange, +}; +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use std::{ + io::Cursor, + marker::PhantomData, + path::{Path, PathBuf}, + sync::{Arc, RwLock}, +}; + +pub const VLOG_MARKER: &str = ".vlog"; +pub const BLOB_FILES_FOLDER: &str = "segments"; // TODO: don't use separate folder, instead rename just .blobs +const MANIFEST_FILE: &str = "vlog_manifest"; + +// TODO: use tree-level manifest to store blob files as well + +#[allow(clippy::module_name_repetitions)] +pub struct ManifestInner { + path: PathBuf, + pub blob_files: RwLock>>>, +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Clone)] +pub struct Manifest(Arc>); + +impl std::ops::Deref for Manifest { + type Target = ManifestInner; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Manifest { + fn remove_unfinished_blob_files>( + folder: P, + registered_ids: &[u64], + ) -> crate::Result<()> { + for dirent in std::fs::read_dir(folder)? { + let dirent = dirent?; + let file_name = dirent.file_name(); + + // https://en.wikipedia.org/wiki/.DS_Store + if file_name == ".DS_Store" { + continue; + } + + // https://en.wikipedia.org/wiki/AppleSingle_and_AppleDouble_formats + if file_name.to_string_lossy().starts_with("._") { + continue; + } + + if dirent.file_type()?.is_file() { + let blob_file_id = dirent + .file_name() + .to_str() + .expect("should be valid utf-8") + .parse::() + .expect("should be valid blob file ID"); + + if !registered_ids.contains(&blob_file_id) { + log::trace!("Deleting unfinished vLog blob file {blob_file_id}"); + std::fs::remove_file(dirent.path())?; + } + } + } + + Ok(()) + } + + /// Parses blob file IDs from manifest file + fn load_ids_from_disk>(path: P) -> crate::Result> { + let path = path.as_ref(); + log::debug!("Loading manifest from {}", path.display()); + + let bytes = std::fs::read(path)?; + + let mut ids = vec![]; + + let mut cursor = Cursor::new(bytes); + + let cnt = cursor.read_u64::()?; + + for _ in 0..cnt { + ids.push(cursor.read_u64::()?); + } + + Ok(ids) + } + + /// Recovers a value log from disk + pub(crate) fn recover>(folder: P) -> crate::Result { + let folder = folder.as_ref(); + let manifest_path = folder.join(MANIFEST_FILE); + + log::info!("Recovering vLog at {}", folder.display()); + + let ids = Self::load_ids_from_disk(&manifest_path)?; + let cnt = ids.len(); + + let progress_mod = match cnt { + _ if cnt <= 20 => 1, + _ if cnt <= 100 => 10, + _ => 100, + }; + + log::debug!( + "Recovering {cnt} vLog blob files from {:?}", + folder.display(), + ); + + let blob_files_folder = folder.join(BLOB_FILES_FOLDER); + Self::remove_unfinished_blob_files(&blob_files_folder, &ids)?; + + let blob_files = { + let mut map = HashMap::with_capacity_and_hasher(100, rustc_hash::FxBuildHasher); + + for (idx, &id) in ids.iter().enumerate() { + log::trace!("Recovering blob file #{id:?}"); + + let path = blob_files_folder.join(id.to_string()); + let trailer = Trailer::from_file(&path)?; + + map.insert( + id, + 
Arc::new(BlobFile {
+                        id,
+                        path,
+                        meta: trailer.metadata,
+                        gc_stats: GcStats::default(),
+                        _phantom: PhantomData,
+                    }),
+                );
+
+                if idx % progress_mod == 0 {
+                    log::debug!("Recovered {idx}/{cnt} vLog blob files");
+                }
+            }
+
+            map
+        };
+
+        if blob_files.len() < ids.len() {
+            return Err(crate::Error::Unrecoverable);
+        }
+
+        Ok(Self(Arc::new(ManifestInner {
+            path: manifest_path,
+            blob_files: RwLock::new(blob_files),
+        })))
+    }
+
+    pub(crate) fn create_new<P: AsRef<Path>>(folder: P) -> crate::Result<Self> {
+        let path = folder.as_ref().join(MANIFEST_FILE);
+
+        let m = Self(Arc::new(ManifestInner {
+            path,
+            blob_files: RwLock::new(HashMap::default()),
+        }));
+        Self::write_to_disk(&m.path, &[])?;
+
+        Ok(m)
+    }
+
+    /// Modifies the blob file manifest atomically.
+    pub(crate) fn atomic_swap<F: FnOnce(&mut HashMap<BlobFileId, Arc<BlobFile<C>>>)>(
+        &self,
+        f: F,
+    ) -> crate::Result<()> {
+        let mut prev_blob_files = self.blob_files.write().expect("lock is poisoned");
+
+        // NOTE: Create a copy of the blob file map we can operate on
+        // without mutating the current manifest.
+        // If persisting to disk fails, the manifest stays unchanged this way.
+        let mut working_copy = prev_blob_files.clone();
+
+        f(&mut working_copy);
+
+        let ids = working_copy.keys().copied().collect::<Vec<_>>();
+
+        Self::write_to_disk(&self.path, &ids)?;
+        *prev_blob_files = working_copy;
+
+        // NOTE: The lock needs to live until here, because
+        // writing to disk needs to be exclusive
+        drop(prev_blob_files);
+
+        log::trace!("Swapped vLog blob file list to: {ids:?}");
+
+        Ok(())
+    }
+
+    /// Drops all blob files.
+    ///
+    /// This does not delete the files from disk, but just un-refs them from the manifest.
+    ///
+    /// Once this function completes, the disk files can be safely removed.
+    pub fn clear(&self) -> crate::Result<()> {
+        self.atomic_swap(|recipe| {
+            recipe.clear();
+        })
+    }
+
+    /// Drops the given blob files.
+    ///
+    /// This does not delete the files from disk, but just un-refs them from the manifest.
+    ///
+    /// Once this function completes, the disk files can be safely removed.
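+    ///
+    /// A sketch (the IDs are illustrative):
+    /// ```ignore
+    /// manifest.drop_blob_files(&[1, 2, 3])?;
+    /// // the corresponding files on disk may now be removed
+    /// ```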
+    pub fn drop_blob_files(&self, ids: &[u64]) -> crate::Result<()> {
+        self.atomic_swap(|recipe| {
+            recipe.retain(|x, _| !ids.contains(x));
+        })
+    }
+
+    pub fn register(&self, writer: MultiWriter<C>) -> crate::Result<()> {
+        let writers = writer.finish()?;
+
+        self.atomic_swap(move |recipe| {
+            for writer in writers {
+                if writer.item_count == 0 {
+                    log::debug!(
+                        "Writer at {} has written no data, deleting empty vLog blob file",
+                        writer.path.display(),
+                    );
+                    if let Err(e) = std::fs::remove_file(&writer.path) {
+                        log::warn!(
+                            "Could not delete empty vLog blob file at {}: {e:?}",
+                            writer.path.display(),
+                        );
+                    }
+                    continue;
+                }
+
+                let blob_file_id = writer.blob_file_id;
+
+                recipe.insert(
+                    blob_file_id,
+                    Arc::new(BlobFile {
+                        id: blob_file_id,
+                        path: writer.path,
+                        meta: Metadata {
+                            item_count: writer.item_count,
+                            compressed_bytes: writer.written_blob_bytes,
+                            total_uncompressed_bytes: writer.uncompressed_bytes,
+
+                            // NOTE: We are checking for 0 items above,
+                            // so first and last key need to exist
+                            #[allow(clippy::expect_used)]
+                            key_range: KeyRange::new((
+                                writer
+                                    .first_key
+                                    .clone()
+                                    .expect("should have written at least 1 item"),
+                                writer
+                                    .last_key
+                                    .clone()
+                                    .expect("should have written at least 1 item"),
+                            )),
+                        },
+                        gc_stats: GcStats::default(),
+                        _phantom: PhantomData,
+                    }),
+                );
+
+                log::debug!(
+                    "Created blob file #{blob_file_id:?} ({} items, {} userdata bytes)",
+                    writer.item_count,
+                    writer.uncompressed_bytes,
+                );
+            }
+        })?;
+
+        // NOTE: If we crash before finishing the index write, it's fine,
+        // because all new blob files will be unreferenced and can thus be dropped as stale
+
+        Ok(())
+    }
+
+    fn write_to_disk<P: AsRef<Path>>(path: P, blob_file_ids: &[BlobFileId]) -> crate::Result<()> {
+        let path = path.as_ref();
+        log::trace!("Writing blob files manifest to {}", path.display());
+
+        let mut bytes = Vec::new();
+
+        let cnt = blob_file_ids.len() as u64;
+        bytes.write_u64::<BigEndian>(cnt)?;
+
+        for id in blob_file_ids {
+            bytes.write_u64::<BigEndian>(*id)?;
+        }
+
+        rewrite_atomic(path, &bytes)?;
+
+        Ok(())
+    }
+
+    /// Gets a blob file.
+    #[must_use]
+    pub fn get_blob_file(&self, id: BlobFileId) -> Option<Arc<BlobFile<C>>> {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .get(&id)
+            .cloned()
+    }
+
+    /// Lists all blob file IDs.
+    #[doc(hidden)]
+    #[must_use]
+    pub fn list_blob_file_ids(&self) -> Vec<BlobFileId> {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .keys()
+            .copied()
+            .collect()
+    }
+
+    /// Lists all blob files.
+    #[must_use]
+    pub fn list_blob_files(&self) -> Vec<Arc<BlobFile<C>>> {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .values()
+            .cloned()
+            .collect()
+    }
+
+    /// Returns the number of blob files.
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.blob_files.read().expect("lock is poisoned").len()
+    }
+
+    /// Returns the amount of bytes on disk that are occupied by blobs.
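+    ///
+    /// This sums the compressed size of all blob files; the fixed-size trailers are not included.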
+    #[must_use]
+    pub fn disk_space_used(&self) -> u64 {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .values()
+            .map(|x| x.meta.compressed_bytes)
+            .sum::<u64>()
+    }
+
+    /// Returns the total amount of bytes (uncompressed) stored in the value log
+    #[must_use]
+    pub fn total_bytes(&self) -> u64 {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .values()
+            .map(|x| x.meta.total_uncompressed_bytes)
+            .sum::<u64>()
+    }
+
+    /// Returns the amount of stale bytes
+    #[must_use]
+    pub fn stale_bytes(&self) -> u64 {
+        self.blob_files
+            .read()
+            .expect("lock is poisoned")
+            .values()
+            .map(|x| x.gc_stats.stale_bytes())
+            .sum::<u64>()
+    }
+
+    /// Returns the ratio of stale bytes (uncompressed) in the value log
+    #[must_use]
+    #[allow(clippy::cast_precision_loss)]
+    pub fn stale_ratio(&self) -> f32 {
+        let total_bytes = self.total_bytes();
+        if total_bytes == 0 {
+            return 0.0;
+        }
+
+        let stale_bytes = self.stale_bytes();
+
+        if stale_bytes == 0 {
+            return 0.0;
+        }
+
+        stale_bytes as f32 / total_bytes as f32
+    }
+
+    /// Returns the approximate space amplification.
+    ///
+    /// Returns 0.0 if there are no items or the entire value log is stale.
+    #[must_use]
+    #[allow(clippy::cast_precision_loss)]
+    pub fn space_amp(&self) -> f32 {
+        let total_bytes = self.total_bytes();
+        if total_bytes == 0 {
+            return 0.0;
+        }
+
+        let stale_bytes = self.stale_bytes();
+
+        let alive_bytes = total_bytes - stale_bytes;
+        if alive_bytes == 0 {
+            return 0.0;
+        }
+
+        total_bytes as f32 / alive_bytes as f32
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs::File;
+    use std::io::Write;
+    use test_log::test;
+
+    #[test]
+    fn test_atomic_rewrite() -> crate::Result<()> {
+        let dir = tempfile::tempdir()?;
+
+        let path = dir.path().join("test.txt");
+        {
+            let mut file = File::create(&path)?;
+            write!(file, "asdasdasdasdasd")?;
+        }
+
+        rewrite_atomic(&path, b"newcontent")?;
+
+        let content = std::fs::read_to_string(&path)?;
+        assert_eq!("newcontent", content);
+
+        Ok(())
+    }
+}
diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs
new file mode 100644
index 00000000..165c97dd
--- /dev/null
+++ b/src/vlog/mod.rs
@@ -0,0 +1,33 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+mod blob_file;
+mod compression; // TODO: remove
+mod config;
+mod gc;
+mod handle;
+mod index;
+mod manifest;
+
+#[doc(hidden)]
+pub mod scanner;
+
+mod value_log;
+
+pub use {
+    blob_file::multi_writer::MultiWriter as BlobFileWriter,
+    compression::Compressor,
+    config::Config,
+    gc::report::GcReport,
+    gc::{GcStrategy, SpaceAmpStrategy, StaleThresholdStrategy},
+    handle::ValueHandle,
+    index::{Reader as IndexReader, Writer as IndexWriter},
+    value_log::{ValueLog, ValueLogId},
+};
+
+#[doc(hidden)]
+pub use blob_file::{reader::Reader as BlobFileReader, BlobFile};
+
+/// The unique identifier for a value log blob file.
+pub type BlobFileId = u64; diff --git a/src/vlog/scanner.rs b/src/vlog/scanner.rs new file mode 100644 index 00000000..71c03c66 --- /dev/null +++ b/src/vlog/scanner.rs @@ -0,0 +1,66 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::vlog::{BlobFileId, ValueHandle}; +use std::{collections::BTreeMap, sync::MutexGuard}; + +#[derive(Debug, Default)] +pub struct BlobFileCounter { + pub size: u64, + pub item_count: u64, +} + +pub type SizeMap = BTreeMap; + +/// Scans a value log, building a size map for the GC report +pub struct Scanner<'a, I: Iterator>> { + iter: I, + + #[allow(unused)] + lock_guard: MutexGuard<'a, ()>, + + size_map: SizeMap, +} + +impl<'a, I: Iterator>> Scanner<'a, I> { + pub fn new(iter: I, lock_guard: MutexGuard<'a, ()>, ids: &[BlobFileId]) -> Self { + let mut size_map = BTreeMap::default(); + + for &id in ids { + size_map.insert(id, BlobFileCounter::default()); + } + + Self { + iter, + lock_guard, + size_map, + } + } + + pub fn finish(self) -> SizeMap { + self.size_map + } + + pub fn scan(&mut self) -> crate::Result<()> { + for vhandle in self.iter.by_ref() { + let (vhandle, size) = vhandle + .map_err(|_| crate::Error::Io(std::io::Error::other("Index returned error")))?; + + let size = u64::from(size); + + self.size_map + .entry(vhandle.blob_file_id) + .and_modify(|x| { + x.item_count += 1; + x.size += size; + }) + .or_insert_with(|| BlobFileCounter { + size, + item_count: 1, + }); + } + + Ok(()) + } +} diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs new file mode 100644 index 00000000..a4e9769a --- /dev/null +++ b/src/vlog/value_log.rs @@ -0,0 +1,669 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + vlog::{ + blob_file::merge::MergeReader, + gc::report::GcReport, + index::Writer as IndexWriter, + manifest::{Manifest, BLOB_FILES_FOLDER, VLOG_MARKER}, + scanner::{Scanner, SizeMap}, + BlobFileId, BlobFileWriter, Compressor, Config, GcStrategy, IndexReader, ValueHandle, + }, + Cache, DescriptorTable, UserValue, +}; +use std::{ + path::{Path, PathBuf}, + sync::{atomic::AtomicU64, Arc, Mutex}, +}; + +// TODO: use other counter struct +#[allow(clippy::module_name_repetitions)] +#[derive(Clone, Default)] +pub struct IdGenerator(Arc); + +impl std::ops::Deref for IdGenerator { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl IdGenerator { + pub fn new(start: u64) -> Self { + Self(Arc::new(AtomicU64::new(start))) + } + + pub fn next(&self) -> BlobFileId { + self.fetch_add(1, std::sync::atomic::Ordering::SeqCst) + } +} + +/// Unique value log ID +#[allow(clippy::module_name_repetitions)] +pub type ValueLogId = u64; + +/// Hands out a unique (monotonically increasing) value log ID. 
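+///
+/// Uses a process-wide atomic counter, so IDs are only unique within one process.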
+pub fn get_next_vlog_id() -> ValueLogId { + static VLOG_ID_COUNTER: AtomicU64 = AtomicU64::new(0); + VLOG_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) +} + +fn unlink_blob_files(base_path: &Path, ids: &[BlobFileId]) { + for id in ids { + let path = base_path.join(BLOB_FILES_FOLDER).join(id.to_string()); + + if let Err(e) = std::fs::remove_file(&path) { + log::error!("Could not free blob file at {path:?}: {e:?}"); + } + } +} + +/// A disk-resident value log +#[derive(Clone)] +pub struct ValueLog(Arc>); + +impl std::ops::Deref for ValueLog { + type Target = ValueLogInner; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct ValueLogInner { + /// Unique value log ID + id: u64, + + /// Base folder + pub path: PathBuf, + + /// Value log configuration + config: Config, + + /// In-memory blob cache + blob_cache: Arc, + + /// In-memory FD cache + fd_cache: Arc, + + /// Blob files manifest + #[doc(hidden)] + pub manifest: Manifest, + + /// Generator to get next blob file ID + id_generator: IdGenerator, + + /// Guards the rollover (compaction) process to only + /// allow one to happen at a time + #[doc(hidden)] + pub rollover_guard: Mutex<()>, +} + +impl ValueLog { + /// Creates or recovers a value log in the given directory. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn open>( + path: P, // TODO: move path into config? + config: Config, + ) -> crate::Result { + let path = path.into(); + + if path.join(VLOG_MARKER).try_exists()? { + Self::recover(path, config) + } else { + Self::create_new(path, config) + } + } + + /* /// Prints fragmentation histogram. + pub fn print_fragmentation_histogram(&self) { + let lock = self.manifest.blob_files.read().expect("lock is poisoned"); + + for (id, blob_file) in &*lock { + let stale_ratio = blob_file.stale_ratio(); + + let progress = (stale_ratio * 10.0) as usize; + let void = 10 - progress; + + let progress = "=".repeat(progress); + let void = " ".repeat(void); + + println!( + "{id:0>4} [{progress}{void}] {}%", + (stale_ratio * 100.0) as usize + ); + } + } */ + + #[doc(hidden)] + pub fn verify(&self) -> crate::Result { + let _lock = self.rollover_guard.lock().expect("lock is poisoned"); + + let mut sum = 0; + + for item in self.get_reader()? { + let (k, v, _, expected_checksum) = item?; + + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + hasher.update(&k); + hasher.update(&v); + + if hasher.digest() != expected_checksum { + sum += 1; + } + } + + Ok(sum) + } + + /// Creates a new empty value log in a directory. 
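+    ///
+    /// Sets up the directory structure (including the blob file folder) and
+    /// syncs the directories on Unix, so the value log is durably initialized.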
+ pub(crate) fn create_new>(path: P, config: Config) -> crate::Result { + let path = path.into(); + + let path = crate::path::absolute_path(&path); + log::trace!("Creating value-log at {}", path.display()); + + std::fs::create_dir_all(&path)?; + + let marker_path = path.join(VLOG_MARKER); + assert!(!marker_path.try_exists()?); + + std::fs::create_dir_all(path.join(BLOB_FILES_FOLDER))?; + + // NOTE: Lastly, fsync .vlog marker, which contains the version + // -> the V-log is fully initialized + + // let mut file = std::fs::File::create(marker_path)?; + // FormatVersion::V3.write_file_header(&mut file)?; + // file.sync_all()?; + + #[cfg(not(target_os = "windows"))] + { + // fsync folders on Unix + + let folder = std::fs::File::open(path.join(BLOB_FILES_FOLDER))?; + folder.sync_all()?; + + let folder = std::fs::File::open(&path)?; + folder.sync_all()?; + } + + let blob_cache = config.blob_cache.clone(); + let fd_cache = config.fd_cache.clone(); + let manifest = Manifest::create_new(&path)?; + + Ok(Self(Arc::new(ValueLogInner { + id: get_next_vlog_id(), + config, + path, + blob_cache, + fd_cache, + manifest, + id_generator: IdGenerator::default(), + rollover_guard: Mutex::new(()), + }))) + } + + pub(crate) fn recover>(path: P, config: Config) -> crate::Result { + let path = path.into(); + log::info!("Recovering vLog at {}", path.display()); + + // { + // let bytes = std::fs::read(path.join(VLOG_MARKER))?; + + // if let Some(version) = Version::parse_file_header(&bytes) { + // if version != Version::V1 { + // return Err(crate::Error::InvalidVersion(Some(version))); + // } + // } else { + // return Err(crate::Error::InvalidVersion(None)); + // } + // } + + let blob_cache = config.blob_cache.clone(); + let fd_cache = config.fd_cache.clone(); + let manifest = Manifest::recover(&path)?; + + let highest_id = manifest + .blob_files + .read() + .expect("lock is poisoned") + .values() + .map(|x| x.id) + .max() + .unwrap_or_default(); + + Ok(Self(Arc::new(ValueLogInner { + id: get_next_vlog_id(), + config, + path, + blob_cache, + fd_cache, + manifest, + id_generator: IdGenerator::new(highest_id + 1), + rollover_guard: Mutex::new(()), + }))) + } + + /// Registers a [`BlobFileWriter`]. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn register_writer(&self, writer: BlobFileWriter) -> crate::Result<()> { + let _lock = self.rollover_guard.lock().expect("lock is poisoned"); + self.manifest.register(writer)?; + Ok(()) + } + + /// Returns the number of blob files in the value log. + #[must_use] + pub fn blob_file_count(&self) -> usize { + self.manifest.len() + } + + /// Resolves a value handle. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn get(&self, vhandle: &ValueHandle) -> crate::Result> { + self.get_with_prefetch(vhandle, 0) + } + + /// Resolves a value handle, and prefetches some values after it. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
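+    ///
+    /// A usage sketch (the prefetch count is illustrative):
+    /// ```ignore
+    /// let value = value_log.get_with_prefetch(&vhandle, 8)?;
+    /// ```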
+ pub fn get_with_prefetch( + &self, + vhandle: &ValueHandle, + prefetch_size: usize, + ) -> crate::Result> { + // TODO:, first rewrite blob files to use pread + Ok(None) + + // if let Some(value) = self.blob_cache.get(self.id, vhandle) { + // return Ok(Some(value)); + // } + + // let Some(blob_file) = self.manifest.get_blob_file(vhandle.blob_file_id) else { + // return Ok(None); + // }; + + // // TODO: get FD from cache or open and insert + // // let mut reader = match self + // // .fd_cache + // // .access_for_blob_file(&GlobalSegmentId::from((self.id, vhandle.blob_file_id))) + // // { + // // Some(fd) => fd, + // // None => BufReader::new(File::open(&blob_file.path)?), + // // }; + + // let mut reader = BlobFileReader::with_reader(vhandle.blob_file_id, reader) + // .use_compression(self.config.compression.clone()); + + // let Some(item) = reader.next() else { + // return Ok(None); + // }; + // let (_key, val, _checksum) = item?; + + // self.blob_cache.insert(self.id, vhandle, val.clone()); + + // // TODO: maybe we can look at the value size and prefetch some more values + // // without causing another I/O... + // // TODO: benchmark range reads for rather small non-inlined blobs (maybe ~512-1000B) + // // and see how different BufReader capacities and prefetch changes range read performance + // for _ in 0..prefetch_size { + // let offset = reader.get_offset()?; + + // let Some(item) = reader.next() else { + // break; + // }; + // let (_key, val, _checksum) = item?; + + // let value_handle = ValueHandle { + // blob_file_id: vhandle.blob_file_id, + // offset, + // }; + + // self.blob_cache.insert(self.id, &value_handle, val); + // } + + // Ok(Some(val)) + } + + fn get_writer_raw(&self) -> crate::Result> { + BlobFileWriter::new( + self.id_generator.clone(), + self.config.blob_file_size_bytes, + self.path.join(BLOB_FILES_FOLDER), + ) + .map_err(Into::into) + } + + /// Initializes a new blob file writer. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn get_writer(&self) -> crate::Result> { + self.get_writer_raw() + .map(|x| x.use_compression(self.config.compression.clone())) + } + + /// Drops stale blob files. + /// + /// Returns the amount of disk space (compressed data) freed. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + pub fn drop_stale_blob_files(&self) -> crate::Result { + // IMPORTANT: Only allow 1 rollover or GC at any given time + let _guard = self.rollover_guard.lock().expect("lock is poisoned"); + + let blob_files = self + .manifest + .blob_files + .read() + .expect("lock is poisoned") + .values() + .filter(|x| x.is_stale()) + .cloned() + .collect::>(); + + let bytes_freed = blob_files.iter().map(|x| x.meta.compressed_bytes).sum(); + + let ids = blob_files.iter().map(|x| x.id).collect::>(); + + if ids.is_empty() { + log::trace!("No blob files to drop"); + } else { + log::info!("Dropping stale blob files: {ids:?}"); + self.manifest.drop_blob_files(&ids)?; + + for blob_file in blob_files { + std::fs::remove_file(&blob_file.path)?; + } + } + + Ok(bytes_freed) + } + + /// Marks some blob files as stale. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
+ fn mark_as_stale(&self, ids: &[BlobFileId]) { + // NOTE: Read-locking is fine because we are dealing with an atomic bool + #[allow(clippy::significant_drop_tightening)] + let blob_files = self.manifest.blob_files.read().expect("lock is poisoned"); + + for id in ids { + let Some(blob_file) = blob_files.get(id) else { + continue; + }; + + blob_file.mark_as_stale(); + } + } + + // TODO: remove? + /// Returns the approximate space amplification. + /// + /// Returns 0.0 if there are no items. + #[must_use] + pub fn space_amp(&self) -> f32 { + self.manifest.space_amp() + } + + #[doc(hidden)] + #[allow(clippy::cast_precision_loss)] + #[must_use] + pub fn consume_scan_result(&self, size_map: &SizeMap) -> GcReport { + let mut report = GcReport { + path: self.path.clone(), + blob_file_count: self.blob_file_count(), + stale_blob_file_count: 0, + stale_bytes: 0, + total_bytes: 0, + stale_blobs: 0, + total_blobs: 0, + }; + + for (&id, counter) in size_map { + let blob_file = self + .manifest + .get_blob_file(id) + .expect("blob file should exist"); + + let total_bytes = blob_file.meta.total_uncompressed_bytes; + let total_items = blob_file.meta.item_count; + + report.total_bytes += total_bytes; + report.total_blobs += total_items; + + if counter.item_count > 0 { + let used_size = counter.size; + let alive_item_count = counter.item_count; + + let blob_file = self + .manifest + .get_blob_file(id) + .expect("blob file should exist"); + + let stale_bytes = total_bytes - used_size; + let stale_items = total_items - alive_item_count; + + blob_file.gc_stats.set_stale_bytes(stale_bytes); + blob_file.gc_stats.set_stale_items(stale_items); + + report.stale_bytes += stale_bytes; + report.stale_blobs += stale_items; + } else { + log::debug!( + "Blob file #{id} has no incoming references - can be dropped, freeing {} KiB on disk (userdata={} MiB)", + blob_file.meta.compressed_bytes / 1_024, + total_bytes / 1_024 / 1_024, + ); + self.mark_as_stale(&[id]); + + report.stale_blob_file_count += 1; + report.stale_bytes += total_bytes; + report.stale_blobs += total_items; + } + } + + report + } + + /// Scans the given index and collects GC statistics. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[allow(clippy::significant_drop_tightening)] + pub fn scan_for_stats( + &self, + iter: impl Iterator>, + ) -> crate::Result { + let lock_guard = self.rollover_guard.lock().expect("lock is poisoned"); + + let ids = self.manifest.list_blob_file_ids(); + + let mut scanner = Scanner::new(iter, lock_guard, &ids); + scanner.scan()?; + let size_map = scanner.finish(); + let report = self.consume_scan_result(&size_map); + + Ok(report) + } + + #[doc(hidden)] + pub fn get_reader(&self) -> crate::Result> { + let readers = self + .manifest + .blob_files + .read() + .expect("lock is poisoned") + .values() + .map(|x| x.scan()) + .collect::>>()?; + + Ok(MergeReader::new(readers)) + } + + /// Returns the amount of disk space (compressed data) freed. + #[doc(hidden)] + pub fn major_compact( + &self, + index_reader: &R, + index_writer: W, + ) -> crate::Result { + let ids = self.manifest.list_blob_file_ids(); + self.rollover(&ids, index_reader, index_writer) + } + + /// Applies a GC strategy. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
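+    ///
+    /// A sketch (`index_reader`/`index_writer` are hypothetical index types):
+    /// ```ignore
+    /// let strategy = StaleThresholdStrategy::new(0.5);
+    /// let bytes_freed = value_log.apply_gc_strategy(&strategy, &index_reader, index_writer)?;
+    /// ```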
+ pub fn apply_gc_strategy( + &self, + strategy: &impl GcStrategy, + index_reader: &R, + index_writer: W, + ) -> crate::Result { + let blob_file_ids = strategy.pick(self); + self.rollover(&blob_file_ids, index_reader, index_writer) + } + + /// Atomically removes all data from the value log. + /// + /// If `prune_async` is set to `true`, the blob files will be removed from disk in a thread to avoid blocking. + pub fn clear(&self, prune_async: bool) -> crate::Result<()> { + let guard = self.rollover_guard.lock().expect("lock is poisoned"); + let ids = self.manifest.list_blob_file_ids(); + self.manifest.clear()?; + drop(guard); + + if prune_async { + let path = self.path.clone(); + + std::thread::spawn(move || { + log::trace!("Pruning dropped blob files in thread: {ids:?}"); + unlink_blob_files(&path, &ids); + log::trace!("Successfully pruned all blob files"); + }); + } else { + log::trace!("Pruning dropped blob files: {ids:?}"); + unlink_blob_files(&self.path, &ids); + log::trace!("Successfully pruned all blob files"); + } + + Ok(()) + } + + /// Rewrites some blob files into new blob files, blocking the caller + /// until the operation is completely done. + /// + /// Returns the amount of disk space (compressed data) freed. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + pub fn rollover( + &self, + ids: &[u64], + index_reader: &R, + mut index_writer: W, + ) -> crate::Result { + if ids.is_empty() { + return Ok(0); + } + + // IMPORTANT: Only allow 1 rollover or GC at any given time + let _guard = self.rollover_guard.lock().expect("lock is poisoned"); + + let size_before = self.manifest.disk_space_used(); + + log::info!("Rollover blob files {ids:?}"); + + let blob_files = ids + .iter() + .map(|&x| self.manifest.get_blob_file(x)) + .collect::>>(); + + let Some(blob_files) = blob_files else { + return Ok(0); + }; + + let readers = blob_files + .into_iter() + .map(|x| x.scan()) + .collect::>>()?; + + // TODO: 3.0.0: Store uncompressed size per blob + // so we can avoid recompression costs during GC + // but have stats be correct + + let reader = MergeReader::new( + readers + .into_iter() + .map(|x| x.use_compression(self.config.compression.clone())) + .collect(), + ); + + let mut writer = self + .get_writer_raw()? + .use_compression(self.config.compression.clone()); + + for item in reader { + let (k, v, blob_file_id, _) = item?; + + match index_reader.get(&k)? 
{ + // If this value is in an older blob file, we can discard it + Some(vhandle) if blob_file_id < vhandle.blob_file_id => continue, + None => continue, + _ => {} + } + + let vhandle = writer.get_next_value_handle(); + + // NOTE: Truncation is OK because we know values are u32 max + #[allow(clippy::cast_possible_truncation)] + index_writer.insert_indirect(&k, vhandle, v.len() as u32)?; + + writer.write(&k, &v)?; + } + + // IMPORTANT: New blob files need to be persisted before adding to index + // to avoid dangling pointers + self.manifest.register(writer)?; + + // NOTE: If we crash here, it's fine, the blob files are registered + // but never referenced, so they can just be dropped after recovery + index_writer.finish()?; + + // IMPORTANT: We only mark the blob files as definitely stale + // The external index needs to decide when it is safe to drop + // the old blob files, as some reads may still be performed + self.mark_as_stale(ids); + + let size_after = self.manifest.disk_space_used(); + + Ok(size_before.saturating_sub(size_after)) + } +} diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index 3ff20d5a..115b5d7c 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -1,8 +1,9 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use std::time::Duration; use test_log::test; #[test] +#[ignore = "restore"] fn blob_drop_after_flush() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -35,7 +36,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { assert_eq!( "neptune".repeat(10_000).as_bytes(), - &*tree.get("a", None)?.unwrap(), + &*tree.get("a", SeqNo::MAX)?.unwrap(), ); let report = gc_report.join().unwrap()?; diff --git a/tests/blob_gc.rs b/tests/blob_gc.rs index 18f15d3f..9a647420 100644 --- a/tests/blob_gc.rs +++ b/tests/blob_gc.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -15,7 +15,7 @@ fn blob_gc_1() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; @@ -36,10 +36,10 @@ fn blob_gc_1() -> lsm_tree::Result<()> { tree.gc_drop_stale()?; - assert_eq!(&*tree.get("a", None)?.unwrap(), b"a"); - assert_eq!(&*tree.get("b", None)?.unwrap(), b"b"); - assert_eq!(&*tree.get("c", None)?.unwrap(), b"c"); - assert_eq!(0, tree.blobs.segment_count()); + assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a"); + assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b"); + assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), b"c"); + assert_eq!(0, tree.blobs.blob_file_count()); assert_eq!(0.0, tree.blobs.space_amp()); Ok(()) @@ -59,7 +59,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; assert_eq!(1.0, tree.blobs.space_amp()); @@ -75,13 +75,13 @@ fn blob_gc_2() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(&*tree.get("a", None)?.unwrap(), b"a"); - assert_eq!(&*tree.get("b", None)?.unwrap(), b"b"); + assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a"); + 
assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b"); assert_eq!( - &*tree.get("c", None)?.unwrap(), + &*tree.get("c", SeqNo::MAX)?.unwrap(), "neptune".repeat(10_000).as_bytes() ); - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); assert_eq!(1.0, tree.blobs.space_amp()); tree.insert("c", "c", seqno.next()); @@ -90,7 +90,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blobs.segment_count()); + assert_eq!(0, tree.blobs.blob_file_count()); Ok(()) } @@ -109,7 +109,7 @@ fn blob_gc_3() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; assert_eq!(1.0, tree.blobs.space_amp()); @@ -126,23 +126,23 @@ fn blob_gc_3() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert!(tree.get("a", None)?.is_none()); - assert!(tree.get("b", None)?.is_none()); + assert!(tree.get("a", SeqNo::MAX)?.is_none()); + assert!(tree.get("b", SeqNo::MAX)?.is_none()); assert_eq!( - &*tree.get("c", None)?.unwrap(), + &*tree.get("c", SeqNo::MAX)?.unwrap(), "neptune".repeat(10_000).as_bytes() ); - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); assert_eq!(1.0, tree.blobs.space_amp()); tree.remove("c", seqno.next()); - assert!(tree.get("c", None)?.is_none()); + assert!(tree.get("c", SeqNo::MAX)?.is_none()); tree.gc_scan_stats(seqno.get(), 1_000)?; let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blobs.segment_count()); + assert_eq!(0, tree.blobs.blob_file_count()); Ok(()) } diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index 346b484a..fc09617a 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -12,21 +12,35 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let seqno = SequenceNumberCounter::default(); tree.insert("a", "neptune".repeat(10_000), seqno.next()); - let snapshot = tree.snapshot(seqno.get()); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"neptune".repeat(10_000)); + + // TODO: test snapshot reads + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", SeqNo::MAX)?.unwrap(), + b"neptune".repeat(10_000) + ); tree.insert("a", "neptune2".repeat(10_000), seqno.next()); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"neptune2".repeat(10_000)); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", SeqNo::MAX)?.unwrap(), + b"neptune2".repeat(10_000) + ); tree.insert("a", "neptune3".repeat(10_000), seqno.next()); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"neptune3".repeat(10_000)); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + 
&*tree.get("a", SeqNo::MAX)?.unwrap(), + b"neptune3".repeat(10_000) + ); tree.flush_active_memtable(0)?; - assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"neptune3".repeat(10_000)); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", SeqNo::MAX)?.unwrap(), + b"neptune3".repeat(10_000) + ); let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; assert_eq!(2, report.stale_blobs); @@ -41,8 +55,11 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; assert_eq!(2, report.stale_blobs); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"neptune3".repeat(10_000)); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", SeqNo::MAX)?.unwrap(), + b"neptune3".repeat(10_000) + ); Ok(()) } diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index d281fe7b..16a7540a 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -12,13 +12,13 @@ fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { tree.insert("a", "a".repeat(1_023), 0); tree.flush_active_memtable(0)?; - assert_eq!(tree.blobs.segment_count(), 0); + assert_eq!(tree.blobs.blob_file_count(), 0); tree.insert("b", "b".repeat(1_024), 0); tree.flush_active_memtable(0)?; - assert_eq!(tree.blobs.segment_count(), 1); + assert_eq!(tree.blobs.blob_file_count(), 1); - assert_eq!(2, tree.len(None, None)?); + assert_eq!(2, tree.len(SeqNo::MAX, None)?); Ok(()) } diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index ecf317ff..54c0b1ed 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -13,39 +13,39 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { { let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; - assert!(tree.get("big", None)?.is_none()); + assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); tree.insert("smol", "small value", 0); - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, big_value); tree.flush_active_memtable(0)?; - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, big_value); - let value = tree.get("smol", None)?.expect("should exist"); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"small value"); tree.insert("big", &new_big_value, 1); - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); tree.flush_active_memtable(0)?; - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); } { let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; - let value = tree.get("smol", None)?.expect("should exist"); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"small value"); - let value = tree.get("big", None)?.expect("should exist"); + let value = 
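+        // NOTE: The memtable was flushed and the tree reopened above, so
+        // this read has to resolve the large value through the recovered
+        // index and blob file rather than from memory.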
tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); } diff --git a/tests/blob_tombstone.rs b/tests/blob_tombstone.rs index c8bf1944..6f767808 100644 --- a/tests/blob_tombstone.rs +++ b/tests/blob_tombstone.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -14,30 +14,30 @@ fn blob_tree_tombstone() -> lsm_tree::Result<()> { tree.insert("a", &big_value, 0); tree.insert("b", &big_value, 0); tree.insert("c", &big_value, 0); - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(0)?; - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); tree.remove("b", 1); - assert_eq!(2, tree.len(None, None)?); + assert_eq!(2, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(0)?; - assert_eq!(2, tree.len(None, None)?); + assert_eq!(2, tree.len(SeqNo::MAX, None)?); - assert_eq!(&*tree.get("a", None)?.unwrap(), big_value); - assert!(tree.get("b", None)?.is_none()); - assert_eq!(&*tree.get("c", None)?.unwrap(), big_value); + assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), big_value); + assert!(tree.get("b", SeqNo::MAX)?.is_none()); + assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), big_value); tree.gc_scan_stats(2, 0)?; let strategy = lsm_tree::gc::StaleThresholdStrategy::new(0.01); tree.apply_gc_strategy(&strategy, 2)?; - assert_eq!(2, tree.len(None, None)?); + assert_eq!(2, tree.len(SeqNo::MAX, None)?); - assert_eq!(&*tree.get("a", None)?.unwrap(), big_value); - assert!(tree.get("b", None)?.is_none()); - assert_eq!(&*tree.get("c", None)?.unwrap(), big_value); + assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), big_value); + assert!(tree.get("b", SeqNo::MAX)?.is_none()); + assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), big_value); Ok(()) } diff --git a/tests/blob_tree_flush.rs b/tests/blob_tree_flush.rs index 1bc3a4a6..c47bb757 100644 --- a/tests/blob_tree_flush.rs +++ b/tests/blob_tree_flush.rs @@ -21,13 +21,13 @@ fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); tree.gc_scan_stats(seqno.get(), 1_000)?; assert_eq!(1.0, tree.blobs.space_amp()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.segment_count()); + assert_eq!(1, tree.blobs.blob_file_count()); tree.gc_scan_stats(seqno.get(), 1_000)?; assert_eq!(1.0, tree.blobs.space_amp()); diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index b4423965..1cf46d45 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter, TreeType}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, SequenceNumberCounter, TreeType}; use test_log::test; const ITEM_COUNT: usize = 10_000; @@ -11,18 +11,42 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); - assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), + 0 + ); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .map(|x| x.key()) + .rev() + .flatten() + .count(), + 0 + ); assert_eq!(tree.tree_type(), 
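+    // NOTE: Iterators now yield guards instead of `Result<(UserKey, UserValue)>`
+    // tuples: `.key()` returns a `Result` with just the key, and `flat_map`
+    // keeps the `Ok` values, which is why the old `.flatten().count()` calls
+    // above became `.flat_map(|x| x.key()).count()`.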
TreeType::Blob); } { let tree = Config::new(&folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); - assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), + 0 + ); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .map(|x| x.key()) + .rev() + .flatten() + .count(), + 0 + ); assert_eq!(tree.tree_type(), TreeType::Blob); tree.flush_active_memtable(0)?; @@ -31,9 +55,21 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); - assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), + 0 + ); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .map(|x| x.key()) + .rev() + .flatten() + .count(), + 0 + ); assert_eq!(tree.tree_type(), TreeType::Blob); } @@ -64,10 +100,18 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { tree.insert(key, value.as_bytes(), seqno.next()); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(tree.iter(None, None).flatten().count(), ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.iter(None, None).rev().flatten().count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), + ITEM_COUNT * 2 + ); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT * 2 ); @@ -77,10 +121,18 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(tree.iter(None, None).flatten().count(), ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), + ITEM_COUNT * 2 + ); assert_eq!( - tree.iter(None, None).rev().flatten().count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT * 2 ); } diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index 3e6ead60..f277bc51 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use std::sync::Arc; use test_log::test; @@ -16,7 +16,7 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { tree.insert("b".as_bytes(), "abc", seqno.next()); tree.insert("c".as_bytes(), "abc", seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); tree.compact(Arc::new(lsm_tree::compaction::PullDown(0, 2)), 0)?; @@ -24,13 +24,13 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { tree.insert("e".as_bytes(), "abc", seqno.next()); tree.insert("f".as_bytes(), "abc", seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(6, tree.len(None, None)?); + assert_eq!(6, tree.len(SeqNo::MAX, None)?); tree.insert("g".as_bytes(), "abc", seqno.next()); tree.insert("h".as_bytes(), "abc", seqno.next()); tree.insert("i".as_bytes(), "abc", seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(9, tree.len(None, None)?); + 
assert_eq!(9, tree.len(SeqNo::MAX, None)?); // NOTE: Previously, create_compaction_stream would short circuit // breaking this diff --git a/tests/experimental_blob_tree_guarded_size.rs b/tests/experimental_blob_tree_guarded_size.rs new file mode 100644 index 00000000..0cc1ef8e --- /dev/null +++ b/tests/experimental_blob_tree_guarded_size.rs @@ -0,0 +1,21 @@ +use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; +use test_log::test; + +#[test] +fn experimental_blob_tree_guarded_size() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(folder).open_as_blob_tree()?; + + tree.insert("a".as_bytes(), "abc", 0); + tree.insert("b".as_bytes(), "a".repeat(10_000), 0); + + assert_eq!( + 10_003u32, + tree.iter(SeqNo::MAX, None) + .flat_map(Guard::size) + .sum(), + ); + + Ok(()) +} diff --git a/tests/experimental_tree_guarded_range.rs b/tests/experimental_tree_guarded_range.rs new file mode 100644 index 00000000..f8f89cbd --- /dev/null +++ b/tests/experimental_tree_guarded_range.rs @@ -0,0 +1,62 @@ +use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; +use test_log::test; + +#[test] +fn experimental_tree_guarded_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(folder).open()?; + + tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 0); + tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 1); + tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 2); + + tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 3); + tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 4); + tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 5); + + assert_eq!( + 2, + tree.range("a"..="f", SeqNo::MAX, None) + .flat_map(Guard::key) + .count(), + ); + assert_eq!( + 2, + tree.range("f"..="g", SeqNo::MAX, None) + .flat_map(Guard::key) + .count(), + ); + + Ok(()) +} + +#[test] +fn experimental_blob_tree_guarded_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(folder).open_as_blob_tree()?; + + tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 0); + tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 1); + tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 2); + + tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 3); + tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 4); + tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 5); + + assert_eq!( + 2, + tree.range("a"..="f", SeqNo::MAX, None) + .flat_map(Guard::key) + .count(), + ); + assert_eq!( + 2, + tree.range("f"..="g", SeqNo::MAX, None) + .flat_map(Guard::key) + .count(), + ); + + Ok(()) +} diff --git a/tests/major_compaction.rs b/tests/major_compaction.rs index ab04581b..b83fdc49 100644 --- a/tests/major_compaction.rs +++ b/tests/major_compaction.rs @@ -36,7 +36,7 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold assert_eq!(1, tree.segment_count()); - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); let batch_seqno = seqno.next(); tree.remove("a".as_bytes(), batch_seqno); @@ -49,7 +49,7 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { tree.major_compact(u64::MAX, 1_000 /* NOTE: Simulate some time passing */)?; assert_eq!(0, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } diff --git a/tests/open_files.rs b/tests/open_files.rs index 24600046..e4466387 100644 --- 
a/tests/open_files.rs
+++ b/tests/open_files.rs
@@ -1,4 +1,4 @@
-use lsm_tree::{AbstractTree, Cache, Config};
+use lsm_tree::{AbstractTree, Cache, Config, SeqNo};
 use std::sync::Arc;
 use test_log::test;

@@ -19,7 +19,7 @@ fn open_file_limit() -> lsm_tree::Result<()> {
     }

     for _ in 0..5 {
-        assert!(tree.first_key_value(None, None)?.is_some());
+        assert!(tree.first_key_value(SeqNo::MAX, None)?.is_some());
     }

     Ok(())
diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs
index 6da86fab..a8715ac8 100644
--- a/tests/segment_point_reads.rs
+++ b/tests/segment_point_reads.rs
@@ -21,7 +21,7 @@ fn segment_point_reads() -> lsm_tree::Result<()> {

     for x in 0..ITEM_COUNT as u64 {
         let key = x.to_be_bytes();
-        assert!(tree.contains_key(key, None)?, "{key:?} not found");
+        assert!(tree.contains_key(key, SeqNo::MAX)?, "{key:?} not found");
     }

     Ok(())
@@ -51,17 +51,19 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> {
     assert_eq!(item.key.seqno, 2);
     assert_eq!(&*item.value, b"2");

-    let snapshot = tree.snapshot(3);
-    let item = snapshot.get(key)?.unwrap();
-    assert_eq!(&*item, b"2");
+    // TODO: test snapshot reads

-    let snapshot = tree.snapshot(2);
-    let item = snapshot.get(key)?.unwrap();
-    assert_eq!(&*item, b"1");
+    // let snapshot = tree.snapshot(3);
+    // let item = snapshot.get(key)?.unwrap();
+    // assert_eq!(&*item, b"2");

-    let snapshot = tree.snapshot(1);
-    let item = snapshot.get(key)?.unwrap();
-    assert_eq!(&*item, b"0");
+    // let snapshot = tree.snapshot(2);
+    // let item = snapshot.get(key)?.unwrap();
+    // assert_eq!(&*item, b"1");
+
+    // let snapshot = tree.snapshot(1);
+    // let item = snapshot.get(key)?.unwrap();
+    // assert_eq!(&*item, b"0");
 }

 Ok(())
@@ -93,20 +95,22 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> {
         assert_eq!(item.key.seqno, ITEM_COUNT as u64 - 1);
     }

-    for key in &keys {
-        // NOTE: Need to start at seqno=1
-        for seqno in 1..ITEM_COUNT as u64 {
-            let snapshot = tree.snapshot(seqno);
-            let item = snapshot.get(key)?.unwrap();
-
-            // NOTE: When snapshot is =1, it will read any items with
-            // seqno less than 1
-            assert_eq!(
-                String::from_utf8_lossy(&item).parse::<u64>().unwrap(),
-                seqno - 1
-            );
-        }
-    }
+    // TODO: test snapshot reads
+
+    // for key in &keys {
+    //     // NOTE: Need to start at seqno=1
+    //     for seqno in 1..ITEM_COUNT as u64 {
+    //         let snapshot = tree.snapshot(seqno);
+    //         let item = snapshot.get(key)?.unwrap();
+
+    //         // NOTE: When snapshot is =1, it will read any items with
+    //         // seqno less than 1
+    //         assert_eq!(
+    //             String::from_utf8_lossy(&item).parse::<u64>().unwrap(),
+    //             seqno - 1
+    //         );
+    //     }
+    // }

     Ok(())
 }
@@ -133,27 +137,29 @@ fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> {
     tree.flush_active_memtable(0)?;

     for key in &keys {
-        let item = tree.get(key, None)?.unwrap();
+        let item = tree.get(key, SeqNo::MAX)?.unwrap();
         assert_eq!(
             String::from_utf8_lossy(&item).parse::<u64>().unwrap(),
             ITEM_COUNT as u64 - 1
         );
     }

-    for key in &keys {
-        // NOTE: Need to start at seqno=1
-        for seqno in 1..ITEM_COUNT as u64 {
-            let snapshot = tree.snapshot(seqno);
-            let item = snapshot.get(key)?.unwrap();
-
-            // NOTE: When snapshot is =1, it will read any items with
-            // seqno less than 1
-            assert_eq!(
-                String::from_utf8_lossy(&item).parse::<u64>().unwrap(),
-                seqno - 1
-            );
-        }
-    }
+    // TODO: test snapshot reads
+
+    // for key in &keys {
+    //     // NOTE: Need to start at seqno=1
+    //     for seqno in 1..ITEM_COUNT as u64 {
+    //         let snapshot = tree.snapshot(seqno);
+    //         let item = snapshot.get(key)?.unwrap();
+
+    //         // NOTE: When snapshot is =1, it will read any items with
+    //         // seqno less than 1
+    //         assert_eq!(
+    //             String::from_utf8_lossy(&item).parse::<u64>().unwrap(),
+    //             seqno - 1
+    //         );
+    //     }
+    // }

 Ok(())
 }
diff --git a/tests/segment_range.rs b/tests/segment_range.rs
index 0a6ab804..71e611d5 100644
--- a/tests/segment_range.rs
+++ b/tests/segment_range.rs
@@ -1,4 +1,4 @@
-use lsm_tree::{AbstractTree, Config};
+use lsm_tree::{AbstractTree, Config, Guard, SeqNo};
 use test_log::test;

 const ITEM_COUNT: usize = 1_000_000;

@@ -19,13 +19,25 @@ fn segment_ranges() -> lsm_tree::Result<()> {
     }
     tree.flush_active_memtable(0)?;

-    let iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes(), None, None);
+    let iter = tree.range(
+        1_000u64.to_be_bytes()..11_000u64.to_be_bytes(),
+        SeqNo::MAX,
+        None,
+    );
     assert_eq!(10_000, iter.count());

-    let iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes(), None, None);
+    let iter = tree.range(
+        1_000u64.to_be_bytes()..11_000u64.to_be_bytes(),
+        SeqNo::MAX,
+        None,
+    );
     assert_eq!(10_000, iter.rev().count());

-    let mut iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes(), None, None);
+    let mut iter = tree.range(
+        1_000u64.to_be_bytes()..11_000u64.to_be_bytes(),
+        SeqNo::MAX,
+        None,
+    );

     let mut count = 0;

     for x in 0.. {
@@ -66,19 +78,19 @@ fn segment_range_last_back() -> lsm_tree::Result<()> {
     }
     tree.flush_active_memtable(0)?;

-    let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes(), None, None);
+    let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes(), SeqNo::MAX, None);
     assert_eq!(10, iter.count());

-    let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes(), None, None);
+    let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes(), SeqNo::MAX, None);
     assert_eq!(10, iter.rev().count());

-    let mut iter = tree.range(0u64.to_be_bytes()..5u64.to_be_bytes(), None, None);
+    let mut iter = tree.range(0u64.to_be_bytes()..5u64.to_be_bytes(), SeqNo::MAX, None);

-    assert_eq!(0u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0);
-    assert_eq!(1u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0);
-    assert_eq!(2u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0);
-    assert_eq!(3u64.to_be_bytes(),
&*iter.next().unwrap().unwrap().0); - assert_eq!(4u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(5u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(6u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(7u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(8u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(9u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(10u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); - assert_eq!(11u64.to_be_bytes(), &*iter.next_back().unwrap().unwrap().0); + let mut iter = tree.range(0u64.to_be_bytes()..12u64.to_be_bytes(), SeqNo::MAX, None); + + assert_eq!(0u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(1u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(2u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(3u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(4u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(5u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(6u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(7u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(8u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(9u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(10u64.to_be_bytes(), &*iter.next().unwrap().key()?); + assert_eq!(11u64.to_be_bytes(), &*iter.next_back().unwrap().key()?); assert!(iter.next().is_none()); assert!(iter.next_back().is_none()); diff --git a/tests/segment_range_oob.rs b/tests/segment_range_oob.rs index 0139d55c..4b9a8da3 100644 --- a/tests/segment_range_oob.rs +++ b/tests/segment_range_oob.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; const ITEM_COUNT: usize = 100; @@ -18,11 +18,11 @@ fn segment_range_out_of_bounds_lo() -> lsm_tree::Result<()> { } tree.flush_active_memtable(0)?; - assert_eq!(4, tree.range(..="k", None, None).count()); - assert_eq!(4, tree.range(..="k", None, None).rev().count()); + assert_eq!(4, tree.range(..="k", SeqNo::MAX, None).count()); + assert_eq!(4, tree.range(..="k", SeqNo::MAX, None).rev().count()); - assert_eq!(4, tree.range("0"..="k", None, None).count()); - assert_eq!(4, tree.range("0"..="k", None, None).rev().count()); + assert_eq!(4, tree.range("0"..="k", SeqNo::MAX, None).count()); + assert_eq!(4, tree.range("0"..="k", SeqNo::MAX, None).rev().count()); Ok(()) } @@ -43,24 +43,36 @@ fn segment_range_out_of_bounds_hi() -> lsm_tree::Result<()> { } tree.flush_active_memtable(0)?; - assert_eq!(50, tree.range((50u64.to_be_bytes()).., None, None).count()); assert_eq!( 50, - tree.range((50u64.to_be_bytes()).., None, None) + tree.range((50u64.to_be_bytes()).., SeqNo::MAX, None) + .count() + ); + assert_eq!( + 50, + tree.range((50u64.to_be_bytes()).., SeqNo::MAX, None) .rev() .count() ); assert_eq!( 50, - tree.range((50u64.to_be_bytes())..(150u64.to_be_bytes()), None, None) - .count() + tree.range( + (50u64.to_be_bytes())..(150u64.to_be_bytes()), + SeqNo::MAX, + None + ) + .count() ); assert_eq!( 50, - tree.range((50u64.to_be_bytes())..(150u64.to_be_bytes()), None, None) - .rev() - .count() + tree.range( + (50u64.to_be_bytes())..(150u64.to_be_bytes()), + SeqNo::MAX, + None + ) + .rev() + .count() ); Ok(()) diff --git a/tests/segment_remove_weak.rs b/tests/segment_remove_weak.rs index e9415607..1b2b7c72 100644 --- a/tests/segment_remove_weak.rs +++ b/tests/segment_remove_weak.rs @@ -1,4 +1,4 @@ -use 
lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -16,7 +16,7 @@ fn segment_remove_weak_simple() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert!(tree.get("a", None)?.is_none()); + assert!(tree.get("a", SeqNo::MAX)?.is_none()); Ok(()) } diff --git a/tests/snapshot_compact.rs b/tests/snapshot_compact.rs index 1002c36b..c8c775ec 100644 --- a/tests/snapshot_compact.rs +++ b/tests/snapshot_compact.rs @@ -1,41 +1,42 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 100; #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_after_compaction() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; + // let folder = tempfile::tempdir()?; - let tree = Config::new(&folder).open()?; + // let tree = Config::new(&folder).open()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - let snapshot_seqno = seqno.get(); - let snapshot = tree.snapshot(snapshot_seqno); + // let snapshot_seqno = seqno.get(); + // let snapshot = tree.snapshot(snapshot_seqno); - assert_eq!(tree.len(None, None)?, snapshot.len()?); - assert_eq!(tree.len(None, None)?, snapshot.iter().rev().count()); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - tree.flush_active_memtable(0)?; - tree.major_compact(u64::MAX, 0)?; + // tree.flush_active_memtable(0)?; + // tree.major_compact(u64::MAX, 0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!(ITEM_COUNT, snapshot.len()?); - assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + // assert_eq!(ITEM_COUNT, snapshot.len()?); + // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); Ok(()) } diff --git a/tests/snapshot_len.rs b/tests/snapshot_len.rs index 7cbf20f4..cb0c680b 100644 --- a/tests/snapshot_len.rs +++ b/tests/snapshot_len.rs @@ -1,43 +1,44 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 100; #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_basic() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder).open()?; + // let tree = Config::new(&folder).open()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // 
assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - let snapshot = tree.snapshot(seqno.get()); + // let snapshot = tree.snapshot(seqno.get()); - assert_eq!(tree.len(None, None)?, snapshot.len()?); - assert_eq!(tree.len(None, None)?, snapshot.iter().rev().count()); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); - for x in (ITEM_COUNT as u64)..((ITEM_COUNT * 2) as u64) { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in (ITEM_COUNT as u64)..((ITEM_COUNT * 2) as u64) { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(ITEM_COUNT, snapshot.len()?); - assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); + // assert_eq!(ITEM_COUNT, snapshot.len()?); + // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); Ok(()) } diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index d6ea8b2d..6926e03d 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -1,185 +1,188 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_404() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) - .open()?; + // let tree = Config::new(&folder) + // .data_block_size(1_024) + // .index_block_size(1_024) + // .open()?; - tree.insert("a", "a", 0); - tree.insert("a2", "a2", 0); - tree.insert("c", "c", 0); + // tree.insert("a", "a", 0); + // tree.insert("a2", "a2", 0); + // tree.insert("c", "c", 0); - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(b"a", &*tree.get("a", None)?.unwrap()); - assert_eq!(b"a2", &*tree.get("a2", None)?.unwrap()); - assert!(tree.get("b", None)?.is_none()); - assert_eq!(b"c", &*tree.get("c", None)?.unwrap()); + // assert_eq!(b"a", &*tree.get("a", None)?.unwrap()); + // assert_eq!(b"a2", &*tree.get("a2", None)?.unwrap()); + // assert!(tree.get("b", None)?.is_none()); + // assert_eq!(b"c", &*tree.get("c", None)?.unwrap()); - assert!(tree.get("a", Some(0))?.is_none()); - assert!(tree.get("a2", Some(0))?.is_none()); - assert!(tree.get("b", Some(0))?.is_none()); - assert!(tree.get("c", Some(0))?.is_none()); + // assert!(tree.get("a", Some(0))?.is_none()); + // assert!(tree.get("a2", Some(0))?.is_none()); + // assert!(tree.get("b", Some(0))?.is_none()); + // assert!(tree.get("c", Some(0))?.is_none()); - assert_eq!(b"a", &*tree.get("a", Some(1))?.unwrap()); - assert_eq!(b"a2", &*tree.get("a2", Some(1))?.unwrap()); - assert!(tree.get("b", Some(1))?.is_none()); - assert_eq!(b"c", &*tree.get("c", Some(1))?.unwrap()); + // assert_eq!(b"a", &*tree.get("a", Some(1))?.unwrap()); + // assert_eq!(b"a2", &*tree.get("a2", Some(1))?.unwrap()); + // assert!(tree.get("b", Some(1))?.is_none()); + // assert_eq!(b"c", 
&*tree.get("c", Some(1))?.unwrap()); Ok(()) } #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { let version_count = 600; - let folder = tempfile::tempdir()?; + // let folder = tempfile::tempdir()?; - let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) - .open()?; + // let tree = Config::new(&folder) + // .data_block_size(1_024) + // .index_block_size(1_024) + // .open()?; - let key = "abc"; + // let key = "abc"; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - #[allow(clippy::explicit_counter_loop)] - for _ in 0u64..version_count { - tree.insert(key, format!("abc{version_count}").as_bytes(), seqno.next()); - } + // #[allow(clippy::explicit_counter_loop)] + // for _ in 0u64..version_count { + // tree.insert(key, format!("abc{version_count}").as_bytes(), seqno.next()); + // } - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); + // assert_eq!(tree.len(SeqNo::MAX, None)?, 1); - for seqno in 1..version_count { - let item = tree - .get_internal_entry(key.as_bytes(), seqno)? - .expect("should exist"); - assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); + // for seqno in 1..version_count { + // let item = tree + // .get_internal_entry(key.as_bytes(), seqno)? + // .expect("should exist"); + // assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); - let item = tree.get(key, None)?.expect("should exist"); - assert_eq!(format!("abc{}", version_count).as_bytes(), &*item); - } + // let item = tree.get(key, None)?.expect("should exist"); + // assert_eq!(format!("abc{}", version_count).as_bytes(), &*item); + // } - Ok(()) -} + // Ok(()) + // } -const ITEM_COUNT: usize = 1; -const BATCHES: usize = 10; + // const ITEM_COUNT: usize = 1; + // const BATCHES: usize = 10; -#[test] -fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; + // #[test] + // fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { + // let folder = tempfile::tempdir()?; - let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) - .open()?; + // let tree = Config::new(&folder) + // .data_block_size(1_024) + // .index_block_size(1_024) + // .open()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - for batch in 0..BATCHES { - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, format!("abc{batch}").as_bytes(), seqno.next()); - } - } + // for batch in 0..BATCHES { + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, format!("abc{batch}").as_bytes(), seqno.next()); + // } + // } - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); - let item = tree.get(key, None)?.expect("should exist"); - assert_eq!("abc9".as_bytes(), &*item); - } + // let item = tree.get(key, None)?.expect("should exist"); + // assert_eq!("abc9".as_bytes(), &*item); + // } - let snapshot = tree.snapshot(seqno.get()); + // let snapshot = tree.snapshot(seqno.get()); - assert_eq!(tree.len(None, None)?, snapshot.len()?); - assert_eq!(tree.len(None, None)?, snapshot.iter().rev().count()); + // 
assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); - // This batch will be too new for snapshot (invisible) - for batch in 0..BATCHES { - let batch_seqno = seqno.next(); + // // This batch will be too new for snapshot (invisible) + // for batch in 0..BATCHES { + // let batch_seqno = seqno.next(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); - } - } - tree.flush_active_memtable(0)?; + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); + // } + // } + // tree.flush_active_memtable(0)?; - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); - let item = snapshot.get(key)?.expect("should exist"); - assert_eq!("abc9".as_bytes(), &*item); + // let item = snapshot.get(key)?.expect("should exist"); + // assert_eq!("abc9".as_bytes(), &*item); - let item = tree.get(key, None)?.expect("should exist"); - assert_eq!("def9".as_bytes(), &*item); - } + // let item = tree.get(key, None)?.expect("should exist"); + // assert_eq!("def9".as_bytes(), &*item); + // } Ok(()) } #[test] +#[ignore = "remove"] fn snapshot_disk_and_memtable_reads() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) - .open()?; + // let tree = Config::new(&folder) + // .data_block_size(1_024) + // .index_block_size(1_024) + // .open()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - for batch in 0..BATCHES { - let batch_seqno = seqno.next(); + // for batch in 0..BATCHES { + // let batch_seqno = seqno.next(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, format!("abc{batch}").as_bytes(), batch_seqno); - } - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, format!("abc{batch}").as_bytes(), batch_seqno); + // } + // } - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - let snapshot = tree.snapshot(seqno.get()); + // let snapshot = tree.snapshot(seqno.get()); - assert_eq!(tree.len(None, None)?, snapshot.len()?); - assert_eq!(tree.len(None, None)?, snapshot.iter().rev().count()); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); + // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); - // This batch will be in memtable and too new for snapshot (invisible) - for batch in 0..BATCHES { - let batch_seqno = seqno.next(); + // // This batch will be in memtable and too new for snapshot (invisible) + // for batch in 0..BATCHES { + // let batch_seqno = seqno.next(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); - } - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); + // } + // } - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); - let item = snapshot.get(key)?.expect("should exist"); - assert_eq!("abc9".as_bytes(), &*item); + // let item = snapshot.get(key)?.expect("should exist"); + // 
assert_eq!("abc9".as_bytes(), &*item); - let item = tree.get(key, None)?.expect("should exist"); - assert_eq!("def9".as_bytes(), &*item); - } + // let item = tree.get(key, None)?.expect("should exist"); + // assert_eq!("def9".as_bytes(), &*item); + // } Ok(()) } diff --git a/tests/snapshot_zombie.rs b/tests/snapshot_zombie.rs index ac554a9e..3979e1ba 100644 --- a/tests/snapshot_zombie.rs +++ b/tests/snapshot_zombie.rs @@ -1,104 +1,106 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 5; #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_zombie_memtable() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder).open()?; + // let tree = Config::new(&folder).open()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); - assert_eq!(tree.iter(None, None).rev().count(), ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); - { - let snapshot = tree.snapshot(seqno.get()); - assert_eq!(ITEM_COUNT, snapshot.len()?); - assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); - } + // { + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(ITEM_COUNT, snapshot.len()?); + // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + // } - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.remove(key, seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.remove(key, seqno.next()); + // } - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).rev().count(), 0); + // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - { - let snapshot = tree.snapshot(seqno.get()); - assert_eq!(0, snapshot.len()?); - assert_eq!(0, snapshot.iter().rev().count()); - assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - } + // { + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(0, snapshot.len()?); + // assert_eq!(0, snapshot.iter().rev().count()); + // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); + // } Ok(()) } #[test] +#[ignore = "restore w/o snapshot API"] fn snapshot_zombie_segment() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let seqno = SequenceNumberCounter::default(); + // let seqno = SequenceNumberCounter::default(); - { - let tree = Config::new(&folder).open()?; + // { + // let tree = Config::new(&folder).open()?; - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.insert(key, "abc".as_bytes(), seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.insert(key, "abc".as_bytes(), seqno.next()); + // } - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); - assert_eq!(tree.iter(None, None).rev().count(), ITEM_COUNT); + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); - { - let snapshot = tree.snapshot(seqno.get()); - 
assert_eq!(ITEM_COUNT, snapshot.len()?); - assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); - } + // { + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(ITEM_COUNT, snapshot.len()?); + // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + // } - for x in 0..ITEM_COUNT as u64 { - let key = x.to_be_bytes(); - tree.remove(key, seqno.next()); - } + // for x in 0..ITEM_COUNT as u64 { + // let key = x.to_be_bytes(); + // tree.remove(key, seqno.next()); + // } - tree.flush_active_memtable(0)?; + // tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).rev().count(), 0); + // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - { - let snapshot = tree.snapshot(seqno.get()); - assert_eq!(0, snapshot.len()?); - assert_eq!(0, snapshot.iter().rev().count()); - assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - } - } + // { + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(0, snapshot.len()?); + // assert_eq!(0, snapshot.iter().rev().count()); + // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); + // } + // } - { - let tree = Config::new(&folder).open()?; + // { + // let tree = Config::new(&folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).rev().count(), 0); + // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - { - let snapshot = tree.snapshot(seqno.get()); - assert_eq!(0, snapshot.len()?); - assert_eq!(0, snapshot.iter().rev().count()); - assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - } - } + // { + // let snapshot = tree.snapshot(seqno.get()); + // assert_eq!(0, snapshot.len()?); + // assert_eq!(0, snapshot.iter().rev().count()); + // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); + // } + // } Ok(()) } diff --git a/tests/tree_approx_len.rs b/tests/tree_approx_len.rs index 0628816c..0c0dd2df 100644 --- a/tests/tree_approx_len.rs +++ b/tests/tree_approx_len.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use tempfile::tempdir; use test_log::test; @@ -8,26 +8,26 @@ fn tree_approx_len_sealed() -> lsm_tree::Result<()> { let tree = Config::new(folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 0); tree.insert("a", "", 0); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 1); tree.insert("b", "", 0); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); let _ = tree.rotate_memtable().unwrap(); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); Ok(()) @@ -39,26 +39,26 @@ fn tree_approx_len_sealed_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, 
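+    // NOTE: `approximate_len` counts every written entry, including
+    // overwrites and tombstones, so it can drift above the real length;
+    // it only converges again after a major compaction (see the
+    // diverge/converge assertions in `tree_approx_len` below).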
None)?); assert_eq!(tree.approximate_len(), 0); tree.insert("a", "", 0); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 1); tree.insert("b", "", 0); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); let _ = tree.rotate_memtable().unwrap(); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); Ok(()) @@ -70,58 +70,58 @@ fn tree_approx_len() -> lsm_tree::Result<()> { let tree = Config::new(folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 0); tree.insert("a", "", 0); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 1); tree.insert("b", "", 0); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); tree.insert("a", "", 1); // Approximate count diverges - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 3); tree.remove("a", 2); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 4); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 4); tree.remove("b", 4); - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 5); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 5); tree.major_compact(u64::MAX, 5)?; // Approximate count converges - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 0); Ok(()) @@ -133,58 +133,58 @@ fn tree_approx_len_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder).open_as_blob_tree()?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 0); tree.insert("a", "", 0); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + 
assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 1); tree.insert("b", "", 0); - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 2); tree.insert("a", "", 1); // Approximate count diverges - assert_eq!(tree.len(None, None)?, 2); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 2); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 3); tree.remove("a", 2); - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 4); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert!(!tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert!(!tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 4); tree.remove("b", 4); - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 5); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 5); tree.index.major_compact(u64::MAX, 5)?; // Approximate count converges - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.is_empty(None, None)?); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 0); Ok(()) diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index b29529cb..9de807c5 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; const ITEM_COUNT: usize = 100_000; @@ -16,13 +16,18 @@ fn tree_bulk_ingest() -> lsm_tree::Result<()> { (k.into(), v.into()) }))?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(None, None).filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - tree.iter(None, None).rev().filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); @@ -41,13 +46,18 @@ fn tree_copy() -> lsm_tree::Result<()> { (k.into(), v.into()) }))?; - assert_eq!(src.len(None, None)?, ITEM_COUNT); + assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - src.iter(None, None).filter(|x| x.is_ok()).count(), + src.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - src.iter(None, None).rev().filter(|x| x.is_ok()).count(), + src.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert!(src.lock_active_memtable().is_empty()); @@ -55,18 +65,24 @@ fn tree_copy() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let dest = Config::new(folder).open()?; - dest.ingest(src.iter(None, None).map(|kv| { - let (k, v) = kv.unwrap(); - (k, v) - }))?; + dest.ingest( + src.iter(SeqNo::MAX, None) + .map(|x| x.into_inner()) + .map(|x| x.unwrap()), + )?; - assert_eq!(dest.len(None, 
None)?, ITEM_COUNT); + assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - dest.iter(None, None).filter(|x| x.is_ok()).count(), + dest.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - dest.iter(None, None).rev().filter(|x| x.is_ok()).count(), + dest.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert!(dest.lock_active_memtable().is_empty()); @@ -89,13 +105,18 @@ fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { (k.into(), v.into()) }))?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(None, None).filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - tree.iter(None, None).rev().filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!(1, tree.blob_file_count()); @@ -117,13 +138,18 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { (k.into(), v.into()) }))?; - assert_eq!(src.len(None, None)?, ITEM_COUNT); + assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - src.iter(None, None).filter(|x| x.is_ok()).count(), + src.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - src.iter(None, None).rev().filter(|x| x.is_ok()).count(), + src.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert!(src.lock_active_memtable().is_empty()); @@ -134,18 +160,24 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { .blob_file_separation_threshold(1) .open_as_blob_tree()?; - dest.ingest(src.iter(None, None).map(|kv| { - let (k, v) = kv.unwrap(); - (k, v) - }))?; + dest.ingest( + src.iter(SeqNo::MAX, None) + .map(|x| x.into_inner()) + .map(|x| x.unwrap()), + )?; - assert_eq!(dest.len(None, None)?, ITEM_COUNT); + assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - dest.iter(None, None).filter(|x| x.is_ok()).count(), + dest.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - dest.iter(None, None).rev().filter(|x| x.is_ok()).count(), + dest.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert!(dest.lock_active_memtable().is_empty()); diff --git a/tests/tree_count.rs b/tests/tree_count.rs index ee3eea87..febb4e1c 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Slice}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, Slice}; use test_log::test; const ITEM_COUNT: usize = 1_000; @@ -15,13 +15,18 @@ fn tree_memtable_count() -> lsm_tree::Result<()> { tree.insert(key, value.as_bytes(), 0); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(None, None).filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - tree.iter(None, None).rev().filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); @@ -42,13 +47,18 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(None, None).filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - tree.iter(None, 
None).rev().filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); @@ -69,13 +79,18 @@ fn tree_flushed_count_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(None, None).filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); assert_eq!( - tree.iter(None, None).rev().filter(|x| x.is_ok()).count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT ); @@ -105,7 +120,8 @@ fn tree_non_locking_count() -> lsm_tree::Result<()> { loop { let chunk = tree - .range(range.clone(), None, None) + .range(range.clone(), SeqNo::MAX, None) + .map(|x| x.into_inner()) .take(10) .collect::>>()?; diff --git a/tests/tree_delete_loop.rs b/tests/tree_delete_loop.rs index 80f3cc9e..67609012 100644 --- a/tests/tree_delete_loop.rs +++ b/tests/tree_delete_loop.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -22,20 +22,44 @@ fn tree_delete_by_prefix() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 3); - assert_eq!(tree.prefix("a:".as_bytes(), None, None).count(), ITEM_COUNT); - assert_eq!(tree.prefix("b:".as_bytes(), None, None).count(), ITEM_COUNT); - assert_eq!(tree.prefix("c:".as_bytes(), None, None).count(), ITEM_COUNT); - - for item in tree.prefix("b:".as_bytes(), None, None) { - let (key, _) = item?; + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 3); + assert_eq!( + tree.prefix("a:".as_bytes(), SeqNo::MAX, None) + .count(), + ITEM_COUNT + ); + assert_eq!( + tree.prefix("b:".as_bytes(), SeqNo::MAX, None) + .count(), + ITEM_COUNT + ); + assert_eq!( + tree.prefix("c:".as_bytes(), SeqNo::MAX, None) + .count(), + ITEM_COUNT + ); + + for item in tree.prefix("b:".as_bytes(), SeqNo::MAX, None) { + let key = item.key()?; tree.remove(key, seqno.next()); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(tree.prefix("a:".as_bytes(), None, None).count(), ITEM_COUNT); - assert_eq!(tree.prefix("b:".as_bytes(), None, None).count(), 0); - assert_eq!(tree.prefix("c:".as_bytes(), None, None).count(), ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); + assert_eq!( + tree.prefix("a:".as_bytes(), SeqNo::MAX, None) + .count(), + ITEM_COUNT + ); + assert_eq!( + tree.prefix("b:".as_bytes(), SeqNo::MAX, None) + .count(), + 0 + ); + assert_eq!( + tree.prefix("c:".as_bytes(), SeqNo::MAX, None) + .count(), + ITEM_COUNT + ); Ok(()) } @@ -56,14 +80,14 @@ fn tree_delete_by_range() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 6); + assert_eq!(tree.len(SeqNo::MAX, None)?, 6); - for item in tree.range("c"..="e", None, None) { - let (key, _) = item?; + for item in tree.range("c"..="e", SeqNo::MAX, None) { + let key = item.key()?; tree.remove(key, 1); } - assert_eq!(tree.len(None, None)?, 3); + assert_eq!(tree.len(SeqNo::MAX, None)?, 3); Ok(()) } diff --git a/tests/tree_different_block_size.rs b/tests/tree_different_block_size.rs index e8c2929c..bdd96eeb 100644 --- a/tests/tree_different_block_size.rs +++ b/tests/tree_different_block_size.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, 
SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 1_000; @@ -23,7 +23,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } { @@ -31,7 +31,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { .data_block_size(2_048) .index_block_size(2_048) .open()?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } { @@ -39,7 +39,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { .data_block_size(4_096) .index_block_size(4_096) .open()?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } { @@ -47,7 +47,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { .data_block_size(78_652) .index_block_size(78_652) .open()?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } Ok(()) diff --git a/tests/tree_disjoint_iter.rs b/tests/tree_disjoint_iter.rs index 912e636b..451d3671 100644 --- a/tests/tree_disjoint_iter.rs +++ b/tests/tree_disjoint_iter.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Slice}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, Slice}; use test_log::test; macro_rules! iter_closed { @@ -30,38 +30,38 @@ fn tree_disjoint_iter() -> lsm_tree::Result<()> { // NOTE: Forwards - let mut iter = tree.iter(None, None); + let mut iter = tree.iter(SeqNo::MAX, None); - assert_eq!(Slice::from(*b"a"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"b"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"c"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"d"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"a"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"b"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"c"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"d"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Reverse - let mut iter = tree.iter(None, None).rev(); + let mut iter = tree.iter(SeqNo::MAX, None).rev(); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"d"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"c"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"b"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"a"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"d"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"c"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"b"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"a"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Ping Pong - let mut iter = tree.iter(None, None); + let mut iter = tree.iter(SeqNo::MAX, None); - assert_eq!(Slice::from(*b"a"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next_back().unwrap()?.0); - assert_eq!(Slice::from(*b"b"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"e"), iter.next_back().unwrap()?.0); - 
assert_eq!(Slice::from(*b"c"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"d"), iter.next_back().unwrap()?.0); + assert_eq!(Slice::from(*b"a"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"b"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"e"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"c"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"d"), iter.next_back().unwrap().key()?); iter_closed!(iter); Ok(()) diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index 5ffe3cf5..1d25aa5b 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use std::sync::Arc; use test_log::test; @@ -24,7 +24,7 @@ fn tree_disjoint_point_read() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in [b"a", b"b", b"c", b"d", b"e", b"f"] { - let value = tree.get(key, None).unwrap().unwrap(); + let value = tree.get(key, SeqNo::MAX).unwrap().unwrap(); assert_eq!(&*value, key) } @@ -53,7 +53,7 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in [b"a", b"b", b"c", b"d", b"e", b"f"] { - let value = tree.get(key, None).unwrap().unwrap(); + let value = tree.get(key, SeqNo::MAX).unwrap().unwrap(); assert_eq!(&*value, key) } @@ -108,7 +108,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in [b"z", b"b", b"c", b"d", b"e", b"f"] { - let value = tree.get(key, None).unwrap().unwrap(); + let value = tree.get(key, SeqNo::MAX).unwrap().unwrap(); assert_eq!(&*value, key) } @@ -164,7 +164,7 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in [b"z", b"b", b"c", b"d", b"e", b"f"] { - let value = tree.get(key, None).unwrap().unwrap(); + let value = tree.get(key, SeqNo::MAX).unwrap().unwrap(); assert_eq!(&*value, key) } diff --git a/tests/tree_disjoint_prefix.rs b/tests/tree_disjoint_prefix.rs index 4000d8e5..66ab135d 100644 --- a/tests/tree_disjoint_prefix.rs +++ b/tests/tree_disjoint_prefix.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Slice}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, Slice}; use test_log::test; macro_rules! 
iter_closed { @@ -35,29 +35,29 @@ fn tree_disjoint_prefix() -> lsm_tree::Result<()> { // NOTE: Forwards - let mut iter = tree.prefix("d", None, None); + let mut iter = tree.prefix("d", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"da"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"db"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"dc"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"da"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"db"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"dc"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Reverse - let mut iter = tree.prefix("d", None, None).rev(); + let mut iter = tree.prefix("d", SeqNo::MAX, None).rev(); - assert_eq!(Slice::from(*b"dc"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"db"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"da"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"dc"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"db"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"da"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Ping Pong - let mut iter = tree.prefix("d", None, None); + let mut iter = tree.prefix("d", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"da"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"dc"), iter.next_back().unwrap()?.0); - assert_eq!(Slice::from(*b"db"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"da"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"dc"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"db"), iter.next().unwrap().key()?); iter_closed!(iter); Ok(()) diff --git a/tests/tree_disjoint_range.rs b/tests/tree_disjoint_range.rs index e0aa43b1..46b92c95 100644 --- a/tests/tree_disjoint_range.rs +++ b/tests/tree_disjoint_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Slice}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, Slice}; use test_log::test; macro_rules! 
iter_closed { @@ -35,65 +35,65 @@ fn tree_disjoint_range() -> lsm_tree::Result<()> { // NOTE: Forwards - let mut iter = tree.range("e".."i", None, None); + let mut iter = tree.range("e".."i", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"h"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"h"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Forwards inclusive - let mut iter = tree.range("e"..="i", None, None); + let mut iter = tree.range("e"..="i", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"h"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"i"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"h"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"i"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Reverse - let mut iter = tree.range("e".."i", None, None).rev(); + let mut iter = tree.range("e".."i", SeqNo::MAX, None).rev(); - assert_eq!(Slice::from(*b"h"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"h"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Reverse inclusive - let mut iter = tree.range("e"..="i", None, None).rev(); + let mut iter = tree.range("e"..="i", SeqNo::MAX, None).rev(); - assert_eq!(Slice::from(*b"i"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"h"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"i"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"h"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); iter_closed!(iter); // NOTE: Ping Pong - let mut iter = tree.range("e".."i", None, None); + let mut iter = tree.range("e".."i", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"h"), iter.next_back().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next_back().unwrap()?.0); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"h"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next_back().unwrap().key()?); 
iter_closed!(iter); // NOTE: Ping Pong inclusive - let mut iter = tree.range("e"..="i", None, None); + let mut iter = tree.range("e"..="i", SeqNo::MAX, None); - assert_eq!(Slice::from(*b"e"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"i"), iter.next_back().unwrap()?.0); - assert_eq!(Slice::from(*b"f"), iter.next().unwrap()?.0); - assert_eq!(Slice::from(*b"h"), iter.next_back().unwrap()?.0); - assert_eq!(Slice::from(*b"g"), iter.next().unwrap()?.0); + assert_eq!(Slice::from(*b"e"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"i"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"f"), iter.next().unwrap().key()?); + assert_eq!(Slice::from(*b"h"), iter.next_back().unwrap().key()?); + assert_eq!(Slice::from(*b"g"), iter.next().unwrap().key()?); iter_closed!(iter); Ok(()) diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs index bd568c65..a4f295b6 100644 --- a/tests/tree_drop_range.rs +++ b/tests/tree_drop_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, KeyRange, UserKey}; +use lsm_tree::{AbstractTree, Config, KeyRange, SeqNo, UserKey}; use test_log::test; #[test] @@ -18,11 +18,11 @@ fn tree_drop_range() -> lsm_tree::Result<()> { tree.drop_range(KeyRange::new((UserKey::from("a"), UserKey::from("c"))))?; - assert!(!tree.contains_key("a", None)?); - assert!(!tree.contains_key("b", None)?); - assert!(!tree.contains_key("c", None)?); - assert!(tree.contains_key("d", None)?); - assert!(tree.contains_key("e", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + assert!(tree.contains_key("e", SeqNo::MAX)?); assert_eq!(1, tree.l0_run_count()); assert_eq!(2, tree.segment_count()); diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index 6d881076..10441569 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use std::sync::Arc; use test_log::test; @@ -11,12 +11,12 @@ fn tree_flush_eviction_1() -> lsm_tree::Result<()> { tree.insert("a", "a", 0); tree.remove_weak("a", 1); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); // NOTE: Should not evict weak tombstone tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -30,12 +30,12 @@ fn tree_flush_eviction_2() -> lsm_tree::Result<()> { tree.insert("a", "a", 0); tree.remove_weak("a", 1); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); // NOTE: Should evict old value, thus weak tombstone too tree.flush_active_memtable(1)?; assert_eq!(0, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -50,17 +50,17 @@ fn tree_flush_eviction_3() -> lsm_tree::Result<()> { tree.insert("a", "a", 0); tree.remove("a", 1); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); // NOTE: Should evict old value, but tombstone should stay until last level tree.flush_active_memtable(1)?; assert_eq!(1, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); // NOTE: Should evict tombstone because last level tree.compact(Arc::new(lsm_tree::compaction::PullDown(0, 6)), 0)?; assert_eq!(0, tree.segment_count()); - 
assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -76,12 +76,12 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { tree.insert("a", "a", 0); tree.remove("a", 1); tree.insert("a", "a", 2); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); // NOTE: Tombstone should stay because of seqno threshold tree.flush_active_memtable(1)?; assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 1, tree.manifest @@ -101,7 +101,7 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { // NOTE: Should evict tombstone because last level tree.compact(Arc::new(lsm_tree::compaction::PullDown(0, 6)), 0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 0, tree.manifest diff --git a/tests/tree_iter_lifetime.rs b/tests/tree_iter_lifetime.rs deleted file mode 100644 index 655de9d3..00000000 --- a/tests/tree_iter_lifetime.rs +++ /dev/null @@ -1,23 +0,0 @@ -use lsm_tree::{AbstractTree, KvPair}; -use std::path::Path; -use test_log::test; - -fn iterrr( - path: &Path, -) -> lsm_tree::Result>> { - let tree = lsm_tree::Config::new(path).open()?; - - for x in 0..100u32 { - let x = x.to_be_bytes(); - tree.insert(x, x, 0); - } - - Ok(tree.iter(None, None)) -} - -#[test] -fn tree_iter_lifetime() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir().unwrap(); - assert_eq!(100, iterrr(folder.path())?.count()); - Ok(()) -} diff --git a/tests/tree_kv.rs b/tests/tree_kv.rs index efb6ef8f..ad4dd43b 100644 --- a/tests/tree_kv.rs +++ b/tests/tree_kv.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -8,51 +8,51 @@ fn tree_first_last_kv() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; - assert!(tree.is_empty(None, None)?); - assert_eq!(tree.first_key_value(None, None)?, None); - assert_eq!(tree.last_key_value(None, None)?, None); + assert!(tree.is_empty(SeqNo::MAX, None)?); + assert_eq!(tree.first_key_value(SeqNo::MAX, None)?, None); + assert_eq!(tree.last_key_value(SeqNo::MAX, None)?, None); tree.insert("b", "b", 0); - assert_eq!(b"b", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"b", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"b", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"b", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); tree.flush_active_memtable(0)?; - assert_eq!(b"b", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"b", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"b", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"b", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); } { let tree = Config::new(&folder).open()?; - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); - assert_eq!(b"b", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"b", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"b", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"b", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); tree.insert("a", "a", 0); - assert_eq!(2, tree.len(None, None)?); + assert_eq!(2, tree.len(SeqNo::MAX, None)?); - assert_eq!(b"a", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"b", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"a", 
&*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"b", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); tree.insert("c", "c", 0); - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); - assert_eq!(b"a", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"c", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"a", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"c", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); tree.flush_active_memtable(0)?; - assert_eq!(b"a", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"c", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"a", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"c", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); } { let tree = Config::new(&folder).open()?; - assert_eq!(3, tree.len(None, None)?); + assert_eq!(3, tree.len(SeqNo::MAX, None)?); - assert_eq!(b"a", &*tree.first_key_value(None, None)?.unwrap().0); - assert_eq!(b"c", &*tree.last_key_value(None, None)?.unwrap().0); + assert_eq!(b"a", &*tree.first_key_value(SeqNo::MAX, None)?.unwrap().0); + assert_eq!(b"c", &*tree.last_key_value(SeqNo::MAX, None)?.unwrap().0); } Ok(()) diff --git a/tests/tree_l0_point_read.rs b/tests/tree_l0_point_read.rs index 7a4f2efd..152876e8 100644 --- a/tests/tree_l0_point_read.rs +++ b/tests/tree_l0_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -31,13 +31,13 @@ fn tree_l0_point_read() -> lsm_tree::Result<()> { tree.insert("g", "g", 3); tree.flush_active_memtable(0)?; - assert_eq!(b"A", &*tree.get("a", None)?.unwrap()); - assert_eq!(b"B", &*tree.get("b", None)?.unwrap()); - assert_eq!(b"C", &*tree.get("c", None)?.unwrap()); - assert_eq!(b"d", &*tree.get("d", None)?.unwrap()); - assert_eq!(b"e", &*tree.get("e", None)?.unwrap()); - assert_eq!(b"f", &*tree.get("f", None)?.unwrap()); - assert_eq!(b"g", &*tree.get("g", None)?.unwrap()); + assert_eq!(b"A", &*tree.get("a", SeqNo::MAX)?.unwrap()); + assert_eq!(b"B", &*tree.get("b", SeqNo::MAX)?.unwrap()); + assert_eq!(b"C", &*tree.get("c", SeqNo::MAX)?.unwrap()); + assert_eq!(b"d", &*tree.get("d", SeqNo::MAX)?.unwrap()); + assert_eq!(b"e", &*tree.get("e", SeqNo::MAX)?.unwrap()); + assert_eq!(b"f", &*tree.get("f", SeqNo::MAX)?.unwrap()); + assert_eq!(b"g", &*tree.get("g", SeqNo::MAX)?.unwrap()); Ok(()) } diff --git a/tests/tree_l0_range.rs b/tests/tree_l0_range.rs index 22d656a4..25dd7abf 100644 --- a/tests/tree_l0_range.rs +++ b/tests/tree_l0_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, Guard, SeqNo}; use test_log::test; #[test] @@ -31,15 +31,15 @@ fn tree_l0_range() -> lsm_tree::Result<()> { tree.insert("g", "g", 3); tree.flush_active_memtable(0)?; - let mut range = tree.range("c"..="e", None, None); - assert_eq!(b"C", &*range.next().unwrap().unwrap().1); - assert_eq!(b"d", &*range.next().unwrap().unwrap().1); - assert_eq!(b"e", &*range.next().unwrap().unwrap().1); + let mut range = tree.range("c"..="e", SeqNo::MAX, None); + assert_eq!(b"C", &*range.next().unwrap().value()?); + assert_eq!(b"d", &*range.next().unwrap().value()?); + assert_eq!(b"e", &*range.next().unwrap().value()?); assert!(range.next().is_none()); - let mut range = tree.range("f"..="g", None, None).rev(); - assert_eq!(b"g", &*range.next().unwrap().unwrap().1); - assert_eq!(b"f", &*range.next().unwrap().unwrap().1); + let mut range = tree.range("f"..="g", SeqNo::MAX, 
None).rev(); + assert_eq!(b"g", &*range.next().unwrap().value()?); + assert_eq!(b"f", &*range.next().unwrap().value()?); assert!(range.next().is_none()); Ok(()) diff --git a/tests/tree_mvcc_simple.rs b/tests/tree_mvcc_simple.rs index ea935db6..0e468c3c 100644 --- a/tests/tree_mvcc_simple.rs +++ b/tests/tree_mvcc_simple.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; #[test] @@ -22,103 +22,107 @@ fn tree_read_mvcc() -> lsm_tree::Result<()> { tree.insert("a", "a5", 5); - assert_eq!(&*tree.get("a", None)?.unwrap(), b"a5"); - assert_eq!(&*tree.get("b", None)?.unwrap(), b"b3"); - assert_eq!(&*tree.get("c", None)?.unwrap(), b"c4"); - - let snapshot = tree.snapshot(1); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a0"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b0"); - assert!(snapshot.get("c")?.is_none()); - - let snapshot = tree.snapshot(2); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b1"); - assert!(snapshot.get("c")?.is_none()); - - let snapshot = tree.snapshot(3); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b2"); - assert!(snapshot.get("c")?.is_none()); - - let snapshot = tree.snapshot(4); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); - assert!(snapshot.get("c")?.is_none()); - - let snapshot = tree.snapshot(5); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); - assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - - let snapshot = tree.snapshot(6); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a5"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); - assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - - let snapshot = tree.snapshot(100); - assert_eq!(&*snapshot.get("a")?.unwrap(), b"a5"); - assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); - assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - - let mut iter = tree.iter(None, None); - - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a5"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); - assert!(iter.next().is_none()); + assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a5"); + assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b3"); + assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), b"c4"); - let snapshot = tree.snapshot(1); - let mut iter = snapshot.iter(); + // TODO: test snapshot reads - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a0"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b0"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(1); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a0"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b0"); + // assert!(snapshot.get("c")?.is_none()); - let snapshot = tree.snapshot(2); - let mut iter = snapshot.iter(); + // let snapshot = tree.snapshot(2); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b1"); + // assert!(snapshot.get("c")?.is_none()); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b1"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(3); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b2"); + // assert!(snapshot.get("c")?.is_none()); - let snapshot = tree.snapshot(3); - let mut iter = 
snapshot.iter(); + // let snapshot = tree.snapshot(4); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); + // assert!(snapshot.get("c")?.is_none()); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b2"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(5); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a1"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); + // assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - let snapshot = tree.snapshot(4); - let mut iter = snapshot.iter(); + // let snapshot = tree.snapshot(6); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a5"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); + // assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(100); + // assert_eq!(&*snapshot.get("a")?.unwrap(), b"a5"); + // assert_eq!(&*snapshot.get("b")?.unwrap(), b"b3"); + // assert_eq!(&*snapshot.get("c")?.unwrap(), b"c4"); - let snapshot = tree.snapshot(5); - let mut iter = snapshot.iter(); + let mut iter = tree.iter(SeqNo::MAX, None); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); + assert_eq!(&*iter.next().unwrap().value()?, b"a5"); + assert_eq!(&*iter.next().unwrap().value()?, b"b3"); + assert_eq!(&*iter.next().unwrap().value()?, b"c4"); assert!(iter.next().is_none()); - let snapshot = tree.snapshot(6); - let mut iter = snapshot.iter(); + // TODO: test snapshot reads - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a5"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(1); + // let mut iter = snapshot.iter(); - let snapshot = tree.snapshot(100); - let mut iter = snapshot.iter(); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a0"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b0"); + // assert!(iter.next().is_none()); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"a5"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); - assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); - assert!(iter.next().is_none()); + // let snapshot = tree.snapshot(2); + // let mut iter = snapshot.iter(); + + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b1"); + // assert!(iter.next().is_none()); + + // let snapshot = tree.snapshot(3); + // let mut iter = snapshot.iter(); + + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b2"); + // assert!(iter.next().is_none()); + + // let snapshot = tree.snapshot(4); + // let mut iter = snapshot.iter(); + + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); + // assert!(iter.next().is_none()); + + // let snapshot = tree.snapshot(5); + // let mut iter = snapshot.iter(); + + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a1"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); + // assert!(iter.next().is_none()); + + // let snapshot = tree.snapshot(6); + // let mut iter = snapshot.iter(); + + // 
assert_eq!(&*iter.next().unwrap().unwrap().1, b"a5"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); + // assert!(iter.next().is_none()); + + // let snapshot = tree.snapshot(100); + // let mut iter = snapshot.iter(); + + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"a5"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"b3"); + // assert_eq!(&*iter.next().unwrap().unwrap().1, b"c4"); + // assert!(iter.next().is_none()); Ok(()) } diff --git a/tests/tree_non_disjoint_point_read.rs b/tests/tree_non_disjoint_point_read.rs index 41a17e31..55f49858 100644 --- a/tests/tree_non_disjoint_point_read.rs +++ b/tests/tree_non_disjoint_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -44,12 +44,12 @@ fn tree_non_disjoint_point_read() -> lsm_tree::Result<()> { tree.insert("z", "z", 0); tree.flush_active_memtable(0)?; - tree.get("c", None).unwrap().unwrap(); - tree.get("d", None).unwrap().unwrap(); - tree.get("e", None).unwrap().unwrap(); - tree.get("f", None).unwrap().unwrap(); - tree.get("g", None).unwrap().unwrap(); - tree.get("h", None).unwrap().unwrap(); + tree.get("c", SeqNo::MAX).unwrap().unwrap(); + tree.get("d", SeqNo::MAX).unwrap().unwrap(); + tree.get("e", SeqNo::MAX).unwrap().unwrap(); + tree.get("f", SeqNo::MAX).unwrap().unwrap(); + tree.get("g", SeqNo::MAX).unwrap().unwrap(); + tree.get("h", SeqNo::MAX).unwrap().unwrap(); Ok(()) } diff --git a/tests/tree_range.rs b/tests/tree_range.rs index dc99ec3d..17cd8b39 100644 --- a/tests/tree_range.rs +++ b/tests/tree_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -17,14 +17,14 @@ fn tree_range_count() -> lsm_tree::Result<()> { tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 4); tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 5); - assert_eq!(2, tree.range("a"..="f", None, None).count()); - assert_eq!(2, tree.range("f"..="g", None, None).count()); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); + assert_eq!(2, tree.range("f"..="g", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() @@ -32,12 +32,12 @@ fn tree_range_count() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(2, tree.range("a"..="f", None, None).count(),); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() @@ -47,12 +47,12 @@ fn tree_range_count() -> lsm_tree::Result<()> { tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 7); tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 8); - assert_eq!(2, tree.range("a"..="f", None, None).count()); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() @@ -62,6 +62,7 @@ fn tree_range_count() -> lsm_tree::Result<()> { } #[test] +#[ignore = "restore"] fn blob_tree_range_count() -> lsm_tree::Result<()> { use std::ops::Bound::{self, Excluded, Unbounded}; @@ -77,14 +78,14 @@ fn blob_tree_range_count() -> lsm_tree::Result<()> { tree.insert("f".as_bytes(), b"neptune!".repeat(128_000), 4); tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 5); - 
assert_eq!(2, tree.range("a"..="f", None, None).count()); - assert_eq!(2, tree.range("f"..="g", None, None).count()); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); + assert_eq!(2, tree.range("f"..="g", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() @@ -92,12 +93,12 @@ fn blob_tree_range_count() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(2, tree.range("a"..="f", None, None).count()); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() @@ -107,12 +108,12 @@ fn blob_tree_range_count() -> lsm_tree::Result<()> { tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 7); tree.insert("g".as_bytes(), nanoid::nanoid!().as_bytes(), 8); - assert_eq!(2, tree.range("a"..="f", None, None).count()); + assert_eq!(2, tree.range("a"..="f", SeqNo::MAX, None).count()); assert_eq!( 1, tree.range::, (Bound>, Bound>)>( (Excluded("f".into()), Unbounded), - None, + SeqNo::MAX, None ) .count() diff --git a/tests/tree_range_memtable_only.rs b/tests/tree_range_memtable_only.rs index a78e4065..7e080184 100644 --- a/tests/tree_range_memtable_only.rs +++ b/tests/tree_range_memtable_only.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; #[test] @@ -12,137 +12,62 @@ fn tree_range_memtable_only() -> lsm_tree::Result<()> { tree.insert("c", "", 0); let found: Vec = tree - .range("a".."a", None, None) - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .range("a".."a", SeqNo::MAX, None) + .flat_map(|x| x.key()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(Vec::::new(), found); let found = tree - .range("a"..="a", None, None) - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .range("a"..="a", SeqNo::MAX, None) + .flat_map(|x| x.key()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(vec!["a"], found); let found = tree - .range("a".."b", None, None) - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .range("a".."b", SeqNo::MAX, None) + .flat_map(|x| x.key()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(vec!["a"], found); let found = tree - .range("a"..="b", None, None) - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .range("a"..="b", SeqNo::MAX, None) + .flat_map(|x| x.key()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(vec!["a", "b"], found); let found = tree - .range("a".."a", None, None) - .flatten() + .range("a".."a", SeqNo::MAX, None) + .flat_map(|x| x.key()) .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(Vec::::new(), found); let found = tree - .range("a"..="a", None, None) - .flatten() + .range("a"..="a", SeqNo::MAX, None) + .flat_map(|x| x.key()) .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(vec!["a"], found); let found = tree - .range("a".."b", None, None) - .flatten() + .range("a".."b", SeqNo::MAX, None) + .flat_map(|x| x.key()) .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); 
assert_eq!(vec!["a"], found); let found = tree - .range("a"..="b", None, None) - .flatten() + .range("a"..="b", SeqNo::MAX, None) + .flat_map(|x| x.key()) .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["b", "a"], found); - - Ok(()) -} - -#[test] -fn tree_snapshot_range_memtable_only() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open()?; - - tree.insert("a", "", 5); - tree.insert("b", "", 5); - tree.insert("c", "", 5); - - let snapshot = tree.snapshot(100); - - let found = snapshot - .range("a".."a") - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(Vec::::new(), found); - - let found = snapshot - .range("a"..="a") - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["a"], found); - - let found = snapshot - .range("a".."b") - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["a"], found); - - let found = snapshot - .range("a"..="b") - .flatten() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["a", "b"], found); - - let found = snapshot - .range("a".."a") - .flatten() - .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(Vec::::new(), found); - - let found = snapshot - .range("a"..="a") - .flatten() - .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["a"], found); - - let found = snapshot - .range("a".."b") - .flatten() - .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) - .collect::>(); - assert_eq!(vec!["a"], found); - - let found = snapshot - .range("a"..="b") - .flatten() - .rev() - .map(|(k, _)| String::from_utf8(k.to_vec()).unwrap()) + .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); assert_eq!(vec!["b", "a"], found); diff --git a/tests/tree_recover_large_value.rs b/tests/tree_recover_large_value.rs index fe6b15f7..e2c4810f 100644 --- a/tests/tree_recover_large_value.rs +++ b/tests/tree_recover_large_value.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -14,7 +14,7 @@ fn tree_recover_large_value() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; assert_eq!( - &*tree.get("a", None)?.expect("should exist"), + &*tree.get("a", SeqNo::MAX)?.expect("should exist"), "a".repeat(100_000).as_bytes() ); } diff --git a/tests/tree_reload.rs b/tests/tree_reload.rs index 690f3efa..c76917fd 100644 --- a/tests/tree_reload.rs +++ b/tests/tree_reload.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter, TreeType}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, SequenceNumberCounter, TreeType}; use test_log::test; const ITEM_COUNT: usize = 10_000; @@ -15,13 +15,13 @@ fn tree_reload_smoke_test() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert!(tree.contains_key("a", None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); } { let tree = Config::new(&folder).open()?; assert_eq!(1, tree.segment_count()); - assert!(tree.contains_key("a", None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); } Ok(()) @@ -34,18 +34,30 @@ fn tree_reload_empty() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); 
- assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), + 0 + ); assert_eq!(tree.tree_type(), TreeType::Standard); } { let tree = Config::new(&folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); - assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), + 0 + ); assert_eq!(tree.tree_type(), TreeType::Standard); tree.flush_active_memtable(0)?; @@ -54,9 +66,15 @@ fn tree_reload_empty() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; - assert_eq!(tree.len(None, None)?, 0); - assert_eq!(tree.iter(None, None).flatten().count(), 0); - assert_eq!(tree.iter(None, None).rev().flatten().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), + 0 + ); assert_eq!(tree.tree_type(), TreeType::Standard); } @@ -86,10 +104,16 @@ fn tree_reload() -> lsm_tree::Result<()> { tree.insert(key, value.as_bytes(), seqno.next()); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(tree.iter(None, None).flatten().count(), ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.iter(None, None).rev().flatten().count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + ITEM_COUNT * 2 + ); + assert_eq!( + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT * 2 ); @@ -99,10 +123,16 @@ fn tree_reload() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); - assert_eq!(tree.iter(None, None).flatten().count(), ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); + assert_eq!( + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + ITEM_COUNT * 2 + ); assert_eq!( - tree.iter(None, None).rev().flatten().count(), + tree.iter(SeqNo::MAX, None) + .rev() + .flat_map(|x| x.key()) + .count(), ITEM_COUNT * 2 ); } diff --git a/tests/tree_reload_pwd.rs b/tests/tree_reload_pwd.rs index 2b5dc4b5..5ae7f991 100644 --- a/tests/tree_reload_pwd.rs +++ b/tests/tree_reload_pwd.rs @@ -1,5 +1,5 @@ use fs_extra::dir::CopyOptions; -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 10_000; @@ -21,7 +21,7 @@ fn tree_reload_pwd() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } let folder_new = tempfile::tempdir()?; @@ -37,7 +37,7 @@ fn tree_reload_pwd() -> lsm_tree::Result<()> { { let tree = Config::new(&folder_new_subfolder).open()?; - assert_eq!(ITEM_COUNT, tree.len(None, None)?); + assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } Ok(()) diff --git a/tests/tree_sealed_shadowing.rs b/tests/tree_sealed_shadowing.rs index 532fae4d..9bc83887 100644 --- a/tests/tree_sealed_shadowing.rs +++ b/tests/tree_sealed_shadowing.rs @@ -1,4 +1,4 @@ 
-use lsm_tree::{AbstractTree, Config}; +use lsm_tree::{AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -9,24 +9,24 @@ fn tree_sealed_memtable_tombstone_shadowing() -> lsm_tree::Result<()> { let tree = Config::new(path).open()?; tree.insert("a", "123", 0); - assert!(tree.contains_key("a", None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); tree.flush_active_memtable(0)?; tree.remove("a", 1); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); let (id, memtable) = tree.rotate_memtable().unwrap(); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); let segment = tree.flush_memtable(id, &memtable, 0)?.unwrap(); tree.register_segments(&[segment], 0)?; - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); tree.major_compact(u64::MAX, 2)?; - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); Ok(()) } diff --git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index 58a37894..af2df223 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -10,24 +10,24 @@ fn tree_shadowing_upsert() -> lsm_tree::Result<()> { let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); - assert_eq!(tree.len(None, None)?, 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); tree.insert(key, value, 0); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); let value = "newvalue".as_bytes(); tree.insert(key, value, 1); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); Ok(()) } @@ -41,24 +41,24 @@ fn tree_shadowing_upsert_blob() -> lsm_tree::Result<()> { let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); - assert_eq!(tree.len(None, None)?, 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); tree.insert(key, value, 0); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); let value = "newvalue".as_bytes(); tree.insert(key, value, 1); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - 
assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); Ok(()) } @@ -72,22 +72,22 @@ fn tree_shadowing_delete() -> lsm_tree::Result<()> { let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); - assert_eq!(tree.len(None, None)?, 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); tree.insert(key, value, 0); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.remove(key, 1); - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.get(key, None)?.is_none()); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.get(key, SeqNo::MAX)?.is_none()); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.get(key, None)?.is_none()); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.get(key, SeqNo::MAX)?.is_none()); Ok(()) } @@ -101,22 +101,22 @@ fn tree_shadowing_delete_blob() -> lsm_tree::Result<()> { let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); - assert_eq!(tree.len(None, None)?, 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); tree.insert(key, value, 0); - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 1); - assert_eq!(tree.get(key, None)?, Some(value.into())); + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + assert_eq!(tree.get(key, SeqNo::MAX)?, Some(value.into())); tree.remove(key, 1); - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.get(key, None)?.is_none()); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.get(key, SeqNo::MAX)?.is_none()); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, 0); - assert!(tree.get(key, None)?.is_none()); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert!(tree.get(key, SeqNo::MAX)?.is_none()); Ok(()) } @@ -139,10 +139,10 @@ fn tree_shadowing_range() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "old".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "old".as_bytes())); for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); @@ -150,17 +150,17 @@ fn tree_shadowing_range() -> lsm_tree::Result<()> { tree.insert(key, value, seqno.next()); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); Ok(()) } @@ -183,10 +183,10 @@ fn 
tree_shadowing_range_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "old".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "old".as_bytes())); for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); @@ -194,17 +194,17 @@ fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { tree.insert(key, value, seqno.next()); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); Ok(()) } @@ -229,18 +229,18 @@ fn tree_shadowing_prefix() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "old".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "old".as_bytes())); for x in 0..ITEM_COUNT as u64 { let value = "new".as_bytes(); @@ -250,33 +250,33 @@ fn tree_shadowing_prefix() -> lsm_tree::Result<()> { tree.insert(format!("prefix:{x}").as_bytes(), value, batch_seqno); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); Ok(()) } @@ -301,18 +301,18 @@ fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - 
tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "old".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "old".as_bytes())); for x in 0..ITEM_COUNT as u64 { let value = "new".as_bytes(); @@ -322,33 +322,33 @@ fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { tree.insert(format!("prefix:{x}").as_bytes(), value, batch_seqno); } - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); tree.flush_active_memtable(0)?; - assert_eq!(tree.len(None, None)?, ITEM_COUNT * 2); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("pre".as_bytes(), None, None).count(), + tree.prefix("pre".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT * 2 ); assert_eq!( - tree.prefix("prefix".as_bytes(), None, None).count(), + tree.prefix("prefix".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert!(tree - .iter(None, None) - .all(|x| &*x.unwrap().1 == "new".as_bytes())); + .iter(SeqNo::MAX, None) + .all(|x| &*x.value().unwrap() == "new".as_bytes())); Ok(()) } diff --git a/tests/tree_weak_delete.rs b/tests/tree_weak_delete.rs index f07bf92c..6a692fc2 100644 --- a/tests/tree_weak_delete.rs +++ b/tests/tree_weak_delete.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -9,20 +9,20 @@ fn tree_weak_delete_simple() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); - assert_eq!(1, tree.len(None, None)?); - assert!(tree.contains_key("a", None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); tree.remove_weak("a", 1); - assert_eq!(0, tree.len(None, None)?); - assert!(!tree.contains_key("a", None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); tree.insert("a", "new", 2); - assert_eq!(1, tree.len(None, None)?); - assert!(tree.contains_key("a", None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); tree.remove_weak("a", 3); - assert_eq!(0, tree.len(None, None)?); - assert!(!tree.contains_key("a", None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); Ok(()) } @@ -35,14 +35,14 @@ fn tree_weak_delete_flush() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); tree.remove_weak("a", 1); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -55,16 +55,16 @@ fn tree_weak_delete_semi_flush() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); 
- assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); tree.remove_weak("a", 1); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(0)?; assert_eq!(2, tree.segment_count()); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } @@ -77,14 +77,14 @@ fn tree_weak_delete_flush_point_read() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); - assert!(tree.contains_key("a", None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); tree.remove_weak("a", 1); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); Ok(()) } @@ -97,16 +97,16 @@ fn tree_weak_delete_semi_flush_point_read() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); - assert!(tree.contains_key("a", None)?); + assert!(tree.contains_key("a", SeqNo::MAX)?); tree.flush_active_memtable(0)?; assert_eq!(1, tree.segment_count()); tree.remove_weak("a", 1); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); tree.flush_active_memtable(0)?; assert_eq!(2, tree.segment_count()); - assert!(!tree.contains_key("a", None)?); + assert!(!tree.contains_key("a", SeqNo::MAX)?); Ok(()) } @@ -119,13 +119,13 @@ fn tree_weak_delete_resurrection() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path).open()?; tree.insert("a", "old", 0); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); tree.insert("a", "new", 1); - assert_eq!(1, tree.len(None, None)?); + assert_eq!(1, tree.len(SeqNo::MAX, None)?); tree.remove_weak("a", 2); - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); Ok(()) } diff --git a/tests/tree_weak_delete_eviction.rs b/tests/tree_weak_delete_eviction.rs index ddd0507f..b9b2cc10 100644 --- a/tests/tree_weak_delete_eviction.rs +++ b/tests/tree_weak_delete_eviction.rs @@ -16,7 +16,7 @@ fn tree_weak_remove_flush_eviction() -> lsm_tree::Result<()> { tree.remove_weak(c, idx as SeqNo + 1000); } - assert_eq!(0, tree.len(None, None)?); + assert_eq!(0, tree.len(SeqNo::MAX, None)?); tree.flush_active_memtable(1_100)?; diff --git a/tests/tree_weak_delete_queue.rs b/tests/tree_weak_delete_queue.rs index 162b25de..32211d12 100644 --- a/tests/tree_weak_delete_queue.rs +++ b/tests/tree_weak_delete_queue.rs @@ -1,4 +1,4 @@ -use lsm_tree::AbstractTree; +use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] @@ -13,22 +13,37 @@ fn tree_weak_delete_queue() -> lsm_tree::Result<()> { tree.insert("c", "c", 0); tree.insert("d", "d", 0); tree.insert("e", "e", 0); - assert_eq!(b"a", &*tree.first_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"a", + &*tree.first_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("a", 1); - assert_eq!(b"b", &*tree.first_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"b", + &*tree.first_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("b", 1); - assert_eq!(b"c", &*tree.first_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"c", + &*tree.first_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("c", 1); - assert_eq!(b"d", 
&*tree.first_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"d", + &*tree.first_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("d", 1); - assert_eq!(b"e", &*tree.first_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"e", + &*tree.first_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("e", 1); - assert!(tree.is_empty(None, None)?); + assert!(tree.is_empty(SeqNo::MAX, None)?); Ok(()) } @@ -45,22 +60,37 @@ fn tree_weak_delete_queue_reverse() -> lsm_tree::Result<()> { tree.insert("c", "c", 0); tree.insert("d", "d", 0); tree.insert("e", "e", 0); - assert_eq!(b"e", &*tree.last_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"e", + &*tree.last_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("e", 1); - assert_eq!(b"d", &*tree.last_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"d", + &*tree.last_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("d", 1); - assert_eq!(b"c", &*tree.last_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"c", + &*tree.last_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("c", 1); - assert_eq!(b"b", &*tree.last_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"b", + &*tree.last_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("b", 1); - assert_eq!(b"a", &*tree.last_key_value(None, None).unwrap().unwrap().0); + assert_eq!( + b"a", + &*tree.last_key_value(SeqNo::MAX, None).unwrap().unwrap().0 + ); tree.remove_weak("a", 1); - assert!(tree.is_empty(None, None)?); + assert!(tree.is_empty(SeqNo::MAX, None)?); Ok(()) } From 2a854511bbfacb76965b75e8af1e1407d93b7815 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:05:04 +0200 Subject: [PATCH 360/613] remove snapshot.rs --- src/snapshot.rs | 393 ------------------------------------------------ 1 file changed, 393 deletions(-) delete mode 100644 src/snapshot.rs diff --git a/src/snapshot.rs b/src/snapshot.rs deleted file mode 100644 index 5d8348fc..00000000 --- a/src/snapshot.rs +++ /dev/null @@ -1,393 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{ - value::{SeqNo, UserKey, UserValue}, - AbstractTree, AnyTree, KvPair, -}; -use std::ops::RangeBounds; - -/// A snapshot captures a read-only point-in-time view of the tree at the time the snapshot was created -/// -/// As long as the snapshot is open, old versions of objects will not be evicted as to -/// keep the snapshot consistent. Thus, snapshots should only be kept around for as little as possible. -/// -/// Snapshots do not persist across restarts. -#[derive(Clone)] -pub struct Snapshot { - tree: AnyTree, - - #[doc(hidden)] - pub seqno: SeqNo, -} - -impl Snapshot { - /// Creates a snapshot - pub(crate) fn new(tree: AnyTree, seqno: SeqNo) -> Self { - log::trace!("Opening snapshot with seqno: {seqno}"); - Self { tree, seqno } - } - - /// Retrieves an item from the snapshot. 
- /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// let snapshot = tree.snapshot(0); - /// - /// tree.insert("a", "my_value", 0); - /// - /// let len = snapshot.size_of("a")?; - /// assert_eq!(None, len); - /// - /// let snapshot = tree.snapshot(1); - /// - /// let len = snapshot.size_of("a")?.unwrap_or_default(); - /// assert_eq!("my_value".len() as u32, len); - /// - /// let len = snapshot.size_of("b")?.unwrap_or_default(); - /// assert_eq!(0, len); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn size_of<K: AsRef<[u8]>>(&self, key: K) -> crate::Result<Option<u32>> { - self.tree.size_of(key, Some(self.seqno)) - } - - /// Retrieves an item from the snapshot. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// let snapshot = tree.snapshot(0); - /// - /// tree.insert("a", "my_value", 0); - /// - /// let item = snapshot.get("a")?; - /// assert_eq!(None, item); - /// - /// let snapshot = tree.snapshot(1); - /// - /// let item = snapshot.get("a")?.unwrap(); - /// assert_eq!(b"my_value", &*item); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn get<K: AsRef<[u8]>>(&self, key: K) -> crate::Result<Option<UserValue>> { - self.tree.get(key, Some(self.seqno)) - } - - /// Returns an iterator that scans through the entire snapshot. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("g", "abc", 2); - /// - /// assert_eq!(2, snapshot.iter().count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - #[must_use] - pub fn iter(&self) -> impl DoubleEndedIterator<Item = crate::Result<KvPair>> + 'static { - self.tree.iter(Some(self.seqno), None) - } - - /// Returns an iterator that scans through the entire snapshot, returning keys only. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("g", "abc", 2); - /// - /// assert_eq!(2, snapshot.keys().count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - #[must_use] - pub fn keys(&self) -> impl DoubleEndedIterator<Item = crate::Result<UserKey>> + 'static { - self.tree.keys(Some(self.seqno), None) - } - - /// Returns an iterator that scans through the entire snapshot, returning values only. - /// - /// Avoid using this function, or limit it as otherwise it may scan a lot of items.
- /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("f", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("g", "abc", 2); - /// - /// assert_eq!(2, snapshot.values().count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - #[must_use] - pub fn values(&self) -> impl DoubleEndedIterator<Item = crate::Result<UserValue>> + 'static { - self.tree.values(Some(self.seqno), None) - } - - /// Returns an iterator over a range of items in the snapshot. - /// - /// Avoid using full or unbounded ranges as they may scan a lot of items (unless limited). - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// let snapshot = tree.snapshot(1); - /// - /// tree.insert("f", "abc", 1); - /// tree.insert("g", "abc", 2); - /// - /// assert_eq!(1, snapshot.range("a"..="f").count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - pub fn range<K: AsRef<[u8]>, R: RangeBounds<K>>( - &self, - range: R, - ) -> impl DoubleEndedIterator<Item = crate::Result<KvPair>> + 'static { - self.tree.range(range, Some(self.seqno), None) - } - - /// Returns an iterator over a prefixed set of items in the snapshot. - /// - /// Avoid using an empty prefix as it may scan a lot of items (unless limited). - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("a", "abc", 0); - /// tree.insert("ab", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("abc", "abc", 2); - /// - /// assert_eq!(2, snapshot.prefix("a").count()); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - pub fn prefix<K: AsRef<[u8]>>( - &self, - prefix: K, - ) -> impl DoubleEndedIterator<Item = crate::Result<KvPair>> + 'static { - self.tree.prefix(prefix, Some(self.seqno), None) - } - - /// Returns the first key-value pair in the snapshot. - /// The key in this pair is the minimum key in the snapshot. - /// - /// # Examples - /// - /// ``` - /// # use lsm_tree::Error as TreeError; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// # let folder = tempfile::tempdir()?; - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("5", "abc", 0); - /// tree.insert("3", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("1", "abc", 2); - /// - /// let (key, _) = snapshot.first_key_value()?.expect("item should exist"); - /// assert_eq!(&*key, "3".as_bytes()); - /// # - /// # Ok::<(), TreeError>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn first_key_value(&self) -> crate::Result<Option<KvPair>> { - self.iter().next().transpose() - } - - /// Returns the last key-value pair in the snapshot. - /// The key in this pair is the maximum key in the snapshot.
- /// - /// # Examples - /// - /// ``` - /// # use lsm_tree::Error as TreeError; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// # let folder = tempfile::tempdir()?; - /// let tree = Config::new(folder).open()?; - /// - /// tree.insert("1", "abc", 0); - /// tree.insert("3", "abc", 1); - /// let snapshot = tree.snapshot(2); - /// - /// tree.insert("5", "abc", 2); - /// - /// let (key, _) = snapshot.last_key_value()?.expect("item should exist"); - /// assert_eq!(&*key, "3".as_bytes()); - /// # - /// # Ok::<(), TreeError>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn last_key_value(&self) -> crate::Result<Option<KvPair>> { - self.iter().next_back().transpose() - } - - /// Returns `true` if the snapshot contains the specified key. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// let snapshot = tree.snapshot(0); - /// - /// assert!(!snapshot.contains_key("a")?); - /// - /// tree.insert("a", "abc", 0); - /// assert!(!snapshot.contains_key("a")?); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> crate::Result<bool> { - self.tree.contains_key(key, Some(self.seqno)) - } - - /// Returns `true` if the snapshot is empty. - /// - /// This operation has O(log N) complexity. - /// - /// # Examples - /// - /// ``` - /// # let folder = tempfile::tempdir()?; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// let tree = Config::new(folder).open()?; - /// let snapshot = tree.snapshot(0); - /// - /// assert!(snapshot.is_empty()?); - /// - /// tree.insert("a", "abc", 0); - /// assert!(snapshot.is_empty()?); - /// # - /// # Ok::<(), lsm_tree::Error>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn is_empty(&self) -> crate::Result<bool> { - self.first_key_value().map(|x| x.is_none()) - } - - /// Scans the entire snapshot, returning the number of items. - /// - /// ###### Caution - /// - /// This operation scans the entire tree: O(n) complexity! - /// - /// Never, under any circumstances, use `.len() == 0` to check - /// if the snapshot is empty, use [`Snapshot::is_empty`] instead. - /// - /// # Examples - /// - /// ``` - /// # use lsm_tree::Error as TreeError; - /// use lsm_tree::{AbstractTree, Config, Tree}; - /// - /// # let folder = tempfile::tempdir()?; - /// let tree = Config::new(folder).open()?; - /// let snapshot = tree.snapshot(0); - /// - /// assert_eq!(snapshot.len()?, 0); - /// tree.insert("1", "abc", 0); - /// tree.insert("3", "abc", 1); - /// tree.insert("5", "abc", 2); - /// assert_eq!(snapshot.len()?, 0); - /// # - /// # Ok::<(), TreeError>(()) - /// ``` - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs.
- pub fn len(&self) -> crate::Result<usize> { - let mut count = 0; - - for item in self.iter() { - let _ = item?; - count += 1; - } - - Ok(count) - } -} From 6dfa30aa5ea388ae2f265e5bf8c1e4f34f81164b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:05:47 +0200 Subject: [PATCH 361/613] fmt --- tests/blob_tree_reload_blob.rs | 29 ++++---------------- tests/experimental_blob_tree_guarded_size.rs | 4 +-- tests/tree_bulk_ingest.rs | 24 ++++------------ tests/tree_count.rs | 12 ++------ tests/tree_delete_loop.rs | 21 ++++---------- 5 files changed, 21 insertions(+), 69 deletions(-) diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 1cf46d45..8f6daaac 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -12,12 +12,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let tree = Config::new(&folder).open_as_blob_tree()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), - 0 - ); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); assert_eq!( tree.iter(SeqNo::MAX, None) .map(|x| x.key()) @@ -33,12 +28,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let tree = Config::new(&folder).open_as_blob_tree()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), - 0 - ); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); assert_eq!( tree.iter(SeqNo::MAX, None) - .map(|x| x.key()) @@ -56,12 +46,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let tree = Config::new(&folder).open_as_blob_tree()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), - 0 - ); + assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); assert_eq!( tree.iter(SeqNo::MAX, None) .map(|x| x.key()) @@ -102,9 +87,7 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT * 2 ); assert_eq!( @@ -123,9 +106,7 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT * 2 ); assert_eq!( diff --git a/tests/experimental_blob_tree_guarded_size.rs b/tests/experimental_blob_tree_guarded_size.rs index 0cc1ef8e..f105d3c4 100644 --- a/tests/experimental_blob_tree_guarded_size.rs +++ b/tests/experimental_blob_tree_guarded_size.rs @@ -12,9 +12,7 @@ fn experimental_blob_tree_guarded_size() -> lsm_tree::Result<()> { assert_eq!( 10_003u32, - tree.iter(SeqNo::MAX, None) - .flat_map(Guard::size) - .sum(), + tree.iter(SeqNo::MAX, None).flat_map(Guard::size).sum(), ); Ok(()) diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index 9de807c5..635379a6 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -18,9 +18,7 @@ fn tree_bulk_ingest() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -48,9 +46,7 @@ fn tree_copy() -> lsm_tree::Result<()> {
assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - src.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -73,9 +69,7 @@ fn tree_copy() -> lsm_tree::Result<()> { assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - dest.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -107,9 +101,7 @@ fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -140,9 +132,7 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - src.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -168,9 +158,7 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - dest.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( diff --git a/tests/tree_count.rs b/tests/tree_count.rs index febb4e1c..d40624ab 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -17,9 +17,7 @@ fn tree_memtable_count() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -49,9 +47,7 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( @@ -81,9 +77,7 @@ fn tree_flushed_count_blob() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( - tree.iter(SeqNo::MAX, None) - .flat_map(|x| x.key()) - .count(), + tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), ITEM_COUNT ); assert_eq!( diff --git a/tests/tree_delete_loop.rs b/tests/tree_delete_loop.rs index 67609012..b087a2bb 100644 --- a/tests/tree_delete_loop.rs +++ b/tests/tree_delete_loop.rs @@ -24,18 +24,15 @@ fn tree_delete_by_prefix() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 3); assert_eq!( - tree.prefix("a:".as_bytes(), SeqNo::MAX, None) - .count(), + tree.prefix("a:".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert_eq!( - tree.prefix("b:".as_bytes(), SeqNo::MAX, None) - .count(), + tree.prefix("b:".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); assert_eq!( - tree.prefix("c:".as_bytes(), SeqNo::MAX, None) - .count(), + tree.prefix("c:".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); @@ -46,18 +43,12 @@ fn tree_delete_by_prefix() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( - tree.prefix("a:".as_bytes(), SeqNo::MAX, None) - .count(), + tree.prefix("a:".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); + assert_eq!(tree.prefix("b:".as_bytes(), SeqNo::MAX, None).count(), 0); assert_eq!( - tree.prefix("b:".as_bytes(), SeqNo::MAX, None) - .count(), - 0 - ); - assert_eq!( - 
tree.prefix("c:".as_bytes(), SeqNo::MAX, None) - .count(), + tree.prefix("c:".as_bytes(), SeqNo::MAX, None).count(), ITEM_COUNT ); From 72f85496bd21a644f274dbeb955698fcdad47c00 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:09:51 +0200 Subject: [PATCH 362/613] dep --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 93248ac4..3e4a36bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ metrics = [] [dependencies] byteorder = "1.5.0" +byteview = { git = "https://github.com/fjall-rs/byteview" } crossbeam-skiplist = "0.1.3" enum_dispatch = "0.3.13" interval-heap = "0.0.5" From eae356dfe214d4113b4e0a88506783340be75be8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:11:00 +0200 Subject: [PATCH 363/613] fix --- src/compaction/worker.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index bdf1e9a4..c01c563f 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -73,6 +73,7 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { log::trace!("Acquiring levels manifest lock"); let original_levels = opts.levels.write().expect("lock is poisoned"); + let start = Instant::now(); log::trace!( "Consulting compaction strategy {:?}", opts.strategy.get_name(), From cc48a76c24eb70537ccb87b3c71312ea4a1f243e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:14:54 +0200 Subject: [PATCH 364/613] fix --- src/segment/block/mod.rs | 4 ++-- tests/blob_simple.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 1f1bf9a5..c2a358d4 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -20,7 +20,7 @@ pub(crate) use trailer::{Trailer, TRAILER_START_MARKER}; use crate::{ coding::{Decode, Encode}, - segment::{BlockHandle, DataBlock}, + segment::BlockHandle, CompressionType, Slice, }; use std::fs::File; @@ -204,7 +204,7 @@ impl Block { .map_err(|_| crate::Error::Decompress(compression))?; } - builder.freeze().into() + builder.freeze() } }; diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 54c0b1ed..29de63fe 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -65,30 +65,30 @@ fn blob_tree_simple_compressed() -> lsm_tree::Result<()> { let big_value = b"neptune!".repeat(128_000); - assert!(tree.get("big", None)?.is_none()); + assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); tree.insert("smol", "small value", 0); - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, big_value); tree.flush_active_memtable(0)?; - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, big_value); - let value = tree.get("smol", None)?.expect("should exist"); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"small value"); let new_big_value = b"winter!".repeat(128_000); tree.insert("big", &new_big_value, 1); - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); tree.flush_active_memtable(0)?; - let value = tree.get("big", None)?.expect("should exist"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); Ok(()) From 
148694344bb530d59a58a38e56bb8428213e2d72 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 11 Sep 2025 00:18:43 +0200 Subject: [PATCH 365/613] fix: kv example --- examples/kv/src/main.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/kv/src/main.rs b/examples/kv/src/main.rs index e61cad5c..03874750 100644 --- a/examples/kv/src/main.rs +++ b/examples/kv/src/main.rs @@ -1,6 +1,6 @@ mod wal; -use lsm_tree::{AbstractTree, Config, InternalValue, SequenceNumberCounter, Tree}; +use lsm_tree::{AbstractTree, Config, InternalValue, SeqNo, SequenceNumberCounter, Tree}; use nanoid::nanoid; use std::{ path::Path, @@ -151,7 +151,7 @@ impl KvStore { } pub fn get<K: AsRef<[u8]>>(&self, key: K) -> lsm_tree::Result<Option<Arc<str>>> { - Ok(self.tree.get(key.as_ref(), None)?.map(|bytes| { + Ok(self.tree.get(key.as_ref(), SeqNo::MAX)?.map(|bytes| { std::str::from_utf8(&bytes) .expect("should be valid utf-8") .into() })) } pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> lsm_tree::Result<bool> { - self.tree.contains_key(key.as_ref(), None) + self.tree.contains_key(key.as_ref(), SeqNo::MAX) } pub fn is_empty(&self) -> lsm_tree::Result<bool> { - self.tree.is_empty(None, None) + self.tree.is_empty(SeqNo::MAX, None) } pub fn len(&self) -> lsm_tree::Result<usize> { - self.tree.len(None, None) + self.tree.len(SeqNo::MAX, None) } } From 94ab74353ee907cecf7f35476d86909ac4e0b2c4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:34:42 +0200 Subject: [PATCH 366/613] doc --- src/compression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compression.rs b/src/compression.rs index 1e8ff579..24f8348f 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -6,7 +6,7 @@ use crate::coding::{Decode, DecodeError, Encode, EncodeError}; use byteorder::{ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; -/// Compression algorithm to use. +/// Compression algorithm to use #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] pub enum CompressionType { From dbcdcea8e95fa47b19171134ada98201742a3a62 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:35:34 +0200 Subject: [PATCH 367/613] doc --- src/lib.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0411c38a..0130a84c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,15 +19,15 @@ //! on disk and perform fast lookup queries. //! Instead of updating a disk-based data structure in-place, //! deltas (inserts and deletes) are added into an in-memory write buffer (`Memtable`). -//! Data is then flushed to disk segments, as the write buffer reaches some threshold. +//! Data is then flushed to disk segments when the write buffer reaches some threshold. //! -//! Amassing many segments on disk will degrade read performance and waste disk space usage, so segments +//! Amassing many segments on disk will degrade read performance and waste disk space, so segments //! can be periodically merged into larger segments in a process called `Compaction`. //! Different compaction strategies have different advantages and drawbacks, and should be chosen based //! on the workload characteristics. //! //! Because maintaining an efficient structure is deferred to the compaction process, writing to an LSMT -//! is very fast (O(1) complexity). +//! is very fast (_O(1)_ complexity). //! //! Keys are limited to 65536 bytes, values are limited to 2^32 bytes. As is normal with any kind of storage
engine, larger keys and values have a bigger performance impact. @@ -61,7 +61,7 @@ //! } //! //! // Iterators implement DoubleEndedIterator, so you can search backwards, too! -//! for item in tree.prefix("prefix", 1, None).rev() { +//! for item in tree.prefix("user1", 1, None).rev() { //! // ... //! } //! @@ -69,7 +69,6 @@ //! // and persisting all in-memory data. //! // Note, this flushes synchronously, which may not be desired //! tree.flush_active_memtable(0)?; -//! assert_eq!(Some("my_value".as_bytes().into()), item); //! //! // When some disk segments have amassed, use compaction //! // to reduce the amount of disk segments @@ -82,8 +81,6 @@ //! //! let version_gc_threshold = 0; //! tree.compact(Arc::new(strategy), version_gc_threshold)?; -//! -//! assert_eq!(Some("my_value".as_bytes().into()), item); //! # //! # Ok::<(), lsm_tree::Error>(()) //! ``` From b76071311deacd5c556105ddf4164d2a713f04e4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:35:45 +0200 Subject: [PATCH 368/613] remove old code --- src/key.rs | 33 -------------- src/value.rs | 120 ++------------------------------------------------- 2 files changed, 3 insertions(+), 150 deletions(-) diff --git a/src/key.rs b/src/key.rs index 24e42032..141c6657 100644 --- a/src/key.rs +++ b/src/key.rs @@ -83,39 +83,6 @@ impl InternalKey { } } -// TODO: 3.0.0 remove -impl Encode for InternalKey { - fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> { - writer.write_u8(u8::from(self.value_type))?; - - writer.write_u64_varint(self.seqno)?; - - // NOTE: Truncation is okay and actually needed - #[allow(clippy::cast_possible_truncation)] - writer.write_u16_varint(self.user_key.len() as u16)?; - writer.write_all(&self.user_key)?; - - Ok(()) - } -} - -// TODO: 3.0.0 remove -impl Decode for InternalKey { - fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError> { - let value_type = reader.read_u8()?; - let value_type = value_type - .try_into() - .map_err(|()| DecodeError::InvalidTag(("ValueType", value_type)))?; - - let seqno = reader.read_u64_varint()?; - - let key_len = reader.read_u16_varint()?; - let key = UserKey::from_reader(reader, key_len.into())?; - - Ok(Self::new(key, seqno, value_type)) - } -} - impl PartialOrd for InternalKey { fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { Some(self.cmp(other)) } } diff --git a/src/value.rs b/src/value.rs index d1573fce..efc190f2 100644 --- a/src/value.rs +++ b/src/value.rs @@ -2,13 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - key::InternalKey, - Slice, -}; -use std::io::{Read, Write}; -use varint_rs::{VarintReader, VarintWriter}; +use crate::{key::InternalKey, Slice}; /// User defined key pub type UserKey = Slice; @@ -22,7 +16,8 @@ pub type UserValue = Slice; /// Values with the same seqno are part of the same batch. /// /// A value with a higher sequence number shadows an item with the -/// same key and lower sequence number. This enables MVCC. +/// same key and lower sequence number. +/// This enables MVCC. /// /// Stale items are lazily garbage-collected during compaction.
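// The shadowing rule above is easiest to see with two writes to the same key at
// different sequence numbers. A minimal sketch (hedged; this snippet is not part
// of the patch itself), using the read API that takes a read seqno, as the tests
// in this series do:
//
//     tree.insert("a", "old", 0);
//     tree.insert("a", "new", 1);
//
//     // A read at SeqNo::MAX sees the newest version...
//     assert_eq!(tree.get("a", SeqNo::MAX)?, Some("new".into()));
//
//     // ...while a read at seqno 1 only sees versions written strictly before 1,
//     // i.e. the shadowed "old" value (matching `tree.snapshot(1)` semantics).
//     assert_eq!(tree.get("a", 1)?, Some("old".into()));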
pub type SeqNo = u64; @@ -170,44 +165,6 @@ impl std::fmt::Debug for InternalValue { } } -// TODO: 3.0.0 remove -impl Encode for InternalValue { - fn encode_into<W: Write>(&self, writer: &mut W) -> Result<(), EncodeError> { - self.key.encode_into(writer)?; - - // NOTE: Only write value len + value if we are actually a value - if !self.is_tombstone() { - // NOTE: We know values are limited to 32-bit length - #[allow(clippy::cast_possible_truncation)] - writer.write_u32_varint(self.value.len() as u32)?; - writer.write_all(&self.value)?; - } - - Ok(()) - } -} - -// TODO: 3.0.0 remove -impl Decode for InternalValue { - fn decode_from<R: Read>(reader: &mut R) -> Result<Self, DecodeError> { - let key = InternalKey::decode_from(reader)?; - - if key.is_tombstone() { - Ok(Self { - key, - value: UserValue::empty(), - }) - } else { - // NOTE: Only read value if we are actually a value - - let value_len = reader.read_u32_varint()?; - let value = UserValue::from_reader(reader, value_len as usize)?; - - Ok(Self { key, value }) - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -226,75 +183,4 @@ mod tests { let b = InternalKey::new(*b"a", 1, ValueType::Value); assert!(a > b); } - - /* #[test] - fn value_raw() -> crate::Result<()> { - // Create an empty Value instance - let value = - InternalValue::from_components(vec![1, 2, 3], vec![3, 2, 1], 1, ValueType::Value); - - #[rustfmt::skip] - let bytes = [ - // Seqno - 1, - - // Type - 0, - - // User key - 3, 1, 2, 3, - - // User value - 3, 3, 2, 1, - ]; - - // Deserialize the empty Value - let deserialized = InternalValue::decode_from(&mut Cursor::new(bytes))?; - - // Check if deserialized Value is equivalent to the original empty Value - assert_eq!(value, deserialized); - - Ok(()) - } */ - - #[test] - fn value_empty_value() -> crate::Result<()> { - // Create an empty Value instance - let value = InternalValue::from_components(vec![1, 2, 3], vec![], 42, ValueType::Value); - - // Serialize the empty Value - let mut serialized = Vec::new(); - value.encode_into(&mut serialized)?; - - // Deserialize the empty Value - let deserialized = InternalValue::decode_from(&mut &serialized[..])?; - - // Check if deserialized Value is equivalent to the original empty Value - assert_eq!(value, deserialized); - - Ok(()) - } - - #[test] - fn value_with_value() -> crate::Result<()> { - // Create an empty Value instance - let value = InternalValue::from_components( - vec![1, 2, 3], - vec![6, 2, 6, 2, 7, 5, 7, 8, 98], - 42, - ValueType::Value, - ); - - // Serialize the empty Value - let mut serialized = Vec::new(); - value.encode_into(&mut serialized)?; - - // Deserialize the empty Value - let deserialized = InternalValue::decode_from(&mut &serialized[..])?; - - // Check if deserialized Value is equivalent to the original empty Value - assert_eq!(value, deserialized); - - Ok(()) - } -} From 91ddefc494267514cd33b39f986766747fd807e1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:36:08 +0200 Subject: [PATCH 369/613] clippy --- src/tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index a96026a2..49754370 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -531,7 +531,7 @@ impl Tree { return Ok(None); }; - log::debug!("Finalized segment write at {segment_file_path:?}"); + log::debug!("Finalized segment write at {}", segment_file_path.display()); /* let block_index = FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; From 417f9c1519f37c4553db78dc04e2e5dfb21c30f5 Mon Sep 17 00:00:00 2001 From: marvin-j97
Date: Sat, 13 Sep 2025 16:36:22 +0200 Subject: [PATCH 370/613] clippy --- src/segment/index_block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 5d261cbc..59068857 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -78,7 +78,7 @@ impl IndexBlock { #[must_use] #[allow(clippy::iter_without_into_iter)] - pub fn iter(&self) -> Iter { + pub fn iter(&self) -> Iter<'_> { Iter::new(Decoder::<KeyedBlockHandle, KeyedBlockHandle>::new( &self.inner, )) From 7eff8b9edd506b0b07ff7925c2a34907ae0d6b96 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:36:46 +0200 Subject: [PATCH 371/613] clippy --- src/tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 49754370..240a01c4 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -833,7 +833,7 @@ impl Tree { use crate::{file::MANIFEST_FILE, stop_signal::StopSignal}; use inner::get_next_tree_id; - log::info!("Recovering LSM-tree at {:?}", config.path); + log::info!("Recovering LSM-tree at {}", config.path.display()); let bytes = std::fs::read(config.path.join(MANIFEST_FILE))?; let mut bytes = Cursor::new(bytes); From bfab67c2447a37bbd2de5421b03feb6031804d8b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:37:34 +0200 Subject: [PATCH 372/613] separate data block compression --- src/blob_tree/mod.rs | 2 +- src/compaction/worker.rs | 3 ++- src/segment/mod.rs | 10 +++++----- src/segment/multi_writer.rs | 21 +++++++++++++++------ src/segment/writer/index.rs | 7 +++++-- src/segment/writer/mod.rs | 31 +++++++++++++++++++++---------- src/tree/ingest.rs | 4 ++-- src/tree/mod.rs | 2 +- 8 files changed, 52 insertions(+), 28 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index a42c7cca..d3de4e30 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -443,7 +443,7 @@ impl AbstractTree for BlobTree { folder: lsm_segment_folder, } */ )? - .use_compression(self.index.config.compression); + .use_data_block_compression(self.index.config.compression); /* segment_writer = segment_writer.use_bloom_policy( crate::segment::writer::BloomConstructionPolicy::FpRate(0.0001), diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index c01c563f..c3e17e85 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -271,7 +271,8 @@ fn merge_segments( }; let mut segment_writer = segment_writer - .use_compression(opts.config.compression) + .use_data_block_restart_interval(16) + .use_data_block_compression(opts.config.compression) .use_data_block_size(opts.config.data_block_size) .use_data_block_hash_ratio(opts.config.data_block_hash_ratio) .use_bloom_policy({ diff --git a/src/segment/mod.rs b/src/segment/mod.rs index ad335037..57c7e91c 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -221,7 +221,7 @@ impl Segment { &IndexBlock::new(self.load_block( &self.regions.tli, BlockType::Index, - self.metadata.data_block_compression, // TODO: maybe index compression + CompressionType::None, // TODO: allow index block compression )?)
} }; @@ -324,7 +324,7 @@ impl Segment { self.load_block( &self.regions.tli, BlockType::Index, - self.metadata.data_block_compression, // TODO: maybe index compression + CompressionType::None, // TODO: allow separate index block compression ) .expect("should load block"), ) @@ -408,8 +408,8 @@ impl Segment { let block = Block::from_file( &file, regions.tli, - crate::segment::block::BlockType::Index, - metadata.data_block_compression, // TODO: index blocks may get their own compression level + BlockType::Index, + CompressionType::None, // TODO: allow setting index block compression )?; IndexBlock::new(block) @@ -437,7 +437,7 @@ impl Segment { Block::from_file( &file, filter_handle, - crate::segment::block::BlockType::Filter, + BlockType::Filter, crate::CompressionType::None, // NOTE: We never write a filter block with compression ) }) diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 86246b3c..47ee960e 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -18,6 +18,7 @@ pub struct MultiWriter { data_block_hash_ratio: f32, data_block_size: u32, + data_block_restart_interval: u8, /// Target size of segments in bytes /// @@ -32,7 +33,7 @@ pub struct MultiWriter { pub writer: Writer, - pub compression: CompressionType, + pub data_block_compression: CompressionType, bloom_policy: BloomConstructionPolicy, @@ -57,6 +58,7 @@ impl MultiWriter { data_block_hash_ratio: 0.0, data_block_size: 4_096, + data_block_restart_interval: 16, target_size, results: Vec::with_capacity(10), @@ -64,7 +66,7 @@ impl MultiWriter { current_segment_id, writer, - compression: CompressionType::None, + data_block_compression: CompressionType::None, bloom_policy: BloomConstructionPolicy::default(), @@ -72,6 +74,13 @@ impl MultiWriter { }) } + #[must_use] + pub fn use_data_block_restart_interval(mut self, interval: u8) -> Self { + self.data_block_restart_interval = interval; + self.writer = self.writer.use_data_block_restart_interval(interval); + self + } + #[must_use] pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { self.data_block_hash_ratio = ratio; @@ -91,9 +100,9 @@ impl MultiWriter { } #[must_use] - pub fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self.writer = self.writer.use_compression(compression); + pub fn use_data_block_compression(mut self, compression: CompressionType) -> Self { + self.data_block_compression = compression; + self.writer = self.writer.use_data_block_compression(compression); self } @@ -120,7 +129,7 @@ impl MultiWriter { let path = self.base_path.join(new_segment_id.to_string()); let new_writer = Writer::new(path, new_segment_id)? 
- .use_compression(self.compression) + .use_data_block_compression(self.data_block_compression) .use_data_block_size(self.data_block_size) .use_bloom_policy(self.bloom_policy) .use_data_block_hash_ratio(self.data_block_hash_ratio); diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index 10757218..a6a541ca 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -23,7 +23,9 @@ pub trait BlockIndexWriter<W: Write + Seek> { block_file_writer: &mut W, ) -> crate::Result<(BlockHandle, Option<BlockHandle>)>; - fn use_compression(&mut self, compression: CompressionType); + fn use_compression(self, compression: CompressionType) -> Self + where + Self: Sized; fn len(&self) -> usize; } @@ -47,8 +49,9 @@ impl<W: Write + Seek> BlockIndexWriter<W> for FullIndexWriter 1 } - fn use_compression(&mut self, compression: CompressionType) { + fn use_compression(mut self, compression: CompressionType) -> Self { self.compression = compression; + self } fn register_data_block(&mut self, block_handle: KeyedBlockHandle) -> crate::Result<()> { diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 56a4386a..6770aea1 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -26,14 +26,14 @@ pub struct Writer { segment_id: SegmentId, - data_block_restart_interval: u8, // TODO: + data_block_restart_interval: u8, data_block_hash_ratio: f32, data_block_size: u32, index_block_size: u32, // TODO: implement - /// Compression to use - compression: CompressionType, + /// Compression to use for data blocks + data_block_compression: CompressionType, /// Buffer to serialize blocks into block_buffer: Vec<u8>, @@ -80,7 +80,7 @@ impl Writer { data_block_size: 4_096, index_block_size: 4_096, - compression: CompressionType::None, + data_block_compression: CompressionType::None, path: std::path::absolute(path)?, @@ -102,6 +102,12 @@ impl Writer { }) } + #[must_use] + pub fn use_data_block_restart_interval(mut self, interval: u8) -> Self { + self.data_block_restart_interval = interval; + self + } + #[must_use] pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { self.data_block_hash_ratio = ratio; @@ -119,9 +125,8 @@ impl Writer { } #[must_use] - pub fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self.index_writer.use_compression(compression); + pub fn use_data_block_compression(mut self, compression: CompressionType) -> Self { + self.data_block_compression = compression; self } @@ -208,7 +213,7 @@ impl Writer { &mut self.block_writer, &self.block_buffer, super::block::BlockType::Data, - self.compression, + self.data_block_compression, )?; self.meta.uncompressed_size += u64::from(header.uncompressed_length); @@ -337,8 +342,14 @@ impl Writer { let meta_items = [ meta("#checksum_type", b"xxh3"), - meta("#compression#data", &self.compression.encode_into_vec()), - meta("#compression#index", &self.compression.encode_into_vec()), + meta( + "#compression#data", + &self.data_block_compression.encode_into_vec(), + ), + meta( + "#compression#index", + &self.data_block_compression.encode_into_vec(), + ), meta("#created_at", &unix_timestamp().as_nanos().to_le_bytes()), meta( "#data_block_count", diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 63ec5932..8ac2a814 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -28,9 +28,9 @@ impl<'a> Ingestion<'a> { tree.segment_id_counter.clone(), 64 * 1_024 * 1_024, // TODO: look at tree configuration )?
- .use_data_block_hash_ratio(tree.config.data_block_hash_ratio) // TODO: use restart interval etc. - .use_compression(tree.config.compression); + .use_data_block_hash_ratio(tree.config.data_block_hash_ratio) + .use_data_block_compression(tree.config.compression); Ok(Self { folder, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 240a01c4..baef0c3a 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -246,7 +246,7 @@ impl AbstractTree for Tree { log::debug!("writing segment to {}", segment_file_path.display()); let mut segment_writer = Writer::new(segment_file_path, segment_id)? - .use_compression(self.config.compression) + .use_data_block_compression(self.config.compression) .use_data_block_size(self.config.data_block_size) .use_data_block_hash_ratio(self.config.data_block_hash_ratio) .use_bloom_policy({ From aeea7ff710e1ee82e4498926ce8cb8691cf0aede Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:37:56 +0200 Subject: [PATCH 373/613] clippy --- src/segment/data_block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 49141308..e253cdc5 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -295,7 +295,7 @@ impl DataBlock { self.inner.size() } - pub(crate) fn get_binary_index_reader(&self) -> BinaryIndexReader { + pub(crate) fn get_binary_index_reader(&self) -> BinaryIndexReader<'_> { let trailer = Trailer::new(&self.inner); let mut reader = trailer.as_slice(); From 13c1106c8984549d161d7fa11eb73ffdb17358e5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:38:39 +0200 Subject: [PATCH 374/613] simplify block decoder --- src/segment/block/decoder.rs | 7 +--- src/segment/block/hash_index/mod.rs | 24 ++++++------- src/segment/block/hash_index/reader.rs | 25 ++------------ src/segment/data_block/iter.rs | 2 +- src/segment/data_block/mod.rs | 48 ++++++++++++++++---------- src/segment/index_block/iter.rs | 2 +- 6 files changed, 47 insertions(+), 61 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 1a324c40..80875b86 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -257,12 +257,7 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Seeks using the given predicate. /// /// Returns `false` if the key does not possible exist. - pub fn seek( - &mut self, - needle: &[u8], - pred: impl Fn(&[u8]) -> bool, - second_partition: bool, - ) -> bool { + pub fn seek(&mut self, pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool { // TODO: make this nicer, maybe predicate that can affect the resulting index...? 
let result = if second_partition { self.partition_point_2(pred) diff --git a/src/segment/block/hash_index/mod.rs b/src/segment/block/hash_index/mod.rs index 50bd43ae..5e0aa26d 100644 --- a/src/segment/block/hash_index/mod.rs +++ b/src/segment/block/hash_index/mod.rs @@ -22,10 +22,10 @@ mod builder; mod reader; pub use builder::{Builder, MAX_POINTERS_FOR_HASH_INDEX}; -pub use reader::{Lookup, Reader}; +pub use reader::Reader; -const MARKER_FREE: u8 = u8::MAX - 1; // 254 -const MARKER_CONFLICT: u8 = u8::MAX; // 255 +pub(crate) const MARKER_FREE: u8 = u8::MAX - 1; // 254 +pub(crate) const MARKER_CONFLICT: u8 = u8::MAX; // 255 // NOTE: We know the hash index has a bucket count <= u32 #[allow(clippy::cast_possible_truncation)] @@ -69,10 +69,10 @@ mod tests { let reader = Reader::new(&bytes, 0, 100); assert_eq!(0, reader.conflict_count()); - assert_eq!(Lookup::Found(5), reader.get(b"a")); - assert_eq!(Lookup::Found(8), reader.get(b"b")); - assert_eq!(Lookup::Found(10), reader.get(b"c")); - assert_eq!(Lookup::NotFound, reader.get(b"d")); + assert_eq!(5, reader.get(b"a")); + assert_eq!(8, reader.get(b"b")); + assert_eq!(10, reader.get(b"c")); + assert_eq!(MARKER_FREE, reader.get(b"d")); } #[test] @@ -102,8 +102,8 @@ mod tests { let reader = Reader::new(&bytes, 0, 1); assert_eq!(0, reader.conflict_count()); - assert_eq!(Lookup::Found(5), reader.get(b"a")); - assert_eq!(Lookup::Found(5), reader.get(b"b")); + assert_eq!(5, reader.get(b"a")); + assert_eq!(5, reader.get(b"b")); } #[test] @@ -131,9 +131,9 @@ mod tests { let bytes = hash_index.into_inner(); let reader = Reader::new(&bytes, 0, 1); - assert_eq!(Lookup::Conflicted, reader.get(b"a")); - assert_eq!(Lookup::Conflicted, reader.get(b"b")); - assert_eq!(Lookup::Conflicted, reader.get(b"c")); + assert_eq!(MARKER_CONFLICT, reader.get(b"a")); + assert_eq!(MARKER_CONFLICT, reader.get(b"b")); + assert_eq!(MARKER_CONFLICT, reader.get(b"c")); assert_eq!(1, Reader::new(&bytes, 0, 1).conflict_count()); } diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs index f661d693..814eead4 100644 --- a/src/segment/block/hash_index/reader.rs +++ b/src/segment/block/hash_index/reader.rs @@ -4,19 +4,6 @@ use super::{calculate_bucket_position, MARKER_CONFLICT, MARKER_FREE}; -/// Hash index lookup result -#[derive(Debug, Eq, PartialEq)] -pub enum Lookup { - /// Key is found, can skip the binary index search - fast path - Found(u8), - - /// Key's bucket was still FREE, so it definitely does not exist - NotFound, - - /// Key is conflicted - we need to look in the binary index instead - slow path - Conflicted, -} - /// Helper to read from an embedded block hash index pub struct Reader<'a>(&'a [u8]); @@ -60,7 +47,7 @@ impl<'a> Reader<'a> { /// Returns the binary index position if the key is not conflicted. 
#[must_use] - pub fn get(&self, key: &[u8]) -> Lookup { + pub fn get(&self, key: &[u8]) -> u8 { // NOTE: Even with very high hash ratio, there will be nearly enough items to // cause us to create u32 buckets #[allow(clippy::cast_possible_truncation)] @@ -69,13 +56,7 @@ impl<'a> Reader<'a> { let bucket_pos = calculate_bucket_position(key, bucket_count); // SAFETY: We use modulo in `calculate_bucket_position` - #[allow(clippy::indexing_slicing)] - let marker = self.0[bucket_pos]; - - match marker { - MARKER_CONFLICT => Lookup::Conflicted, - MARKER_FREE => Lookup::NotFound, - idx => Lookup::Found(idx), - } + // SAFETY: Also we already did a bounds check in the constructor using indexing slicing + *unsafe { self.0.get_unchecked(bucket_pos) } } } diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 26781be4..35d230cb 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -40,7 +40,7 @@ impl<'a> Iter<'a> { if !self .decoder .inner_mut() - .seek(needle, |head_key| head_key < needle, false) + .seek(|head_key| head_key < needle, false) { return false; } diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index e253cdc5..264d5658 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -11,6 +11,7 @@ use super::block::{ Decodable, Decoder, Encodable, Encoder, ParsedItem, Trailer, TRAILER_START_MARKER, }; use crate::key::InternalKey; +use crate::segment::block::hash_index::{MARKER_CONFLICT, MARKER_FREE}; use crate::segment::util::{compare_prefixed_slice, SliceIndexes}; use crate::{unwrap, InternalValue, SeqNo, Slice, ValueType}; use byteorder::WriteBytesExt; @@ -297,11 +298,11 @@ impl DataBlock { pub(crate) fn get_binary_index_reader(&self) -> BinaryIndexReader<'_> { let trailer = Trailer::new(&self.inner); - let mut reader = trailer.as_slice(); - let _item_count = reader.read_u32::().expect("should read"); + // NOTE: Skip item count (u32) and restart interval (u8) + let offset = std::mem::size_of::() + std::mem::size_of::(); - let _restart_interval = unwrap!(reader.read_u8()); + let mut reader = unwrap!(trailer.as_slice().get(offset..)); let binary_index_step_size = unwrap!(reader.read_u8()); @@ -359,39 +360,44 @@ impl DataBlock { // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno)) #[must_use] pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { - let mut iter = self.iter(); - - if let Some(hash_index_reader) = self.get_hash_index_reader() { - use super::block::hash_index::Lookup::{Conflicted, Found, NotFound}; - + let iter = if let Some(hash_index_reader) = self.get_hash_index_reader() { match hash_index_reader.get(needle) { - Found(idx) => { - let offset: usize = self.get_binary_index_reader().get(usize::from(idx)); - iter.seek_to_offset(offset); - } - NotFound => { + MARKER_FREE => { return None; } - Conflicted => { + MARKER_CONFLICT => { // NOTE: Fallback to binary search + let mut iter = self.iter(); + if !iter.seek(needle) { return None; } + + iter + } + idx => { + let offset: usize = self.get_binary_index_reader().get(usize::from(idx)); + + let mut iter = self.iter(); + iter.seek_to_offset(offset); + + iter } } } else { + let mut iter = self.iter(); + // NOTE: Fallback to binary search if !iter.seek(needle) { return None; } - } + iter + }; + + // Linear scan for item in iter { match item.compare_key(needle, &self.inner.data) { - std::cmp::Ordering::Less => { - // We are past our searched key - continue; - } 
std::cmp::Ordering::Greater => { // We are before our searched key/seqno return None; @@ -399,6 +405,10 @@ impl DataBlock { std::cmp::Ordering::Equal => { // If key is same as needle, check sequence number } + std::cmp::Ordering::Less => { + // We are past our searched key + continue; + } } if item.seqno >= seqno { diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index d412945a..15a8a0c0 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -24,7 +24,7 @@ impl<'a> Iter<'a> { pub fn seek(&mut self, needle: &[u8]) -> bool { self.decoder .inner_mut() - .seek(needle, |end_key| end_key < needle, true) + .seek(|end_key| end_key < needle, true) } pub fn seek_upper(&mut self, needle: &[u8]) -> bool { From bedbae1e796d22008dab48d56804a2aece6a8aba Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 16:39:38 +0200 Subject: [PATCH 375/613] deps --- Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3e4a36bb..4dde38fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,8 @@ bloom_use_unsafe = [] metrics = [] [dependencies] -byteorder = "1.5.0" +bytes = { version = "1", optional = true } +byteorder = { package = "byteorder-lite", version = "0.1.0" } byteview = { git = "https://github.com/fjall-rs/byteview" } crossbeam-skiplist = "0.1.3" enum_dispatch = "0.3.13" From a9d2cf0870c3e5aa7d60605a93976cf05daff465 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:00:18 +0200 Subject: [PATCH 376/613] wip comment --- src/tree/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index baef0c3a..6a60d0e3 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -141,6 +141,7 @@ impl AbstractTree for Tree { Ok(()) } + // TODO: change API to RangeBounds fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(key_range)); From c093f060200556dd3e3da8ad0956a7171ccee07e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:00:24 +0200 Subject: [PATCH 377/613] rename --- src/version/run.rs | 59 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/version/run.rs b/src/version/run.rs index 322534a4..6c804b2e 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -112,7 +112,7 @@ impl Run { pub fn get_overlapping<'a>(&'a self, key_range: &'a KeyRange) -> &'a [T] { let range = key_range.min()..=key_range.max(); - let Some((lo, hi)) = self.range_indexes::(&range) else { + let Some((lo, hi)) = self.range_overlap_indexes::(&range) else { return &[]; }; @@ -137,7 +137,7 @@ impl Run { let range = key_range.min()..=key_range.max(); - let Some((lo, hi)) = self.range_indexes::(&range) else { + let Some((lo, hi)) = self.range_overlap_indexes::(&range) else { return &[]; }; @@ -147,7 +147,7 @@ impl Run { } /// Returns the indexes of the interval [min, max] of segments that overlap with a given range. 
- pub fn range_indexes, R: RangeBounds>( + pub fn range_overlap_indexes, R: RangeBounds>( &self, key_range: &R, ) -> Option<(usize, usize)> { @@ -278,18 +278,47 @@ mod tests { ]; let run = Run(items); - assert_eq!(Some((0, 3)), run.range_indexes::<&[u8], _>(&..)); - assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"a"))); - assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"b"))); - assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..=b"d"))); - assert_eq!(Some((0, 0)), run.range_indexes(&(b"d" as &[u8]..=b"d"))); - assert_eq!(Some((0, 0)), run.range_indexes(&(b"a" as &[u8]..b"d"))); - assert_eq!(Some((0, 1)), run.range_indexes(&(b"a" as &[u8]..=b"g"))); - assert_eq!(Some((1, 1)), run.range_indexes(&(b"j" as &[u8]..=b"j"))); - assert_eq!(Some((0, 3)), run.range_indexes(&(b"a" as &[u8]..=b"z"))); - assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..=b"zzz"))); - assert_eq!(Some((3, 3)), run.range_indexes(&(b"z" as &[u8]..))); - assert!(run.range_indexes(&(b"zzz" as &[u8]..=b"zzzzzzz")).is_none()); + assert_eq!(Some((0, 3)), run.range_overlap_indexes::<&[u8], _>(&..)); + assert_eq!( + Some((0, 0)), + run.range_overlap_indexes(&(b"a" as &[u8]..=b"a")) + ); + assert_eq!( + Some((0, 0)), + run.range_overlap_indexes(&(b"a" as &[u8]..=b"b")) + ); + assert_eq!( + Some((0, 0)), + run.range_overlap_indexes(&(b"a" as &[u8]..=b"d")) + ); + assert_eq!( + Some((0, 0)), + run.range_overlap_indexes(&(b"d" as &[u8]..=b"d")) + ); + assert_eq!( + Some((0, 0)), + run.range_overlap_indexes(&(b"a" as &[u8]..b"d")) + ); + assert_eq!( + Some((0, 1)), + run.range_overlap_indexes(&(b"a" as &[u8]..=b"g")) + ); + assert_eq!( + Some((1, 1)), + run.range_overlap_indexes(&(b"j" as &[u8]..=b"j")) + ); + assert_eq!( + Some((0, 3)), + run.range_overlap_indexes(&(b"a" as &[u8]..=b"z")) + ); + assert_eq!( + Some((3, 3)), + run.range_overlap_indexes(&(b"z" as &[u8]..=b"zzz")) + ); + assert_eq!(Some((3, 3)), run.range_overlap_indexes(&(b"z" as &[u8]..))); + assert!(run + .range_overlap_indexes(&(b"zzz" as &[u8]..=b"zzzzzzz")) + .is_none()); } #[test] From 3d9601850a802a22291185127044eeac1f4f593e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:00:34 +0200 Subject: [PATCH 378/613] perf: optimize drop_range --- src/compaction/drop_range.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs index a15c793a..3e71c8a0 100644 --- a/src/compaction/drop_range.rs +++ b/src/compaction/drop_range.rs @@ -3,10 +3,8 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{ - config::Config, level_manifest::LevelManifest, segment::Segment, version::run::Ranged, HashSet, - KeyRange, -}; +use crate::{config::Config, level_manifest::LevelManifest, KeyRange}; +use crate::{HashSet, Segment}; /// Drops all segments that are **contained** in a key range pub struct Strategy { @@ -33,8 +31,10 @@ impl CompactionStrategy for Strategy { fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { let segment_ids: HashSet<_> = levels - .iter() - .filter(|segment| self.key_range.contains_range(segment.key_range())) + .current_version() + .iter_levels() + .flat_map(|lvl| lvl.iter()) + .flat_map(|run| run.get_contained(&self.key_range)) .map(Segment::id) .collect(); From 02bf685a943771576dd4ddbde2100e586ce51f09 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:00:39 +0200 Subject: [PATCH 379/613] comment --- 
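Context for this comment fix and the earlier removal of the `Lookup` enum: `MARKER_FREE` (254) and `MARKER_CONFLICT` (255) are reserved bucket values, so a raw `u8` carries the same information as the old three-way enum, and a block hash index can address at most 254 binary index slots (hence `MAX_POINTERS_FOR_HASH_INDEX`). A self-contained sketch of the caller-side dispatch follows; the `resolve` helper, its signature, and the example offsets are illustrative only, not the crate's exact API:

const MARKER_FREE: u8 = u8::MAX - 1; // 254: bucket never written, key is definitely not in the block
const MARKER_CONFLICT: u8 = u8::MAX; // 255: bucket collided, marker carries no position info

/// Illustrative helper mirroring what `DataBlock::point_read` does with the marker byte.
fn resolve(marker: u8, binary_index: &[u32]) -> Result<Option<u32>, &'static str> {
    match marker {
        MARKER_FREE => Ok(None), // fast negative: skip the block without any binary search
        MARKER_CONFLICT => Err("conflicted: fall back to binary search"), // slow path
        idx => Ok(binary_index.get(usize::from(idx)).copied()), // direct slot lookup
    }
}

fn main() {
    let binary_index = [0u32, 512, 1024]; // restart head offsets, for illustration
    assert_eq!(Ok(Some(1024)), resolve(2, &binary_index));
    assert_eq!(Ok(None), resolve(MARKER_FREE, &binary_index));
    assert!(resolve(MARKER_CONFLICT, &binary_index).is_err());
}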
 src/segment/block/hash_index/reader.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs
index 814eead4..7e696fc7 100644
--- a/src/segment/block/hash_index/reader.rs
+++ b/src/segment/block/hash_index/reader.rs
@@ -48,8 +48,8 @@ impl<'a> Reader<'a> {
     /// Returns the binary index position if the key is not conflicted.
     #[must_use]
     pub fn get(&self, key: &[u8]) -> u8 {
-        // NOTE: Even with very high hash ratio, there will be nearly enough items to
-        // cause us to create u32 buckets
+        // NOTE: Even with very high hash ratio, there won't be nearly enough items to
+        // cause us to create ~4 billion buckets
         #[allow(clippy::cast_possible_truncation)]
         let bucket_count = self.0.len() as u32;
 

From 20dd4cfc4621a668777e1fe4dd74e55dd02f79ea Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 13 Sep 2025 19:00:43 +0200
Subject: [PATCH 380/613] wip

---
 src/segment/data_block/mod.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs
index 264d5658..8c6de45b 100644
--- a/src/segment/data_block/mod.rs
+++ b/src/segment/data_block/mod.rs
@@ -271,10 +271,6 @@ pub struct DataBlock {
 }
 
 impl DataBlock {
-    // TODO: maybe make the constructor check the block type, so we don't have to do it in the
-    // block loading routine...
-    // TODO: for index block etc. too
-
     /// Interprets a block as a data block.
     ///
     /// The caller needs to make sure the block is actually a data block

From af31d406dc56ce8e5a86a3ace57e569f46bea9fc Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 13 Sep 2025 19:00:48 +0200
Subject: [PATCH 381/613] rename

---
 src/run_reader.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/run_reader.rs b/src/run_reader.rs
index e5eb75d9..be8cdff6 100644
--- a/src/run_reader.rs
+++ b/src/run_reader.rs
@@ -27,7 +27,7 @@ impl RunReader {
     ) -> Option<Self> {
         assert!(!run.is_empty(), "level reader cannot read empty level");
 
-        let (lo, hi) = run.range_indexes(&range)?;
+        let (lo, hi) = run.range_overlap_indexes(&range)?;
 
         Some(Self::culled(run, range, (Some(lo), Some(hi)), cache_policy))
     }

From 1ad16aaed0273c661314368ac92c911a8d7869d9 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 13 Sep 2025 19:01:08 +0200
Subject: [PATCH 382/613] safety comments

---
 src/segment/util.rs | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/segment/util.rs b/src/segment/util.rs
index 4d732051..c8f873d3 100644
--- a/src/segment/util.rs
+++ b/src/segment/util.rs
@@ -13,6 +13,9 @@ use std::{path::Path, sync::Arc};
 #[derive(Debug)]
 pub struct SliceIndexes(pub usize, pub usize);
 
+/// Loads a block from disk or block cache, if cached.
+///
+/// Also handles file descriptor opening and caching.
 pub fn load_block(
     segment_id: GlobalSegmentId,
     path: &Path,
         Arc::new(std::fs::File::open(path)?)
     };
 
-    let block = Block::from_file(&fd, *handle, block_type, compression)?;
+    let block = Block::from_file(&fd, *handle, compression)?;
+
+    if block.header.block_type != block_type {
+        return Err(crate::Error::Decode(crate::DecodeError::InvalidTag((
+            "BlockType",
+            block.header.block_type.into(),
+        ))));
+    }
 
     #[cfg(feature = "metrics")]
     metrics.block_load_io.fetch_add(1, Relaxed);
@@ -81,9 +91,11 @@ pub fn compare_prefixed_slice(prefix: &[u8], suffix: &[u8], needle: &[u8]) -> st
     let max_pfx_len = prefix.len().min(needle.len());
 
     {
+        // SAFETY: We checked for max_pfx_len
         #[allow(unsafe_code)]
         let prefix = unsafe { prefix.get_unchecked(0..max_pfx_len) };
 
+        // SAFETY: We checked for max_pfx_len
         #[allow(unsafe_code)]
         let needle = unsafe { needle.get_unchecked(0..max_pfx_len) };
 

From fd03144b98acaf9b089071a6bdbe3304ac247f87 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 13 Sep 2025 19:02:02 +0200
Subject: [PATCH 383/613] change block type assertions

---
 src/segment/block/mod.rs | 32 ++------------------------------
 src/segment/meta.rs      | 19 ++++++++++++-------
 src/segment/mod.rs       | 11 ++++++++---
 src/segment/regions.rs   | 17 ++++++++++-------
 src/segment/scanner.rs   | 22 +++++++++++++++++++---
 5 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs
index c2a358d4..e35cc2f1 100644
--- a/src/segment/block/mod.rs
+++ b/src/segment/block/mod.rs
@@ -80,7 +80,6 @@ impl Block {
     /// Reads a block from a reader.
     pub fn from_reader<R: std::io::Read>(
         reader: &mut R,
-        block_type: BlockType,
         compression: CompressionType,
     ) -> crate::Result<Self> {
         let header = Header::decode_from(reader)?;
@@ -110,23 +109,10 @@
             }
         });
 
-        if header.block_type != block_type {
-            log::error!(
-                "Block type mismatch, got={:?}, expected={:?}",
-                header.block_type,
-                block_type,
-            );
-
-            return Err(crate::Error::Decode(crate::DecodeError::InvalidTag((
-                "BlockType",
-                header.block_type.into(),
-            ))));
-        }
-
         let checksum = Checksum::from_raw(crate::hash::hash128(&data));
         if checksum != header.checksum {
             log::warn!(
-                "Checksum mismatch for {block_type:?}@, got={}, expected={}",
+                "Checksum mismatch, got={}, expected={}",
                 *checksum,
                 *header.checksum,
             );
@@ -140,7 +126,6 @@
     pub fn from_file(
         file: &File,
         handle: BlockHandle,
-        block_type: BlockType,
         compression: CompressionType,
     ) -> crate::Result<Self> {
         #[warn(unsafe_code)]
@@ -213,23 +198,10 @@
             debug_assert_eq!(header.uncompressed_length, buf.len() as u32);
         }
 
-        if header.block_type != block_type {
-            log::error!(
-                "Block type mismatch, got={:?}, expected={:?}",
-                header.block_type,
-                block_type,
-            );
-
-            return Err(crate::Error::Decode(crate::DecodeError::InvalidTag((
-                "BlockType",
-                header.block_type.into(),
-            ))));
-        }
-
         let checksum = Checksum::from_raw(crate::hash::hash128(&buf));
         if checksum != header.checksum {
             log::warn!(
-                "Checksum mismatch for block {block_type:?}@{handle:?}, got={}, expected={}",
+                "Checksum mismatch for block {handle:?}, got={}, expected={}",
                 *checksum,
                 *header.checksum,
             );

diff --git a/src/segment/meta.rs b/src/segment/meta.rs
index c2ef1109..0e760bae 100644
--- a/src/segment/meta.rs
+++ b/src/segment/meta.rs
@@ -3,7 +3,9 @@
 // (found in the LICENSE-* files in the repository)
 
 use super::{Block, BlockHandle, DataBlock};
-use crate::{coding::Decode, CompressionType, KeyRange, SegmentId, SeqNo};
+use crate::{
+    coding::Decode, segment::block::BlockType, CompressionType, KeyRange, SegmentId, SeqNo,
+};
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::{fs::File, ops::Deref};
 
@@
-49,12 +51,15 @@ pub struct ParsedMeta { impl ParsedMeta { #[allow(clippy::expect_used, clippy::too_many_lines)] pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file( - file, - *handle, - crate::segment::block::BlockType::Meta, - CompressionType::None, - )?; + let block = Block::from_file(file, *handle, CompressionType::None)?; + + if block.header.block_type != BlockType::Meta { + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + block.header.block_type.into(), + )))); + } + let block = DataBlock::new(block); assert_eq!( diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 57c7e91c..dc3db542 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -408,10 +408,16 @@ impl Segment { let block = Block::from_file( &file, regions.tli, - BlockType::Index, CompressionType::None, // TODO: allow setting index block compression )?; + if block.header.block_type != BlockType::Index { + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + block.header.block_type.into(), + )))); + } + IndexBlock::new(block) }; @@ -425,7 +431,7 @@ impl Segment { BlockIndexImpl::VolatileFull }; - // TODO: load FilterBlock + // TODO: load FilterBlock, check block type let pinned_filter_block = if pin_filter { regions .filter @@ -437,7 +443,6 @@ impl Segment { Block::from_file( &file, filter_handle, - BlockType::Filter, crate::CompressionType::None, // NOTE: We never write a filter block with compression ) }) diff --git a/src/segment/regions.rs b/src/segment/regions.rs index e5b052c1..3e54483d 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -5,7 +5,7 @@ use super::{Block, BlockHandle}; use crate::{ coding::{Decode, Encode}, - segment::DataBlock, + segment::{block::BlockType, DataBlock}, CompressionType, InternalValue, SeqNo, UserValue, }; use std::fs::File; @@ -21,12 +21,15 @@ pub struct ParsedRegions { impl ParsedRegions { pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file( - file, - *handle, - crate::segment::block::BlockType::Regions, - CompressionType::None, - )?; + let block = Block::from_file(file, *handle, CompressionType::None)?; + + if block.header.block_type != BlockType::Regions { + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + block.header.block_type.into(), + )))); + } + let block = DataBlock::new(block); let tli = { diff --git a/src/segment/scanner.rs b/src/segment/scanner.rs index 01df9b7f..a4365112 100644 --- a/src/segment/scanner.rs +++ b/src/segment/scanner.rs @@ -3,7 +3,10 @@ // (found in the LICENSE-* files in the repository) use super::{Block, DataBlock}; -use crate::{segment::iter::OwnedDataBlockIter, CompressionType, InternalValue}; +use crate::{ + segment::{block::BlockType, iter::OwnedDataBlockIter}, + CompressionType, InternalValue, +}; use std::{fs::File, io::BufReader, path::Path}; /// Segment reader that is optimized for consuming an entire segment @@ -42,8 +45,21 @@ impl Scanner { reader: &mut BufReader, compression: CompressionType, ) -> crate::Result { - Block::from_reader(reader, crate::segment::block::BlockType::Data, compression) - .map(DataBlock::new) + let block = Block::from_reader(reader, compression); + + match block { + Ok(block) => { + if block.header.block_type != BlockType::Data { + return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + block.header.block_type.into(), + )))); + } + + Ok(DataBlock::new(block)) + } + 
Err(e) => Err(e), + } } } From 7b30530deb6efb798e6fc2b6387ed63555ccfff9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:50:36 +0200 Subject: [PATCH 384/613] feat: restore tokio bytes feature flag --- Cargo.toml | 7 ++--- src/slice/mod.rs | 26 +++++++++--------- .../{slice_bytes.rs => slice_bytes/mod.rs} | 27 +++---------------- src/slice/{default => slice_default}/mod.rs | 8 ------ 4 files changed, 19 insertions(+), 49 deletions(-) rename src/slice/{slice_bytes.rs => slice_bytes/mod.rs} (81%) rename src/slice/{default => slice_default}/mod.rs (88%) diff --git a/Cargo.toml b/Cargo.toml index 4dde38fd..43bbfd3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,10 +19,9 @@ path = "src/lib.rs" [features] default = [] lz4 = ["dep:lz4_flex"] -# bytes = ["value-log/bytes"] # TODO: restore -use_unsafe = [] -bloom_use_unsafe = [] +bytes_1 = ["dep:bytes"] metrics = [] +use_unsafe = [] # TODO: 3.0.0 remove [dependencies] bytes = { version = "1", optional = true } @@ -37,8 +36,6 @@ quick_cache = { version = "0.6.16", default-features = false, features = [] } rustc-hash = "2.1.1" self_cell = "1.2.0" tempfile = "3.20.0" -value-log = { git = "https://github.com/fjall-rs/value-log", rev = "1075697727579e5a885b9b88533dc9128d79780e", default-features = false, features = [ -] } varint-rs = "2.2.0" xxhash-rust = { version = "0.8.15", features = ["xxh3"] } diff --git a/src/slice/mod.rs b/src/slice/mod.rs index 1ff87424..93c14967 100644 --- a/src/slice/mod.rs +++ b/src/slice/mod.rs @@ -2,22 +2,24 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +// Using tokio bytes +#[cfg(feature = "bytes_1")] +mod slice_bytes; + +// Using byteview #[cfg(not(feature = "bytes_1"))] -mod default; +mod slice_default; use std::{ path::{Path, PathBuf}, sync::Arc, }; -// #[cfg(feature = "bytes")] -// mod slice_bytes; - -// #[cfg(feature = "bytes")] -// pub use slice_bytes::Slice; - #[cfg(not(feature = "bytes_1"))] -pub use default::Slice; +pub use slice_default::Slice; + +#[cfg(feature = "bytes_1")] +pub use slice_bytes::Slice; impl AsRef<[u8]> for Slice { fn as_ref(&self) -> &[u8] { @@ -32,10 +34,10 @@ impl From<&[u8]> for Slice { Self(byteview::ByteView::new(value)) } - // #[cfg(feature = "bytes")] - // { - // Self(bytes::Bytes::from(value.to_vec())) - // } + #[cfg(feature = "bytes_1")] + { + Self(bytes::Bytes::from(value.to_vec())) + } } } diff --git a/src/slice/slice_bytes.rs b/src/slice/slice_bytes/mod.rs similarity index 81% rename from src/slice/slice_bytes.rs rename to src/slice/slice_bytes/mod.rs index d8854d61..35609d19 100644 --- a/src/slice/slice_bytes.rs +++ b/src/slice/slice_bytes/mod.rs @@ -23,20 +23,7 @@ impl Slice { Self(Bytes::from_static(&[])) } - #[must_use] - #[doc(hidden)] - pub fn with_size(len: usize) -> Self { - let bytes = vec![0; len]; - Self(Bytes::from(bytes)) - } - - #[must_use] - #[doc(hidden)] - pub fn with_size_unzeroed(len: usize) -> Self { - Self(Self::get_unzeroed_builder(len).freeze()) - } - - fn get_unzeroed_builder(len: usize) -> BytesMut { + pub(crate) unsafe fn builder_unzeroed(len: usize) -> BytesMut { // Use `with_capacity` & `set_len`` to avoid zeroing the buffer let mut builder = BytesMut::with_capacity(len); @@ -63,7 +50,7 @@ impl Slice { use std::io::Write; let len = left.len() + right.len(); - let mut builder = Self::get_unzeroed_builder(len); + let mut builder = unsafe { Self::builder_unzeroed(len) }; { let mut writer = &mut builder[..]; @@ -74,20 +61,12 @@ impl Slice { 
Self(builder.freeze()) } - #[must_use] - #[doc(hidden)] - pub fn get_mut(&mut self) -> Option + '_> { - todo!(); - - Option::<&mut [u8]>::None - } - /// Constructs a [`Slice`] from an I/O reader by pulling in `len` bytes. /// /// The reader may not read the existing buffer. #[doc(hidden)] pub fn from_reader(reader: &mut R, len: usize) -> std::io::Result { - let mut builder = Self::get_unzeroed_builder(len); + let mut builder = unsafe { Self::builder_unzeroed(len) }; // SAFETY: Normally, read_exact over an uninitialized buffer is UB, // however we know that in lsm-tree etc. only I/O readers or cursors over Vecs are used diff --git a/src/slice/default/mod.rs b/src/slice/slice_default/mod.rs similarity index 88% rename from src/slice/default/mod.rs rename to src/slice/slice_default/mod.rs index 2b99e747..3e05ede6 100644 --- a/src/slice/default/mod.rs +++ b/src/slice/slice_default/mod.rs @@ -25,14 +25,6 @@ impl Slice { ByteView::builder_unzeroed(len) } - // pub(crate) fn with_size(len: usize) -> Self { - // Self(ByteView::with_size(len)) - // } - - // pub(crate) fn with_size_unzeroed(len: usize) -> Self { - // Self(ByteView::with_size_unzeroed(len)) - // } - pub(crate) fn slice(&self, range: impl std::ops::RangeBounds) -> Self { Self(self.0.slice(range)) } From d65c82d48290b9faaf96e3005ce9b2b8f494996e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 19:51:44 +0200 Subject: [PATCH 385/613] test: add assertion --- src/slice/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/slice/mod.rs b/src/slice/mod.rs index 93c14967..1f48e2ef 100644 --- a/src/slice/mod.rs +++ b/src/slice/mod.rs @@ -208,6 +208,14 @@ mod tests { }, [0; 50], ); + assert_eq!( + &*unsafe { + let mut b = Slice::builder_unzeroed(50); + b.fill(77); + b.freeze() + }, + [77; 50], + ); } /// This test verifies that we can create a `Slice` from various types and compare a `Slice` with them. From ca885cfe6b4b1ec5c9eed36f6f56f7de1db2b7cd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 20:47:18 +0200 Subject: [PATCH 386/613] comments --- src/blob_tree/mod.rs | 6 ++++-- tests/blob_drop_after_flush.rs | 4 ++++ tests/blob_gc_watermark.rs | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index d3de4e30..b489867d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -131,8 +131,7 @@ impl BlobTree { }) } - /// Scans the index tree, collecting statistics about - /// value log fragmentation + /// Scans the index tree, collecting statistics about value log fragmentation. 
#[doc(hidden)] pub fn gc_scan_stats( &self, @@ -513,6 +512,9 @@ impl AbstractTree for BlobTree { let _memtable_lock = self.lock_active_memtable(); + // TODO: 3.0.0 + // TODO: add to vlog atomically together with the segment (that way, we don't need the pending_segments monkey patch) + log::trace!("Register blob writer into value log"); self.blobs.register_writer(blob_writer)?; diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index 115b5d7c..04c51314 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -2,6 +2,10 @@ use lsm_tree::{AbstractTree, Config, SeqNo}; use std::time::Duration; use test_log::test; +// NOTE: This was a race condition in v2 that could drop a blob file +// before its corresponding segment was registered +// +// https://github.com/fjall-rs/lsm-tree/commit/a3a174ed9eff0755f671f793626d17f4ef3f5f57 #[test] #[ignore = "restore"] fn blob_drop_after_flush() -> lsm_tree::Result<()> { diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index fc09617a..e7c08f6f 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -1,6 +1,10 @@ use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; +// NOTE: This was a logic/MVCC error in v2 that could drop +// a blob file while it was maybe accessible by a snapshot read +// +// https://github.com/fjall-rs/lsm-tree/commit/79c6ead4b955051cbb4835913e21d08b8aeafba1 #[test] #[ignore] fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { From ef6f7438e4078305fa298e42e73b018eb050f79e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 23:21:28 +0200 Subject: [PATCH 387/613] restore some tests --- tests/snapshot_compact.rs | 40 +++--- tests/snapshot_len.rs | 43 +++---- tests/snapshot_point_read.rs | 236 +++++++++++++++++------------------ tests/snapshot_zombie.rs | 131 ++++++++++--------- 4 files changed, 218 insertions(+), 232 deletions(-) diff --git a/tests/snapshot_compact.rs b/tests/snapshot_compact.rs index c8c775ec..df4205e6 100644 --- a/tests/snapshot_compact.rs +++ b/tests/snapshot_compact.rs @@ -4,39 +4,35 @@ use test_log::test; const ITEM_COUNT: usize = 100; #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_after_compaction() -> lsm_tree::Result<()> { - // let folder = tempfile::tempdir()?; + let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder).open()?; + let tree = Config::new(&folder).open()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // let snapshot_seqno = seqno.get(); - // let snapshot = tree.snapshot(snapshot_seqno); + let snapshot_seqno = seqno.get(); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); + assert_eq!(tree.len(SeqNo::MAX, None)?, tree.len(snapshot_seqno, None)?); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // 
tree.flush_active_memtable(0)?; - // tree.major_compact(u64::MAX, 0)?; + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, 0)?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // assert_eq!(ITEM_COUNT, snapshot.len()?); - // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + assert_eq!(ITEM_COUNT, tree.len(snapshot_seqno, None)?); Ok(()) } diff --git a/tests/snapshot_len.rs b/tests/snapshot_len.rs index cb0c680b..777a5243 100644 --- a/tests/snapshot_len.rs +++ b/tests/snapshot_len.rs @@ -4,41 +4,38 @@ use test_log::test; const ITEM_COUNT: usize = 100; #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_basic() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder).open()?; + let tree = Config::new(&folder).open()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // let snapshot = tree.snapshot(seqno.get()); + let instant = seqno.get(); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); + assert_eq!(tree.len(SeqNo::MAX, None)?, tree.len(instant, None)?); - // for x in (ITEM_COUNT as u64)..((ITEM_COUNT * 2) as u64) { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in (ITEM_COUNT as u64)..((ITEM_COUNT * 2) as u64) { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); - // assert_eq!(ITEM_COUNT, snapshot.len()?); - // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); + assert_eq!(ITEM_COUNT, tree.len(instant, None)?); Ok(()) } diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index 6926e03d..dfd63891 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -2,187 +2,183 @@ use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_404() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder) - // .data_block_size(1_024) - // .index_block_size(1_024) - // .open()?; + let tree = Config::new(&folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; - // tree.insert("a", "a", 0); - // tree.insert("a2", "a2", 0); - // tree.insert("c", "c", 0); + tree.insert("a", "a", 0); + tree.insert("a2", "a2", 0); + tree.insert("c", "c", 0); - // tree.flush_active_memtable(0)?; + tree.flush_active_memtable(0)?; - // assert_eq!(b"a", &*tree.get("a", None)?.unwrap()); - // assert_eq!(b"a2", &*tree.get("a2", None)?.unwrap()); - // 
assert!(tree.get("b", None)?.is_none()); - // assert_eq!(b"c", &*tree.get("c", None)?.unwrap()); + assert_eq!(b"a", &*tree.get("a", SeqNo::MAX)?.unwrap()); + assert_eq!(b"a2", &*tree.get("a2", SeqNo::MAX)?.unwrap()); + assert!(tree.get("b", SeqNo::MAX)?.is_none()); + assert_eq!(b"c", &*tree.get("c", SeqNo::MAX)?.unwrap()); - // assert!(tree.get("a", Some(0))?.is_none()); - // assert!(tree.get("a2", Some(0))?.is_none()); - // assert!(tree.get("b", Some(0))?.is_none()); - // assert!(tree.get("c", Some(0))?.is_none()); + assert!(tree.get("a", 0)?.is_none()); + assert!(tree.get("a2", 0)?.is_none()); + assert!(tree.get("b", 0)?.is_none()); + assert!(tree.get("c", 0)?.is_none()); - // assert_eq!(b"a", &*tree.get("a", Some(1))?.unwrap()); - // assert_eq!(b"a2", &*tree.get("a2", Some(1))?.unwrap()); - // assert!(tree.get("b", Some(1))?.is_none()); - // assert_eq!(b"c", &*tree.get("c", Some(1))?.unwrap()); + assert_eq!(b"a", &*tree.get("a", 1)?.unwrap()); + assert_eq!(b"a2", &*tree.get("a2", 1)?.unwrap()); + assert!(tree.get("b", 1)?.is_none()); + assert_eq!(b"c", &*tree.get("c", 1)?.unwrap()); Ok(()) } #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { let version_count = 600; - // let folder = tempfile::tempdir()?; + let folder = tempfile::tempdir()?; + + let tree = Config::new(&folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; - // let tree = Config::new(&folder) - // .data_block_size(1_024) - // .index_block_size(1_024) - // .open()?; + let key = "abc"; - // let key = "abc"; + let seqno = SequenceNumberCounter::default(); - // let seqno = SequenceNumberCounter::default(); + #[allow(clippy::explicit_counter_loop)] + for _ in 0u64..version_count { + tree.insert(key, format!("abc{version_count}").as_bytes(), seqno.next()); + } - // #[allow(clippy::explicit_counter_loop)] - // for _ in 0u64..version_count { - // tree.insert(key, format!("abc{version_count}").as_bytes(), seqno.next()); - // } + tree.flush_active_memtable(0)?; - // tree.flush_active_memtable(0)?; + assert_eq!(tree.len(SeqNo::MAX, None)?, 1); - // assert_eq!(tree.len(SeqNo::MAX, None)?, 1); + for seqno in 1..version_count { + let item = tree + .get_internal_entry(key.as_bytes(), seqno)? + .expect("should exist"); - // for seqno in 1..version_count { - // let item = tree - // .get_internal_entry(key.as_bytes(), seqno)? 
- // .expect("should exist"); - // assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); + assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); - // let item = tree.get(key, None)?.expect("should exist"); - // assert_eq!(format!("abc{}", version_count).as_bytes(), &*item); - // } + let item = tree.get(key, SeqNo::MAX)?.expect("should exist"); + assert_eq!(format!("abc{}", version_count).as_bytes(), &*item); + } - // Ok(()) - // } + Ok(()) +} - // const ITEM_COUNT: usize = 1; - // const BATCHES: usize = 10; +const ITEM_COUNT: usize = 1; +const BATCHES: usize = 10; - // #[test] - // fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { - // let folder = tempfile::tempdir()?; +#[test] +fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder) - // .data_block_size(1_024) - // .index_block_size(1_024) - // .open()?; + let tree = Config::new(&folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // for batch in 0..BATCHES { - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, format!("abc{batch}").as_bytes(), seqno.next()); - // } - // } + for batch in 0..BATCHES { + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, format!("abc{batch}").as_bytes(), seqno.next()); + } + } - // tree.flush_active_memtable(0)?; + tree.flush_active_memtable(0)?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); - // let item = tree.get(key, None)?.expect("should exist"); - // assert_eq!("abc9".as_bytes(), &*item); - // } + let item = tree.get(key, SeqNo::MAX)?.expect("should exist"); + assert_eq!("abc9".as_bytes(), &*item); + } - // let snapshot = tree.snapshot(seqno.get()); + let snapshot_seqno = seqno.get(); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); + assert_eq!(tree.len(SeqNo::MAX, None)?, tree.len(snapshot_seqno, None)?); - // // This batch will be too new for snapshot (invisible) - // for batch in 0..BATCHES { - // let batch_seqno = seqno.next(); + // This batch will be too new for snapshot (invisible) + for batch in 0..BATCHES { + let batch_seqno = seqno.next(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); - // } - // } - // tree.flush_active_memtable(0)?; + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); + } + } + tree.flush_active_memtable(0)?; - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); - // let item = snapshot.get(key)?.expect("should exist"); - // assert_eq!("abc9".as_bytes(), &*item); + let item = tree.get(key, snapshot_seqno)?.expect("should exist"); + assert_eq!("abc9".as_bytes(), &*item); - // let item = tree.get(key, None)?.expect("should exist"); - // assert_eq!("def9".as_bytes(), &*item); - // } + let item = tree.get(key, SeqNo::MAX)?.expect("should exist"); + assert_eq!("def9".as_bytes(), &*item); + } Ok(()) } #[test] -#[ignore = "remove"] fn 
snapshot_disk_and_memtable_reads() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder) - // .data_block_size(1_024) - // .index_block_size(1_024) - // .open()?; + let tree = Config::new(&folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // for batch in 0..BATCHES { - // let batch_seqno = seqno.next(); + for batch in 0..BATCHES { + let batch_seqno = seqno.next(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, format!("abc{batch}").as_bytes(), batch_seqno); - // } - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, format!("abc{batch}").as_bytes(), batch_seqno); + } + } - // tree.flush_active_memtable(0)?; + tree.flush_active_memtable(0)?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // let snapshot = tree.snapshot(seqno.get()); + let snapshot_seqno = seqno.get(); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.len()?); - // assert_eq!(tree.len(SeqNo::MAX, None)?, snapshot.iter().rev().count()); + assert_eq!(tree.len(SeqNo::MAX, None)?, tree.len(snapshot_seqno, None)?); - // // This batch will be in memtable and too new for snapshot (invisible) - // for batch in 0..BATCHES { - // let batch_seqno = seqno.next(); + // This batch will be in memtable and too new for snapshot (invisible) + for batch in 0..BATCHES { + let batch_seqno = seqno.next(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); - // } - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, format!("def{batch}").as_bytes(), batch_seqno); + } + } - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); - // let item = snapshot.get(key)?.expect("should exist"); - // assert_eq!("abc9".as_bytes(), &*item); + let item = tree.get(key, snapshot_seqno)?.expect("should exist"); + assert_eq!("abc9".as_bytes(), &*item); - // let item = tree.get(key, None)?.expect("should exist"); - // assert_eq!("def9".as_bytes(), &*item); - // } + let item = tree.get(key, SeqNo::MAX)?.expect("should exist"); + assert_eq!("def9".as_bytes(), &*item); + } Ok(()) } diff --git a/tests/snapshot_zombie.rs b/tests/snapshot_zombie.rs index 3979e1ba..9d66f367 100644 --- a/tests/snapshot_zombie.rs +++ b/tests/snapshot_zombie.rs @@ -4,103 +4,100 @@ use test_log::test; const ITEM_COUNT: usize = 5; #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_zombie_memtable() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - // let tree = Config::new(&folder).open()?; + let tree = Config::new(&folder).open()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); - // { - // let snapshot = 
tree.snapshot(seqno.get()); - // assert_eq!(ITEM_COUNT, snapshot.len()?); - // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); - // } + { + let snapshot_seqno = seqno.get(); + assert_eq!(ITEM_COUNT, tree.len(snapshot_seqno, None)?); + assert_eq!(ITEM_COUNT, tree.iter(snapshot_seqno, None).rev().count()); + } - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.remove(key, seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.remove(key, seqno.next()); + } - // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - // { - // let snapshot = tree.snapshot(seqno.get()); - // assert_eq!(0, snapshot.len()?); - // assert_eq!(0, snapshot.iter().rev().count()); - // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - // } + { + let snapshot_seqno = seqno.get(); + assert_eq!(0, tree.len(snapshot_seqno, None)?); + assert_eq!(0, tree.iter(snapshot_seqno, None).rev().count()); + } Ok(()) } #[test] -#[ignore = "restore w/o snapshot API"] fn snapshot_zombie_segment() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - // let seqno = SequenceNumberCounter::default(); + let seqno = SequenceNumberCounter::default(); - // { - // let tree = Config::new(&folder).open()?; + { + let tree = Config::new(&folder).open()?; - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.insert(key, "abc".as_bytes(), seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.insert(key, "abc".as_bytes(), seqno.next()); + } - // tree.flush_active_memtable(0)?; + tree.flush_active_memtable(0)?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); + assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), ITEM_COUNT); - // { - // let snapshot = tree.snapshot(seqno.get()); - // assert_eq!(ITEM_COUNT, snapshot.len()?); - // assert_eq!(ITEM_COUNT, snapshot.iter().rev().count()); - // } + { + let snapshot_seqno = seqno.get(); + assert_eq!(ITEM_COUNT, tree.len(snapshot_seqno, None)?); + assert_eq!(ITEM_COUNT, tree.iter(snapshot_seqno, None).rev().count()); + } - // for x in 0..ITEM_COUNT as u64 { - // let key = x.to_be_bytes(); - // tree.remove(key, seqno.next()); - // } + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + tree.remove(key, seqno.next()); + } - // tree.flush_active_memtable(0)?; + tree.flush_active_memtable(0)?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); + assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - // { - // let snapshot = tree.snapshot(seqno.get()); - // assert_eq!(0, snapshot.len()?); - // assert_eq!(0, snapshot.iter().rev().count()); - // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - // } - // } + { + let snapshot_seqno = seqno.get(); + assert_eq!(0, tree.len(snapshot_seqno, None)?); + assert_eq!(0, tree.iter(snapshot_seqno, None).rev().count()); + assert_eq!(0, tree.prefix(b"", snapshot_seqno, None).count()); + } + } - // { - // let tree = Config::new(&folder).open()?; + { + let tree = Config::new(&folder).open()?; - // assert_eq!(tree.len(SeqNo::MAX, None)?, 0); - // assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); + 
assert_eq!(tree.len(SeqNo::MAX, None)?, 0); + assert_eq!(tree.iter(SeqNo::MAX, None).rev().count(), 0); - // { - // let snapshot = tree.snapshot(seqno.get()); - // assert_eq!(0, snapshot.len()?); - // assert_eq!(0, snapshot.iter().rev().count()); - // assert_eq!(0, snapshot.prefix("".as_bytes()).count()); - // } - // } + { + let snapshot_seqno = seqno.get(); + assert_eq!(0, tree.len(snapshot_seqno, None)?); + assert_eq!(0, tree.iter(snapshot_seqno, None).rev().count()); + assert_eq!(0, tree.prefix(b"", snapshot_seqno, None).count()); + } + } Ok(()) } From 12e80fa5b7524c6e1ef9bf47666180fb69c3c11b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 23:37:04 +0200 Subject: [PATCH 388/613] restore some more tests --- src/run_scanner.rs | 19 +- src/segment/filter/blocked_bloom/builder.rs | 4 +- src/segment/filter/blocked_bloom/mod.rs | 190 +++++++------------- src/segment/filter/standard_bloom/mod.rs | 31 ++-- 4 files changed, 90 insertions(+), 154 deletions(-) diff --git a/src/run_scanner.rs b/src/run_scanner.rs index 14ec5532..e7a39bfa 100644 --- a/src/run_scanner.rs +++ b/src/run_scanner.rs @@ -70,9 +70,8 @@ mod tests { use crate::{AbstractTree, Slice}; use test_log::test; - // TODO: restore - /* #[test] - fn level_scanner_basic() -> crate::Result<()> { + #[test] + fn run_scanner_basic() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; @@ -91,21 +90,19 @@ mod tests { } let segments = tree - .version + .manifest .read() .expect("lock is poisoned") + .current_version() .iter_segments() .cloned() .collect::>(); - let level = Arc::new(Level { - segments, - is_disjoint: true, - }); + let level = Arc::new(Run::new(segments)); #[allow(clippy::unwrap_used)] { - let multi_reader = RunScanner::from_indexes(level.clone(), (None, None))?; + let multi_reader = RunScanner::culled(level.clone(), (None, None))?; let mut iter = multi_reader.flatten(); @@ -125,7 +122,7 @@ mod tests { #[allow(clippy::unwrap_used)] { - let multi_reader = RunScanner::from_indexes(level.clone(), (Some(1), None))?; + let multi_reader = RunScanner::culled(level, (Some(1), None))?; let mut iter = multi_reader.flatten(); @@ -141,5 +138,5 @@ mod tests { } Ok(()) - } */ + } } diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 6fd83945..48ce08cd 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -18,10 +18,10 @@ pub struct Builder { inner: BitArrayBuilder, /// Number of hash functions - k: usize, + pub(crate) k: usize, /// Number of blocks in the blocked bloom filter - num_blocks: usize, + pub(crate) num_blocks: usize, } #[allow(clippy::len_without_is_empty)] diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 70d50234..9f6d9c92 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -30,10 +30,10 @@ pub struct BlockedBloomFilterReader<'a> { inner: BitArrayReader<'a>, /// Number of hash functions - k: usize, + pub(crate) k: usize, /// Number of blocks in the blocked bloom filter - num_blocks: usize, + pub(crate) num_blocks: usize, } impl<'a> BlockedBloomFilterReader<'a> { @@ -117,6 +117,7 @@ impl<'a> BlockedBloomFilterReader<'a> { } /// Gets the hash of a key. 
+ #[must_use] pub fn get_hash(key: &[u8]) -> u64 { Builder::get_hash(key) } @@ -130,137 +131,66 @@ impl<'a> BlockedBloomFilterReader<'a> { } } -// impl<'a> Encode for BlockedBloomFilter<'a> { -// fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { -// // Write header -// writer.write_all(&MAGIC_BYTES)?; - -// writer.write_u8(BloomFilterType::BlockedBloom as u8)?; - -// // NOTE: Hash type (unused) -// writer.write_u8(0)?; - -// writer.write_u64::(self.num_blocks as u64)?; -// writer.write_u64::(self.k as u64)?; -// writer.write_all(self.inner.bytes())?; - -// Ok(()) -// } -// } - -// impl<'a> BlockedBloomFilter<'a> { -// // To be used by AMQFilter after magic bytes and filter type have been read and parsed -// pub(super) fn decode_from(reader: &mut R) -> Result { -// // NOTE: Hash type (unused) -// let hash_type = reader.read_u8()?; -// assert_eq!(0, hash_type, "Invalid bloom hash type"); - -// let num_blocks = reader.read_u64::()? as usize; -// let k = reader.read_u64::()? as usize; - -// let mut bytes = vec![0; num_blocks * CACHE_LINE_BYTES]; -// reader.read_exact(&mut bytes)?; - -// Ok(AMQFilter::BlockedBloom(Self::from_raw( -// num_blocks, -// k, -// bytes.into(), -// ))) -// } - -// fn from_raw(num_blocks: usize, k: usize, slice: crate::Slice) -> Self { -// Self { -// inner: BitArrayReader::new(slice), -// k, -// num_blocks, -// } -// } - -// /// Gets the hash of a key. -// pub fn get_hash(key: &[u8]) -> CompositeHash { -// Builder::get_hash(key) -// } -// } - #[cfg(test)] mod tests { use super::*; - use std::fs::File; use test_log::test; - // #[test] - // fn blocked_bloom_serde_round_trip() -> crate::Result<()> { - // let dir = tempfile::tempdir()?; - - // let path = dir.path().join("bf"); - // let mut file = File::create(&path)?; - - // let mut filter = Builder::with_fp_rate(10, 0.0001); - - // let keys = &[ - // b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", - // b"item8", b"item9", - // ]; - - // for key in keys { - // filter.set_with_hash(BlockedBloomFilter::get_hash(*key)); - // } - - // let filter = filter.build(); - - // for key in keys { - // assert!(filter.contains(&**key)); - // } - // assert!(!filter.contains(b"asdasads")); - // assert!(!filter.contains(b"item10")); - // assert!(!filter.contains(b"cxycxycxy")); - - // filter.encode_into(&mut file)?; - // file.sync_all()?; - // drop(file); - - // let mut file = File::open(&path)?; - // let filter_copy = AMQFilterBuilder::decode_from(&mut file)?; - - // assert_eq!(filter.inner.bytes(), filter_copy.bytes()); - // assert!(matches!(filter_copy, AMQFilter::BlockedBloom(_))); - - // for key in keys { - // assert!(filter.contains(&**key)); - // } - // assert!(!filter_copy.contains(b"asdasads")); - // assert!(!filter_copy.contains(b"item10")); - // assert!(!filter_copy.contains(b"cxycxycxy")); - - // Ok(()) - // } - - // #[test] - // fn blocked_bloom_basic() { - // let mut filter = Builder::with_fp_rate(10, 0.0001); - // let keys = [ - // b"item0" as &[u8], - // b"item1", - // b"item2", - // b"item3", - // b"item4", - // b"item5", - // b"item6", - // b"item7", - // b"item8", - // b"item9", - // ]; - - // for key in &keys { - // filter.set_with_hash(Builder::get_hash(key)); - // } - - // let filter = filter.build(); - - // for key in &keys { - // assert!(filter.contains(key)); - // } - - // assert!(!filter.contains(b"asdasdasdasdasdasdasd")); - // } + #[test] + fn filter_bloom_blocked_serde_round_trip() -> crate::Result<()> { + let mut filter = Builder::with_fp_rate(10, 0.0001); + + let 
keys = &[ + b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7", + b"item8", b"item9", + ]; + + for key in keys { + filter.set_with_hash(BlockedBloomFilterReader::get_hash(*key)); + } + + let filter_bytes = filter.build(); + let filter_copy = BlockedBloomFilterReader::new(&filter_bytes)?; + + assert_eq!(filter.k, filter_copy.k); + assert_eq!(filter.num_blocks, filter_copy.num_blocks); + assert!(!filter_copy.contains(b"asdasads")); + assert!(!filter_copy.contains(b"item10")); + assert!(!filter_copy.contains(b"cxycxycxy")); + + Ok(()) + } + + #[test] + fn filter_bloom_blocked_basic() -> crate::Result<()> { + let mut filter = Builder::with_fp_rate(10, 0.0001); + + let keys = [ + b"item0" as &[u8], + b"item1", + b"item2", + b"item3", + b"item4", + b"item5", + b"item6", + b"item7", + b"item8", + b"item9", + ]; + + for key in &keys { + filter.set_with_hash(Builder::get_hash(key)); + } + + let filter_bytes = filter.build(); + let filter = BlockedBloomFilterReader::new(&filter_bytes)?; + + for key in &keys { + assert!(filter.contains(key)); + } + + assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + + Ok(()) + } } diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index b505a949..fd6691e9 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -122,13 +122,12 @@ impl<'a> StandardBloomFilterReader<'a> { } #[cfg(test)] -#[allow(clippy::unwrap_used)] mod tests { use super::*; use test_log::test; #[test] - fn bloom_serde_round_trip() { + fn filter_bloom_standard_serde_round_trip() -> crate::Result<()> { let mut filter = Builder::with_fp_rate(10, 0.0001); let keys = &[ @@ -141,17 +140,19 @@ mod tests { } let filter_bytes = filter.build(); - let filter_copy = StandardBloomFilterReader::new(&filter_bytes).unwrap(); + let filter_copy = StandardBloomFilterReader::new(&filter_bytes)?; assert_eq!(filter.k, filter_copy.k); assert_eq!(filter.m, filter_copy.m); assert!(!filter_copy.contains(b"asdasads")); assert!(!filter_copy.contains(b"item10")); assert!(!filter_copy.contains(b"cxycxycxy")); + + Ok(()) } #[test] - fn bloom_basic() { + fn filter_bloom_standard_basic() -> crate::Result<()> { let mut filter = Builder::with_fp_rate(10, 0.0001); let keys = [ @@ -172,17 +173,19 @@ mod tests { } let filter_bytes = filter.build(); - let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); + let filter = StandardBloomFilterReader::new(&filter_bytes)?; for key in &keys { assert!(filter.contains(key)); } assert!(!filter.contains(b"asdasdasdasdasdasdasd")); + + Ok(()) } #[test] - fn bloom_bpk() { + fn filter_bloom_standard_bpk() -> crate::Result<()> { let item_count = 1_000; let bpk = 5; @@ -195,7 +198,7 @@ mod tests { } let filter_bytes = filter.build(); - let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); + let filter = StandardBloomFilterReader::new(&filter_bytes)?; let mut false_positives = 0; @@ -210,10 +213,12 @@ mod tests { #[allow(clippy::cast_precision_loss)] let fpr = false_positives as f32 / item_count as f32; assert!(fpr < 0.13); + + Ok(()) } #[test] - fn bloom_fpr() { + fn filter_bloom_standard_fpr() -> crate::Result<()> { let item_count = 100_000; let wanted_fpr = 0.1; @@ -226,7 +231,7 @@ mod tests { } let filter_bytes = filter.build(); - let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); + let filter = StandardBloomFilterReader::new(&filter_bytes)?; let mut false_positives = 0; @@ -242,10 +247,12 @@ mod tests { let fpr = false_positives 
as f32 / item_count as f32; assert!(fpr > 0.05); assert!(fpr < 0.13); + + Ok(()) } #[test] - fn bloom_fpr_2() { + fn filter_bloom_standard_fpr_2() -> crate::Result<()> { let item_count = 100_000; let wanted_fpr = 0.5; @@ -258,7 +265,7 @@ mod tests { } let filter_bytes = filter.build(); - let filter = StandardBloomFilterReader::new(&filter_bytes).unwrap(); + let filter = StandardBloomFilterReader::new(&filter_bytes)?; let mut false_positives = 0; @@ -274,5 +281,7 @@ mod tests { let fpr = false_positives as f32 / item_count as f32; assert!(fpr > 0.45); assert!(fpr < 0.55); + + Ok(()) } } From 1f26645bb498ab4d8a354304561282dd64f2a3be Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 13 Sep 2025 23:39:08 +0200 Subject: [PATCH 389/613] refactor --- src/level_manifest/mod.rs | 67 +++++----------------------------- src/version/mod.rs | 75 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 59 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 3a9de09e..3493b9be 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -5,7 +5,7 @@ pub(crate) mod hidden_set; use crate::{ - coding::DecodeError, + coding::{DecodeError, Encode}, file::{fsync_directory, rewrite_atomic, MAGIC_BYTES}, segment::Segment, version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT}, @@ -142,6 +142,7 @@ impl LevelManifest { Ok(manifest) } + // TODO: move into Version::decode pub(crate) fn load_version(path: &Path) -> crate::Result>>> { let mut level_manifest = Cursor::new(std::fs::read(path)?); @@ -280,32 +281,7 @@ impl LevelManifest { let file = std::fs::File::create_new(folder.join(format!("v{}", version.id())))?; let mut writer = BufWriter::new(file); - // Magic - writer.write_all(&MAGIC_BYTES)?; - - // Level count - // NOTE: We know there are always less than 256 levels - #[allow(clippy::cast_possible_truncation)] - writer.write_u8(version.level_count() as u8)?; - - for level in version.iter_levels() { - // Run count - // NOTE: We know there are always less than 256 runs - #[allow(clippy::cast_possible_truncation)] - writer.write_u8(level.len() as u8)?; - - for run in level.iter() { - // Segment count - // NOTE: We know there are always less than 4 billion segments in a run - #[allow(clippy::cast_possible_truncation)] - writer.write_u32::(run.len() as u32)?; - - // Segment IDs - for id in run.iter().map(Segment::id) { - writer.write_u64::(id)?; - } - } - } + version.encode_into(&mut writer)?; writer.flush()?; writer.get_mut().sync_all()?; @@ -447,13 +423,13 @@ mod tests { use crate::{ coding::Encode, level_manifest::{hidden_set::HiddenSet, LevelManifest}, + version::Version, AbstractTree, }; + use std::collections::VecDeque; use test_log::test; - // TODO: restore - /* #[test] - #[ignore] + #[test] fn level_manifest_atomicity() -> crate::Result<()> { let folder = tempfile::tempdir()?; @@ -479,12 +455,12 @@ mod tests { // NOTE: Purposefully change level manifest to have invalid path // to force an I/O error - tree.levels.write().expect("lock is poisoned").path = "/invaliiid/asd".into(); + tree.manifest.write().expect("lock is poisoned").folder = "/invaliiid/asd".into(); assert!(tree.major_compact(u64::MAX, 4).is_err()); assert!(tree - .levels + .manifest .read() .expect("lock is poisoned") .hidden_set @@ -493,30 +469,5 @@ mod tests { assert_eq!(segment_count_before_major_compact, tree.segment_count()); Ok(()) - } */ - - /* #[test] - fn level_manifest_raw_empty() -> crate::Result<()> { - let manifest = LevelManifest { - hidden_set: 
HiddenSet::default(), - levels: Vec::default(), - path: "a".into(), - is_disjoint: false, - }; - - let bytes = Runs(&manifest.deep_clone()).encode_into_vec(); - - #[rustfmt::skip] - let raw = &[ - // Magic - b'L', b'S', b'M', 3, - - // Count - 0, - ]; - - assert_eq!(bytes, raw); - - Ok(()) - } */ + } } diff --git a/src/version/mod.rs b/src/version/mod.rs index a22f4f54..836db56a 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -7,7 +7,7 @@ pub mod run; pub use run::Run; -use crate::{HashSet, KeyRange, Segment, SegmentId, SeqNo}; +use crate::{coding::Encode, HashSet, KeyRange, Segment, SegmentId, SeqNo}; use optimize::optimize_runs; use run::Ranged; use std::{ops::Deref, sync::Arc}; @@ -360,3 +360,76 @@ impl Version { } } } + +impl Encode for Version { + fn encode_into(&self, writer: &mut W) -> Result<(), crate::EncodeError> { + use crate::file::MAGIC_BYTES; + use byteorder::{LittleEndian, WriteBytesExt}; + + // Magic + writer.write_all(&MAGIC_BYTES)?; + + // Level count + // NOTE: We know there are always less than 256 levels + #[allow(clippy::cast_possible_truncation)] + writer.write_u8(self.level_count() as u8)?; + + for level in self.iter_levels() { + // Run count + // NOTE: We know there are always less than 256 runs + #[allow(clippy::cast_possible_truncation)] + writer.write_u8(level.len() as u8)?; + + for run in level.iter() { + // Segment count + // NOTE: We know there are always less than 4 billion segments in a run + #[allow(clippy::cast_possible_truncation)] + writer.write_u32::(run.len() as u32)?; + + // Segment IDs + for id in run.iter().map(Segment::id) { + writer.write_u64::(id)?; + } + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + #[test] + fn version_encode_empty() { + let bytes = Version::new(0).encode_into_vec(); + + #[rustfmt::skip] + let raw = &[ + // Magic + b'L', b'S', b'M', 3, + + // Level count + 7, + + // L0 runs + 0, + // L1 runs + 0, + // L2 runs + 0, + // L3 runs + 0, + // L4 runs + 0, + // L5 runs + 0, + // L6 runs + 0, + ]; + + assert_eq!(bytes, raw); + } +} From 21d8398c4f03f581752fce18af87b989950a86c4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 17 Sep 2025 15:18:45 +0200 Subject: [PATCH 390/613] doc --- src/abstract.rs | 2 +- src/compaction/fifo.rs | 2 +- src/config.rs | 8 ++------ src/level_manifest/mod.rs | 6 +++--- src/lib.rs | 2 +- src/memtable/mod.rs | 2 +- src/segment/block/decoder.rs | 2 +- src/segment/block/hash_index/builder.rs | 2 +- src/segment/block/hash_index/reader.rs | 4 ++-- src/segment/block/trailer.rs | 2 +- src/segment/block_index/mod.rs | 3 +-- src/segment/data_block/mod.rs | 2 +- src/segment/filter/bit_array/builder.rs | 3 +++ src/segment/filter/bit_array/reader.rs | 2 ++ src/segment/filter/blocked_bloom/builder.rs | 2 ++ src/segment/filter/blocked_bloom/mod.rs | 2 +- src/segment/filter/standard_bloom/mod.rs | 2 +- src/segment/index_block/mod.rs | 2 +- src/segment/mod.rs | 2 +- src/segment/multi_writer.rs | 6 ++++++ src/segment/trailer.rs | 2 +- src/version/mod.rs | 9 ++++++++- src/vlog/blob_file/gc_stats.rs | 2 +- 23 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 8b35c36e..3052b7d5 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -129,7 +129,7 @@ pub trait AbstractTree { /// after tree recovery. fn set_active_memtable(&self, memtable: Memtable); - /// Returns the amount of sealed memtables. + /// Returns the number of sealed memtables. fn sealed_memtable_count(&self) -> usize; /// Adds a sealed memtables. 
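As a companion to the `Version::encode_into` implementation added in PATCH 389 above, the matching read side (currently `LevelManifest::load_version`, slated to move into a `Version::decode`) walks the same layout: magic bytes, level count, then per level a run count, and per run a `u32` segment count followed by `u64` segment IDs. A minimal stand-alone sketch, using the same little-endian `byteorder` calls as the encoder; the function name and the nested-`Vec` return type are illustrative only:

use byteorder::{LittleEndian, ReadBytesExt};
use std::io::Read;

/// Sketch: decodes the layout written by `Version::encode_into` into segment IDs,
/// grouped as levels -> runs -> IDs.
fn decode_version<R: Read>(reader: &mut R) -> std::io::Result<Vec<Vec<Vec<u64>>>> {
    let mut magic = [0u8; 4];
    reader.read_exact(&mut magic)?; // expected to be b"LSM\x03" per the test above

    let level_count = reader.read_u8()?;
    let mut levels = Vec::with_capacity(usize::from(level_count));

    for _ in 0..level_count {
        let run_count = reader.read_u8()?;
        let mut runs = Vec::with_capacity(usize::from(run_count));

        for _ in 0..run_count {
            let segment_count = reader.read_u32::<LittleEndian>()?;
            let ids = (0..segment_count)
                .map(|_| reader.read_u64::<LittleEndian>())
                .collect::<std::io::Result<Vec<_>>>()?;
            runs.push(ids);
        }

        levels.push(runs);
    }

    Ok(levels)
}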
diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 1ac62e38..0d6cc78b 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -10,7 +10,7 @@ use crate::{config::Config, level_manifest::LevelManifest, HashSet}; /// Limits the tree size to roughly `limit` bytes, deleting the oldest segment(s) /// when the threshold is reached. /// -/// Will also merge segments if the amount of segments in level 0 grows too much, which +/// Will also merge segments if the number of segments in level 0 grows too much, which /// could cause write stalls. /// /// Additionally, a (lazy) TTL can be configured to drop old segments. diff --git a/src/config.rs b/src/config.rs index a39d1200..7ff4a83a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -70,7 +70,7 @@ pub struct Config { /// Block size of index blocks pub index_block_size: u32, - /// Amount of levels of the LSM tree (depth of tree) + /// Number of levels of the LSM tree (depth of tree) pub level_count: u8, /// Bits per key for levels that are not L0, L1, L2 @@ -199,7 +199,7 @@ impl Config { self } - /// Sets the amount of levels of the LSM tree (depth of tree). + /// Sets the number of levels of the LSM tree (depth of tree). /// /// Defaults to 7, like `LevelDB` and `RocksDB`. /// @@ -233,9 +233,7 @@ impl Config { pub fn data_block_size(mut self, block_size: u32) -> Self { assert!(block_size >= 1_024); assert!(block_size <= 512 * 1_024); - self.data_block_size = block_size; - self } @@ -256,9 +254,7 @@ impl Config { pub fn index_block_size(mut self, block_size: u32) -> Self { assert!(block_size >= 1_024); assert!(block_size <= 512 * 1_024); - self.index_block_size = block_size; - self } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 3493b9be..731c31d3 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -346,7 +346,7 @@ impl LevelManifest { self.len() == 0 } - /// Returns the amount of levels in the tree + /// Returns the number of levels in the tree #[must_use] pub fn level_count(&self) -> u8 { // NOTE: Level count is u8 @@ -356,13 +356,13 @@ impl LevelManifest { } } - /// Returns the amount of levels in the tree. + /// Returns the number of levels in the tree. #[must_use] pub fn last_level_index(&self) -> u8 { DEFAULT_LEVEL_COUNT - 1 } - /// Returns the amount of segments, summed over all levels + /// Returns the number of segments, summed over all levels #[must_use] pub fn len(&self) -> usize { self.current.segment_count() diff --git a/src/lib.rs b/src/lib.rs index 0130a84c..36b4968e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,7 +71,7 @@ //! tree.flush_active_memtable(0)?; //! //! // When some disk segments have amassed, use compaction -//! // to reduce the amount of disk segments +//! // to reduce the number of disk segments //! //! // Choose compaction strategy based on workload //! use lsm_tree::compaction::Leveled; diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index c54d53bd..d5026ab0 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -100,7 +100,7 @@ impl Memtable { .load(std::sync::atomic::Ordering::Acquire) } - /// Counts the amount of items in the memtable. + /// Counts the number of items in the memtable. 
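(Aside on the `data_block_size`/`index_block_size` setters in the config.rs hunk above: both enforce the same bounds before storing the value. A minimal standalone restatement of that check, with a hypothetical helper name:)

    /// Mirrors the bounds asserted by `data_block_size` and `index_block_size`:
    /// at least 1 KiB, at most 512 KiB.
    fn assert_valid_block_size(block_size: u32) {
        assert!(block_size >= 1_024);
        assert!(block_size <= 512 * 1_024);
    }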
pub fn len(&self) -> usize { self.items.len() } diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 80875b86..9cbd8dd7 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -138,7 +138,7 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa &self.block.data } - /// Returns the amount of items in the block. + /// Returns the number of items in the block. #[must_use] #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { diff --git a/src/segment/block/hash_index/builder.rs b/src/segment/block/hash_index/builder.rs index 4ec1e3a1..b90b35d9 100644 --- a/src/segment/block/hash_index/builder.rs +++ b/src/segment/block/hash_index/builder.rs @@ -13,7 +13,7 @@ pub const MAX_POINTERS_FOR_HASH_INDEX: usize = 254; pub struct Builder(Vec); impl Builder { - /// Initializes a new builder with the given amount of buckets. + /// Initializes a new builder with the given number of buckets. #[must_use] pub fn with_bucket_count(bucket_count: u32) -> Self { Self(vec![MARKER_FREE; bucket_count as usize]) diff --git a/src/segment/block/hash_index/reader.rs b/src/segment/block/hash_index/reader.rs index 7e696fc7..a4b27a31 100644 --- a/src/segment/block/hash_index/reader.rs +++ b/src/segment/block/hash_index/reader.rs @@ -28,14 +28,14 @@ impl<'a> Reader<'a> { // NOTE: Only used in metrics, so no need to be hyper-optimized #[allow(clippy::naive_bytecount)] - /// Returns the amount of empty slots in the hash index. + /// Returns the number of empty slots in the hash index. #[must_use] pub fn free_count(&self) -> usize { self.0.iter().filter(|&&byte| byte == MARKER_FREE).count() } // NOTE: Only used in metrics, so no need to be hyper-optimized - /// Returns the amount of conflict markers in the hash index. + /// Returns the number of conflict markers in the hash index. #[must_use] #[allow(clippy::naive_bytecount)] pub fn conflict_count(&self) -> usize { diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 1ac0cae5..878d2c42 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -33,7 +33,7 @@ impl<'a> Trailer<'a> { self.block.data.len() - TRAILER_SIZE } - /// Returns the amount of items in the block + /// Returns the number of items in the block #[must_use] pub fn item_count(&self) -> usize { let mut reader = self.as_slice(); diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 013837a9..99ade210 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -5,8 +5,7 @@ pub(crate) mod iter; use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; -use crate::{segment::block::ParsedItem, Cache}; -use std::sync::Arc; +use crate::segment::block::ParsedItem; #[enum_dispatch::enum_dispatch] pub trait BlockIndex { diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 8c6de45b..f62a15d2 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -443,7 +443,7 @@ impl DataBlock { unwrap!(reader.read_u32::()) } - /// Returns the amount of items in the block. + /// Returns the number of items in the block. 
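(Aside: `free_count` and `conflict_count` in the hash index reader above differ only in the marker byte they scan for. A shared helper could look like the following sketch; `count_marker_bytes` is hypothetical and simply restates the iterator chain from the patch:)

    /// Counts how many hash buckets hold the given marker byte
    /// (e.g. `MARKER_FREE` for empty slots).
    fn count_marker_bytes(buckets: &[u8], marker: u8) -> usize {
        buckets.iter().filter(|&&byte| byte == marker).count()
    }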
#[must_use] #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { diff --git a/src/segment/filter/bit_array/builder.rs b/src/segment/filter/bit_array/builder.rs index 1065f8df..d4914a57 100644 --- a/src/segment/filter/bit_array/builder.rs +++ b/src/segment/filter/bit_array/builder.rs @@ -35,6 +35,9 @@ impl Builder { /// Sets the i-th bit pub fn enable_bit(&mut self, idx: usize) { let byte_idx = idx / 8; + + // NOTE: We trust the caller + #[allow(clippy::expect_used)] let byte = self.0.get_mut(byte_idx).expect("should be in bounds"); let bit_idx = idx % 8; diff --git a/src/segment/filter/bit_array/reader.rs b/src/segment/filter/bit_array/reader.rs index 10fec004..32671be8 100644 --- a/src/segment/filter/bit_array/reader.rs +++ b/src/segment/filter/bit_array/reader.rs @@ -32,6 +32,8 @@ impl<'a> BitArrayReader<'a> { pub fn get(&self, idx: usize) -> bool { let byte_idx = idx / 8; + // NOTE: We trust the caller + #[allow(clippy::expect_used)] let byte = self.0.get(byte_idx).expect("should be in bounds"); let bit_idx = idx % 8; diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs index 48ce08cd..c0f1204c 100644 --- a/src/segment/filter/blocked_bloom/builder.rs +++ b/src/segment/filter/blocked_bloom/builder.rs @@ -26,6 +26,8 @@ pub struct Builder { #[allow(clippy::len_without_is_empty)] impl Builder { + // NOTE: We write into a Vec, so no I/O error can happen + #[allow(clippy::expect_used)] #[must_use] pub fn build(&self) -> Vec { let mut v = vec![]; diff --git a/src/segment/filter/blocked_bloom/mod.rs b/src/segment/filter/blocked_bloom/mod.rs index 9f6d9c92..cdb565e5 100644 --- a/src/segment/filter/blocked_bloom/mod.rs +++ b/src/segment/filter/blocked_bloom/mod.rs @@ -19,7 +19,7 @@ const CACHE_LINE_BYTES: usize = 64; /// A blocked bloom filter /// /// Allows buffering the key hashes before actual filter construction -/// which is needed to properly calculate the filter size, as the amount of items +/// which is needed to properly calculate the filter size, as the number of items /// are unknown during segment construction. /// /// The filter uses double hashing instead of `k` hash functions, see: diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index fd6691e9..645d35e0 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -17,7 +17,7 @@ use std::io::{Cursor, Read}; /// A standard bloom filter /// /// Allows buffering the key hashes before actual filter construction -/// which is needed to properly calculate the filter size, as the amount of items +/// which is needed to properly calculate the filter size, as the number of items /// are unknown during segment construction. /// /// The filter uses double hashing instead of `k` hash functions, see: diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 59068857..dc6135a2 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -69,7 +69,7 @@ impl IndexBlock { Self { inner } } - /// Returns the amount of items in the block. + /// Returns the number of items in the block. #[must_use] #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { diff --git a/src/segment/mod.rs b/src/segment/mod.rs index dc3db542..00a89068 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -506,7 +506,7 @@ impl Segment { self.metadata.seqnos.1 } - /// Returns the amount of tombstone markers in the `Segment`. 
+ /// Returns the number of tombstone markers in the `Segment`. #[must_use] #[doc(hidden)] pub fn tombstone_count(&self) -> u64 { diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 47ee960e..ba494eda 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -179,7 +179,10 @@ mod tests { // NOTE: Tests that versions of the same key stay // in the same segment even if it needs to be rotated + // // This avoids segments' key ranges overlapping + // + // http://github.com/fjall-rs/lsm-tree/commit/f46b6fe26a1e90113dc2dbb0342db160a295e616 #[test] fn segment_multi_writer_same_key_norotate() -> crate::Result<()> { let folder = tempfile::tempdir()?; @@ -202,6 +205,9 @@ mod tests { Ok(()) } + // NOTE: Follow-up fix for non-disjoint output + // + // https://github.com/fjall-rs/lsm-tree/commit/1609a57c2314420b858d826790ecd1442aa76720 #[test] fn segment_multi_writer_same_key_norotate_2() -> crate::Result<()> { let folder = tempfile::tempdir()?; diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs index 6cc0ff36..b210683c 100644 --- a/src/segment/trailer.rs +++ b/src/segment/trailer.rs @@ -36,7 +36,7 @@ const TRAILER_SIZE: usize = 32; /// | trailer | <- fixed size /// |--------------| /// -/// Through this indirection, we can have a variable amount of region block handles. +/// Through this indirection, we can have a variable number of region block handles. #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub struct Trailer { regions_block_handle: BlockHandle, diff --git a/src/version/mod.rs b/src/version/mod.rs index 836db56a..b3c807c7 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -133,17 +133,24 @@ impl Level { } pub struct VersionInner { + /// The version's ID id: VersionId, + /// The individual LSM-tree levels which consist of runs of tables pub(crate) levels: Vec, } -/// A version is a point-in-time view of a tree's structure +/// A version is an immutable, point-in-time view of a tree's structure /// /// Any time a segment is created or deleted, a new version is created. #[derive(Clone)] pub struct Version { inner: Arc, + + /// The sequence number at the time the version was installed + /// + /// We keep all versions that have `seqno_watermark` > `mvcc_watermark` to prevent + /// snapshots losing data pub(crate) seqno_watermark: SeqNo, } diff --git a/src/vlog/blob_file/gc_stats.rs b/src/vlog/blob_file/gc_stats.rs index bfaf5093..031248cc 100644 --- a/src/vlog/blob_file/gc_stats.rs +++ b/src/vlog/blob_file/gc_stats.rs @@ -21,7 +21,7 @@ impl GcStats { .store(x, std::sync::atomic::Ordering::Release); } - /// Returns the amount of dead items in the blob file. + /// Returns the number of dead items in the blob file. 
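(Aside: the `GcStats` counters read with `Acquire` and, as the `store(x, ... Release)` context above shows, are written with `Release`, so a reader that observes a store also observes everything written before it. A minimal sketch of that pairing, with a hypothetical `StaleCounter` type:)

    use std::sync::atomic::{AtomicU64, Ordering};

    #[derive(Default)]
    struct StaleCounter(AtomicU64);

    impl StaleCounter {
        // Writers publish with Release...
        fn set(&self, x: u64) {
            self.0.store(x, Ordering::Release);
        }

        // ...readers observe with Acquire.
        fn get(&self) -> u64 {
            self.0.load(Ordering::Acquire)
        }
    }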
pub fn stale_items(&self) -> u64 { self.stale_items.load(std::sync::atomic::Ordering::Acquire) } From 60d1075233aaf3c7d1b4c124b05a2937be21d8ac Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 15:50:06 +0200 Subject: [PATCH 391/613] test: block reader roundtrips --- src/segment/block/mod.rs | 60 ++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index e35cc2f1..7da550b6 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -93,10 +93,8 @@ impl Block { let mut builder = unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; - { - lz4_flex::decompress_into(&raw_data, &mut builder) - .map_err(|_| crate::Error::Decompress(compression))?; - } + lz4_flex::decompress_into(&raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(compression))?; builder.freeze().into() } @@ -184,10 +182,8 @@ impl Block { let mut builder = unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; - { - lz4_flex::decompress_into(raw_data, &mut builder) - .map_err(|_| crate::Error::Decompress(compression))?; - } + lz4_flex::decompress_into(raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(compression))?; builder.freeze() } @@ -215,3 +211,51 @@ impl Block { }) } } + +#[cfg(test)] +mod tests { + use super::*; + use test_log::test; + + // TODO: Block::from_file roundtrips + + #[test] + fn block_roundtrip_uncompressed() -> crate::Result<()> { + let mut writer = vec![]; + + Block::write_into( + &mut writer, + b"abcdefabcdefabcdef", + BlockType::Data, + CompressionType::None, + )?; + + { + let mut reader = &writer[..]; + let block = Block::from_reader(&mut reader, CompressionType::None)?; + assert_eq!(b"abcdefabcdefabcdef", &*block.data); + } + + Ok(()) + } + #[test] + #[cfg(feature = "lz4")] + fn block_roundtrip_lz4() -> crate::Result<()> { + let mut writer = vec![]; + + Block::write_into( + &mut writer, + b"abcdefabcdefabcdef", + BlockType::Data, + CompressionType::Lz4, + )?; + + { + let mut reader = &writer[..]; + let block = Block::from_reader(&mut reader, CompressionType::Lz4)?; + assert_eq!(b"abcdefabcdefabcdef", &*block.data); + } + + Ok(()) + } +} From e3280eaace80327276be6b604627ff5c13c4d4d3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 15:51:56 +0200 Subject: [PATCH 392/613] doc --- src/vlog/gc/report.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vlog/gc/report.rs b/src/vlog/gc/report.rs index 0b8a9b7f..890d4b5c 100644 --- a/src/vlog/gc/report.rs +++ b/src/vlog/gc/report.rs @@ -23,10 +23,10 @@ pub struct GcReport { /// Amount of bytes that could be freed pub stale_bytes: u64, - /// Amount of stored blobs + /// Number of stored blobs pub total_blobs: u64, - /// Amount of blobs that could be freed + /// Number of blobs that could be freed pub stale_blobs: u64, } From aab0c68f58af0c4e12ef8cda14e93915bbe1fcbd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 16:52:52 +0200 Subject: [PATCH 393/613] refactor: remove old InternalKey code --- src/key.rs | 78 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 75 deletions(-) diff --git a/src/key.rs b/src/key.rs index 141c6657..d1520600 100644 --- a/src/key.rs +++ b/src/key.rs @@ -2,16 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{ - coding::{Decode, DecodeError, Encode, EncodeError}, - 
SeqNo, UserKey, ValueType, -}; -use byteorder::{ReadBytesExt, WriteBytesExt}; -use std::{ - cmp::Reverse, - io::{Read, Write}, -}; -use varint_rs::{VarintReader, VarintWriter}; +use crate::{SeqNo, UserKey, ValueType}; +use std::cmp::Reverse; #[derive(Clone, Eq)] #[allow(clippy::module_name_repetitions)] @@ -21,12 +13,6 @@ pub struct InternalKey { pub value_type: ValueType, } -/* impl<'a> From<&InternalKeyRef<'a>> for InternalKey { - fn from(value: &InternalKeyRef<'a>) -> Self { - Self::new(value.user_key, value.seqno, value.value_type) - } -} */ - impl AsRef<[u8]> for InternalKey { fn as_ref(&self) -> &[u8] { &self.user_key @@ -67,7 +53,7 @@ impl InternalKey { let user_key = user_key.into(); assert!( - user_key.len() <= u16::MAX.into(), + u16::try_from(user_key.len()).is_ok(), "keys can be 65535 bytes in length", ); @@ -97,61 +83,3 @@ impl Ord for InternalKey { (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) } } - -// TODO: wait for new crossbeam-skiplist -// TODO: https://github.com/crossbeam-rs/crossbeam/pull/1162 -// -// impl Equivalent> for InternalKey { -// fn equivalent(&self, other: &InternalKeyRef<'_>) -> bool { -// self.user_key == other.user_key && self.seqno == other.seqno -// } -// } - -// impl Comparable> for InternalKey { -// fn compare(&self, other: &InternalKeyRef<'_>) -> std::cmp::Ordering { -// (&*self.user_key, Reverse(self.seqno)).cmp(&(other.user_key, Reverse(other.seqno))) -// } -// } - -/* /// Temporary internal key without heap allocation -#[derive(Clone, Debug, Eq)] -pub struct InternalKeyRef<'a> { - pub user_key: &'a [u8], - pub seqno: SeqNo, - pub value_type: ValueType, -} - -impl<'a> AsRef<[u8]> for InternalKeyRef<'a> { - fn as_ref(&self) -> &[u8] { - self.user_key - } -} - -impl<'a> InternalKeyRef<'a> { - // Constructor for InternalKeyRef - pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { - InternalKeyRef { - user_key, - seqno, - value_type, - } - } -} - -impl<'a> PartialEq for InternalKeyRef<'a> { - fn eq(&self, other: &Self) -> bool { - self.user_key == other.user_key && self.seqno == other.seqno - } -} - -impl<'a> PartialOrd for InternalKeyRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'a> Ord for InternalKeyRef<'a> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) - } -} */ From acd5da4086c66e2c573b9db7e3be9997f0d73caf Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 16:53:31 +0200 Subject: [PATCH 394/613] check FilterBlock type when loading pinned filter --- src/segment/mod.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 00a89068..e1a8db7b 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -431,7 +431,7 @@ impl Segment { BlockIndexImpl::VolatileFull }; - // TODO: load FilterBlock, check block type + // TODO: FilterBlock newtype let pinned_filter_block = if pin_filter { regions .filter @@ -445,6 +445,16 @@ impl Segment { filter_handle, crate::CompressionType::None, // NOTE: We never write a filter block with compression ) + .and_then(|block| { + if block.header.block_type == BlockType::Filter { + Ok(block) + } else { + Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( + "BlockType", + block.header.block_type.into(), + )))) + } + }) }) .transpose()? 
} else {

From 89a58b7ee875432da015f521053385c5a9212d03 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 18 Sep 2025 17:58:26 +0200
Subject: [PATCH 395/613] refactor: metrics struct

---
 src/metrics.rs           | 81 ++++++++++++++++++++++++++++++-------
 src/segment/block/mod.rs |  1 +
 src/segment/mod.rs       | 14 +++----
 src/segment/util.rs      | 33 ++++++++++++---
 src/version/mod.rs       | 87 +++++++++++++++++++++++++++++++++++-----
 5 files changed, 179 insertions(+), 37 deletions(-)

diff --git a/src/metrics.rs b/src/metrics.rs
index 5c56fc90..c154e6a0 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -5,44 +5,95 @@ use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering::Relaxed;
 
+/// Runtime metrics
+///
+/// Metrics are not stored durably, so they reset after a restart or crash.
 #[derive(Debug, Default)]
 pub struct Metrics {
+    /// Number of index blocks that were actually read from disk
+    pub(crate) index_block_load_io: AtomicUsize,
+
+    /// Number of filter blocks that were actually read from disk
+    pub(crate) filter_block_load_io: AtomicUsize,
+
     /// Number of blocks that were actually read from disk
-    pub(crate) block_load_io: AtomicUsize,
+    pub(crate) data_block_load_io: AtomicUsize,
+
+    /// Number of index blocks that were read from block cache
+    pub(crate) index_block_load_cached: AtomicUsize,
+
+    /// Number of filter blocks that were read from block cache
+    pub(crate) filter_block_load_cached: AtomicUsize,
 
     /// Number of blocks that were read from block cache
-    pub(crate) block_load_cached: AtomicUsize,
+    pub(crate) data_block_load_cached: AtomicUsize,
 
-    /// Number of bloom filter queries that were performed
-    pub(crate) bloom_filter_queries: AtomicUsize,
+    /// Number of filter queries that were performed
+    pub(crate) filter_queries: AtomicUsize,
 
-    /// Number of IOs that were skipped due to bloom filter hits
-    pub(crate) bloom_filter_hits: AtomicUsize,
+    /// Number of IOs that were skipped due to filter
+    pub(crate) io_skipped_by_filter: AtomicUsize,
 }
 
 #[allow(clippy::cast_precision_loss)]
 impl Metrics {
-    /// Number of blocks that were read from disk.
+    /// Number of data blocks that were accessed.
+    pub fn data_block_loads(&self) -> usize {
+        self.data_block_load_cached.load(Relaxed) + self.data_block_load_io.load(Relaxed)
+    }
+
+    /// Number of index blocks that were accessed.
+    pub fn index_block_loads(&self) -> usize {
+        self.index_block_load_cached.load(Relaxed) + self.index_block_load_io.load(Relaxed)
+    }
+
+    /// Number of filter blocks that were accessed.
+    pub fn filter_block_loads(&self) -> usize {
+        self.filter_block_load_cached.load(Relaxed) + self.filter_block_load_io.load(Relaxed)
+    }
+
+    /// Number of blocks that were loaded from disk or OS page cache.
     pub fn block_loads_io(&self) -> usize {
-        self.block_load_io.load(Relaxed)
+        self.data_block_load_io.load(Relaxed)
+            + self.index_block_load_io.load(Relaxed)
+            + self.filter_block_load_io.load(Relaxed)
+    }
+
+    /// Number of blocks that were served from the block cache.
+    pub fn block_loads_cached(&self) -> usize {
+        self.data_block_load_cached.load(Relaxed)
+            + self.index_block_load_cached.load(Relaxed)
+            + self.filter_block_load_cached.load(Relaxed)
     }
 
     /// Number of blocks that were accessed.
     pub fn block_loads(&self) -> usize {
-        self.block_load_cached.load(Relaxed) + self.block_load_io.load(Relaxed)
+        self.block_loads_io() + self.block_loads_cached()
     }
 
     /// Block cache efficiency in percent (0.0 - 1.0).
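(Aside on the ratio helpers in this metrics hunk: the hit-rate function renamed just below divides cached loads by total loads, so it yields NaN before any block has been loaded. If that matters to a caller, a guarded variant is trivial; `safe_ratio` below is hypothetical, not part of the patch:)

    /// Ratio of `hits` to `total`, returning 0.0 instead of NaN when
    /// nothing has been counted yet.
    fn safe_ratio(hits: usize, total: usize) -> f64 {
        if total == 0 {
            0.0
        } else {
            hits as f64 / total as f64
        }
    }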
- pub fn block_cache_efficiency(&self) -> f64 { + pub fn block_cache_hit_rate(&self) -> f64 { let queries = self.block_loads() as f64; - let hits = self.block_load_cached.load(Relaxed) as f64; + let hits = self.block_loads_cached() as f64; hits / queries } /// Filter efficiency in percent (0.0 - 1.0). - pub fn bloom_filter_efficiency(&self) -> f64 { - let queries = self.bloom_filter_queries.load(Relaxed) as f64; - let hits = self.bloom_filter_hits.load(Relaxed) as f64; - hits / queries + /// + /// Represents the ratio of I/O operations avoided due to filter. + pub fn filter_efficiency(&self) -> f64 { + let queries = self.filter_queries.load(Relaxed) as f64; + let io_skipped = self.io_skipped_by_filter.load(Relaxed) as f64; + io_skipped / queries + } + + /// Number of filter queries performed. + pub fn filter_queries(&self) -> usize { + self.filter_queries.load(Relaxed) + } + + /// Number of I/O operations skipped by filter. + pub fn io_skipped_by_filter(&self) -> usize { + self.io_skipped_by_filter.load(Relaxed) } } diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 7da550b6..dfb34bfb 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -90,6 +90,7 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => { + #[warn(unsafe_code)] let mut builder = unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; diff --git a/src/segment/mod.rs b/src/segment/mod.rs index e1a8db7b..fa937a47 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -25,9 +25,6 @@ pub use index_block::{BlockHandle, IndexBlock, KeyedBlockHandle}; pub use scanner::Scanner; pub use writer::Writer; -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use crate::{ cache::Cache, descriptor_table::DescriptorTable, @@ -44,6 +41,9 @@ use std::{ }; use util::load_block; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + // TODO: segment iter: // TODO: we only need to truncate items from blocks that are not the first and last block // TODO: because any block inbetween must (trivially) only contain relevant items @@ -179,11 +179,11 @@ impl Segment { let filter = StandardBloomFilterReader::new(&block.data)?; #[cfg(feature = "metrics")] - self.metrics.bloom_filter_queries.fetch_add(1, Relaxed); + self.metrics.filter_queries.fetch_add(1, Relaxed); if !filter.contains_hash(key_hash) { #[cfg(feature = "metrics")] - self.metrics.bloom_filter_hits.fetch_add(1, Relaxed); + self.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); return Ok(None); } @@ -196,11 +196,11 @@ impl Segment { let filter = StandardBloomFilterReader::new(&block.data)?; #[cfg(feature = "metrics")] - self.metrics.bloom_filter_queries.fetch_add(1, Relaxed); + self.metrics.filter_queries.fetch_add(1, Relaxed); if !filter.contains_hash(key_hash) { #[cfg(feature = "metrics")] - self.metrics.bloom_filter_hits.fetch_add(1, Relaxed); + self.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); return Ok(None); } diff --git a/src/segment/util.rs b/src/segment/util.rs index c8f873d3..3164bed5 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -2,13 +2,13 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use super::{Block, BlockHandle, GlobalSegmentId}; use crate::{segment::block::BlockType, Cache, CompressionType, DescriptorTable}; use std::{path::Path, sync::Arc}; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + /// [start, end] slice 
indexes #[derive(Debug)] pub struct SliceIndexes(pub usize, pub usize); @@ -16,6 +16,7 @@ pub struct SliceIndexes(pub usize, pub usize); /// Loads a block from disk or block cache, if cached. /// /// Also handles file descriptor opening and caching. +#[warn(clippy::too_many_arguments)] pub fn load_block( segment_id: GlobalSegmentId, path: &Path, @@ -33,7 +34,18 @@ pub fn load_block( if let Some(block) = cache.get_block(segment_id, handle.offset()) { #[cfg(feature = "metrics")] - metrics.block_load_cached.fetch_add(1, Relaxed); + match block_type { + BlockType::Filter => { + metrics.filter_block_load_cached.fetch_add(1, Relaxed); + } + BlockType::Index => { + metrics.index_block_load_cached.fetch_add(1, Relaxed); + } + BlockType::Data => { + metrics.data_block_load_cached.fetch_add(1, Relaxed); + } + _ => {} + } return Ok(block); } @@ -57,7 +69,18 @@ pub fn load_block( } #[cfg(feature = "metrics")] - metrics.block_load_io.fetch_add(1, Relaxed); + match block_type { + BlockType::Filter => { + metrics.filter_block_load_io.fetch_add(1, Relaxed); + } + BlockType::Index => { + metrics.index_block_load_io.fetch_add(1, Relaxed); + } + BlockType::Data => { + metrics.data_block_load_io.fetch_add(1, Relaxed); + } + _ => {} + } // Cache FD if fd_cache_miss { diff --git a/src/version/mod.rs b/src/version/mod.rs index b3c807c7..fdf48342 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -7,10 +7,14 @@ pub mod run; pub use run::Run; -use crate::{coding::Encode, HashSet, KeyRange, Segment, SegmentId, SeqNo}; +use crate::{ + coding::Encode, + vlog::{BlobFile, BlobFileId}, + HashSet, KeyRange, Segment, SegmentId, SeqNo, +}; use optimize::optimize_runs; use run::Ranged; -use std::{ops::Deref, sync::Arc}; +use std::{collections::BTreeMap, ops::Deref, sync::Arc}; pub const DEFAULT_LEVEL_COUNT: u8 = 7; @@ -138,6 +142,13 @@ pub struct VersionInner { /// The individual LSM-tree levels which consist of runs of tables pub(crate) levels: Vec, + + // We purposefully use Arc<_> to avoid deep cloning the blob files again and again + // + // Changing the value log tends to happen way less often than other modifications to the + // LSM-tree + /// Blob files for large values (value log) + pub(crate) value_log: Arc>, } /// A version is an immutable, point-in-time view of a tree's structure @@ -174,15 +185,27 @@ impl Version { let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect(); Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log: Arc::default(), + }), seqno_watermark: 0, } } /// Creates a new pre-populated version. - pub fn from_levels(id: VersionId, levels: Vec) -> Self { + pub fn from_levels( + id: VersionId, + levels: Vec, + blob_files: BTreeMap, + ) -> Self { Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log: Arc::new(blob_files), + }), seqno_watermark: 0, } } @@ -202,6 +225,10 @@ impl Version { self.iter_levels().map(|x| x.segment_count()).sum() } + pub fn blob_file_count(&self) -> usize { + self.value_log.len() + } + /// Returns an iterator over all segments. pub fn iter_segments(&self) -> impl Iterator { self.levels @@ -216,7 +243,7 @@ impl Version { } /// Creates a new version with the additional run added to the "top" of L0. 
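(Aside: the repeated `match block_type { ... }` blocks in `load_block` above could be collapsed into one lookup. The sketch below is a hypothetical refactor, not part of this patch; it relies only on the `Metrics` field names introduced in this commit:)

    use std::sync::atomic::AtomicUsize;

    /// Picks the metrics counter for a given block type and cache outcome;
    /// returns None for block types that are not tracked.
    #[cfg(feature = "metrics")]
    fn counter_for(metrics: &Metrics, block_type: BlockType, cached: bool) -> Option<&AtomicUsize> {
        use BlockType::{Data, Filter, Index};

        Some(match (block_type, cached) {
            (Data, true) => &metrics.data_block_load_cached,
            (Data, false) => &metrics.data_block_load_io,
            (Index, true) => &metrics.index_block_load_cached,
            (Index, false) => &metrics.index_block_load_io,
            (Filter, true) => &metrics.filter_block_load_cached,
            (Filter, false) => &metrics.filter_block_load_io,
            _ => return None,
        })
    }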
- pub fn with_new_l0_run(&self, run: &[Segment]) -> Self { + pub fn with_new_l0_run(&self, run: &[Segment], blob_files: Option<&[BlobFile]>) -> Self { let id = self.id + 1; let mut levels = vec![]; @@ -250,8 +277,21 @@ impl Version { // L1+ levels.extend(self.levels.iter().skip(1).cloned()); + // Value log + let value_log = if let Some(blob_files) = blob_files { + let mut copy = self.value_log.deref().clone(); + copy.extend(blob_files.iter().cloned().map(|bf| (bf.id(), bf))); + copy.into() + } else { + self.value_log.clone() + }; + Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log, + }), seqno_watermark: 0, } } @@ -283,7 +323,11 @@ impl Version { } Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log: self.value_log.clone(), + }), seqno_watermark: 0, } } @@ -321,7 +365,11 @@ impl Version { } Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log: self.value_log.clone(), + }), seqno_watermark: 0, } } @@ -362,7 +410,11 @@ impl Version { } Self { - inner: Arc::new(VersionInner { id, levels }), + inner: Arc::new(VersionInner { + id, + levels, + value_log: self.value_log.clone(), + }), seqno_watermark: 0, } } @@ -400,6 +452,15 @@ impl Encode for Version { } } + // Blob file count + // NOTE: We know there are always less than 4 billion blob files + #[allow(clippy::cast_possible_truncation)] + writer.write_u32::(self.value_log.len() as u32)?; + + for file in self.value_log.values() { + writer.write_u64::(file.id())?; + } + Ok(()) } } @@ -435,6 +496,12 @@ mod tests { 0, // L6 runs 0, + + // Blob file count + 0, + 0, + 0, + 0, ]; assert_eq!(bytes, raw); From 0b7aa6b6370657a9b6b0b223270dda0191a813f0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:00:44 +0200 Subject: [PATCH 396/613] wip --- src/vlog/blob_file/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index f380dd10..b93e80b4 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -99,7 +99,7 @@ impl Writer { /// Panics if the key length is empty or greater than 2^16, or the value length is greater than 2^32. pub fn write(&mut self, key: &[u8], value: &[u8]) -> crate::Result { assert!(!key.is_empty()); - assert!(key.len() <= u16::MAX.into()); + assert!(u16::try_from(key.len()).is_ok()); assert!(u32::try_from(value.len()).is_ok()); if self.first_key.is_none() { From c8c7d53d9e56ca8b613d7e7539c166624fef49af Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:01:05 +0200 Subject: [PATCH 397/613] remove old vlog compression trait --- src/vlog/compression.rs | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 src/vlog/compression.rs diff --git a/src/vlog/compression.rs b/src/vlog/compression.rs deleted file mode 100644 index 84ccbd6f..00000000 --- a/src/vlog/compression.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -/// Generic compression trait -pub trait Compressor { - /// Compresses a value - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - fn compress(&self, bytes: &[u8]) -> crate::Result>; - - /// Decompresses a value - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. 
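(Returning to the `Version` encoding extended earlier in this patch: the new tail is a little-endian `u32` blob file count followed by one `u64` ID per blob file. A hypothetical reader for just that tail, mirroring the writer; `decode_blob_file_ids` is illustrative only:)

    use byteorder::{LittleEndian, ReadBytesExt};
    use std::io::Read;

    /// Reads the blob file ID list appended after the level/run data.
    fn decode_blob_file_ids<R: Read>(reader: &mut R) -> std::io::Result<Vec<u64>> {
        let count = reader.read_u32::<LittleEndian>()?;
        (0..count).map(|_| reader.read_u64::<LittleEndian>()).collect()
    }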
- fn decompress(&self, bytes: &[u8]) -> crate::Result>; -} From a1e7936e9276d614fd42cc866d9c4291c5f600b0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:01:31 +0200 Subject: [PATCH 398/613] new blob file writer --- src/vlog/blob_file/multi_writer.rs | 58 ++++++++++---------- src/vlog/blob_file/writer.rs | 85 ++++++++++++++++-------------- 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index e423d661..f038061e 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -3,22 +3,25 @@ // (found in the LICENSE-* files in the repository) use super::writer::Writer; -use crate::vlog::{compression::Compressor, value_log::IdGenerator, BlobFileId, ValueHandle}; +use crate::{ + vlog::{BlobFileId, ValueHandle}, + CompressionType, SequenceNumberCounter, +}; use std::path::{Path, PathBuf}; /// Blob file writer, may write multiple blob files -pub struct MultiWriter { +pub struct MultiWriter { folder: PathBuf, target_size: u64, - writers: Vec>, + writers: Vec, - id_generator: IdGenerator, + id_generator: SequenceNumberCounter, - compression: Option, + compression: CompressionType, } -impl MultiWriter { +impl MultiWriter { /// Initializes a new blob file writer. /// /// # Errors @@ -26,7 +29,7 @@ impl MultiWriter { /// Will return `Err` if an IO error occurs. #[doc(hidden)] pub fn new>( - id_generator: IdGenerator, + id_generator: SequenceNumberCounter, target_size: u64, folder: P, ) -> std::io::Result { @@ -42,52 +45,47 @@ impl MultiWriter { writers: vec![Writer::new(blob_file_path, blob_file_id)?], - compression: None, + compression: CompressionType::None, }) } - /// Sets the compression method + /// Sets the blob file target size. + #[must_use] + pub fn use_target_size(mut self, bytes: u64) -> Self { + self.target_size = bytes; + self + } + + /// Sets the compression method. #[must_use] #[doc(hidden)] - pub fn use_compression(mut self, compressor: Option) -> Self { - self.compression.clone_from(&compressor); - self.get_active_writer_mut().compression = compressor; + pub fn use_compression(mut self, compression: CompressionType) -> Self { + self.compression.clone_from(&compression); + self.get_active_writer_mut().compression = compression; self } #[doc(hidden)] #[must_use] - pub fn get_active_writer(&self) -> &Writer { + pub fn get_active_writer(&self) -> &Writer { // NOTE: initialized in constructor #[allow(clippy::expect_used)] self.writers.last().expect("should exist") } - fn get_active_writer_mut(&mut self) -> &mut Writer { + fn get_active_writer_mut(&mut self) -> &mut Writer { // NOTE: initialized in constructor #[allow(clippy::expect_used)] self.writers.last_mut().expect("should exist") } - /// Returns the [`ValueHandle`] for the next written blob. - /// - /// This can be used to index an item into an external `Index`. - #[must_use] - pub fn get_next_value_handle(&self) -> ValueHandle { - ValueHandle { - offset: self.offset(), - blob_file_id: self.blob_file_id(), - } - } - - #[doc(hidden)] #[must_use] pub fn offset(&self) -> u64 { self.get_active_writer().offset() } #[must_use] - fn blob_file_id(&self) -> BlobFileId { + pub fn blob_file_id(&self) -> BlobFileId { self.get_active_writer().blob_file_id() } @@ -98,8 +96,8 @@ impl MultiWriter { let new_blob_file_id = self.id_generator.next(); let blob_file_path = self.folder.join(new_blob_file_id.to_string()); - let new_writer = Writer::new(blob_file_path, new_blob_file_id)? 
- .use_compression(self.compression.clone()); + let new_writer = + Writer::new(blob_file_path, new_blob_file_id)?.use_compression(self.compression); self.writers.push(new_writer); @@ -134,7 +132,7 @@ impl MultiWriter { Ok(bytes_written) } - pub(crate) fn finish(mut self) -> crate::Result>> { + pub(crate) fn finish(mut self) -> crate::Result> { let writer = self.get_active_writer_mut(); if writer.item_count > 0 { diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index b93e80b4..fcb2f63b 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -3,11 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{meta::Metadata, trailer::Trailer}; -use crate::{ - coding::Encode, - vlog::{compression::Compressor, BlobFileId}, - KeyRange, UserKey, -}; +use crate::{coding::Encode, vlog::BlobFileId, CompressionType, KeyRange, UserKey}; use byteorder::{BigEndian, WriteBytesExt}; use std::{ fs::File, @@ -15,10 +11,16 @@ use std::{ path::{Path, PathBuf}, }; -pub const BLOB_HEADER_MAGIC: &[u8] = &[b'V', b'L', b'G', b'B', b'L', b'O', b'B', 1]; +pub const BLOB_HEADER_MAGIC: &[u8] = b"BLOB"; + +pub const BLOB_HEADER_LEN: usize = BLOB_HEADER_MAGIC.len() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::(); /// Blob file writer -pub struct Writer { +pub struct Writer { pub path: PathBuf, pub(crate) blob_file_id: BlobFileId, @@ -34,10 +36,10 @@ pub struct Writer { pub(crate) first_key: Option, pub(crate) last_key: Option, - pub(crate) compression: Option, + pub(crate) compression: CompressionType, } -impl Writer { +impl Writer { /// Initializes a new blob file writer. /// /// # Errors @@ -63,11 +65,11 @@ impl Writer { first_key: None, last_key: None, - compression: None, + compression: CompressionType::None, }) } - pub fn use_compression(mut self, compressor: Option) -> Self { + pub fn use_compression(mut self, compressor: CompressionType) -> Self { self.compression = compressor; self } @@ -109,55 +111,60 @@ impl Writer { self.uncompressed_bytes += value.len() as u64; - let value = match &self.compression { - Some(compressor) => compressor.compress(value)?, - None => value.to_vec(), - }; - - let mut hasher = xxhash_rust::xxh3::Xxh3::new(); - hasher.update(key); - hasher.update(&value); - let checksum = hasher.digest(); - - // TODO: 2.0.0 formalize blob header - // into struct... store uncompressed len as well - // so we can optimize rollover by avoiding - // repeated compression & decompression + // NOTE: + // BLOB HEADER LAYOUT + // + // [MAGIC_BYTES; 4B] + // [Checksum; 16B] + // [key len; 2B] + // [real val len; 4B] + // [on-disk val len; 4B] + // [...key; ?] + // [...val; ?] 
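(Aside on the header layout commented above: the fixed-size prefix is 30 bytes, 4 for the magic, 16 for the xxh3-128 checksum, 2 for the key length, and 4 each for the real and on-disk value lengths, which is what `BLOB_HEADER_LEN` sums up. A hypothetical parser for one record, assuming byteorder's big-endian helpers as used by the writer:)

    use byteorder::{BigEndian, ReadBytesExt};
    use std::io::Read;

    /// Parses one blob record: (checksum, key, real value length, on-disk value bytes).
    fn read_blob<R: Read>(r: &mut R) -> std::io::Result<(u128, Vec<u8>, u32, Vec<u8>)> {
        let mut magic = [0u8; 4];
        r.read_exact(&mut magic)?; // b"BLOB"

        let checksum = r.read_u128::<BigEndian>()?;
        let key_len = usize::from(r.read_u16::<BigEndian>()?);
        let real_val_len = r.read_u32::<BigEndian>()?;
        let on_disk_val_len = r.read_u32::<BigEndian>()? as usize;

        let mut key = vec![0; key_len];
        r.read_exact(&mut key)?;

        let mut val = vec![0; on_disk_val_len];
        r.read_exact(&mut val)?;

        Ok((checksum, key, real_val_len, val))
    }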
// Write header self.active_writer.write_all(BLOB_HEADER_MAGIC)?; - // Write checksum - self.active_writer.write_u64::(checksum)?; + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + hasher.update(key); + hasher.update(value); + let checksum = hasher.digest128(); - // Write key + // Write checksum + self.active_writer.write_u128::(checksum)?; // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] self.active_writer .write_u16::(key.len() as u16)?; - self.active_writer.write_all(key)?; - // Write value + // NOTE: Truncation is okay and actually needed + #[allow(clippy::cast_possible_truncation)] + self.active_writer + .write_u32::(value.len() as u32)?; + + // TODO: + let value = match &self.compression { + _ => value, + }; // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] self.active_writer .write_u32::(value.len() as u32)?; - self.active_writer.write_all(&value)?; - // Header - self.offset += BLOB_HEADER_MAGIC.len() as u64; + self.active_writer.write_all(key)?; + self.active_writer.write_all(value)?; - // Checksum - self.offset += std::mem::size_of::() as u64; + // Update offset + self.offset += BLOB_HEADER_MAGIC.len() as u64; + self.offset += std::mem::size_of::() as u64; - // Key self.offset += std::mem::size_of::() as u64; - self.offset += key.len() as u64; - - // Value self.offset += std::mem::size_of::() as u64; + self.offset += std::mem::size_of::() as u64; + + self.offset += key.len() as u64; self.offset += value.len() as u64; // Update metadata From 6577be057e78aaa01588c1463a061603ac0dd570 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:03:48 +0200 Subject: [PATCH 399/613] remove vlog compression generic param --- src/vlog/blob_file/merge.rs | 12 +- src/vlog/blob_file/mod.rs | 62 +++- src/vlog/blob_file/reader.rs | 37 ++- src/vlog/config.rs | 14 +- src/vlog/gc/mod.rs | 144 +++++---- src/vlog/value_log.rs | 601 +++++++++++++++++------------------ 6 files changed, 436 insertions(+), 434 deletions(-) diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs index 994586e6..dca45e98 100644 --- a/src/vlog/blob_file/merge.rs +++ b/src/vlog/blob_file/merge.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - vlog::{BlobFileId, BlobFileReader, Compressor}, + vlog::{BlobFileId, BlobFileReader}, UserKey, UserValue, }; use interval_heap::IntervalHeap; @@ -50,14 +50,14 @@ impl Ord for IteratorValue { /// Interleaves multiple blob file readers into a single, sorted stream #[allow(clippy::module_name_repetitions)] -pub struct MergeReader { - readers: Vec>, +pub struct MergeReader { + readers: Vec, heap: IntervalHeap, } -impl MergeReader { +impl MergeReader { /// Initializes a new merging reader - pub fn new(readers: Vec>) -> Self { + pub fn new(readers: Vec) -> Self { let heap = IntervalHeap::with_capacity(readers.len()); Self { readers, heap } } @@ -90,7 +90,7 @@ impl MergeReader { } } -impl Iterator for MergeReader { +impl Iterator for MergeReader { type Item = crate::Result<(UserKey, UserValue, BlobFileId, u64)>; fn next(&mut self) -> Option { diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 8762e853..2c856106 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -10,14 +10,14 @@ pub mod reader; pub mod trailer; pub mod writer; -use crate::vlog::{BlobFileId, Compressor}; -use gc_stats::GcStats; -use meta::Metadata; -use std::{marker::PhantomData, path::PathBuf}; +use crate::vlog::BlobFileId; +pub use 
gc_stats::GcStats; +pub use meta::Metadata; +use std::{path::PathBuf, sync::Arc}; /// A blob file is an immutable, sorted, contiguous file that contains large key-value pairs (blobs) #[derive(Debug)] -pub struct BlobFile { +pub(crate) struct Inner { /// Blob file ID pub id: BlobFileId, @@ -29,47 +29,75 @@ pub struct BlobFile { /// Runtime stats for garbage collection pub gc_stats: GcStats, + // TODO: is_deleted, on Drop, like SST segments +} + +/// A blob file stores large values and is part of the value log +#[derive(Clone)] +pub struct BlobFile(pub(crate) Arc); + +impl Eq for BlobFile {} - pub(crate) _phantom: PhantomData, +impl PartialEq for BlobFile { + fn eq(&self, other: &Self) -> bool { + self.id().eq(&other.id()) + } } -impl BlobFile { +impl std::hash::Hash for BlobFile { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +impl BlobFile { + /// Returns the blob file ID. + #[must_use] + pub fn id(&self) -> BlobFileId { + self.0.id + } + /// Returns a scanner that can iterate through the blob file. /// /// # Errors /// /// Will return `Err` if an IO error occurs. - pub fn scan(&self) -> crate::Result> { - reader::Reader::new(&self.path, self.id) + pub fn scan(&self) -> crate::Result { + reader::Reader::new(&self.0.path, self.id()) } - /// Returns the amount of items in the blob file. + /// Returns the number of items in the blob file. + #[must_use] + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> u64 { - self.meta.item_count + self.0.meta.item_count } /// Marks the blob file as fully stale. pub(crate) fn mark_as_stale(&self) { - self.gc_stats.set_stale_items(self.meta.item_count); + self.0.gc_stats.set_stale_items(self.0.meta.item_count); - self.gc_stats - .set_stale_bytes(self.meta.total_uncompressed_bytes); + self.0 + .gc_stats + .set_stale_bytes(self.0.meta.total_uncompressed_bytes); } /// Returns `true` if the blob file is fully stale. + #[must_use] pub fn is_stale(&self) -> bool { - self.gc_stats.stale_items() == self.meta.item_count + self.0.gc_stats.stale_items() == self.0.meta.item_count } /// Returns the percent of dead items in the blob file. // NOTE: Precision is not important here #[allow(clippy::cast_precision_loss)] + #[must_use] pub fn stale_ratio(&self) -> f32 { - let dead = self.gc_stats.stale_items() as f32; + let dead = self.0.gc_stats.stale_items() as f32; if dead == 0.0 { return 0.0; } - dead / self.meta.item_count as f32 + dead / self.0.meta.item_count as f32 } } diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 28e38927..4cc01927 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -3,11 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC}; -use crate::{ - coding::DecodeError, - vlog::{BlobFileId, Compressor}, - UserKey, UserValue, -}; +use crate::{coding::DecodeError, vlog::BlobFileId, CompressionType, UserKey, UserValue}; use byteorder::{BigEndian, ReadBytesExt}; use std::{ fs::File, @@ -27,14 +23,14 @@ macro_rules! fail_iter { // TODO: pread /// Reads through a blob file in order. -pub struct Reader { +pub struct Reader { pub(crate) blob_file_id: BlobFileId, inner: BufReader, is_terminated: bool, - compression: Option, + compression: CompressionType, } -impl Reader { +impl Reader { /// Initializes a new blob file reader. 
    ///
    /// # Errors
@@ -56,12 +52,12 @@ impl Reader {
             blob_file_id,
             inner: file_reader,
             is_terminated: false,
-            compression: None,
+            compression: CompressionType::None,
         }
     }
 
-    pub(crate) fn use_compression(mut self, compressor: Option<C>) -> Self {
-        self.compression = compressor;
+    pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self {
+        self.compression = compression;
         self
     }
 
@@ -70,7 +66,7 @@ impl Reader {
     }
 }
 
-impl<C: Compressor> Iterator for Reader<C> {
+impl Iterator for Reader {
     type Item = crate::Result<(UserKey, UserValue, u64)>;
 
     fn next(&mut self) -> Option<Self::Item> {
@@ -101,18 +97,21 @@ impl Iterator for Reader {
         let val_len = fail_iter!(self.inner.read_u32::<BigEndian>());
 
         let val = match &self.compression {
-            Some(compressor) => {
-                // TODO: https://github.com/PSeitz/lz4_flex/issues/166
-                let mut val = vec![0; val_len as usize];
-                fail_iter!(self.inner.read_exact(&mut val));
-                UserValue::from(fail_iter!(compressor.decompress(&val)))
-            }
-            None => {
+            _ => {
                 // NOTE: When not using compression, we can skip
                 // the intermediary heap allocation and read directly into a Slice
                 fail_iter!(UserValue::from_reader(&mut self.inner, val_len as usize))
            }
        };
+        // Some(compressor) => {
+        //     // TODO: https://github.com/PSeitz/lz4_flex/issues/166
+        //     let mut val = vec![0; val_len as usize];
+        //     fail_iter!(self.inner.read_exact(&mut val));
+        //     UserValue::from(fail_iter!(compressor.decompress(&val)))
+        // }
+        // None => {
+
+        // }
 
         Some(Ok((key, val, checksum)))
     }
diff --git a/src/vlog/config.rs b/src/vlog/config.rs
index 1277ab78..dab541fb 100644
--- a/src/vlog/config.rs
+++ b/src/vlog/config.rs
@@ -2,11 +2,11 @@
 // This source code is licensed under both the Apache 2.0 and MIT License
 // (found in the LICENSE-* files in the repository)
 
-use crate::{vlog::compression::Compressor, Cache, DescriptorTable};
+use crate::{Cache, CompressionType, DescriptorTable};
 use std::sync::Arc;
 
 /// Value log configuration
-pub struct Config<C: Compressor> {
+pub struct Config {
     /// Target size of vLog blob files
     pub(crate) blob_file_size_bytes: u64,
 
@@ -17,24 +17,24 @@ pub struct Config {
     pub(crate) fd_cache: Arc<DescriptorTable>,
 
     /// Compression to use
-    pub(crate) compression: Option<C>,
+    pub(crate) compression: CompressionType,
 }
 
-impl<C: Compressor> Config<C> {
+impl Config {
     /// Creates a new configuration builder.
     pub fn new(blob_cache: Arc<Cache>, fd_cache: Arc<DescriptorTable>) -> Self {
         Self {
             blob_cache,
             fd_cache,
-            compression: None,
+            compression: CompressionType::None,
             blob_file_size_bytes: 128 * 1_024 * 1_024,
         }
     }
 
     /// Sets the compression & decompression scheme.
     #[must_use]
-    pub fn compression(mut self, compressor: Option<C>) -> Self {
-        self.compression = compressor;
+    pub fn compression(mut self, compression: CompressionType) -> Self {
+        self.compression = compression;
         self
     }
 
diff --git a/src/vlog/gc/mod.rs b/src/vlog/gc/mod.rs
index f6465752..2514d468 100644
--- a/src/vlog/gc/mod.rs
+++ b/src/vlog/gc/mod.rs
@@ -4,13 +4,13 @@
 
 pub mod report;
 
-use crate::vlog::{BlobFileId, Compressor, ValueLog};
+use crate::vlog::{BlobFileId, ValueLog};
 
 /// GC strategy
 #[allow(clippy::module_name_repetitions)]
-pub trait GcStrategy<C: Compressor> {
+pub trait GcStrategy {
     /// Picks blob files based on a predicate.
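(The `StaleThresholdStrategy` body in the hunk below is currently stubbed with `unimplemented!()`, but its commented-out logic is straightforward: keep every blob file whose stale ratio exceeds the threshold. A standalone sketch of that predicate, using the `BlobFile` accessors from the new blob_file module; `pick_stale` is a hypothetical name:)

    use crate::vlog::{BlobFile, BlobFileId};

    /// Picks blob files whose stale ratio exceeds `threshold` (0.0 to 1.0).
    fn pick_stale(blob_files: &[BlobFile], threshold: f32) -> Vec<BlobFileId> {
        blob_files
            .iter()
            .filter(|bf| bf.stale_ratio() > threshold)
            .map(BlobFile::id)
            .collect()
    }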
- fn pick(&self, value_log: &ValueLog) -> Vec; + fn pick(&self, value_log: &ValueLog) -> Vec; } /// Picks blob files that have a certain percentage of stale blobs @@ -32,17 +32,19 @@ impl StaleThresholdStrategy { } } -impl GcStrategy for StaleThresholdStrategy { - fn pick(&self, value_log: &ValueLog) -> Vec { - value_log - .manifest - .blob_files - .read() - .expect("lock is poisoned") - .values() - .filter(|x| x.stale_ratio() > self.0) - .map(|x| x.id) - .collect::>() +impl GcStrategy for StaleThresholdStrategy { + fn pick(&self, value_log: &ValueLog) -> Vec { + unimplemented!() + + // value_log + // .manifest + // .blob_files + // .read() + // .expect("lock is poisoned") + // .values() + // .filter(|x| x.stale_ratio() > self.0) + // .map(|x| x.id) + // .collect::>() } } @@ -62,62 +64,64 @@ impl SpaceAmpStrategy { } } -impl GcStrategy for SpaceAmpStrategy { +impl GcStrategy for SpaceAmpStrategy { #[allow(clippy::cast_precision_loss, clippy::significant_drop_tightening)] - fn pick(&self, value_log: &ValueLog) -> Vec { - let space_amp_target = self.0; - let current_space_amp = value_log.space_amp(); - - if current_space_amp < space_amp_target { - log::trace!("Space amp is <= target {space_amp_target}, nothing to do"); - vec![] - } else { - log::debug!("Selecting blob files to GC, space_amp_target={space_amp_target}"); - - let lock = value_log - .manifest - .blob_files - .read() - .expect("lock is poisoned"); - - let mut blob_files = lock - .values() - .filter(|x| x.stale_ratio() > 0.0) - .collect::>(); - - // Sort by stale ratio descending - blob_files.sort_by(|a, b| { - b.stale_ratio() - .partial_cmp(&a.stale_ratio()) - .unwrap_or(std::cmp::Ordering::Equal) - }); - - let mut selection = vec![]; - - let mut total_bytes = value_log.manifest.total_bytes(); - let mut stale_bytes = value_log.manifest.stale_bytes(); - - for blob_file in blob_files { - let blob_file_stale_bytes = blob_file.gc_stats.stale_bytes(); - stale_bytes -= blob_file_stale_bytes; - total_bytes -= blob_file_stale_bytes; - - selection.push(blob_file.id); - - let space_amp_after_gc = - total_bytes as f32 / (total_bytes as f32 - stale_bytes as f32); - - log::debug!( - "Selected blob file #{} for GC: will reduce space amp to {space_amp_after_gc}", - blob_file.id, - ); - - if space_amp_after_gc <= space_amp_target { - break; - } - } - - selection - } + fn pick(&self, value_log: &ValueLog) -> Vec { + unimplemented!() + + // let space_amp_target = self.0; + // let current_space_amp = value_log.space_amp(); + + // if current_space_amp < space_amp_target { + // log::trace!("Space amp is <= target {space_amp_target}, nothing to do"); + // vec![] + // } else { + // log::debug!("Selecting blob files to GC, space_amp_target={space_amp_target}"); + + // let lock = value_log + // .manifest + // .blob_files + // .read() + // .expect("lock is poisoned"); + + // let mut blob_files = lock + // .values() + // .filter(|x| x.stale_ratio() > 0.0) + // .collect::>(); + + // // Sort by stale ratio descending + // blob_files.sort_by(|a, b| { + // b.stale_ratio() + // .partial_cmp(&a.stale_ratio()) + // .unwrap_or(std::cmp::Ordering::Equal) + // }); + + // let mut selection = vec![]; + + // let mut total_bytes = value_log.manifest.total_bytes(); + // let mut stale_bytes = value_log.manifest.stale_bytes(); + + // for blob_file in blob_files { + // let blob_file_stale_bytes = blob_file.gc_stats.stale_bytes(); + // stale_bytes -= blob_file_stale_bytes; + // total_bytes -= blob_file_stale_bytes; + + // selection.push(blob_file.id); + + // let 
space_amp_after_gc = + // total_bytes as f32 / (total_bytes as f32 - stale_bytes as f32); + + // log::debug!( + // "Selected blob file #{} for GC: will reduce space amp to {space_amp_after_gc}", + // blob_file.id, + // ); + + // if space_amp_after_gc <= space_amp_target { + // break; + // } + // } + + // selection + // } } } diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs index a4e9769a..670dc0df 100644 --- a/src/vlog/value_log.rs +++ b/src/vlog/value_log.rs @@ -3,43 +3,45 @@ // (found in the LICENSE-* files in the repository) use crate::{ + file::VLOG_MARKER, vlog::{ - blob_file::merge::MergeReader, + blob_file::{ + gc_stats::GcStats, merge::MergeReader, meta::Metadata, Inner as BlobFileInner, + }, gc::report::GcReport, index::Writer as IndexWriter, - manifest::{Manifest, BLOB_FILES_FOLDER, VLOG_MARKER}, - scanner::{Scanner, SizeMap}, - BlobFileId, BlobFileWriter, Compressor, Config, GcStrategy, IndexReader, ValueHandle, + scanner::SizeMap, + BlobFile, BlobFileId, BlobFileWriter, Config, GcStrategy, IndexReader, ValueHandle, }, - Cache, DescriptorTable, UserValue, + Cache, DescriptorTable, KeyRange, UserValue, }; use std::{ path::{Path, PathBuf}, sync::{atomic::AtomicU64, Arc, Mutex}, }; -// TODO: use other counter struct -#[allow(clippy::module_name_repetitions)] -#[derive(Clone, Default)] -pub struct IdGenerator(Arc); +// // TODO: use other counter struct +// #[allow(clippy::module_name_repetitions)] +// #[derive(Clone, Default)] +// pub struct IdGenerator(Arc); -impl std::ops::Deref for IdGenerator { - type Target = Arc; +// impl std::ops::Deref for IdGenerator { +// type Target = Arc; - fn deref(&self) -> &Self::Target { - &self.0 - } -} +// fn deref(&self) -> &Self::Target { +// &self.0 +// } +// } -impl IdGenerator { - pub fn new(start: u64) -> Self { - Self(Arc::new(AtomicU64::new(start))) - } +// impl IdGenerator { +// pub fn new(start: u64) -> Self { +// Self(Arc::new(AtomicU64::new(start))) +// } - pub fn next(&self) -> BlobFileId { - self.fetch_add(1, std::sync::atomic::Ordering::SeqCst) - } -} +// pub fn next(&self) -> BlobFileId { +// self.fetch_add(1, std::sync::atomic::Ordering::SeqCst) +// } +// } /// Unique value log ID #[allow(clippy::module_name_repetitions)] @@ -52,21 +54,23 @@ pub fn get_next_vlog_id() -> ValueLogId { } fn unlink_blob_files(base_path: &Path, ids: &[BlobFileId]) { - for id in ids { - let path = base_path.join(BLOB_FILES_FOLDER).join(id.to_string()); + unimplemented!() - if let Err(e) = std::fs::remove_file(&path) { - log::error!("Could not free blob file at {path:?}: {e:?}"); - } - } + // for id in ids { + // let path = base_path.join(BLOB_FILES_FOLDER).join(id.to_string()); + + // if let Err(e) = std::fs::remove_file(&path) { + // log::error!("Could not free blob file at {path:?}: {e:?}"); + // } + // } } /// A disk-resident value log #[derive(Clone)] -pub struct ValueLog(Arc>); +pub struct ValueLog(Arc); -impl std::ops::Deref for ValueLog { - type Target = ValueLogInner; +impl std::ops::Deref for ValueLog { + type Target = ValueLogInner; fn deref(&self) -> &Self::Target { &self.0 @@ -74,7 +78,7 @@ impl std::ops::Deref for ValueLog { } #[allow(clippy::module_name_repetitions)] -pub struct ValueLogInner { +pub struct ValueLogInner { /// Unique value log ID id: u64, @@ -82,20 +86,16 @@ pub struct ValueLogInner { pub path: PathBuf, /// Value log configuration - config: Config, + config: Config, /// In-memory blob cache - blob_cache: Arc, + // blob_cache: Arc, /// In-memory FD cache - fd_cache: Arc, + // fd_cache: Arc, - /// Blob files manifest 
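(Aside on the commented-out `SpaceAmpStrategy` math above: space amplification is total bytes divided by live bytes, where live is total minus stale. A guarded restatement, since the direct division becomes meaningless when a file is fully stale; `space_amp` is a hypothetical helper:)

    /// total / (total - stale); fully stale data is treated as infinite amplification.
    fn space_amp(total_bytes: u64, stale_bytes: u64) -> f32 {
        let live = total_bytes.saturating_sub(stale_bytes) as f32;
        if live == 0.0 {
            f32::INFINITY
        } else {
            total_bytes as f32 / live
        }
    }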
- #[doc(hidden)] - pub manifest: Manifest, - - /// Generator to get next blob file ID - id_generator: IdGenerator, + // /// Generator to get next blob file ID + // id_generator: IdGenerator, /// Guards the rollover (compaction) process to only /// allow one to happen at a time @@ -103,7 +103,7 @@ pub struct ValueLogInner { pub rollover_guard: Mutex<()>, } -impl ValueLog { +impl ValueLog { /// Creates or recovers a value log in the given directory. /// /// # Errors @@ -111,7 +111,7 @@ impl ValueLog { /// Will return `Err` if an IO error occurs. pub fn open>( path: P, // TODO: move path into config? - config: Config, + config: Config, ) -> crate::Result { let path = path.into(); @@ -144,27 +144,29 @@ impl ValueLog { #[doc(hidden)] pub fn verify(&self) -> crate::Result { - let _lock = self.rollover_guard.lock().expect("lock is poisoned"); + unimplemented!() - let mut sum = 0; + // let _lock = self.rollover_guard.lock().expect("lock is poisoned"); - for item in self.get_reader()? { - let (k, v, _, expected_checksum) = item?; + // let mut sum = 0; - let mut hasher = xxhash_rust::xxh3::Xxh3::new(); - hasher.update(&k); - hasher.update(&v); + // for item in self.get_reader()? { + // let (k, v, _, expected_checksum) = item?; - if hasher.digest() != expected_checksum { - sum += 1; - } - } + // let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + // hasher.update(&k); + // hasher.update(&v); + + // if hasher.digest() != expected_checksum { + // sum += 1; + // } + // } - Ok(sum) + // Ok(sum) } /// Creates a new empty value log in a directory. - pub(crate) fn create_new>(path: P, config: Config) -> crate::Result { + pub(crate) fn create_new>(path: P, config: Config) -> crate::Result { let path = path.into(); let path = crate::path::absolute_path(&path); @@ -175,8 +177,6 @@ impl ValueLog { let marker_path = path.join(VLOG_MARKER); assert!(!marker_path.try_exists()?); - std::fs::create_dir_all(path.join(BLOB_FILES_FOLDER))?; - // NOTE: Lastly, fsync .vlog marker, which contains the version // -> the V-log is fully initialized @@ -188,85 +188,32 @@ impl ValueLog { { // fsync folders on Unix - let folder = std::fs::File::open(path.join(BLOB_FILES_FOLDER))?; - folder.sync_all()?; - let folder = std::fs::File::open(&path)?; folder.sync_all()?; } - let blob_cache = config.blob_cache.clone(); - let fd_cache = config.fd_cache.clone(); - let manifest = Manifest::create_new(&path)?; - - Ok(Self(Arc::new(ValueLogInner { - id: get_next_vlog_id(), - config, - path, - blob_cache, - fd_cache, - manifest, - id_generator: IdGenerator::default(), - rollover_guard: Mutex::new(()), - }))) - } - - pub(crate) fn recover>(path: P, config: Config) -> crate::Result { - let path = path.into(); - log::info!("Recovering vLog at {}", path.display()); - - // { - // let bytes = std::fs::read(path.join(VLOG_MARKER))?; - - // if let Some(version) = Version::parse_file_header(&bytes) { - // if version != Version::V1 { - // return Err(crate::Error::InvalidVersion(Some(version))); - // } - // } else { - // return Err(crate::Error::InvalidVersion(None)); - // } - // } - - let blob_cache = config.blob_cache.clone(); - let fd_cache = config.fd_cache.clone(); - let manifest = Manifest::recover(&path)?; - - let highest_id = manifest - .blob_files - .read() - .expect("lock is poisoned") - .values() - .map(|x| x.id) - .max() - .unwrap_or_default(); + // let blob_cache = config.blob_cache.clone(); + // let fd_cache = config.fd_cache.clone(); + // let manifest = Manifest::create_new(&path)?; Ok(Self(Arc::new(ValueLogInner { id: get_next_vlog_id(), 
config, path, - blob_cache, - fd_cache, - manifest, - id_generator: IdGenerator::new(highest_id + 1), + // blob_cache, + // fd_cache, + // manifest, + // id_generator: IdGenerator::default(), rollover_guard: Mutex::new(()), }))) } - /// Registers a [`BlobFileWriter`]. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn register_writer(&self, writer: BlobFileWriter) -> crate::Result<()> { - let _lock = self.rollover_guard.lock().expect("lock is poisoned"); - self.manifest.register(writer)?; - Ok(()) - } - /// Returns the number of blob files in the value log. #[must_use] pub fn blob_file_count(&self) -> usize { - self.manifest.len() + unimplemented!() + + // self.manifest.len() } /// Resolves a value handle. @@ -341,13 +288,15 @@ impl ValueLog { // Ok(Some(val)) } - fn get_writer_raw(&self) -> crate::Result> { - BlobFileWriter::new( - self.id_generator.clone(), - self.config.blob_file_size_bytes, - self.path.join(BLOB_FILES_FOLDER), - ) - .map_err(Into::into) + fn get_writer_raw(&self) -> crate::Result { + unimplemented!() + + // BlobFileWriter::new( + // self.id_generator.clone(), + // self.config.blob_file_size_bytes, + // &self.path, + // ) + // .map_err(Into::into) } /// Initializes a new blob file writer. @@ -355,9 +304,11 @@ impl ValueLog { /// # Errors /// /// Will return `Err` if an IO error occurs. - pub fn get_writer(&self) -> crate::Result> { - self.get_writer_raw() - .map(|x| x.use_compression(self.config.compression.clone())) + pub fn get_writer(&self) -> crate::Result { + unimplemented!() + + // self.get_writer_raw() + // .map(|x| x.use_compression(self.config.compression)) } /// Drops stale blob files. @@ -368,35 +319,37 @@ impl ValueLog { /// /// Will return `Err` if an IO error occurs. pub fn drop_stale_blob_files(&self) -> crate::Result { - // IMPORTANT: Only allow 1 rollover or GC at any given time - let _guard = self.rollover_guard.lock().expect("lock is poisoned"); + unimplemented!() - let blob_files = self - .manifest - .blob_files - .read() - .expect("lock is poisoned") - .values() - .filter(|x| x.is_stale()) - .cloned() - .collect::>(); + // // IMPORTANT: Only allow 1 rollover or GC at any given time + // let _guard = self.rollover_guard.lock().expect("lock is poisoned"); - let bytes_freed = blob_files.iter().map(|x| x.meta.compressed_bytes).sum(); + // let blob_files = self + // .manifest + // .blob_files + // .read() + // .expect("lock is poisoned") + // .values() + // .filter(|x| x.is_stale()) + // .cloned() + // .collect::>(); - let ids = blob_files.iter().map(|x| x.id).collect::>(); + // let bytes_freed = blob_files.iter().map(|x| x.meta.compressed_bytes).sum(); - if ids.is_empty() { - log::trace!("No blob files to drop"); - } else { - log::info!("Dropping stale blob files: {ids:?}"); - self.manifest.drop_blob_files(&ids)?; + // let ids = blob_files.iter().map(|x| x.id).collect::>(); - for blob_file in blob_files { - std::fs::remove_file(&blob_file.path)?; - } - } + // if ids.is_empty() { + // log::trace!("No blob files to drop"); + // } else { + // log::info!("Dropping stale blob files: {ids:?}"); + // self.manifest.drop_blob_files(&ids)?; - Ok(bytes_freed) + // for blob_file in blob_files { + // std::fs::remove_file(&blob_file.path)?; + // } + // } + + // Ok(bytes_freed) } /// Marks some blob files as stale. @@ -405,17 +358,19 @@ impl ValueLog { /// /// Will return `Err` if an IO error occurs. 
fn mark_as_stale(&self, ids: &[BlobFileId]) { - // NOTE: Read-locking is fine because we are dealing with an atomic bool - #[allow(clippy::significant_drop_tightening)] - let blob_files = self.manifest.blob_files.read().expect("lock is poisoned"); + unimplemented!() - for id in ids { - let Some(blob_file) = blob_files.get(id) else { - continue; - }; + // // NOTE: Read-locking is fine because we are dealing with an atomic bool + // #[allow(clippy::significant_drop_tightening)] + // let blob_files = self.manifest.blob_files.read().expect("lock is poisoned"); - blob_file.mark_as_stale(); - } + // for id in ids { + // let Some(blob_file) = blob_files.get(id) else { + // continue; + // }; + + // blob_file.mark_as_stale(); + // } } // TODO: remove? @@ -424,67 +379,71 @@ impl ValueLog { /// Returns 0.0 if there are no items. #[must_use] pub fn space_amp(&self) -> f32 { - self.manifest.space_amp() + unimplemented!() + + // self.manifest.space_amp() } #[doc(hidden)] #[allow(clippy::cast_precision_loss)] #[must_use] pub fn consume_scan_result(&self, size_map: &SizeMap) -> GcReport { - let mut report = GcReport { - path: self.path.clone(), - blob_file_count: self.blob_file_count(), - stale_blob_file_count: 0, - stale_bytes: 0, - total_bytes: 0, - stale_blobs: 0, - total_blobs: 0, - }; - - for (&id, counter) in size_map { - let blob_file = self - .manifest - .get_blob_file(id) - .expect("blob file should exist"); - - let total_bytes = blob_file.meta.total_uncompressed_bytes; - let total_items = blob_file.meta.item_count; - - report.total_bytes += total_bytes; - report.total_blobs += total_items; - - if counter.item_count > 0 { - let used_size = counter.size; - let alive_item_count = counter.item_count; - - let blob_file = self - .manifest - .get_blob_file(id) - .expect("blob file should exist"); - - let stale_bytes = total_bytes - used_size; - let stale_items = total_items - alive_item_count; - - blob_file.gc_stats.set_stale_bytes(stale_bytes); - blob_file.gc_stats.set_stale_items(stale_items); - - report.stale_bytes += stale_bytes; - report.stale_blobs += stale_items; - } else { - log::debug!( - "Blob file #{id} has no incoming references - can be dropped, freeing {} KiB on disk (userdata={} MiB)", - blob_file.meta.compressed_bytes / 1_024, - total_bytes / 1_024 / 1_024, - ); - self.mark_as_stale(&[id]); + unimplemented!() + + // let mut report = GcReport { + // path: self.path.clone(), + // blob_file_count: self.blob_file_count(), + // stale_blob_file_count: 0, + // stale_bytes: 0, + // total_bytes: 0, + // stale_blobs: 0, + // total_blobs: 0, + // }; - report.stale_blob_file_count += 1; - report.stale_bytes += total_bytes; - report.stale_blobs += total_items; - } - } + // for (&id, counter) in size_map { + // let blob_file = self + // .manifest + // .get_blob_file(id) + // .expect("blob file should exist"); + + // let total_bytes = blob_file.meta.total_uncompressed_bytes; + // let total_items = blob_file.meta.item_count; + + // report.total_bytes += total_bytes; + // report.total_blobs += total_items; + + // if counter.item_count > 0 { + // let used_size = counter.size; + // let alive_item_count = counter.item_count; + + // let blob_file = self + // .manifest + // .get_blob_file(id) + // .expect("blob file should exist"); - report + // let stale_bytes = total_bytes - used_size; + // let stale_items = total_items - alive_item_count; + + // blob_file.gc_stats.set_stale_bytes(stale_bytes); + // blob_file.gc_stats.set_stale_items(stale_items); + + // report.stale_bytes += stale_bytes; + // 
report.stale_blobs += stale_items; + // } else { + // log::debug!( + // "Blob file #{id} has no incoming references - can be dropped, freeing {} KiB on disk (userdata={} MiB)", + // blob_file.meta.compressed_bytes / 1_024, + // total_bytes / 1_024 / 1_024, + // ); + // self.mark_as_stale(&[id]); + + // report.stale_blob_file_count += 1; + // report.stale_bytes += total_bytes; + // report.stale_blobs += total_items; + // } + // } + + // report } /// Scans the given index and collects GC statistics. @@ -497,30 +456,34 @@ impl ValueLog { &self, iter: impl Iterator>, ) -> crate::Result { - let lock_guard = self.rollover_guard.lock().expect("lock is poisoned"); + unimplemented!() + + // let lock_guard = self.rollover_guard.lock().expect("lock is poisoned"); - let ids = self.manifest.list_blob_file_ids(); + // let ids = self.manifest.list_blob_file_ids(); - let mut scanner = Scanner::new(iter, lock_guard, &ids); - scanner.scan()?; - let size_map = scanner.finish(); - let report = self.consume_scan_result(&size_map); + // let mut scanner = Scanner::new(iter, lock_guard, &ids); + // scanner.scan()?; + // let size_map = scanner.finish(); + // let report = self.consume_scan_result(&size_map); - Ok(report) + // Ok(report) } #[doc(hidden)] - pub fn get_reader(&self) -> crate::Result> { - let readers = self - .manifest - .blob_files - .read() - .expect("lock is poisoned") - .values() - .map(|x| x.scan()) - .collect::>>()?; - - Ok(MergeReader::new(readers)) + pub fn get_reader(&self) -> crate::Result { + unimplemented!() + + // let readers = self + // .manifest + // .blob_files + // .read() + // .expect("lock is poisoned") + // .values() + // .map(|x| x.scan()) + // .collect::>>()?; + + // Ok(MergeReader::new(readers)) } /// Returns the amount of disk space (compressed data) freed. @@ -530,8 +493,10 @@ impl ValueLog { index_reader: &R, index_writer: W, ) -> crate::Result { - let ids = self.manifest.list_blob_file_ids(); - self.rollover(&ids, index_reader, index_writer) + unimplemented!() + + // let ids = self.manifest.list_blob_file_ids(); + // self.rollover(&ids, index_reader, index_writer) } /// Applies a GC strategy. @@ -541,38 +506,42 @@ impl ValueLog { /// Will return `Err` if an IO error occurs. pub fn apply_gc_strategy( &self, - strategy: &impl GcStrategy, + strategy: &impl GcStrategy, index_reader: &R, index_writer: W, ) -> crate::Result { - let blob_file_ids = strategy.pick(self); - self.rollover(&blob_file_ids, index_reader, index_writer) + unimplemented!() + + // let blob_file_ids = strategy.pick(self); + // self.rollover(&blob_file_ids, index_reader, index_writer) } /// Atomically removes all data from the value log. /// /// If `prune_async` is set to `true`, the blob files will be removed from disk in a thread to avoid blocking. 
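    ///
    /// # Example
    ///
    /// ```ignore
    /// // Hedged sketch, not the final API: assumes `value_log` is an open `ValueLog`.
    /// // Passing `true` prunes the dropped blob files on a background thread.
    /// value_log.clear(true)?;
    /// ```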
pub fn clear(&self, prune_async: bool) -> crate::Result<()> { - let guard = self.rollover_guard.lock().expect("lock is poisoned"); - let ids = self.manifest.list_blob_file_ids(); - self.manifest.clear()?; - drop(guard); - - if prune_async { - let path = self.path.clone(); - - std::thread::spawn(move || { - log::trace!("Pruning dropped blob files in thread: {ids:?}"); - unlink_blob_files(&path, &ids); - log::trace!("Successfully pruned all blob files"); - }); - } else { - log::trace!("Pruning dropped blob files: {ids:?}"); - unlink_blob_files(&self.path, &ids); - log::trace!("Successfully pruned all blob files"); - } + unimplemented!() + + // let guard = self.rollover_guard.lock().expect("lock is poisoned"); + // let ids = self.manifest.list_blob_file_ids(); + // self.manifest.clear()?; + // drop(guard); + + // if prune_async { + // let path = self.path.clone(); + + // std::thread::spawn(move || { + // log::trace!("Pruning dropped blob files in thread: {ids:?}"); + // unlink_blob_files(&path, &ids); + // log::trace!("Successfully pruned all blob files"); + // }); + // } else { + // log::trace!("Pruning dropped blob files: {ids:?}"); + // unlink_blob_files(&self.path, &ids); + // log::trace!("Successfully pruned all blob files"); + // } - Ok(()) + // Ok(()) } /// Rewrites some blob files into new blob files, blocking the caller @@ -590,80 +559,82 @@ impl ValueLog { index_reader: &R, mut index_writer: W, ) -> crate::Result { - if ids.is_empty() { - return Ok(0); - } + unimplemented!() + + // if ids.is_empty() { + // return Ok(0); + // } - // IMPORTANT: Only allow 1 rollover or GC at any given time - let _guard = self.rollover_guard.lock().expect("lock is poisoned"); + // // IMPORTANT: Only allow 1 rollover or GC at any given time + // let _guard = self.rollover_guard.lock().expect("lock is poisoned"); - let size_before = self.manifest.disk_space_used(); + // let size_before = self.manifest.disk_space_used(); - log::info!("Rollover blob files {ids:?}"); + // log::info!("Rollover blob files {ids:?}"); - let blob_files = ids - .iter() - .map(|&x| self.manifest.get_blob_file(x)) - .collect::>>(); + // let blob_files = ids + // .iter() + // .map(|&x| self.manifest.get_blob_file(x)) + // .collect::>>(); - let Some(blob_files) = blob_files else { - return Ok(0); - }; + // let Some(blob_files) = blob_files else { + // return Ok(0); + // }; - let readers = blob_files - .into_iter() - .map(|x| x.scan()) - .collect::>>()?; + // let readers = blob_files + // .into_iter() + // .map(|x| x.scan()) + // .collect::>>()?; - // TODO: 3.0.0: Store uncompressed size per blob - // so we can avoid recompression costs during GC - // but have stats be correct + // // TODO: 3.0.0: Store uncompressed size per blob + // // so we can avoid recompression costs during GC + // // but have stats be correct - let reader = MergeReader::new( - readers - .into_iter() - .map(|x| x.use_compression(self.config.compression.clone())) - .collect(), - ); + // let reader = MergeReader::new( + // readers + // .into_iter() + // .map(|x| x.use_compression(self.config.compression.clone())) + // .collect(), + // ); - let mut writer = self - .get_writer_raw()? - .use_compression(self.config.compression.clone()); + // let mut writer = self + // .get_writer_raw()? + // .use_compression(self.config.compression.clone()); - for item in reader { - let (k, v, blob_file_id, _) = item?; + // for item in reader { + // let (k, v, blob_file_id, _) = item?; - match index_reader.get(&k)? 
{ - // If this value is in an older blob file, we can discard it - Some(vhandle) if blob_file_id < vhandle.blob_file_id => continue, - None => continue, - _ => {} - } + // match index_reader.get(&k)? { + // // If this value is in an older blob file, we can discard it + // Some(vhandle) if blob_file_id < vhandle.blob_file_id => continue, + // None => continue, + // _ => {} + // } - let vhandle = writer.get_next_value_handle(); + // let vhandle = writer.get_next_value_handle(); - // NOTE: Truncation is OK because we know values are u32 max - #[allow(clippy::cast_possible_truncation)] - index_writer.insert_indirect(&k, vhandle, v.len() as u32)?; + // // NOTE: Truncation is OK because we know values are u32 max + // #[allow(clippy::cast_possible_truncation)] + // index_writer.insert_indirect(&k, vhandle, v.len() as u32)?; - writer.write(&k, &v)?; - } + // writer.write(&k, &v)?; + // } - // IMPORTANT: New blob files need to be persisted before adding to index - // to avoid dangling pointers - self.manifest.register(writer)?; + // // IMPORTANT: New blob files need to be persisted before adding to index + // // to avoid dangling pointers + // self.manifest.register(writer)?; - // NOTE: If we crash here, it's fine, the blob files are registered - // but never referenced, so they can just be dropped after recovery - index_writer.finish()?; + // // NOTE: If we crash here, it's fine, the blob files are registered + // // but never referenced, so they can just be dropped after recovery + // index_writer.finish()?; - // IMPORTANT: We only mark the blob files as definitely stale - // The external index needs to decide when it is safe to drop - // the old blob files, as some reads may still be performed - self.mark_as_stale(ids); + // // IMPORTANT: We only mark the blob files as definitely stale + // // The external index needs to decide when it is safe to drop + // // the old blob files, as some reads may still be performed + // self.mark_as_stale(ids); - let size_after = self.manifest.disk_space_used(); + // let size_after = self.manifest.disk_space_used(); - Ok(size_before.saturating_sub(size_after)) + // Ok(size_before.saturating_sub(size_after)) } } From c9303990cd38d7e2a01b43e51651474e0e3381a4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:04:10 +0200 Subject: [PATCH 400/613] add on disk size to value handle needed for pread --- src/vlog/handle.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vlog/handle.rs b/src/vlog/handle.rs index 7d7c0837..a5900924 100644 --- a/src/vlog/handle.rs +++ b/src/vlog/handle.rs @@ -21,12 +21,16 @@ pub struct ValueHandle { /// Offset in file pub offset: u64, + + /// On-disk size + pub on_disk_size: u32, } impl Encode for ValueHandle { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { writer.write_u64_varint(self.offset)?; writer.write_u64_varint(self.blob_file_id)?; + writer.write_u32_varint(self.on_disk_size)?; Ok(()) } } @@ -35,10 +39,12 @@ impl Decode for ValueHandle { fn decode_from(reader: &mut R) -> Result { let offset = reader.read_u64_varint()?; let blob_file_id = reader.read_u64_varint()?; + let on_disk_size = reader.read_u32_varint()?; Ok(Self { blob_file_id, offset, + on_disk_size, }) } } From 939d45d3ff06cbe3713baf826f3ab516fe6f6b7e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:04:27 +0200 Subject: [PATCH 401/613] move imports --- src/tree/inner.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tree/inner.rs b/src/tree/inner.rs index d23e0773..32b1ae59 
100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -2,15 +2,15 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use crate::{ config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal, SegmentId, }; use std::sync::{atomic::AtomicU64, Arc, RwLock}; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + /// Unique tree ID /// /// Tree IDs are monotonically increasing integers. From 659423a342a8e4dc63d7f71696db276cc140d9d1 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:04:33 +0200 Subject: [PATCH 402/613] move imports --- src/segment/iter.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 8e7b2c75..66ac0a3d 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -2,9 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use super::{data_block::Iter as DataBlockIter, BlockOffset, DataBlock, GlobalSegmentId}; use crate::{ segment::{ @@ -15,6 +12,9 @@ use crate::{ use self_cell::self_cell; use std::{path::PathBuf, sync::Arc}; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + type InnerIter<'a> = DataBlockIter<'a>; self_cell!( From 518cb4210235b001f2d7b54c36ff1ba4f105036c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:04:44 +0200 Subject: [PATCH 403/613] export metrics --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 36b4968e..2eefd01f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -227,8 +227,12 @@ pub use { seqno::SequenceNumberCounter, tree::Tree, value::{SeqNo, ValueType}, + vlog::BlobFile, }; +#[cfg(feature = "metrics")] +pub use metrics::Metrics; + pub use any_tree::AnyTree; pub use blob_tree::BlobTree; From 49308f279a5170957874a44ca9d5d857b166785e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:05:53 +0200 Subject: [PATCH 404/613] wip --- src/vlog/value_log.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs index 670dc0df..775efdaa 100644 --- a/src/vlog/value_log.rs +++ b/src/vlog/value_log.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{ - file::VLOG_MARKER, + // file::VLOG_MARKER, vlog::{ blob_file::{ gc_stats::GcStats, merge::MergeReader, meta::Metadata, Inner as BlobFileInner, @@ -13,7 +13,10 @@ use crate::{ scanner::SizeMap, BlobFile, BlobFileId, BlobFileWriter, Config, GcStrategy, IndexReader, ValueHandle, }, - Cache, DescriptorTable, KeyRange, UserValue, + Cache, + DescriptorTable, + KeyRange, + UserValue, }; use std::{ path::{Path, PathBuf}, @@ -113,13 +116,15 @@ impl ValueLog { path: P, // TODO: move path into config? config: Config, ) -> crate::Result { - let path = path.into(); + // let path = path.into(); - if path.join(VLOG_MARKER).try_exists()? { - Self::recover(path, config) - } else { - Self::create_new(path, config) - } + // if path.join(VLOG_MARKER).try_exists()? { + // Self::recover(path, config) + // } else { + // Self::create_new(path, config) + // } + + unimplemented!() } /* /// Prints fragmentation histogram. 
@@ -174,8 +179,8 @@ impl ValueLog { std::fs::create_dir_all(&path)?; - let marker_path = path.join(VLOG_MARKER); - assert!(!marker_path.try_exists()?); + // let marker_path = path.join(VLOG_MARKER); + // assert!(!marker_path.try_exists()?); // NOTE: Lastly, fsync .vlog marker, which contains the version // -> the V-log is fully initialized From 321915c082a228c1a37c0024cd00d4bf0f02c23e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:06:10 +0200 Subject: [PATCH 405/613] remove old compression type --- src/blob_tree/compression.rs | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 src/blob_tree/compression.rs diff --git a/src/blob_tree/compression.rs b/src/blob_tree/compression.rs deleted file mode 100644 index 3ab425e7..00000000 --- a/src/blob_tree/compression.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::vlog::Compressor; -use crate::CompressionType; - -#[derive(Copy, Clone, Debug)] -pub struct MyCompressor(pub(crate) CompressionType); - -impl Default for MyCompressor { - fn default() -> Self { - Self(CompressionType::None) - } -} - -impl Compressor for MyCompressor { - fn compress(&self, bytes: &[u8]) -> crate::Result> { - Ok(match self.0 { - CompressionType::None => bytes.into(), - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => lz4_flex::compress_prepend_size(bytes), - }) - } - - fn decompress(&self, bytes: &[u8]) -> crate::Result> { - match self.0 { - CompressionType::None => Ok(bytes.into()), - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => lz4_flex::decompress_size_prepended(bytes) - .map_err(|_| crate::Error::Decompress(self.0)), - } - } -} From e34684ee21cba000a0e8e73d94cca650fc4784e0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:06:14 +0200 Subject: [PATCH 406/613] wip --- src/blob_tree/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index b489867d..bd09f8c7 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -2,7 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -mod compression; mod gc; pub mod index; pub mod value; From 2396dba7a665f06a841f96c8a1496fe8a428adc7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:07:41 +0200 Subject: [PATCH 407/613] restore blob tree flush --- src/blob_tree/mod.rs | 573 +++++++++++++++++++++++--------------- src/level_manifest/mod.rs | 112 ++++---- src/vlog/accessor.rs | 94 +++++++ src/vlog/mod.rs | 54 +++- 4 files changed, 544 insertions(+), 289 deletions(-) create mode 100644 src/vlog/accessor.rs diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index bd09f8c7..c9fed6b8 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -9,29 +9,29 @@ pub mod value; use crate::{ coding::{Decode, Encode}, compaction::stream::CompactionStream, - file::BLOBS_FOLDER, + file::{fsync_directory, BLOBS_FOLDER}, iter_guard::{IterGuard, IterGuardImpl}, r#abstract::{AbstractTree, RangeItem}, segment::Segment, tree::inner::MemtableId, value::InternalValue, - vlog::ValueLog, - Config, Memtable, SegmentId, SeqNo, UserKey, UserValue, + vlog::{BlobFile, BlobFileWriter, ValueHandle, ValueLog}, + Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; -use compression::MyCompressor; use gc::{reader::GcReader, 
writer::GcWriter}; use index::IndexTree; use std::{ io::Cursor, ops::{RangeBounds, RangeFull}, - sync::{atomic::AtomicUsize, Arc}, + path::PathBuf, + sync::{ + atomic::{AtomicU64, AtomicUsize}, + Arc, + }, }; use value::MaybeInlineValue; -pub struct Guard<'a>( - &'a ValueLog, - crate::Result<(UserKey, UserValue)>, -); +pub struct Guard<'a>(&'a ValueLog, crate::Result<(UserKey, UserValue)>); impl IterGuard for Guard<'_> { fn key(self) -> crate::Result { @@ -59,7 +59,7 @@ impl IterGuard for Guard<'_> { } } -fn resolve_value_handle(vlog: &crate::vlog::ValueLog, item: RangeItem) -> RangeItem { +fn resolve_value_handle(vlog: &crate::vlog::ValueLog, item: RangeItem) -> RangeItem { use MaybeInlineValue::{Indirect, Inline}; match item { @@ -95,41 +95,124 @@ pub struct BlobTree { #[doc(hidden)] pub index: IndexTree, - /// Log-structured value-log that stores large values - #[doc(hidden)] - pub blobs: crate::vlog::ValueLog, + blobs_folder: PathBuf, // TODO: maybe replace this with a nonce system #[doc(hidden)] pub pending_segments: Arc, + + blob_file_id_generator: SequenceNumberCounter, } impl BlobTree { pub(crate) fn open(config: Config) -> crate::Result { - let path = &config.path; - - let vlog_path = path.join(BLOBS_FOLDER); - let vlog_cfg = crate::vlog::Config::::new( - config.cache.clone(), - config.descriptor_table.clone(), - ) - .blob_file_size_bytes(config.blob_file_target_size) - .compression(match config.blob_compression { - crate::CompressionType::None => None, + // let path = &config.path; - #[cfg(feature = "lz4")] - c => Some(MyCompressor(c)), - }); + // let vlog_path = path.join(BLOBS_FOLDER); + // let vlog_cfg = + // crate::vlog::Config::new(config.cache.clone(), config.descriptor_table.clone()) + // .blob_file_size_bytes(config.blob_file_target_size) + // .compression(config.blob_compression); let index: IndexTree = config.open()?.into(); + let blobs_folder = index.config.path.join(BLOBS_FOLDER); + std::fs::create_dir_all(&blobs_folder)?; + fsync_directory(&blobs_folder)?; + + let blob_file_id_to_continue_with = index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .value_log + .values() + .map(BlobFile::id) + .max() + .map(|x| x + 1) + .unwrap_or_default(); + Ok(Self { index, - blobs: ValueLog::open(vlog_path, vlog_cfg)?, + blobs_folder, pending_segments: Arc::new(AtomicUsize::new(0)), + blob_file_id_generator: SequenceNumberCounter::new(blob_file_id_to_continue_with), }) } + #[must_use] + pub fn space_amp(&self) -> f32 { + todo!() + } + + /// Consumes a [`BlobFileWriter`], returning a `BlobFile` handle. + /// + /// # Note + /// + /// The blob file is **not** added to the value log immediately. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. 
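+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // Hedged sketch: `writer` is assumed to be an already-populated BlobFileWriter.
+    /// // Empty writers yield no handles, and the caller is responsible for
+    /// // registering the returned blob files with the manifest/version.
+    /// let blob_files = Self::consume_blob_file_writer(writer)?;
+    /// ```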
+ fn consume_blob_file_writer(writer: BlobFileWriter) -> crate::Result> { + use crate::vlog::blob_file::{GcStats, Inner as BlobFileInner, Metadata}; + + let writers = writer.finish()?; + + let mut blob_files = Vec::with_capacity(writers.len()); + + for writer in writers { + if writer.item_count == 0 { + log::debug!( + "Blob file writer at {} has written no data, deleting empty blob file", + writer.path.display(), + ); + if let Err(e) = std::fs::remove_file(&writer.path) { + log::warn!( + "Could not delete empty blob file at {}: {e:?}", + writer.path.display(), + ); + } + continue; + } + + let blob_file_id = writer.blob_file_id; + + blob_files.push(BlobFile(Arc::new(BlobFileInner { + id: blob_file_id, + path: writer.path, + meta: Metadata { + item_count: writer.item_count, + compressed_bytes: writer.written_blob_bytes, + total_uncompressed_bytes: writer.uncompressed_bytes, + + // NOTE: We are checking for 0 items above + // so first and last key need to exist + #[allow(clippy::expect_used)] + key_range: crate::KeyRange::new(( + writer + .first_key + .clone() + .expect("should have written at least 1 item"), + writer + .last_key + .clone() + .expect("should have written at least 1 item"), + )), + }, + gc_stats: GcStats::default(), + }))); + + log::debug!( + "Created blob file #{blob_file_id:?} ({} items, {} userdata bytes)", + writer.item_count, + writer.uncompressed_bytes, + ); + } + + Ok(blob_files) + } + /// Scans the index tree, collecting statistics about value log fragmentation. #[doc(hidden)] pub fn gc_scan_stats( @@ -140,109 +223,117 @@ impl BlobTree { use std::io::Error as IoError; use MaybeInlineValue::{Indirect, Inline}; - while self - .pending_segments - .load(std::sync::atomic::Ordering::Acquire) - > 0 - { - // IMPORTANT: Busy wait until all segments in-flight are committed - // to the tree - } - - // IMPORTANT: Lock + snapshot memtable to avoid read skew + preventing tampering with memtable - let _memtable_lock = self.index.read_lock_active_memtable(); - - while self - .pending_segments - .load(std::sync::atomic::Ordering::Acquire) - > 0 - { - // IMPORTANT: Busy wait again until all segments in-flight are committed - // to the tree - } - - let iter = self - .index - .create_internal_range::<&[u8], RangeFull>(&.., seqno, None); - - // Stores the max seqno of every blob file - let mut seqno_map = crate::HashMap::::default(); - - let result = self.blobs.scan_for_stats(iter.filter_map(|kv| { - let Ok(kv) = kv else { - return Some(Err(IoError::other( - "Failed to load KV pair from index tree", - ))); - }; - - let mut cursor = Cursor::new(kv.value); - let value = match MaybeInlineValue::decode_from(&mut cursor) { - Ok(v) => v, - Err(e) => return Some(Err(IoError::other(e.to_string()))), - }; - - match value { - Indirect { vhandle, size } => { - seqno_map - .entry(vhandle.blob_file_id) - .and_modify(|x| *x = (*x).max(kv.key.seqno)) - .or_insert(kv.key.seqno); - - Some(Ok((vhandle, size))) - } - Inline(_) => None, - } - })); - - let mut lock = self - .blobs - .manifest - .blob_files - .write() - .expect("lock is poisoned"); - - // IMPORTANT: We are overwiting the staleness of blob files - // that contain an item that is still contained in the GC watermark - // so snapshots cannot accidentally lose data - // - // TODO: 3.0.0 this should be dealt with in value-log 2.0 (make it MVCC aware) - for (blob_file_id, max_seqno) in seqno_map { - if gc_watermark <= max_seqno { - if let Some(blob_file) = lock.get_mut(&blob_file_id) { - blob_file.gc_stats.set_stale_items(0); - 
blob_file.gc_stats.set_stale_bytes(0); - } - } - } - - result + todo!() + + // while self + // .pending_segments + // .load(std::sync::atomic::Ordering::Acquire) + // > 0 + // { + // // IMPORTANT: Busy wait until all segments in-flight are committed + // // to the tree + // } + + // // IMPORTANT: Lock + snapshot memtable to avoid read skew + preventing tampering with memtable + // let _memtable_lock = self.index.read_lock_active_memtable(); + + // while self + // .pending_segments + // .load(std::sync::atomic::Ordering::Acquire) + // > 0 + // { + // // IMPORTANT: Busy wait again until all segments in-flight are committed + // // to the tree + // } + + // let iter = self + // .index + // .create_internal_range::<&[u8], RangeFull>(&.., seqno, None); + + // // Stores the max seqno of every blob file + // let mut seqno_map = crate::HashMap::::default(); + + // let result = self.blobs.scan_for_stats(iter.filter_map(|kv| { + // let Ok(kv) = kv else { + // return Some(Err(IoError::other( + // "Failed to load KV pair from index tree", + // ))); + // }; + + // let mut cursor = Cursor::new(kv.value); + // let value = match MaybeInlineValue::decode_from(&mut cursor) { + // Ok(v) => v, + // Err(e) => return Some(Err(IoError::other(e.to_string()))), + // }; + + // match value { + // Indirect { vhandle, size } => { + // seqno_map + // .entry(vhandle.blob_file_id) + // .and_modify(|x| *x = (*x).max(kv.key.seqno)) + // .or_insert(kv.key.seqno); + + // Some(Ok((vhandle, size))) + // } + // Inline(_) => None, + // } + // })); + + // // TODO: + + // // let mut lock = self + // // .blobs + // // .manifest + // // .blob_files + // // .write() + // // .expect("lock is poisoned"); + + // // // IMPORTANT: We are overwiting the staleness of blob files + // // // that contain an item that is still contained in the GC watermark + // // // so snapshots cannot accidentally lose data + // // // + // // // TODO: 3.0.0 this should be dealt with in value-log 2.0 (make it MVCC aware) + // // for (blob_file_id, max_seqno) in seqno_map { + // // if gc_watermark <= max_seqno { + // // if let Some(blob_file) = lock.get_mut(&blob_file_id) { + // // blob_file.gc_stats.set_stale_items(0); + // // blob_file.gc_stats.set_stale_bytes(0); + // // } + // // } + // // } + + // result } pub fn apply_gc_strategy( &self, - strategy: &impl crate::vlog::GcStrategy, + strategy: &impl crate::vlog::GcStrategy, seqno: SeqNo, ) -> crate::Result { - // IMPORTANT: Write lock memtable to avoid read skew - let memtable_lock = self.index.lock_active_memtable(); + todo!() - self.blobs.apply_gc_strategy( - strategy, - &GcReader::new(&self.index, &memtable_lock), - GcWriter::new(seqno, &memtable_lock), - )?; + // // IMPORTANT: Write lock memtable to avoid read skew + // let memtable_lock = self.index.lock_active_memtable(); + + // self.blobs.apply_gc_strategy( + // strategy, + // &GcReader::new(&self.index, &memtable_lock), + // GcWriter::new(seqno, &memtable_lock), + // )?; - // NOTE: We still have the memtable lock, can't use gc_drop_stale because recursive locking - self.blobs.drop_stale_blob_files() + // // NOTE: We still have the memtable lock, can't use gc_drop_stale because recursive locking + // self.blobs.drop_stale_blob_files() } /// Drops all stale blob segment files #[doc(hidden)] pub fn gc_drop_stale(&self) -> crate::Result { - // IMPORTANT: Write lock memtable to avoid read skew - let _lock = self.index.lock_active_memtable(); + todo!() + + // // IMPORTANT: Write lock memtable to avoid read skew + // let _lock = 
self.index.lock_active_memtable(); - self.blobs.drop_stale_blob_files() + // self.blobs.drop_stale_blob_files() } #[doc(hidden)] @@ -251,29 +342,40 @@ impl BlobTree { return Ok(None); }; - let Some(segment) = self.flush_memtable(segment_id, &yanked_memtable, eviction_seqno)? + let Some((segment, blob_file)) = + self.flush_memtable(segment_id, &yanked_memtable, eviction_seqno)? else { return Ok(None); }; - self.register_segments(std::slice::from_ref(&segment), eviction_seqno)?; + self.register_segments( + std::slice::from_ref(&segment), + blob_file.as_ref().map(std::slice::from_ref), + eviction_seqno, + )?; Ok(Some(segment)) } } impl AbstractTree for BlobTree { + fn version_free_list_len(&self) -> usize { + self.index.version_free_list_len() + } + fn prefix>( &self, prefix: K, seqno: SeqNo, index: Option>, ) -> Box> + '_> { - Box::new( - self.index - .0 - .create_prefix(&prefix, seqno, index) - .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), - ) + todo!() + + // Box::new( + // self.index + // .0 + // .create_prefix(&prefix, seqno, index) + // .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), + // ) } fn range, R: RangeBounds>( @@ -282,12 +384,14 @@ impl AbstractTree for BlobTree { seqno: SeqNo, index: Option>, ) -> Box> + '_> { - Box::new( - self.index - .0 - .create_range(&range, seqno, index) - .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), - ) + todo!() + + // Box::new( + // self.index + // .0 + // .create_range(&range, seqno, index) + // .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), + // ) } fn tombstone_count(&self) -> u64 { @@ -302,61 +406,64 @@ impl AbstractTree for BlobTree { use crate::tree::ingest::Ingestion; use std::time::Instant; - // NOTE: Lock active memtable so nothing else can be going on while we are bulk loading - let lock = self.lock_active_memtable(); - assert!( - lock.is_empty(), - "can only perform bulk_ingest on empty trees", - ); - - let mut segment_writer = Ingestion::new(&self.index)?; - let mut blob_writer = self.blobs.get_writer()?; - - let start = Instant::now(); - let mut count = 0; - let mut last_key = None; - - for (key, value) in iter { - if let Some(last_key) = &last_key { - assert!( - key > last_key, - "next key in bulk ingest was not greater than last key", - ); - } - last_key = Some(key.clone()); - - // NOTE: Values are 32-bit max - #[allow(clippy::cast_possible_truncation)] - let value_size = value.len() as u32; - - if value_size >= self.index.config.blob_file_separation_threshold { - let vhandle = blob_writer.get_next_value_handle(); - - let indirection = MaybeInlineValue::Indirect { - vhandle, - size: value_size, - }; - // TODO: use Slice::with_size - let mut serialized_indirection = vec![]; - indirection.encode_into(&mut serialized_indirection)?; - - segment_writer.write(key.clone(), serialized_indirection.into())?; - - blob_writer.write(&key, value)?; - } else { - // TODO: use Slice::with_size - let direct = MaybeInlineValue::Inline(value); - let serialized_direct = direct.encode_into_vec(); - segment_writer.write(key, serialized_direct.into())?; - } - - count += 1; - } - - self.blobs.register_writer(blob_writer)?; - segment_writer.finish()?; - - log::info!("Ingested {count} items in {:?}", start.elapsed()); + todo!(); + + // // NOTE: Lock active memtable so nothing else can be going on while we are bulk loading + // let lock = self.lock_active_memtable(); + // assert!( + // lock.is_empty(), + // "can only perform bulk_ingest on empty trees", + // ); + + // let mut segment_writer = 
Ingestion::new(&self.index)?; + // let mut blob_writer = self.blobs.get_writer()?; + + // let start = Instant::now(); + // let mut count = 0; + // let mut last_key = None; + + // for (key, value) in iter { + // if let Some(last_key) = &last_key { + // assert!( + // key > last_key, + // "next key in bulk ingest was not greater than last key", + // ); + // } + // last_key = Some(key.clone()); + + // // NOTE: Values are 32-bit max + // #[allow(clippy::cast_possible_truncation)] + // let value_size = value.len() as u32; + + // if value_size >= self.index.config.blob_file_separation_threshold { + // let vhandle = blob_writer.get_next_value_handle(); + + // let indirection = MaybeInlineValue::Indirect { + // vhandle, + // size: value_size, + // }; + // // TODO: use Slice::with_size + // let mut serialized_indirection = vec![]; + // indirection.encode_into(&mut serialized_indirection)?; + + // segment_writer.write(key.clone(), serialized_indirection.into())?; + + // blob_writer.write(&key, value)?; + // } else { + // // TODO: use Slice::with_size + // let direct = MaybeInlineValue::Inline(value); + // let serialized_direct = direct.encode_into_vec(); + // segment_writer.write(key, serialized_direct.into())?; + // } + + // count += 1; + // } + + // // TODO: add to manifest + unit test + // // self.blobs.register_writer(blob_writer)?; + // // segment_writer.finish()?; + + // log::info!("Ingested {count} items in {:?}", start.elapsed()); Ok(()) } @@ -374,7 +481,12 @@ impl AbstractTree for BlobTree { } fn blob_file_count(&self) -> usize { - self.blobs.blob_file_count() + self.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .blob_file_count() } // NOTE: We skip reading from the value log @@ -405,31 +517,20 @@ impl AbstractTree for BlobTree { self.index.sealed_memtable_count() } - /* #[doc(hidden)] - fn verify(&self) -> crate::Result { - let index_tree_sum = self.index.verify()?; - let vlog_sum = self.blobs.verify()?; - Ok(index_tree_sum + vlog_sum) - } */ - fn flush_memtable( &self, segment_id: SegmentId, memtable: &Arc, eviction_seqno: SeqNo, - ) -> crate::Result> { - use crate::{ - file::SEGMENTS_FOLDER, - //segment::writer::{Options, Writer as SegmentWriter}, - segment::Writer as SegmentWriter, - }; + ) -> crate::Result)>> { + use crate::{file::SEGMENTS_FOLDER, segment::Writer as SegmentWriter}; use value::MaybeInlineValue; let lsm_segment_folder = self.index.config.path.join(SEGMENTS_FOLDER); - log::debug!("flushing memtable & performing key-value separation"); - log::debug!("=> to LSM segments in {lsm_segment_folder:?}"); - log::debug!("=> to blob segment at {:?}", self.blobs.path); + log::debug!("Flushing memtable & performing key-value separation"); + log::debug!("=> to LSM segments in {}", lsm_segment_folder.display()); + // log::debug!("=> to blob segment at {}", self.blobs.path.display()); let mut segment_writer = SegmentWriter::new( lsm_segment_folder.join(segment_id.to_string()), @@ -442,12 +543,18 @@ impl AbstractTree for BlobTree { } */ )? 
.use_data_block_compression(self.index.config.compression); - + // TODO: monkey /* segment_writer = segment_writer.use_bloom_policy( crate::segment::writer::BloomConstructionPolicy::FpRate(0.0001), ); */ - let mut blob_writer = self.blobs.get_writer()?; + let mut blob_writer = BlobFileWriter::new( + self.blob_file_id_generator.clone(), + u64::MAX, + self.index.config.path.join(BLOBS_FOLDER), + )?; + + // let mut blob_writer = self.blobs.get_writer()?.use_target_size(u64::MAX); let iter = memtable.iter().map(Ok); let compaction_filter = CompactionStream::new(iter, eviction_seqno); @@ -458,7 +565,6 @@ impl AbstractTree for BlobTree { if item.is_tombstone() { // NOTE: Still need to add tombstone to index tree // But no blob to blob writer - segment_writer.write(InternalValue::new(item.key, UserValue::empty()))?; continue; } @@ -487,10 +593,16 @@ impl AbstractTree for BlobTree { let value_size = value.len() as u32; if value_size >= self.index.config.blob_file_separation_threshold { - let vhandle = blob_writer.get_next_value_handle(); + let offset = blob_writer.offset(); + let blob_file_id = blob_writer.blob_file_id(); + let on_disk_size = blob_writer.write(&item.key.user_key, value)?; let indirection = MaybeInlineValue::Indirect { - vhandle, + vhandle: ValueHandle { + offset, + blob_file_id, + on_disk_size, + }, size: value_size, }; // TODO: use Slice::with_size @@ -499,8 +611,6 @@ impl AbstractTree for BlobTree { segment_writer .write(InternalValue::new(item.key.clone(), serialized_indirection))?; - - blob_writer.write(&item.key.user_key, value)?; } else { // TODO: use Slice::with_size let direct = MaybeInlineValue::Inline(value); @@ -509,13 +619,13 @@ impl AbstractTree for BlobTree { } } - let _memtable_lock = self.lock_active_memtable(); - - // TODO: 3.0.0 - // TODO: add to vlog atomically together with the segment (that way, we don't need the pending_segments monkey patch) + // let _memtable_lock = self.lock_active_memtable(); - log::trace!("Register blob writer into value log"); - self.blobs.register_writer(blob_writer)?; + // TODO: 3.0.0: add to vlog atomically together with the segment (that way, we don't need the pending_segments monkey patch) + log::trace!("Creating blob file"); + let blob_files = Self::consume_blob_file_writer(blob_writer)?; + assert!(blob_files.len() <= 1); + let blob_file = blob_files.into_iter().next(); log::trace!("Creating LSM-tree segment {segment_id}"); let segment = self.index.consume_writer(segment_writer)?; @@ -528,11 +638,17 @@ impl AbstractTree for BlobTree { .fetch_add(1, std::sync::atomic::Ordering::Release); } - Ok(segment) + Ok(segment.map(|segment| (segment, blob_file))) } - fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()> { - self.index.register_segments(segments, seqno_threshold)?; + fn register_segments( + &self, + segments: &[Segment], + blob_files: Option<&[BlobFile]>, + seqno_threshold: SeqNo, + ) -> crate::Result<()> { + self.index + .register_segments(segments, blob_files, seqno_threshold)?; let count = self .pending_segments @@ -618,7 +734,10 @@ impl AbstractTree for BlobTree { } fn disk_space(&self) -> u64 { - self.index.disk_space() + self.blobs.manifest.disk_space_used() + let lock = self.index.manifest.read().expect("lock is poisoned"); + let version = lock.current_version(); + let vlog = crate::vlog::Accessor::new(&version.value_log); + self.index.disk_space() + vlog.disk_space() } fn get_highest_memtable_seqno(&self) -> Option { @@ -657,6 +776,9 @@ impl AbstractTree for BlobTree { let key = 
key.as_ref(); + // TODO: refactor memtable, sealed memtables, manifest lock to be a single lock (SuperVersion kind of) + // TODO: then, try to reduce the lock access to 1, because we are accessing it twice (index.get, and then vhandle resolving...) + let Some(value) = self.index.get_vhandle(key, seqno)? else { return Ok(None); }; @@ -664,9 +786,18 @@ impl AbstractTree for BlobTree { match value { Inline(bytes) => Ok(Some(bytes)), Indirect { vhandle, .. } => { + let lock = self.index.manifest.read().expect("lock is poisoned"); + let vlog = crate::vlog::Accessor::new(&lock.current_version().value_log); + // Resolve indirection using value log - match self.blobs.get(&vhandle)? { - Some(bytes) => Ok(Some(bytes)), + match vlog.get( + &self.blobs_folder, + key, + &vhandle, + &self.index.config.cache, + &self.index.config.descriptor_table, + )? { + Some(v) => Ok(Some(v)), None => { panic!("value handle ({key:?} => {vhandle:?}) did not match any blob - this is a bug") } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 731c31d3..cf9a6567 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -9,9 +9,10 @@ use crate::{ file::{fsync_directory, rewrite_atomic, MAGIC_BYTES}, segment::Segment, version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT}, - SegmentId, SeqNo, + vlog::BlobFileId, + BlobFile, SegmentId, SeqNo, }; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt}; use hidden_set::HiddenSet; use std::{ collections::VecDeque, @@ -20,6 +21,12 @@ use std::{ sync::Arc, }; +pub struct Recovery { + pub curr_version_id: VersionId, + pub segment_ids: Vec>>, + pub blob_file_ids: Vec, +} + /// Represents the levels of a log-structured merge tree pub struct LevelManifest { /// Path of tree folder. @@ -35,7 +42,7 @@ pub struct LevelManifest { hidden_set: HiddenSet, /// Holds onto versions until they are safe to drop. - version_free_list: VecDeque, + pub(crate) version_free_list: VecDeque, } impl std::fmt::Display for LevelManifest { @@ -142,9 +149,17 @@ impl LevelManifest { Ok(manifest) } - // TODO: move into Version::decode - pub(crate) fn load_version(path: &Path) -> crate::Result>>> { - let mut level_manifest = Cursor::new(std::fs::read(path)?); + pub(crate) fn recover_ids(folder: &Path) -> crate::Result { + let curr_version_id = Self::get_current_version(folder)?; + let version_file_path = folder.join(format!("v{curr_version_id}")); + + log::info!( + "Recovering current manifest at {}", + version_file_path.display(), + ); + let mut level_manifest = Cursor::new(std::fs::read(version_file_path)?); + + // TODO: vvv move into Version::decode? 
vvv // Check header let mut magic = [0u8; MAGIC_BYTES.len()]; @@ -179,66 +194,35 @@ impl LevelManifest { levels.push(level); } - Ok(levels) - } + let blob_file_count = level_manifest.read_u32::()?; + let mut blob_file_ids = Vec::with_capacity(blob_file_count as usize); - pub(crate) fn recover_ids( - folder: &Path, - ) -> crate::Result> { - let curr_version = Self::get_current_version(folder)?; - let version_file_path = folder.join(format!("v{curr_version}")); - - let manifest = Self::load_version(&version_file_path)?; - let mut result = crate::HashMap::default(); - - for (level_idx, segment_ids) in manifest.into_iter().enumerate() { - for run in segment_ids { - for segment_id in run { - // NOTE: We know there are always less than 256 levels - #[allow(clippy::expect_used)] - result.insert( - segment_id, - level_idx - .try_into() - .expect("there are less than 256 levels"), - ); - } - } + for _ in 0..blob_file_count { + let id = level_manifest.read_u64::()?; + blob_file_ids.push(id); } - Ok(result) + Ok(Recovery { + curr_version_id, + segment_ids: levels, + blob_file_ids, + }) } pub fn get_current_version(folder: &Path) -> crate::Result { - let mut buf = [0; 8]; - - { - let mut file = std::fs::File::open(folder.join("current"))?; - file.read_exact(&mut buf)?; - } - - Ok(u64::from_le_bytes(buf)) + std::fs::File::open(folder.join("current")) + .and_then(|mut f| f.read_u64::()) + .map_err(Into::into) } pub(crate) fn recover>( folder: P, + recovery: &Recovery, segments: &[Segment], + blob_files: &[BlobFile], ) -> crate::Result { - let folder = folder.into(); - - let curr_version = Self::get_current_version(&folder)?; - let version_file_path = folder.join(format!("v{curr_version}")); - - let version_file = std::path::Path::new(&version_file_path); - - if !version_file.try_exists()? { - log::error!("Cannot find version file {}", version_file_path.display()); - return Err(crate::Error::Unrecoverable); - } - - let raw_version = Self::load_version(&version_file_path)?; - - let version_levels = raw_version + let version_levels = recovery + .segment_ids .iter() .map(|level| { let level_runs = level @@ -264,10 +248,12 @@ impl LevelManifest { .collect::>>()?; Ok(Self { - current: Version::from_levels(curr_version, version_levels), - folder, + current: Version::from_levels(recovery.curr_version_id, version_levels, { + blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect() + }), + folder: folder.into(), hidden_set: HiddenSet::default(), - version_free_list: VecDeque::default(), // TODO: 3. create free list from versions that are N < CURRENT + version_free_list: VecDeque::default(), // TODO: 3. create free list from versions that are N < CURRENT, or delete old versions eagerly... 
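+            // NOTE: `maintenance()` below drains this free list once old versions are safe to drop.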
}) } @@ -323,6 +309,8 @@ impl LevelManifest { } pub(crate) fn maintenance(&mut self, gc_watermark: SeqNo) -> crate::Result<()> { + log::debug!("Running manifest GC"); + loop { let Some(head) = self.version_free_list.front() else { break; @@ -337,6 +325,8 @@ impl LevelManifest { } } + log::debug!("Manifest GC done"); + Ok(()) } @@ -420,13 +410,7 @@ impl LevelManifest { #[cfg(test)] #[allow(clippy::expect_used)] mod tests { - use crate::{ - coding::Encode, - level_manifest::{hidden_set::HiddenSet, LevelManifest}, - version::Version, - AbstractTree, - }; - use std::collections::VecDeque; + use crate::AbstractTree; use test_log::test; #[test] diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs new file mode 100644 index 00000000..f626c184 --- /dev/null +++ b/src/vlog/accessor.rs @@ -0,0 +1,94 @@ +use crate::{ + vlog::{blob_file::writer::BLOB_HEADER_LEN, BlobFileId, ValueHandle}, + BlobFile, Cache, DescriptorTable, GlobalSegmentId, Slice, UserValue, +}; +use std::{collections::BTreeMap, fs::File, path::Path, sync::Arc}; + +pub struct Accessor<'a>(&'a BTreeMap); + +impl<'a> Accessor<'a> { + pub fn new(blob_files: &'a BTreeMap) -> Self { + Self(blob_files) + } + + pub fn disk_space(&self) -> u64 { + self.0 + .values() + .map(|x| x.0.meta.total_uncompressed_bytes) + .sum() + } + + pub fn get( + &self, + base_path: &Path, + key: &[u8], + vhandle: &ValueHandle, + cache: &Cache, + descriptor_table: &DescriptorTable, + ) -> crate::Result> { + if let Some(value) = cache.get_blob(0 /* TODO: vlog ID... */, vhandle) { + return Ok(Some(value)); + } + + let Some(blob_file) = self.0.get(&vhandle.blob_file_id) else { + return Ok(None); + }; + + let bf_id = GlobalSegmentId::from((0 /* TODO: vlog ID */, vhandle.blob_file_id)); + + let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { + fd + } else { + let file = Arc::new(File::open( + base_path.join(vhandle.blob_file_id.to_string()), + )?); + descriptor_table.insert_for_blob_file(bf_id, file.clone()); + file + }; + + let offset = vhandle.offset + (BLOB_HEADER_LEN as u64) + (key.len() as u64); + + #[warn(unsafe_code)] + let mut builder = unsafe { Slice::builder_unzeroed(vhandle.on_disk_size as usize) }; + + { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + + let bytes_read = file.read_at(&mut builder, offset)?; + + assert_eq!( + bytes_read, + vhandle.on_disk_size as usize, + "not enough bytes read: file has length {}", + file.metadata()?.len(), + ); + } + + #[cfg(windows)] + { + use std::os::windows::fs::FileExt; + + let bytes_read = file.seek_read(&mut builder, offset)?; + + assert_eq!( + bytes_read, + vhandle.on_disk_size as usize, + "not enough bytes read: file has length {}", + file.metadata()?.len(), + ); + } + + #[cfg(not(any(unix, windows)))] + { + compile_error!("unsupported OS"); + unimplemented!(); + } + } + + // TODO: decompress? 
save compression type into blobfile.meta + + Ok(Some(builder.freeze().into())) + } +} diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 165c97dd..6ae6134f 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -2,13 +2,13 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -mod blob_file; -mod compression; // TODO: remove +pub mod blob_file; mod config; mod gc; mod handle; mod index; -mod manifest; +// mod manifest; +mod accessor; #[doc(hidden)] pub mod scanner; @@ -16,8 +16,8 @@ pub mod scanner; mod value_log; pub use { + accessor::Accessor, blob_file::multi_writer::MultiWriter as BlobFileWriter, - compression::Compressor, config::Config, gc::report::GcReport, gc::{GcStrategy, SpaceAmpStrategy, StaleThresholdStrategy}, @@ -29,5 +29,51 @@ pub use { #[doc(hidden)] pub use blob_file::{reader::Reader as BlobFileReader, BlobFile}; +use crate::vlog::blob_file::{trailer::Trailer, GcStats, Inner as BlobFileInner}; +use std::{path::Path, sync::Arc}; + +pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result> { + let cnt = ids.len(); + + let progress_mod = match cnt { + _ if cnt <= 20 => 1, + _ if cnt <= 100 => 10, + _ => 100, + }; + + log::debug!("Recovering {cnt} blob files from {:?}", folder.display(),); + + // TODO: + // Self::remove_unfinished_blob_files(&folder, &ids)?; + + let mut blob_files = Vec::with_capacity(ids.len()); + + for (idx, &id) in ids.iter().enumerate() { + log::trace!("Recovering blob file #{id:?}"); + + let path = folder.join(id.to_string()); + let trailer = Trailer::from_file(&path)?; + + blob_files.push(BlobFile(Arc::new(BlobFileInner { + id, + path, + meta: trailer.metadata, + gc_stats: GcStats::default(), + }))); + + if idx % progress_mod == 0 { + log::debug!("Recovered {idx}/{cnt} blob files"); + } + } + + if blob_files.len() < ids.len() { + return Err(crate::Error::Unrecoverable); + } + + log::debug!("Successfully recovered {} blob files", blob_files.len()); + + Ok(blob_files) +} + /// The unique identifier for a value log blob file. 
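///
/// IDs are assumed (per `BlobTree::open` above) to be issued monotonically by a
/// `SequenceNumberCounter`, continuing from one past the highest recovered ID.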
From 601b6f3a64f4e4e94f8164c84b07c65f14d81d14 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:08:04 +0200 Subject: [PATCH 408/613] perform version GC after trivial move as well --- src/compaction/worker.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index c3e17e85..038d66b7 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -179,6 +179,11 @@ fn move_segments( opts.eviction_seqno, )?; + if let Err(e) = levels.maintenance(opts.eviction_seqno) { + log::error!("Manifest maintenance failed: {e:?}"); + return Err(e); + } + Ok(()) }
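Since trivial moves now also run manifest GC, the version free list should no longer grow unboundedly under move-heavy workloads. A hedged sketch of how one might observe this, using the `version_free_list_len` accessor introduced two patches below (`tree`, `strategy` and `eviction_seqno` are assumed to exist in the surrounding code):

    // After a compaction round (including trivial moves), old versions below
    // the eviction seqno are drained from the manifest's free list.
    tree.compact(strategy.clone(), eviction_seqno)?;
    log::debug!(
        "version free list now holds {} old versions",
        tree.version_free_list_len(),
    );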
From 7f281ddb4bbf1985448dd98251173eb3d131eec9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:08:23 +0200 Subject: [PATCH 409/613] recover blob files --- src/tree/mod.rs | 79 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 6a60d0e3..da828554 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -5,13 +5,11 @@ pub mod ingest; pub mod inner; -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use crate::{ coding::{Decode, Encode}, compaction::CompactionStrategy, config::Config, + file::BLOBS_FOLDER, format_version::FormatVersion, iter_guard::{IterGuard, IterGuardImpl}, level_manifest::LevelManifest, @@ -19,6 +17,7 @@ use crate::{ memtable::Memtable, segment::Segment, value::InternalValue, + vlog::BlobFile, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, UserKey, UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; @@ -29,6 +28,9 @@ use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + pub struct Guard(crate::Result<(UserKey, UserValue)>); impl IterGuard for Guard { @@ -68,6 +70,14 @@ impl std::ops::Deref for Tree { } impl AbstractTree for Tree { + fn version_free_list_len(&self) -> usize { + self.manifest + .read() + .expect("lock is poisoned") + .version_free_list + .len() + } + fn prefix<K: AsRef<[u8]>>( &self, prefix: K, @@ -236,7 +246,7 @@ impl AbstractTree for Tree { segment_id: SegmentId, memtable: &Arc<Memtable>, seqno_threshold: SeqNo, - ) -> crate::Result<Option<Segment>> { + ) -> crate::Result<Option<(Segment, Option<BlobFile>)>> { use crate::{compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer}; use std::time::Instant; @@ -275,11 +285,20 @@ impl AbstractTree for Tree { log::debug!("Flushed memtable {segment_id:?} in {:?}", start.elapsed()); - Ok(result) + Ok(result.map(|segment| (segment, None))) } - fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()> { - log::trace!("Registering {} segments", segments.len()); + fn register_segments( + &self, + segments: &[Segment], + blob_files: Option<&[BlobFile]>, + seqno_threshold: SeqNo, + ) -> crate::Result<()> { + log::trace!( + "Registering {} segments, {} blob files", + segments.len(), + blob_files.map(<[BlobFile]>::len).unwrap_or_default(), + ); // NOTE: Mind lock order L -> M -> S log::trace!("register: Acquiring levels manifest write lock"); @@ -291,9 +310,10 @@ impl AbstractTree for Tree { let mut sealed_memtables = self.sealed_memtables.write().expect("lock is poisoned"); log::trace!("register: Acquired sealed memtables write lock"); - manifest.atomic_swap(|version| version.with_new_l0_run(segments), seqno_threshold)?; - - // eprintln!("{manifest}"); + manifest.atomic_swap( + |version| version.with_new_l0_run(segments, blob_files), + seqno_threshold, + )?; for segment in segments { log::trace!("releasing sealed memtable {}", segment.id()); @@ -594,11 +614,12 @@ impl Tree { return Ok(None); }; - let Some(segment) = self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? + let Some((segment, _)) = + self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? else { return Ok(None); }; - self.register_segments(std::slice::from_ref(&segment), seqno_threshold)?; + self.register_segments(std::slice::from_ref(&segment), None, seqno_threshold)?; Ok(Some(segment)) } @@ -929,9 +950,30 @@ impl Tree { let tree_path = tree_path.as_ref(); - log::info!("Recovering manifest at {}", tree_path.display()); + let recovery = LevelManifest::recover_ids(tree_path)?; + + let segment_id_map = { + let mut result: crate::HashMap<SegmentId, u8> = + crate::HashMap::default(); + + for (level_idx, segment_ids) in recovery.segment_ids.iter().enumerate() { + for run in segment_ids { + for segment_id in run { + // NOTE: We know there are always less than 256 levels + #[allow(clippy::expect_used)] + result.insert( + *segment_id, + level_idx + .try_into() + .expect("there are less than 256 levels"), + ); + } + } + } + + result + }; - let segment_id_map = LevelManifest::recover_ids(tree_path)?; let cnt = segment_id_map.len(); log::debug!( @@ -989,7 +1031,7 @@ impl Tree { tree_id, cache.clone(), descriptor_table.clone(), - level_idx <= 2, // TODO: look at configuration + level_idx <= 1, // TODO: look at configuration level_idx <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] metrics.clone(), @@ -1021,6 +1063,11 @@ impl Tree { log::debug!("Successfully recovered {} segments", segments.len()); - LevelManifest::recover(tree_path, &segments) + let blob_files = crate::vlog::recover_blob_files( + &tree_path.join(BLOBS_FOLDER), + &recovery.blob_file_ids, + )?; + + LevelManifest::recover(tree_path, &recovery, &segments, &blob_files) } }
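To make the recovery flow above concrete, here is a self-contained sketch of the mapping it builds; `recovery.segment_ids` is nested as levels → runs → segment IDs, and the loop flattens it into (segment ID → level index). The literal IDs below are illustrative:

    // levels → runs → segment IDs, flattened into (segment ID → level index).
    let recovered: Vec<Vec<Vec<u64>>> = vec![
        vec![vec![7], vec![6]], // L0: two runs
        vec![vec![1, 2, 3]],    // L1: one sorted run
    ];
    let mut map = std::collections::HashMap::new();
    for (level_idx, runs) in recovered.iter().enumerate() {
        for run in runs {
            for &id in run {
                map.insert(id, level_idx as u8);
            }
        }
    }
    assert_eq!(map.get(&2), Some(&1)); // segment 2 lives in L1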
From 11f52266b899d03245c111f89a3431153d293ded Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:08:36 +0200 Subject: [PATCH 410/613] wip --- src/abstract.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 3052b7d5..8ddf930e 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -4,8 +4,8 @@ use crate::{ compaction::CompactionStrategy, config::TreeType, iter_guard::IterGuardImpl, segment::Segment, - tree::inner::MemtableId, AnyTree, BlobTree, Config, Guard, KvPair, Memtable, SegmentId, SeqNo, - Tree, UserKey, UserValue, + tree::inner::MemtableId, vlog::BlobFile, AnyTree, BlobTree, Config, Guard, KvPair, Memtable, + SegmentId, SeqNo, Tree, UserKey, UserValue, }; use enum_dispatch::enum_dispatch; use std::{ @@ -91,6 +91,9 @@ pub trait AbstractTree { /// Gets the memory usage of all pinned index blocks in the tree. fn pinned_block_index_size(&self) -> usize; + /// Gets the length of the version free list. + fn version_free_list_len(&self) -> usize; + // TODO:? /* #[doc(hidden)] fn verify(&self) -> crate::Result; */ @@ -108,14 +111,19 @@ pub trait AbstractTree { segment_id: SegmentId, // TODO: remove? memtable: &Arc<Memtable>, seqno_threshold: SeqNo, - ) -> crate::Result<Option<Segment>>; + ) -> crate::Result<Option<(Segment, Option<BlobFile>)>>; /// Atomically registers flushed disk segments into the tree, removing their associated sealed memtables. /// /// # Errors /// /// Will return `Err` if an IO error occurs. - fn register_segments(&self, segments: &[Segment], seqno_threshold: SeqNo) -> crate::Result<()>; + fn register_segments( + &self, + segments: &[Segment], + blob_files: Option<&[BlobFile]>, + seqno_threshold: SeqNo, + ) -> crate::Result<()>; /// Write-locks the active memtable for exclusive access fn lock_active_memtable(&self) -> RwLockWriteGuard<'_, Arc<Memtable>>; @@ -175,7 +183,7 @@ pub trait AbstractTree { /// Returns the number of disk segments currently in the tree. fn segment_count(&self) -> usize; - /// Returns the number of segments in levels[idx]. + /// Returns the number of segments in `levels[idx]`. /// /// Returns `None` if the level does not exist (if idx >= 7). fn level_segment_count(&self, idx: usize) -> Option<usize>; From f952e33dbe46696008c29e1471bbeed414f2758f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:08:52 +0200 Subject: [PATCH 411/613] adjust tests --- tests/blob_drop_after_flush.rs | 4 +- tests/blob_gc.rs | 42 ++++++++++---------- tests/blob_sep_threshold.rs | 4 +- tests/blob_simple.rs | 11 +++-- tests/blob_tree_flush.rs | 10 ++--- tests/experimental_blob_tree_guarded_size.rs | 1 + tests/experimental_tree_guarded_range.rs | 1 + tests/tree_approx_len.rs | 2 + tests/tree_count.rs | 1 + tests/tree_l0_range.rs | 3 +- tests/tree_sealed_shadowing.rs | 4 +- tests/tree_seqno.rs | 4 +- tests/tree_shadowing.rs | 2 + 13 files changed, 51 insertions(+), 38 deletions(-) diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index 04c51314..1c592a18 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -18,7 +18,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { tree.insert("a", "neptune".repeat(10_000), 0); let (id, memtable) = tree.rotate_memtable().unwrap(); - let segment = tree.flush_memtable(id, &memtable, 0).unwrap().unwrap(); + let (segment, blob_file) = tree.flush_memtable(id, &memtable, 0).unwrap().unwrap(); // NOTE: Segment is now in-flight @@ -36,7 +36,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, 0)?; - tree.register_segments(&[segment], 0)?; + tree.register_segments(&[segment], Some(&[blob_file.unwrap()]), 0)?; assert_eq!( "neptune".repeat(10_000).as_bytes(), diff --git a/tests/blob_gc.rs b/tests/blob_gc.rs index 9a647420..6aa50d86 100644 --- a/tests/blob_gc.rs +++ b/tests/blob_gc.rs @@ -15,32 +15,32 @@ fn blob_gc_1() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.blob_file_count()); + assert_eq!(1, tree.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1.0, tree.space_amp()); tree.insert("a", "a", seqno.next()); tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.blobs.space_amp()); + assert_eq!(1.5, tree.space_amp()); tree.insert("b", "b", seqno.next()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.blobs.space_amp()); + assert_eq!(3.0, tree.space_amp()); // NOTE: Everything is stale tree.insert("c", "c", seqno.next()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(0.0, tree.blobs.space_amp()); + assert_eq!(0.0, tree.space_amp()); tree.gc_drop_stale()?; assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a"); assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b"); assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), b"c"); - assert_eq!(0, tree.blobs.blob_file_count()); - 
assert_eq!(0.0, tree.blobs.space_amp()); + assert_eq!(0, tree.blob_file_count()); + assert_eq!(0.0, tree.space_amp()); Ok(()) } @@ -59,18 +59,18 @@ fn blob_gc_2() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.blob_file_count()); + assert_eq!(1, tree.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1.0, tree.space_amp()); tree.insert("a", "a", seqno.next()); tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.blobs.space_amp()); + assert_eq!(1.5, tree.space_amp()); tree.insert("b", "b", seqno.next()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.blobs.space_amp()); + assert_eq!(3.0, tree.space_amp()); let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; @@ -81,8 +81,8 @@ fn blob_gc_2() -> lsm_tree::Result<()> { &*tree.get("c", SeqNo::MAX)?.unwrap(), "neptune".repeat(10_000).as_bytes() ); - assert_eq!(1, tree.blobs.blob_file_count()); - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1, tree.blob_file_count()); + assert_eq!(1.0, tree.space_amp()); tree.insert("c", "c", seqno.next()); @@ -90,7 +90,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blobs.blob_file_count()); + assert_eq!(0, tree.blob_file_count()); Ok(()) } @@ -109,19 +109,19 @@ fn blob_gc_3() -> lsm_tree::Result<()> { tree.insert("c", "neptune".repeat(10_000), seqno.next()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.blob_file_count()); + assert_eq!(1, tree.blob_file_count()); tree.gc_scan_stats(seqno.get(), 0)?; - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1.0, tree.space_amp()); tree.remove("a", seqno.next()); tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.blobs.space_amp()); + assert_eq!(1.5, tree.space_amp()); tree.remove("b", seqno.next()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.blobs.space_amp()); + assert_eq!(3.0, tree.space_amp()); let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; @@ -132,8 +132,8 @@ fn blob_gc_3() -> lsm_tree::Result<()> { &*tree.get("c", SeqNo::MAX)?.unwrap(), "neptune".repeat(10_000).as_bytes() ); - assert_eq!(1, tree.blobs.blob_file_count()); - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1, tree.blob_file_count()); + assert_eq!(1.0, tree.space_amp()); tree.remove("c", seqno.next()); assert!(tree.get("c", SeqNo::MAX)?.is_none()); @@ -142,7 +142,7 @@ fn blob_gc_3() -> lsm_tree::Result<()> { let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blobs.blob_file_count()); + assert_eq!(0, tree.blob_file_count()); Ok(()) } diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index 16a7540a..dd162d4f 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -12,11 +12,11 @@ fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { tree.insert("a", "a".repeat(1_023), 0); tree.flush_active_memtable(0)?; - assert_eq!(tree.blobs.blob_file_count(), 0); + assert_eq!(0, tree.blob_file_count()); tree.insert("b", "b".repeat(1_024), 0); tree.flush_active_memtable(0)?; - assert_eq!(tree.blobs.blob_file_count(), 1); + assert_eq!(1, 
tree.blob_file_count()); assert_eq!(2, tree.len(SeqNo::MAX, None)?); diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 29de63fe..e91cf0e0 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -2,7 +2,6 @@ use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] -#[ignore = "wip"] fn blob_tree_simple() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); @@ -22,12 +21,15 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, big_value); + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"small value"); + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + tree.insert("big", &new_big_value, 1); let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); @@ -37,6 +39,9 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, new_big_value); + + let value = tree.get("big", 1)?.expect("should exist"); + assert_eq!(&*value, big_value); } { diff --git a/tests/blob_tree_flush.rs b/tests/blob_tree_flush.rs index c47bb757..1fa4792c 100644 --- a/tests/blob_tree_flush.rs +++ b/tests/blob_tree_flush.rs @@ -17,20 +17,20 @@ fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { tree.remove("b", seqno.next()); tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(2.0, tree.blobs.space_amp()); + assert_eq!(2.0, tree.space_amp()); let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(1, tree.blobs.blob_file_count()); + assert_eq!(1, tree.blob_file_count()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1.0, tree.space_amp()); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blobs.blob_file_count()); + assert_eq!(1, tree.blob_file_count()); tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(1.0, tree.blobs.space_amp()); + assert_eq!(1.0, tree.space_amp()); Ok(()) } diff --git a/tests/experimental_blob_tree_guarded_size.rs b/tests/experimental_blob_tree_guarded_size.rs index f105d3c4..c435f359 100644 --- a/tests/experimental_blob_tree_guarded_size.rs +++ b/tests/experimental_blob_tree_guarded_size.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; #[test] +#[ignore = "restore"] fn experimental_blob_tree_guarded_size() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/experimental_tree_guarded_range.rs b/tests/experimental_tree_guarded_range.rs index f8f89cbd..286da5cf 100644 --- a/tests/experimental_tree_guarded_range.rs +++ b/tests/experimental_tree_guarded_range.rs @@ -32,6 +32,7 @@ fn experimental_tree_guarded_range() -> lsm_tree::Result<()> { } #[test] +#[ignore = "restore"] fn experimental_blob_tree_guarded_range() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_approx_len.rs b/tests/tree_approx_len.rs index 0c0dd2df..64c87b04 100644 --- a/tests/tree_approx_len.rs +++ b/tests/tree_approx_len.rs @@ -34,6 +34,7 @@ fn tree_approx_len_sealed() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_approx_len_sealed_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; @@ -128,6 +129,7 @@ fn tree_approx_len() -> 
lsm_tree::Result<()> { } #[test] +#[ignore = "restore"] fn tree_approx_len_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; diff --git a/tests/tree_count.rs b/tests/tree_count.rs index d40624ab..5f52069d 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -62,6 +62,7 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_flushed_count_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_l0_range.rs b/tests/tree_l0_range.rs index 25dd7abf..2f9b4d8f 100644 --- a/tests/tree_l0_range.rs +++ b/tests/tree_l0_range.rs @@ -2,7 +2,8 @@ use lsm_tree::{AbstractTree, Guard, SeqNo}; use test_log::test; #[test] -fn tree_l0_range() -> lsm_tree::Result<()> { +#[ignore] +fn tree_l0_range_blob() -> lsm_tree::Result<()> { let folder: tempfile::TempDir = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/tree_sealed_shadowing.rs b/tests/tree_sealed_shadowing.rs index 9bc83887..e6d02772 100644 --- a/tests/tree_sealed_shadowing.rs +++ b/tests/tree_sealed_shadowing.rs @@ -19,8 +19,8 @@ fn tree_sealed_memtable_tombstone_shadowing() -> lsm_tree::Result<()> { let (id, memtable) = tree.rotate_memtable().unwrap(); assert!(!tree.contains_key("a", SeqNo::MAX)?); - let segment = tree.flush_memtable(id, &memtable, 0)?.unwrap(); - tree.register_segments(&[segment], 0)?; + let (segment, _) = tree.flush_memtable(id, &memtable, 0)?.unwrap(); + tree.register_segments(&[segment], None, 0)?; assert!(!tree.contains_key("a", SeqNo::MAX)?); diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index 6ebda8d1..8316c2b9 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -45,8 +45,8 @@ fn tree_highest_seqno() -> lsm_tree::Result<()> { assert_eq!(tree.get_highest_memtable_seqno(), Some(4)); assert_eq!(tree.get_highest_persisted_seqno(), Some(3)); - let segment = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); - tree.register_segments(&[segment], 0)?; + let (segment, _) = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); + tree.register_segments(&[segment], None, 0)?; assert_eq!(tree.get_highest_seqno(), Some(4)); assert_eq!(tree.get_highest_memtable_seqno(), None); diff --git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index af2df223..568003f6 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -166,6 +166,7 @@ fn tree_shadowing_range() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; @@ -282,6 +283,7 @@ fn tree_shadowing_prefix() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; From f10699513f5a9a20ef059327a41009110a052300 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:09:18 +0200 Subject: [PATCH 412/613] adjust crates keywords --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 43bbfd3f..81ff0eb4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ readme = "README.md" include = ["src/**/*", "LICENSE-APACHE", "LICENSE-MIT", "README.md"] repository = "https://github.com/fjall-rs/lsm-tree" homepage = "https://github.com/fjall-rs/lsm-tree" -keywords = ["database", "lsmt", "lsm", "rocksdb", "leveldb"] +keywords = ["lsmt", "lsm", "rocksdb", "leveldb", "key-value"] categories = ["data-structures", "database-implementations", "algorithms"] [lib] From ec9f2f0ff066de882d96b0cea83b06630e55c1a6 Mon Sep 17 00:00:00 2001 
From: marvin-j97 Date: Thu, 18 Sep 2025 18:09:53 +0200 Subject: [PATCH 413/613] change cache default config --- src/cache.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index 9b2d9d64..0c0d6ede 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -85,10 +85,18 @@ impl Cache { pub fn with_capacity_bytes(bytes: u64) -> Self { use quick_cache::sync::DefaultLifecycle; + // NOTE: Nothing we can do if it fails + #[allow(clippy::expect_used)] + let opts = quick_cache::OptionsBuilder::new() + .weight_capacity(bytes) + .hot_allocation(0.9) + .estimated_items_capacity(1_000_000) + .build() + .expect("cache options should be valid"); + #[allow(clippy::default_trait_access)] - let quick_cache = QuickCache::with( - 100_000, - bytes, + let quick_cache = QuickCache::with_options( + opts, BlockWeighter, Default::default(), DefaultLifecycle::default(), From 9f62926c64220873e396e7e23264c9fa6dd95cd8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:10:06 +0200 Subject: [PATCH 414/613] move import --- src/compaction/worker.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 038d66b7..6ad79abe 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -2,9 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - use super::{CompactionStrategy, Input as CompactionPayload}; use crate::{ compaction::{stream::CompactionStream, Choice}, @@ -22,6 +19,9 @@ use std::{ time::Instant, }; +#[cfg(feature = "metrics")] +use crate::metrics::Metrics; + pub type CompactionReader<'a> = Box<dyn Iterator<Item = crate::Result<InternalValue>> + 'a>; /// Compaction options From b4fa24df5ed374581fabefaa05c13a98bb98c0e7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:11:15 +0200 Subject: [PATCH 415/613] clippy --- src/blob_tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index c9fed6b8..f394a401 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -599,8 +599,8 @@ impl AbstractTree for BlobTree { let indirection = MaybeInlineValue::Indirect { vhandle: ValueHandle { - offset, blob_file_id, + blob_file_id, + offset, on_disk_size, }, size: value_size, From dea55332813ae081d418b874570c6d9537cdeddc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:13:12 +0200 Subject: [PATCH 416/613] fix --- src/tree/ingest.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 8ac2a814..b0aaf9e6 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -79,7 +79,7 @@ impl<'a> Ingestion<'a> { }) .collect::<crate::Result<Vec<_>>>()?; - self.tree.register_segments(&created_segments, 0)?; + self.tree.register_segments(&created_segments, None, 0)?; self.tree.compact(Arc::new(MoveDown(0, 2)), 0)?;
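Pulling the new calling convention together, a hedged sketch of the flush-then-register pattern that the ingestion fix and the earlier test updates both follow (`tree`, `id` and `memtable` are assumed to exist):

    // A flush may yield a blob file next to the segment; both must be
    // registered together so the manifest swap stays atomic.
    let (segment, blob_file) = tree.flush_memtable(id, &memtable, 0)?.unwrap();
    match blob_file {
        Some(bf) => tree.register_segments(&[segment], Some(&[bf]), 0)?,
        None => tree.register_segments(&[segment], None, 0)?,
    }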
From a9a1eb9350658a16d4d148446791f87ac2bcd00d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:15:53 +0200 Subject: [PATCH 417/613] make clippy shut up temporarily --- src/vlog/blob_file/reader.rs | 3 +++ src/vlog/blob_file/writer.rs | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 4cc01927..cab68c62 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -96,6 +96,9 @@ impl Iterator for Reader { let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize)); let val_len = fail_iter!(self.inner.read_u32::<BigEndian>()); + + // TODO: finish compression + #[warn(clippy::match_single_binding)] let val = match &self.compression { _ => { // NOTE: When not using compression, we can skip diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index fcb2f63b..352045b7 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -143,7 +143,8 @@ impl Writer { self.active_writer .write_u32::<BigEndian>(value.len() as u32)?; - // TODO: + // TODO: finish compression + #[warn(clippy::match_single_binding)] let value = match &self.compression { _ => value, }; From f50929cba4b9b9741e8c9a75c54037712d0fb2e2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:26:11 +0200 Subject: [PATCH 418/613] license --- src/vlog/accessor.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index f626c184..2c72b55a 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -1,3 +1,7 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + use crate::{ vlog::{blob_file::writer::BLOB_HEADER_LEN, BlobFileId, ValueHandle}, BlobFile, Cache, DescriptorTable, GlobalSegmentId, Slice, UserValue, }; From a6ded6cd99f7e6a407126753dfae67d258b93327 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:31:26 +0200 Subject: [PATCH 419/613] remove old value log config --- src/blob_tree/mod.rs | 1 + src/vlog/config.rs | 66 ------------------------------------------ src/vlog/mod.rs | 2 -- src/vlog/value_log.rs | 10 +++---- 4 files changed, 6 insertions(+), 73 deletions(-) delete mode 100644 src/vlog/config.rs diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index f394a401..29208d7b 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -553,6 +553,7 @@ impl AbstractTree for BlobTree { u64::MAX, self.index.config.path.join(BLOBS_FOLDER), )?; + // TODO: select compression // let mut blob_writer = self.blobs.get_writer()?.use_target_size(u64::MAX); diff --git a/src/vlog/config.rs b/src/vlog/config.rs deleted file mode 100644 index dab541fb..00000000 --- a/src/vlog/config.rs +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{Cache, CompressionType, DescriptorTable}; -use std::sync::Arc; - -/// Value log configuration -pub struct Config { - /// Target size of vLog blob files - pub(crate) blob_file_size_bytes: u64, - - /// Blob cache to use - pub(crate) blob_cache: Arc<Cache>, - - /// File descriptor cache to use - pub(crate) fd_cache: Arc<DescriptorTable>, - - /// Compression to use - pub(crate) compression: CompressionType, -} - -impl Config { - /// Creates a new configuration builder. - pub fn new(blob_cache: Arc<Cache>, fd_cache: Arc<DescriptorTable>) -> Self { - Self { - blob_cache, - fd_cache, - compression: CompressionType::None, - blob_file_size_bytes: 128 * 1_024 * 1_024, - } - } - - /// Sets the compression & decompression scheme. - #[must_use] - pub fn compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self - } - - /// Sets the blob cache. - /// - /// You can create a global [`Cache`] and share it between multiple - /// value logs to cap global cache memory usage.
- #[must_use] - pub fn blob_cache(mut self, blob_cache: Arc<Cache>) -> Self { - self.blob_cache = blob_cache; - self - } - - /// Sets the maximum size of value log blob files. - /// - /// This influences space amplification, as - /// space reclamation works on a per-file basis. - /// - /// Larger files results in less files on disk and thus less file descriptors that may have to be obtained or cached. - /// - /// Like `blob_file_size` in `RocksDB`. - /// - /// Default = 256 MiB - #[must_use] - pub fn blob_file_size_bytes(mut self, bytes: u64) -> Self { - self.blob_file_size_bytes = bytes; - self - } -} diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 6ae6134f..1ba80ac6 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -3,7 +3,6 @@ // (found in the LICENSE-* files in the repository) pub mod blob_file; -mod config; mod gc; mod handle; mod index; @@ -18,7 +17,6 @@ mod value_log; pub use { accessor::Accessor, blob_file::multi_writer::MultiWriter as BlobFileWriter, - config::Config, gc::report::GcReport, gc::{GcStrategy, SpaceAmpStrategy, StaleThresholdStrategy}, handle::ValueHandle, @@ -29,5 +27,5 @@ pub use { diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs index 775efdaa..d83a2b16 100644 --- a/src/vlog/value_log.rs +++ b/src/vlog/value_log.rs @@ -11,7 +11,7 @@ use crate::{ gc::report::GcReport, index::Writer as IndexWriter, scanner::SizeMap, - BlobFile, BlobFileId, BlobFileWriter, Config, GcStrategy, IndexReader, ValueHandle, + BlobFile, BlobFileId, BlobFileWriter, GcStrategy, IndexReader, ValueHandle, }, Cache, DescriptorTable, @@ -89,7 +89,7 @@ pub struct ValueLogInner { pub path: PathBuf, /// Value log configuration - config: Config, + // config: Config, /// In-memory blob cache // blob_cache: Arc<Cache>, @@ -114,7 +114,7 @@ impl ValueLog { /// Will return `Err` if an IO error occurs. pub fn open<P: Into<PathBuf>>( path: P, // TODO: move path into config? - config: Config, + // config: Config, ) -> crate::Result<Self> { // let path = path.into(); @@ -171,7 +171,7 @@ impl ValueLog { } /// Creates a new empty value log in a directory.
- pub(crate) fn create_new<P: Into<PathBuf>>(path: P, config: Config) -> crate::Result<Self> { + pub(crate) fn create_new<P: Into<PathBuf>>(path: P) -> crate::Result<Self> { let path = path.into(); let path = crate::path::absolute_path(&path); @@ -203,7 +203,7 @@ impl ValueLog { Ok(Self(Arc::new(ValueLogInner { id: get_next_vlog_id(), - config, + // config, path, // blob_cache, // fd_cache, From f45da9bf693bc5c74f94407fb71635777de4422d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:34:53 +0200 Subject: [PATCH 420/613] remove more unnecessary code --- src/cache.rs | 4 ++-- src/vlog/accessor.rs | 4 ++-- src/vlog/mod.rs | 2 +- src/vlog/value_log.rs | 10 ---------- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index 0c0d6ede..1eaf09a3 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -154,7 +154,7 @@ impl Cache { #[doc(hidden)] pub fn insert_blob( &self, - vlog_id: crate::vlog::ValueLogId, + vlog_id: crate::TreeId, vhandle: &crate::vlog::ValueHandle, value: UserValue, ) { @@ -168,7 +168,7 @@ impl Cache { #[must_use] pub fn get_blob( &self, - vlog_id: crate::vlog::ValueLogId, + vlog_id: crate::TreeId, vhandle: &crate::vlog::ValueHandle, ) -> Option<UserValue> { let key: CacheKey = (TAG_BLOB, vlog_id, vhandle.blob_file_id, vhandle.offset).into(); diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index 2c72b55a..5ce2b5a9 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -30,7 +30,7 @@ impl<'a> Accessor<'a> { cache: &Cache, descriptor_table: &DescriptorTable, ) -> crate::Result<Option<UserValue>> { - if let Some(value) = cache.get_blob(0 /* TODO: vlog ID... */, vhandle) { + if let Some(value) = cache.get_blob(0 /* TODO: tree ID... */, vhandle) { return Ok(Some(value)); } @@ -38,7 +38,7 @@ impl<'a> Accessor<'a> { return Ok(None); }; - let bf_id = GlobalSegmentId::from((0 /* TODO: vlog ID */, vhandle.blob_file_id)); + let bf_id = GlobalSegmentId::from((0 /* TODO: tree ID */, vhandle.blob_file_id)); let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { fd diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 1ba80ac6..c3cec95f 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -21,7 +21,7 @@ pub use { gc::{GcStrategy, SpaceAmpStrategy, StaleThresholdStrategy}, handle::ValueHandle, index::{Reader as IndexReader, Writer as IndexWriter}, - value_log::{ValueLog, ValueLogId}, + value_log::ValueLog, }; #[doc(hidden)] diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs index d83a2b16..b583973b 100644 --- a/src/vlog/value_log.rs +++ b/src/vlog/value_log.rs @@ -46,16 +46,6 @@ use std::{ // } // } -/// Unique value log ID -#[allow(clippy::module_name_repetitions)] -pub type ValueLogId = u64; - -/// Hands out a unique (monotonically increasing) value log ID.
-pub fn get_next_vlog_id() -> ValueLogId { - static VLOG_ID_COUNTER: AtomicU64 = AtomicU64::new(0); - VLOG_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) -} - fn unlink_blob_files(base_path: &Path, ids: &[BlobFileId]) { unimplemented!() From b6df8bc6ee63fb929f1d22651abc769cff155467 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 18:35:46 +0200 Subject: [PATCH 421/613] fix --- src/vlog/value_log.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs index b583973b..44863736 100644 --- a/src/vlog/value_log.rs +++ b/src/vlog/value_log.rs @@ -72,9 +72,6 @@ impl std::ops::Deref for ValueLog { #[allow(clippy::module_name_repetitions)] pub struct ValueLogInner { - /// Unique value log ID - id: u64, - /// Base folder pub path: PathBuf, @@ -192,7 +189,6 @@ impl ValueLog { // let manifest = Manifest::create_new(&path)?; Ok(Self(Arc::new(ValueLogInner { - id: get_next_vlog_id(), // config, path, // blob_cache, From 2067025cf5d667ccb2e6b5a49a85ebce15617bd8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 21:17:04 +0200 Subject: [PATCH 422/613] perf: optimize BlobTree::is_empty --- src/blob_tree/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 29208d7b..b2462e10 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -722,6 +722,12 @@ impl AbstractTree for BlobTree { self.index.approximate_len() } + // NOTE: Override the default implementation to not fetch + // data from the value log, so we get much faster key reads + fn is_empty(&self, seqno: SeqNo, index: Option<Arc<Memtable>>) -> crate::Result<bool> { + self.index.is_empty(seqno, index) + } + // NOTE: Override the default implementation to not fetch // data from the value log, so we get much faster key reads fn contains_key<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<bool> {
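A hedged illustration of why the override matters (following the idioms of the tests in this series; `tree` is assumed to be a `BlobTree`): emptiness is decided entirely by the index tree, so no blob file is opened even when every value is separated.

    tree.insert("big", "x".repeat(10_000), 0); // value goes to the value log
    tree.flush_active_memtable(0)?;
    assert!(!tree.is_empty(SeqNo::MAX, None)?); // answered from the index alone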
From 5904111533e425cdd62fd4bbd84783300e144df Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 21:17:10 +0200 Subject: [PATCH 423/613] restore blob tree range ops --- src/blob_tree/mod.rs | 73 +++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index b2462e10..2989ee8d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -15,12 +15,13 @@ use crate::{ segment::Segment, tree::inner::MemtableId, value::InternalValue, - vlog::{BlobFile, BlobFileWriter, ValueHandle, ValueLog}, + vlog::{Accessor, BlobFile, BlobFileId, BlobFileWriter, ValueHandle, ValueLog}, Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; use gc::{reader::GcReader, writer::GcWriter}; use index::IndexTree; use std::{ + collections::BTreeMap, io::Cursor, ops::{RangeBounds, RangeFull}, path::PathBuf, @@ -31,17 +32,21 @@ use std::{ }; use value::MaybeInlineValue; -pub struct Guard<'a>(&'a ValueLog, crate::Result<(UserKey, UserValue)>); +pub struct Guard<'a>( + &'a BlobTree, + Arc<BTreeMap<BlobFileId, BlobFile>>, + crate::Result<(UserKey, UserValue)>, +); impl IterGuard for Guard<'_> { fn key(self) -> crate::Result<UserKey> { - self.1.map(|(k, _)| k) + self.2.map(|(k, _)| k) } fn size(self) -> crate::Result<u32> { use MaybeInlineValue::{Indirect, Inline}; - let value = self.1?.1; + let value = self.2?.1; let mut cursor = Cursor::new(value); Ok(match MaybeInlineValue::decode_from(&mut cursor)? { @@ -55,11 +60,15 @@ impl IterGuard for Guard<'_> { } fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { - resolve_value_handle(self.0, self.1) + resolve_value_handle(self.0, &self.1, self.2) } } -fn resolve_value_handle(vlog: &crate::vlog::ValueLog, item: RangeItem) -> RangeItem { +fn resolve_value_handle( + tree: &BlobTree, + vlog: &BTreeMap<BlobFileId, BlobFile>, + item: RangeItem, +) -> RangeItem { use MaybeInlineValue::{Indirect, Inline}; match item { @@ -70,7 +79,13 @@ fn resolve_value_handle(vlog: &crate::vlog::ValueLog, item: RangeItem) -> RangeI Inline(bytes) => Ok((key, bytes)), Indirect { vhandle, .. } => { // Resolve indirection using value log - match vlog.get(&vhandle) { + match Accessor::new(vlog).get( + &tree.blobs_folder, + &key, + &vhandle, + &tree.index.config.cache, + &tree.index.config.descriptor_table, + ) { Ok(Some(bytes)) => Ok((key, bytes)), Err(e) => Err(e), _ => { @@ -368,14 +383,21 @@ impl AbstractTree for BlobTree { seqno: SeqNo, index: Option<Arc<Memtable>>, ) -> Box<dyn Iterator<Item = IterGuardImpl<'_>> + '_> { - todo!() + let version = self + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .clone(); - // Box::new( - // self.index - // .0 - // .create_prefix(&prefix, seqno, index) - // .map(move |kv| IterGuardImpl::Blob(Guard(&self.blobs, kv))), - // ) + // TODO: PERF: ugly Arc clone + Box::new( + self.index + .0 + .create_prefix(&prefix, seqno, index) + .map(move |kv| IterGuardImpl::Blob(Guard(self, version.value_log.clone(), kv))), + ) } fn range<K: AsRef<[u8]>, R: RangeBounds<K>>( &self, range: R, seqno: SeqNo, index: Option<Arc<Memtable>>, ) -> Box<dyn Iterator<Item = IterGuardImpl<'_>> + '_> { let version = self + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .clone(); + + // TODO: PERF: ugly Arc clone + Box::new( + self.index + .0 + .create_range(&range, seqno, index) + .map(move |kv| IterGuardImpl::Blob(Guard(self, version.value_log.clone(), kv))), + ) } fn tombstone_count(&self) -> u64 { From af1ec2f701660d1fc4b3a3dbcb7f7d0a041b8772 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 21:17:26 +0200 Subject: [PATCH 424/613] restore more tests --- tests/blob_tree_reload_blob.rs | 2 -- tests/tree_approx_len.rs | 2 -- tests/tree_bulk_ingest.rs | 24 ++++++++++++------------ tests/tree_count.rs | 1 - tests/tree_l0_range.rs | 5 ++--- tests/tree_shadowing.rs | 2 -- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 8f6daaac..54c3744e 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -4,7 +4,6 @@ use test_log::test; const ITEM_COUNT: usize = 10_000; #[test] -#[ignore] fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -62,7 +61,6 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn blob_tree_reload() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_approx_len.rs b/tests/tree_approx_len.rs index 64c87b04..0c0dd2df 100644 --- a/tests/tree_approx_len.rs +++ b/tests/tree_approx_len.rs @@ -34,7 +34,6 @@ fn tree_approx_len_sealed() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn tree_approx_len_sealed_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; @@ -129,7 +128,6 @@ fn tree_approx_len() -> lsm_tree::Result<()> { } #[test] -#[ignore = "restore"] fn tree_approx_len_blob() -> 
lsm_tree::Result<()> { let folder = tempdir()?; diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index 635379a6..8d9da9a1 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -19,14 +19,14 @@ fn tree_bulk_ingest() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( tree.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); Ok(()) @@ -47,14 +47,14 @@ fn tree_copy() -> lsm_tree::Result<()> { assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( src.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); assert!(src.lock_active_memtable().is_empty()); @@ -70,14 +70,14 @@ fn tree_copy() -> lsm_tree::Result<()> { assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( dest.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); assert!(dest.lock_active_memtable().is_empty()); @@ -102,14 +102,14 @@ fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( tree.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!(1, tree.blob_file_count()); @@ -133,14 +133,14 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( src.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); assert!(src.lock_active_memtable().is_empty()); assert_eq!(1, src.blob_file_count()); @@ -159,14 +159,14 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT + ITEM_COUNT, ); assert_eq!( dest.iter(SeqNo::MAX, None) .rev() .flat_map(|x| x.key()) .count(), - ITEM_COUNT + ITEM_COUNT, ); assert!(dest.lock_active_memtable().is_empty()); assert_eq!(1, dest.blob_file_count()); diff --git a/tests/tree_count.rs b/tests/tree_count.rs index 5f52069d..d40624ab 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -62,7 +62,6 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn tree_flushed_count_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_l0_range.rs b/tests/tree_l0_range.rs index 2f9b4d8f..85e6a268 100644 --- a/tests/tree_l0_range.rs +++ b/tests/tree_l0_range.rs @@ -2,7 +2,6 @@ use lsm_tree::{AbstractTree, Guard, SeqNo}; use test_log::test; #[test] -#[ignore] fn tree_l0_range_blob() -> lsm_tree::Result<()> { let folder: tempfile::TempDir = tempfile::tempdir()?; let path = folder.path(); @@ -29,7 +28,7 @@ fn tree_l0_range_blob() -> lsm_tree::Result<()> { tree.insert("f", "f", 3); tree.flush_active_memtable(0)?; - tree.insert("g", "g", 3); + tree.insert("g", "g".repeat(10_000), 3); tree.flush_active_memtable(0)?; let mut range = tree.range("c"..="e", SeqNo::MAX, None); @@ -39,7 +38,7 @@ fn tree_l0_range_blob() -> 
lsm_tree::Result<()> { assert!(range.next().is_none()); let mut range = tree.range("f"..="g", SeqNo::MAX, None).rev(); - assert_eq!(b"g", &*range.next().unwrap().value()?); + assert_eq!(b"g".repeat(10_000), &*range.next().unwrap().value()?); assert_eq!(b"f", &*range.next().unwrap().value()?); assert!(range.next().is_none()); diff --git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index 568003f6..af2df223 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -166,7 +166,6 @@ fn tree_shadowing_range() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; @@ -283,7 +282,6 @@ fn tree_shadowing_prefix() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { const ITEM_COUNT: usize = 10_000; From 17d5469430931ad14318f4eba2c5cae4b89a840b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 18 Sep 2025 21:17:40 +0200 Subject: [PATCH 425/613] restore FIFO test --- src/compaction/worker.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 6ad79abe..576d85a5 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -546,7 +546,6 @@ mod tests { use test_log::test; #[test] - #[ignore] fn compaction_drop_segments() -> crate::Result<()> { let folder = tempfile::tempdir()?; @@ -554,9 +553,9 @@ mod tests { tree.insert("a", "a", 0); tree.flush_active_memtable(0)?; - tree.insert("a", "a", 1); + tree.insert("b", "a", 1); tree.flush_active_memtable(0)?; - tree.insert("a", "a", 2); + tree.insert("c", "a", 2); tree.flush_active_memtable(0)?; assert_eq!(3, tree.approximate_len()); From 4f6d199c2c16432145b56b580c9da12d5cfc0b7f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:04:56 +0200 Subject: [PATCH 426/613] add back some methods to double_ended_peekable --- src/double_ended_peekable.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/double_ended_peekable.rs b/src/double_ended_peekable.rs index 3532de5d..120cc128 100644 --- a/src/double_ended_peekable.rs +++ b/src/double_ended_peekable.rs @@ -68,6 +68,23 @@ where .as_ref() .or_else(|| self.back.peeked_value_ref()) } + + /// Consumes and returns the next value of this iterator if a condition is true. + /// + /// See [`Peekable::next_if`] for more information. 
+ /// + /// [`Peekable::next_if`]: core::iter::Peekable::next_if + #[inline] + pub fn next_if(&mut self, func: impl FnOnce(&I::Item) -> bool) -> Option<I::Item> { + match self.next() { + Some(item) if func(&item) => Some(item), + other => { + debug_assert!(self.front.is_unpeeked()); + self.front = MaybePeeked::Peeked(other); + None + } + } + } } impl<I> DoubleEndedPeekable<I> where @@ -173,4 +190,8 @@ impl<T> MaybePeeked<T> { Self::Peeked(Some(peeked)) => Some(peeked), } } + + const fn is_unpeeked(&self) -> bool { + matches!(self, MaybePeeked::Unpeeked) + } }
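A hedged usage sketch of the restored `next_if`, assuming the module's `double_ended_peekable()` extension (used elsewhere in this series, e.g. in `MvccStream::new`) is in scope: the next item is consumed only when the predicate holds, otherwise it is stashed back as the peeked front item, mirroring `core::iter::Peekable::next_if`:

    let mut it = [1, 2, 10].into_iter().double_ended_peekable();
    assert_eq!(it.next_if(|&x| x < 5), Some(1));
    assert_eq!(it.next_if(|&x| x < 5), Some(2));
    assert_eq!(it.next_if(|&x| x < 5), None); // 10 fails the predicate...
    assert_eq!(it.next(), Some(10));          // ...and is yielded unchanged afterwards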
From 5a83d00b356b09b67d9b42cdb6dd4428e7163c2b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:05:13 +0200 Subject: [PATCH 427/613] refactor: compaction stream --- src/compaction/stream.rs | 47 ++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index e8edecff..47b43651 100644 --- a/src/compaction/stream.rs +++ b/src/compaction/stream.rs @@ -5,16 +5,21 @@ use crate::{InternalValue, SeqNo, UserKey, ValueType}; use std::iter::Peekable; +type Item = crate::Result<InternalValue>; + /// Consumes a stream of KVs and emits a new stream according to GC and tombstone rules /// /// This iterator is used during flushing & compaction. #[allow(clippy::module_name_repetitions)] -pub struct CompactionStream<I: Iterator<Item = crate::Result<InternalValue>>> { +pub struct CompactionStream<I: Iterator<Item = Item>> { + /// KV stream inner: Peekable<I>, + + /// MVCC watermark to get rid of old versions gc_seqno_threshold: SeqNo, } -impl<I: Iterator<Item = crate::Result<InternalValue>>> CompactionStream<I> { +impl<I: Iterator<Item = Item>> CompactionStream<I> { /// Initializes a new merge iterator #[must_use] pub fn new(iter: I, gc_seqno_threshold: SeqNo) -> Self { @@ -26,36 +31,26 @@ impl<I: Iterator<Item = crate::Result<InternalValue>>> CompactionStream<I> { } } - fn drain_key_min(&mut self, key: &UserKey) -> crate::Result<()> { + /// Drains the remaining versions of the given key. + fn drain_key(&mut self, key: &UserKey) -> crate::Result<()> { loop { - let Some(next) = self.inner.peek() else { + let Some(next) = self.inner.next_if(|kv| { + if let Ok(kv) = kv { + kv.key.user_key == key + } else { + true + } + }) else { return Ok(()); }; - let Ok(next) = next else { - // NOTE: We just asserted, the peeked value is an error - #[allow(clippy::expect_used)] - return Err(self - .inner - .next() - .expect("should exist") - .expect_err("should be error")); - }; - - // Consume version - if next.key.user_key == key { - // NOTE: We know the next value is not empty, because we just peeked it - #[allow(clippy::expect_used)] - self.inner.next().expect("should not be empty")?; - } else { - return Ok(()); - } + next?; } } } -impl<I: Iterator<Item = crate::Result<InternalValue>>> Iterator for CompactionStream<I> { - type Item = crate::Result<InternalValue>; +impl<I: Iterator<Item = Item>> Iterator for CompactionStream<I> { + type Item = Item; fn next(&mut self) -> Option<Self::Item> { loop { @@ -82,7 +77,7 @@ impl<I: Iterator<Item = Item>> Iterator for CompactionStream<I> { // NOTE: Next item is expired, // so the tail of this user key is entirely expired, so drain it all - fail_iter!(self.drain_key_min(&head.key.user_key)); + fail_iter!(self.drain_key(&head.key.user_key)); if drop_weak_tombstone { continue; @@ -90,7 +85,7 @@ impl<I: Iterator<Item = Item>> Iterator for CompactionStream<I> { } } - // NOTE: Convert sequence number to zero if it is below the snapshot watermark + // NOTE: Convert sequence number to zero if it is below the snapshot watermark. // // This can save a lot of space, because "0" only takes 1 byte. if head.key.seqno < self.gc_seqno_threshold { From b43bd9dee98803397f2ba25772e10e17bb643af9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:05:56 +0200 Subject: [PATCH 428/613] refactor: mvcc stream --- src/mvcc_stream.rs | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/mvcc_stream.rs b/src/mvcc_stream.rs index b622fa53..549b7295 100644 --- a/src/mvcc_stream.rs +++ b/src/mvcc_stream.rs @@ -17,34 +17,24 @@ impl<I: Iterator<Item = crate::Result<InternalValue>>> MvccStream<I> /// Initializes a new merge iterator #[must_use] pub fn new(iter: I) -> Self { - let iter = iter.double_ended_peekable(); - Self { inner: iter } + Self { + inner: iter.double_ended_peekable(), + } } fn drain_key_min(&mut self, key: &UserKey) -> crate::Result<()> { loop { - let Some(next) = self.inner.peek() else { + let Some(next) = self.inner.next_if(|kv| { + if let Ok(kv) = kv { + kv.key.user_key == key + } else { + true + } + }) else { return Ok(()); }; - let Ok(next) = next else { - // NOTE: We just asserted, the peeked value is an error - #[allow(clippy::expect_used)] - return Err(self - .inner - .next() - .expect("should exist") - .expect_err("should be error")); - }; - - // Consume version - if next.key.user_key == key { - // NOTE: We know the next value is not empty, because we just peeked it - #[allow(clippy::expect_used)] - self.inner.next().expect("should not be empty")?; - } else { - return Ok(()); - } + next?; } } } From c1c10f39536f4a6612d6f28d024608fc44cdb5b2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:06:10 +0200 Subject: [PATCH 429/613] remove method --- src/level_manifest/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index cf9a6567..19c15aff 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -374,10 +374,6 @@ impl LevelManifest { }) } - pub(crate) fn get_segment(&self, id: SegmentId) -> Option<&Segment> { - self.current.iter_segments().find(|x| x.metadata.id == id) - } - #[must_use] pub fn as_slice(&self) -> &[Level] { &self.current.levels
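Both refactors share the same drain semantics: given several versions of one key (newest first), everything after the first emitted version is skipped. A self-contained sketch using std's `Peekable::next_if`, with made-up (key, seqno) pairs:

    // Versions arrive newest-first. After yielding ("a", 3), the remaining
    // "a" versions are drained; "b" is untouched.
    let versions = vec![("a", 3), ("a", 2), ("a", 1), ("b", 7)];
    let mut out = Vec::new();
    let mut it = versions.into_iter().peekable();
    while let Some((key, seqno)) = it.next() {
        out.push((key, seqno));
        while it.next_if(|(k, _)| *k == key).is_some() {} // drain_key
    }
    assert_eq!(out, vec![("a", 3), ("b", 7)]);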
From bf96fc9113bf53a31f5de2239714025c5e1e0366 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:06:43 +0200 Subject: [PATCH 430/613] refactor: pread into util function --- src/file.rs | 47 +++++++++++++++++++++++++++++++++++++++- src/segment/block/mod.rs | 45 ++------------------------------------ 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/src/file.rs b/src/file.rs index 38abfe37..68ef1cf0 100644 --- a/src/file.rs +++ b/src/file.rs @@ -2,7 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use std::{io::Write, path::Path}; +use crate::Slice; +use std::{fs::File, io::Write, path::Path}; pub const MAGIC_BYTES: [u8; 4] = [b'L', b'S', b'M', 3]; pub const MANIFEST_FILE: &str = "manifest"; pub const SEGMENTS_FOLDER: &str = "segments"; pub const BLOBS_FOLDER: &str = "blobs"; +/// Reads bytes from a file using `pread`. +pub fn read_exact(file: &File, offset: u64, size: usize) -> std::io::Result<Slice> { + #[warn(unsafe_code)] + let mut builder = unsafe { Slice::builder_unzeroed(size) }; + + { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + + let bytes_read = file.read_at(&mut builder, offset)?; + + assert_eq!( + bytes_read, + size, + "not enough bytes read: file has length {}", + file.metadata()?.len(), + ); + } + + #[cfg(windows)] + { + use std::os::windows::fs::FileExt; + + let bytes_read = file.seek_read(&mut builder, offset)?; + + assert_eq!( + bytes_read, + size, + "not enough bytes read: file has length {}", + file.metadata()?.len(), + ); + } + + #[cfg(not(any(unix, windows)))] + { + compile_error!("unsupported OS"); + unimplemented!(); + } + } + + Ok(builder.freeze().into()) +} + /// Atomically rewrites a file. pub fn rewrite_atomic(path: &Path, content: &[u8]) -> std::io::Result<()> { // NOTE: Nothing we can do diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index dfb34bfb..3802fec8 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -127,46 +127,8 @@ impl Block { handle: BlockHandle, compression: CompressionType, ) -> crate::Result<Self> { - #[warn(unsafe_code)] - let mut builder = unsafe { Slice::builder_unzeroed(handle.size() as usize) }; + let buf = crate::file::read_exact(file, *handle.offset(), handle.size() as usize)?; - { - #[cfg(unix)] - { - use std::os::unix::fs::FileExt; - - let bytes_read = file.read_at(&mut builder, *handle.offset())?; - - assert_eq!( - bytes_read, - handle.size() as usize, - "not enough bytes read: file has length {}", - file.metadata()?.len(), - ); - } - - #[cfg(windows)] - { - use std::os::windows::fs::FileExt; - - let bytes_read = file.seek_read(&mut builder, *handle.offset())?; - - assert_eq!( - bytes_read, - handle.size() as usize, - "not enough bytes read: file has length {}", - file.metadata()?.len(), - ); - } - - #[cfg(not(any(unix, windows)))] - { - compile_error!("unsupported OS"); - unimplemented!(); - } - } - - let buf = builder.freeze(); let header = Header::decode_from(&mut &buf[..])?; let buf = match compression { @@ -206,10 +168,7 @@ impl Block { return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); } - Ok(Self { - header, - data: Slice::from(buf), - }) + Ok(Self { header, data: buf }) } } From 7586e2e88bc98020da5f0af9cd71af3239341193 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:07:27 +0200 Subject: [PATCH 431/613] Version::get_segment --- src/compaction/worker.rs | 4 ++-- src/version/mod.rs | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 576d85a5..04fc397e 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -210,7 +210,7 @@ fn merge_segments( let Some(segments) = payload .segment_ids .iter() - .map(|&id| levels.get_segment(id).cloned()) + .map(|&id| levels.current_version().get_segment(id).cloned()) .collect::<Option<Vec<_>>>() else { log::warn!( @@ -503,7 +503,7 @@ fn drop_segments( let Some(segments) = ids_to_drop .iter() - .map(|&id| levels.get_segment(id).cloned()) + .map(|&id| levels.current_version().get_segment(id).cloned()) .collect::<Option<Vec<_>>>() else { log::warn!( diff --git a/src/version/mod.rs b/src/version/mod.rs index fdf48342..497a237f 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -237,6 +237,10 @@ impl Version { .flat_map(|x| x.iter()) } + pub(crate) fn get_segment(&self, id: SegmentId) -> Option<&Segment> { + self.iter_segments().find(|x| x.metadata.id == id) + } + ///
Gets the n-th level. pub fn level(&self, n: usize) -> Option<&Level> { self.levels.get(n) From 7586e2e88bc98020da5f0af9cd71af3239341193 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:08:11 +0200 Subject: [PATCH 432/613] more metrics --- src/metrics.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/metrics.rs b/src/metrics.rs index c154e6a0..9f6430a7 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -59,6 +59,16 @@ impl Metrics { + self.filter_block_load_io.load(Relaxed) } + /// Number of index blocks that were loaded from disk or OS page cache. + pub fn index_block_loads_cached(&self) -> usize { + self.index_block_load_cached.load(Relaxed) + } + + /// Number of filter blocks that were loaded from disk or OS page cache. + pub fn filter_block_loads_cached(&self) -> usize { + self.filter_block_load_cached.load(Relaxed) + } + /// Number of blocks that were loaded from disk or OS page cache. pub fn block_loads_cached(&self) -> usize { self.data_block_load_cached.load(Relaxed) @@ -71,11 +81,40 @@ impl Metrics { self.block_loads_io() + self.block_loads_cached() } + /// Filter block cache efficiency in percent (0.0 - 1.0). + pub fn filter_block_cache_hit_rate(&self) -> f64 { + let queries = self.filter_block_loads() as f64; + let hits = self.filter_block_loads_cached() as f64; + + if queries == 0.0 { + 1.0 + } else { + hits / queries + } + } + + /// Index block cache efficiency in percent (0.0 - 1.0). + pub fn index_block_cache_hit_rate(&self) -> f64 { + let queries = self.index_block_loads() as f64; + let hits = self.index_block_loads_cached() as f64; + + if queries == 0.0 { + 1.0 + } else { + hits / queries + } + } + /// Block cache efficiency in percent (0.0 - 1.0). pub fn block_cache_hit_rate(&self) -> f64 { let queries = self.block_loads() as f64; let hits = self.block_loads_cached() as f64; - hits / queries + + if queries == 0.0 { + 1.0 + } else { + hits / queries + } } /// Filter efficiency in percent (0.0 - 1.0). @@ -84,7 +123,12 @@ impl Metrics { pub fn filter_efficiency(&self) -> f64 { let queries = self.filter_queries.load(Relaxed) as f64; let io_skipped = self.io_skipped_by_filter.load(Relaxed) as f64; - io_skipped / queries + + if queries == 0.0 { + 1.0 + } else { + io_skipped / queries + } } /// Number of filter queries performed. From c3b3c67b4e1e7b1549fd0c89ef97dd0091e4417a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 22 Sep 2025 20:08:28 +0200 Subject: [PATCH 433/613] export slice builder --- src/slice/mod.rs | 4 ++-- src/slice/slice_bytes/mod.rs | 4 +++- src/slice/slice_default/mod.rs | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/slice/mod.rs b/src/slice/mod.rs index 1f48e2ef..e76957f7 100644 --- a/src/slice/mod.rs +++ b/src/slice/mod.rs @@ -16,10 +16,10 @@ use std::{ }; #[cfg(not(feature = "bytes_1"))] -pub use slice_default::Slice; +pub use slice_default::{Builder, Slice}; #[cfg(feature = "bytes_1")] -pub use slice_bytes::Slice; +pub use slice_bytes::{Builder, Slice}; impl AsRef<[u8]> for Slice { fn as_ref(&self) -> &[u8] { diff --git a/src/slice/slice_bytes/mod.rs b/src/slice/slice_bytes/mod.rs index 35609d19..74989a7d 100644 --- a/src/slice/slice_bytes/mod.rs +++ b/src/slice/slice_bytes/mod.rs @@ -4,6 +4,8 @@ use bytes::{Bytes, BytesMut}; +pub use BytesMut as Builder; + /// An immutable byte slice that can be cloned without additional heap allocation /// /// There is no guarantee of any sort of alignment for zero-copy (de)serialization. 
@@ -23,7 +25,7 @@ impl Slice { Self(Bytes::from_static(&[])) } - pub(crate) unsafe fn builder_unzeroed(len: usize) -> BytesMut { + pub(crate) unsafe fn builder_unzeroed(len: usize) -> Builder { // Use `with_capacity` & `set_len`` to avoid zeroing the buffer let mut builder = BytesMut::with_capacity(len); diff --git a/src/slice/slice_default/mod.rs b/src/slice/slice_default/mod.rs index 3e05ede6..e675ecda 100644 --- a/src/slice/slice_default/mod.rs +++ b/src/slice/slice_default/mod.rs @@ -4,6 +4,8 @@ use byteview::ByteView; +pub use byteview::Builder; + /// An immutable byte slice that can be cloned without additional heap allocation /// /// There is no guarantee of any sort of alignment for zero-copy (de)serialization. @@ -21,7 +23,7 @@ impl Slice { Self(ByteView::new(&[])) } - pub(crate) unsafe fn builder_unzeroed(len: usize) -> byteview::Builder { + pub(crate) unsafe fn builder_unzeroed(len: usize) -> Builder { ByteView::builder_unzeroed(len) } From 7e2db973b1dd990cfc73229693b071817cf6b294 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 01:44:20 +0200 Subject: [PATCH 434/613] overhaul leveled compaction, closes #125 --- src/compaction/leveled.rs | 81 ++++++++++++++++++++++++++++++++------ src/compaction/major.rs | 1 + src/compaction/mod.rs | 3 ++ src/compaction/movedown.rs | 1 + src/compaction/worker.rs | 4 +- 5 files changed, 75 insertions(+), 15 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 02713eaa..22225f58 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -167,10 +167,13 @@ impl Strategy { /// L3 = `level_base_size * ratio * ratio` /// /// ... - fn level_target_size(&self, level_idx: u8) -> u64 { - assert!(level_idx >= 1, "level_target_size does not apply to L0"); + fn level_target_size(&self, canonical_level_idx: u8) -> u64 { + assert!( + canonical_level_idx >= 1, + "level_target_size does not apply to L0", + ); - let power = (self.level_ratio as usize).pow(u32::from(level_idx) - 1) as u64; + let power = (self.level_ratio as usize).pow(u32::from(canonical_level_idx) - 1) as u64; power * self.level_base_size() } @@ -189,8 +192,51 @@ impl CompactionStrategy for Strategy { fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { assert!(levels.as_slice().len() == 7, "should have exactly 7 levels"); + // Find the level that corresponds to L1 + #[allow(clippy::map_unwrap_or)] + let mut canonical_l1_idx = levels + .as_slice() + .iter() + .enumerate() + .skip(1) + .find(|(_, lvl)| !lvl.is_empty()) + .map(|(idx, _)| idx) + .unwrap_or_else(|| usize::from(levels.last_level_index())); + + // Number of levels we have to shift to get from the actual level idx to the canonical + let mut level_shift = canonical_l1_idx - 1; + + if canonical_l1_idx > 1 && levels.as_slice().iter().skip(1).any(|lvl| !lvl.is_empty()) { + let need_new_l1 = levels + .as_slice() + .iter() + .enumerate() + .skip(1) + .filter(|(_, lvl)| !lvl.is_empty()) + .all(|(idx, level)| { + let level_size = level + .iter() + .flat_map(|x| x.iter()) + // NOTE: Take bytes that are already being compacted into account, + // otherwise we may be overcompensating + .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .map(Segment::file_size) + .sum::(); + + let target_size = self.level_target_size((idx - level_shift) as u8); + + level_size > target_size + }); + + // Move up L1 one level if all current levels are at capacity + if need_new_l1 { + canonical_l1_idx -= 1; + level_shift -= 1; + } + } + // Scoring - let mut scores = [(0.0, 0u64); 7]; + let 
mut scores = [(/* score */ 0.0, /* overshoot */ 0u64); 7]; { // Score first level @@ -198,12 +244,19 @@ impl CompactionStrategy for Strategy { // NOTE: We always have at least one level #[allow(clippy::expect_used)] let first_level = levels.as_slice().first().expect("first level should exist"); - if first_level.len() >= usize::from(self.l0_threshold) { - scores[0] = ((first_level.len() as f64) / f64::from(self.l0_threshold), 0); + + // TODO: use run_count instead? but be careful because of version free list GC thingy + if first_level.segment_count() >= usize::from(self.l0_threshold) { + let ratio = (first_level.segment_count() as f64) / f64::from(self.l0_threshold); + scores[0] = (ratio, 0); } // Score L1+ for (idx, level) in levels.as_slice().iter().enumerate().skip(1) { + if level.is_empty() { + continue; + } + let level_size = level .iter() .flat_map(|x| x.iter()) @@ -213,7 +266,7 @@ impl CompactionStrategy for Strategy { .map(Segment::file_size) .sum::(); - let target_size = self.level_target_size(idx as u8); + let target_size = self.level_target_size((idx - level_shift) as u8); // NOTE: We check for level length above #[allow(clippy::indexing_slicing)] @@ -264,11 +317,11 @@ impl CompactionStrategy for Strategy { return Choice::DoNothing; }; - if levels.level_is_busy(0) || levels.level_is_busy(1) { + if levels.level_is_busy(0) || levels.level_is_busy(canonical_l1_idx) { return Choice::DoNothing; } - let Some(next_level) = &levels.current_version().level(1) else { + let Some(target_level) = &levels.current_version().level(canonical_l1_idx) else { return Choice::DoNothing; }; @@ -277,17 +330,18 @@ impl CompactionStrategy for Strategy { let key_range = first_level.aggregate_key_range(); // Get overlapping segments in next level - let next_level_overlapping_segment_ids: Vec<_> = next_level + let target_level_overlapping_segment_ids: Vec<_> = target_level .iter() .flat_map(|run| run.get_overlapping(&key_range)) .map(Segment::id) .collect(); - segment_ids.extend(&next_level_overlapping_segment_ids); + segment_ids.extend(&target_level_overlapping_segment_ids); let choice = CompactionInput { segment_ids, - dest_level: 1, + dest_level: canonical_l1_idx as u8, + canonical_level: 1, target_size: u64::from(self.target_size), }; @@ -297,7 +351,7 @@ impl CompactionStrategy for Strategy { choice.segment_ids, ); */ - if next_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { + if target_level_overlapping_segment_ids.is_empty() && first_level.is_disjoint() { return Choice::Move(choice); } return Choice::Merge(choice); @@ -335,6 +389,7 @@ impl CompactionStrategy for Strategy { let choice = CompactionInput { segment_ids, dest_level: next_level_index, + canonical_level: next_level_index - (level_shift as u8), target_size: u64::from(self.target_size), }; diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 0179eca2..6cb79e14 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -53,6 +53,7 @@ impl CompactionStrategy for Strategy { Choice::Merge(CompactionInput { segment_ids, dest_level: levels.last_level_index(), + canonical_level: levels.last_level_index(), target_size: self.target_size, }) } diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 77cf1f69..03283260 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -42,6 +42,9 @@ pub struct Input { /// Level to put the created segments into pub dest_level: u8, + /// The logical level the segments are part of + pub canonical_level: u8, + /// Segment target size /// /// If a 
segment compaction reaches the level, a new segment is started. diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index 6caeac91..5a78b68c 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -32,6 +32,7 @@ impl CompactionStrategy for Strategy { Choice::Move(Input { segment_ids, dest_level: self.1, + canonical_level: self.1, target_size: u64::MAX, }) } diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 04fc397e..79c412c7 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -370,8 +370,8 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.dest_level <= 1, // TODO: look at configuration - payload.dest_level <= 2, // TODO: look at configuration + payload.canonical_level <= 1, // TODO: look at configuration + payload.canonical_level <= 2, // TODO: look at configuration #[cfg(feature = "metrics")] opts.metrics.clone(), ) From 32800e685ec64cdb36e970e0208feeb02d281c45 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 01:46:30 +0200 Subject: [PATCH 435/613] fix --- src/segment/block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 3802fec8..47998152 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -148,7 +148,7 @@ impl Block { lz4_flex::decompress_into(raw_data, &mut builder) .map_err(|_| crate::Error::Decompress(compression))?; - builder.freeze() + builder.freeze().into() } }; From ee01d2fa157b3aebf799efe198b17206fe25eab0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 01:49:16 +0200 Subject: [PATCH 436/613] fix --- src/slice/slice_bytes/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/slice/slice_bytes/mod.rs b/src/slice/slice_bytes/mod.rs index 74989a7d..afe9ea6b 100644 --- a/src/slice/slice_bytes/mod.rs +++ b/src/slice/slice_bytes/mod.rs @@ -2,9 +2,9 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; -pub use BytesMut as Builder; +pub use bytes::BytesMut as Builder; /// An immutable byte slice that can be cloned without additional heap allocation /// @@ -27,7 +27,7 @@ impl Slice { pub(crate) unsafe fn builder_unzeroed(len: usize) -> Builder { // Use `with_capacity` & `set_len`` to avoid zeroing the buffer - let mut builder = BytesMut::with_capacity(len); + let mut builder = Builder::with_capacity(len); // SAFETY: we just allocated `len` bytes, and `read_exact` will fail if // it doesn't fill the buffer, subsequently dropping the uninitialized From 5018f1838294c117783086fc077382f7e884eaa0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:01:58 +0200 Subject: [PATCH 437/613] hidden metrics getter --- src/abstract.rs | 4 ++++ src/blob_tree/mod.rs | 5 +++++ src/tree/mod.rs | 5 +++++ 3 files changed, 14 insertions(+) diff --git a/src/abstract.rs b/src/abstract.rs index 8ddf930e..87228dd4 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -94,6 +94,10 @@ pub trait AbstractTree { /// Gets the length of the version free list. fn version_free_list_len(&self) -> usize; + /// Returns the metrics structure. + #[cfg(feature = "metrics")] + fn metrics(&self) -> &Arc; + // TODO:? 
    /* #[doc(hidden)]
    fn verify(&self) -> crate::Result<usize>; */
 
diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index 2989ee8d..5802790e 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -373,6 +373,11 @@ impl BlobTree {
 }
 
 impl AbstractTree for BlobTree {
+    #[cfg(feature = "metrics")]
+    fn metrics(&self) -> &Arc<Metrics> {
+        self.index.metrics()
+    }
+
     fn version_free_list_len(&self) -> usize {
         self.index.version_free_list_len()
     }
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index da828554..e6bbd70e 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -70,6 +70,11 @@ impl std::ops::Deref for Tree {
 }
 
 impl AbstractTree for Tree {
+    #[cfg(feature = "metrics")]
+    fn metrics(&self) -> &Arc<Metrics> {
+        &self.0.metrics
+    }
+
     fn version_free_list_len(&self) -> usize {
         self.manifest
             .read()
From a6a62f2e5c95fbee46512c3e9396ffdad32b0538 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 24 Sep 2025 19:02:44 +0200
Subject: [PATCH 438/613] change filter size getters naming

---
 src/abstract.rs      |  9 +++++++--
 src/blob_tree/mod.rs |  8 ++++++--
 src/tree/mod.rs      | 14 ++++++++++++--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/abstract.rs b/src/abstract.rs
index 87228dd4..73abf04d 100644
--- a/src/abstract.rs
+++ b/src/abstract.rs
@@ -85,8 +85,13 @@ pub trait AbstractTree {
     /// Will return `Err` if an IO error occurs.
     fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()>;
 
-    /// Gets the memory usage of all pinned bloom filters in the tree.
-    fn pinned_bloom_filter_size(&self) -> usize;
+    /// Gets the space usage of all filters in the tree.
+    ///
+    /// May not correspond to the actual memory size because filter blocks may be paged out.
+    fn filter_size(&self) -> usize;
+
+    /// Gets the memory usage of all pinned filters in the tree.
+    fn pinned_filter_size(&self) -> usize;
 
    /// Gets the memory usage of all pinned index blocks in the tree.
fn pinned_block_index_size(&self) -> usize; diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 5802790e..c17e81fe 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -539,8 +539,12 @@ impl AbstractTree for BlobTree { })) } - fn pinned_bloom_filter_size(&self) -> usize { - self.index.pinned_bloom_filter_size() + fn filter_size(&self) -> usize { + self.index.filter_size() + } + + fn pinned_filter_size(&self) -> usize { + self.index.pinned_filter_size() } fn pinned_block_index_size(&self) -> usize { diff --git a/src/tree/mod.rs b/src/tree/mod.rs index e6bbd70e..5854369f 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -202,13 +202,23 @@ impl AbstractTree for Tree { Ok(self.get(key, seqno)?.map(|x| x.len() as u32)) } - fn pinned_bloom_filter_size(&self) -> usize { + fn filter_size(&self) -> usize { self.manifest .read() .expect("lock is poisoned") .current_version() .iter_segments() - .map(Segment::pinned_bloom_filter_size) + .map(Segment::filter_size) + .sum() + } + + fn pinned_filter_size(&self) -> usize { + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(Segment::pinned_filter_size) .sum() } From 964f07d951baa22aa7b7cc7a074f9008cb494bc4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:10:01 +0200 Subject: [PATCH 439/613] new config --- src/config/block_size.rs | 47 ++++++ src/config/compression.rs | 67 +++++++++ src/config/filter.rs | 63 ++++++++ src/{config.rs => config/mod.rs} | 248 +++++++++++++++++-------------- src/config/pinning.rs | 40 +++++ src/config/restart_interval.rs | 40 +++++ src/config_new/mod.rs | 1 + src/lib.rs | 5 +- 8 files changed, 402 insertions(+), 109 deletions(-) create mode 100644 src/config/block_size.rs create mode 100644 src/config/compression.rs create mode 100644 src/config/filter.rs rename src/{config.rs => config/mod.rs} (58%) create mode 100644 src/config/pinning.rs create mode 100644 src/config/restart_interval.rs create mode 100644 src/config_new/mod.rs diff --git a/src/config/block_size.rs b/src/config/block_size.rs new file mode 100644 index 00000000..94fe002d --- /dev/null +++ b/src/config/block_size.rs @@ -0,0 +1,47 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +/// Block size policy +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BlockSizePolicy(Vec); + +impl std::ops::Deref for BlockSizePolicy { + type Target = [u32]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +// TODO: remove default +impl Default for BlockSizePolicy { + fn default() -> Self { + Self::new(&[4 * 1_024]) + } +} + +impl BlockSizePolicy { + pub(crate) fn get(&self, level: usize) -> u32 { + self.0 + .get(level) + .copied() + .unwrap_or_else(|| self.last().copied().expect("policy should not be empty")) + } + + // TODO: accept Vec... Into>? or owned + + /// Uses the same block size in every level. + #[must_use] + pub fn all(c: u32) -> Self { + Self(vec![c]) + } + + /// Constructs a custom block size policy. 
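All of the per-level policies introduced in this patch resolve lookups the same way: `get(level)` returns the configured entry, and any level past the end of the policy clamps to the last entry. A standalone mirror of that crate-internal lookup, with illustrative block sizes:

    // Mirrors the crate-internal `BlockSizePolicy::get` lookup
    fn resolve(policy: &[u32], level: usize) -> u32 {
        policy
            .get(level)
            .copied()
            .unwrap_or_else(|| *policy.last().expect("policy should not be empty"))
    }

    fn main() {
        // 64 KiB blocks in L0, 16 KiB in L1, 4 KiB from L2 downwards
        let policy = [64 * 1_024, 16 * 1_024, 4 * 1_024];

        assert_eq!(resolve(&policy, 0), 64 * 1_024);
        assert_eq!(resolve(&policy, 2), 4 * 1_024);
        assert_eq!(resolve(&policy, 6), 4 * 1_024); // clamps to the last entry
    }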
+    #[must_use]
+    pub fn new(policy: &[u32]) -> Self {
+        assert!(!policy.is_empty(), "block size policy may not be empty");
+        assert!(policy.len() <= 255, "block size policy is too large");
+        Self(policy.into())
+    }
+}
diff --git a/src/config/compression.rs b/src/config/compression.rs
new file mode 100644
index 00000000..168cdf86
--- /dev/null
+++ b/src/config/compression.rs
@@ -0,0 +1,67 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::CompressionType;
+
+/// Compression policy
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct CompressionPolicy(Vec<CompressionType>);
+
+impl std::ops::Deref for CompressionPolicy {
+    type Target = [CompressionType];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+// TODO: remove default
+impl Default for CompressionPolicy {
+    fn default() -> Self {
+        #[cfg(feature = "lz4")]
+        let c = Self::new(&[CompressionType::None, CompressionType::Lz4]);
+
+        #[cfg(not(feature = "lz4"))]
+        let c = Self::new(&[CompressionType::None]);
+
+        c
+    }
+}
+
+impl CompressionPolicy {
+    pub(crate) fn get(&self, level: usize) -> CompressionType {
+        self.0
+            .get(level)
+            .copied()
+            .unwrap_or_else(|| self.last().copied().expect("policy should not be empty"))
+    }
+
+    // TODO: accept Vec... Into>? or owned
+
+    /// Uses the same compression in every level.
+    #[must_use]
+    pub fn all(c: CompressionType) -> Self {
+        Self(vec![c])
+    }
+
+    /// Constructs a custom compression policy.
+    ///
+    /// # Example
+    ///
+    /// Skip compression in level 0:
+    ///
+    /// ```
+    /// # use lsm_tree::{CompressionType, config::CompressionPolicy};
+    /// let policy = CompressionPolicy::new(&[
+    ///   CompressionType::None,
+    ///   CompressionType::Lz4, // use LZ4 for L1+
+    /// ]);
+    /// ```
+    #[must_use]
+    pub fn new(policy: &[CompressionType]) -> Self {
+        assert!(!policy.is_empty(), "compression policy may not be empty");
+        assert!(policy.len() <= 255, "compression policy is too large");
+        Self(policy.into())
+    }
+}
diff --git a/src/config/filter.rs b/src/config/filter.rs
new file mode 100644
index 00000000..8d6cde99
--- /dev/null
+++ b/src/config/filter.rs
@@ -0,0 +1,63 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+pub use crate::segment::filter::BloomConstructionPolicy;
+
+/// Filter policy entry
+///
+/// Each level can be configured with a different filter type and bits per key
+#[derive(Copy, Debug, Clone, PartialEq)]
+pub enum FilterPolicyEntry {
+    /// Skip filter construction
+    None,
+
+    /// Standard bloom filter with K bits per key
+    Bloom(BloomConstructionPolicy),
+}
+
+/// Filter policy
+#[derive(Debug, Clone, PartialEq)]
+pub struct FilterPolicy(Vec<FilterPolicyEntry>);
+
+impl std::ops::Deref for FilterPolicy {
+    type Target = [FilterPolicyEntry];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+// TODO: remove default
+impl Default for FilterPolicy {
+    fn default() -> Self {
+        Self::new(&[FilterPolicyEntry::Bloom(
+            BloomConstructionPolicy::BitsPerKey(10.0),
+        )])
+    }
+}
+
+impl FilterPolicy {
+    pub(crate) fn get(&self, level: usize) -> FilterPolicyEntry {
+        self.0
+            .get(level)
+            .copied()
+            .unwrap_or_else(|| self.last().copied().expect("policy should not be empty"))
+    }
+
+    // TODO: accept Vec... Into>? or owned
+
+    /// Uses the same filter settings in every level.
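One thing a per-level filter policy enables is Monkey-style tuning: spend more filter bits on the small, hot levels and fewer (or none) further down. A construction sketch (the bit counts are illustrative):

    use lsm_tree::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry};

    fn main() {
        let policy = FilterPolicy::new(&[
            // L0 + L1: full-accuracy filters, cheap at this size
            FilterPolicyEntry::Bloom(BloomConstructionPolicy::BitsPerKey(10.0)),
            FilterPolicyEntry::Bloom(BloomConstructionPolicy::BitsPerKey(10.0)),
            // L2 and deeper: fewer bits per key
            FilterPolicyEntry::Bloom(BloomConstructionPolicy::BitsPerKey(5.0)),
        ]);

        // would be plugged into a tree via Config::filter_policy(policy)
        let _ = policy;
    }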
+ #[must_use] + pub fn all(c: FilterPolicyEntry) -> Self { + Self(vec![c]) + } + + /// Constructs a custom block size policy. + #[must_use] + pub fn new(policy: &[FilterPolicyEntry]) -> Self { + assert!(!policy.is_empty(), "compression policy may not be empty"); + assert!(policy.len() <= 255, "compression policy is too large"); + Self(policy.into()) + } +} diff --git a/src/config.rs b/src/config/mod.rs similarity index 58% rename from src/config.rs rename to src/config/mod.rs index 7ff4a83a..0b890d9f 100644 --- a/src/config.rs +++ b/src/config/mod.rs @@ -2,6 +2,18 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +mod block_size; +mod compression; +mod filter; +mod pinning; +mod restart_interval; + +pub use block_size::BlockSizePolicy; +pub use compression::CompressionPolicy; +pub use filter::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; +pub use pinning::PinningPolicy; +pub use restart_interval::RestartIntervalPolicy; + use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree}; use std::{ path::{Path, PathBuf}, @@ -48,40 +60,57 @@ pub struct Config { #[doc(hidden)] pub path: PathBuf, + /// Block cache to use + #[doc(hidden)] + pub cache: Arc, + + /// Descriptor table to use + #[doc(hidden)] + pub descriptor_table: Arc, + /// Tree type (unused) #[allow(unused)] pub tree_type: TreeType, - /// What type of compression is used - pub compression: CompressionType, + /// Number of levels of the LSM tree (depth of tree) + pub level_count: u8, - /// What type of compression is used for blobs - pub blob_compression: CompressionType, + /// What type of compression is used for data blocks + pub data_block_compression_policy: CompressionPolicy, + + /// What type of compression is used for index blocks + pub index_block_compression_policy: CompressionPolicy, /// Restart interval inside data blocks - pub data_block_restart_interval: u8, + pub data_block_restart_interval_policy: RestartIntervalPolicy, + + /// Restart interval inside index blocks + pub index_block_restart_interval_policy: RestartIntervalPolicy, /// Hash bytes per key in data blocks pub data_block_hash_ratio: f32, /// Block size of data blocks - pub data_block_size: u32, + pub data_block_size_policy: BlockSizePolicy, /// Block size of index blocks - pub index_block_size: u32, + pub index_block_size_policy: BlockSizePolicy, - /// Number of levels of the LSM tree (depth of tree) - pub level_count: u8, + /// Whether to pin index blocks + pub index_block_pinning_policy: PinningPolicy, - /// Bits per key for levels that are not L0, L1, L2 - // NOTE: bloom_bits_per_key is not conditionally compiled, - // because that would change the file format - #[doc(hidden)] - pub bloom_bits_per_key: i8, + /// Whether to pin filter blocks + pub filter_block_pinning_policy: PinningPolicy, - /// Block cache to use - #[doc(hidden)] - pub cache: Arc, + /// If `true`, the last level will not build filters, reducing the filter size of a database + /// by ~90% typically + pub(crate) expect_point_read_hits: bool, + + /// Filter construction policy + pub filter_policy: FilterPolicy, + + /// What type of compression is used for blobs + pub blob_compression: CompressionType, /// Blob file (value log segment) target size in bytes #[doc(hidden)] @@ -90,10 +119,6 @@ pub struct Config { /// Key-value separation threshold in bytes #[doc(hidden)] pub blob_file_separation_threshold: u32, - - /// Descriptor table to use - #[doc(hidden)] - pub 
descriptor_table: Arc, } impl Default for Config { @@ -104,20 +129,31 @@ impl Default for Config { cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), - data_block_restart_interval: 16, + data_block_restart_interval_policy: RestartIntervalPolicy::all(16), + index_block_restart_interval_policy: RestartIntervalPolicy::all(1), + data_block_hash_ratio: 0.0, - data_block_size: /* 4 KiB */ 4_096, - index_block_size: /* 4 KiB */ 4_096, level_count: 7, tree_type: TreeType::Standard, - // table_type: TableType::Block, - compression: CompressionType::None, + + data_block_size_policy: BlockSizePolicy::default(), + index_block_size_policy: BlockSizePolicy::default(), + + index_block_pinning_policy: PinningPolicy::new(&[true, true, false]), + filter_block_pinning_policy: PinningPolicy::new(&[true, false]), + + data_block_compression_policy: CompressionPolicy::default(), + index_block_compression_policy:CompressionPolicy::all(CompressionType::None), + blob_compression: CompressionType::None, - bloom_bits_per_key: 10, + + filter_policy: FilterPolicy::default(), blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024, + + expect_point_read_hits: false, } } } @@ -131,6 +167,49 @@ impl Config { } } + /// Sets the global cache. + /// + /// You can create a global [`Cache`] and share it between multiple + /// trees to cap global cache memory usage. + /// + /// Defaults to a cache with 8 MiB of capacity *per tree*. + #[must_use] + pub fn use_cache(mut self, cache: Arc) -> Self { + self.cache = cache; + self + } + + #[must_use] + #[doc(hidden)] + pub fn use_descriptor_table(mut self, descriptor_table: Arc) -> Self { + self.descriptor_table = descriptor_table; + self + } + + /// If `true`, the last level will not build filters, reducing the filter size of a database + /// by ~90% typically. + /// + /// **Enable this only if you know that point reads generally are expected to find a key-value pair.** + #[must_use] + pub fn expect_point_read_hits(mut self, b: bool) -> Self { + self.expect_point_read_hits = b; + self + } + + /// Sets the pinning policy for filter blocks. + #[must_use] + pub fn filter_block_pinning_policy(mut self, policy: PinningPolicy) -> Self { + self.filter_block_pinning_policy = policy; + self + } + + /// Sets the pinning policy for index blocks. + #[must_use] + pub fn index_block_pinning_policy(mut self, policy: PinningPolicy) -> Self { + self.index_block_pinning_policy = policy; + self + } + /// Sets the restart interval inside data blocks. /// /// A higher restart interval saves space while increasing lookup times @@ -138,8 +217,20 @@ impl Config { /// /// Default = 16 #[must_use] - pub fn data_block_restart_interval(mut self, i: u8) -> Self { - self.data_block_restart_interval = i; + pub fn data_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self { + self.data_block_restart_interval_policy = policy; + self + } + + /// Sets the restart interval inside index blocks. + /// + /// A higher restart interval saves space while increasing lookup times + /// inside index blocks. 
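Taken together, the policy-based setters replace the old scalar options: one builder call per block property, each taking a per-level policy. A configuration sketch under the new API (all values illustrative; LZ4 needs the `lz4` feature):

    use lsm_tree::config::{BlockSizePolicy, CompressionPolicy, RestartIntervalPolicy};
    use lsm_tree::{CompressionType, Config};

    fn main() -> lsm_tree::Result<()> {
        let folder = tempfile::tempdir()?;

        let tree = Config::new(&folder)
            // no compression in L0, LZ4 from L1 downwards
            .data_block_compression_policy(CompressionPolicy::new(&[
                CompressionType::None,
                CompressionType::Lz4,
            ]))
            .data_block_size_policy(BlockSizePolicy::all(4 * 1_024))
            .data_block_restart_interval_policy(RestartIntervalPolicy::all(16))
            .index_block_restart_interval_policy(RestartIntervalPolicy::all(1))
            .open()?;

        let _ = tree;
        Ok(())
    }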
+ /// + /// Default = 1 + #[must_use] + pub fn index_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self { + self.index_block_restart_interval_policy = policy; self } @@ -155,44 +246,32 @@ impl Config { /// Default = 0.0 #[must_use] pub fn data_block_hash_ratio(mut self, ratio: f32) -> Self { - self.data_block_hash_ratio = ratio; + self.data_block_hash_ratio = ratio; // TODO: policy self } - /// Sets the bits per key to use for bloom filters - /// in levels that are not L0 or L1. - /// - /// Use -1 to disable bloom filters even in L0, L1, L2. - /// - /// Defaults to 10 bits. - /// - /// # Panics - /// - /// Panics if `n` is less than -1. + /// Sets the filter construction policy. #[must_use] - pub fn bloom_bits_per_key(mut self, bits: i8) -> Self { - assert!(bits >= -1, "invalid bits_per_key value"); + pub fn filter_policy(mut self, policy: FilterPolicy) -> Self { + self.filter_policy = policy; + self + } - self.bloom_bits_per_key = bits; + /// Sets the compression method for data blocks. + #[must_use] + pub fn data_block_compression_policy(mut self, policy: CompressionPolicy) -> Self { + self.data_block_compression_policy = policy; self } - /// Sets the compression method. - /// - /// Using some compression is recommended. - /// - /// Default = None + /// Sets the compression method for index blocks. #[must_use] - pub fn compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; + pub fn index_block_compression_policy(mut self, policy: CompressionPolicy) -> Self { + self.index_block_compression_policy = policy; self } - /// Sets the compression method. - /// - /// Using some compression is recommended. - /// - /// Default = None + /// Sets the blob compression method. #[must_use] pub fn blob_compression(mut self, compression: CompressionType) -> Self { self.blob_compression = compression; @@ -216,57 +295,17 @@ impl Config { self } - /// Sets the data block size. - /// - /// Defaults to 4 KiB (4096 bytes). - /// - /// For point read heavy workloads (get) a sensible default is - /// somewhere between 4 - 8 KiB, depending on the average value size. - /// - /// For scan heavy workloads (range, prefix), use 16 - 64 KiB - /// which also increases compression efficiency. - /// - /// # Panics - /// - /// Panics if the block size is smaller than 1 KiB or larger than 512 KiB. - #[must_use] - pub fn data_block_size(mut self, block_size: u32) -> Self { - assert!(block_size >= 1_024); - assert!(block_size <= 512 * 1_024); - self.data_block_size = block_size; - self - } - - /// Sets the index block size. - /// - /// Defaults to 4 KiB (4096 bytes). - /// - /// For point read heavy workloads (get) a sensible default is - /// somewhere between 4 - 8 KiB, depending on the average value size. - /// - /// For scan heavy workloads (range, prefix), use 16 - 64 KiB - /// which also increases compression efficiency. - /// - /// # Panics - /// - /// Panics if the block size is smaller than 1 KiB or larger than 512 KiB. + /// Sets the data block size policy. #[must_use] - pub fn index_block_size(mut self, block_size: u32) -> Self { - assert!(block_size >= 1_024); - assert!(block_size <= 512 * 1_024); - self.index_block_size = block_size; + pub fn data_block_size_policy(mut self, policy: BlockSizePolicy) -> Self { + self.data_block_size_policy = policy; self } - /// Sets the global cache. - /// - /// You can create a global [`Cache`] and share it between multiple - /// trees to cap global cache memory usage. 
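Because the cache is taken as an `Arc`, capping global memory across several trees is just a matter of cloning one handle into each config. A sketch (capacity is illustrative):

    use lsm_tree::{Cache, Config};
    use std::sync::Arc;

    fn main() -> lsm_tree::Result<()> {
        // one 64 MiB cache shared by every tree in the process
        let cache = Arc::new(Cache::with_capacity_bytes(64 * 1_024 * 1_024));

        let dir_a = tempfile::tempdir()?;
        let dir_b = tempfile::tempdir()?;

        let tree_a = Config::new(&dir_a).use_cache(cache.clone()).open()?;
        let tree_b = Config::new(&dir_b).use_cache(cache.clone()).open()?;

        let _ = (tree_a, tree_b);
        Ok(())
    }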
-    ///
-    /// Defaults to a cache with 8 MiB of capacity *per tree*.
+    /// Sets the index block size policy.
     #[must_use]
-    pub fn use_cache(mut self, cache: Arc<Cache>) -> Self {
-        self.cache = cache;
+    pub fn index_block_size_policy(mut self, policy: BlockSizePolicy) -> Self {
+        self.index_block_size_policy = policy;
         self
     }
 
-    #[must_use]
-    #[doc(hidden)]
-    pub fn descriptor_table(mut self, descriptor_table: Arc<DescriptorTable>) -> Self {
-        self.descriptor_table = descriptor_table;
-        self
-    }
-
     /// Opens a tree using the config.
     ///
     /// # Errors
diff --git a/src/config/pinning.rs b/src/config/pinning.rs
new file mode 100644
index 00000000..fc486ebb
--- /dev/null
+++ b/src/config/pinning.rs
@@ -0,0 +1,40 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+/// Pinning policy
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct PinningPolicy(Vec<bool>);
+
+impl std::ops::Deref for PinningPolicy {
+    type Target = [bool];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl PinningPolicy {
+    pub(crate) fn get(&self, level: usize) -> bool {
+        self.0
+            .get(level)
+            .copied()
+            .unwrap_or_else(|| self.last().copied().expect("policy should not be empty"))
+    }
+
+    // TODO: accept Vec... Into>? or owned
+
+    /// Uses the same pinning in every level.
+    #[must_use]
+    pub fn all(c: bool) -> Self {
+        Self(vec![c])
+    }
+
+    /// Constructs a custom pinning policy.
+    #[must_use]
+    pub fn new(policy: &[bool]) -> Self {
+        assert!(!policy.is_empty(), "pinning policy may not be empty");
+        assert!(policy.len() <= 255, "pinning policy is too large");
+        Self(policy.into())
+    }
+}
diff --git a/src/config/restart_interval.rs b/src/config/restart_interval.rs
new file mode 100644
index 00000000..a58f0043
--- /dev/null
+++ b/src/config/restart_interval.rs
@@ -0,0 +1,40 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+/// Restart interval policy
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct RestartIntervalPolicy(Vec<u8>);
+
+impl std::ops::Deref for RestartIntervalPolicy {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl RestartIntervalPolicy {
+    pub(crate) fn get(&self, level: usize) -> u8 {
+        self.0
+            .get(level)
+            .copied()
+            .unwrap_or_else(|| self.last().copied().expect("policy should not be empty"))
+    }
+
+    // TODO: accept Vec... Into>? or owned
+
+    /// Uses the same restart interval in every level.
+    #[must_use]
+    pub fn all(c: u8) -> Self {
+        Self(vec![c])
+    }
+
+    /// Constructs a custom restart interval policy.
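The pinning defaults seen earlier in this patch read as per-level booleans: `[true, true, false]` pins index blocks for L0 and L1 and pages everything deeper in on demand, while `[true, false]` pins filters for L0 only. Overriding them is one builder call per block kind; a sketch with an illustrative choice that also pins L1 filters:

    use lsm_tree::config::PinningPolicy;
    use lsm_tree::Config;

    fn main() -> lsm_tree::Result<()> {
        let folder = tempfile::tempdir()?;

        let tree = Config::new(&folder)
            // pin L0 + L1 filter and index blocks, page deeper ones in on demand
            .filter_block_pinning_policy(PinningPolicy::new(&[true, true, false]))
            .index_block_pinning_policy(PinningPolicy::new(&[true, true, false]))
            .open()?;

        let _ = tree;
        Ok(())
    }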
+ #[must_use] + pub fn new(policy: &[u8]) -> Self { + assert!(!policy.is_empty(), "compression policy may not be empty"); + assert!(policy.len() <= 255, "compression policy is too large"); + Self(policy.into()) + } +} diff --git a/src/config_new/mod.rs b/src/config_new/mod.rs new file mode 100644 index 00000000..06e30bf8 --- /dev/null +++ b/src/config_new/mod.rs @@ -0,0 +1 @@ +crate::config diff --git a/src/lib.rs b/src/lib.rs index 2eefd01f..a6ded8ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,7 +137,10 @@ pub mod coding; pub mod compaction; mod compression; -mod config; + +/// Configuration +pub mod config; + mod double_ended_peekable; mod error; From 580352b811419dc6dda8b87bbf70d55f1c9318ae Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:12:02 +0200 Subject: [PATCH 440/613] refactor data block --- src/segment/block/decoder.rs | 1 + src/segment/data_block/mod.rs | 24 +++++++++++++++--------- src/segment/index_block/iter.rs | 5 ++++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 9cbd8dd7..3b854541 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -100,6 +100,7 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa "invalid binary index step size", ); + // TODO: flip len, offset let binary_index_offset = unwrap!(reader.read_u32::()); let binary_index_len = unwrap!(reader.read_u32::()); diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index f62a15d2..e05dbfae 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -293,10 +293,12 @@ impl DataBlock { } pub(crate) fn get_binary_index_reader(&self) -> BinaryIndexReader<'_> { + use std::mem::size_of; + let trailer = Trailer::new(&self.inner); // NOTE: Skip item count (u32) and restart interval (u8) - let offset = std::mem::size_of::() + std::mem::size_of::(); + let offset = size_of::() + size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); @@ -307,6 +309,7 @@ impl DataBlock { "invalid binary index step size", ); + // TODO: 3.0.0 flip len and offset let binary_index_offset = unwrap!(reader.read_u32::()); let binary_index_len = unwrap!(reader.read_u32::()); @@ -320,18 +323,21 @@ impl DataBlock { #[must_use] pub fn get_hash_index_reader(&self) -> Option> { + use std::mem::size_of; + let trailer = Trailer::new(&self.inner); // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) // and binary stuff (2x u32) - let offset = std::mem::size_of::() - + std::mem::size_of::() - + std::mem::size_of::() - + std::mem::size_of::() - + std::mem::size_of::(); + let offset = size_of::() + + size_of::() + + size_of::() + + size_of::() + + size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); + // TODO: 3.0.0 flip offset and len, so we can terminate after len if == 0 let hash_index_offset = unwrap!(reader.read_u32::()); let hash_index_len = unwrap!(reader.read_u32::()); @@ -431,13 +437,13 @@ impl DataBlock { /// The number of pointers is equal to the number of restart intervals. 
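The trailer parsing above always skips the same fixed-width prefix before reaching the field it wants; the offsets follow directly from the field sizes named in the comments (item count u32, restart interval u8, binary index step size u8, then binary index offset and length as two u32 words). A standalone sketch of that offset arithmetic:

    use std::mem::size_of;

    /// Offset of the binary index offset/len words: skips item count (u32),
    /// restart interval (u8) and binary index step size (u8).
    const fn binary_index_words_offset() -> usize {
        size_of::<u32>() + size_of::<u8>() + size_of::<u8>()
    }

    /// Offset of the hash index words: additionally skips the binary index
    /// offset + length (2x u32).
    const fn hash_index_words_offset() -> usize {
        binary_index_words_offset() + 2 * size_of::<u32>()
    }

    fn main() {
        assert_eq!(binary_index_words_offset(), 6);
        assert_eq!(hash_index_words_offset(), 14);
    }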
#[must_use] pub fn binary_index_len(&self) -> u32 { + use std::mem::size_of; + let trailer = Trailer::new(&self.inner); // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8), // and binary index offset (u32) - let offset = std::mem::size_of::() - + (2 * std::mem::size_of::()) - + std::mem::size_of::(); + let offset = size_of::() + (2 * size_of::()) + size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); unwrap!(reader.read_u32::()) diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index 15a8a0c0..f3c44600 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -277,8 +277,11 @@ mod tests { Ok(()) } + // TODO: seek and seek_upper need separate binary search routines...? + // TODO: because seeking in [a,b,c] to e should return None for seek, + // TODO: but not for seek_upper #[test] - #[ignore] + #[ignore = "should not seek"] fn v3_index_block_iter_too_far() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), From 6e69acdc93a08bf9170243df73782df3795730ef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:12:51 +0200 Subject: [PATCH 441/613] change bloom filter bpk to float --- src/config_new/mod.rs | 1 - src/segment/filter/mod.rs | 10 +++++----- src/segment/filter/standard_bloom/builder.rs | 10 ++++------ src/segment/filter/standard_bloom/mod.rs | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) delete mode 100644 src/config_new/mod.rs diff --git a/src/config_new/mod.rs b/src/config_new/mod.rs deleted file mode 100644 index 06e30bf8..00000000 --- a/src/config_new/mod.rs +++ /dev/null @@ -1 +0,0 @@ -crate::config diff --git a/src/segment/filter/mod.rs b/src/segment/filter/mod.rs index 641b558a..a4c8458e 100644 --- a/src/segment/filter/mod.rs +++ b/src/segment/filter/mod.rs @@ -8,15 +8,15 @@ pub mod standard_bloom; use standard_bloom::Builder as StandardBloomFilterBuilder; -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, PartialEq)] pub enum BloomConstructionPolicy { - BitsPerKey(u8), - FpRate(f32), + BitsPerKey(f32), + FpRate(f32), // TODO: 3.0.0 rename: FalsePositiveRate? } impl Default for BloomConstructionPolicy { fn default() -> Self { - Self::BitsPerKey(10) + Self::BitsPerKey(10.0) } } @@ -34,7 +34,7 @@ impl BloomConstructionPolicy { #[must_use] pub fn is_active(&self) -> bool { match self { - Self::BitsPerKey(bpk) => *bpk > 0, + Self::BitsPerKey(bpk) => *bpk > 0.0, Self::FpRate(fpr) => *fpr > 0.0, } } diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs index b7337379..6697abb6 100644 --- a/src/segment/filter/standard_bloom/builder.rs +++ b/src/segment/filter/standard_bloom/builder.rs @@ -77,16 +77,14 @@ impl Builder { /// /// 10 bits per key is a sensible default. 
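The switch to a float bits-per-key only changes the parameter type; the sizing math stays the classic bloom filter arithmetic: m = n * bpk total bits and k = max(1, floor(bpk * ln 2)) hash functions. A standalone check mirroring the derivation in `with_bpk` (not calling it):

    use std::f32::consts::LN_2;

    // Mirrors the m/k derivation in `Builder::with_bpk`
    fn bloom_params(n: usize, bpk: f32) -> (usize, usize) {
        assert!(n > 0 && bpk > 0.0);
        let m = n * (bpk as usize); // total filter bits
        let k = ((bpk * LN_2) as usize).max(1); // number of hash functions
        (m, k)
    }

    fn main() {
        let (m, k) = bloom_params(1_000, 10.0);
        assert_eq!(m, 10_000);
        assert_eq!(k, 6); // 10 * ln 2 ≈ 6.93, truncated
    }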
#[must_use] - pub fn with_bpk(n: usize, bpk: u8) -> Self { + pub fn with_bpk(n: usize, bpk: f32) -> Self { use std::f32::consts::LN_2; - assert!(bpk > 0); + assert!(bpk > 0.0); assert!(n > 0); - let bpk = bpk as usize; - - let m = n * bpk; - let k = (((bpk as f32) * LN_2) as usize).max(1); + let m = n * (bpk as usize); + let k = ((bpk * LN_2) as usize).max(1); // NOTE: Round up so we don't get too little bits let bytes = (m as f32 / 8.0).ceil() as usize; diff --git a/src/segment/filter/standard_bloom/mod.rs b/src/segment/filter/standard_bloom/mod.rs index 645d35e0..8c531c8f 100644 --- a/src/segment/filter/standard_bloom/mod.rs +++ b/src/segment/filter/standard_bloom/mod.rs @@ -187,7 +187,7 @@ mod tests { #[test] fn filter_bloom_standard_bpk() -> crate::Result<()> { let item_count = 1_000; - let bpk = 5; + let bpk = 5.0; let mut filter = Builder::with_bpk(item_count, bpk); From 4382b52459c97dc500dd11c16cc073ac8ae73764 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:12:55 +0200 Subject: [PATCH 442/613] wip --- src/compression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compression.rs b/src/compression.rs index 24f8348f..0391229d 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -61,7 +61,7 @@ impl std::fmt::Display for CompressionType { f, "{}", match self { - Self::None => "no compression", + Self::None => "none", #[cfg(feature = "lz4")] Self::Lz4 => "lz4", From e0c136a5021efb1eee0e70452b1070a21d1b9a6d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:13:04 +0200 Subject: [PATCH 443/613] whitespace --- src/compaction/leveled.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 22225f58..ae50df5a 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -407,6 +407,7 @@ impl CompactionStrategy for Strategy { Choice::Merge(choice) } } + /* #[cfg(test)] mod tests { From 137abaa4f5e83f92d536b4f10383a7b47463033f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:13:57 +0200 Subject: [PATCH 444/613] use new config in flush --- src/tree/mod.rs | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 5854369f..cfbaefbd 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -269,23 +269,36 @@ impl AbstractTree for Tree { let folder = self.config.path.join(SEGMENTS_FOLDER); let segment_file_path = folder.join(segment_id.to_string()); - log::debug!("writing segment to {}", segment_file_path.display()); + + let data_block_size = self.config.data_block_size_policy.get(0); + let index_block_size = self.config.index_block_size_policy.get(0); + + let data_block_restart_interval = self.config.data_block_restart_interval_policy.get(0); + let index_block_restart_interval = self.config.index_block_restart_interval_policy.get(0); + + let data_block_compression = self.config.data_block_compression_policy.get(0); + let index_block_compression = self.config.index_block_compression_policy.get(0); + + log::debug!( + "Flushing segment to {}, data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}", + segment_file_path.display(), + ); let mut segment_writer = Writer::new(segment_file_path, segment_id)? 
-            .use_data_block_compression(self.config.compression)
-            .use_data_block_size(self.config.data_block_size)
+            .use_data_block_restart_interval(data_block_restart_interval)
+            .use_index_block_restart_interval(index_block_restart_interval)
+            .use_data_block_compression(data_block_compression)
+            .use_index_block_compression(index_block_compression)
+            .use_data_block_size(data_block_size)
+            .use_index_block_size(index_block_size)
             .use_data_block_hash_ratio(self.config.data_block_hash_ratio)
             .use_bloom_policy({
+                use crate::config::FilterPolicyEntry::{Bloom, None};
                 use crate::segment::filter::BloomConstructionPolicy;
 
-                if self.config.bloom_bits_per_key >= 0 {
-                    // TODO: enable monkey later on
-                    // BloomConstructionPolicy::FpRate(0.00001)
-                    BloomConstructionPolicy::BitsPerKey(
-                        self.config.bloom_bits_per_key.unsigned_abs(),
-                    )
-                } else {
-                    BloomConstructionPolicy::BitsPerKey(0)
+                match self.config.filter_policy.get(0) {
+                    Bloom(policy) => policy,
+                    None => BloomConstructionPolicy::BitsPerKey(0.0),
                 }
             });
@@ -595,13 +608,16 @@ impl Tree {
             .descriptor_table
             .insert(segment_file_path, created_segment.global_id()); */
 
+        let pin_filter = self.config.filter_block_pinning_policy.get(0);
+        let pin_index = self.config.index_block_pinning_policy.get(0);
+
         let created_segment = Segment::recover(
             segment_file_path,
             self.id,
             self.config.cache.clone(),
             self.config.descriptor_table.clone(),
-            true, // TODO: look at configuration
-            true, // TODO: look at configuration
+            pin_filter,
+            pin_index,
             #[cfg(feature = "metrics")]
             self.metrics.clone(),
         )?;
@@ -882,7 +898,6 @@ impl Tree {
 
         // IMPORTANT: Restore persisted config
         config.level_count = manifest.level_count;
-        // config.table_type = manifest.table_type;
         config.tree_type = manifest.tree_type;
 
         let tree_id = get_next_tree_id();
From 72c1eab3837fc6ab26677221256684bf0473ddf1 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 24 Sep 2025 19:14:16 +0200
Subject: [PATCH 445/613] use new config for compaction

---
 src/compaction/worker.rs | 62 ++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs
index 79c412c7..40c66bfb 100644
--- a/src/compaction/worker.rs
+++ b/src/compaction/worker.rs
@@ -222,11 +222,30 @@ fn merge_segments(
 
     let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER);
 
+    let data_block_size = opts.config.data_block_size_policy.get(0);
+    let index_block_size = opts.config.index_block_size_policy.get(0);
+
+    let data_block_restart_interval = opts.config.data_block_restart_interval_policy.get(0);
+    let index_block_restart_interval = opts.config.index_block_restart_interval_policy.get(0);
+
+    let data_block_compression = opts
+        .config
+        .data_block_compression_policy
+        .get(payload.canonical_level.into());
+
+    let index_block_compression = opts
+        .config
+        .index_block_compression_policy
+        .get(payload.canonical_level.into());
+
+    let pin_filter = opts.config.filter_block_pinning_policy.get(0);
+    let pin_index = opts.config.index_block_pinning_policy.get(0);
+
     log::debug!(
-        "Compacting segments {:?} into L{}, compression={}, mvcc_gc_watermark={}",
+        "Compacting segments {:?} into L{} (canonical L{}), data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}",
        payload.segment_ids,
payload.dest_level, - opts.config.compression, + payload.canonical_level, opts.eviction_seqno, ); @@ -276,29 +295,28 @@ fn merge_segments( }; let mut segment_writer = segment_writer - .use_data_block_restart_interval(16) - .use_data_block_compression(opts.config.compression) - .use_data_block_size(opts.config.data_block_size) + .use_data_block_restart_interval(data_block_restart_interval) + .use_index_block_restart_interval(index_block_restart_interval) + .use_data_block_compression(data_block_compression) + .use_data_block_size(data_block_size) + .use_index_block_size(index_block_size) .use_data_block_hash_ratio(opts.config.data_block_hash_ratio) + .use_index_block_compression(index_block_compression) .use_bloom_policy({ + use crate::config::FilterPolicyEntry::{Bloom, None}; use crate::segment::filter::BloomConstructionPolicy; - if opts.config.bloom_bits_per_key >= 0 { - // TODO: - // NOTE: Apply some MONKEY to have very high FPR on small levels - // because it's cheap - // - // See https://nivdayan.github.io/monkeykeyvaluestore.pdf - /* match payload.dest_level { - 0 => BloomConstructionPolicy::FpRate(0.00001), - 1 => BloomConstructionPolicy::FpRate(0.0005), - _ => BloomConstructionPolicy::BitsPerKey( - opts.config.bloom_bits_per_key.unsigned_abs(), - ), - } */ - BloomConstructionPolicy::BitsPerKey(opts.config.bloom_bits_per_key.unsigned_abs()) + if is_last_level && opts.config.expect_point_read_hits { + BloomConstructionPolicy::BitsPerKey(0.0) } else { - BloomConstructionPolicy::BitsPerKey(0) + match opts + .config + .filter_policy + .get(usize::from(payload.dest_level)) + { + Bloom(policy) => policy, + None => BloomConstructionPolicy::BitsPerKey(0.0), + } } }); @@ -370,8 +388,8 @@ fn merge_segments( opts.tree_id, opts.config.cache.clone(), opts.config.descriptor_table.clone(), - payload.canonical_level <= 1, // TODO: look at configuration - payload.canonical_level <= 2, // TODO: look at configuration + pin_filter, + pin_index, #[cfg(feature = "metrics")] opts.metrics.clone(), ) From 58fd29ceed03f8bbc20261c61b0686f020cc61e4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:14:30 +0200 Subject: [PATCH 446/613] add index block compression meta property --- src/segment/meta.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 0e760bae..8801e420 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -46,6 +46,7 @@ pub struct ParsedMeta { pub tombstone_count: u64, pub data_block_compression: CompressionType, + pub index_block_compression: CompressionType, } impl ParsedMeta { @@ -184,6 +185,15 @@ impl ParsedMeta { CompressionType::decode_from(&mut bytes)? }; + let index_block_compression = { + let bytes = block + .point_read(b"#compression#index", SeqNo::MAX) + .expect("size should exist"); + + let mut bytes = &bytes.value[..]; + CompressionType::decode_from(&mut bytes)? 
+ }; + Ok(Self { id, created_at, @@ -195,6 +205,7 @@ impl ParsedMeta { item_count, tombstone_count, data_block_compression, + index_block_compression, }) } } From d559b08f52f1eb795f0649fda0f17d0694530d78 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:14:47 +0200 Subject: [PATCH 447/613] rename method --- src/segment/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index fa937a47..c1648dda 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -104,7 +104,12 @@ impl Segment { } #[must_use] - pub fn pinned_bloom_filter_size(&self) -> usize { + pub fn filter_size(&self) -> usize { + unimplemented!() + } + + #[must_use] + pub fn pinned_filter_size(&self) -> usize { self.pinned_filter_block .as_ref() .map(Block::size) From 2be11e164f3991c19f478e90f6833ae5c6c056b8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:14:58 +0200 Subject: [PATCH 448/613] correctly use index block compression in segment read path --- src/segment/mod.rs | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index c1648dda..35dbbea6 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -196,7 +196,7 @@ impl Segment { let block = self.load_block( filter_block_handle, BlockType::Filter, - CompressionType::None, + CompressionType::None, // NOTE: We never write a filter block with compression )?; let filter = StandardBloomFilterReader::new(&block.data)?; @@ -222,13 +222,11 @@ impl Segment { // TODO: enum_dispatch BlockIndex::iter let index_block = match &*self.block_index { BlockIndexImpl::Full(index) => index.inner(), - BlockIndexImpl::VolatileFull => { - &IndexBlock::new(self.load_block( - &self.regions.tli, - BlockType::Index, - CompressionType::None, // TODO: allow index block compression - )?) 
- } + BlockIndexImpl::VolatileFull => &IndexBlock::new(self.load_block( + &self.regions.tli, + BlockType::Index, + self.metadata.index_block_compression, + )?), }; let iter = { @@ -329,7 +327,7 @@ impl Segment { self.load_block( &self.regions.tli, BlockType::Index, - CompressionType::None, // TODO: allow separate index block compression + self.metadata.index_block_compression, ) .expect("should load block"), ) @@ -410,11 +408,7 @@ impl Segment { let tli_block = { log::debug!("Reading TLI block, with tli_ptr={:?}", regions.tli); - let block = Block::from_file( - &file, - regions.tli, - CompressionType::None, // TODO: allow setting index block compression - )?; + let block = Block::from_file(&file, regions.tli, metadata.index_block_compression)?; if block.header.block_type != BlockType::Index { return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( From 77a8137c11d8c215eb75454b267935bf8b8a71c5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:15:38 +0200 Subject: [PATCH 449/613] add more options to segment writer --- src/segment/multi_writer.rs | 41 ++++++++++++++++++++++++++++++++++ src/segment/writer/index.rs | 7 ++---- src/segment/writer/mod.rs | 44 +++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index ba494eda..c4c55561 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -17,8 +17,12 @@ pub struct MultiWriter { base_path: PathBuf, data_block_hash_ratio: f32, + data_block_size: u32, + index_block_size: u32, + data_block_restart_interval: u8, + index_block_restart_interval: u8, /// Target size of segments in bytes /// @@ -34,6 +38,7 @@ pub struct MultiWriter { pub writer: Writer, pub data_block_compression: CompressionType, + pub index_block_compression: CompressionType, bloom_policy: BloomConstructionPolicy, @@ -57,8 +62,12 @@ impl MultiWriter { base_path, data_block_hash_ratio: 0.0, + data_block_size: 4_096, + index_block_size: 4_096, + data_block_restart_interval: 16, + index_block_restart_interval: 1, target_size, results: Vec::with_capacity(10), @@ -67,6 +76,7 @@ impl MultiWriter { writer, data_block_compression: CompressionType::None, + index_block_compression: CompressionType::None, bloom_policy: BloomConstructionPolicy::default(), @@ -81,8 +91,17 @@ impl MultiWriter { self } + #[must_use] + pub fn use_index_block_restart_interval(mut self, interval: u8) -> Self { + self.index_block_restart_interval = interval; + self.writer = self.writer.use_index_block_restart_interval(interval); + self + } + #[must_use] pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { + eprintln!("multi write writing data block: data_block_hash_ratio={ratio}"); + self.data_block_hash_ratio = ratio; self.writer = self.writer.use_data_block_hash_ratio(ratio); self @@ -99,6 +118,17 @@ impl MultiWriter { self } + #[must_use] + pub(crate) fn use_index_block_size(mut self, size: u32) -> Self { + assert!( + size <= 4 * 1_024 * 1_024, + "index block size must be <= 4 MiB", + ); + self.index_block_size = size; + self.writer = self.writer.use_index_block_size(size); + self + } + #[must_use] pub fn use_data_block_compression(mut self, compression: CompressionType) -> Self { self.data_block_compression = compression; @@ -106,6 +136,13 @@ impl MultiWriter { self } + #[must_use] + pub fn use_index_block_compression(mut self, compression: CompressionType) -> Self { + self.index_block_compression = compression; + self.writer = 
self.writer.use_index_block_compression(compression); + self + } + #[must_use] pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { self.bloom_policy = bloom_policy; @@ -130,7 +167,11 @@ impl MultiWriter { let new_writer = Writer::new(path, new_segment_id)? .use_data_block_compression(self.data_block_compression) + .use_index_block_compression(self.index_block_compression) .use_data_block_size(self.data_block_size) + .use_index_block_size(self.index_block_size) + .use_data_block_restart_interval(self.data_block_restart_interval) + .use_index_block_restart_interval(self.index_block_restart_interval) .use_bloom_policy(self.bloom_policy) .use_data_block_hash_ratio(self.data_block_hash_ratio); diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index a6a541ca..e6126f5f 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -23,9 +23,7 @@ pub trait BlockIndexWriter { block_file_writer: &mut W, ) -> crate::Result<(BlockHandle, Option)>; - fn use_compression(self, compression: CompressionType) -> Self - where - Self: Sized; + fn set_compression(&mut self, compression: CompressionType); fn len(&self) -> usize; } @@ -49,9 +47,8 @@ impl BlockIndexWriter for FullIndexWriter 1 } - fn use_compression(mut self, compression: CompressionType) -> Self { + fn set_compression(&mut self, compression: CompressionType) { self.compression = compression; - self } fn register_data_block(&mut self, block_handle: KeyedBlockHandle) -> crate::Result<()> { diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 6770aea1..da272f4b 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -27,14 +27,19 @@ pub struct Writer { segment_id: SegmentId, data_block_restart_interval: u8, - data_block_hash_ratio: f32, + index_block_restart_interval: u8, data_block_size: u32, index_block_size: u32, // TODO: implement + data_block_hash_ratio: f32, + /// Compression to use for data blocks data_block_compression: CompressionType, + /// Compression to use for data blocks + index_block_compression: CompressionType, + /// Buffer to serialize blocks into block_buffer: Vec, @@ -75,12 +80,15 @@ impl Writer { segment_id, data_block_restart_interval: 16, + index_block_restart_interval: 1, + data_block_hash_ratio: 0.0, data_block_size: 4_096, index_block_size: 4_096, data_block_compression: CompressionType::None, + index_block_compression: CompressionType::None, path: std::path::absolute(path)?, @@ -108,6 +116,12 @@ impl Writer { self } + #[must_use] + pub fn use_index_block_restart_interval(mut self, interval: u8) -> Self { + self.index_block_restart_interval = interval; + self + } + #[must_use] pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { self.data_block_hash_ratio = ratio; @@ -124,12 +138,29 @@ impl Writer { self } + #[must_use] + pub fn use_index_block_size(mut self, size: u32) -> Self { + assert!( + size <= 4 * 1_024 * 1_024, + "index block size must be <= 4 MiB", + ); + self.index_block_size = size; + self + } + #[must_use] pub fn use_data_block_compression(mut self, compression: CompressionType) -> Self { self.data_block_compression = compression; self } + #[must_use] + pub fn use_index_block_compression(mut self, compression: CompressionType) -> Self { + self.index_block_compression = compression; + self.index_writer.set_compression(compression); + self + } + #[must_use] pub fn use_bloom_policy(mut self, bloom_policy: BloomConstructionPolicy) -> Self { self.bloom_policy = bloom_policy; @@ -348,7 +379,7 @@ impl 
Writer { ), meta( "#compression#index", - &self.data_block_compression.encode_into_vec(), + &self.index_block_compression.encode_into_vec(), ), meta("#created_at", &unix_timestamp().as_nanos().to_le_bytes()), meta( @@ -373,6 +404,14 @@ impl Writer { meta("#key_count", &(self.meta.key_count as u64).to_le_bytes()), meta("#prefix_truncation#data", &[1]), meta("#prefix_truncation#index", &[0]), + meta( + "#restart_interval#data", + &self.data_block_restart_interval.to_le_bytes(), + ), + meta( + "#restart_interval#index", + &self.index_block_restart_interval.to_le_bytes(), + ), meta("#seqno#max", &self.meta.highest_seqno.to_le_bytes()), meta("#seqno#min", &self.meta.lowest_seqno.to_le_bytes()), meta("#size", &self.meta.file_pos.to_le_bytes()), @@ -387,6 +426,7 @@ impl Writer { meta("v#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), meta("v#table", b"3"), // TODO: tli_handle_count + // TODO: hash ratio etc ]; // NOTE: Just to make sure the items are definitely sorted From 15acb2de7407389d0ae0f1e7663fe0bcca28bf60 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:16:13 +0200 Subject: [PATCH 450/613] adjust tests --- tests/blob_drop_after_flush.rs | 4 +- tests/blob_gc_watermark.rs | 45 ++++++++++------ tests/mvcc_slab.rs | 10 ++-- tests/segment_point_reads.rs | 18 +++---- tests/segment_range.rs | 14 ++--- tests/segment_range_oob.rs | 8 ++- tests/segment_remove_weak.rs | 6 +-- tests/snapshot_point_read.rs | 18 +++---- tests/tree_bulk_ingest.rs | 74 +++++++++++++++++++-------- tests/tree_different_block_size.rs | 18 +++---- tests/tree_disjoint_point_read.rs | 18 +++---- tests/tree_non_disjoint_point_read.rs | 6 +-- 12 files changed, 142 insertions(+), 97 deletions(-) diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index 1c592a18..fad0eaf3 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::CompressionPolicy, AbstractTree, Config, SeqNo}; use std::time::Duration; use test_log::test; @@ -12,7 +12,7 @@ fn blob_drop_after_flush() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .compression(lsm_tree::CompressionType::None) + .data_block_compression_policy(CompressionPolicy::all(lsm_tree::CompressionType::None)) .open_as_blob_tree()?; tree.insert("a", "neptune".repeat(10_000), 0); diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index e7c08f6f..8c4b4f46 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use lsm_tree::{config::CompressionPolicy, AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; // NOTE: This was a logic/MVCC error in v2 that could drop @@ -11,39 +11,51 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .compression(lsm_tree::CompressionType::None) + .data_block_compression_policy(CompressionPolicy::all(lsm_tree::CompressionType::None)) .open_as_blob_tree()?; let seqno = SequenceNumberCounter::default(); tree.insert("a", "neptune".repeat(10_000), seqno.next()); - // TODO: test snapshot reads - // let snapshot = tree.snapshot(seqno.get()); - // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + let snapshot_seqno = seqno.get(); + + assert_eq!( + &*tree.get("a", snapshot_seqno)?.unwrap(), + b"neptune".repeat(10_000), + ); assert_eq!( 
&*tree.get("a", SeqNo::MAX)?.unwrap(), - b"neptune".repeat(10_000) + b"neptune".repeat(10_000), ); tree.insert("a", "neptune2".repeat(10_000), seqno.next()); - // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", snapshot_seqno)?.unwrap(), + b"neptune".repeat(10_000), + ); assert_eq!( &*tree.get("a", SeqNo::MAX)?.unwrap(), - b"neptune2".repeat(10_000) + b"neptune2".repeat(10_000), ); tree.insert("a", "neptune3".repeat(10_000), seqno.next()); - // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", snapshot_seqno)?.unwrap(), + b"neptune".repeat(10_000), + ); assert_eq!( &*tree.get("a", SeqNo::MAX)?.unwrap(), - b"neptune3".repeat(10_000) + b"neptune3".repeat(10_000), ); tree.flush_active_memtable(0)?; - // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", snapshot_seqno)?.unwrap(), + b"neptune".repeat(10_000), + ); assert_eq!( &*tree.get("a", SeqNo::MAX)?.unwrap(), - b"neptune3".repeat(10_000) + b"neptune3".repeat(10_000), ); let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; @@ -53,16 +65,19 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { tree.apply_gc_strategy(&strategy, 0)?; // IMPORTANT: We cannot drop any blobs yet - // because we the watermark is too low + // because the watermark is too low // // This would previously fail let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; assert_eq!(2, report.stale_blobs); - // assert_eq!(&*snapshot.get("a")?.unwrap(), b"neptune".repeat(10_000)); + assert_eq!( + &*tree.get("a", snapshot_seqno)?.unwrap(), + b"neptune".repeat(10_000), + ); assert_eq!( &*tree.get("a", SeqNo::MAX)?.unwrap(), - b"neptune3".repeat(10_000) + b"neptune3".repeat(10_000), ); Ok(()) diff --git a/tests/mvcc_slab.rs b/tests/mvcc_slab.rs index 21d2a0ef..23bd5df4 100644 --- a/tests/mvcc_slab.rs +++ b/tests/mvcc_slab.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SequenceNumberCounter}; use test_log::test; #[test] @@ -8,8 +8,8 @@ fn segment_reader_mvcc_slab() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -45,8 +45,8 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open_as_blob_tree()?; let seqno = SequenceNumberCounter::default(); diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index a8715ac8..d94f6dee 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo}; use test_log::test; const ITEM_COUNT: usize = 1_000; @@ -8,8 +8,8 @@ fn segment_point_reads() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 
0..ITEM_COUNT as u64 { @@ -32,8 +32,8 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { @@ -74,8 +74,8 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let keys = [0, 1, 2] @@ -120,8 +120,8 @@ fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open_as_blob_tree()?; let keys = [0, 1, 2] diff --git a/tests/segment_range.rs b/tests/segment_range.rs index 71e611d5..6824a876 100644 --- a/tests/segment_range.rs +++ b/tests/segment_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, Guard, SeqNo}; use test_log::test; const ITEM_COUNT: usize = 1_000_000; @@ -8,8 +8,8 @@ fn segment_ranges() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { @@ -66,8 +66,8 @@ fn segment_range_last_back() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let value = (0..2_000).map(|_| 0).collect::>(); @@ -101,8 +101,8 @@ fn segment_range_last_back_2() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let value = (0..2_000).map(|_| 0).collect::>(); diff --git a/tests/segment_range_oob.rs b/tests/segment_range_oob.rs index 4b9a8da3..a9ca589e 100644 --- a/tests/segment_range_oob.rs +++ b/tests/segment_range_oob.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo}; use test_log::test; const ITEM_COUNT: usize = 100; @@ -8,8 +8,7 @@ fn segment_range_out_of_bounds_lo() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for key in ('h'..='o').map(|c| c.to_string()) { @@ -32,8 +31,7 @@ fn segment_range_out_of_bounds_hi() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { diff --git a/tests/segment_remove_weak.rs 
b/tests/segment_remove_weak.rs index 1b2b7c72..f1125819 100644 --- a/tests/segment_remove_weak.rs +++ b/tests/segment_remove_weak.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -6,8 +6,8 @@ fn segment_remove_weak_simple() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index dfd63891..d1e8cdd2 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; #[test] @@ -6,8 +6,8 @@ fn snapshot_404() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); @@ -41,8 +41,8 @@ fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let key = "abc"; @@ -80,8 +80,8 @@ fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -137,8 +137,8 @@ fn snapshot_disk_and_memtable_reads() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index 8d9da9a1..e0ac5e00 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; +use lsm_tree::{AbstractTree, Config, Guard, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 100_000; @@ -10,11 +10,18 @@ fn tree_bulk_ingest() -> lsm_tree::Result<()> { let tree = Config::new(folder).open()?; - tree.ingest((0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }))?; + let seqno = SequenceNumberCounter::default(); + let visible_seqno = SequenceNumberCounter::default(); + + tree.ingest( + (0..ITEM_COUNT as u64).map(|x| { + let k = x.to_be_bytes(); + let v = nanoid::nanoid!(); + (k.into(), v.into()) + }), + &seqno, + &visible_seqno, + )?; assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( @@ -38,11 +45,18 @@ fn tree_copy() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let src = Config::new(folder).open()?; - src.ingest((0..ITEM_COUNT as u64).map(|x| { - let k = 
x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }))?; + let seqno = SequenceNumberCounter::default(); + let visible_seqno = SequenceNumberCounter::default(); + + src.ingest( + (0..ITEM_COUNT as u64).map(|x| { + let k = x.to_be_bytes(); + let v = nanoid::nanoid!(); + (k.into(), v.into()) + }), + &seqno, + &visible_seqno, + )?; assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( @@ -65,6 +79,8 @@ fn tree_copy() -> lsm_tree::Result<()> { src.iter(SeqNo::MAX, None) .map(|x| x.into_inner()) .map(|x| x.unwrap()), + &seqno, + &visible_seqno, )?; assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); @@ -93,11 +109,18 @@ fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { .blob_file_separation_threshold(1) .open_as_blob_tree()?; - tree.ingest((0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }))?; + let seqno = SequenceNumberCounter::default(); + let visible_seqno = SequenceNumberCounter::default(); + + tree.ingest( + (0..ITEM_COUNT as u64).map(|x| { + let k = x.to_be_bytes(); + let v = nanoid::nanoid!(); + (k.into(), v.into()) + }), + &seqno, + &visible_seqno, + )?; assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( @@ -124,11 +147,18 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { .blob_file_separation_threshold(1) .open_as_blob_tree()?; - src.ingest((0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }))?; + let seqno = SequenceNumberCounter::default(); + let visible_seqno = SequenceNumberCounter::default(); + + src.ingest( + (0..ITEM_COUNT as u64).map(|x| { + let k = x.to_be_bytes(); + let v = nanoid::nanoid!(); + (k.into(), v.into()) + }), + &seqno, + &visible_seqno, + )?; assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); assert_eq!( @@ -154,6 +184,8 @@ fn blob_tree_copy() -> lsm_tree::Result<()> { src.iter(SeqNo::MAX, None) .map(|x| x.into_inner()) .map(|x| x.unwrap()), + &seqno, + &visible_seqno, )?; assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); diff --git a/tests/tree_different_block_size.rs b/tests/tree_different_block_size.rs index bdd96eeb..6df806de 100644 --- a/tests/tree_different_block_size.rs +++ b/tests/tree_different_block_size.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo, SequenceNumberCounter}; use test_log::test; const ITEM_COUNT: usize = 1_000; @@ -9,8 +9,8 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { { let tree = Config::new(&folder) - .data_block_size(2_048) - .index_block_size(2_048) + .data_block_size_policy(BlockSizePolicy::all(2_048)) + .index_block_size_policy(BlockSizePolicy::all(2_048)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -28,24 +28,24 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { { let tree = Config::new(&folder) - .data_block_size(2_048) - .index_block_size(2_048) + .data_block_size_policy(BlockSizePolicy::all(2_048)) + .index_block_size_policy(BlockSizePolicy::all(2_048)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } { let tree = Config::new(&folder) - .data_block_size(4_096) - .index_block_size(4_096) + .data_block_size_policy(BlockSizePolicy::all(4_096)) + .index_block_size_policy(BlockSizePolicy::all(4_096)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } { let tree = Config::new(&folder) - .data_block_size(78_652) - .index_block_size(78_652) + 
.data_block_size_policy(BlockSizePolicy::all(78_652)) + .index_block_size_policy(BlockSizePolicy::all(78_652)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index 1d25aa5b..f36609e6 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo}; use std::sync::Arc; use test_log::test; @@ -7,8 +7,8 @@ fn tree_disjoint_point_read() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); @@ -36,8 +36,8 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open_as_blob_tree()?; tree.insert("a", "a", 0); @@ -66,8 +66,8 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("z", "z", 0); @@ -121,8 +121,8 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open_as_blob_tree()?; tree.insert("z", "z", 0); diff --git a/tests/tree_non_disjoint_point_read.rs b/tests/tree_non_disjoint_point_read.rs index 55f49858..7935c91c 100644 --- a/tests/tree_non_disjoint_point_read.rs +++ b/tests/tree_non_disjoint_point_read.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, SeqNo}; +use lsm_tree::{config::BlockSizePolicy, AbstractTree, Config, SeqNo}; use test_log::test; #[test] @@ -6,8 +6,8 @@ fn tree_non_disjoint_point_read() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .data_block_size(1_024) - .index_block_size(1_024) + .data_block_size_policy(BlockSizePolicy::all(1_024)) + .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); From 317870d26f1eb4dad761dc2ff3595717a5f1c099 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:19:47 +0200 Subject: [PATCH 451/613] ignore some tests for now --- tests/blob_sep_threshold.rs | 1 + tests/blob_simple.rs | 1 + tests/blob_tree_reload_blob.rs | 2 ++ tests/segment_point_reads.rs | 1 + 4 files changed, 5 insertions(+) diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index dd162d4f..449d2a92 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] +#[ignore] fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index e91cf0e0..5808fb43 100644 --- 
a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] +#[ignore] fn blob_tree_simple() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 54c3744e..8f6daaac 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -4,6 +4,7 @@ use test_log::test; const ITEM_COUNT: usize = 10_000; #[test] +#[ignore] fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -61,6 +62,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_tree_reload() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index d94f6dee..293e1745 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -116,6 +116,7 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { } #[test] +#[ignore] fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); From 2c57b9f9b24603d719c60c626ca7461687f309ed Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:20:13 +0200 Subject: [PATCH 452/613] fix --- src/blob_tree/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index c17e81fe..d6f413d2 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -580,7 +580,7 @@ impl AbstractTree for BlobTree { folder: lsm_segment_folder, } */ )? - .use_data_block_compression(self.index.config.compression); + .use_data_block_compression(self.index.config.data_block_compression_policy.get(0)); // TODO: monkey /* segment_writer = segment_writer.use_bloom_policy( crate::segment::writer::BloomConstructionPolicy::FpRate(0.0001), From 66fde2c7dc238b92f41ec47ebb92cbf7383d90a6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:21:30 +0200 Subject: [PATCH 453/613] fix --- src/tree/ingest.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index b0aaf9e6..6e4c2359 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -23,14 +23,14 @@ impl<'a> Ingestion<'a> { let folder = tree.config.path.join(crate::file::SEGMENTS_FOLDER); log::debug!("Ingesting into disk segments in {}", folder.display()); + // TODO: 3.0.0 look at tree configuration let writer = MultiWriter::new( folder.clone(), tree.segment_id_counter.clone(), - 64 * 1_024 * 1_024, // TODO: look at tree configuration + 64 * 1_024 * 1_024, )? - // TODO: use restart interval etc. 
.use_data_block_hash_ratio(tree.config.data_block_hash_ratio) - .use_data_block_compression(tree.config.compression); + .use_data_block_compression(tree.config.data_block_compression_policy.get(6)); Ok(Self { folder, From 51de6cd8cbd6b99b14ddde332ea02ac81e9202af Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:23:37 +0200 Subject: [PATCH 454/613] wip --- tests/tree_bulk_ingest.rs | 382 +++++++++++++++++++------------------- 1 file changed, 194 insertions(+), 188 deletions(-) diff --git a/tests/tree_bulk_ingest.rs b/tests/tree_bulk_ingest.rs index e0ac5e00..70ce3eea 100644 --- a/tests/tree_bulk_ingest.rs +++ b/tests/tree_bulk_ingest.rs @@ -6,202 +6,208 @@ const ITEM_COUNT: usize = 100_000; #[test] #[ignore] fn tree_bulk_ingest() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(folder).open()?; - - let seqno = SequenceNumberCounter::default(); - let visible_seqno = SequenceNumberCounter::default(); - - tree.ingest( - (0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }), - &seqno, - &visible_seqno, - )?; - - assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - tree.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - - Ok(()) + todo!() + + // let folder = tempfile::tempdir()?; + + // let tree = Config::new(folder).open()?; + + // let seqno = SequenceNumberCounter::default(); + // let visible_seqno = SequenceNumberCounter::default(); + + // tree.ingest( + // (0..ITEM_COUNT as u64).map(|x| { + // let k = x.to_be_bytes(); + // let v = nanoid::nanoid!(); + // (k.into(), v.into()) + // }), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // tree.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + + // Ok(()) } #[test] #[ignore] fn tree_copy() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - let src = Config::new(folder).open()?; - - let seqno = SequenceNumberCounter::default(); - let visible_seqno = SequenceNumberCounter::default(); - - src.ingest( - (0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }), - &seqno, - &visible_seqno, - )?; - - assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - src.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - assert!(src.lock_active_memtable().is_empty()); - - let folder = tempfile::tempdir()?; - let dest = Config::new(folder).open()?; - - dest.ingest( - src.iter(SeqNo::MAX, None) - .map(|x| x.into_inner()) - .map(|x| x.unwrap()), - &seqno, - &visible_seqno, - )?; - - assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - dest.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - assert!(dest.lock_active_memtable().is_empty()); - - Ok(()) -} - -#[test] -#[ignore] -fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(folder) - 
.blob_file_separation_threshold(1) - .open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - let visible_seqno = SequenceNumberCounter::default(); - - tree.ingest( - (0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }), - &seqno, - &visible_seqno, - )?; - - assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - tree.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - assert_eq!(1, tree.blob_file_count()); - - Ok(()) + todo!() + + // let folder = tempfile::tempdir()?; + // let src = Config::new(folder).open()?; + + // let seqno = SequenceNumberCounter::default(); + // let visible_seqno = SequenceNumberCounter::default(); + + // src.ingest( + // (0..ITEM_COUNT as u64).map(|x| { + // let k = x.to_be_bytes(); + // let v = nanoid::nanoid!(); + // (k.into(), v.into()) + // }), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // src.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + // assert!(src.lock_active_memtable().is_empty()); + + // let folder = tempfile::tempdir()?; + // let dest = Config::new(folder).open()?; + + // dest.ingest( + // src.iter(SeqNo::MAX, None) + // .map(|x| x.into_inner()) + // .map(|x| x.unwrap()), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // dest.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + // assert!(dest.lock_active_memtable().is_empty()); + + // Ok(()) + // } + + // #[test] + // #[ignore] + // fn blob_tree_bulk_ingest() -> lsm_tree::Result<()> { + // let folder = tempfile::tempdir()?; + + // let tree = Config::new(folder) + // .blob_file_separation_threshold(1) + // .open_as_blob_tree()?; + + // let seqno = SequenceNumberCounter::default(); + // let visible_seqno = SequenceNumberCounter::default(); + + // tree.ingest( + // (0..ITEM_COUNT as u64).map(|x| { + // let k = x.to_be_bytes(); + // let v = nanoid::nanoid!(); + // (k.into(), v.into()) + // }), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // tree.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + // assert_eq!(1, tree.blob_file_count()); + + // Ok(()) } #[test] #[ignore] fn blob_tree_copy() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - let src = Config::new(folder) - .blob_file_separation_threshold(1) - .open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - let visible_seqno = SequenceNumberCounter::default(); - - src.ingest( - (0..ITEM_COUNT as u64).map(|x| { - let k = x.to_be_bytes(); - let v = nanoid::nanoid!(); - (k.into(), v.into()) - }), - &seqno, - &visible_seqno, - )?; - - assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - 
src.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - assert!(src.lock_active_memtable().is_empty()); - assert_eq!(1, src.blob_file_count()); - - let folder = tempfile::tempdir()?; - let dest = Config::new(folder) - .blob_file_separation_threshold(1) - .open_as_blob_tree()?; - - dest.ingest( - src.iter(SeqNo::MAX, None) - .map(|x| x.into_inner()) - .map(|x| x.unwrap()), - &seqno, - &visible_seqno, - )?; - - assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); - assert_eq!( - dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), - ITEM_COUNT, - ); - assert_eq!( - dest.iter(SeqNo::MAX, None) - .rev() - .flat_map(|x| x.key()) - .count(), - ITEM_COUNT, - ); - assert!(dest.lock_active_memtable().is_empty()); - assert_eq!(1, dest.blob_file_count()); - - Ok(()) + todo!() + + // let folder = tempfile::tempdir()?; + // let src = Config::new(folder) + // .blob_file_separation_threshold(1) + // .open_as_blob_tree()?; + + // let seqno = SequenceNumberCounter::default(); + // let visible_seqno = SequenceNumberCounter::default(); + + // src.ingest( + // (0..ITEM_COUNT as u64).map(|x| { + // let k = x.to_be_bytes(); + // let v = nanoid::nanoid!(); + // (k.into(), v.into()) + // }), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(src.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // src.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // src.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + // assert!(src.lock_active_memtable().is_empty()); + // assert_eq!(1, src.blob_file_count()); + + // let folder = tempfile::tempdir()?; + // let dest = Config::new(folder) + // .blob_file_separation_threshold(1) + // .open_as_blob_tree()?; + + // dest.ingest( + // src.iter(SeqNo::MAX, None) + // .map(|x| x.into_inner()) + // .map(|x| x.unwrap()), + // &seqno, + // &visible_seqno, + // )?; + + // assert_eq!(dest.len(SeqNo::MAX, None)?, ITEM_COUNT); + // assert_eq!( + // dest.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), + // ITEM_COUNT, + // ); + // assert_eq!( + // dest.iter(SeqNo::MAX, None) + // .rev() + // .flat_map(|x| x.key()) + // .count(), + // ITEM_COUNT, + // ); + // assert!(dest.lock_active_memtable().is_empty()); + // assert_eq!(1, dest.blob_file_count()); + + // Ok(()) } From 241373a493e9ba3e7dafd5c8feede734fbcadf98 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:25:42 +0200 Subject: [PATCH 455/613] wip --- tests/blob_simple.rs | 50 +++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 5808fb43..308bdc6c 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -62,40 +62,42 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { #[test] #[ignore = "wip"] fn blob_tree_simple_compressed() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - let path = folder.path(); + todo!() - let tree = lsm_tree::Config::new(path) - .compression(lsm_tree::CompressionType::Lz4) - .open_as_blob_tree()?; + // let folder = tempfile::tempdir()?; + // let path = folder.path(); - let big_value = b"neptune!".repeat(128_000); + // let tree = lsm_tree::Config::new(path) + // .compression(lsm_tree::CompressionType::Lz4) + // .open_as_blob_tree()?; - assert!(tree.get("big", SeqNo::MAX)?.is_none()); - tree.insert("big", &big_value, 0); - tree.insert("smol", "small value", 0); + // let big_value = b"neptune!".repeat(128_000); 
- let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, big_value); + // assert!(tree.get("big", SeqNo::MAX)?.is_none()); + // tree.insert("big", &big_value, 0); + // tree.insert("smol", "small value", 0); - tree.flush_active_memtable(0)?; + // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + // assert_eq!(&*value, big_value); - let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, big_value); + // tree.flush_active_memtable(0)?; - let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, b"small value"); + // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + // assert_eq!(&*value, big_value); - let new_big_value = b"winter!".repeat(128_000); - tree.insert("big", &new_big_value, 1); + // let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); + // assert_eq!(&*value, b"small value"); - let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, new_big_value); + // let new_big_value = b"winter!".repeat(128_000); + // tree.insert("big", &new_big_value, 1); - tree.flush_active_memtable(0)?; + // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + // assert_eq!(&*value, new_big_value); - let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - assert_eq!(&*value, new_big_value); + // tree.flush_active_memtable(0)?; - Ok(()) + // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + // assert_eq!(&*value, new_big_value); + + // Ok(()) } From 23ebed2ae9215ca6fa1427fb443bb5c1a1435a53 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:26:27 +0200 Subject: [PATCH 456/613] version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 81ff0eb4..e00e7039 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. 
implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "3.0.0" +version = "3.0.0-pre.0" edition = "2021" rust-version = "1.82.0" readme = "README.md" From 29e64972531edb7989d01faa1bbe06a0d746c6ef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:39:17 +0200 Subject: [PATCH 457/613] fix: test --- src/segment/multi_writer.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index c4c55561..779c415a 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -215,7 +215,7 @@ impl MultiWriter { #[cfg(test)] mod tests { - use crate::{AbstractTree, Config, SeqNo}; + use crate::{config::CompressionPolicy, AbstractTree, Config, SeqNo}; use test_log::test; // NOTE: Tests that versions of the same key stay @@ -228,7 +228,10 @@ mod tests { fn segment_multi_writer_same_key_norotate() -> crate::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder).open()?; + let tree = Config::new(&folder) + .data_block_compression_policy(CompressionPolicy::all(crate::CompressionType::None)) + .index_block_compression_policy(CompressionPolicy::all(crate::CompressionType::None)) + .open()?; tree.insert("a", "a1".repeat(4_000), 0); tree.insert("a", "a2".repeat(4_000), 1); @@ -253,7 +256,10 @@ mod tests { fn segment_multi_writer_same_key_norotate_2() -> crate::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(&folder).open()?; + let tree = Config::new(&folder) + .data_block_compression_policy(CompressionPolicy::all(crate::CompressionType::None)) + .index_block_compression_policy(CompressionPolicy::all(crate::CompressionType::None)) + .open()?; tree.insert("a", "a1".repeat(4_000), 0); tree.insert("a", "a1".repeat(4_000), 1); From e2b1969cb13e6854e2882486ad506a225e60dd20 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:42:12 +0200 Subject: [PATCH 458/613] fix --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 85eb9d09..16895c73 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,7 +53,7 @@ jobs: - name: Run tests run: cargo nextest run --all-features - name: Run doc tests - run: cargo test --doc + run: cargo test --doc --features lz4 - name: Build & test LSM examples run: node compile_examples.mjs cross: From 227708dce149814c300ca2facc30272d40a9cad4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 19:47:44 +0200 Subject: [PATCH 459/613] use byteview release --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e00e7039..86a56230 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ use_unsafe = [] # TODO: 3.0.0 remove [dependencies] bytes = { version = "1", optional = true } byteorder = { package = "byteorder-lite", version = "0.1.0" } -byteview = { git = "https://github.com/fjall-rs/byteview" } +byteview = "~0.8.0" crossbeam-skiplist = "0.1.3" enum_dispatch = "0.3.13" interval-heap = "0.0.5" From e2de104a9da0a434fe61547982793f1776ad9082 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 20:05:59 +0200 Subject: [PATCH 460/613] wip ingestion --- src/abstract.rs | 9 ++++-- src/lib.rs | 1 + src/tree/ingest.rs | 80 ++++++++++++++++++++++++++++++++++++---------- src/tree/mod.rs | 24 ++++++++++---- 4 files changed, 90 insertions(+), 24 deletions(-) diff --git 
a/src/abstract.rs b/src/abstract.rs index 73abf04d..d9685605 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -5,7 +5,7 @@ use crate::{ compaction::CompactionStrategy, config::TreeType, iter_guard::IterGuardImpl, segment::Segment, tree::inner::MemtableId, vlog::BlobFile, AnyTree, BlobTree, Config, Guard, KvPair, Memtable, - SegmentId, SeqNo, Tree, UserKey, UserValue, + SegmentId, SeqNo, SequenceNumberCounter, Tree, UserKey, UserValue, }; use enum_dispatch::enum_dispatch; use std::{ @@ -64,7 +64,12 @@ pub trait AbstractTree { /// /// Will panic if the input iterator is not sorted in ascending order. #[doc(hidden)] - fn ingest(&self, iter: impl Iterator<Item = (UserKey, UserValue)>) -> crate::Result<()>; + fn ingest( + &self, + iter: impl Iterator<Item = (UserKey, UserValue)>, + seqno_generator: &SequenceNumberCounter, + visible_seqno: &SequenceNumberCounter, + ) -> crate::Result<()>; /// Returns the approximate number of tombstones in the tree. fn tombstone_count(&self) -> u64; diff --git a/src/lib.rs b/src/lib.rs index a6ded8ac..58ad5cc9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -212,6 +212,7 @@ pub use key_range::KeyRange; pub use { merge::BoxedIterator, segment::{block::Checksum, GlobalSegmentId, Segment, SegmentId}, + tree::ingest::Ingestion, tree::inner::TreeId, value::InternalValue, }; diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 6e4c2359..bdb34b5d 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -3,23 +3,29 @@ // (found in the LICENSE-* files in the repository) use super::Tree; -use crate::{segment::multi_writer::MultiWriter, AbstractTree, UserKey, UserValue}; -use std::path::PathBuf; +use crate::{ + compaction::MoveDown, segment::multi_writer::MultiWriter, AbstractTree, SeqNo, UserKey, + UserValue, +}; +use std::{path::PathBuf, sync::Arc}; +/// Bulk ingestion +/// +/// Items NEED to be added in ascending key order. pub struct Ingestion<'a> { folder: PathBuf, tree: &'a Tree, writer: MultiWriter, + seqno: SeqNo, } impl<'a> Ingestion<'a> { + /// Creates a new ingestion. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. pub fn new(tree: &'a Tree) -> crate::Result<Self> { - assert_eq!( - 0, - tree.segment_count(), - "can only perform bulk_ingest on empty trees", - ); - let folder = tree.config.path.join(crate::file::SEGMENTS_FOLDER); log::debug!("Ingesting into disk segments in {}", folder.display()); @@ -36,43 +42,77 @@ impl<'a> Ingestion<'a> { folder, tree, writer, + seqno: 0, }) } + /// Sets the ingestion seqno. + #[must_use] + pub fn with_seqno(mut self, seqno: SeqNo) -> Self { + self.seqno = seqno; + self + } + + /// Writes a key-value pair. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. pub fn write(&mut self, key: UserKey, value: UserValue) -> crate::Result<()> { self.writer.write(crate::InternalValue::from_components( key, value, - 0, + self.seqno, crate::ValueType::Value, )) } + /// Writes a tombstone for a key. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + pub fn write_tombstone(&mut self, key: UserKey) -> crate::Result<()> { + self.writer.write(crate::InternalValue::from_components( + key, + crate::UserValue::empty(), + self.seqno, + crate::ValueType::Tombstone, + )) + } + + /// Finishes the ingestion. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs.
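Taken together, the ingest.rs changes above give `Ingestion` a builder-style API: construct it, optionally pin a seqno, stream key-sorted pairs, then finish (the `finish` method follows directly below). A minimal sketch of how it could be driven, assuming the `From<&str>` conversions for `UserKey`/`UserValue` and the `tempfile` dev-dependency already used throughout the test suite:

    use lsm_tree::{Config, Ingestion};

    fn bulk_load_example() -> lsm_tree::Result<()> {
        let folder = tempfile::tempdir()?;
        let tree = Config::new(&folder).open()?;

        // One seqno covers the whole batch; keys must arrive in
        // ascending order, per the doc comment above.
        let mut ingestion = Ingestion::new(&tree)?.with_seqno(0);
        ingestion.write("a".into(), "value-a".into())?;
        ingestion.write("b".into(), "value-b".into())?;
        ingestion.write_tombstone("c".into())?;
        ingestion.finish()?;

        Ok(())
    }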
pub fn finish(self) -> crate::Result<()> { - use crate::{compaction::MoveDown, Segment}; - use std::sync::Arc; + use crate::Segment; let results = self.writer.finish()?; log::info!("Finished ingestion writer"); + let pin_filter = self.tree.config.filter_block_pinning_policy.get(6); + let pin_index = self.tree.config.filter_block_pinning_policy.get(6); + let created_segments = results .into_iter() .map(|segment_id| -> crate::Result { - // TODO: look at tree configuration - // TODO: segment recoverer struct w/ builder pattern // Segment::recover() // .pin_filters(true) // .with_metrics(metrics) // .run(path, tree_id, cache, descriptor_table); + Segment::recover( self.folder.join(segment_id.to_string()), self.tree.id, self.tree.config.cache.clone(), self.tree.config.descriptor_table.clone(), - false, - false, + pin_filter, + pin_index, #[cfg(feature = "metrics")] self.tree.metrics.clone(), ) @@ -81,7 +121,15 @@ impl<'a> Ingestion<'a> { self.tree.register_segments(&created_segments, None, 0)?; - self.tree.compact(Arc::new(MoveDown(0, 2)), 0)?; + let last_level_idx = self + .tree + .manifest + .read() + .expect("lock is poisoned") + .last_level_index(); + + self.tree + .compact(Arc::new(MoveDown(0, last_level_idx)), 0)?; Ok(()) } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index cfbaefbd..1083570e 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -18,7 +18,8 @@ use crate::{ segment::Segment, value::InternalValue, vlog::BlobFile, - AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, UserKey, UserValue, ValueType, + AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, UserKey, + UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ @@ -118,18 +119,27 @@ impl AbstractTree for Tree { .sum() } - fn ingest(&self, iter: impl Iterator) -> crate::Result<()> { + fn ingest( + &self, + iter: impl Iterator, + seqno_generator: &SequenceNumberCounter, + visible_seqno: &SequenceNumberCounter, + ) -> crate::Result<()> { use crate::tree::ingest::Ingestion; use std::time::Instant; // NOTE: Lock active memtable so nothing else can be going on while we are bulk loading - let lock = self.lock_active_memtable(); + let memtable_lock = self.lock_active_memtable(); + + let seqno = seqno_generator.next(); + + // TODO: allow ingestion always, by flushing memtable assert!( - lock.is_empty(), - "can only perform bulk_ingest on empty trees", + memtable_lock.is_empty(), + "can only perform bulk ingestion with empty memtable", ); - let mut writer = Ingestion::new(self)?; + let mut writer = Ingestion::new(self)?.with_seqno(seqno); let start = Instant::now(); let mut count = 0; @@ -151,6 +161,8 @@ impl AbstractTree for Tree { writer.finish()?; + visible_seqno.fetch_max(seqno + 1); + log::info!("Ingested {count} items in {:?}", start.elapsed()); Ok(()) From 216ee2655d6c1cdb3cd4d60d85c7446e8c61595b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 20:08:19 +0200 Subject: [PATCH 461/613] fix --- src/blob_tree/mod.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index d6f413d2..d9d525aa 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -436,10 +436,18 @@ impl AbstractTree for BlobTree { self.index.drop_range(key_range) } - fn ingest(&self, iter: impl Iterator) -> crate::Result<()> { + fn ingest( + &self, + iter: impl Iterator, + seqno_generator: &SequenceNumberCounter, + visible_seqno: &SequenceNumberCounter, + ) -> crate::Result<()> { 
use crate::tree::ingest::Ingestion; use std::time::Instant; + // TODO: take curr seqno for ingest, HOWEVER + // TODO: we need to take the next seqno AFTER locking the memtable + todo!(); // // NOTE: Lock active memtable so nothing else can be going on while we are bulk loading From 1dd4c1639a29bad1415a50b96e178c404e2e93cd Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 22:53:18 +0200 Subject: [PATCH 462/613] refactor: checksum error variant --- src/error.rs | 12 +++++++++--- src/segment/block/mod.rs | 15 +++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/error.rs b/src/error.rs index 39e17ab9..85d81534 100644 --- a/src/error.rs +++ b/src/error.rs @@ -30,8 +30,14 @@ pub enum Error { /// Some required segments could not be recovered from disk Unrecoverable, - /// Invalid checksum value (got, expected) - InvalidChecksum((Checksum, Checksum)), + /// Checksum mismatch + ChecksumMismatch { + /// Checksum of loaded block + got: Checksum, + + /// Checksum that was saved in block header + expected: Checksum, + }, } impl std::fmt::Display for Error { @@ -49,7 +55,7 @@ impl std::error::Error for Error { Self::Decompress(_) | Self::InvalidVersion(_) | Self::Unrecoverable - | Self::InvalidChecksum(_) => None, + | Self::ChecksumMismatch { .. } => None, } } } diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 47998152..10edc744 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -110,12 +110,16 @@ impl Block { let checksum = Checksum::from_raw(crate::hash::hash128(&data)); if checksum != header.checksum { - log::warn!( + log::error!( "Checksum mismatch for , got={}, expected={}", *checksum, *header.checksum, ); - return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + + return Err(crate::Error::ChecksumMismatch { + got: checksum, + expected: header.checksum, + }); } Ok(Self { header, data }) @@ -159,13 +163,16 @@ impl Block { let checksum = Checksum::from_raw(crate::hash::hash128(&buf)); if checksum != header.checksum { - log::warn!( + log::error!( "Checksum mismatch for block {handle:?}, got={}, expected={}", *checksum, *header.checksum, ); - return Err(crate::Error::InvalidChecksum((checksum, header.checksum))); + return Err(crate::Error::ChecksumMismatch { + got: checksum, + expected: header.checksum, + }); } Ok(Self { header, data: buf }) From 29c0515fae59a4fbe399cbc03f5f015df02c978d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 23:21:08 +0200 Subject: [PATCH 463/613] data block hash ratio policy --- src/compaction/worker.rs | 27 ++++++++++++--------------- src/config/hash_ratio.rs | 40 ++++++++++++++++++++++++++++++++++++++++ src/config/mod.rs | 38 +++++++++++++++++--------------------- src/tree/ingest.rs | 26 ++++++++++++++++++++++---- src/tree/mod.rs | 4 +++- 5 files changed, 94 insertions(+), 41 deletions(-) create mode 100644 src/config/hash_ratio.rs diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 40c66bfb..ca3730cf 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -222,24 +222,21 @@ fn merge_segments( let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER); - let data_block_size = opts.config.data_block_size_policy.get(0); - let index_block_size = opts.config.index_block_size_policy.get(0); + let dst_lvl = payload.canonical_level.into(); - let data_block_restart_interval = opts.config.data_block_restart_interval_policy.get(0); - let index_block_restart_interval = 
opts.config.index_block_restart_interval_policy.get(0); + let data_block_size = opts.config.data_block_size_policy.get(dst_lvl); + let index_block_size = opts.config.index_block_size_policy.get(dst_lvl); + let data_block_restart_interval = opts.config.data_block_restart_interval_policy.get(dst_lvl); + let index_block_restart_interval = opts.config.index_block_restart_interval_policy.get(dst_lvl); - let data_block_compression = opts - .config - .data_block_compression_policy - .get(payload.canonical_level.into()); + let data_block_compression = opts.config.data_block_compression_policy.get(dst_lvl); + let index_block_compression = opts.config.index_block_compression_policy.get(dst_lvl); - let index_block_compression = opts - .config - .index_block_compression_policy - .get(payload.canonical_level.into()); + let pin_filter = opts.config.filter_block_pinning_policy.get(dst_lvl); + let pin_index = opts.config.filter_block_pinning_policy.get(dst_lvl); - let pin_filter = opts.config.filter_block_pinning_policy.get(0); - let pin_index = opts.config.filter_block_pinning_policy.get(0); + let data_block_hash_ratio = opts.config.data_block_hash_ratio_policy.get(dst_lvl); log::debug!( "Compacting segments {:?} into L{} (canonical L{}), data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}", @@ -300,7 +297,7 @@ fn merge_segments( .use_data_block_compression(data_block_compression) .use_data_block_size(data_block_size) .use_index_block_size(index_block_size) - .use_data_block_hash_ratio(opts.config.data_block_hash_ratio) + .use_data_block_hash_ratio(data_block_hash_ratio) .use_index_block_compression(index_block_compression) .use_bloom_policy({ use crate::config::FilterPolicyEntry::{Bloom, None}; diff --git a/src/config/hash_ratio.rs b/src/config/hash_ratio.rs new file mode 100644 index 00000000..6ffbb860 --- /dev/null +++ b/src/config/hash_ratio.rs @@ -0,0 +1,40 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +/// Hash ratio policy +#[derive(Debug, Clone, PartialEq)] +pub struct HashRatioPolicy(Vec<f32>); + +impl std::ops::Deref for HashRatioPolicy { + type Target = [f32]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl HashRatioPolicy { + pub(crate) fn get(&self, level: usize) -> f32 { + self.0 + .get(level) + .copied() + .unwrap_or_else(|| self.last().copied().expect("policy should not be empty")) + } + + // TODO: accept Vec<f32>... Into<Vec<f32>>? or owned + + /// Uses the same hash ratio in every level. + #[must_use] + pub fn all(c: f32) -> Self { + Self(vec![c]) + } + + /// Constructs a custom hash ratio policy. + #[must_use] + pub fn new(policy: &[f32]) -> Self { + assert!(!policy.is_empty(), "hash ratio policy may not be empty"); + assert!(policy.len() <= 255, "hash ratio policy is too large"); + Self(policy.into()) + } +} diff --git a/src/config/mod.rs b/src/config/mod.rs index 0b890d9f..47b54933 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -5,12 +5,14 @@ mod block_size; mod compression; mod filter; +mod hash_ratio; mod pinning; mod restart_interval; pub use block_size::BlockSizePolicy; pub use compression::CompressionPolicy; pub use filter::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; +pub use hash_ratio::HashRatioPolicy; pub use pinning::PinningPolicy; pub use restart_interval::RestartIntervalPolicy; @@ -87,9 +89,6 @@ pub struct Config { /// Restart interval inside index blocks pub index_block_restart_interval_policy: RestartIntervalPolicy, - /// Hash bytes per key in data blocks - pub data_block_hash_ratio: f32, - /// Block size of data blocks pub data_block_size_policy: BlockSizePolicy, @@ -102,6 +101,9 @@ pub struct Config { /// Whether to pin filter blocks pub filter_block_pinning_policy: PinningPolicy, + /// Data block hash ratio + pub data_block_hash_ratio_policy: HashRatioPolicy, + /// If `true`, the last level will not build filters, reducing the filter size of a database /// by ~90% typically pub(crate) expect_point_read_hits: bool, @@ -132,8 +134,6 @@ impl Default for Config { data_block_restart_interval_policy: RestartIntervalPolicy::all(16), index_block_restart_interval_policy: RestartIntervalPolicy::all(1), - data_block_hash_ratio: 0.0, - level_count: 7, tree_type: TreeType::Standard, @@ -146,6 +146,8 @@ impl Default for Config { data_block_compression_policy: CompressionPolicy::default(), index_block_compression_policy: CompressionPolicy::all(CompressionType::None), + data_block_hash_ratio_policy: HashRatioPolicy::all(0.0), + blob_compression: CompressionType::None, filter_policy: FilterPolicy::default(), @@ -234,22 +236,6 @@ impl Config { self } - /// Sets the hash ratio for the hash index in data blocks. - /// - /// The hash index speeds up point queries by using an embedded - /// hash map in data blocks, but uses more space/memory. - /// - /// Something along the lines of 1.0 - 2.0 is sensible. - /// - /// If 0, the hash index is not constructed. - /// - /// Default = 0.0 - #[must_use] - pub fn data_block_hash_ratio(mut self, ratio: f32) -> Self { - self.data_block_hash_ratio = ratio; // TODO: policy - self - } - /// Sets the filter construction policy. #[must_use] pub fn filter_policy(mut self, policy: FilterPolicy) -> Self { @@ -309,6 +295,16 @@ impl Config { self } + /// Sets the hash ratio policy for data blocks. + /// + /// If greater than 0.0, a hash index is embedded into data blocks that can speed up reads + /// inside the data block. + #[must_use] + pub fn data_block_hash_ratio_policy(mut self, policy: HashRatioPolicy) -> Self { + self.data_block_hash_ratio_policy = policy; + self + } + /// Sets the target size of blob files. /// /// Smaller blob files allow more granular garbage collection diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index bdb34b5d..6cf1c698 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -9,6 +9,8 @@ use crate::{ }; use std::{path::PathBuf, sync::Arc}; +pub const INITIAL_CANONICAL_LEVEL: usize = 1; + /// Bulk ingestion /// /// Items NEED to be added in ascending key order. @@ -35,8 +37,16 @@ impl<'a> Ingestion<'a> { tree.segment_id_counter.clone(), 64 * 1_024 * 1_024, )?
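The per-level policy types (`BlockSizePolicy`, `CompressionPolicy`, `RestartIntervalPolicy`, and now `HashRatioPolicy`) all share the lookup rule implemented in `get` above: index by level, and reuse the last entry for any deeper level. A small in-crate test sketch of those semantics, with hypothetical values (`get` is `pub(crate)`, so this only compiles inside the crate):

    #[test]
    fn hash_ratio_policy_last_entry_fallback() {
        use crate::config::HashRatioPolicy;

        // Spend hash-index bytes on the hot upper levels only
        let policy = HashRatioPolicy::new(&[2.0, 1.0, 0.0]);

        assert_eq!(2.0, policy.get(0));
        assert_eq!(1.0, policy.get(1));
        assert_eq!(0.0, policy.get(2));

        // Levels past the end of the vector reuse the last entry,
        // so deep levels keep a ratio of 0.0 (no hash index)
        assert_eq!(0.0, policy.get(6));
    }

At the user level this is wired up through `Config::data_block_hash_ratio_policy`, as added in src/config/mod.rs above.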
- .use_data_block_hash_ratio(tree.config.data_block_hash_ratio) - .use_data_block_compression(tree.config.data_block_compression_policy.get(6)); + .use_data_block_hash_ratio( + tree.config + .data_block_hash_ratio_policy + .get(INITIAL_CANONICAL_LEVEL), + ) + .use_data_block_compression( + tree.config + .data_block_compression_policy + .get(INITIAL_CANONICAL_LEVEL), + ); Ok(Self { folder, @@ -94,8 +104,16 @@ impl<'a> Ingestion<'a> { log::info!("Finished ingestion writer"); - let pin_filter = self.tree.config.filter_block_pinning_policy.get(6); - let pin_index = self.tree.config.filter_block_pinning_policy.get(6); + let pin_filter = self + .tree + .config + .filter_block_pinning_policy + .get(INITIAL_CANONICAL_LEVEL); + let pin_index = self + .tree + .config + .filter_block_pinning_policy + .get(INITIAL_CANONICAL_LEVEL); let created_segments = results .into_iter() diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 1083570e..ada3a66d 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -291,6 +291,8 @@ impl AbstractTree for Tree { let data_block_compression = self.config.data_block_compression_policy.get(0); let index_block_compression = self.config.index_block_compression_policy.get(0); + let data_block_hash_ratio = self.config.data_block_hash_ratio_policy.get(0); + log::debug!( "Flushing segment to {}, data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}", segment_file_path.display(), @@ -303,7 +305,7 @@ impl AbstractTree for Tree { .use_index_block_compression(index_block_compression) .use_data_block_size(data_block_size) .use_index_block_size(index_block_size) - .use_data_block_hash_ratio(self.config.data_block_hash_ratio) + .use_data_block_hash_ratio(data_block_hash_ratio) .use_bloom_policy({ use crate::config::FilterPolicyEntry::{Bloom, None}; use crate::segment::filter::BloomConstructionPolicy; From 28855db58159b1cc2c2a94218c2bbda61a138360 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 24 Sep 2025 23:47:17 +0200 Subject: [PATCH 464/613] clippy --- src/file.rs | 10 +- src/segment/block/decoder.rs | 2 +- src/segment/block_index/mod.rs | 2 +- src/segment/data_block/iter.rs | 123 ++++++++++++------- src/segment/data_block/mod.rs | 25 +++- src/segment/filter/blocked_bloom/builder.rs | 2 + src/segment/filter/standard_bloom/builder.rs | 4 + src/segment/index_block/mod.rs | 11 +- src/segment/mod.rs | 2 +- src/segment/writer/mod.rs | 38 +++--- src/tree/mod.rs | 4 +- 11 files changed, 150 insertions(+), 73 deletions(-) diff --git a/src/file.rs b/src/file.rs index 68ef1cf0..906d7b18 100644 --- a/src/file.rs +++ b/src/file.rs @@ -13,7 +13,15 @@ pub const BLOBS_FOLDER: &str = "blobs"; /// Reads bytes from a file using `pread`. 
pub fn read_exact(file: &File, offset: u64, size: usize) -> std::io::Result { - #[warn(unsafe_code)] + // SAFETY: This slice builder starts uninitialized, but we know its length + // + // We use read_at/seek_read which give us the number of bytes read + // If that number does not match the slice length, the function panics (for now), + // so the (partially) uninitialized buffer is discarded + // + // Additionally, generally, block loads furthermore do a checksum check which + // would likely catch the buffer being wrong somehow + #[allow(unsafe_code)] let mut builder = unsafe { Slice::builder_unzeroed(size) }; { diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 3b854541..a950d230 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{binary_index::Reader as BinaryIndexReader, hash_index::Reader as HashIndexReader}; +use super::binary_index::Reader as BinaryIndexReader; use crate::{ segment::{block::Trailer, Block}, unwrap, Slice, diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 99ade210..de0fa6c5 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -pub(crate) mod iter; +pub mod iter; use super::{CachePolicy, IndexBlock, KeyedBlockHandle}; use crate::segment::block::ParsedItem; diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 35d230cb..f02b45a7 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -119,6 +119,7 @@ impl DoubleEndedIterator for Iter<'_> { } #[cfg(test)] +#[allow(clippy::expect_used)] mod tests { use crate::{ segment::{ @@ -1020,7 +1021,6 @@ mod tests { } #[test] - #[allow(clippy::unwrap_used)] fn v3_data_block_iter_consume_last_back() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), @@ -1052,11 +1052,26 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); + assert_eq!( + b"pla:earth:fact", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next().expect("should exist").key.user_key, + ); assert!(iter.next_back().is_none()); assert!(iter.next().is_none()); } @@ -1066,13 +1081,25 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"pla:earth:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:fact", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:mass", &*iter.next().unwrap().key.user_key); - assert_eq!(b"pla:jupiter:name", &*iter.next().unwrap().key.user_key); 
+ assert_eq!( + b"pla:earth:fact", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:fact", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:mass", + &*iter.next().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:name", + &*iter.next().expect("should exist").key.user_key, + ); assert_eq!( b"pla:jupiter:radius", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, ); assert!(iter.next().is_none()); assert!(iter.next_back().is_none()); @@ -1083,7 +1110,6 @@ mod tests { } #[test] - #[allow(clippy::unwrap_used)] fn v3_data_block_iter_consume_last_forwards() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), @@ -1116,20 +1142,26 @@ mod tests { .rev() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:earth:fact", + &*iter.next_back().expect("should exist").key.user_key, + ); assert_eq!( b"pla:jupiter:fact", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, ); assert_eq!( b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, ); assert_eq!( b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next().expect("should exist").key.user_key, ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); assert!(iter.next().is_none()); assert!(iter.next_back().is_none()); } @@ -1140,20 +1172,26 @@ mod tests { .rev() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"pla:earth:fact", &*iter.next_back().unwrap().key.user_key); + assert_eq!( + b"pla:earth:fact", + &*iter.next_back().expect("should exist").key.user_key, + ); assert_eq!( b"pla:jupiter:fact", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, ); assert_eq!( b"pla:jupiter:mass", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, ); assert_eq!( b"pla:jupiter:name", - &*iter.next_back().unwrap().key.user_key + &*iter.next_back().expect("should exist").key.user_key, + ); + assert_eq!( + b"pla:jupiter:radius", + &*iter.next().expect("should exist").key.user_key, ); - assert_eq!(b"pla:jupiter:radius", &*iter.next().unwrap().key.user_key); assert!(iter.next_back().is_none()); assert!(iter.next().is_none()); } @@ -1163,7 +1201,6 @@ mod tests { } #[test] - #[allow(clippy::unwrap_used)] fn v3_data_block_iter_ping_pong_exhaust() -> crate::Result<()> { let items = [ InternalValue::from_components("a", "a", 0, Value), @@ -1195,11 +1232,11 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"a", &*iter.next().unwrap().key.user_key); - assert_eq!(b"b", &*iter.next().unwrap().key.user_key); - assert_eq!(b"c", &*iter.next().unwrap().key.user_key); - assert_eq!(b"d", &*iter.next().unwrap().key.user_key); - assert_eq!(b"e", &*iter.next().unwrap().key.user_key); + assert_eq!(b"a", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"b", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"c", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"d", &*iter.next().expect("should exist").key.user_key); + 
assert_eq!(b"e", &*iter.next().expect("should exist").key.user_key); assert!(iter.next().is_none()); assert!(iter.next().is_none()); } @@ -1209,11 +1246,11 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"b", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"a", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"e", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"d", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"c", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"b", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"a", &*iter.next_back().expect("should exist").key.user_key); assert!(iter.next_back().is_none()); assert!(iter.next_back().is_none()); } @@ -1223,11 +1260,11 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"a", &*iter.next().unwrap().key.user_key); - assert_eq!(b"b", &*iter.next().unwrap().key.user_key); - assert_eq!(b"c", &*iter.next().unwrap().key.user_key); - assert_eq!(b"d", &*iter.next().unwrap().key.user_key); - assert_eq!(b"e", &*iter.next().unwrap().key.user_key); + assert_eq!(b"a", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"b", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"c", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"d", &*iter.next().expect("should exist").key.user_key); + assert_eq!(b"e", &*iter.next().expect("should exist").key.user_key); assert!(iter.next_back().is_none()); assert!(iter.next_back().is_none()); assert!(iter.next().is_none()); @@ -1239,11 +1276,11 @@ mod tests { .iter() .map(|item| item.materialize(&data_block.inner.data)); - assert_eq!(b"e", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"d", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"c", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"b", &*iter.next_back().unwrap().key.user_key); - assert_eq!(b"a", &*iter.next_back().unwrap().key.user_key); + assert_eq!(b"e", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"d", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"c", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"b", &*iter.next_back().expect("should exist").key.user_key); + assert_eq!(b"a", &*iter.next_back().expect("should exist").key.user_key); assert!(iter.next().is_none()); assert!(iter.next().is_none()); assert!(iter.next_back().is_none()); diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index e05dbfae..c1166e72 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -178,11 +178,19 @@ impl Encodable<()> for InternalValue { writer.write_u64_varint(self.key.seqno)?; // 2 // TODO: maybe we can skip this varint altogether if prefix truncation = false + + // NOTE: We know keys have u16 length max + #[allow(clippy::cast_possible_truncation)] writer.write_u16_varint(shared_len as u16)?; // 3 let rest_len = self.key().len() - shared_len; + + // NOTE: We know keys have u16 length max + #[allow(clippy::cast_possible_truncation)] writer.write_u16_varint(rest_len as u16)?; // 4 + // NOTE: We trust the caller + #[allow(clippy::expect_used)] let truncated_user_key = self .key .user_key @@ -468,12 +476,19 
@@ impl DataBlock {
         Ok(buf)
     }

+    /// Builds a data block.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the given item array is empty.
     pub fn encode_into(
         writer: &mut Vec<u8>,
         items: &[InternalValue],
         restart_interval: u8,
         hash_index_ratio: f32,
     ) -> crate::Result<()> {
+        // NOTE: We expect a non-empty chunk of items
+        #[allow(clippy::expect_used)]
         let first_key = &items
             .first()
             .expect("chunk should not be empty")
@@ -497,6 +512,7 @@ impl DataBlock {
 }

 #[cfg(test)]
+#[allow(clippy::expect_used)]
 mod tests {
     use crate::{
         segment::{
@@ -509,7 +525,6 @@ mod tests {
     use test_log::test;

     #[test]
-    #[allow(clippy::unwrap_used)]
     fn v3_data_block_ping_pong_fuzz_1() -> crate::Result<()> {
         let items = [
             InternalValue::from_components(
@@ -547,9 +562,9 @@ mod tests {

         for &x in &ping_pong_code {
             if x == 0 {
-                v.push(iter.next().cloned().unwrap());
+                v.push(iter.next().cloned().expect("should have item"));
             } else {
-                v.push(iter.next_back().cloned().unwrap());
+                v.push(iter.next_back().cloned().expect("should have item"));
             }
         }

@@ -565,9 +580,9 @@ mod tests {

         for &x in &ping_pong_code {
             if x == 0 {
-                v.push(iter.next().unwrap());
+                v.push(iter.next().expect("should have item"));
             } else {
-                v.push(iter.next_back().unwrap());
+                v.push(iter.next_back().expect("should have item"));
             }
         }

diff --git a/src/segment/filter/blocked_bloom/builder.rs b/src/segment/filter/blocked_bloom/builder.rs
index c0f1204c..2b2a7bfa 100644
--- a/src/segment/filter/blocked_bloom/builder.rs
+++ b/src/segment/filter/blocked_bloom/builder.rs
@@ -131,6 +131,8 @@ impl Builder {
         for i in 1..(self.k as u64) {
             let idx = h1 % (CACHE_LINE_BYTES as u64 * 8);

+            // NOTE: Even for a large segment, filters tend to be pretty small, definitely less than 4 GiB
+            #[allow(clippy::cast_possible_truncation)]
             self.inner
                 .enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize));

diff --git a/src/segment/filter/standard_bloom/builder.rs b/src/segment/filter/standard_bloom/builder.rs
index 6697abb6..1024d041 100644
--- a/src/segment/filter/standard_bloom/builder.rs
+++ b/src/segment/filter/standard_bloom/builder.rs
@@ -27,6 +27,8 @@ pub struct Builder {

 #[allow(clippy::len_without_is_empty)]
 impl Builder {
+    // NOTE: We write into a Vec, so no I/O error can happen
+    #[allow(clippy::expect_used)]
     #[must_use]
     pub fn build(&self) -> Vec<u8> {
         let mut v = vec![];
@@ -116,6 +118,8 @@ impl Builder {
         for i in 1..=(self.k as u64) {
             let idx = h1 % (self.m as u64);

+            // NOTE: Even for a large segment, filters tend to be pretty small, definitely less than 4 GiB
+            #[allow(clippy::cast_possible_truncation)]
             self.inner.enable_bit(idx as usize);

             h1 = h1.wrapping_add(h2);
diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs
index dc6135a2..495a909d 100644
--- a/src/segment/index_block/mod.rs
+++ b/src/segment/index_block/mod.rs
@@ -93,18 +93,25 @@ impl IndexBlock {
         Ok(buf)
     }

+    /// Builds an index block.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the given item array is empty.
pub fn encode_into( writer: &mut Vec, items: &[KeyedBlockHandle], // restart_interval: u8, // TODO: support prefix truncation + delta encoding ) -> crate::Result<()> { + // NOTE: We expect a non-empty chunk of items + #[allow(clippy::expect_used)] let first_key = items.first().expect("chunk should not be empty").end_key(); let mut serializer = Encoder::<'_, BlockOffset, KeyedBlockHandle>::new( writer, items.len(), - 1, // TODO: hard coded for now - 0.0, // NOTE: Index blocks do not support hash index + /* TODO: hard-coded for now */ 1, + /* Index blocks do not support hash index */ 0.0, first_key, ); diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 35dbbea6..27f0e7a8 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -323,7 +323,7 @@ impl Segment { BlockIndexImpl::Full(idx) => idx.inner(), BlockIndexImpl::VolatileFull => { &IndexBlock::new( - // TODO: handle error + // TODO: load on initial access to block index self.load_block( &self.regions.tli, BlockType::Index, diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index da272f4b..b8b3a0e7 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -30,7 +30,7 @@ pub struct Writer { index_block_restart_interval: u8, data_block_size: u32, - index_block_size: u32, // TODO: implement + index_block_size: u32, // TODO: 3.0.0 implement partitioned index data_block_hash_ratio: f32, @@ -48,6 +48,7 @@ pub struct Writer { block_writer: BufWriter, /// Writer of index blocks + #[allow(clippy::struct_field_names)] index_writer: Box>>, /// Buffer of KVs @@ -249,6 +250,8 @@ impl Writer { self.meta.uncompressed_size += u64::from(header.uncompressed_length); + // NOTE: Block header is a couple of bytes only, so cast is fine + #[allow(clippy::cast_possible_truncation)] let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; self.index_writer @@ -259,13 +262,13 @@ impl Writer { ))?; // Adjust metadata - self.meta.file_pos += bytes_written as u64; + self.meta.file_pos += u64::from(bytes_written); self.meta.item_count += self.chunk.len(); self.meta.data_block_count += 1; // Back link stuff self.prev_pos.0 = self.prev_pos.1; - self.prev_pos.1 += bytes_written as u64; + self.prev_pos.1 += u64::from(bytes_written); // Set last key self.meta.last_key = Some( @@ -344,6 +347,8 @@ impl Writer { CompressionType::None, )?; + // NOTE: Block header is a couple of bytes only, so cast is fine + #[allow(clippy::cast_possible_truncation)] let bytes_written = (BlockHeader::serialized_len() as u32) + header.data_length; Some(BlockHandle::new(BlockOffset(filter_ptr), bytes_written)) @@ -351,18 +356,6 @@ impl Writer { }; log::trace!("filter_ptr={filter_handle:?}"); - // // TODO: #46 https://github.com/fjall-rs/lsm-tree/issues/46 - Write range filter - // let rf_ptr = BlockOffset(0); - // log::trace!("rf_ptr={rf_ptr}"); - - // // TODO: #2 https://github.com/fjall-rs/lsm-tree/issues/2 - Write range tombstones - // let range_tombstones_ptr = BlockOffset(0); - // log::trace!("range_tombstones_ptr={range_tombstones_ptr}"); - - // // TODO: - // let pfx_ptr = BlockOffset(0); - // log::trace!("pfx_ptr={pfx_ptr}"); - // Write metadata let metadata_start = BlockOffset(self.block_writer.stream_position()?); @@ -395,10 +388,14 @@ impl Writer { meta("#item_count", &(self.meta.item_count as u64).to_le_bytes()), meta( "#key#max", + // NOTE: At the beginning we check that we have written at least 1 item, so last_key must exist + #[allow(clippy::expect_used)] self.meta.last_key.as_ref().expect("should 
exist"), ), meta( "#key#min", + // NOTE: At the beginning we check that we have written at least 1 item, so first_key must exist + #[allow(clippy::expect_used)] self.meta.first_key.as_ref().expect("should exist"), ), meta("#key_count", &(self.meta.key_count as u64).to_le_bytes()), @@ -450,6 +447,8 @@ impl Writer { CompressionType::None, )?; + // NOTE: Block header is a couple of bytes only, so cast is fine + #[allow(clippy::cast_possible_truncation)] let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; BlockHandle::new(metadata_start, bytes_written) @@ -476,9 +475,11 @@ impl Writer { CompressionType::None, )?; + // NOTE: Block header is a couple of bytes only, so cast is fine + #[allow(clippy::cast_possible_truncation)] let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - BlockHandle::new(regions_block_start, bytes_written as u32) + BlockHandle::new(regions_block_start, bytes_written) }; // Write fixed-size trailer @@ -490,6 +491,9 @@ impl Writer { self.block_writer.get_mut().sync_all()?; // IMPORTANT: fsync folder on Unix + + // NOTE: If there's no parent folder, something has gone horribly wrong + #[allow(clippy::expect_used)] fsync_directory(self.path.parent().expect("should have folder"))?; log::debug!( @@ -503,7 +507,7 @@ impl Writer { } } -// TODO: restore +// TODO: 3.0.0 restore /* #[cfg(test)] #[allow(clippy::expect_used)] diff --git a/src/tree/mod.rs b/src/tree/mod.rs index ada3a66d..ec583088 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -488,7 +488,7 @@ impl AbstractTree for Tree { .expect("lock is poisoned") .current_version() .iter_levels() - .map(|x| x.size()) + .map(super::version::Level::size) .sum() } @@ -588,7 +588,7 @@ impl Tree { &self, writer: crate::segment::Writer, ) -> crate::Result> { - let segment_file_path = writer.path.to_path_buf(); + let segment_file_path = writer.path.clone(); let Some(_) = writer.finish()? 
else {
            return Ok(None);

From 1491a3c57b364962c7b3fda4c3ae99437f8da4e8 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 25 Sep 2025 00:02:20 +0200
Subject: [PATCH 465/613] change segment metadata

---
 src/segment/meta.rs | 53 +++++++++++++++++++++++++++++----------
 src/segment/writer/mod.rs | 4 +--
 2 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/src/segment/meta.rs b/src/segment/meta.rs
index 8801e420..aa923841 100644
--- a/src/segment/meta.rs
+++ b/src/segment/meta.rs
@@ -63,23 +63,50 @@ impl ParsedMeta {

         let block = DataBlock::new(block);

-        assert_eq!(
-            b"xxh3",
-            &*block
-                .point_read(b"#hash_type", SeqNo::MAX)
+        #[allow(clippy::indexing_slicing)]
+        {
+            let table_version = block
+                .point_read(b"v#table_version", SeqNo::MAX)
                 .expect("Segment ID should exist")
-                .value,
-            "invalid hash type",
-        );
+                .value;
+
+            assert_eq!(1, table_version.len(), "invalid table version byte array");

-        assert_eq!(
-            b"xxh3",
-            &*block
+            assert_eq!(
+                [3u8],
+                &*table_version,
+                "unsupported table version {}",
+                table_version[0],
+            );
+        }
+
+        {
+            let hash_type = block
+                .point_read(b"#filter_hash_type", SeqNo::MAX)
+                .expect("Segment ID should exist")
+                .value;
+
+            assert_eq!(
+                b"xxh3",
+                &*hash_type,
+                "invalid hash type: {:?}",
+                std::str::from_utf8(&hash_type),
+            );
+        }
+
+        {
+            let checksum_type = block
                 .point_read(b"#checksum_type", SeqNo::MAX)
                 .expect("Segment ID should exist")
-                .value,
-            "invalid checksum type",
-        );
+                .value;
+
+            assert_eq!(
+                b"xxh3",
+                &*checksum_type,
+                "invalid checksum type: {:?}",
+                std::str::from_utf8(&checksum_type),
+            );
+        }

         let id = {
             let bytes = block
diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs
index b8b3a0e7..8563b2e1 100644
--- a/src/segment/writer/mod.rs
+++ b/src/segment/writer/mod.rs
@@ -379,7 +379,7 @@ impl Writer {
                 "#data_block_count",
                 &(self.meta.data_block_count as u64).to_le_bytes(),
             ),
-            meta("#hash_type", b"xxh3"),
+            meta("#filter_hash_type", b"xxh3"),
             meta("#id", &self.segment_id.to_le_bytes()),
             meta(
                 "#index_block_count",
@@ -421,7 +421,7 @@ impl Writer {
                 &self.meta.uncompressed_size.to_le_bytes(),
             ),
             meta("v#lsmt", env!("CARGO_PKG_VERSION").as_bytes()),
-            meta("v#table", b"3"),
+            meta("v#table_version", &[3u8]),
             // TODO: tli_handle_count
             // TODO: hash ratio etc
         ];

From e301c28c5e9cfb68df3a3f951f98c14e0e7e2e35 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 25 Sep 2025 00:43:04 +0200
Subject: [PATCH 466/613] doc

---
 src/iter_guard.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/iter_guard.rs b/src/iter_guard.rs
index 5655dbd6..eecd4fa2 100644
--- a/src/iter_guard.rs
+++ b/src/iter_guard.rs
@@ -3,10 +3,10 @@ use crate::{
 };
 use enum_dispatch::enum_dispatch;

-/// An iterator item
+/// Guard to access key-value pairs
 #[enum_dispatch]
 pub trait IterGuard {
-    /// Accesses the key-value tuple.
+    /// Accesses the key-value pair.
     ///
     /// # Errors
     ///

From 3d77c11eaa897f8d2839b58ffd9397052c6cd5c1 Mon Sep 17 00:00:00 2001
From: Marvin <33938500+marvin-j97@users.noreply.github.com>
Date: Thu, 25 Sep 2025 03:48:53 +0200
Subject: [PATCH 467/613] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c953a3a7..82485bf6 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,9 @@ This is the most feature-rich LSM-tree implementation in Rust!
It features: - Per-level filter/index block pinning configuration - Range & prefix searching with forward and reverse iteration - Block caching to keep hot data in memory +- File descriptor caching with upper bound to reduce fopen calls - *AMQ* filters (currently Bloom filters) to improve point lookup performance -- Snapshots (*MVCC*) +- Multi-versioning of KVs, enabling snapshot reads - Optionally partitioned block index & filters for better cache efficiency [[1]](#footnotes) - Size-tiered, (concurrent) Leveled and FIFO compaction - Multi-threaded flushing (immutable/sealed memtables) From 0a73d72faea7678bc00203030a3a200e408b1603 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 26 Sep 2025 17:20:11 +0200 Subject: [PATCH 468/613] refactor: table file regions to use tft tft real name TBD --- Cargo.toml | 1 + src/segment/block/checksum.rs | 6 + src/segment/index_block/block_handle.rs | 2 +- src/segment/mod.rs | 38 +++---- src/segment/multi_writer.rs | 2 - src/segment/regions.rs | 145 +++++++----------------- src/segment/trailer.rs | 102 ----------------- src/segment/writer/index.rs | 22 +--- src/segment/writer/mod.rs | 142 ++++++++--------------- 9 files changed, 119 insertions(+), 341 deletions(-) delete mode 100644 src/segment/trailer.rs diff --git a/Cargo.toml b/Cargo.toml index 86a56230..eb1ad19e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ self_cell = "1.2.0" tempfile = "3.20.0" varint-rs = "2.2.0" xxhash-rust = { version = "0.8.15", features = ["xxh3"] } +tft = { path = "../tft" } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } diff --git a/src/segment/block/checksum.rs b/src/segment/block/checksum.rs index 76ba835e..ae987413 100644 --- a/src/segment/block/checksum.rs +++ b/src/segment/block/checksum.rs @@ -14,6 +14,12 @@ impl std::ops::Deref for Checksum { } } +impl From for Checksum { + fn from(value: tft::Checksum) -> Self { + Self(value.into_u128()) + } +} + impl Checksum { #[must_use] pub fn from_raw(value: u128) -> Self { diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index c0999d9c..83f6ec4e 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -25,7 +25,7 @@ pub struct BlockHandle { /// Size of block in bytes size: u32, -} +} // TODO: 3.0.0 ^---- maybe u64 impl BlockHandle { #[must_use] diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 27f0e7a8..708df865 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -14,7 +14,6 @@ mod meta; pub(crate) mod multi_writer; mod regions; mod scanner; -mod trailer; pub mod util; mod writer; @@ -378,19 +377,29 @@ impl Segment { use meta::ParsedMeta; use regions::ParsedRegions; use std::sync::atomic::AtomicBool; - use trailer::Trailer; log::debug!("Recovering segment from file {}", file_path.display()); let mut file = std::fs::File::open(&file_path)?; - let trailer = Trailer::from_file(&mut file)?; - log::trace!("Got trailer: {trailer:#?}"); - - log::debug!( - "Reading regions block, with region_ptr={:?}", - trailer.regions_block_handle(), - ); - let regions = ParsedRegions::load_with_handle(&file, trailer.regions_block_handle())?; + let trailer = tft::Reader::from_reader(&mut file).map_err(|e| match e { + tft::Error::Io(e) => crate::Error::from(e), + tft::Error::ChecksumMismatch { got, expected } => { + log::error!("Archive ToC checksum mismatch"); + crate::Error::ChecksumMismatch { + got: got.into(), + expected: expected.into(), + } + } + tft::Error::InvalidVersion => { + 
log::error!("Invalid archive version"); + crate::Error::Unrecoverable + } + tft::Error::UnsupportedChecksumType => { + log::error!("Invalid archive checksum type"); + crate::Error::Unrecoverable + } + })?; + let regions = ParsedRegions::parse_from_toc(trailer.toc())?; log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); let metadata = ParsedMeta::load_with_handle(&file, ®ions.metadata)?; @@ -460,15 +469,6 @@ impl Segment { None }; - // TODO: Maybe only in L0/L1 - // For larger levels, this will - // cache possibly many FDs - // causing kick-out of other - // FDs in the cache - // - // NOTE: We already have a file descriptor open, so let's just cache it immediately - // descriptor_table.insert_for_table((tree_id, metadata.id).into(), Arc::new(file)); - let segment = Self(Arc::new(Inner { path: Arc::new(file_path), tree_id, diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 779c415a..f38d2ebe 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -100,8 +100,6 @@ impl MultiWriter { #[must_use] pub fn use_data_block_hash_ratio(mut self, ratio: f32) -> Self { - eprintln!("multi write writing data block: data_block_hash_ratio={ratio}"); - self.data_block_hash_ratio = ratio; self.writer = self.writer.use_data_block_hash_ratio(ratio); self diff --git a/src/segment/regions.rs b/src/segment/regions.rs index 3e54483d..efbcb6e4 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -2,15 +2,32 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{Block, BlockHandle}; -use crate::{ - coding::{Decode, Encode}, - segment::{block::BlockType, DataBlock}, - CompressionType, InternalValue, SeqNo, UserValue, -}; -use std::fs::File; +use super::{BlockHandle, BlockOffset}; +use tft::TocEntry; + +fn toc_entry_to_handle(entry: &TocEntry) -> BlockHandle { + BlockHandle::new(BlockOffset(entry.pos()), entry.len() as u32) +} /// The regions block stores offsets to the different segment disk file "regions" +/// +/// ---------------- +/// | data | <- implicitly start at 0 +/// |--------------| +/// | tli | +/// |--------------| +/// | index | <- may not exist (if full block index is used, TLI will be dense) +/// |--------------| +/// | filter | <- may not exist +/// |--------------| +/// | ... TBD ... 
| +/// |--------------| +/// | meta | +/// |--------------| +/// | toc | +/// |--------------| +/// | trailer | <- fixed size +/// |--------------| #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub struct ParsedRegions { pub tli: BlockHandle, @@ -20,104 +37,24 @@ pub struct ParsedRegions { } impl ParsedRegions { - pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { - let block = Block::from_file(file, *handle, CompressionType::None)?; - - if block.header.block_type != BlockType::Regions { - return Err(crate::Error::Decode(crate::DecodeError::InvalidTag(( - "BlockType", - block.header.block_type.into(), - )))); - } - - let block = DataBlock::new(block); - - let tli = { - // NOTE: Top-level index block is always written - #[allow(clippy::expect_used)] - let bytes = block - .point_read(b"tli", SeqNo::MAX) - .expect("TLI handle should exist"); - - let mut bytes = &bytes.value[..]; - BlockHandle::decode_from(&mut bytes) - }?; - - let metadata = { - // NOTE: Metadata block is always written - #[allow(clippy::expect_used)] - let bytes = block - .point_read(b"meta", SeqNo::MAX) - .expect("Metadata handle should exist"); - - let mut bytes = &bytes.value[..]; - BlockHandle::decode_from(&mut bytes) - }?; - - let index = { - match block.point_read(b"index", SeqNo::MAX) { - Some(bytes) if !bytes.value.is_empty() => { - let mut bytes = &bytes.value[..]; - Some(BlockHandle::decode_from(&mut bytes)) - } - _ => None, - } - } - .transpose()?; - - let filter = { - match block.point_read(b"filter", SeqNo::MAX) { - Some(bytes) if !bytes.value.is_empty() => { - let mut bytes = &bytes.value[..]; - Some(BlockHandle::decode_from(&mut bytes)) - } - _ => None, - } - } - .transpose()?; - + pub fn parse_from_toc(toc: &tft::Toc) -> crate::Result { Ok(Self { - tli, - index, - filter, - metadata, + tli: toc + .section(b"tli") + .map(toc_entry_to_handle) + .ok_or_else(|| { + log::error!("TLI should exist"); + crate::Error::Unrecoverable + })?, + index: toc.section(b"index").map(toc_entry_to_handle), + filter: toc.section(b"filter").map(toc_entry_to_handle), + metadata: toc + .section(b"meta") + .map(toc_entry_to_handle) + .ok_or_else(|| { + log::error!("Metadata should exist"); + crate::Error::Unrecoverable + })?, }) } - - pub fn encode_into_vec(&self) -> crate::Result> { - fn region(key: &str, value: impl Into) -> InternalValue { - InternalValue::from_components(key, value, 0, crate::ValueType::Value) - } - - let items = [ - region( - "filter", - match self.filter { - Some(handle) => handle.encode_into_vec(), - None => vec![], - }, - ), - region( - "index", - match self.index { - Some(handle) => handle.encode_into_vec(), - None => vec![], - }, - ), - region("meta", self.metadata.encode_into_vec()), - region("tli", self.tli.encode_into_vec()), - ]; - - #[cfg(debug_assertions)] - { - let mut sorted_copy = items.clone(); - sorted_copy.sort(); - - // Just to make sure the items are definitely sorted - assert_eq!(items, sorted_copy, "region items not sorted correctly"); - } - - // TODO: no binary index - DataBlock::encode_into_vec(&items, 1, 0.0) - } } diff --git a/src/segment/trailer.rs b/src/segment/trailer.rs deleted file mode 100644 index b210683c..00000000 --- a/src/segment/trailer.rs +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::index_block::BlockHandle; -use crate::{ - coding::{Decode, DecodeError, Encode}, - 
file::MAGIC_BYTES, -}; -use std::{ - fs::File, - io::{Read, Seek, Write}, -}; - -const TRAILER_SIZE: usize = 32; - -/// The fixed-size segment trailer stores a block handle to the regions block -/// -/// # Diagram -/// -/// ---------------- -/// | data blocks | <- implicitly start at 0 -/// |--------------| -/// | tli block | -/// |--------------| -/// | index block | <- may not exist (if full block index is used, TLI will be dense) -/// |--------------| -/// | filter block | <- may not exist -/// |--------------| -/// | ... TBD ... | -/// |--------------| -/// | meta block | -/// |--------------| -/// | region block | -/// |--------------| -/// | trailer | <- fixed size -/// |--------------| -/// -/// Through this indirection, we can have a variable number of region block handles. -#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] -pub struct Trailer { - regions_block_handle: BlockHandle, -} - -impl Trailer { - pub fn from_handle(regions_block_handle: BlockHandle) -> Self { - Self { - regions_block_handle, - } - } - - pub fn regions_block_handle(&self) -> &BlockHandle { - &self.regions_block_handle - } - - pub fn write_into(&self, writer: &mut W) -> crate::Result<()> { - let mut v = Vec::with_capacity(TRAILER_SIZE); - - v.write_all(&MAGIC_BYTES)?; - - self.regions_block_handle.encode_into(&mut v)?; - - // Pad with remaining bytes - v.resize(TRAILER_SIZE, 0); - - assert_eq!( - v.len(), - TRAILER_SIZE, - "segment file trailer has invalid size" - ); - - writer.write_all(&v)?; - - Ok(()) - } - - // TODO: the trailer is fixed size so we can use read_at?! - // TODO: then we don't need &mut File - pub fn from_file(file: &mut File) -> crate::Result { - file.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; - - let mut trailer_bytes = [0u8; TRAILER_SIZE]; - file.read_exact(&mut trailer_bytes)?; - - let mut reader = &mut &trailer_bytes[..]; - - // Check trailer magic header - let mut magic = [0u8; MAGIC_BYTES.len()]; - reader.read_exact(&mut magic)?; - - if magic != MAGIC_BYTES { - return Err(crate::Error::Decode(DecodeError::InvalidHeader( - "SegmentTrailer", - ))); - } - - // Get regions block handle - let handle = BlockHandle::decode_from(&mut reader)?; - - Ok(Self::from_handle(handle)) - } -} diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index e6126f5f..ab455a6d 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -3,25 +3,18 @@ // (found in the LICENSE-* files in the repository) use crate::{ - segment::{ - block::Header as BlockHeader, - index_block::{BlockHandle, KeyedBlockHandle}, - Block, BlockOffset, IndexBlock, - }, + segment::{block::Header as BlockHeader, index_block::KeyedBlockHandle, Block, IndexBlock}, CompressionType, }; -pub trait BlockIndexWriter { +pub trait BlockIndexWriter { /// Registers a data block in the block index. fn register_data_block(&mut self, block_handle: KeyedBlockHandle) -> crate::Result<()>; /// Writes the block index to a file. /// /// Returns the (optional) index blocks handle and the TLI handle. 
- fn finish( - &mut self, - block_file_writer: &mut W, - ) -> crate::Result<(BlockHandle, Option)>; + fn finish(&mut self, block_file_writer: &mut tft::Writer) -> crate::Result<()>; fn set_compression(&mut self, compression: CompressionType); @@ -64,11 +57,8 @@ impl BlockIndexWriter for FullIndexWriter Ok(()) } - fn finish( - &mut self, - block_file_writer: &mut W, - ) -> crate::Result<(BlockHandle, Option)> { - let tli_ptr = BlockOffset(block_file_writer.stream_position()?); + fn finish(&mut self, block_file_writer: &mut tft::Writer) -> crate::Result<()> { + block_file_writer.start("tli")?; let mut bytes = vec![]; IndexBlock::encode_into(&mut bytes, &self.block_handles)?; @@ -89,7 +79,7 @@ impl BlockIndexWriter for FullIndexWriter self.block_handles.len(), ); - Ok((BlockHandle::new(tli_ptr, bytes_written), None)) + Ok(()) } } diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 8563b2e1..4cbe05a8 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -2,22 +2,15 @@ mod index; mod meta; use super::{ - block::Header as BlockHeader, filter::BloomConstructionPolicy, trailer::Trailer, Block, - BlockOffset, DataBlock, KeyedBlockHandle, + block::Header as BlockHeader, filter::BloomConstructionPolicy, Block, BlockOffset, DataBlock, + KeyedBlockHandle, }; use crate::{ - coding::Encode, - file::fsync_directory, - segment::{filter::standard_bloom::Builder, index_block::BlockHandle, regions::ParsedRegions}, - time::unix_timestamp, - CompressionType, InternalValue, SegmentId, UserKey, + coding::Encode, file::fsync_directory, segment::filter::standard_bloom::Builder, + time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, }; use index::{BlockIndexWriter, FullIndexWriter}; -use std::{ - fs::File, - io::{BufWriter, Seek, Write}, - path::PathBuf, -}; +use std::{fs::File, io::BufWriter, path::PathBuf}; /// Serializes and compresses values into blocks and writes them to disk as segment pub struct Writer { @@ -45,7 +38,7 @@ pub struct Writer { /// Writer of data blocks #[allow(clippy::struct_field_names)] - block_writer: BufWriter, + block_writer: tft::Writer, /// Writer of index blocks #[allow(clippy::struct_field_names)] @@ -74,6 +67,8 @@ impl Writer { pub fn new(path: PathBuf, segment_id: SegmentId) -> crate::Result { let block_writer = File::create_new(&path)?; let block_writer = BufWriter::with_capacity(u16::MAX.into(), block_writer); + let mut block_writer = tft::Writer::into_writer(block_writer); + block_writer.start("data")?; Ok(Self { meta: meta::Metadata::default(), @@ -304,62 +299,50 @@ impl Writer { return Ok(None); } - // // Append index blocks to file - let (tli_handle, index_blocks_handle) = self.index_writer.finish(&mut self.block_writer)?; - log::trace!("tli_ptr={tli_handle:?}"); - log::trace!("index_blocks_ptr={index_blocks_handle:?}"); + // Write index + self.index_writer.finish(&mut self.block_writer)?; // Write filter - let filter_handle = { - if self.bloom_hash_buffer.is_empty() { - None - } else { - let filter_ptr = self.block_writer.stream_position()?; - let n = self.bloom_hash_buffer.len(); - - log::trace!( - "Constructing Bloom filter with {n} entries: {:?}", - self.bloom_policy, - ); + if !self.bloom_hash_buffer.is_empty() { + self.block_writer.start("filter")?; - let start = std::time::Instant::now(); + let n = self.bloom_hash_buffer.len(); - let filter_bytes = { - let mut builder = self.bloom_policy.init(n); + log::trace!( + "Constructing Bloom filter with {n} entries: {:?}", + self.bloom_policy, + ); - for hash in 
self.bloom_hash_buffer { - builder.set_with_hash(hash); - } + let start = std::time::Instant::now(); - builder.build() - }; + let filter_bytes = { + let mut builder = self.bloom_policy.init(n); - log::trace!( - "Built Bloom filter ({} B) in {:?}", - filter_bytes.len(), - start.elapsed(), - ); + for hash in self.bloom_hash_buffer { + builder.set_with_hash(hash); + } - let header = Block::write_into( - &mut self.block_writer, - &filter_bytes, - crate::segment::block::BlockType::Filter, - CompressionType::None, - )?; + builder.build() + }; - // NOTE: Block header is a couple of bytes only, so cast is fine - #[allow(clippy::cast_possible_truncation)] - let bytes_written = (BlockHeader::serialized_len() as u32) + header.data_length; + log::trace!( + "Built Bloom filter ({} B) in {:?}", + filter_bytes.len(), + start.elapsed(), + ); - Some(BlockHandle::new(BlockOffset(filter_ptr), bytes_written)) - } - }; - log::trace!("filter_ptr={filter_handle:?}"); + Block::write_into( + &mut self.block_writer, + &filter_bytes, + crate::segment::block::BlockType::Filter, + CompressionType::None, + )?; + } // Write metadata - let metadata_start = BlockOffset(self.block_writer.stream_position()?); + self.block_writer.start("meta")?; - let metadata_handle = { + { fn meta(key: &str, value: &[u8]) -> InternalValue { InternalValue::from_components(key, value, 0, crate::ValueType::Value) } @@ -440,55 +423,20 @@ impl Writer { // TODO: no binary index DataBlock::encode_into(&mut self.block_buffer, &meta_items, 1, 0.0)?; - let header = Block::write_into( + Block::write_into( &mut self.block_writer, &self.block_buffer, crate::segment::block::BlockType::Meta, CompressionType::None, )?; - - // NOTE: Block header is a couple of bytes only, so cast is fine - #[allow(clippy::cast_possible_truncation)] - let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - - BlockHandle::new(metadata_start, bytes_written) - }; - - // Write regions block - let regions_block_handle = { - let regions_block_start = BlockOffset(self.block_writer.stream_position()?); - - let regions = ParsedRegions { - tli: tli_handle, - index: None, - filter: filter_handle, - metadata: metadata_handle, - }; - - log::trace!("Encoding regions: {regions:#?}"); - - let bytes = regions.encode_into_vec()?; - let header = Block::write_into( - &mut self.block_writer, - &bytes, - crate::segment::block::BlockType::Regions, - CompressionType::None, - )?; - - // NOTE: Block header is a couple of bytes only, so cast is fine - #[allow(clippy::cast_possible_truncation)] - let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - - BlockHandle::new(regions_block_start, bytes_written) }; // Write fixed-size trailer - let trailer = Trailer::from_handle(regions_block_handle); - trailer.write_into(&mut self.block_writer)?; - - // Finally, flush & fsync the blocks file - self.block_writer.flush()?; - self.block_writer.get_mut().sync_all()?; + // and flush & fsync the table file + self.block_writer.finish().map_err(|e| match e { + tft::Error::Io(e) => crate::Error::from(e), + _ => unreachable!(), + })?; // IMPORTANT: fsync folder on Unix From 7887944be133ef7c510a8e6da83adc1b2e31ad3d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 26 Sep 2025 17:55:11 +0200 Subject: [PATCH 469/613] use git url temporarily --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index eb1ad19e..ed644ecb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ self_cell = "1.2.0" tempfile = "3.20.0" 
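The patches above settle the segment writer onto the tft single-file-archive layout: each region ("data", "filter", "meta", ...) is an explicitly started, named section, and finish() appends the table of contents before flushing. A minimal round-trip sketch of that flow, using only the tft calls visible in these diffs (Writer::into_writer, start, finish, Reader::from_reader, toc, section, TocEntry::pos/len); the section payloads and the Box<dyn Error> error plumbing are illustrative assumptions:

use std::fs::File;
use std::io::{BufWriter, Write};

// Sketch only, not part of the patch series. Assumes tft::Error implements
// std::error::Error; the written payloads are placeholders.
fn roundtrip(path: &std::path::Path) -> Result<(), Box<dyn std::error::Error>> {
    // Write: every region is a named section; `finish` appends the ToC
    // and flushes/fsyncs the file.
    let mut writer = tft::Writer::into_writer(BufWriter::new(File::create_new(path)?));
    writer.start("data")?;
    writer.write_all(b"data blocks go here")?;
    writer.start("meta")?;
    writer.write_all(b"meta block goes here")?;
    writer.finish()?;

    // Read: recover the ToC from the file tail, then address sections by name.
    let mut file = File::open(path)?;
    let reader = tft::Reader::from_reader(&mut file)?;
    let meta = reader.toc().section(b"meta").expect("meta section should exist");
    println!("meta section at offset {}, {} bytes", meta.pos(), meta.len());
    Ok(())
}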
varint-rs = "2.2.0" xxhash-rust = { version = "0.8.15", features = ["xxh3"] } -tft = { path = "../tft" } +tft = { git = "https://github.com/marvin-j97/tft" } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } From 97276f6daa387e8d469a166ecc65d9c1e5368261 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 26 Sep 2025 20:26:17 +0200 Subject: [PATCH 470/613] refactor: use tft for version files --- README.md | 2 +- src/error.rs | 23 +++++++++ src/level_manifest/mod.rs | 82 +++++++++++++++++------------- src/segment/mod.rs | 19 +------ src/segment/writer/mod.rs | 5 +- src/version/mod.rs | 102 ++++++++++++++++++++------------------ 6 files changed, 127 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 82485bf6..b121c967 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) [![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) -![MSRV](https://img.shields.io/badge/MSRV-1.89.0-blue) +![MSRV](https://img.shields.io/badge/MSRV-1.82.0-blue) [![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. diff --git a/src/error.rs b/src/error.rs index 85d81534..5d1fbde0 100644 --- a/src/error.rs +++ b/src/error.rs @@ -60,6 +60,29 @@ impl std::error::Error for Error { } } +impl From for Error { + fn from(value: tft::Error) -> Self { + match value { + tft::Error::Io(e) => Self::from(e), + tft::Error::ChecksumMismatch { got, expected } => { + log::error!("Archive ToC checksum mismatch"); + Self::ChecksumMismatch { + got: got.into(), + expected: expected.into(), + } + } + tft::Error::InvalidVersion => { + log::error!("Invalid archive version"); + Self::Unrecoverable + } + tft::Error::UnsupportedChecksumType => { + log::error!("Invalid archive checksum type"); + Self::Unrecoverable + } + } + } +} + impl From for Error { fn from(value: std::io::Error) -> Self { Self::Io(value) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 19c15aff..4a12763a 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -157,50 +157,57 @@ impl LevelManifest { "Recovering current manifest at {}", version_file_path.display(), ); - let mut level_manifest = Cursor::new(std::fs::read(version_file_path)?); - // TODO: vvv move into Version::decode? 
vvv
+        let reader = tft::Reader::new(&version_file_path)?;
+        let toc = reader.toc();

-        // Check header
-        let mut magic = [0u8; MAGIC_BYTES.len()];
-        level_manifest.read_exact(&mut magic)?;
+        // // TODO: vvv move into Version::decode vvv
+        let mut levels = vec![];

-        if magic != MAGIC_BYTES {
-            return Err(crate::Error::Decode(DecodeError::InvalidHeader(
-                "LevelManifest",
-            )));
-        }
+        {
+            let mut reader = toc
+                .section(b"tables")
+                .expect("tables should exist")
+                .buf_reader(&version_file_path)?;

-        let mut levels = vec![];
+            let level_count = reader.read_u8()?;

-        let level_count = level_manifest.read_u8()?;
+            for _ in 0..level_count {
+                let mut level = vec![];
+                let run_count = reader.read_u8()?;

-        for _ in 0..level_count {
-            let mut level = vec![];
-            let run_count = level_manifest.read_u8()?;
+                for _ in 0..run_count {
+                    let mut run = vec![];
+                    let segment_count = reader.read_u32::<LittleEndian>()?;

-            for _ in 0..run_count {
-                let mut run = vec![];
-                let segment_count = level_manifest.read_u32::<LittleEndian>()?;
+                    for _ in 0..segment_count {
+                        let id = reader.read_u64::<LittleEndian>()?;
+                        run.push(id);
+                    }

-                for _ in 0..segment_count {
-                    let id = level_manifest.read_u64::<LittleEndian>()?;
-                    run.push(id);
+                    level.push(run);
                 }

-                level.push(run);
+                levels.push(level);
             }
-
-            levels.push(level);
         }

-        let blob_file_count = level_manifest.read_u32::<LittleEndian>()?;
-        let mut blob_file_ids = Vec::with_capacity(blob_file_count as usize);
+        let blob_file_ids = {
+            let mut reader = toc
+                .section(b"blob_files")
+                .expect("blob_files should exist")
+                .buf_reader(&version_file_path)?;

-        for _ in 0..blob_file_count {
-            let id = level_manifest.read_u64::<LittleEndian>()?;
-            blob_file_ids.push(id);
-        }
+            let blob_file_count = reader.read_u32::<LittleEndian>()?;
+            let mut blob_file_ids = Vec::with_capacity(blob_file_count as usize);
+
+            for _ in 0..blob_file_count {
+                let id = reader.read_u64::<LittleEndian>()?;
+                blob_file_ids.push(id);
+            }
+
+            blob_file_ids
+        };

         Ok(Recovery {
             curr_version_id,
@@ -264,15 +271,20 @@ impl LevelManifest {
             folder.display(),
         );

-        let file = std::fs::File::create_new(folder.join(format!("v{}", version.id())))?;
-        let mut writer = BufWriter::new(file);
+        let path = folder.join(format!("v{}", version.id()));
+        let file = std::fs::File::create_new(path)?;
+        let writer = BufWriter::new(file);
+        let mut writer = tft::Writer::into_writer(writer);

         version.encode_into(&mut writer)?;

-        writer.flush()?;
-        writer.get_mut().sync_all()?;
+        writer.finish().map_err(|e| match e {
+            tft::Error::Io(e) => crate::Error::from(e),
+            _ => unreachable!(),
+        })?;
+
+        // IMPORTANT: fsync folder on Unix
         fsync_directory(folder)?;
-        // IMPORTANT: ^ wait for fsync and directory sync to fully finish

         rewrite_atomic(&folder.join("current"), &version.id().to_le_bytes())?;

diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 708df865..cb042621 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -381,24 +381,7 @@ impl Segment {
         log::debug!("Recovering segment from file {}", file_path.display());

         let mut file = std::fs::File::open(&file_path)?;
-        let trailer = tft::Reader::from_reader(&mut file).map_err(|e| match e {
-            tft::Error::Io(e) => crate::Error::from(e),
-            tft::Error::ChecksumMismatch { got, expected } => {
-                log::error!("Archive ToC checksum mismatch");
-                crate::Error::ChecksumMismatch {
-                    got: got.into(),
-                    expected: expected.into(),
-                }
-            }
-            tft::Error::InvalidVersion => {
-                log::error!("Invalid archive version");
-                crate::Error::Unrecoverable
-            }
-            tft::Error::UnsupportedChecksumType => {
-                log::error!("Invalid archive checksum type");
-                crate::Error::Unrecoverable
-            }
-        })?;
+        let trailer =
tft::Reader::from_reader(&mut file)?; let regions = ParsedRegions::parse_from_toc(trailer.toc())?; log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 4cbe05a8..94655fd7 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -433,10 +433,7 @@ impl Writer { // Write fixed-size trailer // and flush & fsync the table file - self.block_writer.finish().map_err(|e| match e { - tft::Error::Io(e) => crate::Error::from(e), - _ => unreachable!(), - })?; + self.block_writer.finish()?; // IMPORTANT: fsync folder on Unix diff --git a/src/version/mod.rs b/src/version/mod.rs index 497a237f..90a1fa84 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -14,7 +14,7 @@ use crate::{ }; use optimize::optimize_runs; use run::Ranged; -use std::{collections::BTreeMap, ops::Deref, sync::Arc}; +use std::{collections::BTreeMap, io::Write, ops::Deref, sync::Arc}; pub const DEFAULT_LEVEL_COUNT: u8 = 7; @@ -424,13 +424,11 @@ impl Version { } } -impl Encode for Version { - fn encode_into(&self, writer: &mut W) -> Result<(), crate::EncodeError> { - use crate::file::MAGIC_BYTES; +impl Version { + pub(crate) fn encode_into(&self, writer: &mut tft::Writer) -> Result<(), crate::EncodeError> { use byteorder::{LittleEndian, WriteBytesExt}; - // Magic - writer.write_all(&MAGIC_BYTES)?; + writer.start("tables")?; // Level count // NOTE: We know there are always less than 256 levels @@ -456,6 +454,8 @@ impl Encode for Version { } } + writer.start("blob_files")?; + // Blob file count // NOTE: We know there are always less than 4 billion blob files #[allow(clippy::cast_possible_truncation)] @@ -465,49 +465,55 @@ impl Encode for Version { writer.write_u64::(file.id())?; } + writer.start("blob_gc_stats")?; + + // TODO: 3.0.0 + + writer.write_all(b":)")?; + Ok(()) } } -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn version_encode_empty() { - let bytes = Version::new(0).encode_into_vec(); - - #[rustfmt::skip] - let raw = &[ - // Magic - b'L', b'S', b'M', 3, - - // Level count - 7, - - // L0 runs - 0, - // L1 runs - 0, - // L2 runs - 0, - // L3 runs - 0, - // L4 runs - 0, - // L5 runs - 0, - // L6 runs - 0, - - // Blob file count - 0, - 0, - 0, - 0, - ]; - - assert_eq!(bytes, raw); - } -} +// #[cfg(test)] +// mod tests { +// use super::*; +// use test_log::test; + +// #[test] +// fn version_encode_empty() { +// let bytes = Version::new(0).encode_into_vec(); + +// #[rustfmt::skip] +// let raw = &[ +// // Magic +// b'L', b'S', b'M', 3, + +// // Level count +// 7, + +// // L0 runs +// 0, +// // L1 runs +// 0, +// // L2 runs +// 0, +// // L3 runs +// 0, +// // L4 runs +// 0, +// // L5 runs +// 0, +// // L6 runs +// 0, + +// // Blob file count +// 0, +// 0, +// 0, +// 0, +// ]; + +// assert_eq!(bytes, raw); +// } +// } From 82a056e5270ee81a50a278cca561984adbf24ad8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 26 Sep 2025 20:34:41 +0200 Subject: [PATCH 471/613] clippy --- src/config/compression.rs | 6 ++++++ src/config/filter.rs | 8 ++++++++ src/config/mod.rs | 2 ++ src/level_manifest/mod.rs | 5 ++--- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/config/compression.rs b/src/config/compression.rs index 168cdf86..139bdbd1 100644 --- a/src/config/compression.rs +++ b/src/config/compression.rs @@ -37,6 +37,12 @@ impl CompressionPolicy { .unwrap_or_else(|| self.last().copied().expect("policy should not be empty")) } + /// Disables all compression. 
+ #[must_use] + pub fn disabled() -> Self { + Self::all(CompressionType::None) + } + // TODO: accept Vec... Into>? or owned /// Uses the same compression in every level. diff --git a/src/config/filter.rs b/src/config/filter.rs index 8d6cde99..f4470563 100644 --- a/src/config/filter.rs +++ b/src/config/filter.rs @@ -45,6 +45,14 @@ impl FilterPolicy { .unwrap_or_else(|| self.last().copied().expect("policy should not be empty")) } + /// Disables all filters. + /// + /// **Not recommended unless you know what you are doing!** + #[must_use] + pub fn disabled() -> Self { + Self::all(FilterPolicyEntry::None) + } + // TODO: accept Vec... Into>? or owned /// Uses the same block size in every level. diff --git a/src/config/mod.rs b/src/config/mod.rs index 47b54933..cbdbfc7a 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -75,6 +75,8 @@ pub struct Config { pub tree_type: TreeType, /// Number of levels of the LSM tree (depth of tree) + /// + /// Once set, the level count is fixed (in the "manifest" file) pub level_count: u8, /// What type of compression is used for data blocks diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 4a12763a..b1496b4f 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -5,8 +5,7 @@ pub(crate) mod hidden_set; use crate::{ - coding::{DecodeError, Encode}, - file::{fsync_directory, rewrite_atomic, MAGIC_BYTES}, + file::{fsync_directory, rewrite_atomic}, segment::Segment, version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT}, vlog::BlobFileId, @@ -16,7 +15,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use hidden_set::HiddenSet; use std::{ collections::VecDeque, - io::{BufWriter, Cursor, Read, Write}, + io::BufWriter, path::{Path, PathBuf}, sync::Arc, }; From d6499cdf5a40120f068c54338427ee069b43b7de Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 26 Sep 2025 21:03:28 +0200 Subject: [PATCH 472/613] prepare for pre.0 --- Cargo.toml | 2 +- src/error.rs | 16 ++++++++++------ src/level_manifest/mod.rs | 6 +++--- src/segment/block/checksum.rs | 4 ++-- src/segment/block/trailer.rs | 2 -- src/segment/mod.rs | 2 +- src/segment/regions.rs | 4 ++-- src/segment/writer/index.rs | 4 ++-- src/segment/writer/mod.rs | 4 ++-- src/version/mod.rs | 2 +- 10 files changed, 24 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ed644ecb..a3b8180f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ self_cell = "1.2.0" tempfile = "3.20.0" varint-rs = "2.2.0" xxhash-rust = { version = "0.8.15", features = ["xxh3"] } -tft = { git = "https://github.com/marvin-j97/tft" } +sfa = "~0.0.1" [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } diff --git a/src/error.rs b/src/error.rs index 5d1fbde0..eeab1620 100644 --- a/src/error.rs +++ b/src/error.rs @@ -60,22 +60,26 @@ impl std::error::Error for Error { } } -impl From for Error { - fn from(value: tft::Error) -> Self { +impl From for Error { + fn from(value: sfa::Error) -> Self { match value { - tft::Error::Io(e) => Self::from(e), - tft::Error::ChecksumMismatch { got, expected } => { + sfa::Error::Io(e) => Self::from(e), + sfa::Error::ChecksumMismatch { got, expected } => { log::error!("Archive ToC checksum mismatch"); Self::ChecksumMismatch { got: got.into(), expected: expected.into(), } } - tft::Error::InvalidVersion => { + sfa::Error::InvalidHeader => { + log::error!("Invalid archive header"); + Self::Unrecoverable + } + sfa::Error::InvalidVersion => { log::error!("Invalid archive version"); Self::Unrecoverable } - 
tft::Error::UnsupportedChecksumType => { + sfa::Error::UnsupportedChecksumType => { log::error!("Invalid archive checksum type"); Self::Unrecoverable } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index b1496b4f..cbf43fd9 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -157,7 +157,7 @@ impl LevelManifest { version_file_path.display(), ); - let reader = tft::Reader::new(&version_file_path)?; + let reader = sfa::Reader::new(&version_file_path)?; let toc = reader.toc(); // // TODO: vvv move into Version::decode vvv @@ -273,12 +273,12 @@ impl LevelManifest { let path = folder.join(format!("v{}", version.id())); let file = std::fs::File::create_new(path)?; let writer = BufWriter::new(file); - let mut writer = tft::Writer::into_writer(writer); + let mut writer = sfa::Writer::into_writer(writer); version.encode_into(&mut writer)?; writer.finish().map_err(|e| match e { - tft::Error::Io(e) => crate::Error::from(e), + sfa::Error::Io(e) => crate::Error::from(e), _ => unreachable!(), })?; diff --git a/src/segment/block/checksum.rs b/src/segment/block/checksum.rs index ae987413..b919b48d 100644 --- a/src/segment/block/checksum.rs +++ b/src/segment/block/checksum.rs @@ -14,8 +14,8 @@ impl std::ops::Deref for Checksum { } } -impl From for Checksum { - fn from(value: tft::Checksum) -> Self { +impl From for Checksum { + fn from(value: sfa::Checksum) -> Self { Self(value.into_u128()) } } diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 878d2c42..74fb7c5d 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -63,8 +63,6 @@ impl<'a> Trailer<'a> { // IMPORTANT: Terminator marker encoder.writer.write_u8(TRAILER_START_MARKER)?; - // TODO: version u8? -> add to segment metadata instead - // NOTE: We know that data blocks will never even approach 4 GB in size #[allow(clippy::cast_possible_truncation)] let binary_index_offset = encoder.writer.len() as u32; diff --git a/src/segment/mod.rs b/src/segment/mod.rs index cb042621..361e9127 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -381,7 +381,7 @@ impl Segment { log::debug!("Recovering segment from file {}", file_path.display()); let mut file = std::fs::File::open(&file_path)?; - let trailer = tft::Reader::from_reader(&mut file)?; + let trailer = sfa::Reader::from_reader(&mut file)?; let regions = ParsedRegions::parse_from_toc(trailer.toc())?; log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); diff --git a/src/segment/regions.rs b/src/segment/regions.rs index efbcb6e4..9dff8db9 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{BlockHandle, BlockOffset}; -use tft::TocEntry; +use sfa::TocEntry; fn toc_entry_to_handle(entry: &TocEntry) -> BlockHandle { BlockHandle::new(BlockOffset(entry.pos()), entry.len() as u32) @@ -37,7 +37,7 @@ pub struct ParsedRegions { } impl ParsedRegions { - pub fn parse_from_toc(toc: &tft::Toc) -> crate::Result { + pub fn parse_from_toc(toc: &sfa::Toc) -> crate::Result { Ok(Self { tli: toc .section(b"tli") diff --git a/src/segment/writer/index.rs b/src/segment/writer/index.rs index ab455a6d..361323bd 100644 --- a/src/segment/writer/index.rs +++ b/src/segment/writer/index.rs @@ -14,7 +14,7 @@ pub trait BlockIndexWriter { /// Writes the block index to a file. /// /// Returns the (optional) index blocks handle and the TLI handle. 
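Taken together with the level manifest changes above, the version file written by Version::encode_into and read back by LevelManifest::recover is now a small sfa archive. A sketch of its layout, inferred from the encode/decode code in these diffs (integer widths follow the byteorder calls; all integers are little-endian):

v{id} file
|-- section "tables":        u8 level count; per level: u8 run count;
|                            per run: u32 segment count, then that many u64 segment IDs
|-- section "blob_files":    u32 blob file count, then that many u64 blob file IDs
|-- section "blob_gc_stats": placeholder bytes (b":)") until 3.0.0
`-- ToC + trailer:           appended by the sfa writer's finish()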
- fn finish(&mut self, block_file_writer: &mut tft::Writer) -> crate::Result<()>; + fn finish(&mut self, block_file_writer: &mut sfa::Writer) -> crate::Result<()>; fn set_compression(&mut self, compression: CompressionType); @@ -57,7 +57,7 @@ impl BlockIndexWriter for FullIndexWriter Ok(()) } - fn finish(&mut self, block_file_writer: &mut tft::Writer) -> crate::Result<()> { + fn finish(&mut self, block_file_writer: &mut sfa::Writer) -> crate::Result<()> { block_file_writer.start("tli")?; let mut bytes = vec![]; diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 94655fd7..68fb420b 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -38,7 +38,7 @@ pub struct Writer { /// Writer of data blocks #[allow(clippy::struct_field_names)] - block_writer: tft::Writer, + block_writer: sfa::Writer, /// Writer of index blocks #[allow(clippy::struct_field_names)] @@ -67,7 +67,7 @@ impl Writer { pub fn new(path: PathBuf, segment_id: SegmentId) -> crate::Result { let block_writer = File::create_new(&path)?; let block_writer = BufWriter::with_capacity(u16::MAX.into(), block_writer); - let mut block_writer = tft::Writer::into_writer(block_writer); + let mut block_writer = sfa::Writer::into_writer(block_writer); block_writer.start("data")?; Ok(Self { diff --git a/src/version/mod.rs b/src/version/mod.rs index 90a1fa84..ade0664d 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -425,7 +425,7 @@ impl Version { } impl Version { - pub(crate) fn encode_into(&self, writer: &mut tft::Writer) -> Result<(), crate::EncodeError> { + pub(crate) fn encode_into(&self, writer: &mut sfa::Writer) -> Result<(), crate::EncodeError> { use byteorder::{LittleEndian, WriteBytesExt}; writer.start("tables")?; From b256824f721849b199e679525b243f4e3d96c54d Mon Sep 17 00:00:00 2001 From: zaidoon Date: Sat, 27 Sep 2025 17:07:25 -0400 Subject: [PATCH 473/613] update drop_range API to use RangeBounds --- src/abstract.rs | 7 +- src/blob_tree/mod.rs | 4 +- src/compaction/drop_range.rs | 43 +++++++- src/error.rs | 6 +- src/tree/mod.rs | 41 +++++++- tests/tree_drop_range.rs | 193 +++++++++++++++++++++++++++++++---- 6 files changed, 259 insertions(+), 35 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index d9685605..b2fe6134 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -78,10 +78,13 @@ pub trait AbstractTree { /// Drops segments that are fully contained in a given range. /// + /// Both range bounds must be inclusive and finite. + /// /// # Errors /// - /// Will return `Err` if an IO error occurs. - fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()>; + /// Will return `Err` if an IO error occurs or if the provided bounds are + /// not supported. + fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()>; /// Performs major compaction, blocking the caller until it's done. 
/// diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index d9d525aa..8c0db395 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -432,8 +432,8 @@ impl AbstractTree for BlobTree { self.index.tombstone_count() } - fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { - self.index.drop_range(key_range) + fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()> { + self.index.drop_range(range) } fn ingest( diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs index 3e71c8a0..615651a6 100644 --- a/src/compaction/drop_range.rs +++ b/src/compaction/drop_range.rs @@ -3,12 +3,42 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{config::Config, level_manifest::LevelManifest, KeyRange}; +use crate::{ + config::Config, level_manifest::LevelManifest, slice::Slice, version::run::Ranged, KeyRange, +}; use crate::{HashSet, Segment}; +use std::ops::Bound; + +#[derive(Clone, Debug)] +pub struct OwnedBounds { + pub start: Bound, + pub end: Bound, +} + +impl OwnedBounds { + #[must_use] + pub fn contains(&self, range: &KeyRange) -> bool { + let lower_ok = match &self.start { + Bound::Unbounded => true, + Bound::Included(key) => key.as_ref() <= range.min().as_ref(), + Bound::Excluded(key) => key.as_ref() < range.min().as_ref(), + }; + + if !lower_ok { + return false; + } + + match &self.end { + Bound::Unbounded => true, + Bound::Included(key) => key.as_ref() >= range.max().as_ref(), + Bound::Excluded(key) => key.as_ref() > range.max().as_ref(), + } + } +} /// Drops all segments that are **contained** in a key range pub struct Strategy { - key_range: KeyRange, + bounds: OwnedBounds, } impl Strategy { @@ -19,8 +49,8 @@ impl Strategy { /// Panics, if `target_size` is below 1024 bytes. #[must_use] #[allow(dead_code)] - pub fn new(key_range: KeyRange) -> Self { - Self { key_range } + pub fn new(bounds: OwnedBounds) -> Self { + Self { bounds } } } @@ -34,7 +64,10 @@ impl CompactionStrategy for Strategy { .current_version() .iter_levels() .flat_map(|lvl| lvl.iter()) - .flat_map(|run| run.get_contained(&self.key_range)) + .flat_map(|run| { + run.iter() + .filter(|segment| self.bounds.contains(segment.key_range())) + }) .map(Segment::id) .collect(); diff --git a/src/error.rs b/src/error.rs index eeab1620..62a15c26 100644 --- a/src/error.rs +++ b/src/error.rs @@ -38,6 +38,9 @@ pub enum Error { /// Checksum that was saved in block header expected: Checksum, }, + + /// Provided range bounds are not supported by the requested operation + InvalidRangeBounds, } impl std::fmt::Display for Error { @@ -55,7 +58,8 @@ impl std::error::Error for Error { Self::Decompress(_) | Self::InvalidVersion(_) | Self::Unrecoverable - | Self::ChecksumMismatch { .. } => None, + | Self::ChecksumMismatch { .. 
} + | Self::InvalidRangeBounds => None, } } } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index ec583088..0f55d57a 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -7,7 +7,7 @@ pub mod inner; use crate::{ coding::{Decode, Encode}, - compaction::CompactionStrategy, + compaction::{drop_range::OwnedBounds, CompactionStrategy}, config::Config, file::BLOBS_FOLDER, format_version::FormatVersion, @@ -16,6 +16,7 @@ use crate::{ manifest::Manifest, memtable::Memtable, segment::Segment, + slice::Slice, value::InternalValue, vlog::BlobFile, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, UserKey, @@ -24,7 +25,7 @@ use crate::{ use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ io::Cursor, - ops::RangeBounds, + ops::{Bound, RangeBounds}, path::Path, sync::{atomic::AtomicU64, Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; @@ -168,9 +169,9 @@ impl AbstractTree for Tree { Ok(()) } - // TODO: change API to RangeBounds - fn drop_range(&self, key_range: crate::KeyRange) -> crate::Result<()> { - let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(key_range)); + fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()> { + let bounds = Self::range_bounds_to_owned_bounds(&range)?; + let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(bounds)); // IMPORTANT: Write lock so we can be the only compaction going on let _lock = self @@ -549,6 +550,36 @@ impl AbstractTree for Tree { } impl Tree { + fn range_bounds_to_owned_bounds, R: RangeBounds>( + range: &R, + ) -> crate::Result { + use Bound::{Excluded, Included, Unbounded}; + + let start = match range.start_bound() { + Included(key) => Included(Slice::from(key.as_ref())), + Excluded(key) => Excluded(Slice::from(key.as_ref())), + Unbounded => Unbounded, + }; + + let end = match range.end_bound() { + Included(key) => Included(Slice::from(key.as_ref())), + Excluded(key) => Excluded(Slice::from(key.as_ref())), + Unbounded => Unbounded, + }; + + if let (Included(lo), Included(hi)) + | (Included(lo), Excluded(hi)) + | (Excluded(lo), Included(hi)) + | (Excluded(lo), Excluded(hi)) = (&start, &end) + { + if lo.as_ref() > hi.as_ref() { + return Err(crate::Error::InvalidRangeBounds); + } + } + + Ok(OwnedBounds { start, end }) + } + /// Opens an LSM-tree in the given directory. 
/// /// Will recover previous state if the folder was previously diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs index a4f295b6..efe4d17e 100644 --- a/tests/tree_drop_range.rs +++ b/tests/tree_drop_range.rs @@ -1,32 +1,185 @@ -use lsm_tree::{AbstractTree, Config, KeyRange, SeqNo, UserKey}; -use test_log::test; +use lsm_tree::{AbstractTree, Config, Error, SeqNo, Tree}; +use std::ops::Bound::{Excluded, Included, Unbounded}; + +fn populate_segments(tree: &Tree) -> lsm_tree::Result<()> { + for key in 'a'..='e' { + tree.insert([key as u8], "", 0); + tree.flush_active_memtable(0)?; + } + Ok(()) +} #[test] -fn tree_drop_range() -> lsm_tree::Result<()> { +fn tree_drop_range_basic() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; - { - let tree = Config::new(&folder).open()?; + populate_segments(&tree)?; - for key in 'a'..='e' { - tree.insert([key as u8], "", 0); - tree.flush_active_memtable(0)?; - } + assert_eq!(1, tree.l0_run_count()); + assert_eq!(5, tree.segment_count()); - assert_eq!(1, tree.l0_run_count()); - assert_eq!(5, tree.segment_count()); + tree.drop_range("a"..="c")?; - tree.drop_range(KeyRange::new((UserKey::from("a"), UserKey::from("c"))))?; + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + assert!(tree.contains_key("e", SeqNo::MAX)?); - assert!(!tree.contains_key("a", SeqNo::MAX)?); - assert!(!tree.contains_key("b", SeqNo::MAX)?); - assert!(!tree.contains_key("c", SeqNo::MAX)?); - assert!(tree.contains_key("d", SeqNo::MAX)?); - assert!(tree.contains_key("e", SeqNo::MAX)?); + assert_eq!(1, tree.l0_run_count()); + assert_eq!(2, tree.segment_count()); - assert_eq!(1, tree.l0_run_count()); - assert_eq!(2, tree.segment_count()); - } + Ok(()) +} + +#[test] +fn tree_drop_range_upper_exclusive() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range("a".."d")?; + + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + assert!(tree.contains_key("e", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_lower_exclusive() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range((Excluded("a"), Included("c")))?; + + assert!(tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_unbounded_lower_inclusive_upper() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range((Unbounded, Included("c")))?; + + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + assert!(tree.contains_key("e", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_unbounded_lower_exclusive_upper() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + 
tree.drop_range((Unbounded, Excluded("d")))?; + + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(tree.contains_key("d", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_exclusive_empty_interval() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range((Excluded("b"), Excluded("b")))?; + + assert!(tree.contains_key("a", SeqNo::MAX)?); + assert!(tree.contains_key("b", SeqNo::MAX)?); + assert!(tree.contains_key("c", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_empty_tree() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + tree.drop_range("a"..="c")?; + + assert_eq!(0, tree.l0_run_count()); + assert_eq!(0, tree.segment_count()); + + Ok(()) +} + +#[test] +fn tree_drop_range_unbounded_upper() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range("c"..)?; + + assert!(tree.contains_key("a", SeqNo::MAX)?); + assert!(tree.contains_key("b", SeqNo::MAX)?); + assert!(!tree.contains_key("c", SeqNo::MAX)?); + assert!(!tree.contains_key("d", SeqNo::MAX)?); + assert!(!tree.contains_key("e", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_clear_all() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + populate_segments(&tree)?; + + tree.drop_range::<&str, _>(..)?; + + assert_eq!(0, tree.l0_run_count()); + assert_eq!(0, tree.segment_count()); + assert!(!tree.contains_key("a", SeqNo::MAX)?); + assert!(!tree.contains_key("e", SeqNo::MAX)?); + + Ok(()) +} + +#[test] +fn tree_drop_range_invalid_bounds() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + assert!(matches!( + tree.drop_range("c"..="a"), + Err(Error::InvalidRangeBounds) + )); + assert!(matches!( + tree.drop_range("c".."a"), + Err(Error::InvalidRangeBounds) + )); Ok(()) } From 92396cdbaa52bf75bb2ed1579cc480be8a458734 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 28 Sep 2025 17:22:47 +0200 Subject: [PATCH 474/613] impl blob fragmentation stats in Version etc. 
this is a lot --- src/abstract.rs | 11 +- src/blob_tree/gc.rs | 221 ++++++ src/blob_tree/gc/mod.rs | 6 - src/blob_tree/gc/reader.rs | 53 -- src/blob_tree/gc/writer.rs | 56 -- src/blob_tree/index.rs | 40 -- src/blob_tree/mod.rs | 302 +++------ src/blob_tree/value.rs | 2 +- src/config/mod.rs | 1 + src/level_manifest/mod.rs | 25 +- src/lib.rs | 76 +-- src/segment/mod.rs | 37 +- src/segment/regions.rs | 6 +- src/segment/writer/mod.rs | 34 +- src/tree/ingest.rs | 4 +- src/tree/mod.rs | 19 +- src/version/mod.rs | 99 ++- src/vlog/accessor.rs | 52 +- src/vlog/blob_file/gc_stats.rs | 33 - src/vlog/blob_file/merge.rs | 103 ++- src/vlog/blob_file/meta.rs | 2 +- src/vlog/blob_file/mod.rs | 78 +-- src/vlog/blob_file/multi_writer.rs | 13 +- src/vlog/blob_file/reader.rs | 124 +--- src/vlog/blob_file/scanner.rs | 157 +++++ src/vlog/gc/mod.rs | 127 ---- src/vlog/gc/report.rs | 78 --- src/vlog/index.rs | 47 -- src/vlog/manifest.rs | 445 ------------ src/vlog/mod.rs | 22 +- src/vlog/scanner.rs | 19 +- src/vlog/value_log.rs | 631 ------------------ ...ter_flush.rs => blob_drop_after_flush._rs} | 0 tests/blob_flush_gc_stats.rs | 59 ++ tests/{blob_gc.rs => blob_gc._rs} | 0 ..._gc_watermark.rs => blob_gc_watermark._rs} | 0 tests/blob_major_compact_gc_stats.rs | 106 +++ tests/blob_recover_gc_stats.rs | 77 +++ tests/blob_simple.rs | 6 +- .../{blob_tombstone.rs => blob_tombstone._rs} | 0 ...blob_tree_flush.rs => blob_tree_flush._rs} | 0 tests/tree_sealed_shadowing.rs | 4 +- tests/tree_seqno.rs | 4 +- 43 files changed, 1079 insertions(+), 2100 deletions(-) create mode 100644 src/blob_tree/gc.rs delete mode 100644 src/blob_tree/gc/mod.rs delete mode 100644 src/blob_tree/gc/reader.rs delete mode 100644 src/blob_tree/gc/writer.rs delete mode 100644 src/blob_tree/index.rs delete mode 100644 src/vlog/blob_file/gc_stats.rs create mode 100644 src/vlog/blob_file/scanner.rs delete mode 100644 src/vlog/gc/mod.rs delete mode 100644 src/vlog/gc/report.rs delete mode 100644 src/vlog/index.rs delete mode 100644 src/vlog/manifest.rs delete mode 100644 src/vlog/value_log.rs rename tests/{blob_drop_after_flush.rs => blob_drop_after_flush._rs} (100%) create mode 100644 tests/blob_flush_gc_stats.rs rename tests/{blob_gc.rs => blob_gc._rs} (100%) rename tests/{blob_gc_watermark.rs => blob_gc_watermark._rs} (100%) create mode 100644 tests/blob_major_compact_gc_stats.rs create mode 100644 tests/blob_recover_gc_stats.rs rename tests/{blob_tombstone.rs => blob_tombstone._rs} (100%) rename tests/{blob_tree_flush.rs => blob_tree_flush._rs} (100%) diff --git a/src/abstract.rs b/src/abstract.rs index d9685605..0a3e775a 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -3,9 +3,10 @@ // (found in the LICENSE-* files in the repository) use crate::{ - compaction::CompactionStrategy, config::TreeType, iter_guard::IterGuardImpl, segment::Segment, - tree::inner::MemtableId, vlog::BlobFile, AnyTree, BlobTree, Config, Guard, KvPair, Memtable, - SegmentId, SeqNo, SequenceNumberCounter, Tree, UserKey, UserValue, + blob_tree::FragmentationMap, compaction::CompactionStrategy, config::TreeType, + iter_guard::IterGuardImpl, segment::Segment, tree::inner::MemtableId, vlog::BlobFile, AnyTree, + BlobTree, Config, Guard, KvPair, Memtable, SegmentId, SeqNo, SequenceNumberCounter, Tree, + UserKey, UserValue, }; use enum_dispatch::enum_dispatch; use std::{ @@ -120,12 +121,13 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. 
+ #[warn(clippy::type_complexity)] fn flush_memtable( &self, segment_id: SegmentId, // TODO: remove? memtable: &Arc, seqno_threshold: SeqNo, - ) -> crate::Result)>>; + ) -> crate::Result, Option)>>; /// Atomically registers flushed disk segments into the tree, removing their associated sealed memtables. /// @@ -136,6 +138,7 @@ pub trait AbstractTree { &self, segments: &[Segment], blob_files: Option<&[BlobFile]>, + frag_map: Option, seqno_threshold: SeqNo, ) -> crate::Result<()>; diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs new file mode 100644 index 00000000..b0d9ecc2 --- /dev/null +++ b/src/blob_tree/gc.rs @@ -0,0 +1,221 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use crate::{ + blob_tree::value::{MaybeInlineValue, TAG_INDIRECT}, + compaction::stream::ExpiredKvCallback, + vlog::BlobFileId, +}; + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct FragmentationEntry { + /// Number of unreferenced (garbage) blobs + pub(crate) len: usize, + + /// Unreferenced (garbage) blob bytes that could be freed + pub(crate) bytes: u64, +} + +impl FragmentationEntry { + #[must_use] + pub fn new(len: usize, bytes: u64) -> Self { + Self { len, bytes } + } +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct FragmentationMap(crate::HashMap); + +impl std::ops::Deref for FragmentationMap { + type Target = crate::HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl FragmentationMap { + // TODO: unit test + pub fn merge_into(self, other: &mut Self) { + for (blob_file_id, diff) in self.0 { + other + .0 + .entry(blob_file_id) + .and_modify(|counter| { + counter.bytes += diff.bytes; + counter.len += diff.len; + }) + .or_insert(diff); + } + } +} + +impl crate::coding::Encode for FragmentationMap { + fn encode_into(&self, writer: &mut W) -> Result<(), crate::EncodeError> { + use byteorder::{WriteBytesExt, LE}; + + // NOTE: We know there are always less than 4 billion blob files + #[allow(clippy::cast_possible_truncation)] + writer.write_u32::(self.len() as u32)?; + + for (blob_file_id, item) in self.iter() { + writer.write_u64::(*blob_file_id)?; + + // NOTE: We know there are always less than 4 billion blobs in a blob file + #[allow(clippy::cast_possible_truncation)] + writer.write_u32::(item.len as u32)?; + + writer.write_u64::(item.bytes)?; + } + + Ok(()) + } +} + +impl crate::coding::Decode for FragmentationMap { + fn decode_from(reader: &mut R) -> Result + where + Self: Sized, + { + use byteorder::{ReadBytesExt, LE}; + + let len = reader.read_u32::()?; + let mut map = + crate::HashMap::with_capacity_and_hasher(len as usize, rustc_hash::FxBuildHasher); + + for _ in 0..len { + let id = reader.read_u64::()?; + + // NOTE: We know there are always less than 4 billion blobs in a blob file + #[allow(clippy::cast_possible_truncation)] + let len = reader.read_u32::()? 
as usize; + + let bytes = reader.read_u64::()?; + + map.insert(id, FragmentationEntry::new(len, bytes)); + } + + Ok(Self(map)) + } +} + +impl ExpiredKvCallback for FragmentationMap { + fn on_expired(&mut self, kv: &crate::InternalValue) { + if kv.key.is_tombstone() { + return; + } + + let tag = *kv.value.first().expect("value should not be empty"); + + if tag == TAG_INDIRECT { + let parsed_indirection = + MaybeInlineValue::from_slice(&kv.value).expect("should parse MaybeInlineValue"); + + match parsed_indirection { + MaybeInlineValue::Indirect { vhandle, size } => { + let size = u64::from(size); + + self.0 + .entry(vhandle.blob_file_id) + .and_modify(|counter| { + counter.len += 1; + counter.bytes += size; + }) + .or_insert_with(|| FragmentationEntry { + bytes: size, + len: 1, + }); + } + // NOTE: Unreachable because we check for the tag above + MaybeInlineValue::Inline(_) => unreachable!(), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + coding::{Decode, Encode}, + compaction::stream::CompactionStream, + value::{InternalValue, ValueType}, + vlog::ValueHandle, + }; + use std::collections::HashMap; + use test_log::test; + + /// Tests encoding and decoding traits + #[test] + fn frag_map_roundtrip() { + let map = FragmentationMap({ + let mut map = HashMap::default(); + map.insert( + 0, + FragmentationEntry { + len: 1, + bytes: 1_000, + }, + ); + map.insert( + 1, + FragmentationEntry { + len: 2, + bytes: 2_000, + }, + ); + map + }); + + let encoded = map.encode_into_vec(); + let decoded = FragmentationMap::decode_from(&mut &encoded[..]).unwrap(); + assert_eq!(map, decoded); + } + + #[test] + #[allow(clippy::unwrap_used)] + fn compaction_stream_gc_count_drops() -> crate::Result<()> { + #[rustfmt::skip] + let vec = &[ + InternalValue::from_components("a", b"abc", 1, ValueType::Value), + + InternalValue::from_components("a", MaybeInlineValue::Indirect { + size: 1000, + vhandle: ValueHandle { + blob_file_id: 0, + on_disk_size: 500, + offset: 0, + } + }.encode_into_vec(), 0, ValueType::Value), + ]; + + let mut my_watcher = FragmentationMap::default(); + + let iter = vec.iter().cloned().map(Ok); + let mut iter = CompactionStream::new(iter, 1_000).with_expiration_callback(&mut my_watcher); + + assert_eq!( + // Seqno is reset to 0 + InternalValue::from_components(*b"a", b"abc", 0, ValueType::Value), + iter.next().unwrap()?, + ); + + assert_eq!( + { + let mut map = HashMap::default(); + map.insert( + 0, + FragmentationEntry { + len: 1, + bytes: 1_000, + }, + ); + map + }, + my_watcher.0, + ); + + Ok(()) + } +} diff --git a/src/blob_tree/gc/mod.rs b/src/blob_tree/gc/mod.rs deleted file mode 100644 index e0f13f06..00000000 --- a/src/blob_tree/gc/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -pub mod reader; -pub mod writer; diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs deleted file mode 100644 index 25e4f612..00000000 --- a/src/blob_tree/gc/reader.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::vlog::ValueHandle; -use crate::{blob_tree::value::MaybeInlineValue, coding::Decode, Memtable, SeqNo}; -use std::io::Cursor; - -#[allow(clippy::module_name_repetitions)] -pub struct GcReader<'a> { - tree: &'a crate::Tree, - memtable: &'a 
Memtable, -} - -impl<'a> GcReader<'a> { - pub fn new(tree: &'a crate::Tree, memtable: &'a Memtable) -> Self { - Self { tree, memtable } - } - - fn get_internal(&self, key: &[u8]) -> crate::Result> { - let Some(item) = self - .tree - .get_internal_entry_with_memtable(self.memtable, key, SeqNo::MAX)? - .map(|x| x.value) - else { - return Ok(None); - }; - - let mut cursor = Cursor::new(item); - let item = MaybeInlineValue::decode_from(&mut cursor)?; - - Ok(Some(item)) - } -} - -impl crate::vlog::IndexReader for GcReader<'_> { - fn get(&self, key: &[u8]) -> std::io::Result> { - use std::io::Error as IoError; - use MaybeInlineValue::{Indirect, Inline}; - - let Some(item) = self - .get_internal(key) - .map_err(|e| IoError::other(e.to_string()))? - else { - return Ok(None); - }; - - match item { - Inline(_) => Ok(None), - Indirect { vhandle, .. } => Ok(Some(vhandle)), - } - } -} diff --git a/src/blob_tree/gc/writer.rs b/src/blob_tree/gc/writer.rs deleted file mode 100644 index 6d04e5b0..00000000 --- a/src/blob_tree/gc/writer.rs +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::vlog::ValueHandle; -use crate::{ - blob_tree::value::MaybeInlineValue, coding::Encode, value::InternalValue, Memtable, SeqNo, - UserKey, -}; - -#[allow(clippy::module_name_repetitions)] -pub struct GcWriter<'a> { - seqno: SeqNo, - buffer: Vec<(UserKey, ValueHandle, u32)>, - memtable: &'a Memtable, -} - -impl<'a> GcWriter<'a> { - pub fn new(seqno: SeqNo, memtable: &'a Memtable) -> Self { - Self { - seqno, - memtable, - buffer: Vec::with_capacity(100), - } - } -} - -impl crate::vlog::IndexWriter for GcWriter<'_> { - fn insert_indirect( - &mut self, - key: &[u8], - vhandle: ValueHandle, - size: u32, - ) -> std::io::Result<()> { - self.buffer.push((key.into(), vhandle, size)); - Ok(()) - } - - fn finish(&mut self) -> std::io::Result<()> { - log::trace!("Finish blob GC index writer"); - - #[allow(clippy::significant_drop_in_scrutinee)] - for (key, vhandle, size) in self.buffer.drain(..) { - let buf = MaybeInlineValue::Indirect { vhandle, size }.encode_into_vec(); - - self.memtable.insert(InternalValue::from_components( - key, - buf, - self.seqno, - crate::ValueType::Value, - )); - } - - Ok(()) - } -} diff --git a/src/blob_tree/index.rs b/src/blob_tree/index.rs deleted file mode 100644 index 117ec144..00000000 --- a/src/blob_tree/index.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::value::MaybeInlineValue; -use crate::{AbstractTree, SeqNo, Tree as LsmTree}; - -#[allow(clippy::module_name_repetitions)] -#[derive(Clone)] -pub struct IndexTree(#[doc(hidden)] pub LsmTree); - -impl std::ops::Deref for IndexTree { - type Target = LsmTree; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl IndexTree { - pub(crate) fn get_vhandle( - &self, - key: &[u8], - seqno: SeqNo, - ) -> crate::Result> { - let Some(item) = self.get(key, seqno)? 
else { - return Ok(None); - }; - - let item = MaybeInlineValue::from_slice(&item)?; - - Ok(Some(item)) - } -} - -impl From for IndexTree { - fn from(value: LsmTree) -> Self { - Self(value) - } -} diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index d9d525aa..284bff62 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -3,9 +3,11 @@ // (found in the LICENSE-* files in the repository) mod gc; -pub mod index; pub mod value; +#[doc(hidden)] +pub use gc::{FragmentationEntry, FragmentationMap}; + use crate::{ coding::{Decode, Encode}, compaction::stream::CompactionStream, @@ -15,38 +17,27 @@ use crate::{ segment::Segment, tree::inner::MemtableId, value::InternalValue, - vlog::{Accessor, BlobFile, BlobFileId, BlobFileWriter, ValueHandle, ValueLog}, + vlog::{Accessor, BlobFile, BlobFileId, BlobFileWriter, ValueHandle}, Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; -use gc::{reader::GcReader, writer::GcWriter}; -use index::IndexTree; -use std::{ - collections::BTreeMap, - io::Cursor, - ops::{RangeBounds, RangeFull}, - path::PathBuf, - sync::{ - atomic::{AtomicU64, AtomicUsize}, - Arc, - }, -}; +use std::{collections::BTreeMap, io::Cursor, ops::RangeBounds, path::PathBuf, sync::Arc}; use value::MaybeInlineValue; -pub struct Guard<'a>( - &'a BlobTree, - Arc>, - crate::Result<(UserKey, UserValue)>, -); +pub struct Guard<'a> { + blob_tree: &'a BlobTree, + vlog: Arc>, + kv: crate::Result<(UserKey, UserValue)>, +} impl IterGuard for Guard<'_> { fn key(self) -> crate::Result { - self.2.map(|(k, _)| k) + self.kv.map(|(k, _)| k) } fn size(self) -> crate::Result { use MaybeInlineValue::{Indirect, Inline}; - let value = self.2?.1; + let (_, value) = self.kv?; let mut cursor = Cursor::new(value); Ok(match MaybeInlineValue::decode_from(&mut cursor)? { @@ -60,7 +51,7 @@ impl IterGuard for Guard<'_> { } fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { - resolve_value_handle(self.0, &self.1, self.2) + resolve_value_handle(self.blob_tree, &self.vlog, self.kv) } } @@ -108,28 +99,16 @@ fn resolve_value_handle( pub struct BlobTree { /// Index tree that holds value handles or small inline values #[doc(hidden)] - pub index: IndexTree, + pub index: crate::Tree, blobs_folder: PathBuf, - // TODO: maybe replace this with a nonce system - #[doc(hidden)] - pub pending_segments: Arc, - blob_file_id_generator: SequenceNumberCounter, } impl BlobTree { pub(crate) fn open(config: Config) -> crate::Result { - // let path = &config.path; - - // let vlog_path = path.join(BLOBS_FOLDER); - // let vlog_cfg = - // crate::vlog::Config::new(config.cache.clone(), config.descriptor_table.clone()) - // .blob_file_size_bytes(config.blob_file_target_size) - // .compression(config.blob_compression); - - let index: IndexTree = config.open()?.into(); + let index = config.open()?; let blobs_folder = index.config.path.join(BLOBS_FOLDER); std::fs::create_dir_all(&blobs_folder)?; @@ -150,13 +129,23 @@ impl BlobTree { Ok(Self { index, blobs_folder, - pending_segments: Arc::new(AtomicUsize::new(0)), blob_file_id_generator: SequenceNumberCounter::new(blob_file_id_to_continue_with), }) } + fn get_vhandle(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { + let Some(item) = self.index.get(key, seqno)? 
else { + return Ok(None); + }; + + let item = MaybeInlineValue::from_slice(&item)?; + + Ok(Some(item)) + } + #[must_use] pub fn space_amp(&self) -> f32 { + // TODO: calculate using current version FragmentationMap todo!() } @@ -170,7 +159,7 @@ impl BlobTree { /// /// Will return `Err` if an IO error occurs. fn consume_blob_file_writer(writer: BlobFileWriter) -> crate::Result> { - use crate::vlog::blob_file::{GcStats, Inner as BlobFileInner, Metadata}; + use crate::vlog::blob_file::{Inner as BlobFileInner, Metadata}; let writers = writer.finish()?; @@ -215,7 +204,7 @@ impl BlobTree { .expect("should have written at least 1 item"), )), }, - gc_stats: GcStats::default(), + // gc_stats: GcStats::default(), }))); log::debug!( @@ -228,136 +217,13 @@ impl BlobTree { Ok(blob_files) } - /// Scans the index tree, collecting statistics about value log fragmentation. - #[doc(hidden)] - pub fn gc_scan_stats( - &self, - seqno: SeqNo, - gc_watermark: SeqNo, - ) -> crate::Result { - use std::io::Error as IoError; - use MaybeInlineValue::{Indirect, Inline}; - - todo!() - - // while self - // .pending_segments - // .load(std::sync::atomic::Ordering::Acquire) - // > 0 - // { - // // IMPORTANT: Busy wait until all segments in-flight are committed - // // to the tree - // } - - // // IMPORTANT: Lock + snapshot memtable to avoid read skew + preventing tampering with memtable - // let _memtable_lock = self.index.read_lock_active_memtable(); - - // while self - // .pending_segments - // .load(std::sync::atomic::Ordering::Acquire) - // > 0 - // { - // // IMPORTANT: Busy wait again until all segments in-flight are committed - // // to the tree - // } - - // let iter = self - // .index - // .create_internal_range::<&[u8], RangeFull>(&.., seqno, None); - - // // Stores the max seqno of every blob file - // let mut seqno_map = crate::HashMap::::default(); - - // let result = self.blobs.scan_for_stats(iter.filter_map(|kv| { - // let Ok(kv) = kv else { - // return Some(Err(IoError::other( - // "Failed to load KV pair from index tree", - // ))); - // }; - - // let mut cursor = Cursor::new(kv.value); - // let value = match MaybeInlineValue::decode_from(&mut cursor) { - // Ok(v) => v, - // Err(e) => return Some(Err(IoError::other(e.to_string()))), - // }; - - // match value { - // Indirect { vhandle, size } => { - // seqno_map - // .entry(vhandle.blob_file_id) - // .and_modify(|x| *x = (*x).max(kv.key.seqno)) - // .or_insert(kv.key.seqno); - - // Some(Ok((vhandle, size))) - // } - // Inline(_) => None, - // } - // })); - - // // TODO: - - // // let mut lock = self - // // .blobs - // // .manifest - // // .blob_files - // // .write() - // // .expect("lock is poisoned"); - - // // // IMPORTANT: We are overwiting the staleness of blob files - // // // that contain an item that is still contained in the GC watermark - // // // so snapshots cannot accidentally lose data - // // // - // // // TODO: 3.0.0 this should be dealt with in value-log 2.0 (make it MVCC aware) - // // for (blob_file_id, max_seqno) in seqno_map { - // // if gc_watermark <= max_seqno { - // // if let Some(blob_file) = lock.get_mut(&blob_file_id) { - // // blob_file.gc_stats.set_stale_items(0); - // // blob_file.gc_stats.set_stale_bytes(0); - // // } - // // } - // // } - - // result - } - - pub fn apply_gc_strategy( - &self, - strategy: &impl crate::vlog::GcStrategy, - seqno: SeqNo, - ) -> crate::Result { - todo!() - - // // IMPORTANT: Write lock memtable to avoid read skew - // let memtable_lock = self.index.lock_active_memtable(); - - // 
self.blobs.apply_gc_strategy( // strategy, // &GcReader::new(&self.index, &memtable_lock), // GcWriter::new(seqno, &memtable_lock), // )?; - - // // NOTE: We still have the memtable lock, can't use gc_drop_stale because recursive locking - // self.blobs.drop_stale_blob_files() - } - - /// Drops all stale blob segment files - #[doc(hidden)] - pub fn gc_drop_stale(&self) -> crate::Result { - todo!() - - // // IMPORTANT: Write lock memtable to avoid read skew - // let _lock = self.index.lock_active_memtable(); - - // self.blobs.drop_stale_blob_files() - } - #[doc(hidden)] pub fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result> { let Some((segment_id, yanked_memtable)) = self.index.rotate_memtable() else { return Ok(None); }; - let Some((segment, blob_file)) = + let Some((segment, blob_file, frag_map)) = self.flush_memtable(segment_id, &yanked_memtable, eviction_seqno)? else { return Ok(None); @@ -365,6 +231,7 @@ impl BlobTree { self.register_segments( std::slice::from_ref(&segment), blob_file.as_ref().map(std::slice::from_ref), + frag_map, eviction_seqno, )?; @@ -396,12 +263,16 @@ impl AbstractTree for BlobTree { .current_version() .clone(); - // TODO: PERF: ugly Arc clone Box::new( self.index - .0 .create_prefix(&prefix, seqno, index) - .map(move |kv| IterGuardImpl::Blob(Guard(self, version.value_log.clone(), kv))), + .map(move |kv| { + IterGuardImpl::Blob(Guard { + blob_tree: self, + vlog: version.value_log.clone(), // TODO: PERF: ugly Arc clone + kv, + }) + }), ) } @@ -422,9 +293,14 @@ impl AbstractTree for BlobTree { // TODO: PERF: ugly Arc clone Box::new( self.index - .0 .create_range(&range, seqno, index) - .map(move |kv| IterGuardImpl::Blob(Guard(self, version.value_log.clone(), kv))), + .map(move |kv| { + IterGuardImpl::Blob(Guard { + blob_tree: self, + vlog: version.value_log.clone(), // TODO: PERF: ugly Arc clone + kv, + }) + }), ) } @@ -457,7 +333,7 @@ impl AbstractTree for BlobTree { // "can only perform bulk_ingest on empty trees", // ); - // let mut segment_writer = Ingestion::new(&self.index)?; + // let mut segment_writer = Ingestion::new(&self.index)?.with_seqno(seqno); // let mut blob_writer = self.blobs.get_writer()?; // let start = Instant::now(); @@ -505,6 +381,8 @@ impl AbstractTree for BlobTree { // // self.blobs.register_writer(blob_writer)?; // // segment_writer.finish()?; + // TODO: increase visible seqno + // log::info!("Ingested {count} items in {:?}", start.elapsed()); Ok(()) @@ -534,7 +412,7 @@ impl AbstractTree for BlobTree { // NOTE: We skip reading from the value log // because the vHandles already store the value size fn size_of>(&self, key: K, seqno: SeqNo) -> crate::Result> { - let vhandle = self.index.get_vhandle(key.as_ref(), seqno)?; + let vhandle = self.get_vhandle(key.as_ref(), seqno)?; Ok(vhandle.map(|x| match x { // NOTE: Values are u32 length max @@ -568,44 +446,43 @@ impl AbstractTree for BlobTree { segment_id: SegmentId, memtable: &Arc, eviction_seqno: SeqNo, - ) -> crate::Result)>> { + ) -> crate::Result, Option)>> { use crate::{file::SEGMENTS_FOLDER, segment::Writer as SegmentWriter}; use value::MaybeInlineValue; let lsm_segment_folder = self.index.config.path.join(SEGMENTS_FOLDER); log::debug!("Flushing memtable & performing key-value separation"); - log::debug!("=> to LSM segments in {}", lsm_segment_folder.display()); - // log::debug!("=> to blob segment at {}", self.blobs.path.display()); - - let mut segment_writer = SegmentWriter::new( - lsm_segment_folder.join(segment_id.to_string()), - segment_id, - /* Options
{ - segment_id, - data_block_size: self.index.config.data_block_size, - index_block_size: self.index.config.index_block_size, - folder: lsm_segment_folder, - } */ - )? - .use_data_block_compression(self.index.config.data_block_compression_policy.get(0)); - // TODO: monkey - /* segment_writer = segment_writer.use_bloom_policy( - crate::segment::writer::BloomConstructionPolicy::FpRate(0.0001), - ); */ + log::debug!("=> to LSM table in {}", lsm_segment_folder.display()); + log::debug!("=> to blob file at {}", self.blobs_folder.display()); + + let mut segment_writer = + SegmentWriter::new(lsm_segment_folder.join(segment_id.to_string()), segment_id)? + // TODO: apply other policies + .use_data_block_compression(self.index.config.data_block_compression_policy.get(0)) + .use_bloom_policy({ + use crate::config::FilterPolicyEntry::{Bloom, None}; + use crate::segment::filter::BloomConstructionPolicy; + + match self.index.config.filter_policy.get(0) { + Bloom(policy) => policy, + None => BloomConstructionPolicy::BitsPerKey(0.0), + } + }); + // TODO: 3.0.0 select compression let mut blob_writer = BlobFileWriter::new( self.blob_file_id_generator.clone(), - u64::MAX, + u64::MAX, // TODO: actually use target size? but be sure to link to table correctly self.index.config.path.join(BLOBS_FOLDER), )?; - // TODO: select compression - - // let mut blob_writer = self.blobs.get_writer()?.use_target_size(u64::MAX); + blob_writer.link_table(segment_id); let iter = memtable.iter().map(Ok); let compaction_filter = CompactionStream::new(iter, eviction_seqno); + let mut blob_bytes_referenced = 0; + for item in compaction_filter { let item = item?; @@ -658,6 +535,8 @@ impl AbstractTree for BlobTree { segment_writer .write(InternalValue::new(item.key.clone(), serialized_indirection))?; + + blob_bytes_referenced += u64::from(value_size); } else { // TODO: use Slice::with_size let direct = MaybeInlineValue::Inline(value); @@ -666,50 +545,33 @@ impl AbstractTree for BlobTree { } } - // let _memtable_lock = self.lock_active_memtable(); - - // TODO: 3.0.0: add to vlog atomically together with the segment (that way, we don't need the pending_segments monkey patch) log::trace!("Creating blob file"); let blob_files = Self::consume_blob_file_writer(blob_writer)?; assert!(blob_files.len() <= 1); let blob_file = blob_files.into_iter().next(); log::trace!("Creating LSM-tree segment {segment_id}"); - let segment = self.index.consume_writer(segment_writer)?; - // TODO: this can probably solved in a nicer way - if segment.is_some() { - // IMPORTANT: Increment the pending count - // so there cannot be a GC scan now, until the segment is registered - self.pending_segments - .fetch_add(1, std::sync::atomic::Ordering::Release); + if blob_bytes_referenced > 0 { + if let Some(blob_file) = &blob_file { + segment_writer.link_blob_file(blob_file.id(), blob_bytes_referenced); + } } - Ok(segment.map(|segment| (segment, blob_file))) + let segment = self.index.consume_writer(segment_writer)?; + + Ok(segment.map(|segment| (segment, blob_file, None))) } fn register_segments( &self, segments: &[Segment], blob_files: Option<&[BlobFile]>, + frag_map: Option, seqno_threshold: SeqNo, ) -> crate::Result<()> { self.index - .register_segments(segments, blob_files, seqno_threshold)?; - - let count = self - .pending_segments - .load(std::sync::atomic::Ordering::Acquire); - - assert!( - count >= segments.len(), - "pending_segments is less than segments to register - this is a bug" - ); - - self.pending_segments - .fetch_sub(segments.len(), 
std::sync::atomic::Ordering::Release); - - Ok(()) + .register_segments(segments, blob_files, frag_map, seqno_threshold) } fn lock_active_memtable(&self) -> std::sync::RwLockWriteGuard<'_, Arc> { @@ -832,7 +694,7 @@ impl AbstractTree for BlobTree { // TODO: refactor memtable, sealed memtables, manifest lock to be a single lock (SuperVersion kind of) // TODO: then, try to reduce the lock access to 1, because we are accessing it twice (index.get, and then vhandle resolving...) - let Some(value) = self.index.get_vhandle(key, seqno)? else { + let Some(value) = self.get_vhandle(key, seqno)? else { return Ok(None); }; diff --git a/src/blob_tree/value.rs b/src/blob_tree/value.rs index 1c6ab3a7..93fe6c41 100644 --- a/src/blob_tree/value.rs +++ b/src/blob_tree/value.rs @@ -24,7 +24,7 @@ pub enum MaybeInlineValue { } const TAG_INLINE: u8 = 0; -const TAG_INDIRECT: u8 = 1; +pub const TAG_INDIRECT: u8 = 1; impl MaybeInlineValue { pub fn from_slice(bytes: &Slice) -> Result { diff --git a/src/config/mod.rs b/src/config/mod.rs index cbdbfc7a..3a33d60e 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -123,6 +123,7 @@ pub struct Config { /// Key-value separation threshold in bytes #[doc(hidden)] pub blob_file_separation_threshold: u32, + // TODO: blob_file_staleness_threshold AND/OR space_amp_threshold } impl Default for Config { diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index cbf43fd9..5277104e 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -5,6 +5,7 @@ pub(crate) mod hidden_set; use crate::{ + coding::Decode, file::{fsync_directory, rewrite_atomic}, segment::Segment, version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT}, @@ -24,6 +25,7 @@ pub struct Recovery { pub curr_version_id: VersionId, pub segment_ids: Vec>>, pub blob_file_ids: Vec, + pub gc_stats: crate::blob_tree::FragmentationMap, } /// Represents the levels of a log-structured merge tree @@ -194,7 +196,7 @@ impl LevelManifest { let blob_file_ids = { let mut reader = toc .section(b"blob_files") - .expect("tables should exist") + .expect("blob_files should exist") .buf_reader(&version_file_path)?; let blob_file_count = reader.read_u32::()?; @@ -208,10 +210,20 @@ impl LevelManifest { blob_file_ids }; + let gc_stats = { + let mut reader = toc + .section(b"blob_gc_stats") + .expect("blob_gc_stats should exist") + .buf_reader(&version_file_path)?; + + crate::blob_tree::FragmentationMap::decode_from(&mut reader)? + }; + Ok(Recovery { curr_version_id, segment_ids: levels, blob_file_ids, + gc_stats, }) } @@ -223,7 +235,7 @@ impl LevelManifest { pub(crate) fn recover>( folder: P, - recovery: &Recovery, + recovery: Recovery, segments: &[Segment], blob_files: &[BlobFile], ) -> crate::Result { @@ -254,9 +266,12 @@ impl LevelManifest { .collect::>>()?; Ok(Self { - current: Version::from_levels(recovery.curr_version_id, version_levels, { - blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect() - }), + current: Version::from_levels( + recovery.curr_version_id, + version_levels, + blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect(), + recovery.gc_stats, + ), folder: folder.into(), hidden_set: HiddenSet::default(), version_free_list: VecDeque::default(), // TODO: 3. create free list from versions that are N < CURRENT, or delete old versions eagerly... 
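A note on the bookkeeping this patch introduces: a flush or compaction can emit a FragmentationMap diff (blob file ID mapped to the count and byte total of newly unreferenced blobs, collected through the compaction stream's expiration callback), and each new Version folds that diff into its accumulated stats, which are persisted in the "blob_gc_stats" section recovered above. The following is a minimal, self-contained sketch of that merge semantic, assuming simplified stand-in types (std's HashMap instead of the crate's FxHash-based alias, no Arc, and hypothetical free functions), not the crate's actual API:

use std::collections::HashMap;

type BlobFileId = u64;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct FragmentationEntry {
    len: usize, // number of unreferenced (garbage) blobs
    bytes: u64, // unreferenced blob bytes that could be freed
}

// Folds a per-compaction diff into the accumulated per-version stats,
// mirroring the and_modify/or_insert logic of FragmentationMap::merge_into.
fn merge_into(
    diff: HashMap<BlobFileId, FragmentationEntry>,
    acc: &mut HashMap<BlobFileId, FragmentationEntry>,
) {
    for (id, d) in diff {
        acc.entry(id)
            .and_modify(|e| {
                e.len += d.len;
                e.bytes += d.bytes;
            })
            .or_insert(d);
    }
}

fn main() {
    let mut acc = HashMap::new();
    acc.insert(0, FragmentationEntry { len: 1, bytes: 500 });

    let mut diff = HashMap::new();
    diff.insert(0, FragmentationEntry { len: 2, bytes: 1_000 });
    diff.insert(1, FragmentationEntry { len: 1, bytes: 250 });

    merge_into(diff, &mut acc);

    // Existing entries accumulate; blob files seen for the first time are inserted as-is.
    assert_eq!(acc[&0], FragmentationEntry { len: 3, bytes: 1_500 });
    assert_eq!(acc[&1], FragmentationEntry { len: 1, bytes: 250 });
}

Note that entries only ever grow: as the TODOs in the src/version/mod.rs hunks below point out, entries for blob files that have left the version still need to be pruned, otherwise the map's memory usage increases monotonically.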
diff --git a/src/lib.rs b/src/lib.rs index 58ad5cc9..9ccc9e73 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -100,7 +100,11 @@ // #![cfg_attr(feature = "bytes", deny(unsafe_code))] // #![cfg_attr(not(feature = "bytes"), forbid(unsafe_code))] -pub(crate) type HashMap = std::collections::HashMap; +// TODO: 3.0.0 use checksum type impl from sfa as well + +#[doc(hidden)] +pub type HashMap = std::collections::HashMap; + pub(crate) type HashSet = std::collections::HashSet; #[allow(unused)] @@ -119,6 +123,23 @@ macro_rules! fail_iter { }; } +// TODO: investigate perf +macro_rules! unwrap { + ($x:expr) => {{ + #[cfg(not(feature = "use_unsafe"))] + { + $x.expect("should read") + } + + #[cfg(feature = "use_unsafe")] + { + unsafe { $x.unwrap_unchecked() } + } + }}; +} + +pub(crate) use unwrap; + mod any_tree; mod r#abstract; @@ -185,6 +206,9 @@ mod path; #[doc(hidden)] pub mod range; +#[doc(hidden)] +pub mod segment; + mod seqno; mod slice; mod slice_windows; @@ -199,17 +223,18 @@ mod value; mod version; mod vlog; -#[doc(hidden)] -pub mod segment; +/// User defined key +pub type UserKey = Slice; + +/// User defined data (byte array) +pub type UserValue = Slice; /// KV-tuple, typically returned by an iterator pub type KvPair = (UserKey, UserValue); -#[doc(hidden)] -pub use key_range::KeyRange; - #[doc(hidden)] pub use { + key_range::KeyRange, merge::BoxedIterator, segment::{block::Checksum, GlobalSegmentId, Segment, SegmentId}, tree::ingest::Ingestion, @@ -218,6 +243,8 @@ pub use { }; pub use { + any_tree::AnyTree, + blob_tree::BlobTree, cache::Cache, coding::{DecodeError, EncodeError}, compression::CompressionType, @@ -229,6 +256,7 @@ pub use { memtable::Memtable, r#abstract::AbstractTree, seqno::SequenceNumberCounter, + slice::Slice, tree::Tree, value::{SeqNo, ValueType}, vlog::BlobFile, @@ -236,39 +264,3 @@ pub use { #[cfg(feature = "metrics")] pub use metrics::Metrics; - -pub use any_tree::AnyTree; - -pub use blob_tree::BlobTree; - -pub use slice::Slice; - -/// User defined key -pub type UserKey = Slice; - -/// User defined data (byte array) -pub type UserValue = Slice; - -/// Blob garbage collection utilities -pub mod gc { - pub use super::vlog::{ - GcReport as Report, GcStrategy as Strategy, SpaceAmpStrategy, StaleThresholdStrategy, - }; -} - -// TODO: investigate perf -macro_rules! 
unwrap { - ($x:expr) => {{ - #[cfg(not(feature = "use_unsafe"))] - { - $x.expect("should read") - } - - #[cfg(feature = "use_unsafe")] - { - unsafe { $x.unwrap_unchecked() } - } - }}; -} - -pub(crate) use unwrap; diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 361e9127..8f6ec983 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -27,13 +27,19 @@ pub use writer::Writer; use crate::{ cache::Cache, descriptor_table::DescriptorTable, - segment::block::{BlockType, ParsedItem}, + segment::{ + block::{BlockType, ParsedItem}, + writer::LinkedFile, + }, + vlog::BlobFileId, CompressionType, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; use inner::Inner; use iter::Iter; use std::{ + fs::File, + io::{BufReader, Read, Seek}, ops::{Bound, RangeBounds}, path::PathBuf, sync::Arc, @@ -96,6 +102,35 @@ impl std::fmt::Debug for Segment { } impl Segment { + pub(crate) fn get_linked_blob_files(&self) -> crate::Result>> { + use byteorder::{ReadBytesExt, LE}; + + Ok(if let Some(handle) = &self.regions.linked_blob_files { + let reader = File::open(&*self.path)?; + let mut reader = BufReader::new(reader); + reader.seek(std::io::SeekFrom::Start(*handle.offset()))?; + let mut reader = reader.take(u64::from(handle.size())); + + let mut blob_files = vec![]; + + let len = reader.read_u32::()?; + + for _ in 0..len { + let blob_file_id = reader.read_u64::()?; + let bytes = reader.read_u64::()?; + + blob_files.push(LinkedFile { + blob_file_id, + bytes, + }); + } + + Some(blob_files) + } else { + None + }) + } + /// Gets the global segment ID. #[must_use] pub fn global_id(&self) -> GlobalSegmentId { diff --git a/src/segment/regions.rs b/src/segment/regions.rs index 9dff8db9..8ce3a2f7 100644 --- a/src/segment/regions.rs +++ b/src/segment/regions.rs @@ -20,7 +20,9 @@ fn toc_entry_to_handle(entry: &TocEntry) -> BlockHandle { /// |--------------| /// | filter | <- may not exist /// |--------------| -/// | ... TBD ... | +/// | ... 
| +/// |--------------| +/// | linked blobs | <- may not exist /// |--------------| /// | meta | /// |--------------| @@ -33,6 +35,7 @@ pub struct ParsedRegions { pub tli: BlockHandle, pub index: Option, pub filter: Option, + pub linked_blob_files: Option, pub metadata: BlockHandle, } @@ -48,6 +51,7 @@ impl ParsedRegions { })?, index: toc.section(b"index").map(toc_entry_to_handle), filter: toc.section(b"filter").map(toc_entry_to_handle), + linked_blob_files: toc.section(b"linked_blob_files").map(toc_entry_to_handle), metadata: toc .section(b"meta") .map(toc_entry_to_handle) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 68fb420b..d1da56c7 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -7,11 +7,16 @@ use super::{ }; use crate::{ coding::Encode, file::fsync_directory, segment::filter::standard_bloom::Builder, - time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, + time::unix_timestamp, vlog::BlobFileId, CompressionType, InternalValue, SegmentId, UserKey, }; use index::{BlockIndexWriter, FullIndexWriter}; use std::{fs::File, io::BufWriter, path::PathBuf}; +pub struct LinkedFile { + pub(crate) blob_file_id: BlobFileId, + pub(crate) bytes: u64, +} + /// Serializes and compresses values into blocks and writes them to disk as segment pub struct Writer { /// Segment file @@ -61,6 +66,8 @@ pub struct Writer { /// /// using enhanced double hashing, so we got two u64s pub bloom_hash_buffer: Vec, + + linked_blob_files: Vec, } impl Writer { @@ -103,9 +110,18 @@ impl Writer { bloom_policy: BloomConstructionPolicy::default(), bloom_hash_buffer: Vec::new(), + + linked_blob_files: Vec::new(), }) } + pub fn link_blob_file(&mut self, blob_file_id: BlobFileId, bytes: u64) { + self.linked_blob_files.push(LinkedFile { + blob_file_id, + bytes, + }); + } + #[must_use] pub fn use_data_block_restart_interval(mut self, interval: u8) -> Self { self.data_block_restart_interval = interval; @@ -339,6 +355,22 @@ impl Writer { )?; } + if !self.linked_blob_files.is_empty() { + use byteorder::{WriteBytesExt, LE}; + + self.block_writer.start("linked_blob_files")?; + + // NOTE: We know that there are never 4 billion blob files linked to a single table + #[allow(clippy::cast_possible_truncation)] + self.block_writer + .write_u32::(self.linked_blob_files.len() as u32)?; + + for file in self.linked_blob_files { + self.block_writer.write_u64::(file.blob_file_id)?; + self.block_writer.write_u64::(file.bytes)?; + } + } + // Write metadata self.block_writer.start("meta")?; diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 6cf1c698..63d5d539 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -32,6 +32,7 @@ impl<'a> Ingestion<'a> { log::debug!("Ingesting into disk segments in {}", folder.display()); // TODO: 3.0.0 look at tree configuration + // TODO: maybe create a PrepareMultiWriter that can be used by flush, ingest and compaction worker let writer = MultiWriter::new( folder.clone(), tree.segment_id_counter.clone(), @@ -137,7 +138,8 @@ impl<'a> Ingestion<'a> { }) .collect::>>()?; - self.tree.register_segments(&created_segments, None, 0)?; + self.tree + .register_segments(&created_segments, None, None, 0)?; let last_level_idx = self .tree diff --git a/src/tree/mod.rs b/src/tree/mod.rs index ec583088..7b1a5463 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -6,6 +6,7 @@ pub mod ingest; pub mod inner; use crate::{ + blob_tree::FragmentationMap, coding::{Decode, Encode}, compaction::CompactionStrategy, config::Config, @@ -273,7 +274,7 @@ 
impl AbstractTree for Tree { segment_id: SegmentId, memtable: &Arc, seqno_threshold: SeqNo, - ) -> crate::Result)>> { + ) -> crate::Result, Option)>> { use crate::{compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer}; use std::time::Instant; @@ -327,13 +328,14 @@ impl AbstractTree for Tree { log::debug!("Flushed memtable {segment_id:?} in {:?}", start.elapsed()); - Ok(result.map(|segment| (segment, None))) + Ok(result.map(|segment| (segment, None, None))) } fn register_segments( &self, segments: &[Segment], blob_files: Option<&[BlobFile]>, + frag_map: Option, seqno_threshold: SeqNo, ) -> crate::Result<()> { log::trace!( @@ -353,7 +355,9 @@ impl AbstractTree for Tree { log::trace!("register: Acquired sealed memtables write lock"); manifest.atomic_swap( - |version| version.with_new_l0_run(segments, blob_files), + |version| { + version.with_new_l0_run(segments, blob_files, frag_map.filter(|x| !x.is_empty())) + }, seqno_threshold, )?; @@ -659,12 +663,12 @@ impl Tree { return Ok(None); }; - let Some((segment, _)) = + let Some((segment, _, _)) = self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? else { return Ok(None); }; - self.register_segments(std::slice::from_ref(&segment), None, seqno_threshold)?; + self.register_segments(std::slice::from_ref(&segment), None, None, seqno_threshold)?; Ok(Some(segment)) } @@ -1040,6 +1044,9 @@ impl Tree { fsync_directory(&segment_base_folder)?; } + // TODO: 3.0.0 only remove unreferenced segments once we have successfully recovered the most recent version + // TODO: same for blob files + for (idx, dirent) in std::fs::read_dir(&segment_base_folder)?.enumerate() { let dirent = dirent?; let file_name = dirent.file_name(); @@ -1112,6 +1119,6 @@ impl Tree { &recovery.blob_file_ids, )?; - LevelManifest::recover(tree_path, &recovery, &segments, &blob_files) + LevelManifest::recover(tree_path, recovery, &segments, &blob_files) } } diff --git a/src/version/mod.rs b/src/version/mod.rs index ade0664d..035a1dba 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -7,14 +7,15 @@ pub mod run; pub use run::Run; +use crate::blob_tree::FragmentationMap; +use crate::coding::Encode; use crate::{ - coding::Encode, vlog::{BlobFile, BlobFileId}, HashSet, KeyRange, Segment, SegmentId, SeqNo, }; use optimize::optimize_runs; use run::Ranged; -use std::{collections::BTreeMap, io::Write, ops::Deref, sync::Arc}; +use std::{collections::BTreeMap, ops::Deref, sync::Arc}; pub const DEFAULT_LEVEL_COUNT: u8 = 7; @@ -143,12 +144,16 @@ pub struct VersionInner { /// The individual LSM-tree levels which consist of runs of tables pub(crate) levels: Vec, - // We purposefully use Arc<_> to avoid deep cloning the blob files again and again + // NOTE: We purposefully use Arc<_> to avoid deep cloning the blob files again and again // // Changing the value log tends to happen way less often than other modifications to the // LSM-tree + // /// Blob files for large values (value log) pub(crate) value_log: Arc>, + + /// Blob file fragmentation + gc_stats: Arc, } /// A version is an immutable, point-in-time view of a tree's structure @@ -180,6 +185,10 @@ impl Version { self.id } + pub fn gc_stats(&self) -> &FragmentationMap { + &self.gc_stats + } + /// Creates a new empty version. 
pub fn new(id: VersionId) -> Self { let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect(); @@ -189,6 +198,7 @@ impl Version { id, levels, value_log: Arc::default(), + gc_stats: Arc::default(), }), seqno_watermark: 0, } @@ -199,12 +209,14 @@ impl Version { id: VersionId, levels: Vec, blob_files: BTreeMap, + gc_stats: FragmentationMap, ) -> Self { Self { inner: Arc::new(VersionInner { id, levels, value_log: Arc::new(blob_files), + gc_stats: Arc::new(gc_stats), }), seqno_watermark: 0, } @@ -247,7 +259,12 @@ impl Version { } /// Creates a new version with the additional run added to the "top" of L0. - pub fn with_new_l0_run(&self, run: &[Segment], blob_files: Option<&[BlobFile]>) -> Self { + pub fn with_new_l0_run( + &self, + run: &[Segment], + blob_files: Option<&[BlobFile]>, + diff: Option, + ) -> Self { let id = self.id + 1; let mut levels = vec![]; @@ -290,11 +307,22 @@ impl Version { self.value_log.clone() }; + let gc_map = if let Some(diff) = diff { + let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + diff.merge_into(&mut copy); + // TODO: if a blob file is not part of the version anymore, prune its entry from map + // to garbage collect old map entries -> otherwise, monotonically increasing memory usage + Arc::new(copy) + } else { + self.gc_stats.clone() + }; + Self { inner: Arc::new(VersionInner { id, levels, value_log, + gc_stats: gc_map, }), seqno_watermark: 0, } @@ -326,11 +354,14 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } + // TODO: adjust GC stats if needed + Self { inner: Arc::new(VersionInner { id, levels, value_log: self.value_log.clone(), + gc_stats: Arc::default(), }), seqno_watermark: 0, } @@ -341,6 +372,7 @@ impl Version { old_ids: &[SegmentId], new_segments: &[Segment], dest_level: usize, + diff: Option, ) -> Self { let id = self.id + 1; @@ -368,11 +400,22 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } + let gc_map = if let Some(diff) = diff { + let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + diff.merge_into(&mut copy); + // TODO: if a blob file is not part of the version anymore, prune its entry from map + // to garbage collect old map entries -> otherwise, monotonically increasing memory usage + Arc::new(copy) + } else { + self.gc_stats.clone() + }; + Self { inner: Arc::new(VersionInner { id, levels, value_log: self.value_log.clone(), + gc_stats: gc_map, }), seqno_watermark: 0, } @@ -418,6 +461,7 @@ impl Version { id, levels, value_log: self.value_log.clone(), + gc_stats: Arc::default(), }), seqno_watermark: 0, } @@ -467,53 +511,8 @@ impl Version { writer.start("blob_gc_stats")?; - // TODO: 3.0.0 - - writer.write_all(b":)")?; + self.gc_stats.encode_into(writer)?; Ok(()) } } - -// #[cfg(test)] -// mod tests { -// use super::*; -// use test_log::test; - -// #[test] -// fn version_encode_empty() { -// let bytes = Version::new(0).encode_into_vec(); - -// #[rustfmt::skip] -// let raw = &[ -// // Magic -// b'L', b'S', b'M', 3, - -// // Level count -// 7, - -// // L0 runs -// 0, -// // L1 runs -// 0, -// // L2 runs -// 0, -// // L3 runs -// 0, -// // L4 runs -// 0, -// // L5 runs -// 0, -// // L6 runs -// 0, - -// // Blob file count -// 0, -// 0, -// 0, -// 0, -// ]; - -// assert_eq!(bytes, raw); -// } -// } diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index 5ce2b5a9..48fd3bf1 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -3,8 +3,8 @@ // (found in the LICENSE-* files in the repository) use crate::{ - 
vlog::{blob_file::writer::BLOB_HEADER_LEN, BlobFileId, ValueHandle}, - BlobFile, Cache, DescriptorTable, GlobalSegmentId, Slice, UserValue, + vlog::{blob_file::reader::Reader, BlobFileId, ValueHandle}, + BlobFile, Cache, DescriptorTable, GlobalSegmentId, UserValue, }; use std::{collections::BTreeMap, fs::File, path::Path, sync::Arc}; @@ -38,7 +38,7 @@ impl<'a> Accessor<'a> { return Ok(None); }; - let bf_id = GlobalSegmentId::from((0 /* TODO: tree ID */, vhandle.blob_file_id)); + let bf_id = GlobalSegmentId::from((0 /* TODO: tree ID */, blob_file.id())); let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { fd @@ -50,49 +50,9 @@ impl<'a> Accessor<'a> { file }; - let offset = vhandle.offset + (BLOB_HEADER_LEN as u64) + (key.len() as u64); + let value = Reader::new(blob_file, &file).get(key, vhandle)?; + cache.insert_blob(0 /* TODO: tree_id */, vhandle, value.clone()); - #[warn(unsafe_code)] - let mut builder = unsafe { Slice::builder_unzeroed(vhandle.on_disk_size as usize) }; - - { - #[cfg(unix)] - { - use std::os::unix::fs::FileExt; - - let bytes_read = file.read_at(&mut builder, offset)?; - - assert_eq!( - bytes_read, - vhandle.on_disk_size as usize, - "not enough bytes read: file has length {}", - file.metadata()?.len(), - ); - } - - #[cfg(windows)] - { - use std::os::windows::fs::FileExt; - - let bytes_read = file.seek_read(&mut builder, offset)?; - - assert_eq!( - bytes_read, - vhandle.on_disk_size as usize, - "not enough bytes read: file has length {}", - file.metadata()?.len(), - ); - } - - #[cfg(not(any(unix, windows)))] - { - compile_error!("unsupported OS"); - unimplemented!(); - } - } - - // TODO: decompress? save compression type into blobfile.meta - - Ok(Some(builder.freeze().into())) + Ok(Some(value)) } } diff --git a/src/vlog/blob_file/gc_stats.rs b/src/vlog/blob_file/gc_stats.rs deleted file mode 100644 index 031248cc..00000000 --- a/src/vlog/blob_file/gc_stats.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use std::sync::atomic::AtomicU64; - -#[derive(Debug, Default)] -pub struct GcStats { - pub(crate) stale_items: AtomicU64, - pub(crate) stale_bytes: AtomicU64, -} - -impl GcStats { - pub fn set_stale_items(&self, x: u64) { - self.stale_items - .store(x, std::sync::atomic::Ordering::Release); - } - - pub fn set_stale_bytes(&self, x: u64) { - self.stale_bytes - .store(x, std::sync::atomic::Ordering::Release); - } - - /// Returns the number of dead items in the blob file. - pub fn stale_items(&self) -> u64 { - self.stale_items.load(std::sync::atomic::Ordering::Acquire) - } - - /// Returns the amount of dead bytes in the blob file. - pub fn stale_bytes(&self) -> u64 { - self.stale_bytes.load(std::sync::atomic::Ordering::Acquire) - } -} diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs index dca45e98..666d2e5b 100644 --- a/src/vlog/blob_file/merge.rs +++ b/src/vlog/blob_file/merge.rs @@ -2,22 +2,11 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{ - vlog::{BlobFileId, BlobFileReader}, - UserKey, UserValue, -}; +use super::scanner::Scanner as BlobFileScanner; +use crate::{vlog::BlobFileId, Checksum, UserKey, UserValue}; use interval_heap::IntervalHeap; use std::cmp::Reverse; -macro_rules! 
fail_iter { - ($e:expr) => { - match $e { - Ok(v) => v, - Err(e) => return Some(Err(e.into())), - } - }; -} - type IteratorIndex = usize; #[derive(Debug)] @@ -26,7 +15,7 @@ struct IteratorValue { key: UserKey, value: UserValue, blob_file_id: BlobFileId, - checksum: u64, + checksum: Checksum, } impl PartialEq for IteratorValue { @@ -50,20 +39,22 @@ impl Ord for IteratorValue { /// Interleaves multiple blob file readers into a single, sorted stream #[allow(clippy::module_name_repetitions)] -pub struct MergeReader { - readers: Vec, +pub struct MergeScanner { + readers: Vec, heap: IntervalHeap, } -impl MergeReader { +impl MergeScanner { /// Initializes a new merging reader - pub fn new(readers: Vec) -> Self { + pub fn new(readers: Vec) -> Self { let heap = IntervalHeap::with_capacity(readers.len()); Self { readers, heap } } fn advance_reader(&mut self, idx: usize) -> crate::Result<()> { - let reader = self.readers.get_mut(idx).expect("iter should exist"); + // NOTE: We trust the caller + #[allow(clippy::indexing_slicing)] + let reader = &mut self.readers[idx]; if let Some(value) = reader.next() { let (k, v, checksum) = value?; @@ -90,8 +81,8 @@ impl MergeReader { } } -impl Iterator for MergeReader { - type Item = crate::Result<(UserKey, UserValue, BlobFileId, u64)>; +impl Iterator for MergeScanner { + type Item = crate::Result<(UserKey, UserValue, BlobFileId, Checksum)>; fn next(&mut self) -> Option { if self.heap.is_empty() { @@ -119,3 +110,73 @@ impl Iterator for MergeReader { None } } + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::super::scanner::Scanner; + use super::*; + use crate::{vlog::blob_file::writer::Writer as BlobFileWriter, Slice}; + use tempfile::tempdir; + use test_log::test; + + #[test] + fn blob_file_merger() -> crate::Result<()> { + let dir = tempdir()?; + + let blob_file_0_path = dir.path().join("0"); + + let blob_file_1_path = dir.path().join("1"); + + { + let keys = [b"a", b"c", b"e"]; + + { + let mut writer = BlobFileWriter::new(&blob_file_0_path, 0)?; + + for key in keys { + writer.write(key, &key.repeat(100))?; + } + + writer.flush()?; + } + } + + { + let keys = [b"b", b"d"]; + + { + let mut writer = BlobFileWriter::new(&blob_file_1_path, 1)?; + + for key in keys { + writer.write(key, &key.repeat(100))?; + } + + writer.flush()?; + } + } + + { + let mut merger = MergeScanner::new(vec![ + Scanner::new(&blob_file_0_path, 0)?, + Scanner::new(&blob_file_1_path, 1)?, + ]); + + let merged_keys = [b"a", b"b", b"c", b"d", b"e"]; + + for key in merged_keys { + assert_eq!( + (Slice::from(key), Slice::from(key.repeat(100))), + merger + .next() + .map(|result| result.map(|(k, v, _, _)| { (k, v) })) + .unwrap()?, + ); + } + + assert!(merger.next().is_none()); + } + + Ok(()) + } +} diff --git a/src/vlog/blob_file/meta.rs b/src/vlog/blob_file/meta.rs index e72c191e..8ac2f883 100644 --- a/src/vlog/blob_file/meta.rs +++ b/src/vlog/blob_file/meta.rs @@ -9,7 +9,7 @@ use crate::{ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; -pub const METADATA_HEADER_MAGIC: &[u8] = &[b'V', b'L', b'O', b'G', b'S', b'M', b'D', 1]; +pub const METADATA_HEADER_MAGIC: &[u8] = b"META"; #[derive(Debug)] pub struct Metadata { diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 2c856106..56cc15e3 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -2,16 +2,16 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -pub mod gc_stats; pub 
mod merge; pub mod meta; pub mod multi_writer; pub mod reader; +pub mod scanner; pub mod trailer; pub mod writer; use crate::vlog::BlobFileId; -pub use gc_stats::GcStats; +// pub use gc_stats::GcStats; pub use meta::Metadata; use std::{path::PathBuf, sync::Arc}; @@ -26,9 +26,9 @@ pub(crate) struct Inner { /// Statistics pub meta: Metadata, + // /// Runtime stats for garbage collection + // pub gc_stats: GcStats, - /// Runtime stats for garbage collection - pub gc_stats: GcStats, // TODO: is_deleted, on Drop, like SST segments } @@ -57,14 +57,14 @@ impl BlobFile { self.0.id } - /// Returns a scanner that can iterate through the blob file. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn scan(&self) -> crate::Result { - reader::Reader::new(&self.0.path, self.id()) - } + // /// Returns a scanner that can iterate through the blob file. + // /// + // /// # Errors + // /// + // /// Will return `Err` if an IO error occurs. + // pub fn scan(&self) -> crate::Result { + // reader::Reader::new(&self.0.path, self.id()) + // } /// Returns the number of items in the blob file. #[must_use] @@ -73,31 +73,31 @@ impl BlobFile { self.0.meta.item_count } - /// Marks the blob file as fully stale. - pub(crate) fn mark_as_stale(&self) { - self.0.gc_stats.set_stale_items(self.0.meta.item_count); - - self.0 - .gc_stats - .set_stale_bytes(self.0.meta.total_uncompressed_bytes); - } - - /// Returns `true` if the blob file is fully stale. - #[must_use] - pub fn is_stale(&self) -> bool { - self.0.gc_stats.stale_items() == self.0.meta.item_count - } - - /// Returns the percent of dead items in the blob file. - // NOTE: Precision is not important here - #[allow(clippy::cast_precision_loss)] - #[must_use] - pub fn stale_ratio(&self) -> f32 { - let dead = self.0.gc_stats.stale_items() as f32; - if dead == 0.0 { - return 0.0; - } - - dead / self.0.meta.item_count as f32 - } + // /// Marks the blob file as fully stale. + // pub(crate) fn mark_as_stale(&self) { + // self.0.gc_stats.set_stale_items(self.0.meta.item_count); + + // self.0 + // .gc_stats + // .set_stale_bytes(self.0.meta.total_uncompressed_bytes); + // } + + // Returns `true` if the blob file is fully stale. + // #[must_use] + // pub fn is_stale(&self) -> bool { + // self.0.gc_stats.stale_items() == self.0.meta.item_count + // } + + // /// Returns the percent of dead items in the blob file. 
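
// NOTE: A minimal sketch of how the removed per-file accessors map onto the
// new version-level stats: the stale ratio previously kept in `GcStats` can
// be derived from the blob file's `Metadata` item count plus its entry in the
// `FragmentationMap`. Only the (count, bytes) constructor of
// `FragmentationEntry` is visible in this patch, so the stale item count is
// passed in as a plain number here:
//
//     // NOTE: Precision is not important here
//     #[allow(clippy::cast_precision_loss)]
//     fn stale_ratio(total_items: u64, stale_items: u64) -> f32 {
//         if total_items == 0 {
//             return 0.0;
//         }
//         stale_items as f32 / total_items as f32
//     }
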
+ // // NOTE: Precision is not important here + // #[allow(clippy::cast_precision_loss)] + // #[must_use] + // pub fn stale_ratio(&self) -> f32 { + // let dead = self.0.gc_stats.stale_items() as f32; + // if dead == 0.0 { + // return 0.0; + // } + + // dead / self.0.meta.item_count as f32 + // } } diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index f038061e..81773673 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -3,10 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::writer::Writer; -use crate::{ - vlog::{BlobFileId, ValueHandle}, - CompressionType, SequenceNumberCounter, -}; +use crate::{vlog::BlobFileId, CompressionType, SegmentId, SequenceNumberCounter}; use std::path::{Path, PathBuf}; /// Blob file writer, may write multiple blob files @@ -19,6 +16,8 @@ pub struct MultiWriter { id_generator: SequenceNumberCounter, compression: CompressionType, + + linked_table_ids: Vec, } impl MultiWriter { @@ -46,9 +45,15 @@ impl MultiWriter { writers: vec![Writer::new(blob_file_path, blob_file_id)?], compression: CompressionType::None, + + linked_table_ids: Vec::new(), // TODO: 3.0.0 consume and reset after rotation }) } + pub fn link_table(&mut self, table_id: SegmentId) { + self.linked_table_ids.push(table_id); + } + /// Sets the blob file target size. #[must_use] pub fn use_target_size(mut self, bytes: u64) -> Self { diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index cab68c62..6c95e95f 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -2,120 +2,34 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC}; -use crate::{coding::DecodeError, vlog::BlobFileId, CompressionType, UserKey, UserValue}; -use byteorder::{BigEndian, ReadBytesExt}; -use std::{ - fs::File, - io::{BufReader, Read, Seek}, - path::Path, +use crate::{ + vlog::{blob_file::writer::BLOB_HEADER_LEN, ValueHandle}, + BlobFile, UserValue, }; +use std::fs::File; -macro_rules! fail_iter { - ($e:expr) => { - match $e { - Ok(v) => v, - Err(e) => return Some(Err(e.into())), - } - }; +/// Reads a single blob from a blob file +pub struct Reader<'a> { + blob_file: &'a BlobFile, + file: &'a File, } -// TODO: pread - -/// Reads through a blob file in order. -pub struct Reader { - pub(crate) blob_file_id: BlobFileId, - inner: BufReader, - is_terminated: bool, - compression: CompressionType, -} - -impl Reader { - /// Initializes a new blob file reader. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn new>(path: P, blob_file_id: BlobFileId) -> crate::Result { - let file_reader = BufReader::new(File::open(path)?); - Ok(Self::with_reader(blob_file_id, file_reader)) - } - - pub(crate) fn get_offset(&mut self) -> std::io::Result { - self.inner.stream_position() +impl<'a> Reader<'a> { + pub fn new(blob_file: &'a BlobFile, file: &'a File) -> Self { + Self { blob_file, file } } - /// Initializes a new blob file reader. 
-    #[must_use]
-    pub fn with_reader(blob_file_id: BlobFileId, file_reader: BufReader<File>) -> Self {
-        Self {
-            blob_file_id,
-            inner: file_reader,
-            is_terminated: false,
-            compression: CompressionType::None,
-        }
-    }
-
-    pub(crate) fn use_compression(mut self, compressoion: CompressionType) -> Self {
-        self.compression = compressoion;
-        self
-    }
+    pub fn get(&self, key: &'a [u8], vhandle: &'a ValueHandle) -> crate::Result<UserValue> {
+        debug_assert_eq!(vhandle.blob_file_id, self.blob_file.id());

-    pub(crate) fn into_inner(self) -> BufReader<File> {
-        self.inner
-    }
-}
+        let offset = vhandle.offset + (BLOB_HEADER_LEN as u64) + (key.len() as u64);

-impl Iterator for Reader {
-    type Item = crate::Result<(UserKey, UserValue, u64)>;
+        let value = crate::file::read_exact(self.file, offset, vhandle.on_disk_size as usize)?;

-    fn next(&mut self) -> Option<Self::Item> {
-        if self.is_terminated {
-            return None;
-        }
+        // TODO: decompress? save compression type into blob_file.meta

-        {
-            let mut buf = [0; BLOB_HEADER_MAGIC.len()];
-            fail_iter!(self.inner.read_exact(&mut buf));
-
-            if buf == METADATA_HEADER_MAGIC {
-                self.is_terminated = true;
-                return None;
-            }
-
-            if buf != BLOB_HEADER_MAGIC {
-                return Some(Err(crate::Error::Decode(DecodeError::InvalidHeader(
-                    "Blob",
-                ))));
-            }
-        }
-
-        let checksum = fail_iter!(self.inner.read_u64::<BigEndian>());
-
-        let key_len = fail_iter!(self.inner.read_u16::<BigEndian>());
-        let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize));
-
-        let val_len = fail_iter!(self.inner.read_u32::<BigEndian>());
-
-        // TODO: finish compression
-        #[warn(clippy::match_single_binding)]
-        let val = match &self.compression {
-            _ => {
-                // NOTE: When not using compression, we can skip
-                // the intermediary heap allocation and read directly into a Slice
-                fail_iter!(UserValue::from_reader(&mut self.inner, val_len as usize))
-            }
-        };
-        // Some(compressor) => {
-        //     // TODO: https://github.com/PSeitz/lz4_flex/issues/166
-        //     let mut val = vec![0; val_len as usize];
-        //     fail_iter!(self.inner.read_exact(&mut val));
-        //     UserValue::from(fail_iter!(compressor.decompress(&val)))
-        // }
-        // None => {
-
-        // }
-
-        Some(Ok((key, val, checksum)))
+        Ok(value)
     }
 }
+
+// TODO: unit test
diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs
new file mode 100644
index 00000000..ddbda499
--- /dev/null
+++ b/src/vlog/blob_file/scanner.rs
@@ -0,0 +1,157 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC};
+use crate::{coding::DecodeError, vlog::BlobFileId, Checksum, CompressionType, UserKey, UserValue};
+use byteorder::{BigEndian, ReadBytesExt};
+use std::{
+    fs::File,
+    io::{BufReader, Read},
+    path::Path,
+};
+
+/// Reads through a blob file in order
+pub struct Scanner {
+    pub(crate) blob_file_id: BlobFileId, // TODO: remove unused?
+    inner: BufReader<File>,
+    is_terminated: bool,
+    compression: CompressionType,
+}
+
+impl Scanner {
+    /// Initializes a new blob file reader.
+    ///
+    /// # Errors
+    ///
+    /// Will return `Err` if an IO error occurs.
+    pub fn new<P: AsRef<Path>>(path: P, blob_file_id: BlobFileId) -> crate::Result<Self> {
+        let file_reader = BufReader::new(File::open(path)?);
+        Ok(Self::with_reader(blob_file_id, file_reader))
+    }
+
+    /// Initializes a new blob file reader.
+    #[must_use]
+    pub fn with_reader(blob_file_id: BlobFileId, file_reader: BufReader<File>) -> Self {
+        Self {
+            blob_file_id,
+            inner: file_reader,
+            is_terminated: false,
+            compression: CompressionType::None,
+        }
+    }
+
+    pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self {
+        self.compression = compression;
+        self
+    }
+
+    // pub(crate) fn get_offset(&mut self) -> std::io::Result<u64> {
+    //     self.inner.stream_position()
+    // }
+
+    // pub(crate) fn into_inner(self) -> BufReader<File> {
+    //     self.inner
+    // }
+}
+
+impl Iterator for Scanner {
+    type Item = crate::Result<(UserKey, UserValue, Checksum)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.is_terminated {
+            return None;
+        }
+
+        {
+            let mut buf = [0; BLOB_HEADER_MAGIC.len()];
+            fail_iter!(self.inner.read_exact(&mut buf));
+
+            if buf == METADATA_HEADER_MAGIC {
+                self.is_terminated = true;
+                return None;
+            }
+
+            if buf != BLOB_HEADER_MAGIC {
+                return Some(Err(crate::Error::Decode(DecodeError::InvalidHeader(
+                    "Blob",
+                ))));
+            }
+        }
+
+        let checksum = fail_iter!(self.inner.read_u128::<BigEndian>());
+
+        let key_len = fail_iter!(self.inner.read_u16::<BigEndian>());
+        let real_val_len = fail_iter!(self.inner.read_u32::<BigEndian>());
+        let on_disk_val_len = fail_iter!(self.inner.read_u32::<BigEndian>());
+
+        let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize));
+
+        // TODO: finish compression
+        #[warn(clippy::match_single_binding)]
+        let val = match &self.compression {
+            _ => {
+                fail_iter!(UserValue::from_reader(
+                    &mut self.inner,
+                    on_disk_val_len as usize
+                ))
+            }
+        };
+        // Some(compressor) => {
+        //     // TODO: https://github.com/PSeitz/lz4_flex/issues/166
+        //     let mut val = vec![0; val_len as usize];
+        //     fail_iter!(self.inner.read_exact(&mut val));
+        //     UserValue::from(fail_iter!(compressor.decompress(&val)))
+        // }
+        // None => {
+
+        // }
+
+        Some(Ok((key, val, Checksum::from_raw(checksum))))
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used)]
+mod tests {
+    use super::*;
+    use crate::{vlog::blob_file::writer::Writer as BlobFileWriter, Slice};
+    use tempfile::tempdir;
+    use test_log::test;
+
+    #[test]
+    fn blob_file_scanner() -> crate::Result<()> {
+        let dir = tempdir()?;
+        let blob_file_path = dir.path().join("0");
+
+        let keys = [b"a", b"b", b"c", b"d", b"e"];
+
+        {
+            let mut writer = BlobFileWriter::new(&blob_file_path, 0)?;
+
+            for key in keys {
+                writer.write(key, &key.repeat(100))?;
+            }
+
+            writer.flush()?;
+        }
+
+        {
+            let mut scanner = Scanner::new(&blob_file_path, 0)?;
+
+            for key in keys {
+                assert_eq!(
+                    (Slice::from(key), Slice::from(key.repeat(100))),
+                    scanner
+                        .next()
+                        .map(|result| result.map(|(k, v, _)| { (k, v) }))
+                        .unwrap()?,
+                );
+            }
+
+            assert!(scanner.next().is_none());
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/vlog/gc/mod.rs b/src/vlog/gc/mod.rs
deleted file mode 100644
index 2514d468..00000000
--- a/src/vlog/gc/mod.rs
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright (c) 2024-present, fjall-rs
-// This source code is licensed under both the Apache 2.0 and MIT License
-// (found in the LICENSE-* files in the repository)
-
-pub mod report;
-
-use crate::vlog::{BlobFileId, ValueLog};
-
-/// GC strategy
-#[allow(clippy::module_name_repetitions)]
-pub trait GcStrategy {
-    /// Picks blob files based on a predicate.
-    fn pick(&self, value_log: &ValueLog) -> Vec<BlobFileId>;
-}
-
-/// Picks blob files that have a certain percentage of stale blobs
-pub struct StaleThresholdStrategy(f32);
-
-impl StaleThresholdStrategy {
-    /// Creates a new strategy with the given threshold.
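
// NOTE: For orientation, the on-disk record layout that `Scanner::next`
// decodes above is, per blob (all integers big-endian):
//
//     [BLOB_HEADER_MAGIC]
//     [checksum          : u128]
//     [key length        : u16 ]
//     [real value length : u32 ]  // uncompressed size
//     [on-disk value len : u32 ]  // stored (possibly compressed) size
//     [key bytes]
//     [value bytes, on-disk length]
//
// This is also why the point `Reader` above seeks to
// `vhandle.offset + BLOB_HEADER_LEN + key.len()`: it skips the fixed-size
// header plus the key to land directly on the value bytes.
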
- /// - /// # Panics - /// - /// Panics if the ratio is invalid. - #[must_use] - pub fn new(ratio: f32) -> Self { - assert!( - ratio.is_finite() && ratio.is_sign_positive(), - "invalid stale ratio" - ); - Self(ratio.min(1.0)) - } -} - -impl GcStrategy for StaleThresholdStrategy { - fn pick(&self, value_log: &ValueLog) -> Vec { - unimplemented!() - - // value_log - // .manifest - // .blob_files - // .read() - // .expect("lock is poisoned") - // .values() - // .filter(|x| x.stale_ratio() > self.0) - // .map(|x| x.id) - // .collect::>() - } -} - -/// Tries to find a least-effort-selection of blob files to merge to reach a certain space amplification -pub struct SpaceAmpStrategy(f32); - -impl SpaceAmpStrategy { - /// Creates a new strategy with the given space amp factor. - /// - /// # Panics - /// - /// Panics if the space amp factor is < 1.0. - #[must_use] - pub fn new(ratio: f32) -> Self { - assert!(ratio >= 1.0, "invalid space amp ratio"); - Self(ratio) - } -} - -impl GcStrategy for SpaceAmpStrategy { - #[allow(clippy::cast_precision_loss, clippy::significant_drop_tightening)] - fn pick(&self, value_log: &ValueLog) -> Vec { - unimplemented!() - - // let space_amp_target = self.0; - // let current_space_amp = value_log.space_amp(); - - // if current_space_amp < space_amp_target { - // log::trace!("Space amp is <= target {space_amp_target}, nothing to do"); - // vec![] - // } else { - // log::debug!("Selecting blob files to GC, space_amp_target={space_amp_target}"); - - // let lock = value_log - // .manifest - // .blob_files - // .read() - // .expect("lock is poisoned"); - - // let mut blob_files = lock - // .values() - // .filter(|x| x.stale_ratio() > 0.0) - // .collect::>(); - - // // Sort by stale ratio descending - // blob_files.sort_by(|a, b| { - // b.stale_ratio() - // .partial_cmp(&a.stale_ratio()) - // .unwrap_or(std::cmp::Ordering::Equal) - // }); - - // let mut selection = vec![]; - - // let mut total_bytes = value_log.manifest.total_bytes(); - // let mut stale_bytes = value_log.manifest.stale_bytes(); - - // for blob_file in blob_files { - // let blob_file_stale_bytes = blob_file.gc_stats.stale_bytes(); - // stale_bytes -= blob_file_stale_bytes; - // total_bytes -= blob_file_stale_bytes; - - // selection.push(blob_file.id); - - // let space_amp_after_gc = - // total_bytes as f32 / (total_bytes as f32 - stale_bytes as f32); - - // log::debug!( - // "Selected blob file #{} for GC: will reduce space amp to {space_amp_after_gc}", - // blob_file.id, - // ); - - // if space_amp_after_gc <= space_amp_target { - // break; - // } - // } - - // selection - // } - } -} diff --git a/src/vlog/gc/report.rs b/src/vlog/gc/report.rs deleted file mode 100644 index 890d4b5c..00000000 --- a/src/vlog/gc/report.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use std::path::PathBuf; - -/// Statistics report for garbage collection -#[derive(Debug)] -#[allow(clippy::module_name_repetitions)] -pub struct GcReport { - /// Path of value log - pub path: PathBuf, - - /// Blob file count - pub blob_file_count: usize, - - /// Blob files that have 100% stale blobs - pub stale_blob_file_count: usize, - - /// Amount of stored bytes - pub total_bytes: u64, - - /// Amount of bytes that could be freed - pub stale_bytes: u64, - - /// Number of stored blobs - pub total_blobs: u64, - - /// Number of blobs that could be freed - pub stale_blobs: u64, -} - -impl 
std::fmt::Display for GcReport { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "--- GC report for vLog @ {} ---", self.path.display())?; - writeln!(f, "# files : {}", self.blob_file_count)?; - writeln!(f, "# stale : {}", self.stale_blob_file_count)?; - writeln!(f, "Total bytes: {}", self.total_bytes)?; - writeln!(f, "Stale bytes: {}", self.stale_bytes)?; - writeln!(f, "Total blobs: {}", self.total_blobs)?; - writeln!(f, "Stale blobs: {}", self.stale_blobs)?; - writeln!(f, "Stale ratio: {}", self.stale_ratio())?; - writeln!(f, "Space amp : {}", self.space_amp())?; - writeln!(f, "--- GC report done ---")?; - Ok(()) - } -} - -impl GcReport { - /// Calculates the space amplification factor. - #[must_use] - pub fn space_amp(&self) -> f32 { - if self.total_bytes == 0 { - return 0.0; - } - - let alive_bytes = self.total_bytes - self.stale_bytes; - if alive_bytes == 0 { - return 0.0; - } - - self.total_bytes as f32 / alive_bytes as f32 - } - - /// Calculates the stale ratio (percentage). - #[must_use] - pub fn stale_ratio(&self) -> f32 { - if self.total_bytes == 0 { - return 0.0; - } - - if self.stale_bytes == 0 { - return 0.0; - } - - self.stale_bytes as f32 / self.total_bytes as f32 - } -} diff --git a/src/vlog/index.rs b/src/vlog/index.rs deleted file mode 100644 index 4f3c3c15..00000000 --- a/src/vlog/index.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::vlog::ValueHandle; - -/// Trait that allows reading from an external index -/// -/// An index should point into the value log using [`ValueHandle`]. -#[allow(clippy::module_name_repetitions)] -pub trait Reader { - /// Returns a value handle for a given key. - /// - /// This method is used to index back into the index to check for - /// stale values when scanning through the value log's blob files. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - fn get(&self, key: &[u8]) -> std::io::Result>; -} - -/// Trait that allows writing into an external index -/// -/// The write process should be atomic meaning that until `finish` is called -/// no written value handles should be handed out by the index. -/// When `finish` fails, no value handles should be written into the index. -pub trait Writer { - /// Inserts a value handle into the index write batch. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - fn insert_indirect( - &mut self, - key: &[u8], - vhandle: ValueHandle, - size: u32, - ) -> std::io::Result<()>; - - /// Finishes the write batch. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. 
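
// NOTE: A quick worked example of the two ratios defined in the removed
// `GcReport` (the same formulas reappear in the manifest code below): with
// total_bytes = 100 and stale_bytes = 40,
//
//     let (total, stale) = (100u64, 40u64);
//     let alive = total - stale;
//     let stale_ratio = stale as f32 / total as f32; // 0.4
//     let space_amp = total as f32 / alive as f32;   // ~1.67
//
// i.e. the value log occupies roughly 1.67x the space its live data needs.
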
- fn finish(&mut self) -> std::io::Result<()>; -} diff --git a/src/vlog/manifest.rs b/src/vlog/manifest.rs deleted file mode 100644 index 92eca898..00000000 --- a/src/vlog/manifest.rs +++ /dev/null @@ -1,445 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{ - file::rewrite_atomic, - vlog::{ - blob_file::{gc_stats::GcStats, meta::Metadata, trailer::Trailer}, - BlobFile, BlobFileId, BlobFileWriter as MultiWriter, Compressor, - }, - HashMap, KeyRange, -}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use std::{ - io::Cursor, - marker::PhantomData, - path::{Path, PathBuf}, - sync::{Arc, RwLock}, -}; - -pub const VLOG_MARKER: &str = ".vlog"; -pub const BLOB_FILES_FOLDER: &str = "segments"; // TODO: don't use separate folder, instead rename just .blobs -const MANIFEST_FILE: &str = "vlog_manifest"; - -// TODO: use tree-level manifest to store blob files as well - -#[allow(clippy::module_name_repetitions)] -pub struct ManifestInner { - path: PathBuf, - pub blob_files: RwLock>>>, -} - -#[allow(clippy::module_name_repetitions)] -#[derive(Clone)] -pub struct Manifest(Arc>); - -impl std::ops::Deref for Manifest { - type Target = ManifestInner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Manifest { - fn remove_unfinished_blob_files>( - folder: P, - registered_ids: &[u64], - ) -> crate::Result<()> { - for dirent in std::fs::read_dir(folder)? { - let dirent = dirent?; - let file_name = dirent.file_name(); - - // https://en.wikipedia.org/wiki/.DS_Store - if file_name == ".DS_Store" { - continue; - } - - // https://en.wikipedia.org/wiki/AppleSingle_and_AppleDouble_formats - if file_name.to_string_lossy().starts_with("._") { - continue; - } - - if dirent.file_type()?.is_file() { - let blob_file_id = dirent - .file_name() - .to_str() - .expect("should be valid utf-8") - .parse::() - .expect("should be valid blob file ID"); - - if !registered_ids.contains(&blob_file_id) { - log::trace!("Deleting unfinished vLog blob file {blob_file_id}"); - std::fs::remove_file(dirent.path())?; - } - } - } - - Ok(()) - } - - /// Parses blob file IDs from manifest file - fn load_ids_from_disk>(path: P) -> crate::Result> { - let path = path.as_ref(); - log::debug!("Loading manifest from {}", path.display()); - - let bytes = std::fs::read(path)?; - - let mut ids = vec![]; - - let mut cursor = Cursor::new(bytes); - - let cnt = cursor.read_u64::()?; - - for _ in 0..cnt { - ids.push(cursor.read_u64::()?); - } - - Ok(ids) - } - - /// Recovers a value log from disk - pub(crate) fn recover>(folder: P) -> crate::Result { - let folder = folder.as_ref(); - let manifest_path = folder.join(MANIFEST_FILE); - - log::info!("Recovering vLog at {}", folder.display()); - - let ids = Self::load_ids_from_disk(&manifest_path)?; - let cnt = ids.len(); - - let progress_mod = match cnt { - _ if cnt <= 20 => 1, - _ if cnt <= 100 => 10, - _ => 100, - }; - - log::debug!( - "Recovering {cnt} vLog blob files from {:?}", - folder.display(), - ); - - let blob_files_folder = folder.join(BLOB_FILES_FOLDER); - Self::remove_unfinished_blob_files(&blob_files_folder, &ids)?; - - let blob_files = { - let mut map = HashMap::with_capacity_and_hasher(100, rustc_hash::FxBuildHasher); - - for (idx, &id) in ids.iter().enumerate() { - log::trace!("Recovering blob file #{id:?}"); - - let path = blob_files_folder.join(id.to_string()); - let trailer = Trailer::from_file(&path)?; - - map.insert( - id, - 
Arc::new(BlobFile { - id, - path, - meta: trailer.metadata, - gc_stats: GcStats::default(), - _phantom: PhantomData, - }), - ); - - if idx % progress_mod == 0 { - log::debug!("Recovered {idx}/{cnt} vLog blob files"); - } - } - - map - }; - - if blob_files.len() < ids.len() { - return Err(crate::Error::Unrecoverable); - } - - Ok(Self(Arc::new(ManifestInner { - path: manifest_path, - blob_files: RwLock::new(blob_files), - }))) - } - - pub(crate) fn create_new>(folder: P) -> crate::Result { - let path = folder.as_ref().join(MANIFEST_FILE); - - let m = Self(Arc::new(ManifestInner { - path, - blob_files: RwLock::new(HashMap::default()), - })); - Self::write_to_disk(&m.path, &[])?; - - Ok(m) - } - - /// Modifies the level manifest atomically. - pub(crate) fn atomic_swap>>)>( - &self, - f: F, - ) -> crate::Result<()> { - let mut prev_blob_files = self.blob_files.write().expect("lock is poisoned"); - - // NOTE: Create a copy of the levels we can operate on - // without mutating the current level manifest - // If persisting to disk fails, this way the level manifest - // is unchanged - let mut working_copy = prev_blob_files.clone(); - - f(&mut working_copy); - - let ids = working_copy.keys().copied().collect::>(); - - Self::write_to_disk(&self.path, &ids)?; - *prev_blob_files = working_copy; - - // NOTE: Lock needs to live until end of function because - // writing to disk needs to be exclusive - drop(prev_blob_files); - - log::trace!("Swapped vLog blob file list to: {ids:?}"); - - Ok(()) - } - - /// Drops all blob files. - /// - /// This does not delete the files from disk, but just un-refs them from the manifest. - /// - /// Once this function completes, the disk files can be safely removed. - pub fn clear(&self) -> crate::Result<()> { - self.atomic_swap(|recipe| { - recipe.clear(); - }) - } - - /// Drops the given blob files. - /// - /// This does not delete the files from disk, but just un-refs them from the manifest. - /// - /// Once this function completes, the disk files can be safely removed. 
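
// NOTE: The copy-persist-swap idea in `atomic_swap` above generalizes to any
// crash-safe manifest update: mutate a clone, persist it, and only then
// publish it in memory, so a failed write leaves the old state intact.
// A minimal sketch (map and error types simplified):
//
//     use std::collections::HashMap;
//
//     fn atomic_swap<K: Clone, V: Clone>(
//         state: &mut HashMap<K, V>,
//         edit: impl FnOnce(&mut HashMap<K, V>),
//         persist: impl FnOnce(&HashMap<K, V>) -> std::io::Result<()>,
//     ) -> std::io::Result<()> {
//         let mut working_copy = state.clone();
//         edit(&mut working_copy);
//         persist(&working_copy)?; // on failure, `state` is unchanged
//         *state = working_copy;
//         Ok(())
//     }
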
- pub fn drop_blob_files(&self, ids: &[u64]) -> crate::Result<()> { - self.atomic_swap(|recipe| { - recipe.retain(|x, _| !ids.contains(x)); - }) - } - - pub fn register(&self, writer: MultiWriter) -> crate::Result<()> { - let writers = writer.finish()?; - - self.atomic_swap(move |recipe| { - for writer in writers { - if writer.item_count == 0 { - log::debug!( - "Writer at {} has written no data, deleting empty vLog blob file", - writer.path.display(), - ); - if let Err(e) = std::fs::remove_file(&writer.path) { - log::warn!( - "Could not delete empty vLog blob file at {}: {e:?}", - writer.path.display(), - ); - } - continue; - } - - let blob_file_id = writer.blob_file_id; - - recipe.insert( - blob_file_id, - Arc::new(BlobFile { - id: blob_file_id, - path: writer.path, - meta: Metadata { - item_count: writer.item_count, - compressed_bytes: writer.written_blob_bytes, - total_uncompressed_bytes: writer.uncompressed_bytes, - - // NOTE: We are checking for 0 items above - // so first and last key need to exist - #[allow(clippy::expect_used)] - key_range: KeyRange::new(( - writer - .first_key - .clone() - .expect("should have written at least 1 item"), - writer - .last_key - .clone() - .expect("should have written at least 1 item"), - )), - }, - gc_stats: GcStats::default(), - _phantom: PhantomData, - }), - ); - - log::debug!( - "Created blob file #{blob_file_id:?} ({} items, {} userdata bytes)", - writer.item_count, - writer.uncompressed_bytes, - ); - } - })?; - - // NOTE: If we crash before before finishing the index write, it's fine - // because all new blob files will be unreferenced, and thus can be dropped because stale - - Ok(()) - } - - fn write_to_disk>(path: P, blob_file_ids: &[BlobFileId]) -> crate::Result<()> { - let path = path.as_ref(); - log::trace!("Writing blob files manifest to {}", path.display()); - - let mut bytes = Vec::new(); - - let cnt = blob_file_ids.len() as u64; - bytes.write_u64::(cnt)?; - - for id in blob_file_ids { - bytes.write_u64::(*id)?; - } - - rewrite_atomic(path, &bytes)?; - - Ok(()) - } - - /// Gets a blob file. - #[must_use] - pub fn get_blob_file(&self, id: BlobFileId) -> Option>> { - self.blob_files - .read() - .expect("lock is poisoned") - .get(&id) - .cloned() - } - - /// Lists all blob file IDs. - #[doc(hidden)] - #[must_use] - pub fn list_blob_file_ids(&self) -> Vec { - self.blob_files - .read() - .expect("lock is poisoned") - .keys() - .copied() - .collect() - } - - /// Lists all blob files. - #[must_use] - pub fn list_blob_files(&self) -> Vec>> { - self.blob_files - .read() - .expect("lock is poisoned") - .values() - .cloned() - .collect() - } - - /// Returns the number of blob files. - #[must_use] - pub fn len(&self) -> usize { - self.blob_files.read().expect("lock is poisoned").len() - } - - /// Returns the amount of bytes on disk that are occupied by blobs. 
- #[must_use] - pub fn disk_space_used(&self) -> u64 { - self.blob_files - .read() - .expect("lock is poisoned") - .values() - .map(|x| x.meta.compressed_bytes) - .sum::() - } - - /// Returns the amount of stale bytes - #[must_use] - pub fn total_bytes(&self) -> u64 { - self.blob_files - .read() - .expect("lock is poisoned") - .values() - .map(|x| x.meta.total_uncompressed_bytes) - .sum::() - } - - /// Returns the amount of stale bytes - #[must_use] - pub fn stale_bytes(&self) -> u64 { - self.blob_files - .read() - .expect("lock is poisoned") - .values() - .map(|x| x.gc_stats.stale_bytes()) - .sum::() - } - - /// Returns the percent of dead bytes (uncompressed) in the value log - #[must_use] - #[allow(clippy::cast_precision_loss)] - pub fn stale_ratio(&self) -> f32 { - let total_bytes = self.total_bytes(); - if total_bytes == 0 { - return 0.0; - } - - let stale_bytes = self.stale_bytes(); - - if stale_bytes == 0 { - return 0.0; - } - - stale_bytes as f32 / total_bytes as f32 - } - - /// Returns the approximate space amplification - /// - /// Returns 0.0 if there are no items or the entire value log is stale. - #[must_use] - #[allow(clippy::cast_precision_loss)] - pub fn space_amp(&self) -> f32 { - let total_bytes = self.total_bytes(); - if total_bytes == 0 { - return 0.0; - } - - let stale_bytes = self.stale_bytes(); - - let alive_bytes = total_bytes - stale_bytes; - if alive_bytes == 0 { - return 0.0; - } - - total_bytes as f32 / alive_bytes as f32 - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs::File; - use std::io::Write; - use test_log::test; - - #[test] - fn test_atomic_rewrite() -> crate::Result<()> { - let dir = tempfile::tempdir()?; - - let path = dir.path().join("test.txt"); - { - let mut file = File::create(&path)?; - write!(file, "asdasdasdasdasd")?; - } - - rewrite_atomic(&path, b"newcontent")?; - - let content = std::fs::read_to_string(&path)?; - assert_eq!("newcontent", content); - - Ok(()) - } -} diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index c3cec95f..5efa645c 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -2,32 +2,19 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +mod accessor; pub mod blob_file; -mod gc; mod handle; -mod index; -// mod manifest; -mod accessor; #[doc(hidden)] pub mod scanner; -mod value_log; - pub use { - accessor::Accessor, - blob_file::multi_writer::MultiWriter as BlobFileWriter, - gc::report::GcReport, - gc::{GcStrategy, SpaceAmpStrategy, StaleThresholdStrategy}, - handle::ValueHandle, - index::{Reader as IndexReader, Writer as IndexWriter}, - value_log::ValueLog, + accessor::Accessor, blob_file::multi_writer::MultiWriter as BlobFileWriter, + blob_file::BlobFile, handle::ValueHandle, }; -#[doc(hidden)] -pub use blob_file::{reader::Reader as BlobFileReader, BlobFile}; - -use crate::vlog::blob_file::{trailer::Trailer, GcStats, Inner as BlobFileInner}; +use crate::vlog::blob_file::{trailer::Trailer, Inner as BlobFileInner}; use std::{path::Path, sync::Arc}; pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result> { @@ -56,7 +43,6 @@ pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result; /// Scans a value log, building a size map for the GC report -pub struct Scanner<'a, I: Iterator>> { +pub struct Scanner>> { iter: I, - #[allow(unused)] - lock_guard: MutexGuard<'a, ()>, - + // #[allow(unused)] + // lock_guard: MutexGuard<'a, ()>, size_map: SizeMap, } -impl<'a, I: Iterator>> Scanner<'a, I> { - pub fn 
new(iter: I, lock_guard: MutexGuard<'a, ()>, ids: &[BlobFileId]) -> Self { +impl>> Scanner { + pub fn new(iter: I, ids: &[BlobFileId]) -> Self { let mut size_map = BTreeMap::default(); for &id in ids { size_map.insert(id, BlobFileCounter::default()); } - Self { - iter, - lock_guard, - size_map, - } + Self { iter, size_map } } pub fn finish(self) -> SizeMap { diff --git a/src/vlog/value_log.rs b/src/vlog/value_log.rs deleted file mode 100644 index 44863736..00000000 --- a/src/vlog/value_log.rs +++ /dev/null @@ -1,631 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::{ - // file::VLOG_MARKER, - vlog::{ - blob_file::{ - gc_stats::GcStats, merge::MergeReader, meta::Metadata, Inner as BlobFileInner, - }, - gc::report::GcReport, - index::Writer as IndexWriter, - scanner::SizeMap, - BlobFile, BlobFileId, BlobFileWriter, GcStrategy, IndexReader, ValueHandle, - }, - Cache, - DescriptorTable, - KeyRange, - UserValue, -}; -use std::{ - path::{Path, PathBuf}, - sync::{atomic::AtomicU64, Arc, Mutex}, -}; - -// // TODO: use other counter struct -// #[allow(clippy::module_name_repetitions)] -// #[derive(Clone, Default)] -// pub struct IdGenerator(Arc); - -// impl std::ops::Deref for IdGenerator { -// type Target = Arc; - -// fn deref(&self) -> &Self::Target { -// &self.0 -// } -// } - -// impl IdGenerator { -// pub fn new(start: u64) -> Self { -// Self(Arc::new(AtomicU64::new(start))) -// } - -// pub fn next(&self) -> BlobFileId { -// self.fetch_add(1, std::sync::atomic::Ordering::SeqCst) -// } -// } - -fn unlink_blob_files(base_path: &Path, ids: &[BlobFileId]) { - unimplemented!() - - // for id in ids { - // let path = base_path.join(BLOB_FILES_FOLDER).join(id.to_string()); - - // if let Err(e) = std::fs::remove_file(&path) { - // log::error!("Could not free blob file at {path:?}: {e:?}"); - // } - // } -} - -/// A disk-resident value log -#[derive(Clone)] -pub struct ValueLog(Arc); - -impl std::ops::Deref for ValueLog { - type Target = ValueLogInner; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct ValueLogInner { - /// Base folder - pub path: PathBuf, - - /// Value log configuration - // config: Config, - - /// In-memory blob cache - // blob_cache: Arc, - - /// In-memory FD cache - // fd_cache: Arc, - - // /// Generator to get next blob file ID - // id_generator: IdGenerator, - - /// Guards the rollover (compaction) process to only - /// allow one to happen at a time - #[doc(hidden)] - pub rollover_guard: Mutex<()>, -} - -impl ValueLog { - /// Creates or recovers a value log in the given directory. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn open>( - path: P, // TODO: move path into config? - // config: Config, - ) -> crate::Result { - // let path = path.into(); - - // if path.join(VLOG_MARKER).try_exists()? { - // Self::recover(path, config) - // } else { - // Self::create_new(path, config) - // } - - unimplemented!() - } - - /* /// Prints fragmentation histogram. 
- pub fn print_fragmentation_histogram(&self) { - let lock = self.manifest.blob_files.read().expect("lock is poisoned"); - - for (id, blob_file) in &*lock { - let stale_ratio = blob_file.stale_ratio(); - - let progress = (stale_ratio * 10.0) as usize; - let void = 10 - progress; - - let progress = "=".repeat(progress); - let void = " ".repeat(void); - - println!( - "{id:0>4} [{progress}{void}] {}%", - (stale_ratio * 100.0) as usize - ); - } - } */ - - #[doc(hidden)] - pub fn verify(&self) -> crate::Result { - unimplemented!() - - // let _lock = self.rollover_guard.lock().expect("lock is poisoned"); - - // let mut sum = 0; - - // for item in self.get_reader()? { - // let (k, v, _, expected_checksum) = item?; - - // let mut hasher = xxhash_rust::xxh3::Xxh3::new(); - // hasher.update(&k); - // hasher.update(&v); - - // if hasher.digest() != expected_checksum { - // sum += 1; - // } - // } - - // Ok(sum) - } - - /// Creates a new empty value log in a directory. - pub(crate) fn create_new>(path: P) -> crate::Result { - let path = path.into(); - - let path = crate::path::absolute_path(&path); - log::trace!("Creating value-log at {}", path.display()); - - std::fs::create_dir_all(&path)?; - - // let marker_path = path.join(VLOG_MARKER); - // assert!(!marker_path.try_exists()?); - - // NOTE: Lastly, fsync .vlog marker, which contains the version - // -> the V-log is fully initialized - - // let mut file = std::fs::File::create(marker_path)?; - // FormatVersion::V3.write_file_header(&mut file)?; - // file.sync_all()?; - - #[cfg(not(target_os = "windows"))] - { - // fsync folders on Unix - - let folder = std::fs::File::open(&path)?; - folder.sync_all()?; - } - - // let blob_cache = config.blob_cache.clone(); - // let fd_cache = config.fd_cache.clone(); - // let manifest = Manifest::create_new(&path)?; - - Ok(Self(Arc::new(ValueLogInner { - // config, - path, - // blob_cache, - // fd_cache, - // manifest, - // id_generator: IdGenerator::default(), - rollover_guard: Mutex::new(()), - }))) - } - - /// Returns the number of blob files in the value log. - #[must_use] - pub fn blob_file_count(&self) -> usize { - unimplemented!() - - // self.manifest.len() - } - - /// Resolves a value handle. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn get(&self, vhandle: &ValueHandle) -> crate::Result> { - self.get_with_prefetch(vhandle, 0) - } - - /// Resolves a value handle, and prefetches some values after it. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. 
- pub fn get_with_prefetch( - &self, - vhandle: &ValueHandle, - prefetch_size: usize, - ) -> crate::Result> { - // TODO:, first rewrite blob files to use pread - Ok(None) - - // if let Some(value) = self.blob_cache.get(self.id, vhandle) { - // return Ok(Some(value)); - // } - - // let Some(blob_file) = self.manifest.get_blob_file(vhandle.blob_file_id) else { - // return Ok(None); - // }; - - // // TODO: get FD from cache or open and insert - // // let mut reader = match self - // // .fd_cache - // // .access_for_blob_file(&GlobalSegmentId::from((self.id, vhandle.blob_file_id))) - // // { - // // Some(fd) => fd, - // // None => BufReader::new(File::open(&blob_file.path)?), - // // }; - - // let mut reader = BlobFileReader::with_reader(vhandle.blob_file_id, reader) - // .use_compression(self.config.compression.clone()); - - // let Some(item) = reader.next() else { - // return Ok(None); - // }; - // let (_key, val, _checksum) = item?; - - // self.blob_cache.insert(self.id, vhandle, val.clone()); - - // // TODO: maybe we can look at the value size and prefetch some more values - // // without causing another I/O... - // // TODO: benchmark range reads for rather small non-inlined blobs (maybe ~512-1000B) - // // and see how different BufReader capacities and prefetch changes range read performance - // for _ in 0..prefetch_size { - // let offset = reader.get_offset()?; - - // let Some(item) = reader.next() else { - // break; - // }; - // let (_key, val, _checksum) = item?; - - // let value_handle = ValueHandle { - // blob_file_id: vhandle.blob_file_id, - // offset, - // }; - - // self.blob_cache.insert(self.id, &value_handle, val); - // } - - // Ok(Some(val)) - } - - fn get_writer_raw(&self) -> crate::Result { - unimplemented!() - - // BlobFileWriter::new( - // self.id_generator.clone(), - // self.config.blob_file_size_bytes, - // &self.path, - // ) - // .map_err(Into::into) - } - - /// Initializes a new blob file writer. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn get_writer(&self) -> crate::Result { - unimplemented!() - - // self.get_writer_raw() - // .map(|x| x.use_compression(self.config.compression)) - } - - /// Drops stale blob files. - /// - /// Returns the amount of disk space (compressed data) freed. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn drop_stale_blob_files(&self) -> crate::Result { - unimplemented!() - - // // IMPORTANT: Only allow 1 rollover or GC at any given time - // let _guard = self.rollover_guard.lock().expect("lock is poisoned"); - - // let blob_files = self - // .manifest - // .blob_files - // .read() - // .expect("lock is poisoned") - // .values() - // .filter(|x| x.is_stale()) - // .cloned() - // .collect::>(); - - // let bytes_freed = blob_files.iter().map(|x| x.meta.compressed_bytes).sum(); - - // let ids = blob_files.iter().map(|x| x.id).collect::>(); - - // if ids.is_empty() { - // log::trace!("No blob files to drop"); - // } else { - // log::info!("Dropping stale blob files: {ids:?}"); - // self.manifest.drop_blob_files(&ids)?; - - // for blob_file in blob_files { - // std::fs::remove_file(&blob_file.path)?; - // } - // } - - // Ok(bytes_freed) - } - - /// Marks some blob files as stale. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. 
- fn mark_as_stale(&self, ids: &[BlobFileId]) { - unimplemented!() - - // // NOTE: Read-locking is fine because we are dealing with an atomic bool - // #[allow(clippy::significant_drop_tightening)] - // let blob_files = self.manifest.blob_files.read().expect("lock is poisoned"); - - // for id in ids { - // let Some(blob_file) = blob_files.get(id) else { - // continue; - // }; - - // blob_file.mark_as_stale(); - // } - } - - // TODO: remove? - /// Returns the approximate space amplification. - /// - /// Returns 0.0 if there are no items. - #[must_use] - pub fn space_amp(&self) -> f32 { - unimplemented!() - - // self.manifest.space_amp() - } - - #[doc(hidden)] - #[allow(clippy::cast_precision_loss)] - #[must_use] - pub fn consume_scan_result(&self, size_map: &SizeMap) -> GcReport { - unimplemented!() - - // let mut report = GcReport { - // path: self.path.clone(), - // blob_file_count: self.blob_file_count(), - // stale_blob_file_count: 0, - // stale_bytes: 0, - // total_bytes: 0, - // stale_blobs: 0, - // total_blobs: 0, - // }; - - // for (&id, counter) in size_map { - // let blob_file = self - // .manifest - // .get_blob_file(id) - // .expect("blob file should exist"); - - // let total_bytes = blob_file.meta.total_uncompressed_bytes; - // let total_items = blob_file.meta.item_count; - - // report.total_bytes += total_bytes; - // report.total_blobs += total_items; - - // if counter.item_count > 0 { - // let used_size = counter.size; - // let alive_item_count = counter.item_count; - - // let blob_file = self - // .manifest - // .get_blob_file(id) - // .expect("blob file should exist"); - - // let stale_bytes = total_bytes - used_size; - // let stale_items = total_items - alive_item_count; - - // blob_file.gc_stats.set_stale_bytes(stale_bytes); - // blob_file.gc_stats.set_stale_items(stale_items); - - // report.stale_bytes += stale_bytes; - // report.stale_blobs += stale_items; - // } else { - // log::debug!( - // "Blob file #{id} has no incoming references - can be dropped, freeing {} KiB on disk (userdata={} MiB)", - // blob_file.meta.compressed_bytes / 1_024, - // total_bytes / 1_024 / 1_024, - // ); - // self.mark_as_stale(&[id]); - - // report.stale_blob_file_count += 1; - // report.stale_bytes += total_bytes; - // report.stale_blobs += total_items; - // } - // } - - // report - } - - /// Scans the given index and collects GC statistics. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - #[allow(clippy::significant_drop_tightening)] - pub fn scan_for_stats( - &self, - iter: impl Iterator>, - ) -> crate::Result { - unimplemented!() - - // let lock_guard = self.rollover_guard.lock().expect("lock is poisoned"); - - // let ids = self.manifest.list_blob_file_ids(); - - // let mut scanner = Scanner::new(iter, lock_guard, &ids); - // scanner.scan()?; - // let size_map = scanner.finish(); - // let report = self.consume_scan_result(&size_map); - - // Ok(report) - } - - #[doc(hidden)] - pub fn get_reader(&self) -> crate::Result { - unimplemented!() - - // let readers = self - // .manifest - // .blob_files - // .read() - // .expect("lock is poisoned") - // .values() - // .map(|x| x.scan()) - // .collect::>>()?; - - // Ok(MergeReader::new(readers)) - } - - /// Returns the amount of disk space (compressed data) freed. 
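
// NOTE: The (commented-out) rollover path below hinges on a single liveness
// rule per scanned blob inside the rewrite loop: look the key up in the index
// and keep the blob only if the handle still points at this copy. Sketched:
//
//     match index_reader.get(&key)? {
//         // a newer blob file now holds the live value -> this copy is stale
//         Some(vhandle) if blob_file_id < vhandle.blob_file_id => continue,
//         // the key is gone from the index entirely -> stale
//         None => continue,
//         // still referenced -> rewrite it into the new blob file
//         _ => {}
//     }
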
- #[doc(hidden)] - pub fn major_compact( - &self, - index_reader: &R, - index_writer: W, - ) -> crate::Result { - unimplemented!() - - // let ids = self.manifest.list_blob_file_ids(); - // self.rollover(&ids, index_reader, index_writer) - } - - /// Applies a GC strategy. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn apply_gc_strategy( - &self, - strategy: &impl GcStrategy, - index_reader: &R, - index_writer: W, - ) -> crate::Result { - unimplemented!() - - // let blob_file_ids = strategy.pick(self); - // self.rollover(&blob_file_ids, index_reader, index_writer) - } - - /// Atomically removes all data from the value log. - /// - /// If `prune_async` is set to `true`, the blob files will be removed from disk in a thread to avoid blocking. - pub fn clear(&self, prune_async: bool) -> crate::Result<()> { - unimplemented!() - - // let guard = self.rollover_guard.lock().expect("lock is poisoned"); - // let ids = self.manifest.list_blob_file_ids(); - // self.manifest.clear()?; - // drop(guard); - - // if prune_async { - // let path = self.path.clone(); - - // std::thread::spawn(move || { - // log::trace!("Pruning dropped blob files in thread: {ids:?}"); - // unlink_blob_files(&path, &ids); - // log::trace!("Successfully pruned all blob files"); - // }); - // } else { - // log::trace!("Pruning dropped blob files: {ids:?}"); - // unlink_blob_files(&self.path, &ids); - // log::trace!("Successfully pruned all blob files"); - // } - - // Ok(()) - } - - /// Rewrites some blob files into new blob files, blocking the caller - /// until the operation is completely done. - /// - /// Returns the amount of disk space (compressed data) freed. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - #[doc(hidden)] - pub fn rollover( - &self, - ids: &[u64], - index_reader: &R, - mut index_writer: W, - ) -> crate::Result { - unimplemented!() - - // if ids.is_empty() { - // return Ok(0); - // } - - // // IMPORTANT: Only allow 1 rollover or GC at any given time - // let _guard = self.rollover_guard.lock().expect("lock is poisoned"); - - // let size_before = self.manifest.disk_space_used(); - - // log::info!("Rollover blob files {ids:?}"); - - // let blob_files = ids - // .iter() - // .map(|&x| self.manifest.get_blob_file(x)) - // .collect::>>(); - - // let Some(blob_files) = blob_files else { - // return Ok(0); - // }; - - // let readers = blob_files - // .into_iter() - // .map(|x| x.scan()) - // .collect::>>()?; - - // // TODO: 3.0.0: Store uncompressed size per blob - // // so we can avoid recompression costs during GC - // // but have stats be correct - - // let reader = MergeReader::new( - // readers - // .into_iter() - // .map(|x| x.use_compression(self.config.compression.clone())) - // .collect(), - // ); - - // let mut writer = self - // .get_writer_raw()? - // .use_compression(self.config.compression.clone()); - - // for item in reader { - // let (k, v, blob_file_id, _) = item?; - - // match index_reader.get(&k)? 
{ - // // If this value is in an older blob file, we can discard it - // Some(vhandle) if blob_file_id < vhandle.blob_file_id => continue, - // None => continue, - // _ => {} - // } - - // let vhandle = writer.get_next_value_handle(); - - // // NOTE: Truncation is OK because we know values are u32 max - // #[allow(clippy::cast_possible_truncation)] - // index_writer.insert_indirect(&k, vhandle, v.len() as u32)?; - - // writer.write(&k, &v)?; - // } - - // // IMPORTANT: New blob files need to be persisted before adding to index - // // to avoid dangling pointers - // self.manifest.register(writer)?; - - // // NOTE: If we crash here, it's fine, the blob files are registered - // // but never referenced, so they can just be dropped after recovery - // index_writer.finish()?; - - // // IMPORTANT: We only mark the blob files as definitely stale - // // The external index needs to decide when it is safe to drop - // // the old blob files, as some reads may still be performed - // self.mark_as_stale(ids); - - // let size_after = self.manifest.disk_space_used(); - - // Ok(size_before.saturating_sub(size_after)) - } -} diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush._rs similarity index 100% rename from tests/blob_drop_after_flush.rs rename to tests/blob_drop_after_flush._rs diff --git a/tests/blob_flush_gc_stats.rs b/tests/blob_flush_gc_stats.rs new file mode 100644 index 00000000..8a2dd529 --- /dev/null +++ b/tests/blob_flush_gc_stats.rs @@ -0,0 +1,59 @@ +use lsm_tree::{AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_flush_gc_stats() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + let new_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.insert("big", &new_big_value, 1); + + tree.flush_active_memtable(1_000)?; + + // NOTE: The first big_value is dropped, so it never arrives in a blob file + assert_eq!(2, tree.approximate_len()); + } + + Ok(()) +} + +#[test] +fn blob_tree_flush_gc_stats_tombstone() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.remove("big", 1); + + tree.flush_active_memtable(1_000)?; + + // NOTE: The first big_value is dropped, so it never arrives in a blob file + assert_eq!(2, tree.approximate_len()); + } + + Ok(()) +} diff --git a/tests/blob_gc.rs b/tests/blob_gc._rs similarity index 100% rename from tests/blob_gc.rs rename to tests/blob_gc._rs diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark._rs similarity index 100% rename from tests/blob_gc_watermark.rs rename to tests/blob_gc_watermark._rs diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs new file mode 100644 index 00000000..776f9e1d --- /dev/null +++ b/tests/blob_major_compact_gc_stats.rs @@ -0,0 +1,106 @@ +use 
lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + let new_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(0)?; + + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.insert("big", &new_big_value, 1); + + tree.flush_active_memtable(0)?; + + tree.major_compact(64_000_000, 1_000)?; + + let gc_stats = tree + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // "big":0 is expired + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + Ok(()) +} + +// TODO: check that decompressed value size is used (enable compression) +#[test] +fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(0)?; + + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.remove("big", 1); + + tree.flush_active_memtable(0)?; + + tree.major_compact(64_000_000, 1_000)?; + + let gc_stats = tree + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // "big":0 is expired + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + Ok(()) +} diff --git a/tests/blob_recover_gc_stats.rs b/tests/blob_recover_gc_stats.rs new file mode 100644 index 00000000..ebdc5f83 --- /dev/null +++ b/tests/blob_recover_gc_stats.rs @@ -0,0 +1,77 @@ +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + let new_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(0)?; + + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.insert("big", &new_big_value, 1); + + tree.flush_active_memtable(0)?; + + tree.major_compact(64_000_000, 1_000)?; + + let gc_stats = tree + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // 
"big":0 is expired + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + let gc_stats = tree + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // "big":0 is still expired + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + Ok(()) +} diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 308bdc6c..8bd4ccaf 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -2,8 +2,7 @@ use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] -#[ignore] -fn blob_tree_simple() -> lsm_tree::Result<()> { +fn blob_tree_simple_flush_read() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); @@ -11,6 +10,9 @@ fn blob_tree_simple() -> lsm_tree::Result<()> { let new_big_value = b"winter!".repeat(128_000); { + // TODO: 3.0.0 just do Config.with_kv_separation().open() + // on recover, check manifest for type + // just return AnyTree let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); diff --git a/tests/blob_tombstone.rs b/tests/blob_tombstone._rs similarity index 100% rename from tests/blob_tombstone.rs rename to tests/blob_tombstone._rs diff --git a/tests/blob_tree_flush.rs b/tests/blob_tree_flush._rs similarity index 100% rename from tests/blob_tree_flush.rs rename to tests/blob_tree_flush._rs diff --git a/tests/tree_sealed_shadowing.rs b/tests/tree_sealed_shadowing.rs index e6d02772..8221e77e 100644 --- a/tests/tree_sealed_shadowing.rs +++ b/tests/tree_sealed_shadowing.rs @@ -19,8 +19,8 @@ fn tree_sealed_memtable_tombstone_shadowing() -> lsm_tree::Result<()> { let (id, memtable) = tree.rotate_memtable().unwrap(); assert!(!tree.contains_key("a", SeqNo::MAX)?); - let (segment, _) = tree.flush_memtable(id, &memtable, 0)?.unwrap(); - tree.register_segments(&[segment], None, 0)?; + let (segment, _, _) = tree.flush_memtable(id, &memtable, 0)?.unwrap(); + tree.register_segments(&[segment], None, None, 0)?; assert!(!tree.contains_key("a", SeqNo::MAX)?); diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index 8316c2b9..8ed70b62 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -45,8 +45,8 @@ fn tree_highest_seqno() -> lsm_tree::Result<()> { assert_eq!(tree.get_highest_memtable_seqno(), Some(4)); assert_eq!(tree.get_highest_persisted_seqno(), Some(3)); - let (segment, _) = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); - tree.register_segments(&[segment], None, 0)?; + let (segment, _, _) = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); + tree.register_segments(&[segment], None, None, 0)?; assert_eq!(tree.get_highest_seqno(), Some(4)); assert_eq!(tree.get_highest_memtable_seqno(), None); From d37773df3bd3b660b6efdf7fef46d2a4051be6e7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 28 Sep 2025 17:23:22 +0200 Subject: [PATCH 475/613] impl compaction stream expired KV callback and install blob callback --- src/compaction/stream.rs | 78 +++++++++++++++++++++++++++++++++++++--- src/compaction/worker.rs | 21 +++++++++-- 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index 47b43651..3c4c8af4 100644 --- a/src/compaction/stream.rs +++ 
b/src/compaction/stream.rs @@ -7,19 +7,30 @@ use std::iter::Peekable; type Item = crate::Result; +/// A callback that receives all expired KVs +/// +/// Used for counting blobs that are not referenced anymore because of +/// vHandles that are being dropped through compaction. +pub trait ExpiredKvCallback { + fn on_expired(&mut self, kv: &InternalValue); +} + /// Consumes a stream of KVs and emits a new stream according to GC and tombstone rules /// /// This iterator is used during flushing & compaction. #[allow(clippy::module_name_repetitions)] -pub struct CompactionStream> { +pub struct CompactionStream<'a, I: Iterator> { /// KV stream inner: Peekable, /// MVCC watermark to get rid of old versions gc_seqno_threshold: SeqNo, + + /// Event emitter that receives all expired KVs + expiration_callback: Option<&'a mut dyn ExpiredKvCallback>, } -impl> CompactionStream { +impl<'a, I: Iterator> CompactionStream<'a, I> { /// Initializes a new merge iterator #[must_use] pub fn new(iter: I, gc_seqno_threshold: SeqNo) -> Self { @@ -28,15 +39,30 @@ impl> CompactionStream { Self { inner: iter, gc_seqno_threshold, + expiration_callback: None, } } + /// Installs a callback that receives all expired KVs. + pub fn with_expiration_callback(mut self, cb: &'a mut dyn ExpiredKvCallback) -> Self { + self.expiration_callback = Some(cb); + self + } + /// Drains the remaining versions of the given key. fn drain_key(&mut self, key: &UserKey) -> crate::Result<()> { loop { let Some(next) = self.inner.next_if(|kv| { if let Ok(kv) = kv { - kv.key.user_key == key + let expired = kv.key.user_key == key; + + if expired { + if let Some(watcher) = &mut self.expiration_callback { + watcher.on_expired(kv); + } + } + + expired } else { true } @@ -49,7 +75,7 @@ impl> CompactionStream { } } -impl> Iterator for CompactionStream { +impl> Iterator for CompactionStream<'_, I> { type Item = Item; fn next(&mut self) -> Option { @@ -136,6 +162,50 @@ mod tests { }; } + #[test] + #[allow(clippy::unwrap_used)] + fn compaction_stream_expired_callback_1() -> crate::Result<()> { + #[derive(Default)] + struct MyCallback { + items: Vec, + } + + impl ExpiredKvCallback for MyCallback { + fn on_expired(&mut self, kv: &InternalValue) { + self.items.push(kv.clone()); + } + } + + #[rustfmt::skip] + let vec = stream![ + "a", "", "T", + "a", "", "T", + "a", "", "T", + ]; + + let mut my_watcher = MyCallback::default(); + + let iter = vec.iter().cloned().map(Ok); + let mut iter = CompactionStream::new(iter, 1_000).with_expiration_callback(&mut my_watcher); + + assert_eq!( + // Seqno is reset to 0 + InternalValue::from_components(*b"a", *b"", 0, ValueType::Tombstone), + iter.next().unwrap()?, + ); + iter_closed!(iter); + + assert_eq!( + [ + InternalValue::from_components("a", "", 998, ValueType::Value), + InternalValue::from_components("a", "", 997, ValueType::Value), + ], + &*my_watcher.items, + ); + + Ok(()) + } + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_seqno_zeroing_1() -> crate::Result<()> { diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index ca3730cf..3ccda50f 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -4,6 +4,7 @@ use super::{CompactionStrategy, Input as CompactionPayload}; use crate::{ + blob_tree::FragmentationMap, compaction::{stream::CompactionStream, Choice}, file::SEGMENTS_FOLDER, level_manifest::LevelManifest, @@ -12,7 +13,7 @@ use crate::{ segment::{multi_writer::MultiWriter, Segment}, stop_signal::StopSignal, tree::inner::TreeId, - Config, InternalValue, SegmentId, SeqNo, + 
Config, InternalValue, SegmentId, SeqNo, TreeType, }; use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, @@ -101,7 +102,7 @@ fn create_compaction_stream<'a>( levels: &LevelManifest, to_compact: &[SegmentId], eviction_seqno: SeqNo, -) -> crate::Result>>>> { +) -> crate::Result>>>> { let mut readers: Vec> = vec![]; let mut found = 0; @@ -246,7 +247,9 @@ fn merge_segments( opts.eviction_seqno, ); - let Some(merge_iter) = create_compaction_stream( + let mut blob_frag_map = FragmentationMap::default(); + + let Some(mut merge_iter) = create_compaction_stream( &levels, &payload.segment_ids.iter().copied().collect::>(), opts.eviction_seqno, @@ -317,6 +320,11 @@ fn merge_segments( } }); + // NOTE: If we are a blob tree, install callback to listen for evicted KVs + if opts.config.tree_type == TreeType::Blob { + merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); + } + for (idx, item) in merge_iter.enumerate() { let item = match item { Ok(v) => v, @@ -464,12 +472,19 @@ fn merge_segments( let mut levels = opts.levels.write().expect("lock is poisoned"); log::trace!("compactor: acquired levels manifest write lock"); + log::trace!("Blob fragmentation diff: {blob_frag_map:#?}"); + let swap_result = levels.atomic_swap( |current| { current.with_merge( &payload.segment_ids.iter().copied().collect::>(), &created_segments, payload.dest_level as usize, + if blob_frag_map.is_empty() { + None + } else { + Some(blob_frag_map) + }, ) }, opts.eviction_seqno, From 8c87feee49d9ec7da5ec160f08265140bebaeed2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 28 Sep 2025 17:27:57 +0200 Subject: [PATCH 476/613] wip --- src/blob_tree/gc.rs | 2 +- src/vlog/mod.rs | 3 --- src/vlog/scanner.rs | 61 --------------------------------------------- 3 files changed, 1 insertion(+), 65 deletions(-) delete mode 100644 src/vlog/scanner.rs diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index b0d9ecc2..c385aa13 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -169,7 +169,7 @@ mod tests { }); let encoded = map.encode_into_vec(); - let decoded = FragmentationMap::decode_from(&mut &encoded[..]).unwrap(); + let decoded = FragmentationMap::decode_from(&mut &encoded[..]).expect("should decode map"); assert_eq!(map, decoded); } diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 5efa645c..4d1f3e7e 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -6,9 +6,6 @@ mod accessor; pub mod blob_file; mod handle; -#[doc(hidden)] -pub mod scanner; - pub use { accessor::Accessor, blob_file::multi_writer::MultiWriter as BlobFileWriter, blob_file::BlobFile, handle::ValueHandle, diff --git a/src/vlog/scanner.rs b/src/vlog/scanner.rs deleted file mode 100644 index 1f3291d0..00000000 --- a/src/vlog/scanner.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::vlog::{BlobFileId, ValueHandle}; -use std::collections::BTreeMap; - -#[derive(Debug, Default)] -pub struct BlobFileCounter { - pub size: u64, - pub item_count: u64, -} - -pub type SizeMap = BTreeMap; - -/// Scans a value log, building a size map for the GC report -pub struct Scanner>> { - iter: I, - - // #[allow(unused)] - // lock_guard: MutexGuard<'a, ()>, - size_map: SizeMap, -} - -impl>> Scanner { - pub fn new(iter: I, ids: &[BlobFileId]) -> Self { - let mut size_map = BTreeMap::default(); - - for &id in ids { - size_map.insert(id, BlobFileCounter::default()); - } - - Self { iter, 
size_map } - } - - pub fn finish(self) -> SizeMap { - self.size_map - } - - pub fn scan(&mut self) -> crate::Result<()> { - for vhandle in self.iter.by_ref() { - let (vhandle, size) = vhandle - .map_err(|_| crate::Error::Io(std::io::Error::other("Index returned error")))?; - - let size = u64::from(size); - - self.size_map - .entry(vhandle.blob_file_id) - .and_modify(|x| { - x.item_count += 1; - x.size += size; - }) - .or_insert_with(|| BlobFileCounter { - size, - item_count: 1, - }); - } - - Ok(()) - } -} From a00270bbb1e6fef407da8fa6dd884ab66e0da971 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 28 Sep 2025 17:43:55 +0200 Subject: [PATCH 477/613] FragmentationMap::prune --- src/blob_tree/gc.rs | 15 +++++++++++++-- src/version/mod.rs | 13 ++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index c385aa13..33adb29a 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -6,7 +6,9 @@ use crate::{ blob_tree::value::{MaybeInlineValue, TAG_INDIRECT}, compaction::stream::ExpiredKvCallback, vlog::BlobFileId, + BlobFile, }; +use std::collections::BTreeMap; #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct FragmentationEntry { @@ -36,6 +38,13 @@ impl std::ops::Deref for FragmentationMap { } impl FragmentationMap { + // TODO: unit test + /// Removes blob file entries that are not part of the value log (anymore) + /// to reduce linear memory growth. + pub fn prune(&mut self, value_log: &BTreeMap) { + self.0.retain(|k, _| value_log.contains_key(k)); + } + // TODO: unit test pub fn merge_into(self, other: &mut Self) { for (blob_file_id, diff) in self.0 { @@ -106,7 +115,9 @@ impl ExpiredKvCallback for FragmentationMap { return; } - let tag = *kv.value.first().expect("value should not be empty"); + let Some(tag) = kv.value.first().copied() else { + return; + }; if tag == TAG_INDIRECT { let parsed_indirection = @@ -135,6 +146,7 @@ impl ExpiredKvCallback for FragmentationMap { } #[cfg(test)] +#[allow(clippy::expect_used)] mod tests { use super::*; use crate::{ @@ -146,7 +158,6 @@ mod tests { use std::collections::HashMap; use test_log::test; - /// Tests encoding and decoding traits #[test] fn frag_map_roundtrip() { let map = FragmentationMap({ diff --git a/src/version/mod.rs b/src/version/mod.rs index 035a1dba..02bfcf56 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -310,8 +310,7 @@ impl Version { let gc_map = if let Some(diff) = diff { let mut copy: FragmentationMap = self.gc_stats.deref().clone(); diff.merge_into(&mut copy); - // TODO: if a blob file is not part of the version anymore, prune its entry from map - // to garbage collect old map entries -> otherwise, monotonically increasing memory usage + copy.prune(&self.value_log); Arc::new(copy) } else { self.gc_stats.clone() @@ -330,7 +329,7 @@ impl Version { /// Returns a new version with a list of segments removed. /// - /// The segment files are not immediately deleted, this is handled in the compaction worker. + /// The segment files are not immediately deleted, this is handled by the version system's free list. 
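// (A possible unit test for the `merge_into` / `prune` unit-test TODOs above; a
// sketch only, not part of the patch. It assumes `FragmentationMap` keeps
// wrapping `crate::HashMap<BlobFileId, FragmentationEntry>`, that
// `FragmentationEntry::new` keeps its `(len, bytes)` argument order, and that
// merging sums per-blob-file counters instead of replacing them.)
#[test]
fn frag_map_merge_into_sums_counters() {
    let mut base = FragmentationMap({
        let mut map = crate::HashMap::default();
        map.insert(0, FragmentationEntry::new(1, 100));
        map
    });

    let diff = FragmentationMap({
        let mut map = crate::HashMap::default();
        map.insert(0, FragmentationEntry::new(2, 50)); // same blob file -> summed
        map.insert(1, FragmentationEntry::new(1, 10)); // unseen blob file -> inserted
        map
    });

    diff.merge_into(&mut base);

    assert_eq!(Some(&FragmentationEntry::new(3, 150)), base.get(&0));
    assert_eq!(Some(&FragmentationEntry::new(1, 10)), base.get(&1));
}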
     pub fn with_dropped(&self, ids: &[SegmentId]) -> Self {
         let id = self.id + 1;
 
@@ -354,14 +353,15 @@ impl Version {
             levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect()));
         }
 
-        // TODO: adjust GC stats if needed
+        // TODO: adjust GC stats based on the dropped tables' blob file links
+        // TODO: add unit test
 
         Self {
             inner: Arc::new(VersionInner {
                 id,
                 levels,
                 value_log: self.value_log.clone(),
-                gc_stats: Arc::default(),
+                gc_stats: self.gc_stats.clone(),
             }),
             seqno_watermark: 0,
         }
@@ -403,8 +403,7 @@ impl Version {
         let gc_map = if let Some(diff) = diff {
             let mut copy: FragmentationMap = self.gc_stats.deref().clone();
             diff.merge_into(&mut copy);
-            // TODO: if a blob file is not part of the version anymore, prune its entry from map
-            // to garbage collect old map entries -> otherwise, monotonically increasing memory usage
+            copy.prune(&self.value_log);
             Arc::new(copy)
         } else {
             self.gc_stats.clone()

From 8c87feee49d9ec7da5ec160f08265140bebaeed2 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sun, 28 Sep 2025 18:09:55 +0200
Subject: [PATCH 478/613] Tree::stale_blob_bytes

---
 src/abstract.rs      |  5 +++++
 src/blob_tree/gc.rs  |  5 +++++
 src/blob_tree/mod.rs | 16 ++++++++++------
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/abstract.rs b/src/abstract.rs
index 0a3e775a..af4a69a7 100644
--- a/src/abstract.rs
+++ b/src/abstract.rs
@@ -91,6 +91,11 @@ pub trait AbstractTree {
     /// Will return `Err` if an IO error occurs.
     fn major_compact(&self, target_size: u64, seqno_threshold: SeqNo) -> crate::Result<()>;
 
+    /// Returns the disk space used by stale blobs.
+    fn stale_blob_bytes(&self) -> u64 {
+        0
+    }
+
     /// Gets the space usage of all filters in the tree.
     ///
     /// May not correspond to the actual memory size because filter blocks may be paged out.
diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs
index 33adb29a..df3ef31d 100644
--- a/src/blob_tree/gc.rs
+++ b/src/blob_tree/gc.rs
@@ -38,6 +38,11 @@ impl std::ops::Deref for FragmentationMap {
     }
 }
 
 impl FragmentationMap {
+    #[must_use]
+    pub fn stale_bytes(&self) -> u64 {
+        self.0.values().map(|x| x.bytes).sum()
+    }
+
     // TODO: unit test
     /// Removes blob file entries that are not part of the value log (anymore)
     /// to reduce linear memory growth.
     pub fn prune(&mut self, value_log: &BTreeMap<BlobFileId, BlobFile>) {
diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index 284bff62..ff3a8c0e 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -143,12 +143,6 @@ impl BlobTree {
         Ok(Some(item))
     }
 
-    #[must_use]
-    pub fn space_amp(&self) -> f32 {
-        // TODO: calculate using current version FragmentationMap
-        todo!()
-    }
-
     /// Consumes a [`BlobFileWriter`], returning a `BlobFile` handle.
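// (Sketch for the `space_amp` TODO removed above: one way a space-amplification
// estimate could be derived from the new stale-byte accounting. The
// `total_blob_bytes` input is hypothetical, not an existing API of this crate.)
#[allow(clippy::cast_precision_loss)]
fn blob_space_amp(total_blob_bytes: u64, stale_blob_bytes: u64) -> f32 {
    // live bytes = everything still referenced by some vHandle
    let live = total_blob_bytes.saturating_sub(stale_blob_bytes);

    if live == 0 {
        return 0.0;
    }

    total_blob_bytes as f32 / live as f32
}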
/// /// # Note @@ -425,6 +419,16 @@ impl AbstractTree for BlobTree { })) } + fn stale_blob_bytes(&self) -> u64 { + self.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .stale_bytes() + } + fn filter_size(&self) -> usize { self.index.filter_size() } From c2ca0556de11211e639850a11239cf37b3561400 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:50:28 +0200 Subject: [PATCH 479/613] Update mod.rs --- src/tree/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index ec583088..63ef16fb 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -145,6 +145,7 @@ impl AbstractTree for Tree { let mut count = 0; let mut last_key = None; + #[allow(clippy::explicit_counter_loop)] for (key, value) in iter { if let Some(last_key) = &last_key { assert!( From d3707d2b3c39d010896d70d032236d55fb97c5b7 Mon Sep 17 00:00:00 2001 From: zaidoon Date: Sun, 28 Sep 2025 12:27:05 -0400 Subject: [PATCH 480/613] handle empty ranges gracefully in drop_range without throwing errors --- src/abstract.rs | 7 ++++--- src/error.rs | 6 +----- src/tree/mod.rs | 31 +++++++++++++++++++++++-------- tests/tree_drop_range.rs | 29 +++++++++++++++-------------- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index b2fe6134..1954e657 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -78,12 +78,13 @@ pub trait AbstractTree { /// Drops segments that are fully contained in a given range. /// - /// Both range bounds must be inclusive and finite. + /// Accepts any `RangeBounds`, including unbounded or exclusive endpoints. + /// If the normalized lower bound is greater than the upper bound, the + /// method returns without performing any work. /// /// # Errors /// - /// Will return `Err` if an IO error occurs or if the provided bounds are - /// not supported. + /// Will return `Err` only if an IO error occurs during compaction. fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()>; /// Performs major compaction, blocking the caller until it's done. diff --git a/src/error.rs b/src/error.rs index 62a15c26..eeab1620 100644 --- a/src/error.rs +++ b/src/error.rs @@ -38,9 +38,6 @@ pub enum Error { /// Checksum that was saved in block header expected: Checksum, }, - - /// Provided range bounds are not supported by the requested operation - InvalidRangeBounds, } impl std::fmt::Display for Error { @@ -58,8 +55,7 @@ impl std::error::Error for Error { Self::Decompress(_) | Self::InvalidVersion(_) | Self::Unrecoverable - | Self::ChecksumMismatch { .. } - | Self::InvalidRangeBounds => None, + | Self::ChecksumMismatch { .. } => None, } } } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 0f55d57a..38935ab9 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -170,7 +170,12 @@ impl AbstractTree for Tree { } fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()> { - let bounds = Self::range_bounds_to_owned_bounds(&range)?; + let (bounds, is_empty) = Self::range_bounds_to_owned_bounds(&range)?; + + if is_empty { + return Ok(()); + } + let strategy = Arc::new(crate::compaction::drop_range::Strategy::new(bounds)); // IMPORTANT: Write lock so we can be the only compaction going on @@ -550,9 +555,19 @@ impl AbstractTree for Tree { } impl Tree { + /// Normalizes a user-provided range into owned `Bound` values. 
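// (Illustrative sketch of the emptiness rule documented below, under the same
// `Bound` semantics: a range is only "logically empty" when both endpoints are
// bounded and the lower key compares greater than the upper key. Note that
// `(Excluded(k), Excluded(k))` is not flagged as empty - it simply matches
// no segment, as the `tree_drop_range_exclusive_empty_interval` test shows.)
fn is_logically_empty(start: &std::ops::Bound<&[u8]>, end: &std::ops::Bound<&[u8]>) -> bool {
    use std::ops::Bound::{Excluded, Included};

    match (start, end) {
        (Included(lo) | Excluded(lo), Included(hi) | Excluded(hi)) => lo > hi,
        _ => false, // an unbounded endpoint can never be inverted
    }
}

// e.g. drop_range("c".."a") normalizes to an empty range and becomes a no-op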
+ /// + /// Returns a tuple containing: + /// - the `OwnedBounds` that mirror the original bounds semantics (including + /// inclusive/exclusive markers and unbounded endpoints), and + /// - a `bool` flag indicating whether the normalized range is logically + /// empty (e.g., when the lower bound is greater than the upper bound). + /// + /// Callers can use the flag to detect empty ranges and skip further work + /// while still having access to the normalized bounds for non-empty cases. fn range_bounds_to_owned_bounds, R: RangeBounds>( range: &R, - ) -> crate::Result { + ) -> crate::Result<(OwnedBounds, bool)> { use Bound::{Excluded, Included, Unbounded}; let start = match range.start_bound() { @@ -567,17 +582,17 @@ impl Tree { Unbounded => Unbounded, }; - if let (Included(lo), Included(hi)) + let is_empty = if let (Included(lo), Included(hi)) | (Included(lo), Excluded(hi)) | (Excluded(lo), Included(hi)) | (Excluded(lo), Excluded(hi)) = (&start, &end) { - if lo.as_ref() > hi.as_ref() { - return Err(crate::Error::InvalidRangeBounds); - } - } + lo.as_ref() > hi.as_ref() + } else { + false + }; - Ok(OwnedBounds { start, end }) + Ok((OwnedBounds { start, end }, is_empty)) } /// Opens an LSM-tree in the given directory. diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs index efe4d17e..755c5a5d 100644 --- a/tests/tree_drop_range.rs +++ b/tests/tree_drop_range.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, Config, Error, SeqNo, Tree}; +use lsm_tree::{AbstractTree, Config, SeqNo, Tree}; use std::ops::Bound::{Excluded, Included, Unbounded}; fn populate_segments(tree: &Tree) -> lsm_tree::Result<()> { @@ -58,7 +58,7 @@ fn tree_drop_range_lower_exclusive() -> lsm_tree::Result<()> { populate_segments(&tree)?; - tree.drop_range((Excluded("a"), Included("c")))?; + tree.drop_range::<&str, _>((Excluded("a"), Included("c")))?; assert!(tree.contains_key("a", SeqNo::MAX)?); assert!(!tree.contains_key("b", SeqNo::MAX)?); @@ -75,7 +75,7 @@ fn tree_drop_range_unbounded_lower_inclusive_upper() -> lsm_tree::Result<()> { populate_segments(&tree)?; - tree.drop_range((Unbounded, Included("c")))?; + tree.drop_range::<&str, _>((Unbounded, Included("c")))?; assert!(!tree.contains_key("a", SeqNo::MAX)?); assert!(!tree.contains_key("b", SeqNo::MAX)?); @@ -93,7 +93,7 @@ fn tree_drop_range_unbounded_lower_exclusive_upper() -> lsm_tree::Result<()> { populate_segments(&tree)?; - tree.drop_range((Unbounded, Excluded("d")))?; + tree.drop_range::<&str, _>((Unbounded, Excluded("d")))?; assert!(!tree.contains_key("a", SeqNo::MAX)?); assert!(!tree.contains_key("b", SeqNo::MAX)?); @@ -110,7 +110,7 @@ fn tree_drop_range_exclusive_empty_interval() -> lsm_tree::Result<()> { populate_segments(&tree)?; - tree.drop_range((Excluded("b"), Excluded("b")))?; + tree.drop_range::<&str, _>((Excluded("b"), Excluded("b")))?; assert!(tree.contains_key("a", SeqNo::MAX)?); assert!(tree.contains_key("b", SeqNo::MAX)?); @@ -168,18 +168,19 @@ fn tree_drop_range_clear_all() -> lsm_tree::Result<()> { } #[test] -fn tree_drop_range_invalid_bounds() -> lsm_tree::Result<()> { +fn tree_drop_range_inverted_bounds_is_noop() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder).open()?; - assert!(matches!( - tree.drop_range("c"..="a"), - Err(Error::InvalidRangeBounds) - )); - assert!(matches!( - tree.drop_range("c".."a"), - Err(Error::InvalidRangeBounds) - )); + populate_segments(&tree)?; + + tree.drop_range("c".."a")?; + tree.drop_range("c"..="a")?; + + // All keys remain because the range is treated as 
empty. + for key in 'a'..='e' { + assert!(tree.contains_key([key as u8], SeqNo::MAX)?); + } Ok(()) } From 3277d6dc34ec49a26e46e3bcb81b6b68c5d04ef6 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Sun, 28 Sep 2025 19:21:22 +0200 Subject: [PATCH 481/613] Update mod.rs --- src/tree/mod.rs | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 63ef16fb..83ccf2da 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -597,32 +597,6 @@ impl Tree { log::debug!("Finalized segment write at {}", segment_file_path.display()); - /* let block_index = - FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; - let block_index = Arc::new(BlockIndexImpl::Full(block_index)); - - let created_segment: Segment = SegmentInner { - path: segment_file_path.clone(), - - tree_id: self.id, - - metadata: trailer.metadata, - offsets: trailer.offsets, - - descriptor_table: self.config.descriptor_table.clone(), - block_index, - cache: self.config.cache.clone(), - - bloom_filter: Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr)?, - - is_deleted: AtomicBool::default(), - } - .into(); */ - - /* self.config - .descriptor_table - .insert(segment_file_path, created_segment.global_id()); */ - let pin_filter = self.config.filter_block_pinning_policy.get(0); let pin_index = self.config.filter_block_pinning_policy.get(0); From b9cd3a2ecf0370780fd67d053eccaf632a3d726b Mon Sep 17 00:00:00 2001 From: zaidoon Date: Sun, 28 Sep 2025 14:54:27 -0400 Subject: [PATCH 482/613] implement RangeBounds for OwnedBounds and add ContainedSegments iterator --- src/compaction/drop_range.rs | 60 ++++++++++++++++++++++++++++++++++-- tests/tree_drop_range.rs | 25 +++++++++++++++ 2 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs index 615651a6..9aac0219 100644 --- a/src/compaction/drop_range.rs +++ b/src/compaction/drop_range.rs @@ -7,7 +7,7 @@ use crate::{ config::Config, level_manifest::LevelManifest, slice::Slice, version::run::Ranged, KeyRange, }; use crate::{HashSet, Segment}; -use std::ops::Bound; +use std::ops::{Bound, RangeBounds}; #[derive(Clone, Debug)] pub struct OwnedBounds { @@ -15,6 +15,56 @@ pub struct OwnedBounds { pub end: Bound, } +impl RangeBounds for OwnedBounds { + fn start_bound(&self) -> Bound<&Slice> { + match &self.start { + Bound::Unbounded => Bound::Unbounded, + Bound::Included(key) => Bound::Included(key), + Bound::Excluded(key) => Bound::Excluded(key), + } + } + + fn end_bound(&self) -> Bound<&Slice> { + match &self.end { + Bound::Unbounded => Bound::Unbounded, + Bound::Included(key) => Bound::Included(key), + Bound::Excluded(key) => Bound::Excluded(key), + } + } +} + +struct ContainedSegments<'a> { + segments: &'a [Segment], + bounds: &'a OwnedBounds, + pos: usize, +} + +impl<'a> ContainedSegments<'a> { + fn new(segments: &'a [Segment], bounds: &'a OwnedBounds) -> Self { + Self { + segments, + bounds, + pos: 0, + } + } +} + +impl<'a> Iterator for ContainedSegments<'a> { + type Item = &'a Segment; + + fn next(&mut self) -> Option { + while let Some(segment) = self.segments.get(self.pos) { + self.pos += 1; + + if self.bounds.contains(segment.key_range()) { + return Some(segment); + } + } + + None + } +} + impl OwnedBounds { #[must_use] pub fn contains(&self, range: &KeyRange) -> bool { @@ -65,8 +115,12 @@ impl CompactionStrategy for Strategy { .iter_levels() .flat_map(|lvl| lvl.iter()) .flat_map(|run| { - 
run.iter() - .filter(|segment| self.bounds.contains(segment.key_range())) + let slice = run + .range_overlap_indexes(&self.bounds) + .and_then(|(lo, hi)| run.get(lo..=hi)) + .unwrap_or(&[]); + + ContainedSegments::new(slice, &self.bounds) }) .map(Segment::id) .collect(); diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs index 755c5a5d..11cffb65 100644 --- a/tests/tree_drop_range.rs +++ b/tests/tree_drop_range.rs @@ -33,6 +33,31 @@ fn tree_drop_range_basic() -> lsm_tree::Result<()> { Ok(()) } +#[test] +fn tree_drop_range_partial_segment_overlap_kept() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(&folder).open()?; + + for key in ['a', 'b', 'c', 'd', 'e'] { + tree.insert([key as u8], "", 0); + } + tree.flush_active_memtable(0)?; + + assert_eq!(1, tree.l0_run_count()); + assert_eq!(1, tree.segment_count()); + + tree.drop_range("b".."d")?; + + for key in ['a', 'b', 'c', 'd', 'e'] { + assert!(tree.contains_key([key as u8], SeqNo::MAX)?); + } + + assert_eq!(1, tree.l0_run_count()); + assert_eq!(1, tree.segment_count()); + + Ok(()) +} + #[test] fn tree_drop_range_upper_exclusive() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; From 22623e2db5f1e686f80e91e6fc9d13907ad0f214 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 00:57:54 +0200 Subject: [PATCH 483/613] wip --- src/vlog/handle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vlog/handle.rs b/src/vlog/handle.rs index a5900924..94e351ad 100644 --- a/src/vlog/handle.rs +++ b/src/vlog/handle.rs @@ -14,7 +14,7 @@ use varint_rs::{VarintReader, VarintWriter}; /// A value handle points into the value log #[allow(clippy::module_name_repetitions)] -#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] pub struct ValueHandle { /// Blob file ID pub blob_file_id: BlobFileId, From 032a55be7519e1dd5a017997e8e7d307808fb832 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 00:58:13 +0200 Subject: [PATCH 484/613] split ValueType into own file --- src/key.rs | 3 ++- src/lib.rs | 4 ++- src/range.rs | 14 +++++------ src/value.rs | 40 +----------------------------- src/value_type.rs | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 75 insertions(+), 49 deletions(-) create mode 100644 src/value_type.rs diff --git a/src/key.rs b/src/key.rs index d1520600..b59ef01b 100644 --- a/src/key.rs +++ b/src/key.rs @@ -43,6 +43,7 @@ impl std::fmt::Debug for InternalKey { ValueType::Value => "V", ValueType::Tombstone => "T", ValueType::WeakTombstone => "W", + ValueType::Indirection => "Vb", }, ) } @@ -65,7 +66,7 @@ impl InternalKey { } pub fn is_tombstone(&self) -> bool { - self.value_type == ValueType::Tombstone || self.value_type == ValueType::WeakTombstone + self.value_type.is_tombstone() } } diff --git a/src/lib.rs b/src/lib.rs index 9ccc9e73..12e613d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -220,6 +220,7 @@ mod format_version; mod time; mod tree; mod value; +mod value_type; mod version; mod vlog; @@ -258,7 +259,8 @@ pub use { seqno::SequenceNumberCounter, slice::Slice, tree::Tree, - value::{SeqNo, ValueType}, + value::SeqNo, + value_type::ValueType, vlog::BlobFile, }; diff --git a/src/range.rs b/src/range.rs index 3449234a..7b5034dd 100644 --- a/src/range.rs +++ b/src/range.rs @@ -152,12 +152,12 @@ impl TreeIter { Bound::Included(key) => Bound::Included(InternalKey::new( key.as_ref(), SeqNo::MAX, - crate::value::ValueType::Tombstone, + crate::ValueType::Tombstone, )), 
Bound::Excluded(key) => Bound::Excluded(InternalKey::new( key.as_ref(), 0, - crate::value::ValueType::Tombstone, + crate::ValueType::Tombstone, )), Bound::Unbounded => Bound::Unbounded, }; @@ -177,15 +177,13 @@ impl TreeIter { // abcdef -> 6 // abcdef -> 5 // - Bound::Included(key) => Bound::Included(InternalKey::new( - key.as_ref(), - 0, - crate::value::ValueType::Value, - )), + Bound::Included(key) => { + Bound::Included(InternalKey::new(key.as_ref(), 0, crate::ValueType::Value)) + } Bound::Excluded(key) => Bound::Excluded(InternalKey::new( key.as_ref(), SeqNo::MAX, - crate::value::ValueType::Value, + crate::ValueType::Value, )), Bound::Unbounded => Bound::Unbounded, }; diff --git a/src/value.rs b/src/value.rs index efc190f2..a874b805 100644 --- a/src/value.rs +++ b/src/value.rs @@ -2,11 +2,10 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{key::InternalKey, Slice}; +use crate::{key::InternalKey, Slice, ValueType}; /// User defined key pub type UserKey = Slice; - /// User defined data (blob of bytes) #[allow(clippy::module_name_repetitions)] pub type UserValue = Slice; @@ -22,43 +21,6 @@ pub type UserValue = Slice; /// Stale items are lazily garbage-collected during compaction. pub type SeqNo = u64; -/// Value type (regular value or tombstone) -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -#[allow(clippy::module_name_repetitions)] -pub enum ValueType { - /// Existing value - Value, - - /// Deleted value - Tombstone, - - /// "Weak" deletion (a.k.a. `SingleDelete` in `RocksDB`) - WeakTombstone, -} - -impl TryFrom for ValueType { - type Error = (); - - fn try_from(value: u8) -> Result { - match value { - 0 => Ok(Self::Value), - 1 => Ok(Self::Tombstone), - 2 => Ok(Self::WeakTombstone), - _ => Err(()), - } - } -} - -impl From for u8 { - fn from(value: ValueType) -> Self { - match value { - ValueType::Value => 0, - ValueType::Tombstone => 1, - ValueType::WeakTombstone => 2, - } - } -} - /// Internal representation of KV pairs #[allow(clippy::module_name_repetitions)] #[derive(Clone, Eq)] diff --git a/src/value_type.rs b/src/value_type.rs new file mode 100644 index 00000000..5d24450f --- /dev/null +++ b/src/value_type.rs @@ -0,0 +1,63 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +// TODO: remove MaybeInlineValue because we can just store values flat and look at key instead + +// TODO: add ValueType::is_vhandle + +/// Value type (regular value or tombstone) +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[allow(clippy::module_name_repetitions)] +pub enum ValueType { + /// Existing value + Value, + + /// Deleted value + Tombstone, + + /// "Weak" deletion (a.k.a. `SingleDelete` in `RocksDB`) + WeakTombstone, + + /// Value handle + /// + /// Points to a blob in a blob file. + Indirection, +} + +impl ValueType { + /// Returns `true` if the type is a tombstone marker (either normal or weak). 
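// (A possible sanity check for the tag encoding below; a sketch that assumes
// the `TryFrom<u8>` and `From<ValueType>` impls stay exact inverses and that
// unknown tags are rejected instead of being misread.)
#[test]
fn value_type_tag_roundtrip() {
    for vt in [
        ValueType::Value,
        ValueType::Tombstone,
        ValueType::WeakTombstone,
        ValueType::Indirection,
    ] {
        assert_eq!(Ok(vt), ValueType::try_from(u8::from(vt)));
    }

    // an arbitrary tag value that is not mapped to any variant
    assert!(ValueType::try_from(42).is_err());
}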
+    #[must_use]
+    pub fn is_tombstone(self) -> bool {
+        self == Self::Tombstone || self == Self::WeakTombstone
+    }
+
+    pub(crate) fn is_indirection(self) -> bool {
+        self == Self::Indirection
+    }
+}
+
+impl TryFrom<u8> for ValueType {
+    type Error = ();
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        match value {
+            0 => Ok(Self::Value),
+            0b0000_0001 => Ok(Self::Tombstone),
+            0b0000_0011 => Ok(Self::WeakTombstone),
+            0b1000_0000 => Ok(Self::Indirection),
+            _ => Err(()),
+        }
+    }
+}
+
+impl From<ValueType> for u8 {
+    fn from(value: ValueType) -> Self {
+        match value {
+            ValueType::Value => 0,
+            ValueType::Tombstone => 0b0000_0001,
+            ValueType::WeakTombstone => 0b0000_0011,
+            ValueType::Indirection => 0b1000_0000,
+        }
+    }
+}

From 3bbb90ecd9594ec15a0bc7f791fbb6b8e7f5750c Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 29 Sep 2025 00:58:47 +0200
Subject: [PATCH 485/613] test: add assertion

---
 tests/blob_recover_gc_stats.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/blob_recover_gc_stats.rs b/tests/blob_recover_gc_stats.rs
index ebdc5f83..edc86c7e 100644
--- a/tests/blob_recover_gc_stats.rs
+++ b/tests/blob_recover_gc_stats.rs
@@ -20,13 +20,13 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> {
         assert_eq!(&*value, big_value);
 
         tree.flush_active_memtable(0)?;
-
         assert_eq!(1, tree.segment_count());
         assert_eq!(1, tree.blob_file_count());
 
         tree.insert("big", &new_big_value, 1);
 
         tree.flush_active_memtable(0)?;
+        assert_eq!(2, tree.blob_file_count());
 
         tree.major_compact(64_000_000, 1_000)?;
 
From 42dc7e7fffa20db3d1a0fc4c755ef3c72eaac6e2 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 29 Sep 2025 00:58:54 +0200
Subject: [PATCH 486/613] wip

---
 src/compaction/stream.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs
index 3c4c8af4..7618b339 100644
--- a/src/compaction/stream.rs
+++ b/src/compaction/stream.rs
@@ -126,7 +126,7 @@ impl<I: Iterator<Item = Item>> Iterator for CompactionStream<'_, I> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::value::{InternalValue, ValueType};
+    use crate::{value::InternalValue, ValueType};
     use test_log::test;

From 997daa08008fa1d703fa4ecf668b7cee83c30c73 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Mon, 29 Sep 2025 00:59:03 +0200
Subject: [PATCH 487/613] wip

---
 src/memtable/mod.rs | 7 +++++--
 src/mvcc_stream.rs  | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs
index d5026ab0..559c9c01 100644
--- a/src/memtable/mod.rs
+++ b/src/memtable/mod.rs
@@ -3,7 +3,10 @@
 use crate::key::InternalKey;
-use crate::value::{InternalValue, SeqNo, UserValue, ValueType};
+use crate::{
+    value::{InternalValue, SeqNo, UserValue},
+    ValueType,
+};
 use crossbeam_skiplist::SkipMap;
 use std::ops::RangeBounds;
 use std::sync::atomic::AtomicU64;
@@ -150,7 +153,7 @@ impl Memtable {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::value::ValueType;
+    use crate::ValueType;
     use test_log::test;
 
     #[test]
diff --git a/src/mvcc_stream.rs b/src/mvcc_stream.rs
index 549b7295..37edfbda 100644
--- a/src/mvcc_stream.rs
+++ b/src/mvcc_stream.rs
@@ -86,7 +86,7 @@ impl>> DoubleEndedIte
 #[allow(clippy::string_lit_as_bytes)]
 mod tests {
     use super::*;
-    use crate::value::{InternalValue, ValueType};
+    use crate::{value::InternalValue, ValueType};
     use test_log::test;
 
     macro_rules!
stream { From 69d034283376d5d67e52f15cce4377aeb8f17841 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 00:59:46 +0200 Subject: [PATCH 488/613] fix deserialization of data block items --- src/segment/block/header.rs | 2 +- src/segment/data_block/mod.rs | 60 +++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 8be30b11..dce3ff9f 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -55,7 +55,7 @@ pub struct Header { pub checksum: Checksum, /// File offset of previous block - only used for data blocks - pub previous_block_offset: BlockOffset, // TODO: remove? + pub previous_block_offset: BlockOffset, // TODO: 3.0.0 remove? /// On-disk size of data segment pub data_length: u32, diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index c1166e72..1496de4d 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -43,10 +43,10 @@ impl Decodable for InternalValue { fn parse_full(reader: &mut Cursor<&[u8]>, offset: usize) -> Option { let value_type = unwrap!(reader.read_u8()); - if value_type == TRAILER_START_MARKER { return None; } + let value_type = ValueType::try_from(value_type).expect("should be valid value type"); let seqno = unwrap!(reader.read_u64_varint()); @@ -54,7 +54,9 @@ impl Decodable for InternalValue { let key_start = offset + reader.position() as usize; unwrap!(reader.seek_relative(key_len as i64)); - let val_len: usize = if value_type == u8::from(ValueType::Value) { + let is_value = !value_type.is_tombstone(); + + let val_len: usize = if is_value { unwrap!(reader.read_u32_varint()) as usize } else { 0 @@ -62,7 +64,7 @@ impl Decodable for InternalValue { let val_offset = offset + reader.position() as usize; unwrap!(reader.seek_relative(val_len as i64)); - Some(if value_type == u8::from(ValueType::Value) { + Some(if is_value { DataBlockParsedItem { value_type, seqno, @@ -87,10 +89,10 @@ impl Decodable for InternalValue { base_key_offset: usize, ) -> Option { let value_type = unwrap!(reader.read_u8()); - if value_type == TRAILER_START_MARKER { return None; } + let value_type = unwrap!(ValueType::try_from(value_type)); let seqno = unwrap!(reader.read_u64_varint()); @@ -101,7 +103,9 @@ impl Decodable for InternalValue { unwrap!(reader.seek_relative(rest_key_len as i64)); - let val_len: usize = if value_type == u8::from(ValueType::Value) { + let is_value = !value_type.is_tombstone(); + + let val_len: usize = if is_value { unwrap!(reader.read_u32_varint()) as usize } else { 0 @@ -109,7 +113,7 @@ impl Decodable for InternalValue { let val_offset = offset + reader.position() as usize; unwrap!(reader.seek_relative(val_len as i64)); - Some(if value_type == u8::from(ValueType::Value) { + Some(if is_value { DataBlockParsedItem { value_type, seqno, @@ -217,7 +221,7 @@ impl Encodable<()> for InternalValue { #[derive(Debug)] pub struct DataBlockParsedItem { - pub value_type: u8, + pub value_type: ValueType, pub seqno: SeqNo, pub prefix: Option, pub key: SliceIndexes, @@ -251,13 +255,7 @@ impl ParsedItem for DataBlockParsedItem { bytes.slice(self.key.0..self.key.1) }; - let key = InternalKey::new( - key, - self.seqno, - // NOTE: Value type is (or should be) checked when reading it - #[allow(clippy::expect_used)] - self.value_type.try_into().expect("should work"), - ); + let key = InternalKey::new(key, self.seqno, self.value_type); let value = self .value @@ -676,6 +674,40 @@ mod tests { Ok(()) } + 
#[test] + fn v3_data_block_vhandle() -> crate::Result<()> { + let items = [InternalValue::from_components( + "abc", + "world", + 1, + crate::ValueType::Indirection, + )]; + + for restart_interval in 1..=16 { + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; + let serialized_len = bytes.len(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Data, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + previous_block_offset: BlockOffset(0), + }, + }); + + assert_eq!(data_block.len(), items.len()); + assert_eq!(data_block.inner.size(), serialized_len); + + assert_eq!(Some(items[0].clone()), data_block.point_read(b"abc", 777)); + assert!(data_block.point_read(b"abc", 1).is_none()); + } + + Ok(()) + } + #[test] fn v3_data_block_mvcc_read_first() -> crate::Result<()> { let items = [InternalValue::from_components( From 135517011b0639165d553c8eee71cca825ca384d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 01:00:30 +0200 Subject: [PATCH 489/613] replace MaybeInlineValue with BlobIndirection --- src/blob_tree/gc.rs | 59 +++++------- src/blob_tree/handle.rs | 29 ++++++ src/blob_tree/mod.rs | 199 ++++++++++++++-------------------------- src/blob_tree/value.rs | 96 ------------------- 4 files changed, 120 insertions(+), 263 deletions(-) create mode 100644 src/blob_tree/handle.rs delete mode 100644 src/blob_tree/value.rs diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index df3ef31d..9850563e 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -3,10 +3,8 @@ // (found in the LICENSE-* files in the repository) use crate::{ - blob_tree::value::{MaybeInlineValue, TAG_INDIRECT}, - compaction::stream::ExpiredKvCallback, - vlog::BlobFileId, - BlobFile, + blob_tree::handle::BlobIndirection, coding::Decode, compaction::stream::ExpiredKvCallback, + vlog::BlobFileId, BlobFile, }; use std::collections::BTreeMap; @@ -116,36 +114,24 @@ impl crate::coding::Decode for FragmentationMap { impl ExpiredKvCallback for FragmentationMap { fn on_expired(&mut self, kv: &crate::InternalValue) { - if kv.key.is_tombstone() { - return; - } + if kv.key.value_type.is_indirection() { + let mut reader = &kv.value[..]; + + let vptr = + BlobIndirection::decode_from(&mut reader).expect("should parse BlobIndirection"); + + let size = u64::from(vptr.size); - let Some(tag) = kv.value.first().copied() else { - return; - }; - - if tag == TAG_INDIRECT { - let parsed_indirection = - MaybeInlineValue::from_slice(&kv.value).expect("should parse MaybeInlineValue"); - - match parsed_indirection { - MaybeInlineValue::Indirect { vhandle, size } => { - let size = u64::from(size); - - self.0 - .entry(vhandle.blob_file_id) - .and_modify(|counter| { - counter.len += 1; - counter.bytes += size; - }) - .or_insert_with(|| FragmentationEntry { - bytes: size, - len: 1, - }); - } - // NOTE: Unreachable because we check for the tag above - MaybeInlineValue::Inline(_) => unreachable!(), - } + self.0 + .entry(vptr.vhandle.blob_file_id) + .and_modify(|counter| { + counter.len += 1; + counter.bytes += size; + }) + .or_insert_with(|| FragmentationEntry { + bytes: size, + len: 1, + }); } } } @@ -157,8 +143,9 @@ mod tests { use crate::{ coding::{Decode, Encode}, compaction::stream::CompactionStream, - value::{InternalValue, ValueType}, + value::InternalValue, vlog::ValueHandle, + ValueType, }; use std::collections::HashMap; use test_log::test; @@ -196,14 +183,14 @@ mod tests { let vec = &[ InternalValue::from_components("a", 
b"abc", 1, ValueType::Value), - InternalValue::from_components("a", MaybeInlineValue::Indirect { + InternalValue::from_components("a", BlobIndirection { size: 1000, vhandle: ValueHandle { blob_file_id: 0, on_disk_size: 500, offset: 0, } - }.encode_into_vec(), 0, ValueType::Value), + }.encode_into_vec(), 0, ValueType::Indirection), ]; let mut my_watcher = FragmentationMap::default(); diff --git a/src/blob_tree/handle.rs b/src/blob_tree/handle.rs new file mode 100644 index 00000000..89c6b53a --- /dev/null +++ b/src/blob_tree/handle.rs @@ -0,0 +1,29 @@ +use crate::{ + coding::{Decode, Encode}, + vlog::ValueHandle, + DecodeError, EncodeError, +}; +use std::io::{Read, Write}; +use varint_rs::{VarintReader, VarintWriter}; + +#[derive(Copy, Clone)] +pub struct BlobIndirection { + pub(crate) vhandle: ValueHandle, + pub(crate) size: u32, +} + +impl Encode for BlobIndirection { + fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { + self.vhandle.encode_into(writer)?; + writer.write_u32_varint(self.size)?; + Ok(()) + } +} + +impl Decode for BlobIndirection { + fn decode_from(reader: &mut R) -> Result { + let vhandle = ValueHandle::decode_from(reader)?; + let size = reader.read_u32_varint()?; + Ok(Self { vhandle, size }) + } +} diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index ff3a8c0e..06bec891 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) mod gc; -pub mod value; +pub mod handle; #[doc(hidden)] pub use gc::{FragmentationEntry, FragmentationMap}; @@ -20,73 +20,64 @@ use crate::{ vlog::{Accessor, BlobFile, BlobFileId, BlobFileWriter, ValueHandle}, Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; +use handle::BlobIndirection; use std::{collections::BTreeMap, io::Cursor, ops::RangeBounds, path::PathBuf, sync::Arc}; -use value::MaybeInlineValue; pub struct Guard<'a> { blob_tree: &'a BlobTree, vlog: Arc>, - kv: crate::Result<(UserKey, UserValue)>, + kv: crate::Result, } impl IterGuard for Guard<'_> { fn key(self) -> crate::Result { - self.kv.map(|(k, _)| k) + self.kv.map(|kv| kv.key.user_key) } fn size(self) -> crate::Result { - use MaybeInlineValue::{Indirect, Inline}; - - let (_, value) = self.kv?; - let mut cursor = Cursor::new(value); - - Ok(match MaybeInlineValue::decode_from(&mut cursor)? { - // NOTE: We know LSM-tree values are 32 bits in length max - #[allow(clippy::cast_possible_truncation)] - Inline(bytes) => bytes.len() as u32, - - // NOTE: No need to resolve vHandle, because the size is already stored - Indirect { size, .. } => size, - }) + let mut cursor = Cursor::new(self.kv?.value); + Ok(BlobIndirection::decode_from(&mut cursor)?.size) } fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { - resolve_value_handle(self.blob_tree, &self.vlog, self.kv) + resolve_value_handle(self.blob_tree, &self.vlog, self.kv?) } } fn resolve_value_handle( tree: &BlobTree, vlog: &BTreeMap, - item: RangeItem, + item: InternalValue, ) -> RangeItem { - use MaybeInlineValue::{Indirect, Inline}; - - match item { - Ok((key, value)) => { - let mut cursor = Cursor::new(value); - - match MaybeInlineValue::decode_from(&mut cursor)? { - Inline(bytes) => Ok((key, bytes)), - Indirect { vhandle, .. 
} => { - // Resolve indirection using value log - match Accessor::new(vlog).get( - &tree.blobs_folder, - &key, - &vhandle, - &tree.index.config.cache, - &tree.index.config.descriptor_table, - ) { - Ok(Some(bytes)) => Ok((key, bytes)), - Err(e) => Err(e), - _ => { - panic!("value handle ({:?} => {vhandle:?}) did not match any blob - this is a bug", String::from_utf8_lossy(&key)) - } - } - } + if item.key.value_type.is_indirection() { + let mut cursor = Cursor::new(item.value); + let vptr = BlobIndirection::decode_from(&mut cursor)?; + + // Resolve indirection using value log + match Accessor::new(vlog).get( + &tree.blobs_folder, + &item.key.user_key, + &vptr.vhandle, + &tree.index.config.cache, + &tree.index.config.descriptor_table, + ) { + Ok(Some(v)) => { + let k = item.key.user_key; + Ok((k, v)) + } + Ok(None) => { + panic!( + "value handle ({:?} => {:?}) did not match any blob - this is a bug", + String::from_utf8_lossy(&item.key.user_key), + vptr.vhandle, + ) } + Err(e) => Err(e), } - Err(e) => Err(e), + } else { + let k = item.key.user_key; + let v = item.value; + Ok((k, v)) } } @@ -133,16 +124,6 @@ impl BlobTree { }) } - fn get_vhandle(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { - let Some(item) = self.index.get(key, seqno)? else { - return Ok(None); - }; - - let item = MaybeInlineValue::from_slice(&item)?; - - Ok(Some(item)) - } - /// Consumes a [`BlobFileWriter`], returning a `BlobFile` handle. /// /// # Note @@ -249,6 +230,10 @@ impl AbstractTree for BlobTree { seqno: SeqNo, index: Option>, ) -> Box> + '_> { + use crate::range::prefix_to_range; + + let range = prefix_to_range(prefix.as_ref()); + let version = self .index .manifest @@ -259,7 +244,7 @@ impl AbstractTree for BlobTree { Box::new( self.index - .create_prefix(&prefix, seqno, index) + .create_internal_range(&range, seqno, index) .map(move |kv| { IterGuardImpl::Blob(Guard { blob_tree: self, @@ -287,7 +272,7 @@ impl AbstractTree for BlobTree { // TODO: PERF: ugly Arc clone Box::new( self.index - .create_range(&range, seqno, index) + .create_internal_range(&range, seqno, index) .map(move |kv| { IterGuardImpl::Blob(Guard { blob_tree: self, @@ -406,16 +391,20 @@ impl AbstractTree for BlobTree { // NOTE: We skip reading from the value log // because the vHandles already store the value size fn size_of>(&self, key: K, seqno: SeqNo) -> crate::Result> { - let vhandle = self.get_vhandle(key.as_ref(), seqno)?; + let Some(item) = self.index.get_internal_entry(key.as_ref(), seqno)? else { + return Ok(None); + }; - Ok(vhandle.map(|x| match x { + Ok(Some(if item.key.value_type.is_indirection() { + let mut cursor = Cursor::new(item.value); + let vptr = BlobIndirection::decode_from(&mut cursor)?; + vptr.size + } else { // NOTE: Values are u32 length max #[allow(clippy::cast_possible_truncation)] - MaybeInlineValue::Inline(v) => v.len() as u32, - - // NOTE: We skip reading from the value log - // because the indirections already store the value size - MaybeInlineValue::Indirect { size, .. 
} => size, + { + item.value.len() as u32 + } })) } @@ -452,7 +441,7 @@ impl AbstractTree for BlobTree { eviction_seqno: SeqNo, ) -> crate::Result, Option)>> { use crate::{file::SEGMENTS_FOLDER, segment::Writer as SegmentWriter}; - use value::MaybeInlineValue; + // use value::MaybeInlineValue; let lsm_segment_folder = self.index.config.path.join(SEGMENTS_FOLDER); @@ -497,24 +486,7 @@ impl AbstractTree for BlobTree { continue; } - let mut cursor = Cursor::new(item.value); - - let value = MaybeInlineValue::decode_from(&mut cursor)?; - let value = match value { - MaybeInlineValue::Inline(value) => value, - indirection @ MaybeInlineValue::Indirect { .. } => { - // NOTE: This is a previous indirection, just write it to index tree - // without writing the blob again - - let mut serialized_indirection = vec![]; - indirection.encode_into(&mut serialized_indirection)?; - - segment_writer - .write(InternalValue::new(item.key.clone(), serialized_indirection))?; - - continue; - } - }; + let value = item.value; // NOTE: Values are 32-bit max #[allow(clippy::cast_possible_truncation)] @@ -525,7 +497,7 @@ impl AbstractTree for BlobTree { let blob_file_id = blob_writer.blob_file_id(); let on_disk_size = blob_writer.write(&item.key.user_key, value)?; - let indirection = MaybeInlineValue::Indirect { + let indirection = BlobIndirection { vhandle: ValueHandle { blob_file_id, offset, @@ -533,19 +505,17 @@ impl AbstractTree for BlobTree { }, size: value_size, }; - // TODO: use Slice::with_size - let mut serialized_indirection = vec![]; - indirection.encode_into(&mut serialized_indirection)?; - segment_writer - .write(InternalValue::new(item.key.clone(), serialized_indirection))?; + segment_writer.write({ + let mut vptr = + InternalValue::new(item.key.clone(), indirection.encode_into_vec()); + vptr.key.value_type = crate::ValueType::Indirection; + vptr + })?; blob_bytes_referenced += u64::from(value_size); } else { - // TODO: use Slice::with_size - let direct = MaybeInlineValue::Inline(value); - let serialized_direct = direct.encode_into_vec(); - segment_writer.write(InternalValue::new(item.key, serialized_direct))?; + segment_writer.write(InternalValue::new(item.key, value))?; } } @@ -673,56 +643,23 @@ impl AbstractTree for BlobTree { value: V, seqno: SeqNo, ) -> (u64, u64) { - use value::MaybeInlineValue; - - // TODO: let's store a struct in memtables instead - // TODO: that stores slice + is_user_value - // TODO: then we can avoid alloc + memcpy here - // TODO: benchmark for very large values - - // NOTE: Initially, we always write an inline value - // On memtable flush, depending on the values' sizes, they will be separated - // into inline or indirect values - let item = MaybeInlineValue::Inline(value.into()); - - let value = item.encode_into_vec(); - - self.index.insert(key, value, seqno) + self.index.insert(key, value.into(), seqno) } fn get>(&self, key: K, seqno: SeqNo) -> crate::Result> { - use value::MaybeInlineValue::{Indirect, Inline}; - let key = key.as_ref(); // TODO: refactor memtable, sealed memtables, manifest lock to be a single lock (SuperVersion kind of) // TODO: then, try to reduce the lock access to 1, because we are accessing it twice (index.get, and then vhandle resolving...) - let Some(value) = self.get_vhandle(key, seqno)? else { + let Some(item) = self.index.get_internal_entry(key, seqno)? else { return Ok(None); }; - match value { - Inline(bytes) => Ok(Some(bytes)), - Indirect { vhandle, .. 
} => { - let lock = self.index.manifest.read().expect("lock is poisoned"); - let vlog = crate::vlog::Accessor::new(&lock.current_version().value_log); - - // Resolve indirection using value log - match vlog.get( - &self.blobs_folder, - key, - &vhandle, - &self.index.config.cache, - &self.index.config.descriptor_table, - )? { - Some(v) => Ok(Some(v)), - None => { - panic!("value handle ({key:?} => {vhandle:?}) did not match any blob - this is a bug") - } - } - } - } + let lock = self.index.manifest.read().expect("lock is poisoned"); + let version = lock.current_version(); + let (_, v) = resolve_value_handle(self, &version.value_log, item)?; + Ok(Some(v)) } fn remove>(&self, key: K, seqno: SeqNo) -> (u64, u64) { diff --git a/src/blob_tree/value.rs b/src/blob_tree/value.rs deleted file mode 100644 index 93fe6c41..00000000 --- a/src/blob_tree/value.rs +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::coding::{Decode, DecodeError, Encode, EncodeError}; -use crate::vlog::ValueHandle; -use crate::{Slice, UserValue}; -use byteorder::{ReadBytesExt, WriteBytesExt}; -use std::io::{Cursor, Read, Write}; -use varint_rs::{VarintReader, VarintWriter}; - -/// A value which may or may not be inlined into an index tree -/// -/// If not inlined, the value is present in the value log, so it needs -/// to be fetched using the given value handle. -#[derive(Debug)] -#[allow(clippy::module_name_repetitions)] -pub enum MaybeInlineValue { - /// Inlined value (classic LSM-tree) - Inline(UserValue), - - /// The value is a handle (pointer) into the value log - Indirect { vhandle: ValueHandle, size: u32 }, -} - -const TAG_INLINE: u8 = 0; -pub const TAG_INDIRECT: u8 = 1; - -impl MaybeInlineValue { - pub fn from_slice(bytes: &Slice) -> Result { - let mut cursor = Cursor::new(&**bytes); - - match cursor.read_u8()? { - TAG_INLINE => { - // NOTE: Truncation is OK because we are only at the first couple - // of bytes of the slice - #[allow(clippy::cast_possible_truncation)] - let size_len = { - let pos_before = cursor.position() as usize; - let _ = cursor.read_u32_varint()?; - let pos_after = cursor.position() as usize; - pos_after - pos_before - }; - let slice = bytes.slice((1 + size_len)..); - Ok(Self::Inline(slice)) - } - TAG_INDIRECT => { - let mut reader = &**bytes; - Self::decode_from(&mut reader) - } - x => Err(DecodeError::InvalidTag(("MaybeInlineValue", x))), - } - } -} - -impl Encode for MaybeInlineValue { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - match self { - Self::Inline(bytes) => { - writer.write_u8(TAG_INLINE)?; - - // NOTE: Values can be up to 2^32 bytes - #[allow(clippy::cast_possible_truncation)] - writer.write_u32_varint(bytes.len() as u32)?; - - writer.write_all(bytes)?; - } - Self::Indirect { vhandle, size } => { - writer.write_u8(TAG_INDIRECT)?; - vhandle.encode_into(writer)?; - writer.write_u32_varint(*size)?; - } - } - Ok(()) - } -} - -impl Decode for MaybeInlineValue { - fn decode_from(reader: &mut R) -> Result { - let tag = reader.read_u8()?; - - match tag { - TAG_INLINE => { - let len = reader.read_u32_varint()? 
as usize; - let slice = UserValue::from_reader(reader, len)?; - Ok(Self::Inline(slice)) - } - TAG_INDIRECT => { - let vhandle = ValueHandle::decode_from(reader)?; - let size = reader.read_u32_varint()?; - Ok(Self::Indirect { vhandle, size }) - } - x => Err(DecodeError::InvalidTag(("MaybeInlineValue", x))), - } - } -} From f4afd98d536b9f6ca96bf66f0317ae1d1ca815a6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 02:22:59 +0200 Subject: [PATCH 490/613] refactor: blob writer API --- src/blob_tree/mod.rs | 70 +----------------- src/vlog/blob_file/merge.rs | 4 +- src/vlog/blob_file/mod.rs | 42 +---------- src/vlog/blob_file/multi_writer.rs | 115 +++++++++++++++++++++-------- src/vlog/blob_file/scanner.rs | 2 +- src/vlog/blob_file/writer.rs | 48 +++++------- src/vlog/mod.rs | 20 +++-- 7 files changed, 120 insertions(+), 181 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 06bec891..512eff2f 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -124,74 +124,6 @@ impl BlobTree { }) } - /// Consumes a [`BlobFileWriter`], returning a `BlobFile` handle. - /// - /// # Note - /// - /// The blob file is **not** added to the value log immediately. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - fn consume_blob_file_writer(writer: BlobFileWriter) -> crate::Result> { - use crate::vlog::blob_file::{Inner as BlobFileInner, Metadata}; - - let writers = writer.finish()?; - - let mut blob_files = Vec::with_capacity(writers.len()); - - for writer in writers { - if writer.item_count == 0 { - log::debug!( - "Blob file writer at {} has written no data, deleting empty blob file", - writer.path.display(), - ); - if let Err(e) = std::fs::remove_file(&writer.path) { - log::warn!( - "Could not delete empty blob file at {}: {e:?}", - writer.path.display(), - ); - } - continue; - } - - let blob_file_id = writer.blob_file_id; - - blob_files.push(BlobFile(Arc::new(BlobFileInner { - id: blob_file_id, - path: writer.path, - meta: Metadata { - item_count: writer.item_count, - compressed_bytes: writer.written_blob_bytes, - total_uncompressed_bytes: writer.uncompressed_bytes, - - // NOTE: We are checking for 0 items above - // so first and last key need to exist - #[allow(clippy::expect_used)] - key_range: crate::KeyRange::new(( - writer - .first_key - .clone() - .expect("should have written at least 1 item"), - writer - .last_key - .clone() - .expect("should have written at least 1 item"), - )), - }, - // gc_stats: GcStats::default(), - }))); - - log::debug!( - "Created blob file #{blob_file_id:?} ({} items, {} userdata bytes)", - writer.item_count, - writer.uncompressed_bytes, - ); - } - - Ok(blob_files) - } - #[doc(hidden)] pub fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result> { let Some((segment_id, yanked_memtable)) = self.index.rotate_memtable() else { @@ -520,7 +452,7 @@ impl AbstractTree for BlobTree { } log::trace!("Creating blob file"); - let blob_files = Self::consume_blob_file_writer(blob_writer)?; + let blob_files = blob_writer.finish()?; assert!(blob_files.len() <= 1); let blob_file = blob_files.into_iter().next(); diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs index 666d2e5b..4f20f4cd 100644 --- a/src/vlog/blob_file/merge.rs +++ b/src/vlog/blob_file/merge.rs @@ -138,7 +138,7 @@ mod tests { writer.write(key, &key.repeat(100))?; } - writer.flush()?; + writer.finish()?; } } @@ -152,7 +152,7 @@ mod tests { writer.write(key, &key.repeat(100))?; } - writer.flush()?; + writer.finish()?; } } diff 
--git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 56cc15e3..cf96ec20 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -7,7 +7,7 @@ pub mod meta; pub mod multi_writer; pub mod reader; pub mod scanner; -pub mod trailer; +// pub mod trailer; pub mod writer; use crate::vlog::BlobFileId; @@ -26,9 +26,6 @@ pub(crate) struct Inner { /// Statistics pub meta: Metadata, - // /// Runtime stats for garbage collection - // pub gc_stats: GcStats, - // TODO: is_deleted, on Drop, like SST segments } @@ -57,47 +54,10 @@ impl BlobFile { self.0.id } - // /// Returns a scanner that can iterate through the blob file. - // /// - // /// # Errors - // /// - // /// Will return `Err` if an IO error occurs. - // pub fn scan(&self) -> crate::Result { - // reader::Reader::new(&self.0.path, self.id()) - // } - /// Returns the number of items in the blob file. #[must_use] #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> u64 { self.0.meta.item_count } - - // /// Marks the blob file as fully stale. - // pub(crate) fn mark_as_stale(&self) { - // self.0.gc_stats.set_stale_items(self.0.meta.item_count); - - // self.0 - // .gc_stats - // .set_stale_bytes(self.0.meta.total_uncompressed_bytes); - // } - - // Returns `true` if the blob file is fully stale. - // #[must_use] - // pub fn is_stale(&self) -> bool { - // self.0.gc_stats.stale_items() == self.0.meta.item_count - // } - - // /// Returns the percent of dead items in the blob file. - // // NOTE: Precision is not important here - // #[allow(clippy::cast_precision_loss)] - // #[must_use] - // pub fn stale_ratio(&self) -> f32 { - // let dead = self.0.gc_stats.stale_items() as f32; - // if dead == 0.0 { - // return 0.0; - // } - - // dead / self.0.meta.item_count as f32 - // } } diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index 81773673..b6e1c714 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -3,15 +3,26 @@ // (found in the LICENSE-* files in the repository) use super::writer::Writer; -use crate::{vlog::BlobFileId, CompressionType, SegmentId, SequenceNumberCounter}; -use std::path::{Path, PathBuf}; +use crate::{ + vlog::{ + blob_file::{Inner as BlobFileInner, Metadata}, + BlobFileId, + }, + BlobFile, CompressionType, SegmentId, SequenceNumberCounter, +}; +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; /// Blob file writer, may write multiple blob files pub struct MultiWriter { folder: PathBuf, target_size: u64, - writers: Vec, + active_writer: Writer, + + results: Vec, id_generator: SequenceNumberCounter, @@ -31,7 +42,7 @@ impl MultiWriter { id_generator: SequenceNumberCounter, target_size: u64, folder: P, - ) -> std::io::Result { + ) -> crate::Result { let folder = folder.as_ref(); let blob_file_id = id_generator.next(); @@ -42,7 +53,9 @@ impl MultiWriter { folder: folder.into(), target_size, - writers: vec![Writer::new(blob_file_path, blob_file_id)?], + active_writer: Writer::new(blob_file_path, blob_file_id)?, + + results: Vec::new(), compression: CompressionType::None, @@ -66,32 +79,18 @@ impl MultiWriter { #[doc(hidden)] pub fn use_compression(mut self, compression: CompressionType) -> Self { self.compression.clone_from(&compression); - self.get_active_writer_mut().compression = compression; + self.active_writer.compression = compression; self } - #[doc(hidden)] - #[must_use] - pub fn get_active_writer(&self) -> &Writer { - // NOTE: initialized in constructor - #[allow(clippy::expect_used)] - self.writers.last().expect("should 
exist") - } - - fn get_active_writer_mut(&mut self) -> &mut Writer { - // NOTE: initialized in constructor - #[allow(clippy::expect_used)] - self.writers.last_mut().expect("should exist") - } - #[must_use] pub fn offset(&self) -> u64 { - self.get_active_writer().offset() + self.active_writer.offset() } #[must_use] pub fn blob_file_id(&self) -> BlobFileId { - self.get_active_writer().blob_file_id() + self.active_writer.blob_file_id() } /// Sets up a new writer for the next blob file. @@ -104,11 +103,67 @@ impl MultiWriter { let new_writer = Writer::new(blob_file_path, new_blob_file_id)?.use_compression(self.compression); - self.writers.push(new_writer); + let old_writer = std::mem::replace(&mut self.active_writer, new_writer); + let blob_file = Self::consume_writer(old_writer)?; + self.results.extend(blob_file); Ok(()) } + fn consume_writer(writer: Writer) -> crate::Result> { + if writer.item_count > 0 { + let blob_file_id = writer.blob_file_id; + + log::debug!( + "Created blob file #{blob_file_id:?} ({} items, {} userdata bytes)", + writer.item_count, + writer.uncompressed_bytes, + ); + + let blob_file = BlobFile(Arc::new(BlobFileInner { + id: blob_file_id, + path: writer.path.clone(), + meta: Metadata { + item_count: writer.item_count, + compressed_bytes: writer.written_blob_bytes, + total_uncompressed_bytes: writer.uncompressed_bytes, + + // NOTE: We are checking for 0 items above + // so first and last key need to exist + #[allow(clippy::expect_used)] + key_range: crate::KeyRange::new(( + writer + .first_key + .clone() + .expect("should have written at least 1 item"), + writer + .last_key + .clone() + .expect("should have written at least 1 item"), + )), + }, + })); + + writer.finish()?; + + Ok(Some(blob_file)) + } else { + log::debug!( + "Blob file writer at {} has written no data, deleting empty blob file", + writer.path.display(), + ); + + if let Err(e) = std::fs::remove_file(&writer.path) { + log::warn!( + "Could not delete empty blob file at {}: {e:?}", + writer.path.display(), + ); + } + + Ok(None) + } + } + /// Writes an item. 
/// /// # Errors @@ -125,25 +180,23 @@ impl MultiWriter { let target_size = self.target_size; // Write actual value into blob file - let writer = self.get_active_writer_mut(); + let writer = &mut self.active_writer; let bytes_written = writer.write(key, value)?; // Check for blob file size target, maybe rotate to next writer if writer.offset() >= target_size { - writer.flush()?; self.rotate()?; } Ok(bytes_written) } - pub(crate) fn finish(mut self) -> crate::Result> { - let writer = self.get_active_writer_mut(); - - if writer.item_count > 0 { - writer.flush()?; + pub(crate) fn finish(mut self) -> crate::Result> { + if self.active_writer.item_count > 0 { + let blob_file = Self::consume_writer(self.active_writer)?; + self.results.extend(blob_file); } - Ok(self.writers) + Ok(self.results) } } diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index ddbda499..47fec933 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -133,7 +133,7 @@ mod tests { writer.write(key, &key.repeat(100))?; } - writer.flush()?; + writer.finish()?; } { diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index 352045b7..34b81b72 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -2,12 +2,11 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{meta::Metadata, trailer::Trailer}; +use super::meta::Metadata; use crate::{coding::Encode, vlog::BlobFileId, CompressionType, KeyRange, UserKey}; use byteorder::{BigEndian, WriteBytesExt}; use std::{ - fs::File, - io::{BufWriter, Seek, Write}, + io::Write, path::{Path, PathBuf}, }; @@ -24,8 +23,7 @@ pub struct Writer { pub path: PathBuf, pub(crate) blob_file_id: BlobFileId, - #[allow(clippy::struct_field_names)] - active_writer: BufWriter, + writer: sfa::Writer, offset: u64, @@ -46,16 +44,16 @@ impl Writer { /// /// Will return `Err` if an IO error occurs. #[doc(hidden)] - pub fn new>(path: P, blob_file_id: BlobFileId) -> std::io::Result { + pub fn new>(path: P, blob_file_id: BlobFileId) -> crate::Result { let path = path.as_ref(); - - let file = File::create(path)?; + let mut writer = sfa::Writer::new_at_path(path)?; + writer.start("data")?; Ok(Self { path: path.into(), blob_file_id, - active_writer: BufWriter::new(file), + writer, offset: 0, item_count: 0, @@ -123,7 +121,7 @@ impl Writer { // [...val; ?] 
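        // Editor's note: `write` emits one self-contained record (header
        // fields, then key, then value) and returns the *uncompressed* value
        // length, while `self.offset` advances by the full on-disk record
        // size, so `offset()` always points at the start of the next record.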
// Write header - self.active_writer.write_all(BLOB_HEADER_MAGIC)?; + self.writer.write_all(BLOB_HEADER_MAGIC)?; let mut hasher = xxhash_rust::xxh3::Xxh3::new(); hasher.update(key); @@ -131,17 +129,15 @@ impl Writer { let checksum = hasher.digest128(); // Write checksum - self.active_writer.write_u128::(checksum)?; + self.writer.write_u128::(checksum)?; // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.active_writer - .write_u16::(key.len() as u16)?; + self.writer.write_u16::(key.len() as u16)?; // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.active_writer - .write_u32::(value.len() as u32)?; + self.writer.write_u32::(value.len() as u32)?; // TODO: finish compression #[warn(clippy::match_single_binding)] @@ -151,11 +147,10 @@ impl Writer { // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.active_writer - .write_u32::(value.len() as u32)?; + self.writer.write_u32::(value.len() as u32)?; - self.active_writer.write_all(key)?; - self.active_writer.write_all(value)?; + self.writer.write_all(key)?; + self.writer.write_all(value)?; // Update offset self.offset += BLOB_HEADER_MAGIC.len() as u64; @@ -177,8 +172,8 @@ impl Writer { Ok(value.len() as u32) } - pub(crate) fn flush(&mut self) -> crate::Result<()> { - let metadata_ptr = self.active_writer.stream_position()?; + pub(crate) fn finish(mut self) -> crate::Result<()> { + self.writer.start("meta"); // Write metadata let metadata = Metadata { @@ -194,16 +189,9 @@ impl Writer { .expect("should have written at least 1 item"), )), }; - metadata.encode_into(&mut self.active_writer)?; - - Trailer { - metadata, - metadata_ptr, - } - .encode_into(&mut self.active_writer)?; + metadata.encode_into(&mut self.writer)?; - self.active_writer.flush()?; - self.active_writer.get_mut().sync_all()?; + self.writer.finish()?; Ok(()) } diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 4d1f3e7e..4feea768 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -11,7 +11,10 @@ pub use { blob_file::BlobFile, handle::ValueHandle, }; -use crate::vlog::blob_file::{trailer::Trailer, Inner as BlobFileInner}; +use crate::{ + coding::Decode, + vlog::blob_file::{Inner as BlobFileInner, Metadata}, +}; use std::{path::Path, sync::Arc}; pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result> { @@ -34,13 +37,16 @@ pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result Date: Mon, 29 Sep 2025 02:26:55 +0200 Subject: [PATCH 491/613] add checksum type to blob file meta --- src/vlog/blob_file/meta.rs | 8 ++++ src/vlog/blob_file/mod.rs | 2 - src/vlog/blob_file/trailer.rs | 75 ----------------------------------- 3 files changed, 8 insertions(+), 77 deletions(-) delete mode 100644 src/vlog/blob_file/trailer.rs diff --git a/src/vlog/blob_file/meta.rs b/src/vlog/blob_file/meta.rs index 8ac2f883..82950dbf 100644 --- a/src/vlog/blob_file/meta.rs +++ b/src/vlog/blob_file/meta.rs @@ -31,6 +31,9 @@ impl Encode for Metadata { // Write header writer.write_all(METADATA_HEADER_MAGIC)?; + // Checksum type (always 0x0 = XXH3) + writer.write_u8(0x0)?; + writer.write_u64::(self.item_count)?; writer.write_u64::(self.compressed_bytes)?; writer.write_u64::(self.total_uncompressed_bytes)?; @@ -51,6 +54,11 @@ impl Decode for Metadata { return Err(DecodeError::InvalidHeader("BlobFileMeta")); } + let checksum_type = reader.read_u8()?; + if checksum_type != 0x0 { + return Err(DecodeError::InvalidTag(("BlobFileChecksum", 
checksum_type))); + } + let item_count = reader.read_u64::()?; let compressed_bytes = reader.read_u64::()?; let total_uncompressed_bytes = reader.read_u64::()?; diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index cf96ec20..7b8e6ab4 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -7,11 +7,9 @@ pub mod meta; pub mod multi_writer; pub mod reader; pub mod scanner; -// pub mod trailer; pub mod writer; use crate::vlog::BlobFileId; -// pub use gc_stats::GcStats; pub use meta::Metadata; use std::{path::PathBuf, sync::Arc}; diff --git a/src/vlog/blob_file/trailer.rs b/src/vlog/blob_file/trailer.rs deleted file mode 100644 index 9d3af34d..00000000 --- a/src/vlog/blob_file/trailer.rs +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::meta::Metadata; -use crate::coding::{Decode, DecodeError, Encode, EncodeError}; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use std::{ - fs::File, - io::{BufReader, Read, Seek, Write}, - path::Path, -}; - -pub const TRAILER_MAGIC: &[u8] = &[b'V', b'L', b'O', b'G', b'T', b'R', b'L', 1]; -pub const TRAILER_SIZE: usize = 256; - -#[derive(Debug)] -#[allow(clippy::module_name_repetitions)] -pub struct Trailer { - pub metadata: Metadata, - pub metadata_ptr: u64, -} - -impl Trailer { - pub fn from_file>(path: P) -> crate::Result { - let file = File::open(path)?; - let mut reader = BufReader::new(file); - reader.seek(std::io::SeekFrom::End(-(TRAILER_SIZE as i64)))?; - - // Get metadata ptr - let metadata_ptr = reader.read_u64::()?; - - // IMPORTANT: Subtract sizeof(meta_ptr) ------v - let remaining_padding = TRAILER_SIZE - std::mem::size_of::() - TRAILER_MAGIC.len(); - reader.seek_relative(remaining_padding as i64)?; - - // Check trailer magic - let mut magic = [0u8; TRAILER_MAGIC.len()]; - reader.read_exact(&mut magic)?; - - if magic != TRAILER_MAGIC { - return Err(crate::Error::Decode(DecodeError::InvalidHeader( - "BlobFileTrailer", - ))); - } - - // Jump to metadata and parse - reader.seek(std::io::SeekFrom::Start(metadata_ptr))?; - let metadata = Metadata::decode_from(&mut reader)?; - - Ok(Self { - metadata, - metadata_ptr, - }) - } -} - -impl Encode for Trailer { - fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { - let mut v = Vec::with_capacity(TRAILER_SIZE); - - v.write_u64::(self.metadata_ptr)?; - - // Pad with remaining bytes - v.resize(TRAILER_SIZE - TRAILER_MAGIC.len(), 0); - - v.write_all(TRAILER_MAGIC)?; - - assert_eq!(v.len(), TRAILER_SIZE, "blob file trailer has invalid size"); - - writer.write_all(&v)?; - - Ok(()) - } -} From 4ef3affd705bdb1e5e39574ad2dfc33f046a179a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 02:31:54 +0200 Subject: [PATCH 492/613] wip --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3629bd7d..0f65ab8d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ fuzz*/**/out* microbench/**/data.jsonl microbench/**/*.svg - +old From 1f1b4cfc7ffb0f90d2fd644be6b17ec4b31d01af Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 02:35:12 +0200 Subject: [PATCH 493/613] refactor --- src/compaction/drop_range.rs | 45 ++++-------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs index 9aac0219..7374804a 100644 --- 
a/src/compaction/drop_range.rs +++ b/src/compaction/drop_range.rs @@ -33,38 +33,6 @@ impl RangeBounds for OwnedBounds { } } -struct ContainedSegments<'a> { - segments: &'a [Segment], - bounds: &'a OwnedBounds, - pos: usize, -} - -impl<'a> ContainedSegments<'a> { - fn new(segments: &'a [Segment], bounds: &'a OwnedBounds) -> Self { - Self { - segments, - bounds, - pos: 0, - } - } -} - -impl<'a> Iterator for ContainedSegments<'a> { - type Item = &'a Segment; - - fn next(&mut self) -> Option { - while let Some(segment) = self.segments.get(self.pos) { - self.pos += 1; - - if self.bounds.contains(segment.key_range()) { - return Some(segment); - } - } - - None - } -} - impl OwnedBounds { #[must_use] pub fn contains(&self, range: &KeyRange) -> bool { @@ -93,10 +61,6 @@ pub struct Strategy { impl Strategy { /// Configures a new `DropRange` compaction strategy. - /// - /// # Panics - /// - /// Panics, if `target_size` is below 1024 bytes. #[must_use] #[allow(dead_code)] pub fn new(bounds: OwnedBounds) -> Self { @@ -115,12 +79,11 @@ impl CompactionStrategy for Strategy { .iter_levels() .flat_map(|lvl| lvl.iter()) .flat_map(|run| { - let slice = run - .range_overlap_indexes(&self.bounds) + run.range_overlap_indexes(&self.bounds) .and_then(|(lo, hi)| run.get(lo..=hi)) - .unwrap_or(&[]); - - ContainedSegments::new(slice, &self.bounds) + .unwrap_or_default() + .iter() + .filter(|x| self.bounds.contains(x.key_range())) }) .map(Segment::id) .collect(); From 47a8351325d83e1c1326597ce410ba024ba05d6c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 16:58:19 +0200 Subject: [PATCH 494/613] adjust blob fragmentation map after table drop --- Cargo.toml | 2 +- src/abstract.rs | 2 +- src/blob_tree/gc.rs | 6 ++++ src/blob_tree/mod.rs | 17 +++++++---- src/segment/mod.rs | 2 ++ src/segment/writer/mod.rs | 5 +++- src/tree/mod.rs | 6 ++-- src/version/mod.rs | 39 ++++++++++++++++++++---- src/version/run.rs | 4 +++ src/vlog/blob_file/writer.rs | 3 +- tests/blob_drop_range_gc_stats.rs | 50 +++++++++++++++++++++++++++++++ tests/tree_sealed_shadowing.rs | 2 +- tests/tree_seqno.rs | 2 +- 13 files changed, 120 insertions(+), 20 deletions(-) create mode 100644 tests/blob_drop_range_gc_stats.rs diff --git a/Cargo.toml b/Cargo.toml index a3b8180f..4629144e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ description = "A K.I.S.S. implementation of log-structured merge trees (LSM-tree license = "MIT OR Apache-2.0" version = "3.0.0-pre.0" edition = "2021" -rust-version = "1.82.0" +rust-version = "1.87.0" readme = "README.md" include = ["src/**/*", "LICENSE-APACHE", "LICENSE-MIT", "README.md"] repository = "https://github.com/fjall-rs/lsm-tree" diff --git a/src/abstract.rs b/src/abstract.rs index 07373808..6bda4ee7 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -136,7 +136,7 @@ pub trait AbstractTree { segment_id: SegmentId, // TODO: remove? memtable: &Arc, seqno_threshold: SeqNo, - ) -> crate::Result, Option)>>; + ) -> crate::Result)>>; /// Atomically registers flushed disk segments into the tree, removing their associated sealed memtables. 
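    ///
    /// A sketch of the flush-then-register flow (editor's illustration,
    /// mirroring the updated tests):
    ///
    /// ```ignore
    /// let (id, memtable) = tree.rotate_memtable().unwrap();
    /// let (segment, _blob_file) = tree.flush_memtable(id, &memtable, 0)?.unwrap();
    /// tree.register_segments(&[segment], None, None, 0)?;
    /// ```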
/// diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index 9850563e..8f4748cd 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -35,6 +35,12 @@ impl std::ops::Deref for FragmentationMap { } } +impl std::ops::DerefMut for FragmentationMap { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl FragmentationMap { #[must_use] pub fn stale_bytes(&self) -> u64 { diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index a25dd97f..acbb13ab 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -130,7 +130,7 @@ impl BlobTree { return Ok(None); }; - let Some((segment, blob_file, frag_map)) = + let Some((segment, blob_file)) = self.flush_memtable(segment_id, &yanked_memtable, eviction_seqno)? else { return Ok(None); @@ -138,7 +138,7 @@ impl BlobTree { self.register_segments( std::slice::from_ref(&segment), blob_file.as_ref().map(std::slice::from_ref), - frag_map, + None, eviction_seqno, )?; @@ -371,9 +371,8 @@ impl AbstractTree for BlobTree { segment_id: SegmentId, memtable: &Arc, eviction_seqno: SeqNo, - ) -> crate::Result, Option)>> { + ) -> crate::Result)>> { use crate::{file::SEGMENTS_FOLDER, segment::Writer as SegmentWriter}; - // use value::MaybeInlineValue; let lsm_segment_folder = self.index.config.path.join(SEGMENTS_FOLDER); @@ -407,6 +406,7 @@ impl AbstractTree for BlobTree { let compaction_filter = CompactionStream::new(iter, eviction_seqno); let mut blob_bytes_referenced = 0; + let mut blobs_referenced_count = 0; for item in compaction_filter { let item = item?; @@ -446,6 +446,7 @@ impl AbstractTree for BlobTree { })?; blob_bytes_referenced += u64::from(value_size); + blobs_referenced_count += 1; } else { segment_writer.write(InternalValue::new(item.key, value))?; } @@ -460,13 +461,17 @@ impl AbstractTree for BlobTree { if blob_bytes_referenced > 0 { if let Some(blob_file) = &blob_file { - segment_writer.link_blob_file(blob_file.id(), blob_bytes_referenced); + segment_writer.link_blob_file( + blob_file.id(), + blob_bytes_referenced, + blobs_referenced_count, + ); } } let segment = self.index.consume_writer(segment_writer)?; - Ok(segment.map(|segment| (segment, blob_file, None))) + Ok(segment.map(|segment| (segment, blob_file))) } fn register_segments( diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 8f6ec983..c41c4195 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -118,10 +118,12 @@ impl Segment { for _ in 0..len { let blob_file_id = reader.read_u64::()?; let bytes = reader.read_u64::()?; + let len = reader.read_u64::()?; blob_files.push(LinkedFile { blob_file_id, bytes, + len: len as usize, }); } diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index d1da56c7..907f518d 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -15,6 +15,7 @@ use std::{fs::File, io::BufWriter, path::PathBuf}; pub struct LinkedFile { pub(crate) blob_file_id: BlobFileId, pub(crate) bytes: u64, + pub(crate) len: usize, } /// Serializes and compresses values into blocks and writes them to disk as segment @@ -115,10 +116,11 @@ impl Writer { }) } - pub fn link_blob_file(&mut self, blob_file_id: BlobFileId, bytes: u64) { + pub fn link_blob_file(&mut self, blob_file_id: BlobFileId, bytes: u64, len: usize) { self.linked_blob_files.push(LinkedFile { blob_file_id, bytes, + len, }); } @@ -368,6 +370,7 @@ impl Writer { for file in self.linked_blob_files { self.block_writer.write_u64::(file.blob_file_id)?; self.block_writer.write_u64::(file.bytes)?; + self.block_writer.write_u64::(file.len as u64)?; } } diff 
--git a/src/tree/mod.rs b/src/tree/mod.rs index cc7f5fda..b8e71fea 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -281,7 +281,7 @@ impl AbstractTree for Tree { segment_id: SegmentId, memtable: &Arc<Memtable>, seqno_threshold: SeqNo, - ) -> crate::Result<Option<(Segment, Option<BlobFile>, Option<FragmentationMap>)>> { + ) -> crate::Result<Option<(Segment, Option<BlobFile>)>> { use crate::{compaction::stream::CompactionStream, file::SEGMENTS_FOLDER, segment::Writer}; use std::time::Instant; @@ -335,7 +335,7 @@ impl AbstractTree for Tree { log::debug!("Flushed memtable {segment_id:?} in {:?}", start.elapsed()); - Ok(result.map(|segment| (segment, None, None))) + Ok(result.map(|segment| (segment, None))) } fn register_segments( @@ -684,7 +684,7 @@ impl Tree { return Ok(None); }; - let Some((segment, _, _)) = + let Some((segment, _)) = self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? else { return Ok(None); diff --git a/src/version/mod.rs b/src/version/mod.rs index 02bfcf56..e57cafbe 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -7,7 +7,7 @@ pub mod run; pub use run::Run; -use crate::blob_tree::FragmentationMap; +use crate::blob_tree::{FragmentationEntry, FragmentationMap}; use crate::coding::Encode; use crate::{ vlog::{BlobFile, BlobFileId}, @@ -335,6 +335,8 @@ impl Version { let mut levels = vec![]; + let mut dropped_segments = vec![]; + for level in &self.levels { let runs = level .runs .iter() .map(|run| { // TODO: don't clone Arc inner if we don't need to modify let mut run: Run<Segment> = run.deref().clone(); - run.retain(|x| !ids.contains(&x.metadata.id)); + + let removed_segments = run + .inner_mut() + .extract_if(.., |x| ids.contains(&x.metadata.id)); + + // NOTE: extend rather than assign, because drops may span multiple runs and levels + dropped_segments.extend(removed_segments); + run }) .filter(|x| !x.is_empty()) @@ -353,15 +361,36 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } - // TODO: adjust GC stats by adjusting GC stats based on dropped table's blob file links - // TODO: add unit test + let gc_stats = if dropped_segments.is_empty() { + self.gc_stats.clone() + } else { + let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + + for segment in dropped_segments { + let linked_blob_files = segment + .get_linked_blob_files() + .expect("TODO: handle error") + .unwrap_or_default(); + + for blob_file in linked_blob_files { + copy.entry(blob_file.blob_file_id) + .and_modify(|counter| { + counter.bytes += blob_file.bytes; + counter.len += blob_file.len; + }) + .or_insert_with(|| FragmentationEntry::new(blob_file.len, blob_file.bytes)); + } + } + + Arc::new(copy) + }; Self { inner: Arc::new(VersionInner { id, levels, value_log: self.value_log.clone(), - gc_stats: self.gc_stats.clone(), + gc_stats, }), seqno_watermark: 0, } diff --git a/src/version/run.rs b/src/version/run.rs index 6c804b2e..039e17e1 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -65,6 +65,10 @@ impl Run { Self(items) } + pub fn inner_mut(&mut self) -> &mut Vec<T> { + &mut self.0 + } + pub fn push(&mut self, item: T) { self.0.push(item); diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index 34b81b72..d7fded07 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -23,6 +23,7 @@ pub struct Writer { pub path: PathBuf, pub(crate) blob_file_id: BlobFileId, + #[allow(clippy::struct_field_names)] writer: sfa::Writer, offset: u64, @@ -173,7 +174,7 @@ impl Writer { } pub(crate) fn finish(mut self) -> crate::Result<()> { - self.writer.start("meta"); + self.writer.start("meta")?; // Write metadata let metadata = Metadata { diff --git 
a/tests/blob_drop_range_gc_stats.rs b/tests/blob_drop_range_gc_stats.rs new file mode 100644 index 00000000..ae978fc2 --- /dev/null +++ b/tests/blob_drop_range_gc_stats.rs @@ -0,0 +1,50 @@ +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_drop_range_gc_stats() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(1)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.drop_range::<&[u8], _>(..)?; + + assert_eq!(0, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); // TODO: 3.0.0 automatically prune fully stale blob files from version -> this should be 0 + + let gc_stats = tree + .index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // "big":0 was dropped + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + Ok(()) +} diff --git a/tests/tree_sealed_shadowing.rs b/tests/tree_sealed_shadowing.rs index 8221e77e..aa4ea107 100644 --- a/tests/tree_sealed_shadowing.rs +++ b/tests/tree_sealed_shadowing.rs @@ -19,7 +19,7 @@ fn tree_sealed_memtable_tombstone_shadowing() -> lsm_tree::Result<()> { let (id, memtable) = tree.rotate_memtable().unwrap(); assert!(!tree.contains_key("a", SeqNo::MAX)?); - let (segment, _, _) = tree.flush_memtable(id, &memtable, 0)?.unwrap(); + let (segment, _) = tree.flush_memtable(id, &memtable, 0)?.unwrap(); tree.register_segments(&[segment], None, None, 0)?; assert!(!tree.contains_key("a", SeqNo::MAX)?); diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index 8ed70b62..4b3cdd53 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -45,7 +45,7 @@ fn tree_highest_seqno() -> lsm_tree::Result<()> { assert_eq!(tree.get_highest_memtable_seqno(), Some(4)); assert_eq!(tree.get_highest_persisted_seqno(), Some(3)); - let (segment, _, _) = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); + let (segment, _) = tree.flush_memtable(segment_id, &sealed, 0)?.unwrap(); tree.register_segments(&[segment], None, None, 0)?; assert_eq!(tree.get_highest_seqno(), Some(4)); From dd29371728d865ee31faeff05d730133b6e4637a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 17:01:24 +0200 Subject: [PATCH 495/613] clippy --- .github/workflows/test.yml | 2 +- README.md | 2 +- src/segment/block/encoder.rs | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 16895c73..d0cb132f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: matrix: rust_version: - stable - - "1.89.0" # MSRV + - "1.87.0" # MSRV os: - ubuntu-latest - windows-latest diff --git a/README.md b/README.md index b121c967..7af5d023 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) 
[![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) -![MSRV](https://img.shields.io/badge/MSRV-1.82.0-blue) +![MSRV](https://img.shields.io/badge/MSRV-1.87.0-blue) [![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index ff62db2a..682ac037 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -122,7 +122,10 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> pub fn write(&mut self, item: &'a Item) -> crate::Result<()> { // NOTE: Check if we are a restart marker - if self.item_count % usize::from(self.restart_interval) == 0 { + if self + .item_count + .is_multiple_of(usize::from(self.restart_interval)) + { self.restart_count += 1; if self.restart_interval > 0 { From 0008929469928c9bae43700096baf8d9978d1080 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 17:49:22 +0200 Subject: [PATCH 496/613] in compaction merge, link blob files to created tables --- src/blob_tree/mod.rs | 1 - src/compaction/worker.rs | 16 ++++++- src/segment/mod.rs | 5 +-- src/segment/multi_writer.rs | 35 ++++++++++++++- src/segment/writer/mod.rs | 7 +-- src/vlog/blob_file/multi_writer.rs | 10 +---- tests/blob_major_compact_gc_stats.rs | 29 +++++++++++- tests/blob_major_compact_relink.rs | 67 ++++++++++++++++++++++++++++ 8 files changed, 150 insertions(+), 20 deletions(-) create mode 100644 tests/blob_major_compact_relink.rs diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index acbb13ab..4880d23d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -400,7 +400,6 @@ impl AbstractTree for BlobTree { u64::MAX, // TODO: actually use target size? 
but be sure to link to table correctly self.index.config.path.join(BLOBS_FOLDER), )?; - blob_writer.link_table(segment_id); let iter = memtable.iter().map(Ok); let compaction_filter = CompactionStream::new(iter, eviction_seqno); diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 3ccda50f..003b7b0a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -4,7 +4,8 @@ use super::{CompactionStrategy, Input as CompactionPayload}; use crate::{ - blob_tree::FragmentationMap, + blob_tree::{handle::BlobIndirection, FragmentationMap}, + coding::Decode, compaction::{stream::CompactionStream, Choice}, file::SEGMENTS_FOLDER, level_manifest::LevelManifest, @@ -346,6 +347,19 @@ fn merge_segments( continue; } + if item.key.value_type.is_indirection() { + let mut reader = &item.value[..]; + + let Ok(indirection) = BlobIndirection::decode_from(&mut reader) else { + log::error!("Failed to deserialize blob indirection: {item:?}"); + return Ok(()); + }; + + // TODO: 3.0.0 -> IF we have a blob writer, use the active_blob_file ID instead (rewriting the vptr) + + segment_writer.register_blob(indirection); + } + if let Err(e) = segment_writer.write(item) { log::error!("Compaction failed: {e:?}"); diff --git a/src/segment/mod.rs b/src/segment/mod.rs index c41c4195..b20f2460 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -15,7 +15,7 @@ pub(crate) mod multi_writer; mod regions; mod scanner; pub mod util; -mod writer; +pub mod writer; pub use block::{Block, BlockOffset, Checksum}; pub use data_block::DataBlock; @@ -31,7 +31,6 @@ use crate::{ block::{BlockType, ParsedItem}, writer::LinkedFile, }, - vlog::BlobFileId, CompressionType, InternalValue, SeqNo, TreeId, UserKey, }; use block_index::BlockIndexImpl; @@ -102,7 +101,7 @@ impl std::fmt::Debug for Segment { } impl Segment { - pub(crate) fn get_linked_blob_files(&self) -> crate::Result>> { + pub fn get_linked_blob_files(&self) -> crate::Result>> { use byteorder::{ReadBytesExt, LE}; Ok(if let Some(handle) = &self.regions.linked_blob_files { diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index f38d2ebe..2cb2c541 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -3,7 +3,10 @@ // (found in the LICENSE-* files in the repository) use super::{filter::BloomConstructionPolicy, writer::Writer}; -use crate::{value::InternalValue, CompressionType, SegmentId, UserKey}; +use crate::{ + blob_tree::handle::BlobIndirection, segment::writer::LinkedFile, value::InternalValue, + vlog::BlobFileId, CompressionType, HashMap, SegmentId, UserKey, +}; use std::{ path::PathBuf, sync::{atomic::AtomicU64, Arc}, @@ -43,6 +46,8 @@ pub struct MultiWriter { bloom_policy: BloomConstructionPolicy, current_key: Option, + + linked_blobs: HashMap, } impl MultiWriter { @@ -81,9 +86,25 @@ impl MultiWriter { bloom_policy: BloomConstructionPolicy::default(), current_key: None, + + linked_blobs: HashMap::default(), // TODO: consume on finish or rotate }) } + pub fn register_blob(&mut self, indirection: BlobIndirection) { + self.linked_blobs + .entry(indirection.vhandle.blob_file_id) + .and_modify(|entry| { + entry.bytes += u64::from(indirection.size); + entry.len += 1; + }) + .or_insert_with(|| LinkedFile { + blob_file_id: indirection.vhandle.blob_file_id, + bytes: u64::from(indirection.size), + len: 1, + }); + } + #[must_use] pub fn use_data_block_restart_interval(mut self, interval: u8) -> Self { self.data_block_restart_interval = interval; @@ -173,7 +194,12 @@ impl MultiWriter { 
.use_bloom_policy(self.bloom_policy) .use_data_block_hash_ratio(self.data_block_hash_ratio); - let old_writer = std::mem::replace(&mut self.writer, new_writer); + let mut old_writer = std::mem::replace(&mut self.writer, new_writer); + + for linked in self.linked_blobs.values() { + old_writer.link_blob_file(linked.blob_file_id, linked.bytes, linked.len); + } + self.linked_blobs.clear(); if let Some(segment_id) = old_writer.finish()? { self.results.push(segment_id); @@ -203,6 +229,11 @@ impl MultiWriter { /// /// Returns the metadata of created segments pub fn finish(mut self) -> crate::Result> { + for linked in self.linked_blobs.values() { + self.writer + .link_blob_file(linked.blob_file_id, linked.bytes, linked.len); + } + if let Some(last_writer_result) = self.writer.finish()? { self.results.push(last_writer_result); } diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 907f518d..689a06d4 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -12,10 +12,11 @@ use crate::{ use index::{BlockIndexWriter, FullIndexWriter}; use std::{fs::File, io::BufWriter, path::PathBuf}; +#[derive(Copy, Clone, PartialEq, Eq, Debug)] pub struct LinkedFile { - pub(crate) blob_file_id: BlobFileId, - pub(crate) bytes: u64, - pub(crate) len: usize, + pub blob_file_id: BlobFileId, + pub bytes: u64, + pub len: usize, } /// Serializes and compresses values into blocks and writes them to disk as segment diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index b6e1c714..0942549f 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -8,7 +8,7 @@ use crate::{ blob_file::{Inner as BlobFileInner, Metadata}, BlobFileId, }, - BlobFile, CompressionType, SegmentId, SequenceNumberCounter, + BlobFile, CompressionType, SequenceNumberCounter, }; use std::{ path::{Path, PathBuf}, @@ -27,8 +27,6 @@ pub struct MultiWriter { id_generator: SequenceNumberCounter, compression: CompressionType, - - linked_table_ids: Vec, } impl MultiWriter { @@ -58,15 +56,9 @@ impl MultiWriter { results: Vec::new(), compression: CompressionType::None, - - linked_table_ids: Vec::new(), // TODO: 3.0.0 consume and reset after rotation }) } - pub fn link_table(&mut self, table_id: SegmentId) { - self.linked_table_ids.push(table_id); - } - /// Sets the blob file target size. 
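    ///
    /// Editor's note: the `write` call that crosses this target rotates to a
    /// fresh blob file before returning, so a single record never spans two
    /// blob files.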
#[must_use] pub fn use_target_size(mut self, bytes: u64) -> Self { diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs index 776f9e1d..7935255d 100644 --- a/tests/blob_major_compact_gc_stats.rs +++ b/tests/blob_major_compact_gc_stats.rs @@ -72,7 +72,6 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { assert_eq!(&*value, big_value); tree.flush_active_memtable(0)?; - assert_eq!(1, tree.segment_count()); assert_eq!(1, tree.blob_file_count()); @@ -80,7 +79,22 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; + assert_eq!( + None, + tree.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .next() + .unwrap() + .get_linked_blob_files()?, + ); + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); let gc_stats = tree .index @@ -100,6 +114,19 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { }, &*gc_stats, ); + + assert_eq!( + None, + tree.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .next() + .unwrap() + .get_linked_blob_files()?, + ); } Ok(()) diff --git a/tests/blob_major_compact_relink.rs b/tests/blob_major_compact_relink.rs new file mode 100644 index 00000000..0290942b --- /dev/null +++ b/tests/blob_major_compact_relink.rs @@ -0,0 +1,67 @@ +use lsm_tree::{AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("smol", "small value", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + assert_eq!( + Some(vec![lsm_tree::segment::writer::LinkedFile { + blob_file_id: 0, + bytes: big_value.len() as u64, + len: 1 + }]), + tree.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .next() + .unwrap() + .get_linked_blob_files()?, + ); + + tree.flush_active_memtable(1)?; + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + assert_eq!( + Some(vec![lsm_tree::segment::writer::LinkedFile { + blob_file_id: 0, + bytes: big_value.len() as u64, + len: 1 + }]), + tree.index + .manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .next() + .unwrap() + .get_linked_blob_files()?, + ); + } + + Ok(()) +} From 606df7c1ce54d489b784b12416486da78672c07c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 17:58:03 +0200 Subject: [PATCH 497/613] refactor: table meta u64 reading --- src/segment/meta.rs | 69 +++++++++++---------------------------------- 1 file changed, 17 insertions(+), 52 deletions(-) diff --git a/src/segment/meta.rs b/src/segment/meta.rs index aa923841..4163c3a2 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -49,6 +49,17 @@ pub struct ParsedMeta { pub index_block_compression: CompressionType, } +macro_rules! 
read_u64 { + ($block:expr, $name:expr) => {{ + let bytes = $block + .point_read($name, SeqNo::MAX) + .unwrap_or_else(|| panic!("meta property {:?} should exist", $name)); + + let mut bytes = &bytes.value[..]; + bytes.read_u64::()? + }}; +} + impl ParsedMeta { #[allow(clippy::expect_used, clippy::too_many_lines)] pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result { @@ -108,14 +119,12 @@ impl ParsedMeta { ); } - let id = { - let bytes = block - .point_read(b"#id", SeqNo::MAX) - .expect("Segment ID should exist"); - - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? - }; + let id = read_u64!(block, b"#id"); + let item_count = read_u64!(block, b"#item_count"); + let tombstone_count = read_u64!(block, b"#tombstone_count"); + let data_block_count = read_u64!(block, b"#data_block_count"); + let index_block_count = read_u64!(block, b"#index_block_count"); + let file_size = read_u64!(block, b"#size"); // TODO: rename file_size let created_at = { let bytes = block @@ -126,42 +135,6 @@ impl ParsedMeta { bytes.read_u128::()?.into() }; - let item_count = { - let bytes = block - .point_read(b"#item_count", SeqNo::MAX) - .expect("Segment ID should exist"); - - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? - }; - - let tombstone_count = { - let bytes = block - .point_read(b"#tombstone_count", SeqNo::MAX) - .expect("Segment ID should exist"); - - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? - }; - - let data_block_count = { - let bytes = block - .point_read(b"#data_block_count", SeqNo::MAX) - .expect("data_block_count should exist"); - - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? - }; - - let index_block_count = { - let bytes = block - .point_read(b"#index_block_count", SeqNo::MAX) - .expect("index_block_count should exist"); - - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? - }; - let key_range = KeyRange::new(( block .point_read(b"#key#min", SeqNo::MAX) @@ -195,14 +168,6 @@ impl ParsedMeta { (min, max) }; - let file_size = { - let bytes = block - .point_read(b"#size", SeqNo::MAX) - .expect("size should exist"); - let mut bytes = &bytes.value[..]; - bytes.read_u64::()? 
- }; - let data_block_compression = { let bytes = block .point_read(b"#compression#data", SeqNo::MAX) From bbc2fc07f705d3346698e337ff705852aaacdd27 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 20:55:04 +0200 Subject: [PATCH 498/613] wip --- src/segment/inner.rs | 4 ++-- src/segment/meta.rs | 2 +- src/vlog/blob_file/scanner.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/segment/inner.rs b/src/segment/inner.rs index a41db211..794d922a 100644 --- a/src/segment/inner.rs +++ b/src/segment/inner.rs @@ -57,11 +57,11 @@ impl Drop for Inner { let global_id: GlobalSegmentId = (self.tree_id, self.metadata.id).into(); if self.is_deleted.load(std::sync::atomic::Ordering::Acquire) { - log::trace!("Cleanup deleted segment {global_id:?} at {:?}", self.path); + log::trace!("Cleanup deleted table {global_id:?} at {:?}", self.path); if let Err(e) = std::fs::remove_file(&*self.path) { log::warn!( - "Failed to cleanup deleted segment {global_id:?} at {:?}: {e:?}", + "Failed to cleanup deleted table {global_id:?} at {:?}: {e:?}", self.path, ); } diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 4163c3a2..983bccb3 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -124,7 +124,7 @@ impl ParsedMeta { let tombstone_count = read_u64!(block, b"#tombstone_count"); let data_block_count = read_u64!(block, b"#data_block_count"); let index_block_count = read_u64!(block, b"#index_block_count"); - let file_size = read_u64!(block, b"#size"); // TODO: rename file_size + let file_size = read_u64!(block, b"#size"); // TODO: 3.0.0 rename file_size let created_at = { let bytes = block diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 47fec933..38a34929 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -41,8 +41,8 @@ impl Scanner { } } - pub(crate) fn use_compression(mut self, compressoion: CompressionType) -> Self { - self.compression = compressoion; + pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { + self.compression = compression; self } From 0203b2687406ad98065d7b97758e5760375ef562 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 20:57:20 +0200 Subject: [PATCH 499/613] change kv separation options --- src/config/mod.rs | 155 +++++++++++++++++++++++++--------------------- src/lib.rs | 2 +- 2 files changed, 86 insertions(+), 71 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 3a33d60e..c392ef83 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -16,7 +16,9 @@ pub use hash_ratio::HashRatioPolicy; pub use pinning::PinningPolicy; pub use restart_interval::RestartIntervalPolicy; -use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree}; +use crate::{ + path::absolute_path, AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, Tree, +}; use std::{ path::{Path, PathBuf}, sync::Arc, @@ -55,6 +57,72 @@ impl TryFrom for TreeType { const DEFAULT_FILE_FOLDER: &str = ".lsm.data"; +/// Options for key-value separation +#[derive(Clone)] +pub struct KvSeparationOptions { + /// What type of compression is used for blobs + pub blob_compression: CompressionType, + + /// Blob file (value log segment) target size in bytes + #[doc(hidden)] + pub blob_file_target_size: u64, + + /// Key-value separation threshold in bytes + #[doc(hidden)] + pub blob_file_separation_threshold: u32, + // TODO: blob_file_staleness_threshold AND/OR space_amp_threshold +} + +impl Default for KvSeparationOptions { + fn 
default() -> Self { + Self { + blob_compression: CompressionType::None, // TODO: LZ4 + blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, + blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024, + } + } +} + +impl KvSeparationOptions { + /// Sets the blob compression method. + #[must_use] + pub fn blob_compression(mut self, compression: CompressionType) -> Self { + self.blob_compression = compression; + self + } + + /// Sets the target size of blob files. + /// + /// Smaller blob files allow more granular garbage collection + /// which allows lower space amp for lower write I/O cost. + /// + /// Larger blob files decrease the number of files on disk and maintenance + /// overhead. + /// + /// Defaults to 64 MiB. + /// + /// This option has no effect when not used for opening a blob tree. + #[must_use] + pub fn blob_file_target_size(mut self, bytes: u64) -> Self { + self.blob_file_target_size = bytes; + self + } + + /// Sets the key-value separation threshold in bytes. + /// + /// Smaller value will reduce compaction overhead and thus write amplification, + /// at the cost of lower read performance. + /// + /// Defaults to 4KiB. + /// + /// This option has no effect when not used for opening a blob tree. + #[must_use] + pub fn blob_file_separation_threshold(mut self, bytes: u32) -> Self { + self.blob_file_separation_threshold = bytes; + self + } +} + #[derive(Clone)] /// Tree configuration builder pub struct Config { @@ -70,10 +138,6 @@ pub struct Config { #[doc(hidden)] pub descriptor_table: Arc, - /// Tree type (unused) - #[allow(unused)] - pub tree_type: TreeType, - /// Number of levels of the LSM tree (depth of tree) /// /// Once set, the level count is fixed (in the "manifest" file) @@ -113,17 +177,7 @@ pub struct Config { /// Filter construction policy pub filter_policy: FilterPolicy, - /// What type of compression is used for blobs - pub blob_compression: CompressionType, - - /// Blob file (value log segment) target size in bytes - #[doc(hidden)] - pub blob_file_target_size: u64, - - /// Key-value separation threshold in bytes - #[doc(hidden)] - pub blob_file_separation_threshold: u32, - // TODO: blob_file_staleness_threshold AND/OR space_amp_threshold + pub(crate) kv_separation_opts: Option, } impl Default for Config { @@ -132,13 +186,14 @@ impl Default for Config { path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)), descriptor_table: Arc::new(DescriptorTable::new(256)), - cache: Arc::new(Cache::with_capacity_bytes(/* 16 MiB */ 16 * 1_024 * 1_024)), + cache: Arc::new(Cache::with_capacity_bytes( + /* 16 MiB */ 16 * 1_024 * 1_024, + )), data_block_restart_interval_policy: RestartIntervalPolicy::all(16), index_block_restart_interval_policy: RestartIntervalPolicy::all(1), level_count: 7, - tree_type: TreeType::Standard, data_block_size_policy: BlockSizePolicy::default(), index_block_size_policy: BlockSizePolicy::default(), @@ -147,18 +202,15 @@ impl Default for Config { filter_block_pinning_policy: PinningPolicy::new(&[true, false]), data_block_compression_policy: CompressionPolicy::default(), - index_block_compression_policy:CompressionPolicy::all(CompressionType::None), + index_block_compression_policy: CompressionPolicy::all(CompressionType::None), data_block_hash_ratio_policy: HashRatioPolicy::all(0.0), - blob_compression: CompressionType::None, - filter_policy: FilterPolicy::default(), - blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, - blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024, - expect_point_read_hits: false, + + kv_separation_opts: None, } } } @@ 
-260,13 +312,6 @@ impl Config { self } - /// Sets the blob compression method. - #[must_use] - pub fn blob_compression(mut self, compression: CompressionType) -> Self { - self.blob_compression = compression; - self - } - /// Sets the number of levels of the LSM tree (depth of tree). /// /// Defaults to 7, like `LevelDB` and `RocksDB`. @@ -308,34 +353,10 @@ impl Config { self } - /// Sets the target size of blob files. - /// - /// Smaller blob files allow more granular garbage collection - /// which allows lower space amp for lower write I/O cost. - /// - /// Larger blob files decrease the number of files on disk and maintenance - /// overhead. - /// - /// Defaults to 64 MiB. - /// - /// This option has no effect when not used for opening a blob tree. + /// Toggles key-value separation. #[must_use] - pub fn blob_file_target_size(mut self, bytes: u64) -> Self { - self.blob_file_target_size = bytes; - self - } - - /// Sets the key-value separation threshold in bytes. - /// - /// Smaller value will reduce compaction overhead and thus write amplification, - /// at the cost of lower read performance. - /// - /// Defaults to 4KiB. - /// - /// This option has no effect when not used for opening a blob tree. - #[must_use] - pub fn blob_file_separation_threshold(mut self, bytes: u32) -> Self { - self.blob_file_separation_threshold = bytes; + pub fn with_kv_separation(mut self, opts: Option) -> Self { + self.kv_separation_opts = opts; self } @@ -344,17 +365,11 @@ impl Config { /// # Errors /// /// Will return `Err` if an IO error occurs. - pub fn open(self) -> crate::Result { - Tree::open(self) - } - - /// Opens a blob tree using the config. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - pub fn open_as_blob_tree(mut self) -> crate::Result { - self.tree_type = TreeType::Blob; - BlobTree::open(self) + pub fn open(self) -> crate::Result { + Ok(if self.kv_separation_opts.is_some() { + AnyTree::Blob(BlobTree::open(self)?) + } else { + AnyTree::Standard(Tree::open(self)?) 
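+ // Editor's sketch (not part of the patch): with the reworked options,
+ // callers choose the tree flavor by supplying `KvSeparationOptions`:
+ //
+ //   let tree = Config::new(path)
+ //       .with_kv_separation(Some(
+ //           KvSeparationOptions::default()
+ //               .blob_file_target_size(64 * 1_024 * 1_024)
+ //               .blob_file_separation_threshold(4 * 1_024),
+ //       ))
+ //       .open()?; // -> AnyTree::Blob
+ //
+ // Without options, `open` yields `AnyTree::Standard`.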
+ }) } } diff --git a/src/lib.rs b/src/lib.rs index 12e613d4..cd2883eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -249,7 +249,7 @@ pub use { cache::Cache, coding::{DecodeError, EncodeError}, compression::CompressionType, - config::{Config, TreeType}, + config::{Config, KvSeparationOptions, TreeType}, descriptor_table::DescriptorTable, error::{Error, Result}, format_version::FormatVersion, From 2a449ee8cef5b1044528a4c115d3622db9e64066 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 21:00:59 +0200 Subject: [PATCH 500/613] refactor --- src/abstract.rs | 33 ++++- src/blob_tree/mod.rs | 62 +++++++--- src/compaction/worker.rs | 57 +-------- src/level_manifest/mod.rs | 4 +- src/multi_reader.rs | 2 +- src/run_reader.rs | 4 +- src/run_scanner.rs | 2 +- src/tree/inner.rs | 3 +- src/tree/mod.rs | 116 +++++++++--------- src/version/mod.rs | 36 ++++-- src/vlog/accessor.rs | 12 +- src/vlog/blob_file/mod.rs | 19 +++ tests/blob_drop_range_gc_stats.rs | 11 +- tests/blob_flush_gc_stats.rs | 8 +- tests/blob_major_compact_gc_stats.rs | 42 ++++--- tests/blob_major_compact_relink.rs | 14 +-- tests/blob_recover_gc_stats.rs | 14 ++- tests/blob_sep_threshold.rs | 8 +- tests/blob_simple.rs | 8 +- tests/blob_tree_reload_blob.rs | 10 +- tests/compaction_readers_grouping.rs | 8 +- tests/experimental_blob_tree_guarded_size.rs | 4 +- tests/experimental_tree_guarded_range.rs | 4 +- tests/multi_trees.rs | 40 ++---- tests/mvcc_slab.rs | 7 +- tests/segment_point_reads.rs | 3 +- tests/tree_approx_len.rs | 6 +- tests/tree_count.rs | 2 +- tests/tree_disjoint_point_read.rs | 11 +- tests/tree_drop_range.rs | 4 +- tests/tree_flush_eviction.rs | 4 +- tests/tree_l0_point_read.rs | 4 +- tests/tree_l0_range.rs | 4 +- ...compaction.rs => tree_major_compaction.rs} | 0 tests/tree_range.rs | 2 +- tests/tree_recover_counter.rs | 28 +---- tests/tree_shadowing.rs | 8 +- 37 files changed, 320 insertions(+), 284 deletions(-) rename tests/{major_compaction.rs => tree_major_compaction.rs} (100%) diff --git a/src/abstract.rs b/src/abstract.rs index 6bda4ee7..666e96da 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -4,14 +4,14 @@ use crate::{ blob_tree::FragmentationMap, compaction::CompactionStrategy, config::TreeType, - iter_guard::IterGuardImpl, segment::Segment, tree::inner::MemtableId, vlog::BlobFile, AnyTree, - BlobTree, Config, Guard, KvPair, Memtable, SegmentId, SeqNo, SequenceNumberCounter, Tree, - UserKey, UserValue, + iter_guard::IterGuardImpl, level_manifest::LevelManifest, segment::Segment, + tree::inner::MemtableId, vlog::BlobFile, AnyTree, BlobTree, Config, Guard, InternalValue, + KvPair, Memtable, SegmentId, SeqNo, SequenceNumberCounter, Tree, TreeId, UserKey, UserValue, }; use enum_dispatch::enum_dispatch; use std::{ ops::RangeBounds, - sync::{Arc, RwLockWriteGuard}, + sync::{Arc, RwLock, RwLockWriteGuard}, }; pub type RangeItem = crate::Result; @@ -20,6 +20,31 @@ pub type RangeItem = crate::Result; #[allow(clippy::module_name_repetitions)] #[enum_dispatch] pub trait AbstractTree { + #[doc(hidden)] + fn next_table_id(&self) -> SegmentId; + + #[doc(hidden)] + fn id(&self) -> TreeId; + + #[doc(hidden)] + fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result>; + + #[doc(hidden)] + fn manifest(&self) -> &Arc>; + + /// Synchronously flushes the active memtable to a disk segment. + /// + /// The function may not return a result, if, during concurrent workloads, the memtable + /// ends up being empty before the flush is set up. + /// + /// The result will contain the [`Segment`]. 
+ /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + #[doc(hidden)] + fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result>; + /// Returns an iterator that scans through the entire tree. /// /// Avoid using this function, or limit it as otherwise it may scan a lot of items. diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 4880d23d..4b43456c 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -13,6 +13,7 @@ use crate::{ compaction::stream::CompactionStream, file::{fsync_directory, BLOBS_FOLDER}, iter_guard::{IterGuard, IterGuardImpl}, + level_manifest::LevelManifest, r#abstract::{AbstractTree, RangeItem}, segment::Segment, tree::inner::MemtableId, @@ -21,7 +22,13 @@ use crate::{ Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; use handle::BlobIndirection; -use std::{collections::BTreeMap, io::Cursor, ops::RangeBounds, path::PathBuf, sync::Arc}; +use std::{ + collections::BTreeMap, + io::Cursor, + ops::RangeBounds, + path::PathBuf, + sync::{Arc, RwLock}, +}; pub struct Guard<'a> { blob_tree: &'a BlobTree, @@ -99,14 +106,14 @@ pub struct BlobTree { impl BlobTree { pub(crate) fn open(config: Config) -> crate::Result { - let index = config.open()?; + let index = crate::Tree::open(config)?; let blobs_folder = index.config.path.join(BLOBS_FOLDER); std::fs::create_dir_all(&blobs_folder)?; fsync_directory(&blobs_folder)?; let blob_file_id_to_continue_with = index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -123,9 +130,26 @@ impl BlobTree { blob_file_id_generator: SequenceNumberCounter::new(blob_file_id_to_continue_with), }) } +} - #[doc(hidden)] - pub fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result> { +impl AbstractTree for BlobTree { + fn next_table_id(&self) -> SegmentId { + self.index.next_table_id() + } + + fn id(&self) -> crate::TreeId { + self.index.id() + } + + fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { + self.index.get_internal_entry(key, seqno) + } + + fn manifest(&self) -> &Arc> { + self.index.manifest() + } + + fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result> { let Some((segment_id, yanked_memtable)) = self.index.rotate_memtable() else { return Ok(None); }; @@ -144,9 +168,7 @@ impl BlobTree { Ok(Some(segment)) } -} -impl AbstractTree for BlobTree { #[cfg(feature = "metrics")] fn metrics(&self) -> &Arc { self.index.metrics() @@ -167,8 +189,7 @@ impl AbstractTree for BlobTree { let range = prefix_to_range(prefix.as_ref()); let version = self - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -194,8 +215,7 @@ impl AbstractTree for BlobTree { index: Option>, ) -> Box> + '_> { let version = self - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -312,8 +332,7 @@ impl AbstractTree for BlobTree { } fn blob_file_count(&self) -> usize { - self.index - .manifest + self.manifest() .read() .expect("lock is poisoned") .current_version() @@ -341,8 +360,7 @@ impl AbstractTree for BlobTree { } fn stale_blob_bytes(&self) -> u64 { - self.index - .manifest + self.manifest() .read() .expect("lock is poisoned") .current_version() @@ -407,6 +425,14 @@ impl AbstractTree for BlobTree { let mut blob_bytes_referenced = 0; let mut blobs_referenced_count = 0; + let separation_threshold = self + .index + .config + .kv_separation_opts + .as_ref() + .expect("kv separation options should exist") + 
.blob_file_separation_threshold; + for item in compaction_filter { let item = item?; @@ -423,7 +449,7 @@ impl AbstractTree for BlobTree { #[allow(clippy::cast_possible_truncation)] let value_size = value.len() as u32; - if value_size >= self.index.config.blob_file_separation_threshold { + if value_size >= separation_threshold { let offset = blob_writer.offset(); let blob_file_id = blob_writer.blob_file_id(); let on_disk_size = blob_writer.write(&item.key.user_key, value)?; @@ -559,7 +585,7 @@ impl AbstractTree for BlobTree { } fn disk_space(&self) -> u64 { - let lock = self.index.manifest.read().expect("lock is poisoned"); + let lock = self.manifest().read().expect("lock is poisoned"); let version = lock.current_version(); let vlog = crate::vlog::Accessor::new(&version.value_log); self.index.disk_space() + vlog.disk_space() @@ -592,7 +618,7 @@ impl AbstractTree for BlobTree { return Ok(None); }; - let lock = self.index.manifest.read().expect("lock is poisoned"); + let lock = self.manifest().read().expect("lock is poisoned"); let version = lock.current_version(); let (_, v) = resolve_value_handle(self, &version.value_log, item)?; Ok(Some(v)) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 003b7b0a..0984e364 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -14,7 +14,7 @@ use crate::{ segment::{multi_writer::MultiWriter, Segment}, stop_signal::StopSignal, tree::inner::TreeId, - Config, InternalValue, SegmentId, SeqNo, TreeType, + AbstractTree, Config, InternalValue, SegmentId, SeqNo, TreeType, }; use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, @@ -58,7 +58,7 @@ impl Options { tree_id: tree.id, segment_id_generator: tree.segment_id_counter.clone(), config: tree.config.clone(), - levels: tree.manifest.clone(), + levels: tree.manifest().clone(), stop_signal: tree.stop_signal.clone(), strategy, eviction_seqno: 0, @@ -322,7 +322,7 @@ fn merge_segments( }); // NOTE: If we are a blob tree, install callback to listen for evicted KVs - if opts.config.tree_type == TreeType::Blob { + if opts.config.kv_separation_opts.is_some() { merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); } @@ -412,57 +412,6 @@ fn merge_segments( #[cfg(feature = "metrics")] opts.metrics.clone(), ) - - /* let segment_id = trailer.metadata.id; - let segment_file_path = segments_base_folder.join(segment_id.to_string()); - - let block_index = match payload.dest_level { - 0 | 1 => { - let block_index = FullBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - &trailer.offsets, - )?; - BlockIndexImpl::Full(block_index) - } - _ => { - // NOTE: Need to allow because of false positive in Clippy - // because of "bloom" feature - #[allow(clippy::needless_borrows_for_generic_args)] - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - trailer.offsets.tli_ptr, - (opts.tree_id, segment_id).into(), - opts.config.descriptor_table.clone(), - opts.config.cache.clone(), - )?; - BlockIndexImpl::TwoLevel(block_index) - } - }; - let block_index = Arc::new(block_index); - - let bloom_filter = Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr)?; - - Ok(SegmentInner { - path: segment_file_path, - - tree_id: opts.tree_id, - - descriptor_table: opts.config.descriptor_table.clone(), - cache: opts.config.cache.clone(), - - metadata: trailer.metadata, - offsets: trailer.offsets, - - #[allow(clippy::needless_borrows_for_generic_args)] - block_index, - - bloom_filter, - - is_deleted: AtomicBool::default(), 
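The compaction loop above boils down to one routing decision per KV pair. A toy sketch of that rule, with invented handle fields standing in for the crate's `BlobIndirection`:

    // Values at or above the separation threshold are written out-of-line
    // to a blob file; the LSM index then stores only a small handle.
    enum Stored {
        Inline(Vec<u8>),                             // small value, stays in the data block
        Indirect { blob_file_id: u64, offset: u64 }, // toy handle fields
    }

    fn route(value: Vec<u8>, separation_threshold: u32) -> Stored {
        if value.len() as u32 >= separation_threshold {
            // in the real code: blob_writer.write(...) returns the on-disk location
            Stored::Indirect { blob_file_id: 0, offset: 0 }
        } else {
            Stored::Inline(value)
        }
    }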
- } - .into()) */ }) .collect::>>(); diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 5277104e..549ecf82 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -461,12 +461,12 @@ mod tests { // NOTE: Purposefully change level manifest to have invalid path // to force an I/O error - tree.manifest.write().expect("lock is poisoned").folder = "/invaliiid/asd".into(); + tree.manifest().write().expect("lock is poisoned").folder = "/invaliiid/asd".into(); assert!(tree.major_compact(u64::MAX, 4).is_err()); assert!(tree - .manifest + .manifest() .read() .expect("lock is poisoned") .hidden_set diff --git a/src/multi_reader.rs b/src/multi_reader.rs index 9ce0fe5a..bf32552c 100644 --- a/src/multi_reader.rs +++ b/src/multi_reader.rs @@ -76,7 +76,7 @@ mod tests { } let segments = tree - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/src/run_reader.rs b/src/run_reader.rs index be8cdff6..21b9c443 100644 --- a/src/run_reader.rs +++ b/src/run_reader.rs @@ -154,7 +154,7 @@ mod tests { } let segments = tree - .manifest + .manifest() .read() .expect("lock is poisoned") .iter() @@ -196,7 +196,7 @@ mod tests { } let segments = tree - .manifest + .manifest() .read() .expect("lock is poisoned") .iter() diff --git a/src/run_scanner.rs b/src/run_scanner.rs index e7a39bfa..1405b8d9 100644 --- a/src/run_scanner.rs +++ b/src/run_scanner.rs @@ -90,7 +90,7 @@ mod tests { } let segments = tree - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/src/tree/inner.rs b/src/tree/inner.rs index 32b1ae59..00da1bca 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -68,8 +68,7 @@ pub struct TreeInner { pub(crate) sealed_memtables: Arc>, /// Current tree version - #[doc(hidden)] - pub manifest: Arc>, + pub(super) manifest: Arc>, /// Tree configuration pub config: Config, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index b8e71fea..370d7415 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -20,8 +20,8 @@ use crate::{ slice::Slice, value::InternalValue, vlog::BlobFile, - AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, UserKey, - UserValue, ValueType, + AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, + TreeType, UserKey, UserValue, ValueType, }; use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; use std::{ @@ -73,6 +73,57 @@ impl std::ops::Deref for Tree { } impl AbstractTree for Tree { + fn next_table_id(&self) -> SegmentId { + self.0 + .segment_id_counter + .load(std::sync::atomic::Ordering::Relaxed) + } + + fn id(&self) -> TreeId { + self.id + } + + fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { + // TODO: consolidate memtable & sealed behind single RwLock + + let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); + + if let Some(entry) = memtable_lock.get(key, seqno) { + return Ok(ignore_tombstone_value(entry)); + } + + drop(memtable_lock); + + // Now look in sealed memtables + if let Some(entry) = self.get_internal_entry_from_sealed_memtables(key, seqno) { + return Ok(ignore_tombstone_value(entry)); + } + + // Now look in segments... 
this may involve disk I/O + self.get_internal_entry_from_segments(key, seqno) + } + + fn manifest(&self) -> &Arc<RwLock<LevelManifest>> { + &self.manifest + } + + fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result<Option<Segment>> { + log::debug!("Flushing active memtable"); + + let Some((segment_id, yanked_memtable)) = self.rotate_memtable() else { + return Ok(None); + }; + + let Some((segment, _)) = + self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? + else { + return Ok(None); + }; + self.register_segments(std::slice::from_ref(&segment), None, None, seqno_threshold)?; + + Ok(Some(segment)) + } + #[cfg(feature = "metrics")] fn metrics(&self) -> &Arc<Metrics> { &self.0.metrics @@ -666,34 +717,6 @@ impl Tree { Ok(Some(created_segment)) } - /// Synchronously flushes the active memtable to a disk segment. - /// - /// The function may not return a result, if, during concurrent workloads, the memtable - /// ends up being empty before the flush is set up. - /// - /// The result will contain the [`Segment`]. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - #[doc(hidden)] - pub fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result<Option<Segment>> { - log::debug!("Flushing active memtable"); - - let Some((segment_id, yanked_memtable)) = self.rotate_memtable() else { - return Ok(None); - }; - - let Some((segment, _)) = - self.flush_memtable(segment_id, &yanked_memtable, seqno_threshold)? - else { - return Ok(None); - }; - self.register_segments(std::slice::from_ref(&segment), None, None, seqno_threshold)?; - - Ok(Some(segment)) - } - /// Returns `true` if there are some segments that are being compacted. #[doc(hidden)] #[must_use] @@ -783,31 +806,6 @@ impl Tree { Ok(None) } - #[doc(hidden)] - pub fn get_internal_entry( - &self, - key: &[u8], - seqno: SeqNo, - ) -> crate::Result<Option<InternalValue>> { - // TODO: consolidate memtable & sealed behind single RwLock - - let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); - - if let Some(entry) = memtable_lock.get(key, seqno) { - return Ok(ignore_tombstone_value(entry)); - } - - drop(memtable_lock); - - // Now look in sealed memtables - if let Some(entry) = self.get_internal_entry_from_sealed_memtables(key, seqno) { - return Ok(ignore_tombstone_value(entry)); - } - - // Now look in segments... 
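The point-read path moved onto the trait keeps its newest-first cascade: active memtable, then sealed memtables, then segments (which may hit disk). A self-contained schematic of that cascade with toy types, not the crate's:

    enum Hit {
        Value(&'static str),
        Tombstone,
    }

    // Sources ordered newest to oldest: active memtable, sealed memtables, segments.
    fn read(sources: Vec<Option<Hit>>) -> Option<&'static str> {
        for source in sources {
            match source {
                Some(Hit::Value(v)) => return Some(v), // newest visible version wins
                Some(Hit::Tombstone) => return None,   // a deletion shadows older versions
                None => {}                             // this source has no entry; fall through
            }
        }
        None
    }

    // read(vec![None, Some(Hit::Tombstone), Some(Hit::Value("old"))]) == None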
this may involve disk I/O - self.get_internal_entry_from_segments(key, seqno) - } - fn inner_compact( &self, strategy: Arc, @@ -937,7 +935,6 @@ impl Tree { // IMPORTANT: Restore persisted config config.level_count = manifest.level_count; - config.tree_type = manifest.tree_type; let tree_id = get_next_tree_id(); @@ -993,8 +990,11 @@ impl Tree { Manifest { version: FormatVersion::V3, level_count: config.level_count, - tree_type: config.tree_type, - // table_type: TableType::Block, + tree_type: if config.kv_separation_opts.is_some() { + TreeType::Blob + } else { + TreeType::Standard + }, } .encode_into(&mut file)?; file.sync_all()?; diff --git a/src/version/mod.rs b/src/version/mod.rs index e57cafbe..eed37b95 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -308,7 +308,7 @@ impl Version { }; let gc_map = if let Some(diff) = diff { - let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + let mut copy = self.gc_stats.deref().clone(); diff.merge_into(&mut copy); copy.prune(&self.value_log); Arc::new(copy) @@ -364,9 +364,9 @@ impl Version { let gc_stats = if dropped_segments.is_empty() { self.gc_stats.clone() } else { - let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + let mut copy = self.gc_stats.deref().clone(); - for segment in dropped_segments { + for segment in &dropped_segments { let linked_blob_files = segment .get_linked_blob_files() .expect("TODO: handle error") @@ -385,11 +385,20 @@ impl Version { Arc::new(copy) }; + let value_log = if dropped_segments.is_empty() { + self.value_log.clone() + } else { + // TODO: 3.0.0 this should really be a newtype + let mut copy = self.value_log.deref().clone(); + copy.retain(|_, blob_file| !blob_file.is_dead(&gc_stats)); + Arc::new(copy) + }; + Self { inner: Arc::new(VersionInner { id, levels, - value_log: self.value_log.clone(), + value_log, gc_stats, }), seqno_watermark: 0, @@ -429,8 +438,10 @@ impl Version { levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect())); } - let gc_map = if let Some(diff) = diff { - let mut copy: FragmentationMap = self.gc_stats.deref().clone(); + let has_diff = diff.is_some(); + + let gc_stats = if let Some(diff) = diff { + let mut copy = self.gc_stats.deref().clone(); diff.merge_into(&mut copy); copy.prune(&self.value_log); Arc::new(copy) @@ -438,12 +449,21 @@ impl Version { self.gc_stats.clone() }; + let value_log = if has_diff { + // TODO: 3.0.0 this should really be a newtype + let mut copy = self.value_log.deref().clone(); + copy.retain(|_, blob_file| !blob_file.is_dead(&gc_stats)); + Arc::new(copy) + } else { + self.value_log.clone() + }; + Self { inner: Arc::new(VersionInner { id, levels, - value_log: self.value_log.clone(), - gc_stats: gc_map, + value_log, + gc_stats, }), seqno_watermark: 0, } diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index 48fd3bf1..1fe4f3b3 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -8,10 +8,12 @@ use crate::{ }; use std::{collections::BTreeMap, fs::File, path::Path, sync::Arc}; -pub struct Accessor<'a>(&'a BTreeMap); +type Inner = BTreeMap; + +pub struct Accessor<'a>(&'a Inner); impl<'a> Accessor<'a> { - pub fn new(blob_files: &'a BTreeMap) -> Self { + pub fn new(blob_files: &'a Inner) -> Self { Self(blob_files) } @@ -30,7 +32,7 @@ impl<'a> Accessor<'a> { cache: &Cache, descriptor_table: &DescriptorTable, ) -> crate::Result> { - if let Some(value) = cache.get_blob(0 /* TODO: tree ID... */, vhandle) { + if let Some(value) = cache.get_blob(0 /* TODO: 3.0.0 tree ID... 
*/, vhandle) { return Ok(Some(value)); } @@ -38,7 +40,7 @@ impl<'a> Accessor<'a> { return Ok(None); }; - let bf_id = GlobalSegmentId::from((0 /* TODO: tree ID */, blob_file.id())); + let bf_id = GlobalSegmentId::from((0 /* TODO: 3.0.0 tree ID */, blob_file.id())); let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { fd @@ -51,7 +53,7 @@ impl<'a> Accessor<'a> { }; let value = Reader::new(blob_file, &file).get(key, vhandle)?; - cache.insert_blob(0 /* TODO: tree_id */, vhandle, value.clone()); + cache.insert_blob(0 /* TODO: 3.0.0 tree_id */, vhandle, value.clone()); Ok(Some(value)) } diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 7b8e6ab4..3d43e47e 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -58,4 +58,23 @@ impl BlobFile { pub fn len(&self) -> u64 { self.0.meta.item_count } + + /// Returns `true` if the blob file is stale (based on the given staleness threshold). + pub(crate) fn is_stale(&self, frag_map: &FragmentationMap, threshold: f32) -> bool { + frag_map.get(&self.id()).is_some_and(|x| { + let stale_bytes = x.bytes as f32; + let all_bytes = self.0.meta.total_uncompressed_bytes as f32; + let ratio = stale_bytes / all_bytes; + ratio >= threshold + }) + } + + /// Returns `true` if the blob file has no more incoming references, and can be safely removed from a Version. + pub(crate) fn is_dead(&self, frag_map: &FragmentationMap) -> bool { + frag_map.get(&self.id()).is_some_and(|x| { + let stale_bytes = x.bytes; + let all_bytes = self.0.meta.total_uncompressed_bytes; + stale_bytes == all_bytes + }) + } } diff --git a/tests/blob_drop_range_gc_stats.rs b/tests/blob_drop_range_gc_stats.rs index ae978fc2..ae8b7d73 100644 --- a/tests/blob_drop_range_gc_stats.rs +++ b/tests/blob_drop_range_gc_stats.rs @@ -9,7 +9,9 @@ fn blob_tree_drop_range_gc_stats() -> lsm_tree::Result<()> { let big_value = b"neptune!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -23,12 +25,13 @@ fn blob_tree_drop_range_gc_stats() -> lsm_tree::Result<()> { tree.drop_range::<&[u8], _>(..)?; + // NOTE: Because the blob does not have any incoming references anymore + // it is pruned from the Version + assert_eq!(0, tree.blob_file_count()); assert_eq!(0, tree.segment_count()); - assert_eq!(1, tree.blob_file_count()); // TODO: 3.0.0 automatically prune fully stale blob files from version -> this should be 0 let gc_stats = tree - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/blob_flush_gc_stats.rs b/tests/blob_flush_gc_stats.rs index 8a2dd529..13788638 100644 --- a/tests/blob_flush_gc_stats.rs +++ b/tests/blob_flush_gc_stats.rs @@ -10,7 +10,9 @@ fn blob_tree_flush_gc_stats() -> lsm_tree::Result<()> { let new_big_value = b"winter!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -38,7 +40,9 @@ fn blob_tree_flush_gc_stats_tombstone() -> lsm_tree::Result<()> { let big_value = b"neptune!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + 
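A worked example of the fragmentation arithmetic behind `is_stale` and `is_dead` above, with invented numbers for a blob file holding 128 MB of blobs, 96 MB of which are no longer referenced:

    fn main() {
        let stale_bytes = 96_000_000_u64;
        let total_bytes = 128_000_000_u64;

        let ratio = stale_bytes as f32 / total_bytes as f32; // 0.75
        assert!(ratio >= 0.5);              // is_stale at a 0.5 threshold: rewrite candidate
        assert!(stale_bytes < total_bytes); // not is_dead: some blobs are still referenced
    }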
.open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs index 7935255d..d46ada8f 100644 --- a/tests/blob_major_compact_gc_stats.rs +++ b/tests/blob_major_compact_gc_stats.rs @@ -1,6 +1,8 @@ use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; use test_log::test; +// TODO: 3.0.0 check that decompressed value size is used (enable compression) + #[test] fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -10,7 +12,9 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { let new_big_value = b"winter!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -28,11 +32,11 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; + // Major compaction does not rewrite every blob file tree.major_compact(64_000_000, 1_000)?; let gc_stats = tree - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -53,7 +57,6 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { Ok(()) } -// TODO: check that decompressed value size is used (enable compression) #[test] fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -62,10 +65,13 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { let big_value = b"neptune!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); + tree.insert("another_big", &big_value, 0); tree.insert("smol", "small value", 0); let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); @@ -78,27 +84,32 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { tree.remove("big", 1); tree.flush_active_memtable(0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); assert_eq!( - None, - tree.index - .manifest + Some(vec![lsm_tree::segment::writer::LinkedFile { + blob_file_id: 0, + bytes: 2 * big_value.len() as u64, + len: 2, + }]), + tree.manifest() .read() .expect("lock is poisoned") .current_version() .iter_segments() - .next() + .nth(1) .unwrap() .get_linked_blob_files()?, ); + // Major compaction does not rewrite every blob file tree.major_compact(64_000_000, 1_000)?; assert_eq!(1, tree.segment_count()); assert_eq!(1, tree.blob_file_count()); let gc_stats = tree - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -116,9 +127,12 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { ); assert_eq!( - None, - tree.index - .manifest + Some(vec![lsm_tree::segment::writer::LinkedFile { + blob_file_id: 0, + bytes: big_value.len() as u64, + len: 1, + }]), + tree.manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/blob_major_compact_relink.rs b/tests/blob_major_compact_relink.rs index 0290942b..e834b152 100644 --- a/tests/blob_major_compact_relink.rs +++ b/tests/blob_major_compact_relink.rs @@ -9,7 +9,9 @@ fn blob_tree_major_compact_gc_stats() -> 
lsm_tree::Result<()> { let big_value = b"neptune!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -26,10 +28,9 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { Some(vec![lsm_tree::segment::writer::LinkedFile { blob_file_id: 0, bytes: big_value.len() as u64, - len: 1 + len: 1, }]), - tree.index - .manifest + tree.manifest() .read() .expect("lock is poisoned") .current_version() @@ -49,10 +50,9 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { Some(vec![lsm_tree::segment::writer::LinkedFile { blob_file_id: 0, bytes: big_value.len() as u64, - len: 1 + len: 1, }]), - tree.index - .manifest + tree.manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/blob_recover_gc_stats.rs b/tests/blob_recover_gc_stats.rs index edc86c7e..e0a9695b 100644 --- a/tests/blob_recover_gc_stats.rs +++ b/tests/blob_recover_gc_stats.rs @@ -10,7 +10,9 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { let new_big_value = b"winter!".repeat(128_000); { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -31,8 +33,7 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { tree.major_compact(64_000_000, 1_000)?; let gc_stats = tree - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -51,11 +52,12 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { } { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; let gc_stats = tree - .index - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index 449d2a92..1c26a11a 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -1,4 +1,4 @@ -use lsm_tree::{AbstractTree, SeqNo}; +use lsm_tree::{AbstractTree, KvSeparationOptions, SeqNo}; use test_log::test; #[test] @@ -8,8 +8,10 @@ fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { let path = folder.path(); let tree = lsm_tree::Config::new(path) - .blob_file_separation_threshold(1_024) - .open_as_blob_tree()?; + .with_kv_separation(Some( + KvSeparationOptions::default().blob_file_separation_threshold(1_024), + )) + .open()?; tree.insert("a", "a".repeat(1_023), 0); tree.flush_active_memtable(0)?; diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 8bd4ccaf..8557ed1d 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -13,7 +13,9 @@ fn blob_tree_simple_flush_read() -> lsm_tree::Result<()> { // TODO: 3.0.0 just do Config.with_kv_separation().open() // on recover, check manifest for type // just return AnyTree - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); tree.insert("big", &big_value, 0); @@ -48,7 +50,9 @@ fn blob_tree_simple_flush_read() -> lsm_tree::Result<()> { } { - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + 
.with_kv_separation(Some(Default::default())) + .open()?; let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"small value"); diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 8f6daaac..f42a09c8 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -9,7 +9,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; { - let tree = Config::new(&folder).open_as_blob_tree()?; + let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); @@ -25,7 +25,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).open_as_blob_tree()?; + let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); @@ -43,7 +43,7 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).open_as_blob_tree()?; + let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); @@ -69,7 +69,7 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { let seqno = SequenceNumberCounter::default(); { - let tree = Config::new(&folder).open_as_blob_tree()?; + let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); @@ -102,7 +102,7 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).open_as_blob_tree()?; + let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index f277bc51..dd7d5ea0 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -37,7 +37,7 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { tree.compact(Arc::new(lsm_tree::compaction::PullDown(2, 3)), 0)?; assert!(!tree - .manifest + .manifest() .read() .expect("asdasd") .current_version() @@ -46,7 +46,7 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { .is_empty()); assert!(tree - .manifest + .manifest() .read() .expect("asdasd") .current_version() @@ -55,7 +55,7 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { .is_empty()); assert!(tree - .manifest + .manifest() .read() .expect("asdasd") .current_version() @@ -64,7 +64,7 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { .is_empty()); assert!(!tree - .manifest + .manifest() .read() .expect("asdasd") .current_version() diff --git a/tests/experimental_blob_tree_guarded_size.rs b/tests/experimental_blob_tree_guarded_size.rs index c435f359..df7dbaff 100644 --- a/tests/experimental_blob_tree_guarded_size.rs +++ b/tests/experimental_blob_tree_guarded_size.rs @@ -6,7 +6,9 @@ use test_log::test; fn experimental_blob_tree_guarded_size() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a".as_bytes(), "abc", 0); tree.insert("b".as_bytes(), 
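The mechanical `open_as_blob_tree()` migrations above all reduce to the same builder call; gathered here in one sketch, assuming `Config::open` now returns an `AnyTree` (as the updated tests and the `tree_drop_range.rs` signature suggest). The 1_024 threshold mirrors tests/blob_sep_threshold.rs:

    use lsm_tree::{AnyTree, Config, KvSeparationOptions};

    fn open_kv_separated(path: &std::path::Path) -> lsm_tree::Result<AnyTree> {
        Config::new(path)
            .with_kv_separation(Some(
                KvSeparationOptions::default().blob_file_separation_threshold(1_024),
            ))
            .open()
    }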
"a".repeat(10_000), 0); diff --git a/tests/experimental_tree_guarded_range.rs b/tests/experimental_tree_guarded_range.rs index 286da5cf..0b18937d 100644 --- a/tests/experimental_tree_guarded_range.rs +++ b/tests/experimental_tree_guarded_range.rs @@ -36,7 +36,9 @@ fn experimental_tree_guarded_range() -> lsm_tree::Result<()> { fn experimental_blob_tree_guarded_range() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 0); tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 1); diff --git a/tests/multi_trees.rs b/tests/multi_trees.rs index c7015ec7..47dff0ec 100644 --- a/tests/multi_trees.rs +++ b/tests/multi_trees.rs @@ -7,31 +7,19 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { let folder1 = tempfile::tempdir()?; let tree0 = Config::new(&folder0).open()?; - assert_eq!(tree0.id, 0); + assert_eq!(tree0.id(), 0); - assert_eq!( - 0, - tree0 - .0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(0, tree0.next_table_id()); tree0.insert("a", "a", 0); tree0.flush_active_memtable(0)?; - assert_eq!( - 1, - tree0 - .0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(1, tree0.next_table_id()); assert_eq!( 0, tree0 - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() @@ -46,31 +34,19 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { ); let tree1 = Config::new(&folder1).open()?; - assert_eq!(tree1.id, 1); + assert_eq!(tree1.id(), 1); - assert_eq!( - 0, - tree1 - .0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(0, tree1.next_table_id()); tree1.insert("a", "a", 0); tree1.flush_active_memtable(0)?; - assert_eq!( - 1, - tree1 - .0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(1, tree1.next_table_id()); assert_eq!( 0, tree1 - .manifest + .manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/mvcc_slab.rs b/tests/mvcc_slab.rs index 23bd5df4..45ef7ee3 100644 --- a/tests/mvcc_slab.rs +++ b/tests/mvcc_slab.rs @@ -21,7 +21,7 @@ fn segment_reader_mvcc_slab() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.manifest.read().expect("lock is poisoned"); + let level_manifest = tree.manifest().read().expect("lock is poisoned"); let segment = level_manifest .current_version() @@ -47,7 +47,8 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) .index_block_size_policy(BlockSizePolicy::all(1_024)) - .open_as_blob_tree()?; + .with_kv_separation(Some(Default::default())) + .open()?; let seqno = SequenceNumberCounter::default(); @@ -58,7 +59,7 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.index.manifest.read().expect("lock is poisoned"); + let level_manifest = tree.manifest().read().expect("lock is poisoned"); let segment = level_manifest .current_version() diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index 293e1745..46c770a9 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -123,7 +123,8 @@ fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let tree = Config::new(folder) 
.data_block_size_policy(BlockSizePolicy::all(1_024)) .index_block_size_policy(BlockSizePolicy::all(1_024)) - .open_as_blob_tree()?; + .with_kv_separation(Some(Default::default())) + .open()?; let keys = [0, 1, 2] .into_iter() diff --git a/tests/tree_approx_len.rs b/tests/tree_approx_len.rs index 0c0dd2df..df359ee0 100644 --- a/tests/tree_approx_len.rs +++ b/tests/tree_approx_len.rs @@ -37,7 +37,7 @@ fn tree_approx_len_sealed() -> lsm_tree::Result<()> { fn tree_approx_len_sealed_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert!(tree.is_empty(SeqNo::MAX, None)?); @@ -131,7 +131,7 @@ fn tree_approx_len() -> lsm_tree::Result<()> { fn tree_approx_len_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert!(tree.is_empty(SeqNo::MAX, None)?); @@ -180,7 +180,7 @@ fn tree_approx_len_blob() -> lsm_tree::Result<()> { assert!(tree.is_empty(SeqNo::MAX, None)?); assert_eq!(tree.approximate_len(), 5); - tree.index.major_compact(u64::MAX, 5)?; + tree.major_compact(u64::MAX, 5)?; // Approximate count converges assert_eq!(tree.len(SeqNo::MAX, None)?, 0); diff --git a/tests/tree_count.rs b/tests/tree_count.rs index d40624ab..4f03d037 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -65,7 +65,7 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { fn tree_flushed_count_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index f36609e6..7356653d 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -38,7 +38,8 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) .index_block_size_policy(BlockSizePolicy::all(1_024)) - .open_as_blob_tree()?; + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a", "a", 0); tree.insert("b", "b", 0); @@ -85,7 +86,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { tree.compact(Arc::new(lsm_tree::compaction::SizeTiered::new(10, 8)), 1)?; assert_eq!( 1, - tree.manifest + tree.manifest() .read() .expect("asdasd") .current_version() @@ -123,7 +124,8 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) .index_block_size_policy(BlockSizePolicy::all(1_024)) - .open_as_blob_tree()?; + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("z", "z", 0); tree.flush_active_memtable(0)?; @@ -140,8 +142,7 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { tree.compact(Arc::new(lsm_tree::compaction::SizeTiered::new(10, 8)), 1)?; assert_eq!( 1, - tree.index - .manifest + tree.manifest() .read() .expect("asdasd") .current_version() diff --git a/tests/tree_drop_range.rs b/tests/tree_drop_range.rs index 11cffb65..c519fe53 100644 --- a/tests/tree_drop_range.rs +++ 
b/tests/tree_drop_range.rs @@ -1,7 +1,7 @@ -use lsm_tree::{AbstractTree, Config, SeqNo, Tree}; +use lsm_tree::{AbstractTree, AnyTree, Config, SeqNo}; use std::ops::Bound::{Excluded, Included, Unbounded}; -fn populate_segments(tree: &Tree) -> lsm_tree::Result<()> { +fn populate_segments(tree: &AnyTree) -> lsm_tree::Result<()> { for key in 'a'..='e' { tree.insert([key as u8], "", 0); tree.flush_active_memtable(0)?; diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index 10441569..a1ed93c7 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -84,7 +84,7 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 1, - tree.manifest + tree.manifest() .read() .expect("lock is poisoned") .current_version() @@ -104,7 +104,7 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 0, - tree.manifest + tree.manifest() .read() .expect("lock is poisoned") .current_version() diff --git a/tests/tree_l0_point_read.rs b/tests/tree_l0_point_read.rs index 152876e8..94fd051e 100644 --- a/tests/tree_l0_point_read.rs +++ b/tests/tree_l0_point_read.rs @@ -6,7 +6,9 @@ fn tree_l0_point_read() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a", "a", 0); tree.insert("b", "b", 0); diff --git a/tests/tree_l0_range.rs b/tests/tree_l0_range.rs index 85e6a268..ae906009 100644 --- a/tests/tree_l0_range.rs +++ b/tests/tree_l0_range.rs @@ -6,7 +6,9 @@ fn tree_l0_range_blob() -> lsm_tree::Result<()> { let folder: tempfile::TempDir = tempfile::tempdir()?; let path = folder.path(); - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a", "a", 0); tree.insert("b", "b", 0); diff --git a/tests/major_compaction.rs b/tests/tree_major_compaction.rs similarity index 100% rename from tests/major_compaction.rs rename to tests/tree_major_compaction.rs diff --git a/tests/tree_range.rs b/tests/tree_range.rs index 17cd8b39..2c6e0207 100644 --- a/tests/tree_range.rs +++ b/tests/tree_range.rs @@ -68,7 +68,7 @@ fn blob_tree_range_count() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 0); tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 1); diff --git a/tests/tree_recover_counter.rs b/tests/tree_recover_counter.rs index ac65283a..30982e9e 100644 --- a/tests/tree_recover_counter.rs +++ b/tests/tree_recover_counter.rs @@ -8,43 +8,23 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { { let tree = Config::new(&folder).open()?; - assert_eq!( - 0, - tree.0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(0, tree.next_table_id()); tree.insert("a", "a", 0); tree.flush_active_memtable(0)?; - assert_eq!( - 1, - tree.0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(1, tree.next_table_id()); tree.insert("b", "b", 0); tree.flush_active_memtable(0)?; - assert_eq!( - 2, - tree.0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + 
assert_eq!(2, tree.next_table_id()); } { let tree = Config::new(&folder).open()?; - assert_eq!( - 2, - tree.0 - .segment_id_counter - .load(std::sync::atomic::Ordering::Relaxed) - ); + assert_eq!(2, tree.next_table_id()); } Ok(()) diff --git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index af2df223..dc9de65f 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -36,7 +36,7 @@ fn tree_shadowing_upsert() -> lsm_tree::Result<()> { fn tree_shadowing_upsert_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); @@ -96,7 +96,7 @@ fn tree_shadowing_delete() -> lsm_tree::Result<()> { fn tree_shadowing_delete_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).open_as_blob_tree().unwrap(); + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); @@ -171,7 +171,7 @@ fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; let seqno = SequenceNumberCounter::default(); @@ -287,7 +287,7 @@ fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).open_as_blob_tree()?; + let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; let seqno = SequenceNumberCounter::default(); From ca6a1022815fae9a4c4aa02cd8498556fd7d0c3a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 21:01:38 +0200 Subject: [PATCH 501/613] wip --- src/vlog/blob_file/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 3d43e47e..7bc59da8 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -9,13 +9,13 @@ pub mod reader; pub mod scanner; pub mod writer; -use crate::vlog::BlobFileId; +use crate::{blob_tree::FragmentationMap, vlog::BlobFileId}; pub use meta::Metadata; use std::{path::PathBuf, sync::Arc}; /// A blob file is an immutable, sorted, contiguous file that contains large key-value pairs (blobs) #[derive(Debug)] -pub(crate) struct Inner { +pub struct Inner { /// Blob file ID pub id: BlobFileId, From 87f64efc028549a6d8adcf1fecebeee7d9c4b392 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 29 Sep 2025 21:12:59 +0200 Subject: [PATCH 502/613] fmt --- tests/blob_tree_reload_blob.rs | 20 +++++++++++++++----- tests/tree_approx_len.rs | 8 ++++++-- tests/tree_count.rs | 4 +++- tests/tree_range.rs | 4 +++- tests/tree_shadowing.rs | 16 ++++++++++++---- 5 files changed, 39 insertions(+), 13 deletions(-) diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index f42a09c8..784eb115 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -9,7 +9,9 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; { - let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(&folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, 
None).flat_map(|x| x.key()).count(), 0); @@ -25,7 +27,9 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(&folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); @@ -43,7 +47,9 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(&folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert_eq!(tree.iter(SeqNo::MAX, None).flat_map(|x| x.key()).count(), 0); @@ -69,7 +75,9 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { let seqno = SequenceNumberCounter::default(); { - let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(&folder) + .with_kv_separation(Some(Default::default())) + .open()?; for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); @@ -102,7 +110,9 @@ fn blob_tree_reload() -> lsm_tree::Result<()> { } { - let tree = Config::new(&folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(&folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, ITEM_COUNT * 2); assert_eq!( diff --git a/tests/tree_approx_len.rs b/tests/tree_approx_len.rs index df359ee0..b38e0fe2 100644 --- a/tests/tree_approx_len.rs +++ b/tests/tree_approx_len.rs @@ -37,7 +37,9 @@ fn tree_approx_len_sealed() -> lsm_tree::Result<()> { fn tree_approx_len_sealed_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert!(tree.is_empty(SeqNo::MAX, None)?); @@ -131,7 +133,9 @@ fn tree_approx_len() -> lsm_tree::Result<()> { fn tree_approx_len_blob() -> lsm_tree::Result<()> { let folder = tempdir()?; - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; assert_eq!(tree.len(SeqNo::MAX, None)?, 0); assert!(tree.is_empty(SeqNo::MAX, None)?); diff --git a/tests/tree_count.rs b/tests/tree_count.rs index 4f03d037..72d037bb 100644 --- a/tests/tree_count.rs +++ b/tests/tree_count.rs @@ -65,7 +65,9 @@ fn tree_flushed_count() -> lsm_tree::Result<()> { fn tree_flushed_count_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); diff --git a/tests/tree_range.rs b/tests/tree_range.rs index 2c6e0207..8375e24c 100644 --- a/tests/tree_range.rs +++ b/tests/tree_range.rs @@ -68,7 +68,9 @@ fn blob_tree_range_count() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; tree.insert("a".as_bytes(), nanoid::nanoid!().as_bytes(), 0); tree.insert("f".as_bytes(), nanoid::nanoid!().as_bytes(), 1); diff 
--git a/tests/tree_shadowing.rs b/tests/tree_shadowing.rs index dc9de65f..17163c61 100644 --- a/tests/tree_shadowing.rs +++ b/tests/tree_shadowing.rs @@ -36,7 +36,9 @@ fn tree_shadowing_upsert() -> lsm_tree::Result<()> { fn tree_shadowing_upsert_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); @@ -96,7 +98,9 @@ fn tree_shadowing_delete() -> lsm_tree::Result<()> { fn tree_shadowing_delete_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; let key = "1".as_bytes(); let value = "oldvalue".as_bytes(); @@ -171,7 +175,9 @@ fn tree_shadowing_range_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; let seqno = SequenceNumberCounter::default(); @@ -287,7 +293,9 @@ fn tree_shadowing_prefix_blob() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); - let tree = Config::new(folder).with_kv_separation(Some(Default::default())).open()?; + let tree = Config::new(folder) + .with_kv_separation(Some(Default::default())) + .open()?; let seqno = SequenceNumberCounter::default(); From ffaa1008878cd93f0a7d99c5b330f076baaa1a38 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 30 Sep 2025 15:12:11 +0200 Subject: [PATCH 503/613] cache shenanigans --- examples/kv/src/main.rs | 4 ++-- src/blob_tree/mod.rs | 1 + src/cache.rs | 2 +- src/config/mod.rs | 7 ++++--- src/segment/util.rs | 2 +- src/vlog/accessor.rs | 9 +++++---- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/kv/src/main.rs b/examples/kv/src/main.rs index 03874750..b25ffc12 100644 --- a/examples/kv/src/main.rs +++ b/examples/kv/src/main.rs @@ -1,6 +1,6 @@ mod wal; -use lsm_tree::{AbstractTree, Config, InternalValue, SeqNo, SequenceNumberCounter, Tree}; +use lsm_tree::{AbstractTree, AnyTree, Config, InternalValue, SeqNo, SequenceNumberCounter}; use nanoid::nanoid; use std::{ path::Path, @@ -12,7 +12,7 @@ use wal::Wal; /// Single-writer-only JSON-based KV-store. 
#[derive(Clone)] pub struct KvStore { - tree: Tree, + tree: AnyTree, wal: Wal, seqno: SequenceNumberCounter, } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 4b43456c..0daf6099 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -62,6 +62,7 @@ fn resolve_value_handle( // Resolve indirection using value log match Accessor::new(vlog).get( + tree.id(), &tree.blobs_folder, &item.key.user_key, &vptr.vhandle, diff --git a/src/cache.rs b/src/cache.rs index 1eaf09a3..22be8476 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -90,7 +90,7 @@ impl Cache { let opts = quick_cache::OptionsBuilder::new() .weight_capacity(bytes) .hot_allocation(0.9) - .estimated_items_capacity(1_000_000) + .estimated_items_capacity(1_000) .build() .expect("cache options should be valid"); diff --git a/src/config/mod.rs b/src/config/mod.rs index c392ef83..408c3f8e 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -58,7 +58,7 @@ impl TryFrom for TreeType { const DEFAULT_FILE_FOLDER: &str = ".lsm.data"; /// Options for key-value separation -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct KvSeparationOptions { /// What type of compression is used for blobs pub blob_compression: CompressionType, @@ -78,7 +78,7 @@ impl Default for KvSeparationOptions { Self { blob_compression: CompressionType::None, // TODO: LZ4 blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, - blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024, + blob_file_separation_threshold: /* 1 KiB */ 1_024, } } } @@ -177,7 +177,8 @@ pub struct Config { /// Filter construction policy pub filter_policy: FilterPolicy, - pub(crate) kv_separation_opts: Option, + #[doc(hidden)] + pub kv_separation_opts: Option, } impl Default for Config { diff --git a/src/segment/util.rs b/src/segment/util.rs index 3164bed5..c8a144a6 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -30,7 +30,7 @@ pub fn load_block( #[cfg(feature = "metrics")] use std::sync::atomic::Ordering::Relaxed; - log::trace!("load block {handle:?}"); + log::trace!("load {block_type:?} block {handle:?}"); if let Some(block) = cache.get_block(segment_id, handle.offset()) { #[cfg(feature = "metrics")] diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index 1fe4f3b3..fdf7ef2f 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -4,7 +4,7 @@ use crate::{ vlog::{blob_file::reader::Reader, BlobFileId, ValueHandle}, - BlobFile, Cache, DescriptorTable, GlobalSegmentId, UserValue, + BlobFile, Cache, DescriptorTable, GlobalSegmentId, TreeId, UserValue, }; use std::{collections::BTreeMap, fs::File, path::Path, sync::Arc}; @@ -26,13 +26,14 @@ impl<'a> Accessor<'a> { pub fn get( &self, + tree_id: TreeId, base_path: &Path, key: &[u8], vhandle: &ValueHandle, cache: &Cache, descriptor_table: &DescriptorTable, ) -> crate::Result> { - if let Some(value) = cache.get_blob(0 /* TODO: 3.0.0 tree ID... 
*/, vhandle) { + if let Some(value) = cache.get_blob(tree_id, vhandle) { return Ok(Some(value)); } @@ -40,7 +41,7 @@ impl<'a> Accessor<'a> { return Ok(None); }; - let bf_id = GlobalSegmentId::from((0 /* TODO: 3.0.0 tree ID */, blob_file.id())); + let bf_id = GlobalSegmentId::from((tree_id, blob_file.id())); let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { fd @@ -53,7 +54,7 @@ impl<'a> Accessor<'a> { }; let value = Reader::new(blob_file, &file).get(key, vhandle)?; - cache.insert_blob(0 /* TODO: 3.0.0 tree_id */, vhandle, value.clone()); + cache.insert_blob(tree_id, vhandle, value.clone()); Ok(Some(value)) } From a4497f21d1e7c5bbc38115e0b6b6e150862df69b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 30 Sep 2025 15:32:25 +0200 Subject: [PATCH 504/613] remove kv example --- .github/workflows/test.yml | 5 - compile_examples.mjs | 33 ----- examples/kv/.gitignore | 3 - examples/kv/Cargo.toml | 14 -- examples/kv/README.md | 3 - examples/kv/src/main.rs | 280 ------------------------------------- examples/kv/src/wal.rs | 138 ------------------ 7 files changed, 476 deletions(-) delete mode 100644 compile_examples.mjs delete mode 100644 examples/kv/.gitignore delete mode 100644 examples/kv/Cargo.toml delete mode 100644 examples/kv/README.md delete mode 100644 examples/kv/src/main.rs delete mode 100644 examples/kv/src/wal.rs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d0cb132f..305f6aef 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,9 +38,6 @@ jobs: uses: Swatinem/rust-cache@v2 with: prefix-key: ${{ runner.os }}-cargo - workspaces: > - . -> target - examples/kv -> target - name: Install cargo-all-features run: cargo install cargo-all-features - uses: taiki-e/install-action@nextest @@ -54,8 +51,6 @@ jobs: run: cargo nextest run --all-features - name: Run doc tests run: cargo test --doc --features lz4 - - name: Build & test LSM examples - run: node compile_examples.mjs cross: timeout-minutes: 15 name: cross diff --git a/compile_examples.mjs b/compile_examples.mjs deleted file mode 100644 index 68e8f454..00000000 --- a/compile_examples.mjs +++ /dev/null @@ -1,33 +0,0 @@ -import { spawn } from "node:child_process"; -import { readdir } from "node:fs/promises"; -import { resolve } from "node:path"; - -const examplesFolder = "examples"; - -for (const exampleName of await readdir(examplesFolder)) { - const folder = resolve(examplesFolder, exampleName); - - { - const proc = spawn("cargo test", { - cwd: folder, - shell: true, - }); - - proc.stdout.on("data", buf => console.log(String(buf))); - proc.stderr.on("data", buf => console.error(String(buf))); - - await new Promise((resolve, _) => { - proc.on("exit", () => { - if (proc.exitCode > 0) { - console.error(`${folder} FAILED`); - process.exit(1); - } - else { - resolve(); - } - }) - }); - } - - console.error(`${folder} OK`); -} diff --git a/examples/kv/.gitignore b/examples/kv/.gitignore deleted file mode 100644 index 57e6b4cd..00000000 --- a/examples/kv/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/target -.data -.test.data diff --git a/examples/kv/Cargo.toml b/examples/kv/Cargo.toml deleted file mode 100644 index cb7f3cca..00000000 --- a/examples/kv/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "simple-kv" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -lsm-tree = { path = "../.." 
} -nanoid = "0.4.0" -serde = { version = "1.0.193", features = ["derive", "rc"] } -serde_json = "1.0.108" - -[workspace] diff --git a/examples/kv/README.md b/examples/kv/README.md deleted file mode 100644 index e627ee4b..00000000 --- a/examples/kv/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# kv example - -This example uses `lsm-tree` and a really bad WAL implementation to provide a persistent KV-store. diff --git a/examples/kv/src/main.rs b/examples/kv/src/main.rs deleted file mode 100644 index b25ffc12..00000000 --- a/examples/kv/src/main.rs +++ /dev/null @@ -1,280 +0,0 @@ -mod wal; - -use lsm_tree::{AbstractTree, AnyTree, Config, InternalValue, SeqNo, SequenceNumberCounter}; -use nanoid::nanoid; -use std::{ - path::Path, - sync::Arc, - time::{Duration, Instant}, -}; -use wal::Wal; - -/// Single-writer-only JSON-based KV-store. -#[derive(Clone)] -pub struct KvStore { - tree: AnyTree, - wal: Wal, - seqno: SequenceNumberCounter, -} - -impl KvStore { - pub fn open>(path: P) -> lsm_tree::Result { - let start = Instant::now(); - let tree = Config::new(&path).open()?; - eprintln!("Recovered LSM-tree in {}s", start.elapsed().as_secs_f32()); - - let start = Instant::now(); - let (wal, memtable) = Wal::open(&path)?; - eprintln!( - "Recovered WAL + memtable in {}s", - start.elapsed().as_secs_f32() - ); - - let seqno = SequenceNumberCounter::new( - memtable - .get_highest_seqno() - .map(|x| x + 1) - .unwrap_or_default(), - ); - - tree.set_active_memtable(memtable); - - let kv = Self { tree, wal, seqno }; - - { - let tree = kv.tree.clone(); - - // Run garbage collection on interval - // - // Could use something like a semaphore that - // gets incremented after flushing instead - std::thread::spawn(move || { - loop { - eprintln!("Maybe compact"); - let strategy = lsm_tree::compaction::Levelled::default(); - - // NOTE: This is not tracking lowest safe seqno, so old versions will - // not be GC'ed - tree.compact(Arc::new(strategy), 0)?; - - std::thread::sleep(Duration::from_secs(1)); - } - Ok::<_, lsm_tree::Error>(()) - }); - } - - { - let kv = kv.clone(); - - // Keep data durable up to 1 second into the past - // Could also call wal.sync() after every insert - // But that makes inserts really slow - std::thread::spawn(move || { - loop { - std::thread::sleep(Duration::from_secs(1)); - kv.wal.sync()?; - } - Ok::<_, lsm_tree::Error>(()) - }); - } - - Ok(kv) - } - - pub fn insert, V: AsRef>( - &mut self, - key: K, - value: V, - ) -> lsm_tree::Result<()> { - let key = key.as_ref().as_bytes(); - let value = value.as_ref().as_bytes(); - let seqno = self.seqno.next(); - - self.wal.write(InternalValue::from_components( - key, - value, - seqno, - lsm_tree::ValueType::Value, - ))?; - - let (_, memtable_size) = self.tree.insert(key, value, seqno); - self.maintenance(memtable_size)?; - - Ok(()) - } - - pub fn remove>(&mut self, key: K) -> lsm_tree::Result<()> { - let key = key.as_ref().as_bytes(); - let seqno = self.seqno.next(); - - self.wal.write(InternalValue::new_tombstone(key, seqno))?; - - let (_, memtable_size) = self.tree.remove(key, seqno); - self.maintenance(memtable_size)?; - - Ok(()) - } - - pub fn force_flush(&self) -> lsm_tree::Result<()> { - eprintln!("Flushing memtable"); - self.tree.flush_active_memtable(0)?; - Ok(()) - } - - pub fn maintenance(&mut self, memtable_size: u64) -> lsm_tree::Result<()> { - // 8 MiB limit - if memtable_size > 8 * 1_024 * 1_024 { - self.force_flush()?; - - // NOTE: This is not safe and should not be used in a real implementation: - // If the system crashes >here<, the WAL 
will still exist, and on - // recovery we will have the flushed data on disk AND in the memtable - // only to get flushed again - // - // You can try and think of a way of how to solve this ;) - - // NOTE: Because we are doing synchronous flushing, we can safely - // truncate the log, as we now know all data is flushed to segments. - self.wal.truncate()?; - } - - if self.tree.level_segment_count(0).unwrap_or_default() > 16 { - eprintln!("Stalling writes..."); - std::thread::sleep(Duration::from_millis(100)); - } - - while self.tree.level_segment_count(0).unwrap_or_default() > 20 { - eprintln!("Halting writes until L0 is cleared up..."); - } - - Ok(()) - } - - pub fn get>(&self, key: K) -> lsm_tree::Result>> { - Ok(self.tree.get(key.as_ref(), SeqNo::MAX)?.map(|bytes| { - std::str::from_utf8(&bytes) - .expect("should be valid utf-8") - .into() - })) - } - - pub fn contains_key>(&self, key: K) -> lsm_tree::Result { - self.tree.contains_key(key.as_ref(), SeqNo::MAX) - } - - pub fn is_empty(&self) -> lsm_tree::Result { - self.tree.is_empty(SeqNo::MAX, None) - } - - pub fn len(&self) -> lsm_tree::Result { - self.tree.len(SeqNo::MAX, None) - } -} - -const ITEM_COUNT: usize = 1_000_000; - -fn main() -> lsm_tree::Result<()> { - let mut kv = KvStore::open(".data")?; - - eprintln!("Counting items"); - eprintln!("Recovered LSM-tree with {} items", kv.len()?); - - if !kv.contains_key("my-key-1")? { - kv.insert("my-key-1", "my-value-1")?; - } - if !kv.contains_key("my-key-2")? { - kv.insert("my-key-2", "my-value-2")?; - } - if !kv.contains_key("my-key-3")? { - kv.insert("my-key-3", "my-value-3")?; - } - - eprintln!("Getting items"); - - assert_eq!(Some("my-value-1"), kv.get("my-key-1")?.as_deref()); - assert_eq!(Some("my-value-2"), kv.get("my-key-2")?.as_deref()); - assert_eq!(Some("my-value-3"), kv.get("my-key-3")?.as_deref()); - - eprintln!("Remove 3 items"); - kv.remove("my-key-1")?; - kv.remove("my-key-2")?; - kv.remove("my-key-3")?; - - assert!(!kv.contains_key("my-key-1")?); - assert!(!kv.contains_key("my-key-2")?); - assert!(!kv.contains_key("my-key-3")?); - - eprintln!("Counting items"); - let remaining_item_count = ITEM_COUNT - kv.len()?; - - eprintln!("Bulk loading {remaining_item_count} items"); - let start = Instant::now(); - - for idx in 0..remaining_item_count { - kv.insert(nanoid!(), nanoid!())?; - - if idx % 1_000_000 == 0 { - eprintln!("Written {idx} items"); - } - } - eprintln!("Took: {}s", start.elapsed().as_secs_f32()); - - eprintln!("Counting items"); - assert_eq!(ITEM_COUNT, kv.len()?); - - while kv.tree.is_compacting() { - eprintln!("Waiting for compaction..."); - std::thread::sleep(Duration::from_secs(1)); - } - - eprintln!("Counting items"); - assert_eq!(ITEM_COUNT, kv.len()?); - - eprintln!("All good"); - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_simple_kv() -> lsm_tree::Result<()> { - if Path::new(".test-data").try_exists()? 
{ - std::fs::remove_dir_all(".test-data")?; - } - - // Write some data - { - let mut kv = KvStore::open(".test-data")?; - - kv.insert("my-key-1", "my-value-1")?; - kv.insert("my-key-2", "my-value-2")?; - kv.insert("my-key-3", "my-value-3")?; - - assert_eq!(Some("my-value-1"), kv.get("my-key-1")?.as_deref()); - assert_eq!(Some("my-value-2"), kv.get("my-key-2")?.as_deref()); - assert_eq!(Some("my-value-3"), kv.get("my-key-3")?.as_deref()); - - assert_eq!(3, kv.len()?); - - kv.remove("my-key-2")?; - - kv.force_flush()?; - } - - // Recover from disk - { - let kv = KvStore::open(".test-data")?; - - assert_eq!(Some("my-value-1"), kv.get("my-key-1")?.as_deref()); - assert_eq!(None, kv.get("my-key-2")?); - assert_eq!(Some("my-value-3"), kv.get("my-key-3")?.as_deref()); - - assert_eq!(2, kv.len()?); - } - - Ok(()) - } -} diff --git a/examples/kv/src/wal.rs b/examples/kv/src/wal.rs deleted file mode 100644 index bc3fd7ef..00000000 --- a/examples/kv/src/wal.rs +++ /dev/null @@ -1,138 +0,0 @@ -use lsm_tree::{InternalValue, Memtable, SeqNo, ValueType}; -use serde::{Deserialize, Serialize}; -use std::io::{Seek, Write}; -use std::sync::Mutex; -use std::{ - fs::{File, OpenOptions}, - io::{BufRead, BufReader}, - path::Path, - sync::Arc, -}; - -#[derive(Deserialize, Serialize)] -pub struct WalEntry { - #[serde(rename = "k")] - key: Arc, - - #[serde(rename = "v")] - value: Arc, - - #[serde(rename = "s")] - seqno: SeqNo, - - #[serde(rename = "t")] - value_type: u8, -} - -impl From for InternalValue { - fn from(entry: WalEntry) -> Self { - Self::from_components( - entry.key, - entry.value, - entry.seqno, - ValueType::try_from(entry.value_type).unwrap(), - ) - } -} - -impl From for WalEntry { - fn from(entry: InternalValue) -> Self { - Self { - key: std::str::from_utf8(&entry.key.user_key) - .expect("should be valid utf-8") - .into(), - value: std::str::from_utf8(&entry.value) - .expect("should be valid utf-8") - .into(), - seqno: entry.key.seqno, - value_type: entry.key.value_type.into(), - } - } -} - -/// Simple JSON-based single-writer-only WAL. -#[derive(Clone)] -pub struct Wal { - writer: Arc>, -} - -impl Wal { - pub fn open>(path: P) -> lsm_tree::Result<(Wal, Memtable)> { - let path = path.as_ref(); - let wal_path = path.join(".wal.jsonl"); - - if wal_path.try_exists()? 
{ - let memtable = recover_wal(&wal_path)?; - let writer = OpenOptions::new().append(true).open(&wal_path)?; - let writer = Arc::new(Mutex::new(writer)); - - let wal = Self { writer }; - - Ok((wal, memtable)) - } else { - let writer = OpenOptions::new() - .write(true) - .create_new(true) - .open(&wal_path)?; - let writer = Arc::new(Mutex::new(writer)); - - let wal = Self { writer }; - Ok((wal, Memtable::default())) - } - } - - pub fn write(&mut self, value: InternalValue) -> lsm_tree::Result<()> { - let mut writer = self.writer.lock().expect("lock is poisoned"); - - let wal_entry: WalEntry = value.into(); - let str = serde_json::to_string(&wal_entry).expect("should serialize"); - writeln!(&mut writer, "{str}")?; - - Ok(()) - } - - pub fn sync(&self) -> lsm_tree::Result<()> { - let writer = self.writer.lock().expect("lock is poisoned"); - writer.sync_all()?; - Ok(()) - } - - pub fn truncate(&mut self) -> lsm_tree::Result<()> { - let mut writer = self.writer.lock().expect("lock is poisoned"); - writer.seek(std::io::SeekFrom::Start(0))?; - writer.set_len(0)?; - writer.sync_all()?; - Ok(()) - } -} - -fn recover_wal<P: AsRef<Path>>(path: P) -> lsm_tree::Result<Memtable> { - eprintln!("Recovering WAL"); - - let memtable = Memtable::default(); - - let wal_path = path.as_ref(); - let file = File::open(wal_path)?; - let file = BufReader::new(file); - - let mut cnt = 0; - - for (idx, line) in file.lines().enumerate() { - let line = line?; - if line.is_empty() { - break; - } - - let Ok(entry) = serde_json::from_str::<WalEntry>(&line) else { - eprintln!("Truncating WAL to line {idx} because of malformed content"); - break; - }; - - memtable.insert(entry.into()); - cnt += 1; - } - - eprintln!("Recovered {cnt} items from WAL"); - - Ok(memtable) -} From 889bc3be060c6570a17431b119d155ef1a152808 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:02:25 +0200 Subject: [PATCH 505/613] update blob file writer, scanner etc.
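Each blob entry now persists its sequence number, all fixed-width integers switch from big-endian to little-endian, and the blob file metadata records the compression type so readers and scanners know how to decompress. The fixed part of the per-blob header is therefore 4 + 16 + 8 + 2 + 4 + 4 = 38 bytes (magic, checksum, seqno, key length, uncompressed value length, on-disk value length). A rough sketch of parsing the new framing; the helper name is illustrative and not part of this patch, but field order and widths follow Writer::write below:

use byteorder::{LittleEndian, ReadBytesExt};
use std::io::Read;

// Sketch only: reads the fixed-size part of a blob entry header.
// [MAGIC; 4B] [checksum; 16B] [seqno; 8B] [key len; 2B] [real val len; 4B] [on-disk val len; 4B]
fn read_blob_header<R: Read>(reader: &mut R) -> std::io::Result<(u128, u64, u16, u32, u32)> {
    let mut magic = [0u8; 4];
    reader.read_exact(&mut magic)?;
    assert_eq!(&magic, b"BLOB"); // BLOB_HEADER_MAGIC

    let checksum = reader.read_u128::<LittleEndian>()?; // xxh3-128 over key + uncompressed value
    let seqno = reader.read_u64::<LittleEndian>()?;
    let key_len = reader.read_u16::<LittleEndian>()?;
    let real_val_len = reader.read_u32::<LittleEndian>()?; // uncompressed length
    let on_disk_val_len = reader.read_u32::<LittleEndian>()?; // possibly compressed length

    Ok((checksum, seqno, key_len, real_val_len, on_disk_val_len))
}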
--- src/vlog/blob_file/merge.rs | 81 ++++++++++------ src/vlog/blob_file/meta.rs | 25 +++-- src/vlog/blob_file/mod.rs | 35 ++++++- src/vlog/blob_file/multi_writer.rs | 17 ++-- src/vlog/blob_file/reader.rs | 149 +++++++++++++++++++++++++++-- src/vlog/blob_file/scanner.rs | 134 +++++++++++++++++++------- src/vlog/blob_file/writer.rs | 55 +++++++---- src/vlog/mod.rs | 17 +++- 8 files changed, 399 insertions(+), 114 deletions(-) diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs index 4f20f4cd..4891c7ee 100644 --- a/src/vlog/blob_file/merge.rs +++ b/src/vlog/blob_file/merge.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::scanner::Scanner as BlobFileScanner; -use crate::{vlog::BlobFileId, Checksum, UserKey, UserValue}; +use crate::vlog::{blob_file::scanner::ScanEntry, BlobFileId}; use interval_heap::IntervalHeap; use std::cmp::Reverse; @@ -12,15 +12,13 @@ type IteratorIndex = usize; #[derive(Debug)] struct IteratorValue { index: IteratorIndex, - key: UserKey, - value: UserValue, + scan_entry: ScanEntry, blob_file_id: BlobFileId, - checksum: Checksum, } impl PartialEq for IteratorValue { fn eq(&self, other: &Self) -> bool { - self.key == other.key + self.scan_entry.key == other.scan_entry.key } } impl Eq for IteratorValue {} @@ -33,7 +31,8 @@ impl PartialOrd for IteratorValue { impl Ord for IteratorValue { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - (&self.key, Reverse(&self.blob_file_id)).cmp(&(&other.key, Reverse(&other.blob_file_id))) + (&self.scan_entry.key, Reverse(&self.scan_entry.seqno)) + .cmp(&(&other.scan_entry.key, Reverse(&other.scan_entry.seqno))) } } @@ -57,15 +56,13 @@ impl MergeScanner { let reader = &mut self.readers[idx]; if let Some(value) = reader.next() { - let (k, v, checksum) = value?; + let scan_entry = value?; let blob_file_id = reader.blob_file_id; self.heap.push(IteratorValue { index: idx, - key: k, - value: v, blob_file_id, - checksum, + scan_entry, }); } @@ -82,7 +79,7 @@ impl MergeScanner { } impl Iterator for MergeScanner { - type Item = crate::Result<(UserKey, UserValue, BlobFileId, Checksum)>; + type Item = crate::Result<(ScanEntry, BlobFileId)>; fn next(&mut self) -> Option { if self.heap.is_empty() { @@ -91,20 +88,7 @@ impl Iterator for MergeScanner { if let Some(head) = self.heap.pop_min() { fail_iter!(self.advance_reader(head.index)); - - // Discard old items - while let Some(next) = self.heap.pop_min() { - if next.key == head.key { - fail_iter!(self.advance_reader(next.index)); - } else { - // Reached next user key now - // Push back non-conflicting item and exit - self.heap.push(next); - break; - } - } - - return Some(Ok((head.key, head.value, head.blob_file_id, head.checksum))); + return Some(Ok((head.scan_entry, head.blob_file_id))); } None @@ -120,12 +104,51 @@ mod tests { use tempfile::tempdir; use test_log::test; + #[test] + fn blob_file_merger_seqno() -> crate::Result<()> { + let dir = tempdir()?; + + let blob_file_path = dir.path().join("0"); + { + { + let mut writer = BlobFileWriter::new(&blob_file_path, 0)?; + + writer.write(b"a", 1, &b"1".repeat(100))?; + writer.write(b"a", 0, &b"0".repeat(100))?; + + writer.finish()?; + } + } + + { + let mut merger = MergeScanner::new(vec![Scanner::new(&blob_file_path, 0)?]); + + assert_eq!( + (Slice::from(b"a"), Slice::from(b"1".repeat(100))), + merger + .next() + .map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) + .unwrap()?, + ); + assert_eq!( + (Slice::from(b"a"), Slice::from(b"0".repeat(100))), + merger + .next() + 
.map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) + .unwrap()?, + ); + + assert!(merger.next().is_none()); + } + + Ok(()) + } + #[test] fn blob_file_merger() -> crate::Result<()> { let dir = tempdir()?; let blob_file_0_path = dir.path().join("0"); - let blob_file_1_path = dir.path().join("1"); { @@ -135,7 +158,7 @@ mod tests { let mut writer = BlobFileWriter::new(&blob_file_0_path, 0)?; for key in keys { - writer.write(key, &key.repeat(100))?; + writer.write(key, 0, &key.repeat(100))?; } writer.finish()?; @@ -149,7 +172,7 @@ mod tests { let mut writer = BlobFileWriter::new(&blob_file_1_path, 1)?; for key in keys { - writer.write(key, &key.repeat(100))?; + writer.write(key, 1, &key.repeat(100))?; } writer.finish()?; @@ -169,7 +192,7 @@ mod tests { (Slice::from(key), Slice::from(key.repeat(100))), merger .next() - .map(|result| result.map(|(k, v, _, _)| { (k, v) })) + .map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) .unwrap()?, ); } diff --git a/src/vlog/blob_file/meta.rs b/src/vlog/blob_file/meta.rs index 82950dbf..6ce27d70 100644 --- a/src/vlog/blob_file/meta.rs +++ b/src/vlog/blob_file/meta.rs @@ -4,15 +4,16 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, - KeyRange, + CompressionType, KeyRange, }; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; pub const METADATA_HEADER_MAGIC: &[u8] = b"META"; #[derive(Debug)] pub struct Metadata { + // TODO: 3.0.0 created at, so we can do age-based compaction /// Number of KV-pairs in the blob file pub item_count: u64, @@ -24,6 +25,9 @@ pub struct Metadata { /// Key range pub key_range: KeyRange, + + /// Compression type used for all blobs in this file + pub compression: CompressionType, } impl Encode for Metadata { @@ -34,12 +38,14 @@ impl Encode for Metadata { // Checksum type (always 0x0 = XXH3) writer.write_u8(0x0)?; - writer.write_u64::(self.item_count)?; - writer.write_u64::(self.compressed_bytes)?; - writer.write_u64::(self.total_uncompressed_bytes)?; + writer.write_u64::(self.item_count)?; + writer.write_u64::(self.compressed_bytes)?; + writer.write_u64::(self.total_uncompressed_bytes)?; self.key_range.encode_into(writer)?; + self.compression.encode_into(writer)?; + Ok(()) } } @@ -59,17 +65,20 @@ impl Decode for Metadata { return Err(DecodeError::InvalidTag(("BlobFileChecksum", checksum_type))); } - let item_count = reader.read_u64::()?; - let compressed_bytes = reader.read_u64::()?; - let total_uncompressed_bytes = reader.read_u64::()?; + let item_count = reader.read_u64::()?; + let compressed_bytes = reader.read_u64::()?; + let total_uncompressed_bytes = reader.read_u64::()?; let key_range = KeyRange::decode_from(reader)?; + let compression = CompressionType::decode_from(reader)?; + Ok(Self { item_count, compressed_bytes, total_uncompressed_bytes, key_range, + compression, }) } } diff --git a/src/vlog/blob_file/mod.rs b/src/vlog/blob_file/mod.rs index 7bc59da8..b21b6b77 100644 --- a/src/vlog/blob_file/mod.rs +++ b/src/vlog/blob_file/mod.rs @@ -11,7 +11,10 @@ pub mod writer; use crate::{blob_tree::FragmentationMap, vlog::BlobFileId}; pub use meta::Metadata; -use std::{path::PathBuf, sync::Arc}; +use std::{ + path::PathBuf, + sync::{atomic::AtomicBool, Arc}, +}; /// A blob file is an immutable, sorted, contiguous file that contains large key-value pairs (blobs) #[derive(Debug)] @@ -24,7 +27,29 @@ pub struct Inner { /// Statistics pub meta: Metadata, - // TODO: is_deleted, on Drop, like SST 
segments + + /// Whether this blob file is deleted (logically) + pub is_deleted: AtomicBool, +} + +impl Drop for Inner { + fn drop(&mut self) { + if self.is_deleted.load(std::sync::atomic::Ordering::Acquire) { + log::trace!( + "Cleanup deleted blob file {:?} at {}", + self.id, + self.path.display(), + ); + + if let Err(e) = std::fs::remove_file(&*self.path) { + log::warn!( + "Failed to cleanup deleted blob file {:?} at {}: {e:?}", + self.id, + self.path.display(), + ); + } + } + } } /// A blob file stores large values and is part of the value log @@ -46,6 +71,12 @@ impl std::hash::Hash for BlobFile { } impl BlobFile { + pub(crate) fn mark_as_deleted(&self) { + self.0 + .is_deleted + .store(true, std::sync::atomic::Ordering::Release); + } + /// Returns the blob file ID. #[must_use] pub fn id(&self) -> BlobFileId { diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index 0942549f..51e81eb0 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -8,11 +8,11 @@ use crate::{ blob_file::{Inner as BlobFileInner, Metadata}, BlobFileId, }, - BlobFile, CompressionType, SequenceNumberCounter, + BlobFile, CompressionType, SeqNo, SequenceNumberCounter, }; use std::{ path::{Path, PathBuf}, - sync::Arc, + sync::{atomic::AtomicBool, Arc}, }; /// Blob file writer, may write multiple blob files @@ -113,6 +113,7 @@ impl MultiWriter { ); let blob_file = BlobFile(Arc::new(BlobFileInner { + is_deleted: AtomicBool::new(false), id: blob_file_id, path: writer.path.clone(), meta: Metadata { @@ -133,6 +134,7 @@ impl MultiWriter { .clone() .expect("should have written at least 1 item"), )), + compression: writer.compression, }, })); @@ -161,19 +163,12 @@ impl MultiWriter { /// # Errors /// /// Will return `Err` if an IO error occurs. 
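(A hedged usage sketch of the new `write` signature; the setup mirrors the reader round-trip tests further below, and the target size is an arbitrary example value:)

let id_generator = SequenceNumberCounter::default();
let mut writer = BlobFileWriter::new(id_generator, /* target size */ 64 * 1_024 * 1_024, folder.path())?;

let blob_file_id = writer.blob_file_id(); // file the next blob will land in
let offset = writer.offset(); // capture before writing
let on_disk_size = writer.write(b"my-key", /* seqno */ 42, b"my large value")?;
// (blob_file_id, offset, on_disk_size) is exactly what a ValueHandle stores

let blob_files = writer.finish()?;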
- pub fn write<K: AsRef<[u8]>, V: AsRef<[u8]>>( - &mut self, - key: K, - value: V, - ) -> crate::Result<u32> { - let key = key.as_ref(); - let value = value.as_ref(); - + pub fn write(&mut self, key: &[u8], seqno: SeqNo, value: &[u8]) -> crate::Result<u32> { let target_size = self.target_size; // Write actual value into blob file let writer = &mut self.active_writer; - let bytes_written = writer.write(key, value)?; + let bytes_written = writer.write(key, seqno, value)?; // Check for blob file size target, maybe rotate to next writer if writer.offset() >= target_size { diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 6c95e95f..0d206c68 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -2,11 +2,19 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +use byteorder::{LittleEndian, ReadBytesExt}; + use crate::{ - vlog::{blob_file::writer::BLOB_HEADER_LEN, ValueHandle}, - BlobFile, UserValue, + vlog::{ + blob_file::writer::{BLOB_HEADER_LEN, BLOB_HEADER_MAGIC}, + ValueHandle, + }, + BlobFile, Checksum, CompressionType, UserValue, +}; +use std::{ + fs::File, + io::{Cursor, Read, Seek}, }; -use std::fs::File; /// Reads a single blob from a blob file pub struct Reader<'a> { @@ -22,14 +30,141 @@ impl<'a> Reader<'a> { pub fn get(&self, key: &'a [u8], vhandle: &'a ValueHandle) -> crate::Result<UserValue> { debug_assert_eq!(vhandle.blob_file_id, self.blob_file.id()); - let offset = vhandle.offset + (BLOB_HEADER_LEN as u64) + (key.len() as u64); + let add_size = (BLOB_HEADER_LEN as u64) + (key.len() as u64); + + let value = crate::file::read_exact( + self.file, + vhandle.offset, + (u64::from(vhandle.on_disk_size) + add_size) as usize, + )?; + + let mut reader = Cursor::new(&value[..]); + + let mut magic = [0u8; 4]; + reader.read_exact(&mut magic)?; + + if magic != BLOB_HEADER_MAGIC { + return Err(crate::Error::Decode(crate::DecodeError::InvalidHeader( + "Blob", + ))); + } + + let expected_checksum = reader.read_u128::<LittleEndian>()?; + + let _seqno = reader.read_u64::<LittleEndian>()?; + let key_len = reader.read_u16::<LittleEndian>()? as usize; + let real_val_len = reader.read_u32::<LittleEndian>()? as usize; + let _on_disk_val_len = reader.read_u32::<LittleEndian>()? as usize; - let value = crate::file::read_exact(self.file, offset, vhandle.on_disk_size as usize)?; - - // TODO: decompress?
save compression type into blob_file.meta + let raw_data = value.slice((add_size as usize)..); + + #[warn(clippy::match_single_binding)] + let value = match &self.blob_file.0.meta.compression { + CompressionType::None => raw_data, + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => { + #[warn(unsafe_code)] + let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; + + lz4_flex::decompress_into(&raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(self.blob_file.0.meta.compression))?; + + builder.freeze().into() + } + }; + + { + let checksum = { + let mut hasher = xxhash_rust::xxh3::Xxh3::default(); + hasher.update(key); + hasher.update(&value); + hasher.digest128() + }; + + if expected_checksum != checksum { + return Err(crate::Error::ChecksumMismatch { + got: Checksum::from_raw(checksum), + expected: Checksum::from_raw(expected_checksum), + }); + } + } Ok(value) } } -// TODO: unit test +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use crate::SequenceNumberCounter; + use test_log::test; + + #[test] + fn blob_reader_roundtrip() -> crate::Result<()> { + let id_generator = SequenceNumberCounter::default(); + + let folder = tempfile::tempdir()?; + let mut writer = + crate::vlog::BlobFileWriter::new(id_generator, u64::MAX, folder.path()).unwrap(); + + let offset = writer.offset(); + let on_disk_size = writer.write(b"a", 0, b"abcdef")?; + let handle = ValueHandle { + blob_file_id: 0, + offset, + on_disk_size, + }; + + let blob_file = writer.finish()?; + let blob_file = blob_file.first().unwrap(); + + let file = File::open(&blob_file.0.path)?; + let reader = Reader::new(blob_file, &file); + + assert_eq!(reader.get(b"a", &handle)?, b"abcdef"); + + Ok(()) + } + + #[test] + #[cfg(feature = "lz4")] + fn blob_reader_roundtrip_lz4() -> crate::Result<()> { + let id_generator = SequenceNumberCounter::default(); + + let folder = tempfile::tempdir()?; + let mut writer = crate::vlog::BlobFileWriter::new(id_generator, u64::MAX, folder.path()) + .unwrap() + .use_compression(CompressionType::Lz4); + + let offset = writer.offset(); + let on_disk_size = writer.write(b"a", 0, b"abcdef")?; + let handle0 = ValueHandle { + blob_file_id: 0, + offset, + on_disk_size, + }; + + let offset = writer.offset(); + let on_disk_size = writer.write(b"b", 0, b"ghi")?; + let handle1 = ValueHandle { + blob_file_id: 0, + offset, + on_disk_size, + }; + + let blob_file = writer.finish()?; + let blob_file = blob_file.first().unwrap(); + + let file = File::open(&blob_file.0.path)?; + let reader = Reader::new(blob_file, &file); + + assert_eq!(reader.get(b"a", &handle0)?, b"abcdef"); + assert_eq!(reader.get(b"b", &handle1)?, b"ghi"); + + Ok(()) + } +} diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 38a34929..916f2c81 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -3,11 +3,13 @@ // (found in the LICENSE-* files in the repository) use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC}; -use crate::{coding::DecodeError, vlog::BlobFileId, Checksum, CompressionType, UserKey, UserValue}; -use byteorder::{BigEndian, ReadBytesExt}; +use crate::{ + coding::DecodeError, vlog::BlobFileId, Checksum, CompressionType, SeqNo, UserKey, UserValue, +}; +use byteorder::{LittleEndian, ReadBytesExt}; use std::{ fs::File, - io::{BufReader, Read}, + io::{BufReader, Read, Seek}, path::Path, }; @@ -45,24 +47,26 @@ impl Scanner { self.compression = compression; self } +} - // pub(crate) fn get_offset(&mut self) -> 
std::io::Result { - // self.inner.stream_position() - // } - - // pub(crate) fn into_inner(self) -> BufReader { - // self.inner - // } +#[derive(Debug)] +pub struct ScanEntry { + pub key: UserKey, + pub seqno: SeqNo, + pub value: UserValue, + pub offset: u64, } impl Iterator for Scanner { - type Item = crate::Result<(UserKey, UserValue, Checksum)>; + type Item = crate::Result; fn next(&mut self) -> Option { if self.is_terminated { return None; } + let offset = fail_iter!(self.inner.stream_position()); + { let mut buf = [0; BLOB_HEADER_MAGIC.len()]; fail_iter!(self.inner.read_exact(&mut buf)); @@ -79,35 +83,58 @@ impl Iterator for Scanner { } } - let checksum = fail_iter!(self.inner.read_u128::()); + let expected_checksum = fail_iter!(self.inner.read_u128::()); + let seqno = fail_iter!(self.inner.read_u64::()); - let key_len = fail_iter!(self.inner.read_u16::()); - let real_val_len = fail_iter!(self.inner.read_u32::()); - let on_disk_val_len = fail_iter!(self.inner.read_u32::()); + let key_len = fail_iter!(self.inner.read_u16::()); + let real_val_len = fail_iter!(self.inner.read_u32::()); + let on_disk_val_len = fail_iter!(self.inner.read_u32::()); let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize)); - // TODO: finish compression + let raw_data = fail_iter!(UserValue::from_reader( + &mut self.inner, + on_disk_val_len as usize + )); + #[warn(clippy::match_single_binding)] - let val = match &self.compression { - _ => { - fail_iter!(UserValue::from_reader( - &mut self.inner, - on_disk_val_len as usize - )) + let value = match &self.compression { + CompressionType::None => raw_data, + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => { + #[warn(unsafe_code)] + let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; + + fail_iter!(lz4_flex::decompress_into(&raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(self.compression))); + + builder.freeze().into() } }; - // Some(compressor) => { - // // TODO: https://github.com/PSeitz/lz4_flex/issues/166 - // let mut val = vec![0; val_len as usize]; - // fail_iter!(self.inner.read_exact(&mut val)); - // UserValue::from(fail_iter!(compressor.decompress(&val))) - // } - // None => { - // } + { + let checksum = { + let mut hasher = xxhash_rust::xxh3::Xxh3::default(); + hasher.update(&key); + hasher.update(&value); + hasher.digest128() + }; + + if expected_checksum != checksum { + return Some(Err(crate::Error::ChecksumMismatch { + got: Checksum::from_raw(checksum), + expected: Checksum::from_raw(expected_checksum), + })); + } + } - Some(Ok((key, val, Checksum::from_raw(checksum)))) + Some(Ok(ScanEntry { + key, + seqno, + value, + offset, + })) } } @@ -120,7 +147,7 @@ mod tests { use test_log::test; #[test] - fn blob_file_scanner() -> crate::Result<()> { + fn blob_scanner() -> crate::Result<()> { let dir = tempdir()?; let blob_file_path = dir.path().join("0"); @@ -130,7 +157,7 @@ mod tests { let mut writer = BlobFileWriter::new(&blob_file_path, 0)?; for key in keys { - writer.write(key, &key.repeat(100))?; + writer.write(key, 0, &key.repeat(100))?; } writer.finish()?; @@ -144,7 +171,46 @@ mod tests { (Slice::from(key), Slice::from(key.repeat(100))), scanner .next() - .map(|result| result.map(|(k, v, _)| { (k, v) })) + .map(|result| result.map(|entry| { (entry.key, entry.value) })) + .unwrap()?, + ); + } + + assert!(scanner.next().is_none()); + } + + Ok(()) + } + + #[test] + #[cfg(feature = "lz4")] + fn blob_scanner_lz4() -> crate::Result<()> { + let dir = tempdir()?; + let 
blob_file_path = dir.path().join("0"); + + let keys = [b"a", b"b", b"c", b"d", b"e"]; + + { + let mut writer = + BlobFileWriter::new(&blob_file_path, 0)?.use_compression(CompressionType::Lz4); + + for key in keys { + writer.write(key, 0, &key.repeat(100))?; + } + + writer.finish()?; + } + + { + let mut scanner = + Scanner::new(&blob_file_path, 0)?.use_compression(CompressionType::Lz4); + + for key in keys { + assert_eq!( + (Slice::from(key), Slice::from(key.repeat(100))), + scanner + .next() + .map(|result| result.map(|entry| { (entry.key, entry.value) })) .unwrap()?, ); } diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index d7fded07..8354f532 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -3,8 +3,8 @@ // (found in the LICENSE-* files in the repository) use super::meta::Metadata; -use crate::{coding::Encode, vlog::BlobFileId, CompressionType, KeyRange, UserKey}; -use byteorder::{BigEndian, WriteBytesExt}; +use crate::{coding::Encode, vlog::BlobFileId, CompressionType, KeyRange, SeqNo, UserKey}; +use byteorder::{LittleEndian, WriteBytesExt}; use std::{ io::Write, path::{Path, PathBuf}, }; pub const BLOB_HEADER_MAGIC: &[u8] = b"BLOB"; pub const BLOB_HEADER_LEN: usize = BLOB_HEADER_MAGIC.len() - + std::mem::size_of::<u128>() - + std::mem::size_of::<u16>() - + std::mem::size_of::<u32>() - + std::mem::size_of::<u32>(); + + std::mem::size_of::<u128>() // Checksum + + std::mem::size_of::<SeqNo>() // SeqNo + + std::mem::size_of::<u16>() // Key length + + std::mem::size_of::<u32>() // Real value length + + std::mem::size_of::<u32>(); // On-disk value length /// Blob file writer pub struct Writer { @@ -98,7 +99,7 @@ impl Writer { /// # Panics /// /// Panics if the key is empty, the key length is greater than 2^16, or the value length is greater than 2^32.
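(The same checksum rule is inlined by the writer below and by the reader and scanner above; shown here as a hedged standalone helper for clarity, the function itself is not part of this patch:)

fn blob_checksum(key: &[u8], uncompressed_value: &[u8]) -> u128 {
    // xxh3-128 over the raw key bytes followed by the uncompressed value bytes;
    // the read path therefore verifies the checksum after decompression
    let mut hasher = xxhash_rust::xxh3::Xxh3::default();
    hasher.update(key);
    hasher.update(uncompressed_value);
    hasher.digest128()
}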
- pub fn write(&mut self, key: &[u8], value: &[u8]) -> crate::Result<u32> { + pub fn write(&mut self, key: &[u8], seqno: SeqNo, value: &[u8]) -> crate::Result<u32> { assert!(!key.is_empty()); assert!(u16::try_from(key.len()).is_ok()); assert!(u32::try_from(value.len()).is_ok()); @@ -115,6 +116,7 @@ // // [MAGIC_BYTES; 4B] // [Checksum; 16B] + // [Seqno; 8B] // [key len; 2B] // [real val len; 4B] // [on-disk val len; 4B] @@ -124,38 +126,49 @@ // Write header self.writer.write_all(BLOB_HEADER_MAGIC)?; - let mut hasher = xxhash_rust::xxh3::Xxh3::new(); - hasher.update(key); - hasher.update(value); - let checksum = hasher.digest128(); + let checksum = { + let mut hasher = xxhash_rust::xxh3::Xxh3::default(); + hasher.update(key); + hasher.update(value); + hasher.digest128() + }; // Write checksum - self.writer.write_u128::<BigEndian>(checksum)?; + self.writer.write_u128::<LittleEndian>(checksum)?; + + // Write seqno + self.writer.write_u64::<LittleEndian>(seqno)?; // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.writer.write_u16::<BigEndian>(key.len() as u16)?; + self.writer.write_u16::<LittleEndian>(key.len() as u16)?; + + // Write uncompressed value length // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.writer.write_u32::<BigEndian>(value.len() as u32)?; + self.writer.write_u32::<LittleEndian>(value.len() as u32)?; - // TODO: finish compression - #[warn(clippy::match_single_binding)] let value = match &self.compression { - _ => value, + CompressionType::None => std::borrow::Cow::Borrowed(value), + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => std::borrow::Cow::Owned(lz4_flex::compress(value)), }; + // Write compressed (on-disk) value length + // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.writer.write_u32::<BigEndian>(value.len() as u32)?; + self.writer.write_u32::<LittleEndian>(value.len() as u32)?; self.writer.write_all(key)?; - self.writer.write_all(value)?; + self.writer.write_all(&value)?; // Update offset self.offset += BLOB_HEADER_MAGIC.len() as u64; self.offset += std::mem::size_of::<u128>() as u64; + self.offset += std::mem::size_of::<SeqNo>() as u64; self.offset += std::mem::size_of::<u16>() as u64; self.offset += std::mem::size_of::<u32>() as u64; @@ -168,6 +181,9 @@ self.written_blob_bytes += value.len() as u64; self.item_count += 1; + // TODO: 3.0.0 if we store the offset before writing, we can return a vhandle here + // TODO: instead of needing to call offset() and blob_file_id() before write() + // NOTE: Truncation is okay #[allow(clippy::cast_possible_truncation)] Ok(value.len() as u32) @@ -189,6 +205,7 @@ .clone() .expect("should have written at least 1 item"), )), + compression: self.compression, }; metadata.encode_into(&mut self.writer)?; diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 4feea768..1ba77c9b 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -7,15 +7,19 @@ pub mod blob_file; mod handle; pub use { - accessor::Accessor, blob_file::multi_writer::MultiWriter as BlobFileWriter, - blob_file::BlobFile, handle::ValueHandle, + accessor::Accessor, blob_file::merge::MergeScanner as BlobFileMergeScanner, + blob_file::multi_writer::MultiWriter as BlobFileWriter, + blob_file::scanner::Scanner as BlobFileScanner, blob_file::BlobFile, handle::ValueHandle, }; use crate::{ coding::Decode, vlog::blob_file::{Inner as BlobFileInner, Metadata}, }; -use std::{path::Path, sync::Arc}; +use std::{ + path::Path, + sync::{atomic::AtomicBool, Arc}, +}; pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) ->
crate::Result> { let cnt = ids.len(); @@ -46,7 +50,12 @@ pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result Date: Sat, 4 Oct 2025 17:02:31 +0200 Subject: [PATCH 506/613] wip --- src/value_type.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/value_type.rs b/src/value_type.rs index 5d24450f..2fabea55 100644 --- a/src/value_type.rs +++ b/src/value_type.rs @@ -2,10 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -// TODO: remove MaybeInlineValue because we can just store values flat and look at key instead - -// TODO: add ValueType::is_vhandle - /// Value type (regular value or tombstone) #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] From 2f17836684fd37f2fce5af77d981d8cfe07b5db4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:02:42 +0200 Subject: [PATCH 507/613] rename test case --- tests/blob_major_compact_relink.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/blob_major_compact_relink.rs b/tests/blob_major_compact_relink.rs index e834b152..c917dff7 100644 --- a/tests/blob_major_compact_relink.rs +++ b/tests/blob_major_compact_relink.rs @@ -2,7 +2,7 @@ use lsm_tree::{AbstractTree, SeqNo}; use test_log::test; #[test] -fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { +fn blob_tree_major_compact_relink() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); From 2ffceee75a1f306878003f13e707eca0d74121aa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:03:23 +0200 Subject: [PATCH 508/613] change key range binary encoding --- src/key_range.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/key_range.rs b/src/key_range.rs index d1e1b52a..0062ac2c 100644 --- a/src/key_range.rs +++ b/src/key_range.rs @@ -6,7 +6,7 @@ use crate::{ coding::{Decode, DecodeError, Encode, EncodeError}, Slice, UserKey, }; -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::{ io::{Read, Write}, ops::Bound, @@ -166,12 +166,12 @@ impl Encode for KeyRange { // NOTE: Max key size = u16 #[allow(clippy::cast_possible_truncation)] - writer.write_u16::(min.len() as u16)?; + writer.write_u16::(min.len() as u16)?; writer.write_all(min)?; // NOTE: Max key size = u16 #[allow(clippy::cast_possible_truncation)] - writer.write_u16::(max.len() as u16)?; + writer.write_u16::(max.len() as u16)?; writer.write_all(max)?; Ok(()) @@ -180,10 +180,10 @@ impl Encode for KeyRange { impl Decode for KeyRange { fn decode_from(reader: &mut R) -> Result { - let key_min_len = reader.read_u16::()?; + let key_min_len = reader.read_u16::()?; let key_min: UserKey = Slice::from_reader(reader, key_min_len.into())?; - let key_max_len = reader.read_u16::()?; + let key_max_len = reader.read_u16::()?; let key_max: UserKey = Slice::from_reader(reader, key_max_len.into())?; Ok(Self::new((key_min, key_max))) From 238e6bb8e5bcc18a4469af7463d5e8cd99da2d79 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:04:01 +0200 Subject: [PATCH 509/613] big compaction refactor --- src/compaction/flavour.rs | 377 ++++++++++++++++++++++ src/compaction/mod.rs | 1 + src/compaction/worker.rs | 641 ++++++++++++++++++++++++++------------ 3 files changed, 818 insertions(+), 201 deletions(-) create mode 100644 src/compaction/flavour.rs diff --git a/src/compaction/flavour.rs 
b/src/compaction/flavour.rs new file mode 100644 index 00000000..04309e27 --- /dev/null +++ b/src/compaction/flavour.rs @@ -0,0 +1,377 @@ +use std::iter::Peekable; +use std::time::Instant; + +use crate::blob_tree::handle::BlobIndirection; +use crate::blob_tree::FragmentationMap; +use crate::coding::{Decode, Encode}; +use crate::compaction::worker::Options; +use crate::compaction::Input as CompactionPayload; +use crate::file::SEGMENTS_FOLDER; +use crate::level_manifest::LevelManifest; +use crate::segment::multi_writer::MultiWriter; +use crate::version::Version; +use crate::vlog::{BlobFileId, BlobFileMergeScanner, BlobFileWriter}; +use crate::{BlobFile, HashSet, InternalValue, Segment}; + +pub(super) fn prepare_table_writer( + version: &Version, + opts: &Options, + payload: &CompactionPayload, +) -> crate::Result { + let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER); + + let dst_lvl = payload.canonical_level.into(); + + let data_block_size = opts.config.data_block_size_policy.get(dst_lvl); + let index_block_size = opts.config.index_block_size_policy.get(dst_lvl); + + let data_block_restart_interval = opts.config.data_block_restart_interval_policy.get(dst_lvl); + let index_block_restart_interval = opts.config.index_block_restart_interval_policy.get(dst_lvl); + + let data_block_compression = opts.config.data_block_compression_policy.get(dst_lvl); + let index_block_compression = opts.config.index_block_compression_policy.get(dst_lvl); + + let data_block_hash_ratio = opts.config.data_block_hash_ratio_policy.get(dst_lvl); + + let table_writer = MultiWriter::new( + segments_base_folder, + opts.segment_id_generator.clone(), + payload.target_size, + )?; + + let last_level = (version.level_count() - 1) as u8; + let is_last_level = payload.dest_level == last_level; + + log::debug!( + "Compacting tables {:?} into L{} (canonical L{}), data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}", + payload.segment_ids, + payload.dest_level, + payload.canonical_level, + opts.eviction_seqno, + ); + + Ok(table_writer + .use_data_block_restart_interval(data_block_restart_interval) + .use_index_block_restart_interval(index_block_restart_interval) + .use_data_block_compression(data_block_compression) + .use_data_block_size(data_block_size) + .use_index_block_size(index_block_size) + .use_data_block_hash_ratio(data_block_hash_ratio) + .use_index_block_compression(index_block_compression) + .use_bloom_policy({ + use crate::config::FilterPolicyEntry::{Bloom, None}; + use crate::segment::filter::BloomConstructionPolicy; + + if is_last_level && opts.config.expect_point_read_hits { + BloomConstructionPolicy::BitsPerKey(0.0) + } else { + match opts + .config + .filter_policy + .get(usize::from(payload.dest_level)) + { + Bloom(policy) => policy, + None => BloomConstructionPolicy::BitsPerKey(0.0), + } + } + })) +} + +// TODO: 3.0.0 find a good name +pub(super) trait CompactionFlavour { + fn write(&mut self, item: InternalValue) -> crate::Result<()>; + + fn finish( + self: Box, + levels: &mut LevelManifest, + opts: &Options, + payload: &CompactionPayload, + dst_lvl: usize, + blob_frag_map: FragmentationMap, + ) -> crate::Result<()>; +} + +/// Compaction worker that will relocate blobs that sit in blob files that are being rewritten +pub struct 
RelocatingCompaction { + inner: StandardCompaction, + blob_scanner: Peekable, + blob_writer: BlobFileWriter, + rewriting_blob_file_ids: HashSet, + rewriting_blob_files: Vec, +} + +impl CompactionFlavour for RelocatingCompaction { + fn write(&mut self, item: InternalValue) -> crate::Result<()> { + if item.key.value_type.is_indirection() { + let mut reader = &item.value[..]; + + let Ok(mut indirection) = BlobIndirection::decode_from(&mut reader) else { + log::error!("Failed to deserialize blob indirection: {item:?}"); + return Ok(()); + }; + + log::debug!( + "{:?}:{} => encountered indirection: {indirection:?}", + item.key.user_key, + item.key.seqno, + ); + + if self + .rewriting_blob_file_ids + .contains(&indirection.vhandle.blob_file_id) + { + loop { + // TODO: uglyyyy + let blob = self + .blob_scanner + .peek() + .expect("should have enough blob entries"); + + if let Ok((entry, blob_file_id)) = blob { + if self.rewriting_blob_file_ids.contains(blob_file_id) { + // This blob is part of the rewritten blob files + if entry.key < item.key.user_key { + self.blob_scanner.next().expect("should exist")?; + continue; + } + + if entry.key == item.key.user_key { + if *blob_file_id < indirection.vhandle.blob_file_id { + self.blob_scanner.next().expect("should exist")?; + continue; + } + if entry.offset < indirection.vhandle.offset { + self.blob_scanner.next().expect("should exist")?; + continue; + } + if entry.offset == indirection.vhandle.offset { + // This is the blob we need + break; + } + } + assert!( + (entry.key > item.key.user_key), + "we passed vptr without getting blob", + ); + break; + } + + break; + } + + let e = self.blob_scanner.next().expect("should exist"); + return Err(e.expect_err("should be error")); + } + + let blob = self.blob_scanner.next().expect("should have blob")?; + + log::info!( + "=> use blob: {:?}:{} offset: {} from BF {}", + blob.0.key, + blob.0.seqno, + blob.0.offset, + blob.1, + ); + + indirection.vhandle.blob_file_id = self.blob_writer.blob_file_id(); + indirection.vhandle.offset = self.blob_writer.offset(); + + log::debug!("RELOCATE to {indirection:?}"); + + self.blob_writer + .write(&item.key.user_key, item.key.seqno, &blob.0.value)?; + + self.inner + .table_writer + .write(InternalValue::from_components( + item.key.user_key, + indirection.encode_into_vec(), + item.key.seqno, + crate::ValueType::Indirection, + ))?; + } else { + // This blob is not part of the rewritten blob files + // So just pass it through + log::trace!("Pass through {indirection:?} because it is not being relocated"); + self.inner.table_writer.register_blob(indirection); + self.inner.table_writer.write(item)?; + } + } else { + self.inner.table_writer.write(item)?; + } + + Ok(()) + } + + fn finish( + mut self: Box, + levels: &mut LevelManifest, + opts: &Options, + payload: &CompactionPayload, + dst_lvl: usize, + blob_frag_map: FragmentationMap, + ) -> crate::Result<()> { + log::debug!( + "Relocating compaction done in {:?}", + self.inner.start.elapsed(), + ); + + let table_ids_to_delete = std::mem::take(&mut self.inner.tables_to_rewrite); + + let created_tables = self.inner.consume_writer(opts, dst_lvl)?; + let created_blob_files = self.blob_writer.finish()?; + + let blob_file_ids_to_drop = self.rewriting_blob_file_ids; + + levels.atomic_swap( + |current| { + current.with_merge( + &payload.segment_ids.iter().copied().collect::>(), + &created_tables, + payload.dest_level as usize, + if blob_frag_map.is_empty() { + None + } else { + Some(blob_frag_map) + }, + created_blob_files, + 
blob_file_ids_to_drop, + ) + }, + opts.eviction_seqno, + )?; + + // NOTE: If the application were to crash >here< it's fine + // The tables/blob files are not referenced anymore, and will be + // cleaned up upon recovery + for table in table_ids_to_delete { + table.mark_as_deleted(); + } + + for blob_file in self.rewriting_blob_files { + blob_file.mark_as_deleted(); + } + + Ok(()) + } +} + +impl RelocatingCompaction { + pub fn new( + inner: StandardCompaction, + blob_scanner: Peekable, + blob_writer: BlobFileWriter, + rewriting_blob_file_ids: HashSet, + rewriting_blob_files: Vec, + ) -> Self { + Self { + inner, + blob_scanner, + blob_writer, + rewriting_blob_file_ids, + rewriting_blob_files, + } + } +} + +/// Standard compaction worker that just passes through all its data +pub struct StandardCompaction { + start: Instant, + table_writer: MultiWriter, + tables_to_rewrite: Vec, +} + +impl StandardCompaction { + pub fn new(table_writer: MultiWriter, tables_to_rewrite: Vec) -> Self { + Self { + start: Instant::now(), + table_writer, + tables_to_rewrite, + } + } + + fn register_blob(&mut self, indirection: BlobIndirection) { + self.table_writer.register_blob(indirection); + } + + fn consume_writer(self, opts: &Options, dst_lvl: usize) -> crate::Result> { + let segments_base_folder = self.table_writer.base_path.clone(); + + let pin_filter = opts.config.filter_block_pinning_policy.get(dst_lvl); + let pin_index = opts.config.filter_block_pinning_policy.get(dst_lvl); + + let writer_results = self.table_writer.finish()?; + + let created_segments = writer_results + .into_iter() + .map(|segment_id| -> crate::Result { + Segment::recover( + segments_base_folder.join(segment_id.to_string()), + opts.tree_id, + opts.config.cache.clone(), + opts.config.descriptor_table.clone(), + pin_filter, + pin_index, + #[cfg(feature = "metrics")] + opts.metrics.clone(), + ) + }) + .collect::>>()?; + + Ok(created_segments) + } +} + +impl CompactionFlavour for StandardCompaction { + fn write(&mut self, item: InternalValue) -> crate::Result<()> { + if item.key.value_type.is_indirection() { + let mut reader = &item.value[..]; + let indirection = BlobIndirection::decode_from(&mut reader)?; + self.register_blob(indirection); + } + + self.table_writer.write(item) + } + + fn finish( + mut self: Box, + levels: &mut LevelManifest, + opts: &Options, + payload: &CompactionPayload, + dst_lvl: usize, + blob_frag_map: FragmentationMap, + ) -> crate::Result<()> { + log::debug!("Compaction done in {:?}", self.start.elapsed()); + + let table_ids_to_delete = std::mem::take(&mut self.tables_to_rewrite); + + let created_segments = self.consume_writer(opts, dst_lvl)?; + + levels.atomic_swap( + |current| { + current.with_merge( + &payload.segment_ids.iter().copied().collect::>(), + &created_segments, + payload.dest_level as usize, + if blob_frag_map.is_empty() { + None + } else { + Some(blob_frag_map) + }, + Vec::default(), + HashSet::default(), + ) + }, + opts.eviction_seqno, + )?; + + // NOTE: If the application were to crash >here< it's fine + // The tables are not referenced anymore, and will be + // cleaned up upon recovery + for table in table_ids_to_delete { + table.mark_as_deleted(); + } + + Ok(()) + } +} diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 03283260..f69938e4 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -8,6 +8,7 @@ pub(crate) mod fifo; pub(crate) mod leveled; // pub(crate) mod maintenance; pub(crate) mod drop_range; +mod flavour; pub(crate) mod major; pub(crate) mod movedown; 
pub(crate) mod pulldown; diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 0984e364..6e17a6a9 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -4,17 +4,21 @@ use super::{CompactionStrategy, Input as CompactionPayload}; use crate::{ - blob_tree::{handle::BlobIndirection, FragmentationMap}, - coding::Decode, - compaction::{stream::CompactionStream, Choice}, - file::SEGMENTS_FOLDER, + blob_tree::FragmentationMap, + compaction::{ + flavour::{RelocatingCompaction, StandardCompaction}, + stream::CompactionStream, + Choice, + }, + file::BLOBS_FOLDER, level_manifest::LevelManifest, merge::Merger, run_scanner::RunScanner, - segment::{multi_writer::MultiWriter, Segment}, stop_signal::StopSignal, tree::inner::TreeId, - AbstractTree, Config, InternalValue, SegmentId, SeqNo, TreeType, + vlog::{BlobFileMergeScanner, BlobFileScanner, BlobFileWriter}, + AbstractTree, BlobFile, Config, HashSet, InternalValue, SegmentId, SeqNo, + SequenceNumberCounter, }; use std::{ sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, @@ -30,7 +34,9 @@ pub type CompactionReader<'a> = Box, + pub segment_id_generator: Arc, // TODO: change segment_id_generator to be SequenceNumberCounter + + pub blob_file_id_generator: SequenceNumberCounter, /// Configuration of tree. pub config: Config, @@ -57,6 +63,7 @@ impl Options { Self { tree_id: tree.id, segment_id_generator: tree.segment_id_counter.clone(), + blob_file_id_generator: tree.blob_file_id_generator.clone(), config: tree.config.clone(), levels: tree.manifest().clone(), stop_signal: tree.stop_signal.clone(), @@ -222,32 +229,6 @@ fn merge_segments( return Ok(()); }; - let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER); - - let dst_lvl = payload.canonical_level.into(); - - let data_block_size = opts.config.data_block_size_policy.get(dst_lvl); - let index_block_size = opts.config.index_block_size_policy.get(dst_lvl); - - let data_block_restart_interval = opts.config.data_block_restart_interval_policy.get(dst_lvl); - let index_block_restart_interval = opts.config.index_block_restart_interval_policy.get(dst_lvl); - - let data_block_compression = opts.config.data_block_compression_policy.get(dst_lvl); - let index_block_compression = opts.config.index_block_compression_policy.get(dst_lvl); - - let pin_filter = opts.config.filter_block_pinning_policy.get(dst_lvl); - let pin_index = opts.config.filter_block_pinning_policy.get(dst_lvl); - - let data_block_hash_ratio = opts.config.data_block_hash_ratio_policy.get(dst_lvl); - - log::debug!( - "Compacting segments {:?} into L{} (canonical L{}), data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}", - payload.segment_ids, - payload.dest_level, - payload.canonical_level, - opts.eviction_seqno, - ); - let mut blob_frag_map = FragmentationMap::default(); let Some(mut merge_iter) = create_compaction_stream( @@ -262,115 +243,358 @@ fn merge_segments( return Ok(()); }; + let dst_lvl = payload.canonical_level.into(); let last_level = levels.last_level_index(); - levels.hide_segments(payload.segment_ids.iter().copied()); - - // IMPORTANT: Free lock so the compaction (which may go on for a while) - // does not block possible other compactions and reads - drop(levels); - // NOTE: Only evict tombstones when reaching the last level, 
// That way we don't resurrect data beneath the tombstone let is_last_level = payload.dest_level == last_level; - let start = Instant::now(); + let table_writer = + super::flavour::prepare_table_writer(levels.current_version(), opts, payload)?; + + let mut compactor = match &opts.config.kv_separation_opts { + Some(blob_opts) => { + merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); + + let version = levels.current_version(); + + let blob_files_to_rewrite = { + // TODO: 3.0.0 vvv if blob gc is disabled, skip this part vvv + + // TODO: 3.0.0 unit test and optimize... somehow + let mut linked_blob_files = payload + .segment_ids + .iter() + .map(|&id| version.get_segment(id).expect("table should exist")) + .filter_map(|x| x.get_linked_blob_files().expect("handle error")) + .flatten() + .map(|blob_file_ref| { + version + .value_log + .get(&blob_file_ref.blob_file_id) + .expect("blob file should exist") + }) + .filter(|blob_file| { + blob_file.is_stale(version.gc_stats(), 0.25 /* TODO: option */) + }) + .collect::>() + .into_iter() + .collect::>(); + + linked_blob_files.sort_by_key(|a| a.id()); + // TODO: 3.0.0 ^- age cutoff + + // TODO: 3.0.0 remove + log::debug!( + "maybe rewrite blob files: {:#?}", + linked_blob_files + .iter() + .map(|bf| bf.id()) + .collect::>(), + ); + + // NOTE: If there is any table not part of our compaction input + // that also points to the blob file, we cannot rewrite the blob file + for table in version.iter_segments() { + if payload.segment_ids.contains(&table.id()) { + continue; + } + + let other_ref = table + .get_linked_blob_files() + .expect("should not fail") + .unwrap_or_default(); + + let other_ref = other_ref + .iter() + .find(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id)); + + if let Some(other_ref) = other_ref { + linked_blob_files.retain(|x| x.id() != other_ref.blob_file_id); + } + } - let segment_writer = match MultiWriter::new( - segments_base_folder.clone(), - opts.segment_id_generator.clone(), - payload.target_size, - ) { - Ok(v) => v, - Err(e) => { - log::error!("Compaction failed: {e:?}"); - - // IMPORTANT: Show the segments again, because compaction failed - opts.levels - .write() - .expect("lock is poisoned") - .show_segments(payload.segment_ids.iter().copied()); + linked_blob_files.into_iter().cloned().collect::>() + }; - return Ok(()); + if blob_files_to_rewrite.is_empty() { + Box::new(StandardCompaction::new(table_writer, segments)) + as Box + } else { + log::debug!( + "relocate blob files: {:#?}", + blob_files_to_rewrite + .iter() + .map(BlobFile::id) + .collect::>(), + ); + + let scanner = BlobFileMergeScanner::new( + blob_files_to_rewrite + .iter() + .map(|bf| { + Ok(BlobFileScanner::new(&bf.0.path, bf.id())? 
+ .use_compression(bf.0.meta.compression)) + }) + .collect::>>()?, + ); + + // TODO: we need to relocate blob files without decompressing + // TODO: BUT the meta needs to store the compression type + let writer = BlobFileWriter::new( + opts.blob_file_id_generator.clone(), + blob_opts.blob_file_target_size, + opts.config.path.join(BLOBS_FOLDER), + )?; + + let inner = StandardCompaction::new(table_writer, segments); + + Box::new(RelocatingCompaction::new( + inner, + scanner.peekable(), + writer, + blob_files_to_rewrite.iter().map(BlobFile::id).collect(), + blob_files_to_rewrite, + )) + } } + None => Box::new(StandardCompaction::new(table_writer, segments)), }; - let mut segment_writer = segment_writer - .use_data_block_restart_interval(data_block_restart_interval) - .use_index_block_restart_interval(index_block_restart_interval) - .use_data_block_compression(data_block_compression) - .use_data_block_size(data_block_size) - .use_index_block_size(index_block_size) - .use_data_block_hash_ratio(data_block_hash_ratio) - .use_index_block_compression(index_block_compression) - .use_bloom_policy({ - use crate::config::FilterPolicyEntry::{Bloom, None}; - use crate::segment::filter::BloomConstructionPolicy; - - if is_last_level && opts.config.expect_point_read_hits { - BloomConstructionPolicy::BitsPerKey(0.0) - } else { - match opts - .config - .filter_policy - .get(usize::from(payload.dest_level)) - { - Bloom(policy) => policy, - None => BloomConstructionPolicy::BitsPerKey(0.0), - } - } - }); + // // NOTE: If we are a blob tree, install callback to listen for evicted KVs + // let mut blob_stuff = if let Some(blob_opts) = &opts.config.kv_separation_opts { + // merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); + + // // TODO: 3.0.0 vvv if blob gc is disabled, skip this part vvv + + // let version = levels.current_version(); + + // // TODO: 3.0.0 unit test and optimize... somehow + // let mut linked_blob_files = payload + // .segment_ids + // .iter() + // .map(|&id| version.get_segment(id).expect("table should exist")) + // .filter_map(|x| x.get_linked_blob_files().expect("handle error")) + // .flatten() + // .map(|blob_file_ref| { + // version + // .value_log + // .get(&blob_file_ref.blob_file_id) + // .expect("blob file should exist") + // }) + // .filter(|blob_file| { + // eprintln!("stale? 
{}", blob_file.id()); + // blob_file.is_stale(version.gc_stats(), 0.25 /* TODO: option */) + // }) + // .collect::>(); + + // eprintln!( + // "maybe rewrite blob files: {:#?}", + // linked_blob_files + // .iter() + // .map(|bf| bf.id()) + // .collect::>(), + // ); + + // // NOTE: If there is any table not part of our compaction input + // // that also points to the blob file, we cannot rewrite the blob file + // for table in version.iter_segments() { + // if payload.segment_ids.contains(&table.id()) { + // continue; + // } + + // let other_ref = table + // .get_linked_blob_files() + // .expect("should not fail") + // .unwrap_or_default(); + + // let other_ref = other_ref + // .iter() + // .find(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id)); + + // if let Some(other_ref) = other_ref { + // linked_blob_files.retain(|x| x.id() != other_ref.blob_file_id); + // } + // } + + // let rewritten_blob_file_ids = linked_blob_files + // .iter() + // .map(|bf| bf.id()) + // .collect::>(); + + // // TODO: be sure to actually remove blob file IDs from to-be-created Version + // eprintln!("rewrite blob files: {rewritten_blob_file_ids:#?}"); + + // if linked_blob_files.is_empty() { + // None + // } else { + // let scanner = BlobFileMergeScanner::new( + // linked_blob_files + // .iter() + // .map(|bf| { + // Ok(BlobFileScanner::new(&bf.0.path, bf.id())? + // .use_compression(bf.0.meta.compression)) + // }) + // .collect::>>()?, + // ); + + // // TODO: we need to relocate blob files without decompressing + // // TODO: BUT the meta needs to store the compression type + // let writer = BlobFileWriter::new( + // opts.blob_file_id_generator.clone(), + // blob_opts.blob_file_target_size, + // opts.config.path.join(BLOBS_FOLDER), + // )?; + + // Some((scanner.peekable(), writer, rewritten_blob_file_ids)) + // } + // } else { + // None + // }; - // NOTE: If we are a blob tree, install callback to listen for evicted KVs - if opts.config.kv_separation_opts.is_some() { - merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); - } + levels.hide_segments(payload.segment_ids.iter().copied()); + + // IMPORTANT: Free lock so the compaction (which may go on for a while) + // does not block possible other compactions and reads + drop(levels); + + // TODO: guard hidden segments (somehow) for (idx, item) in merge_iter.enumerate() { - let item = match item { - Ok(v) => v, - Err(e) => { - log::error!("Compaction failed: {e:?}"); - - // IMPORTANT: Show the segments again, because compaction failed - opts.levels - .write() - .expect("lock is poisoned") - .show_segments(payload.segment_ids.iter().copied()); - - return Ok(()); - } - }; + let item = item?; // IMPORTANT: We can only drop tombstones when writing into last level if is_last_level && item.is_tombstone() { continue; } - if item.key.value_type.is_indirection() { - let mut reader = &item.value[..]; - - let Ok(indirection) = BlobIndirection::decode_from(&mut reader) else { - log::error!("Failed to deserialize blob indirection: {item:?}"); - return Ok(()); - }; - - // TODO: 3.0.0 -> IF we have a blob writer, use the active_blob_file ID instead (rewriting the vptr) - - segment_writer.register_blob(indirection); - } - - if let Err(e) = segment_writer.write(item) { - log::error!("Compaction failed: {e:?}"); - - // IMPORTANT: Show the segments again, because compaction failed - opts.levels - .write() - .expect("lock is poisoned") - .show_segments(payload.segment_ids.iter().copied()); - - return Ok(()); - } + compactor.write(item)?; + + // if 
item.key.value_type.is_indirection() { + // let mut reader = &item.value[..]; + + // let Ok(mut indirection) = BlobIndirection::decode_from(&mut reader) else { + // log::error!("Failed to deserialize blob indirection: {item:?}"); + // return Ok(()); + // }; + + // log::debug!( + // "{:?}:{} => encountered indirection: {indirection:?}", + // item.key.user_key, + // item.key.seqno, + // ); + + // // Check if we are actually doing blob garbage collection + // if let Some(blob_stuff) = &mut blob_stuff { + // if blob_stuff.2.contains(&indirection.vhandle.blob_file_id) { + // loop { + // let blob = blob_stuff + // .0 + // .peek() + // .expect("should have enough blob entries"); + + // if let Ok((entry, blob_file_id)) = blob { + // if blob_stuff.2.contains(blob_file_id) { + // // This blob is part of the rewritten blob files + // if entry.key < item.key.user_key { + // blob_stuff.0.next().expect("should exist"); + // continue; + // } + + // if entry.key == item.key.user_key { + // if *blob_file_id < indirection.vhandle.blob_file_id { + // blob_stuff.0.next().expect("should exist"); + // continue; + // } + // if entry.offset < indirection.vhandle.offset { + // blob_stuff.0.next().expect("should exist"); + // continue; + // } + // if entry.offset == indirection.vhandle.offset { + // // This is the blob we need + // break; + // } + // } + // assert!( + // (entry.key > item.key.user_key), + // "we passed vptr without getting blob", + // ); + // break; + // } + + // break; + // } else { + // let e = blob_stuff.0.next().expect("should exist"); + // return Err(e.expect_err("should be error")); + // } + // } + + // let blob = blob_stuff + // .0 + // .next() + // .expect("should have blob") + // .expect("should work TODO:"); + + // log::info!( + // "=> use blob: {:?}:{} offset: {} from BF {}", + // blob.0.key, + // blob.0.seqno, + // blob.0.offset, + // blob.1, + // ); + + // indirection.vhandle.blob_file_id = blob_stuff.1.blob_file_id(); + // indirection.vhandle.offset = blob_stuff.1.offset(); + + // log::debug!("RELOCATE to {indirection:?}"); + + // blob_stuff + // .1 + // .write(&item.key.user_key, item.key.seqno, &blob.0.value) + // .expect("should work TODO:"); + + // // TODO: 3.0.0, we REALLY need to change this function to allow for ? 
+ + // if let Err(e) = table_writer.write(InternalValue::from_components( + // item.key.user_key, + // indirection.encode_into_vec(), + // item.key.seqno, + // crate::ValueType::Indirection, + // )) { + // log::error!("Compaction failed: {e:?}"); + + // // IMPORTANT: Show the segments again, because compaction failed + // opts.levels + // .write() + // .expect("lock is poisoned") + // .show_segments(payload.segment_ids.iter().copied()); + + // return Ok(()); + // } + + // continue 'table_iter; + // } + + // // This blob is not part of the rewritten blob files + // // So just pass it through + // log::trace!("Pass through {indirection:?} because it is not being relocated"); + // } + + // table_writer.register_blob(indirection); + // } + + // if let Err(e) = table_writer.write(item) { + // log::error!("Compaction failed: {e:?}"); + + // // IMPORTANT: Show the segments again, because compaction failed + // opts.levels + // .write() + // .expect("lock is poisoned") + // .show_segments(payload.segment_ids.iter().copied()); + + // return Ok(()); + // } if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { log::debug!("compactor: stopping amidst compaction because of stop signal"); @@ -378,57 +602,51 @@ fn merge_segments( } } - let writer_results = match segment_writer.finish() { - Ok(v) => v, - Err(e) => { - log::error!("Compaction failed: {e:?}"); - - // IMPORTANT: Show the segments again, because compaction failed - opts.levels - .write() - .expect("lock is poisoned") - .show_segments(payload.segment_ids.iter().copied()); - - return Ok(()); - } - }; - - log::debug!( - "Compacted in {:?} ({} segments created)", - start.elapsed(), - writer_results.len(), - ); - - let created_segments = writer_results - .into_iter() - .map(|segment_id| -> crate::Result { - Segment::recover( - segments_base_folder.join(segment_id.to_string()), - opts.tree_id, - opts.config.cache.clone(), - opts.config.descriptor_table.clone(), - pin_filter, - pin_index, - #[cfg(feature = "metrics")] - opts.metrics.clone(), - ) - }) - .collect::>>(); - - let created_segments = match created_segments { - Ok(v) => v, - Err(e) => { - log::error!("Compaction failed: {e:?}"); - - // IMPORTANT: Show the segments again, because compaction failed - opts.levels - .write() - .expect("lock is poisoned") - .show_segments(payload.segment_ids.iter().copied()); - - return Ok(()); - } - }; + // let writer_results = match table_writer.finish() { + // Ok(v) => v, + // Err(e) => { + // log::error!("Compaction failed: {e:?}"); + + // // IMPORTANT: Show the segments again, because compaction failed + // opts.levels + // .write() + // .expect("lock is poisoned") + // .show_segments(payload.segment_ids.iter().copied()); + + // return Ok(()); + // } + // }; + + // let created_segments = writer_results + // .into_iter() + // .map(|segment_id| -> crate::Result { + // Segment::recover( + // segments_base_folder.join(segment_id.to_string()), + // opts.tree_id, + // opts.config.cache.clone(), + // opts.config.descriptor_table.clone(), + // pin_filter, + // pin_index, + // #[cfg(feature = "metrics")] + // opts.metrics.clone(), + // ) + // }) + // .collect::>>(); + + // let created_segments = match created_segments { + // Ok(v) => v, + // Err(e) => { + // log::error!("Compaction failed: {e:?}"); + + // // IMPORTANT: Show the segments again, because compaction failed + // opts.levels + // .write() + // .expect("lock is poisoned") + // .show_segments(payload.segment_ids.iter().copied()); + + // return Ok(()); + // } + // }; // NOTE: Mind lock order L -> M -> 
S log::trace!("compactor: acquiring levels manifest write lock"); @@ -437,34 +655,52 @@ fn merge_segments( log::trace!("Blob fragmentation diff: {blob_frag_map:#?}"); - let swap_result = levels.atomic_swap( - |current| { - current.with_merge( - &payload.segment_ids.iter().copied().collect::>(), - &created_segments, - payload.dest_level as usize, - if blob_frag_map.is_empty() { - None - } else { - Some(blob_frag_map) - }, - ) - }, - opts.eviction_seqno, - ); - - if let Err(e) = swap_result { - // IMPORTANT: Show the segments again, because compaction failed - levels.show_segments(payload.segment_ids.iter().copied()); - return Err(e); - } - - // NOTE: If the application were to crash >here< it's fine - // The segments are not referenced anymore, and will be - // cleaned up upon recovery - for segment in segments { - segment.mark_as_deleted(); - } + compactor.finish(&mut levels, opts, payload, dst_lvl, blob_frag_map)?; + + // let (writer, rewritten_blob_file_ids) = if let Some(blob_stuff) = blob_stuff { + // (Some(blob_stuff.1), Some(blob_stuff.2)) + // } else { + // (None, None) + // }; + + // let swap_result = levels.atomic_swap( + // |current| { + // current.with_merge( + // &payload.segment_ids.iter().copied().collect::>(), + // &created_segments, + // payload.dest_level as usize, + // if blob_frag_map.is_empty() { + // None + // } else { + // Some(blob_frag_map) + // }, + // { + // if let Some(writer) = writer { + // writer.finish().expect("should work TODO:") + // } else { + // vec![] + // } + // }, + // rewritten_blob_file_ids.unwrap_or_default(), + // ) + // }, + // opts.eviction_seqno, + // ); + + // if let Err(e) = swap_result { + // // IMPORTANT: Show the segments again, because compaction failed + // levels.show_segments(payload.segment_ids.iter().copied()); + // return Err(e); + // } + + // TODO: fwiw also add all dead blob files + // TODO: mark blob files deleted + /* + blob_file + .0 + .is_deleted + .store(true, std::sync::atomic::Ordering::Release); + */ levels.show_segments(payload.segment_ids.iter().copied()); @@ -520,6 +756,9 @@ fn drop_segments( segment.mark_as_deleted(); } + // TODO: fwiw also add all dead blob files + // TODO: look if any blob files can be trivially deleted as well + if let Err(e) = levels.maintenance(opts.eviction_seqno) { log::error!("Manifest maintenance failed: {e:?}"); return Err(e); From 74e58af1b43d9d27b389eb79b6efd91eaa81f3f9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:05:01 +0200 Subject: [PATCH 510/613] refactor --- src/compaction/worker.rs | 307 --------------------------------------- 1 file changed, 307 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 6e17a6a9..63c3ec71 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -362,97 +362,6 @@ fn merge_segments( None => Box::new(StandardCompaction::new(table_writer, segments)), }; - // // NOTE: If we are a blob tree, install callback to listen for evicted KVs - // let mut blob_stuff = if let Some(blob_opts) = &opts.config.kv_separation_opts { - // merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); - - // // TODO: 3.0.0 vvv if blob gc is disabled, skip this part vvv - - // let version = levels.current_version(); - - // // TODO: 3.0.0 unit test and optimize... 
somehow - // let mut linked_blob_files = payload - // .segment_ids - // .iter() - // .map(|&id| version.get_segment(id).expect("table should exist")) - // .filter_map(|x| x.get_linked_blob_files().expect("handle error")) - // .flatten() - // .map(|blob_file_ref| { - // version - // .value_log - // .get(&blob_file_ref.blob_file_id) - // .expect("blob file should exist") - // }) - // .filter(|blob_file| { - // eprintln!("stale? {}", blob_file.id()); - // blob_file.is_stale(version.gc_stats(), 0.25 /* TODO: option */) - // }) - // .collect::>(); - - // eprintln!( - // "maybe rewrite blob files: {:#?}", - // linked_blob_files - // .iter() - // .map(|bf| bf.id()) - // .collect::>(), - // ); - - // // NOTE: If there is any table not part of our compaction input - // // that also points to the blob file, we cannot rewrite the blob file - // for table in version.iter_segments() { - // if payload.segment_ids.contains(&table.id()) { - // continue; - // } - - // let other_ref = table - // .get_linked_blob_files() - // .expect("should not fail") - // .unwrap_or_default(); - - // let other_ref = other_ref - // .iter() - // .find(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id)); - - // if let Some(other_ref) = other_ref { - // linked_blob_files.retain(|x| x.id() != other_ref.blob_file_id); - // } - // } - - // let rewritten_blob_file_ids = linked_blob_files - // .iter() - // .map(|bf| bf.id()) - // .collect::>(); - - // // TODO: be sure to actually remove blob file IDs from to-be-created Version - // eprintln!("rewrite blob files: {rewritten_blob_file_ids:#?}"); - - // if linked_blob_files.is_empty() { - // None - // } else { - // let scanner = BlobFileMergeScanner::new( - // linked_blob_files - // .iter() - // .map(|bf| { - // Ok(BlobFileScanner::new(&bf.0.path, bf.id())? 
- // .use_compression(bf.0.meta.compression)) - // }) - // .collect::>>()?, - // ); - - // // TODO: we need to relocate blob files without decompressing - // // TODO: BUT the meta needs to store the compression type - // let writer = BlobFileWriter::new( - // opts.blob_file_id_generator.clone(), - // blob_opts.blob_file_target_size, - // opts.config.path.join(BLOBS_FOLDER), - // )?; - - // Some((scanner.peekable(), writer, rewritten_blob_file_ids)) - // } - // } else { - // None - // }; - levels.hide_segments(payload.segment_ids.iter().copied()); // IMPORTANT: Free lock so the compaction (which may go on for a while) @@ -471,183 +380,12 @@ fn merge_segments( compactor.write(item)?; - // if item.key.value_type.is_indirection() { - // let mut reader = &item.value[..]; - - // let Ok(mut indirection) = BlobIndirection::decode_from(&mut reader) else { - // log::error!("Failed to deserialize blob indirection: {item:?}"); - // return Ok(()); - // }; - - // log::debug!( - // "{:?}:{} => encountered indirection: {indirection:?}", - // item.key.user_key, - // item.key.seqno, - // ); - - // // Check if we are actually doing blob garbage collection - // if let Some(blob_stuff) = &mut blob_stuff { - // if blob_stuff.2.contains(&indirection.vhandle.blob_file_id) { - // loop { - // let blob = blob_stuff - // .0 - // .peek() - // .expect("should have enough blob entries"); - - // if let Ok((entry, blob_file_id)) = blob { - // if blob_stuff.2.contains(blob_file_id) { - // // This blob is part of the rewritten blob files - // if entry.key < item.key.user_key { - // blob_stuff.0.next().expect("should exist"); - // continue; - // } - - // if entry.key == item.key.user_key { - // if *blob_file_id < indirection.vhandle.blob_file_id { - // blob_stuff.0.next().expect("should exist"); - // continue; - // } - // if entry.offset < indirection.vhandle.offset { - // blob_stuff.0.next().expect("should exist"); - // continue; - // } - // if entry.offset == indirection.vhandle.offset { - // // This is the blob we need - // break; - // } - // } - // assert!( - // (entry.key > item.key.user_key), - // "we passed vptr without getting blob", - // ); - // break; - // } - - // break; - // } else { - // let e = blob_stuff.0.next().expect("should exist"); - // return Err(e.expect_err("should be error")); - // } - // } - - // let blob = blob_stuff - // .0 - // .next() - // .expect("should have blob") - // .expect("should work TODO:"); - - // log::info!( - // "=> use blob: {:?}:{} offset: {} from BF {}", - // blob.0.key, - // blob.0.seqno, - // blob.0.offset, - // blob.1, - // ); - - // indirection.vhandle.blob_file_id = blob_stuff.1.blob_file_id(); - // indirection.vhandle.offset = blob_stuff.1.offset(); - - // log::debug!("RELOCATE to {indirection:?}"); - - // blob_stuff - // .1 - // .write(&item.key.user_key, item.key.seqno, &blob.0.value) - // .expect("should work TODO:"); - - // // TODO: 3.0.0, we REALLY need to change this function to allow for ? 
- - // if let Err(e) = table_writer.write(InternalValue::from_components( - // item.key.user_key, - // indirection.encode_into_vec(), - // item.key.seqno, - // crate::ValueType::Indirection, - // )) { - // log::error!("Compaction failed: {e:?}"); - - // // IMPORTANT: Show the segments again, because compaction failed - // opts.levels - // .write() - // .expect("lock is poisoned") - // .show_segments(payload.segment_ids.iter().copied()); - - // return Ok(()); - // } - - // continue 'table_iter; - // } - - // // This blob is not part of the rewritten blob files - // // So just pass it through - // log::trace!("Pass through {indirection:?} because it is not being relocated"); - // } - - // table_writer.register_blob(indirection); - // } - - // if let Err(e) = table_writer.write(item) { - // log::error!("Compaction failed: {e:?}"); - - // // IMPORTANT: Show the segments again, because compaction failed - // opts.levels - // .write() - // .expect("lock is poisoned") - // .show_segments(payload.segment_ids.iter().copied()); - - // return Ok(()); - // } - if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { log::debug!("compactor: stopping amidst compaction because of stop signal"); return Ok(()); } } - // let writer_results = match table_writer.finish() { - // Ok(v) => v, - // Err(e) => { - // log::error!("Compaction failed: {e:?}"); - - // // IMPORTANT: Show the segments again, because compaction failed - // opts.levels - // .write() - // .expect("lock is poisoned") - // .show_segments(payload.segment_ids.iter().copied()); - - // return Ok(()); - // } - // }; - - // let created_segments = writer_results - // .into_iter() - // .map(|segment_id| -> crate::Result { - // Segment::recover( - // segments_base_folder.join(segment_id.to_string()), - // opts.tree_id, - // opts.config.cache.clone(), - // opts.config.descriptor_table.clone(), - // pin_filter, - // pin_index, - // #[cfg(feature = "metrics")] - // opts.metrics.clone(), - // ) - // }) - // .collect::>>(); - - // let created_segments = match created_segments { - // Ok(v) => v, - // Err(e) => { - // log::error!("Compaction failed: {e:?}"); - - // // IMPORTANT: Show the segments again, because compaction failed - // opts.levels - // .write() - // .expect("lock is poisoned") - // .show_segments(payload.segment_ids.iter().copied()); - - // return Ok(()); - // } - // }; - // NOTE: Mind lock order L -> M -> S log::trace!("compactor: acquiring levels manifest write lock"); let mut levels = opts.levels.write().expect("lock is poisoned"); @@ -657,51 +395,6 @@ fn merge_segments( compactor.finish(&mut levels, opts, payload, dst_lvl, blob_frag_map)?; - // let (writer, rewritten_blob_file_ids) = if let Some(blob_stuff) = blob_stuff { - // (Some(blob_stuff.1), Some(blob_stuff.2)) - // } else { - // (None, None) - // }; - - // let swap_result = levels.atomic_swap( - // |current| { - // current.with_merge( - // &payload.segment_ids.iter().copied().collect::>(), - // &created_segments, - // payload.dest_level as usize, - // if blob_frag_map.is_empty() { - // None - // } else { - // Some(blob_frag_map) - // }, - // { - // if let Some(writer) = writer { - // writer.finish().expect("should work TODO:") - // } else { - // vec![] - // } - // }, - // rewritten_blob_file_ids.unwrap_or_default(), - // ) - // }, - // opts.eviction_seqno, - // ); - - // if let Err(e) = swap_result { - // // IMPORTANT: Show the segments again, because compaction failed - // levels.show_segments(payload.segment_ids.iter().copied()); - // return Err(e); - // } - - // TODO: fwiw also 
add all dead blob files - // TODO: mark blob files deleted - /* - blob_file - .0 - .is_deleted - .store(true, std::sync::atomic::Ordering::Release); - */ - levels.show_segments(payload.segment_ids.iter().copied()); if let Err(e) = levels.maintenance(opts.eviction_seqno) { From 2df9e6aba7d38404b444fe570b1a83c555ff3d98 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:05:45 +0200 Subject: [PATCH 511/613] move blob file ID generator into TreeInner --- src/blob_tree/handle.rs | 8 +++++++- src/blob_tree/mod.rs | 23 ++++++++++++++++------- src/compaction/flavour.rs | 2 ++ src/tree/inner.rs | 9 +++++++-- src/tree/mod.rs | 1 + 5 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/blob_tree/handle.rs b/src/blob_tree/handle.rs index 89c6b53a..75ba0288 100644 --- a/src/blob_tree/handle.rs +++ b/src/blob_tree/handle.rs @@ -6,12 +6,18 @@ use crate::{ use std::io::{Read, Write}; use varint_rs::{VarintReader, VarintWriter}; -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug, Eq)] pub struct BlobIndirection { pub(crate) vhandle: ValueHandle, pub(crate) size: u32, } +impl PartialEq for BlobIndirection { + fn eq(&self, other: &Self) -> bool { + self.vhandle == other.vhandle && self.size == other.size + } +} + impl Encode for BlobIndirection { fn encode_into(&self, writer: &mut W) -> Result<(), EncodeError> { self.vhandle.encode_into(writer)?; diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 0daf6099..c33b7fa6 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -101,8 +101,6 @@ pub struct BlobTree { pub index: crate::Tree, blobs_folder: PathBuf, - - blob_file_id_generator: SequenceNumberCounter, } impl BlobTree { @@ -125,10 +123,14 @@ impl BlobTree { .map(|x| x + 1) .unwrap_or_default(); + index + .0 + .blob_file_id_generator + .set(blob_file_id_to_continue_with); + Ok(Self { index, blobs_folder, - blob_file_id_generator: SequenceNumberCounter::new(blob_file_id_to_continue_with), }) } } @@ -413,12 +415,19 @@ impl AbstractTree for BlobTree { } }); - // TODO: 3.0.0 select compression let mut blob_writer = BlobFileWriter::new( - self.blob_file_id_generator.clone(), + self.index.0.blob_file_id_generator.clone(), u64::MAX, // TODO: actually use target size? but be sure to link to table correctly self.index.config.path.join(BLOBS_FOLDER), - )?; + )? 
+ .use_compression(
+ self.index
+ .config
+ .kv_separation_opts
+ .as_ref()
+ .expect("blob options should exist")
+ .blob_compression,
+ );

 let iter = memtable.iter().map(Ok);
 let compaction_filter = CompactionStream::new(iter, eviction_seqno);
@@ -453,7 +462,7 @@ impl AbstractTree for BlobTree {
 if value_size >= separation_threshold {
 let offset = blob_writer.offset();
 let blob_file_id = blob_writer.blob_file_id();
- let on_disk_size = blob_writer.write(&item.key.user_key, value)?;
+ let on_disk_size = blob_writer.write(&item.key.user_key, item.key.seqno, &value)?;

 let indirection = BlobIndirection {
 vhandle: ValueHandle {
diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs
index 04309e27..f400c81e 100644
--- a/src/compaction/flavour.rs
+++ b/src/compaction/flavour.rs
@@ -223,6 +223,8 @@ impl CompactionFlavour for RelocatingCompaction {

 let blob_file_ids_to_drop = self.rewriting_blob_file_ids;

+ // TODO: fwiw also add all dead blob files
+
 levels.atomic_swap(
 |current| {
 current.with_merge(
diff --git a/src/tree/inner.rs b/src/tree/inner.rs
index 00da1bca..52d16abb 100644
--- a/src/tree/inner.rs
+++ b/src/tree/inner.rs
@@ -4,7 +4,7 @@
 use crate::{
 config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal,
- SegmentId,
+ SegmentId, SequenceNumberCounter,
 };
 use std::sync::{atomic::AtomicU64, Arc, RwLock};

@@ -57,10 +57,14 @@ pub struct TreeInner {
 /// Unique tree ID
 pub id: TreeId,

- /// Hands out a unique (monotonically increasing) segment ID
+ /// Hands out a unique (monotonically increasing) table ID
 #[doc(hidden)]
 pub segment_id_counter: Arc<AtomicU64>,

+ // This is not really used in the normal tree, but we need it in the blob tree
+ /// Hands out a unique (monotonically increasing) blob file ID
+ pub(crate) blob_file_id_generator: SequenceNumberCounter,
+
 /// Active memtable that is being written to
 pub(crate) active_memtable: Arc<RwLock<Arc<Memtable>>>,

@@ -91,6 +95,7 @@ impl TreeInner {
 Ok(Self {
 id: get_next_tree_id(),
 segment_id_counter: Arc::new(AtomicU64::default()),
+ blob_file_id_generator: SequenceNumberCounter::default(),
 config,
 active_memtable: Arc::default(),
 sealed_memtables: Arc::default(),
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index 370d7415..1a4ade34 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -955,6 +955,7 @@ impl Tree {
 let inner = TreeInner {
 id: tree_id,
 segment_id_counter: Arc::new(AtomicU64::new(highest_segment_id + 1)),
+ blob_file_id_generator: SequenceNumberCounter::default(),
 active_memtable: Arc::default(),
 sealed_memtables: Arc::default(),
 manifest: Arc::new(RwLock::new(levels)),

From 3cc4fa33babb40ed956f63ea1cefe7bb34b618ee Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 4 Oct 2025 17:06:06 +0200
Subject: [PATCH 512/613] test: blob relocation

---
 tests/blob_major_compact_gc_stats.rs   |   9 ++-
 tests/blob_major_compact_relocation.rs | 100 +++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 3 deletions(-)
 create mode 100644 tests/blob_major_compact_relocation.rs

diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs
index d46ada8f..b8150d82 100644
--- a/tests/blob_major_compact_gc_stats.rs
+++ b/tests/blob_major_compact_gc_stats.rs
@@ -24,7 +24,6 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> {
 assert_eq!(&*value, big_value);

 tree.flush_active_memtable(0)?;
-
 assert_eq!(1, tree.segment_count());
 assert_eq!(1, tree.blob_file_count());

@@ -32,8 +31,11 @@

 tree.flush_active_memtable(0)?;

- // Major 
compaction does not rewrite every blob file + // Blob file has no fragmentation before compaction (in stats) + // so it is not rewritten tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); let gc_stats = tree .manifest() @@ -103,7 +105,8 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { .get_linked_blob_files()?, ); - // Major compaction does not rewrite every blob file + // Blob file has no fragmentation before compaction (in stats) + // so it is not rewritten tree.major_compact(64_000_000, 1_000)?; assert_eq!(1, tree.segment_count()); assert_eq!(1, tree.blob_file_count()); diff --git a/tests/blob_major_compact_relocation.rs b/tests/blob_major_compact_relocation.rs new file mode 100644 index 00000000..e2737e62 --- /dev/null +++ b/tests/blob_major_compact_relocation.rs @@ -0,0 +1,100 @@ +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_major_compact_relocation_simple() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + let new_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("big", &big_value, 0); + tree.insert("big2", &big_value, 0); + tree.insert("smol", "smol", 0); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("big2", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.insert("big", &new_big_value, 1); + + tree.flush_active_memtable(0)?; + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + let value = tree.get("big2", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + let value = tree.get("big2", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + // "big":0 is expired + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + let value = tree.get("big2", SeqNo::MAX)?.expect("should 
exist"); + assert_eq!(&*value, big_value); + let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + } + + Ok(()) +} From 96646bb529c0a00c155c60270844117ab51c6151 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:06:49 +0200 Subject: [PATCH 513/613] fix test --- tests/blob_major_compact_gc_stats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs index b8150d82..e01f5eb4 100644 --- a/tests/blob_major_compact_gc_stats.rs +++ b/tests/blob_major_compact_gc_stats.rs @@ -35,7 +35,7 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { // so it is not rewritten tree.major_compact(64_000_000, 1_000)?; assert_eq!(1, tree.segment_count()); - assert_eq!(1, tree.blob_file_count()); + assert_eq!(2, tree.blob_file_count()); let gc_stats = tree .manifest() From edf81a8a155d5a541328a13aa9476d9e301b0dce Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:07:09 +0200 Subject: [PATCH 514/613] add blob handling to Version --- src/version/mod.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/version/mod.rs b/src/version/mod.rs index eed37b95..64eeed13 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -144,6 +144,7 @@ pub struct VersionInner { /// The individual LSM-tree levels which consist of runs of tables pub(crate) levels: Vec, + // TODO: 3.0.0 this should really be a newtype // NOTE: We purposefully use Arc<_> to avoid deep cloning the blob files again and again // // Changing the value log tends to happen way less often than other modifications to the @@ -411,6 +412,8 @@ impl Version { new_segments: &[Segment], dest_level: usize, diff: Option, + new_blob_files: Vec, + blob_files_to_drop: HashSet, ) -> Self { let id = self.id + 1; @@ -440,19 +443,35 @@ impl Version { let has_diff = diff.is_some(); - let gc_stats = if let Some(diff) = diff { + let gc_stats = if has_diff || !blob_files_to_drop.is_empty() { let mut copy = self.gc_stats.deref().clone(); - diff.merge_into(&mut copy); + + if let Some(diff) = diff { + diff.merge_into(&mut copy); + } + + for id in &blob_files_to_drop { + copy.remove(id); + } + copy.prune(&self.value_log); + Arc::new(copy) } else { self.gc_stats.clone() }; - let value_log = if has_diff { - // TODO: 3.0.0 this should really be a newtype + let value_log = if has_diff || !new_blob_files.is_empty() { let mut copy = self.value_log.deref().clone(); - copy.retain(|_, blob_file| !blob_file.is_dead(&gc_stats)); + + for blob_file in new_blob_files { + copy.insert(blob_file.id(), blob_file); + } + + for id in blob_files_to_drop { + copy.remove(&id); + } + Arc::new(copy) } else { self.value_log.clone() From 8b66447747a161d06d5d560ce7216c7f5a477dd7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:07:22 +0200 Subject: [PATCH 515/613] wip --- src/segment/multi_writer.rs | 2 +- src/segment/writer/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 2cb2c541..686a78fc 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -17,7 +17,7 @@ use std::{ /// This results in a sorted "run" of segments #[allow(clippy::module_name_repetitions)] pub struct MultiWriter { - base_path: PathBuf, + pub(crate) base_path: PathBuf, data_block_hash_ratio: f32, diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 
689a06d4..fcfb3524 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -12,7 +12,7 @@ use crate::{ use index::{BlockIndexWriter, FullIndexWriter}; use std::{fs::File, io::BufWriter, path::PathBuf}; -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, std::hash::Hash)] pub struct LinkedFile { pub blob_file_id: BlobFileId, pub bytes: u64, From bb78d6ba91d66561dbdbf9c6d86ff2ec6390ac11 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:07:41 +0200 Subject: [PATCH 516/613] disable seqno truncation temporarily --- src/blob_tree/gc.rs | 4 ++-- src/compaction/stream.rs | 25 ++++++++++++++----------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index 8f4748cd..95d2098a 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -205,8 +205,8 @@ mod tests { let mut iter = CompactionStream::new(iter, 1_000).with_expiration_callback(&mut my_watcher); assert_eq!( - // Seqno is reset to 0 - InternalValue::from_components(*b"a", b"abc", 0, ValueType::Value), + // TODO: Seqno is normally reset to 0 + InternalValue::from_components(*b"a", b"abc", 1, ValueType::Value), iter.next().unwrap()?, ); diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index 7618b339..39c77afe 100644 --- a/src/compaction/stream.rs +++ b/src/compaction/stream.rs @@ -111,12 +111,13 @@ impl> Iterator for CompactionStream<'_, I> { } } - // NOTE: Convert sequence number to zero if it is below the snapshot watermark. - // - // This can save a lot of space, because "0" only takes 1 byte. - if head.key.seqno < self.gc_seqno_threshold { - head.key.seqno = 0; - } + // TODO: look at how this plays with blob GC + // // NOTE: Convert sequence number to zero if it is below the snapshot watermark. + // // + // // This can save a lot of space, because "0" only takes 1 byte. 
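+ // // (Temporarily disabled until its interaction with blob GC and
+ // // relocation is settled; see the TODO above.)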
+ // if head.key.seqno < self.gc_seqno_threshold { + // head.key.seqno = 0; + // } return Some(Ok(head)); } @@ -189,8 +190,8 @@ mod tests { let mut iter = CompactionStream::new(iter, 1_000).with_expiration_callback(&mut my_watcher); assert_eq!( - // Seqno is reset to 0 - InternalValue::from_components(*b"a", *b"", 0, ValueType::Tombstone), + // TODO: Seqno is normally reset to 0 + InternalValue::from_components(*b"a", *b"", 999, ValueType::Tombstone), iter.next().unwrap()?, ); iter_closed!(iter); @@ -208,6 +209,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] + #[ignore = "wip"] fn compaction_stream_seqno_zeroing_1() -> crate::Result<()> { #[rustfmt::skip] let vec = stream![ @@ -261,16 +263,17 @@ mod tests { let iter = vec.iter().cloned().map(Ok); let mut iter = CompactionStream::new(iter, 1_000_000); + // TODO: Seqno is normally reset to 0 assert_eq!( - InternalValue::from_components(*b"a", *b"", 0, ValueType::Tombstone), + InternalValue::from_components(*b"a", *b"", 999, ValueType::Tombstone), iter.next().unwrap()?, ); assert_eq!( - InternalValue::from_components(*b"b", *b"", 0, ValueType::Tombstone), + InternalValue::from_components(*b"b", *b"", 999, ValueType::Tombstone), iter.next().unwrap()?, ); assert_eq!( - InternalValue::from_components(*b"c", *b"", 0, ValueType::Tombstone), + InternalValue::from_components(*b"c", *b"", 999, ValueType::Tombstone), iter.next().unwrap()?, ); iter_closed!(iter); From 529909c906ad0a5388e336d7e9a00e6eb8ec76e9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 17:16:30 +0200 Subject: [PATCH 517/613] fix: show tables on compaction fail --- src/compaction/worker.rs | 38 ++++++++++++++++++++-------------- tests/tree_major_compaction.rs | 4 ++-- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 63c3ec71..13873192 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -287,7 +287,7 @@ fn merge_segments( // TODO: 3.0.0 remove log::debug!( - "maybe rewrite blob files: {:#?}", + "Maybe rewrite blob files: {:#?}", linked_blob_files .iter() .map(|bf| bf.id()) @@ -323,7 +323,7 @@ fn merge_segments( as Box } else { log::debug!( - "relocate blob files: {:#?}", + "Relocate blob files: {:#?}", blob_files_to_rewrite .iter() .map(BlobFile::id) @@ -368,39 +368,47 @@ fn merge_segments( // does not block possible other compactions and reads drop(levels); - // TODO: guard hidden segments (somehow) - for (idx, item) in merge_iter.enumerate() { - let item = item?; + let item = item.inspect_err(|_| { + // IMPORTANT: We need to show tables again on error + let mut levels = opts.levels.write().expect("lock is poisoned"); + levels.show_segments(payload.segment_ids.iter().copied()); + })?; // IMPORTANT: We can only drop tombstones when writing into last level if is_last_level && item.is_tombstone() { continue; } - compactor.write(item)?; + compactor.write(item).inspect_err(|_| { + // IMPORTANT: We need to show tables again on error + let mut levels = opts.levels.write().expect("lock is poisoned"); + levels.show_segments(payload.segment_ids.iter().copied()); + })?; if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { - log::debug!("compactor: stopping amidst compaction because of stop signal"); + log::debug!("Stopping amidst compaction because of stop signal"); return Ok(()); } } // NOTE: Mind lock order L -> M -> S - log::trace!("compactor: acquiring levels manifest write lock"); + log::trace!("Acquiring levels manifest write lock"); let mut levels = 
opts.levels.write().expect("lock is poisoned"); - log::trace!("compactor: acquired levels manifest write lock"); - - log::trace!("Blob fragmentation diff: {blob_frag_map:#?}"); + log::trace!("Acquired levels manifest write lock"); - compactor.finish(&mut levels, opts, payload, dst_lvl, blob_frag_map)?; + compactor + .finish(&mut levels, opts, payload, dst_lvl, blob_frag_map) + .inspect_err(|_| { + // IMPORTANT: We need to show tables again on error + levels.show_segments(payload.segment_ids.iter().copied()); + })?; levels.show_segments(payload.segment_ids.iter().copied()); - if let Err(e) = levels.maintenance(opts.eviction_seqno) { + levels.maintenance(opts.eviction_seqno).inspect_err(|e| { log::error!("Manifest maintenance failed: {e:?}"); - return Err(e); - } + })?; drop(levels); diff --git a/tests/tree_major_compaction.rs b/tests/tree_major_compaction.rs index b83fdc49..1fa508ff 100644 --- a/tests/tree_major_compaction.rs +++ b/tests/tree_major_compaction.rs @@ -28,12 +28,12 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { let item = tree.get_internal_entry(b"b", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); - assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold + // assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold // TODO: let item = tree.get_internal_entry(b"c", SeqNo::MAX)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); - assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold + // assert_eq!(item.key.seqno, 0); // NOTE: Seqno is zeroed because below GC threshold // TODO: assert_eq!(1, tree.segment_count()); assert_eq!(3, tree.len(SeqNo::MAX, None)?); From 85f0043d44d709ba73499bc4c81f3f67fe2a925f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 18:18:41 +0200 Subject: [PATCH 518/613] add file descriptor cache metrics tickers --- src/metrics.rs | 6 ++++++ src/segment/util.rs | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/metrics.rs b/src/metrics.rs index 9f6430a7..1bc391f4 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -10,6 +10,12 @@ use std::sync::atomic::Ordering::Relaxed; /// Are not stored durably, so metrics will reset after a restart/crash. #[derive(Debug, Default)] pub struct Metrics { + /// Number of times a table file was opened using `fopen()` + pub(crate) table_file_opened: AtomicUsize, + + /// Number of times a table file was retrieved from descriptor cache + pub(crate) table_file_opened_cached: AtomicUsize, + /// Number of index blocks that were actually read from disk pub(crate) index_block_load_io: AtomicUsize, diff --git a/src/segment/util.rs b/src/segment/util.rs index c8a144a6..70e11c39 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -54,9 +54,17 @@ pub fn load_block( let fd_cache_miss = cached_fd.is_none(); let fd = if let Some(fd) = cached_fd { + #[cfg(feature = "metrics")] + metrics.table_file_opened_cached.fetch_add(1, Relaxed); + fd } else { - Arc::new(std::fs::File::open(path)?) 
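+ // Cache miss: open the table file directly and record the open in the metrics.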
+ let fd = std::fs::File::open(path)?; + + #[cfg(feature = "metrics")] + metrics.table_file_opened.fetch_add(1, Relaxed); + + Arc::new(fd) }; let block = Block::from_file(&fd, *handle, compression)?; From 82f424d9dd8d1613a373392d383aafd8bf0a3343 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 19:55:19 +0200 Subject: [PATCH 519/613] rename --- src/compaction/flavour.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index f400c81e..9e4e5414 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -209,7 +209,7 @@ impl CompactionFlavour for RelocatingCompaction { opts: &Options, payload: &CompactionPayload, dst_lvl: usize, - blob_frag_map: FragmentationMap, + blob_frag_map_diff: FragmentationMap, ) -> crate::Result<()> { log::debug!( "Relocating compaction done in {:?}", @@ -231,10 +231,10 @@ impl CompactionFlavour for RelocatingCompaction { &payload.segment_ids.iter().copied().collect::>(), &created_tables, payload.dest_level as usize, - if blob_frag_map.is_empty() { + if blob_frag_map_diff.is_empty() { None } else { - Some(blob_frag_map) + Some(blob_frag_map_diff) }, created_blob_files, blob_file_ids_to_drop, From c7961940ac55d3870e352e6f7e11e8fe565d1aa2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 19:55:26 +0200 Subject: [PATCH 520/613] test: blob file drop test --- tests/blob_major_compact_drop_dead_files.rs | 109 ++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 tests/blob_major_compact_drop_dead_files.rs diff --git a/tests/blob_major_compact_drop_dead_files.rs b/tests/blob_major_compact_drop_dead_files.rs new file mode 100644 index 00000000..c8e2414a --- /dev/null +++ b/tests/blob_major_compact_drop_dead_files.rs @@ -0,0 +1,109 @@ +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use test_log::test; + +#[test] +fn blob_tree_major_compact_drop_dead_files() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(128_000); + let new_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + + tree.insert("big", &big_value, 0); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.insert("big", &big_value, 1); + tree.flush_active_memtable(0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + tree.insert("big", &big_value, 2); + tree.flush_active_memtable(0)?; + assert_eq!(3, tree.segment_count()); + assert_eq!(3, tree.blob_file_count()); + + tree.insert("big", &big_value, 3); + tree.flush_active_memtable(0)?; + assert_eq!(4, tree.segment_count()); + assert_eq!(4, tree.blob_file_count()); + + tree.insert("big", &new_big_value, 4); + tree.flush_active_memtable(0)?; + assert_eq!(5, tree.segment_count()); + assert_eq!(5, tree.blob_file_count()); + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(5, tree.blob_file_count()); + + let value = tree.get("big", 
SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map.insert(1, FragmentationEntry::new(1, big_value.len() as u64)); + map.insert(2, FragmentationEntry::new(1, big_value.len() as u64)); + map.insert(3, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, new_big_value); + } + + Ok(()) +} From 953b03833bde7e1dced1680ed7ade4f7380973db Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 19:55:37 +0200 Subject: [PATCH 521/613] increase blob file scanner buffer size --- src/vlog/blob_file/scanner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 916f2c81..19d3f39f 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -28,7 +28,7 @@ impl Scanner { /// /// Will return `Err` if an IO error occurs. pub fn new>(path: P, blob_file_id: BlobFileId) -> crate::Result { - let file_reader = BufReader::new(File::open(path)?); + let file_reader = BufReader::with_capacity(32_000, File::open(path)?); Ok(Self::with_reader(blob_file_id, file_reader)) } From 2bd9618a757668a98743de32e53c9ec929be58c2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 19:55:53 +0200 Subject: [PATCH 522/613] fix: version merge with dropping blob files --- src/version/mod.rs | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/version/mod.rs b/src/version/mod.rs index 64eeed13..7be64f58 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -443,25 +443,27 @@ impl Version { let has_diff = diff.is_some(); - let gc_stats = if has_diff || !blob_files_to_drop.is_empty() { - let mut copy = self.gc_stats.deref().clone(); + let gc_stats = + if has_diff || !blob_files_to_drop.is_empty() || !blob_files_to_drop.is_empty() { + let mut copy = self.gc_stats.deref().clone(); - if let Some(diff) = diff { - diff.merge_into(&mut copy); - } + if let Some(diff) = diff { + diff.merge_into(&mut copy); + } - for id in &blob_files_to_drop { - copy.remove(id); - } + for id in &blob_files_to_drop { + copy.remove(id); + } - copy.prune(&self.value_log); + copy.prune(&self.value_log); - Arc::new(copy) - } else { - self.gc_stats.clone() - }; + Arc::new(copy) + } else { + self.gc_stats.clone() + }; - let value_log = if has_diff || !new_blob_files.is_empty() { + let value_log = if has_diff || !new_blob_files.is_empty() || !blob_files_to_drop.is_empty() + { let mut copy = self.value_log.deref().clone(); for blob_file in new_blob_files { From 267b3d554320264b0cb4341783bef2a04b8ce8e3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 19:56:28 +0200 Subject: [PATCH 523/613] drop dead blob files on merge --- src/compaction/flavour.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git 
a/src/compaction/flavour.rs b/src/compaction/flavour.rs
index 9e4e5414..49627693 100644
--- a/src/compaction/flavour.rs
+++ b/src/compaction/flavour.rs
@@ -221,9 +221,14 @@ impl CompactionFlavour for RelocatingCompaction {
 let created_tables = self.inner.consume_writer(opts, dst_lvl)?;
 let created_blob_files = self.blob_writer.finish()?;

- let blob_file_ids_to_drop = self.rewriting_blob_file_ids;
+ let mut blob_file_ids_to_drop = self.rewriting_blob_file_ids;

- // TODO: fwiw also add all dead blob files
+ for blob_file in levels.current_version().value_log.values() {
+ if blob_file.is_dead(levels.current_version().gc_stats()) {
+ blob_file_ids_to_drop.insert(blob_file.id());
+ self.rewriting_blob_files.push(blob_file.clone());
+ }
+ }

 levels.atomic_swap(
 |current| {
 current.with_merge(
@@ -349,6 +354,14 @@ impl CompactionFlavour for StandardCompaction {

 let created_segments = self.consume_writer(opts, dst_lvl)?;

+ let mut blob_files_to_drop = Vec::default();
+
+ for blob_file in levels.current_version().value_log.values() {
+ if blob_file.is_dead(levels.current_version().gc_stats()) {
+ blob_files_to_drop.push(blob_file.clone());
+ }
+ }
+
 levels.atomic_swap(
 |current| {
 current.with_merge(
@@ -361,7 +374,10 @@
 Some(blob_frag_map)
 },
 Vec::default(),
- HashSet::default(),
+ blob_files_to_drop
+ .iter()
+ .map(BlobFile::id)
+ .collect::<HashSet<_>>(),
 )
 },
 opts.eviction_seqno,
@@ -374,6 +390,10 @@
 table.mark_as_deleted();
 }

+ for blob_file in blob_files_to_drop {
+ blob_file.mark_as_deleted();
+ }
+
 Ok(())
 }
 }

From 864fb84300e369e00aae22af2f12da2b6ca5b48b Mon Sep 17 00:00:00 2001
From: zaidoon
Date: Sun, 28 Sep 2025 22:30:24 -0400
Subject: [PATCH 524/613] add weak tombstone tracking and reclaimable value
 metrics

---
 src/abstract.rs                  |  6 ++++
 src/blob_tree/mod.rs             |  8 ++++++
 src/compaction/leveled.rs        |  4 +++
 src/segment/meta.rs              |  6 ++++
 src/segment/mod.rs               | 14 ++++++++++
 src/segment/writer/meta.rs       |  8 ++++++
 src/segment/writer/mod.rs        | 47 ++++++++++++++++++++++++++------
 src/tree/mod.rs                  | 24 ++++++++++++++++
 tests/segment_weak_tombstones.rs | 38 ++++++++++++++++++++++++++
 9 files changed, 146 insertions(+), 9 deletions(-)
 create mode 100644 tests/segment_weak_tombstones.rs

diff --git a/src/abstract.rs b/src/abstract.rs
index 1954e657..c735f454 100644
--- a/src/abstract.rs
+++ b/src/abstract.rs
@@ -74,6 +74,12 @@ pub trait AbstractTree {
 /// Returns the approximate number of tombstones in the tree.
 fn tombstone_count(&self) -> u64;

+ /// Returns the approximate number of weak tombstones (single deletes) in the tree.
+ fn weak_tombstone_count(&self) -> u64;
+
+ /// Returns the approximate number of values reclaimable once weak tombstones can be GC'd.
+ fn weak_tombstone_reclaimable_count(&self) -> u64;
+
 // TODO: clear() with Nuke compaction strategy (write lock) -> drop_range(..)

 /// Drops segments that are fully contained in a given range.
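For orientation, a minimal usage sketch of the two counters introduced above, mirroring the tests added at the end of this patch (`tempfile` as a dev-dependency is assumed):

    use lsm_tree::{AbstractTree, Config};

    fn main() -> lsm_tree::Result<()> {
        let folder = tempfile::tempdir()?;
        let tree = Config::new(folder.path()).open()?;

        // One value directly shadowed by a weak tombstone forms one reclaimable pair.
        tree.insert(b"a", b"old", 1);
        tree.remove_weak(b"a", 2);
        tree.flush_active_memtable(0)?;

        assert_eq!(1, tree.weak_tombstone_count());
        assert_eq!(1, tree.weak_tombstone_reclaimable_count());

        Ok(())
    }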
diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index 8c0db395..9a417895 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -432,6 +432,14 @@ impl AbstractTree for BlobTree {
 self.index.tombstone_count()
 }

+ fn weak_tombstone_count(&self) -> u64 {
+ self.index.weak_tombstone_count()
+ }
+
+ fn weak_tombstone_reclaimable_count(&self) -> u64 {
+ self.index.weak_tombstone_reclaimable_count()
+ }
+
 fn drop_range<K: AsRef<[u8]>, R: RangeBounds<K>>(&self, range: R) -> crate::Result<()> {
 self.index.drop_range(range)
 }

diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs
index ae50df5a..640a65ed 100644
--- a/src/compaction/leveled.rs
+++ b/src/compaction/leveled.rs
@@ -239,6 +239,10 @@ impl CompactionStrategy for Strategy {
 let mut scores = [(/* score */ 0.0, /* overshoot */ 0u64); 7];

 {
+ // TODO(weak-tombstone-rewrite): incorporate `Segment::weak_tombstone_count` and
+ // `Segment::weak_tombstone_reclaimable` when computing level scores so rewrite
+ // decisions can prioritize segments that would free the most reclaimable values.
+
 // Score first level

 // NOTE: We always have at least one level

diff --git a/src/segment/meta.rs b/src/segment/meta.rs
index 4163c3a2..25851db5 100644
--- a/src/segment/meta.rs
+++ b/src/segment/meta.rs
@@ -44,6 +44,8 @@ pub struct ParsedMeta {
 pub file_size: u64,
 pub item_count: u64,
 pub tombstone_count: u64,
+ pub weak_tombstone_count: u64,
+ pub weak_tombstone_reclaimable: u64,

 pub data_block_compression: CompressionType,
 pub index_block_compression: CompressionType,
@@ -125,6 +127,8 @@ impl ParsedMeta {
 let data_block_count = read_u64!(block, b"#data_block_count");
 let index_block_count = read_u64!(block, b"#index_block_count");
 let file_size = read_u64!(block, b"#size"); // TODO: rename file_size
+ let weak_tombstone_count = read_u64!(block, b"#weak_tombstone_count");
+ let weak_tombstone_reclaimable = read_u64!(block, b"#weak_tombstone_reclaimable");

 let created_at = {
 let bytes = block
@@ -196,6 +200,8 @@
 file_size,
 item_count,
 tombstone_count,
+ weak_tombstone_count,
+ weak_tombstone_reclaimable,
 data_block_compression,
 index_block_compression,
 })

diff --git a/src/segment/mod.rs b/src/segment/mod.rs
index 361e9127..aeb02639 100644
--- a/src/segment/mod.rs
+++ b/src/segment/mod.rs
@@ -505,6 +505,20 @@ impl Segment {
 self.metadata.tombstone_count
 }

+ /// Returns the number of weak (single delete) tombstones in the `Segment`.
+ #[must_use]
+ #[doc(hidden)]
+ pub fn weak_tombstone_count(&self) -> u64 {
+ self.metadata.weak_tombstone_count
+ }
+
+ /// Returns the number of value entries reclaimable once weak tombstones can be GC'd.
+ #[must_use]
+ #[doc(hidden)]
+ pub fn weak_tombstone_reclaimable(&self) -> u64 {
+ self.metadata.weak_tombstone_reclaimable
+ }
+
 /// Returns the ratio of tombstone markers in the `Segment`.
 #[must_use]
 #[doc(hidden)]

diff --git a/src/segment/writer/meta.rs b/src/segment/writer/meta.rs
index 11ac694b..4e616526 100644
--- a/src/segment/writer/meta.rs
+++ b/src/segment/writer/meta.rs
@@ -14,6 +14,12 @@ pub struct Metadata {
 /// Tombstone count
 pub tombstone_count: usize,

+ /// Weak tombstone (single delete) count
+ pub weak_tombstone_count: usize,
+
+ /// Weak tombstone + value pairs that become reclaimable when GC watermark advances
+ pub weak_tombstone_reclaimable_count: usize,
+
 // TODO: 3.0.0 - https://github.com/fjall-rs/lsm-tree/issues/101
 /// Written key count (unique keys)
 pub key_count: usize,
@@ -44,6 +50,8 @@ impl Default for Metadata {

 item_count: 0,
 tombstone_count: 0,
+ weak_tombstone_count: 0,
+ weak_tombstone_reclaimable_count: 0,
 key_count: 0,
 file_pos: BlockOffset(0),
 uncompressed_size: 0,

diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs
index 68fb420b..2fbaff51 100644
--- a/src/segment/writer/mod.rs
+++ b/src/segment/writer/mod.rs
@@ -7,7 +7,7 @@ use super::{
 };
 use crate::{
 coding::Encode, file::fsync_directory, segment::filter::standard_bloom::Builder,
- time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey,
+ time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, ValueType,
 };
 use index::{BlockIndexWriter, FullIndexWriter};
 use std::{fs::File, io::BufWriter, path::PathBuf};
@@ -61,6 +61,9 @@ pub struct Writer {
 ///
 /// using enhanced double hashing, so we got two u64s
 pub bloom_hash_buffer: Vec<(u64, u64)>,
+
+ /// Tracks the previously written item to detect weak tombstone/value pairs
+ previous_item: Option<(UserKey, ValueType)>,
 }

 impl Writer {
@@ -103,6 +106,8 @@

 bloom_policy: BloomConstructionPolicy::default(),
 bloom_hash_buffer: Vec::new(),
+
+ previous_item: None,
 })
 }

@@ -171,33 +176,49 @@
 /// sorted as described by the [`UserKey`], otherwise the block layout will
 /// be non-sense.
pub fn write(&mut self, item: InternalValue) -> crate::Result<()> { + let value_type = item.key.value_type; + let seqno = item.key.seqno; + let user_key = item.key.user_key.clone(); + let value_len = item.value.len(); + if item.is_tombstone() { self.meta.tombstone_count += 1; } + if value_type == ValueType::WeakTombstone { + self.meta.weak_tombstone_count += 1; + } + + if value_type == ValueType::Value { + if let Some((prev_key, prev_type)) = &self.previous_item { + if prev_type == &ValueType::WeakTombstone && prev_key.as_ref() == user_key.as_ref() + { + self.meta.weak_tombstone_reclaimable_count += 1; + } + } + } + // NOTE: Check if we visit a new key - if Some(&item.key.user_key) != self.current_key.as_ref() { + if Some(&user_key) != self.current_key.as_ref() { self.meta.key_count += 1; - self.current_key = Some(item.key.user_key.clone()); + self.current_key = Some(user_key.clone()); // IMPORTANT: Do not buffer *every* item's key // because there may be multiple versions // of the same key if self.bloom_policy.is_active() { - self.bloom_hash_buffer - .push(Builder::get_hash(&item.key.user_key)); + self.bloom_hash_buffer.push(Builder::get_hash(&user_key)); } } - let seqno = item.key.seqno; - if self.meta.first_key.is_none() { - self.meta.first_key = Some(item.key.user_key.clone()); + self.meta.first_key = Some(user_key.clone()); } - self.chunk_size += item.key.user_key.len() + item.value.len(); + self.chunk_size += user_key.len() + value_len; self.chunk.push(item); + self.previous_item = Some((user_key, value_type)); if self.chunk_size >= self.data_block_size as usize { self.spill_block()?; @@ -403,6 +424,14 @@ impl Writer { "#user_data_size", &self.meta.uncompressed_size.to_le_bytes(), ), + meta( + "#weak_tombstone_count", + &(self.meta.weak_tombstone_count as u64).to_le_bytes(), + ), + meta( + "#weak_tombstone_reclaimable", + &(self.meta.weak_tombstone_reclaimable_count as u64).to_le_bytes(), + ), meta("v#lsmt", env!("CARGO_PKG_VERSION").as_bytes()), meta("v#table_version", &[3u8]), // TODO: tli_handle_count diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 6b01ad5c..436be3e4 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -120,6 +120,30 @@ impl AbstractTree for Tree { .sum() } + /// Returns the number of weak tombstones (single deletes) in the tree. + #[must_use] + fn weak_tombstone_count(&self) -> u64 { + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(Segment::weak_tombstone_count) + .sum() + } + + /// Returns the number of value entries that become reclaimable once weak tombstones can be GC'd. 
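+    ///
+    /// A value entry counts as reclaimable when it is directly shadowed by a
+    /// weak tombstone for the same key within a single table.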
+ #[must_use] + fn weak_tombstone_reclaimable_count(&self) -> u64 { + self.manifest + .read() + .expect("lock is poisoned") + .current_version() + .iter_segments() + .map(Segment::weak_tombstone_reclaimable) + .sum() + } + fn ingest( &self, iter: impl Iterator, diff --git a/tests/segment_weak_tombstones.rs b/tests/segment_weak_tombstones.rs new file mode 100644 index 00000000..6948cccf --- /dev/null +++ b/tests/segment_weak_tombstones.rs @@ -0,0 +1,38 @@ +use lsm_tree::{AbstractTree, Config}; + +#[test] +fn weak_tombstone_counts_single_pair() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(folder.path()).open()?; + + tree.insert(b"a", b"old", 1); + tree.remove_weak(b"a", 2); + tree.flush_active_memtable(0)?; + + assert_eq!(tree.weak_tombstone_count(), 1); + assert_eq!(tree.weak_tombstone_reclaimable_count(), 1); + + Ok(()) +} + +#[test] +fn weak_tombstone_counts_multiple_keys() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(folder.path()).open()?; + + tree.insert(b"a", b"old", 10); + tree.remove_weak(b"a", 11); + + tree.remove_weak(b"b", 12); + + tree.insert(b"c", b"old", 13); + tree.insert(b"c", b"new", 14); + tree.remove_weak(b"c", 15); + + tree.flush_active_memtable(0)?; + + assert_eq!(tree.weak_tombstone_count(), 3); + assert_eq!(tree.weak_tombstone_reclaimable_count(), 2); + + Ok(()) +} From 3d606aa2ae44db0392336d86b78a664f62dc4b25 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:17:39 +0200 Subject: [PATCH 525/613] Update segment_weak_tombstones.rs --- tests/segment_weak_tombstones.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/segment_weak_tombstones.rs b/tests/segment_weak_tombstones.rs index 6948cccf..130b68e3 100644 --- a/tests/segment_weak_tombstones.rs +++ b/tests/segment_weak_tombstones.rs @@ -36,3 +36,22 @@ fn weak_tombstone_counts_multiple_keys() -> lsm_tree::Result<()> { Ok(()) } + +#[test] +fn weak_tombstone_counts_multiple_weak() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = Config::new(folder.path()).open()?; + + tree.insert(b"a", b"old", 10); + tree.remove_weak(b"a", 11); + tree.remove_weak(b"a", 12); + tree.remove_weak(b"a", 13); + tree.remove_weak(b"a", 14); + + tree.flush_active_memtable(0)?; + + assert_eq!(tree.weak_tombstone_count(), 4); + assert_eq!(tree.weak_tombstone_reclaimable_count(), 1); // a:10 is paired with a:11 + + Ok(()) +} From b836109a91ac7a2de2fe261ec46764fdb12fa0ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 4 Oct 2025 23:14:22 +0200 Subject: [PATCH 526/613] doc --- src/config/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index cbdbfc7a..0df19057 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -176,7 +176,7 @@ impl Config { /// You can create a global [`Cache`] and share it between multiple /// trees to cap global cache memory usage. /// - /// Defaults to a cache with 8 MiB of capacity *per tree*. + /// Defaults to a cache with 16 MiB of capacity *per tree*. 
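+    ///
+    /// A single cache can be shared across trees by cloning its `Arc`, e.g.
+    /// `Config::new(path).use_cache(cache.clone())`.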
#[must_use] pub fn use_cache(mut self, cache: Arc) -> Self { self.cache = cache; From ea4cc5204f0be66780975689cbd3b374f1ee5c30 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:33:01 +0200 Subject: [PATCH 527/613] allow Version change fn to be fallible --- src/compaction/worker.rs | 2 +- src/level_manifest/mod.rs | 4 ++-- src/tree/mod.rs | 6 +++++- src/version/mod.rs | 11 ++++------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 13873192..49afb991 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -184,7 +184,7 @@ fn move_segments( let segment_ids = payload.segment_ids.iter().copied().collect::>(); levels.atomic_swap( - |current| current.with_moved(&segment_ids, payload.dest_level as usize), + |current| Ok(current.with_moved(&segment_ids, payload.dest_level as usize)), opts.eviction_seqno, )?; diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 549ecf82..54a0abf1 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -311,7 +311,7 @@ impl LevelManifest { /// and returns a new version. /// /// The function takes care of persisting the version changes on disk. - pub(crate) fn atomic_swap Version>( + pub(crate) fn atomic_swap crate::Result>( &mut self, f: F, gc_watermark: SeqNo, @@ -322,7 +322,7 @@ impl LevelManifest { // without mutating the current level manifest // If persisting to disk fails, this way the level manifest // is unchanged - let next_version = f(&self.current); + let next_version = f(&self.current)?; Self::persist_version(&self.folder, &next_version)?; diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 1a4ade34..74242eeb 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -414,7 +414,11 @@ impl AbstractTree for Tree { manifest.atomic_swap( |version| { - version.with_new_l0_run(segments, blob_files, frag_map.filter(|x| !x.is_empty())) + Ok(version.with_new_l0_run( + segments, + blob_files, + frag_map.filter(|x| !x.is_empty()), + )) }, seqno_threshold, )?; diff --git a/src/version/mod.rs b/src/version/mod.rs index 7be64f58..4f47f6c6 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -331,7 +331,7 @@ impl Version { /// Returns a new version with a list of segments removed. /// /// The segment files are not immediately deleted, this is handled by the version system's free list. 
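+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a dropped segment's linked blob files cannot be read.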
- pub fn with_dropped(&self, ids: &[SegmentId]) -> Self { + pub fn with_dropped(&self, ids: &[SegmentId]) -> crate::Result { let id = self.id + 1; let mut levels = vec![]; @@ -368,10 +368,7 @@ impl Version { let mut copy = self.gc_stats.deref().clone(); for segment in &dropped_segments { - let linked_blob_files = segment - .get_linked_blob_files() - .expect("TODO: handle error") - .unwrap_or_default(); + let linked_blob_files = segment.get_linked_blob_files()?.unwrap_or_default(); for blob_file in linked_blob_files { copy.entry(blob_file.blob_file_id) @@ -395,7 +392,7 @@ impl Version { Arc::new(copy) }; - Self { + Ok(Self { inner: Arc::new(VersionInner { id, levels, @@ -403,7 +400,7 @@ impl Version { gc_stats, }), seqno_watermark: 0, - } + }) } pub fn with_merge( From a2c0da74e85837dcd1c91846c4811503bd89b657 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:33:40 +0200 Subject: [PATCH 528/613] test: add blob GC test case --- tests/blob_major_compact_relocation.rs | 232 +++++++++++++++++++++++++ 1 file changed, 232 insertions(+) diff --git a/tests/blob_major_compact_relocation.rs b/tests/blob_major_compact_relocation.rs index e2737e62..9cd32c8b 100644 --- a/tests/blob_major_compact_relocation.rs +++ b/tests/blob_major_compact_relocation.rs @@ -98,3 +98,235 @@ fn blob_tree_major_compact_relocation_simple() -> lsm_tree::Result<()> { Ok(()) } + +#[test] +fn blob_tree_major_compact_relocation_repeated_key() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(2_000); + let very_big_value = b"winter!".repeat(128_000); + + { + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("a", &big_value, 0); + tree.insert("b", &big_value, 0); + tree.insert("c", &very_big_value, 0); + tree.insert("d", &big_value, 0); + tree.insert("e", &big_value, 0); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, very_big_value); + let value = tree.get("d", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.remove("c", 1); + + tree.flush_active_memtable(0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?; + assert!(value.is_none()); + let value = tree.get("d", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", 
SeqNo::MAX)?; + assert!(value.is_none()); + let value = tree.get("d", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, very_big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?; + assert!(value.is_none()); + let value = tree.get("d", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + } + + Ok(()) +} + +#[test] +fn blob_tree_major_compact_relocation_interleaved() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let big_value = b"neptune!".repeat(2_000); + + { + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(Default::default())) + .open()?; + + assert!(tree.get("big", SeqNo::MAX)?.is_none()); + tree.insert("a", b"smol", 0); + tree.insert("b", &big_value, 0); + tree.insert("c", b"smol", 0); + tree.insert("d", &big_value, 0); + tree.insert("e", b"smol", 0); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("d", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.remove("d", 1); + + tree.flush_active_memtable(0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("d", SeqNo::MAX)?; + assert!(value.is_none()); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("d", SeqNo::MAX)?; + 
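// NOTE: "d" was removed at seqno 1, so the read must miss even before major compaction.
+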
assert!(value.is_none()); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert(0, FragmentationEntry::new(1, big_value.len() as u64)); + map + }, + &*gc_stats, + ); + } + + tree.major_compact(64_000_000, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + let value = tree.get("a", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("b", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, big_value); + let value = tree.get("c", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + let value = tree.get("d", SeqNo::MAX)?; + assert!(value.is_none()); + let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); + assert_eq!(&*value, b"smol"); + } + + Ok(()) +} From 9f35f1dbb5b72ea73309a735412a2684bf6835a0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:33:58 +0200 Subject: [PATCH 529/613] change vlog fd caching to need an Arc clone less --- src/vlog/accessor.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/vlog/accessor.rs b/src/vlog/accessor.rs index fdf7ef2f..2be6bf54 100644 --- a/src/vlog/accessor.rs +++ b/src/vlog/accessor.rs @@ -43,19 +43,24 @@ impl<'a> Accessor<'a> { let bf_id = GlobalSegmentId::from((tree_id, blob_file.id())); - let file = if let Some(fd) = descriptor_table.access_for_blob_file(&bf_id) { + let cached_fd = descriptor_table.access_for_blob_file(&bf_id); + let fd_cache_miss = cached_fd.is_none(); + + let file = if let Some(fd) = cached_fd { fd } else { - let file = Arc::new(File::open( + Arc::new(File::open( base_path.join(vhandle.blob_file_id.to_string()), - )?); - descriptor_table.insert_for_blob_file(bf_id, file.clone()); - file + )?) 
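+            // NOTE: On a cache miss, the fd is registered in the descriptor
+            // table only after the blob has been read successfully; moving the
+            // Arc into the table there (instead of cloning it here) saves one
+            // Arc clone on the miss path.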
}; let value = Reader::new(blob_file, &file).get(key, vhandle)?; cache.insert_blob(tree_id, vhandle, value.clone()); + if fd_cache_miss { + descriptor_table.insert_for_blob_file(bf_id, file); + } + Ok(Some(value)) } } From 498f6a461b444ead7b339d493cd9e9169d1e6711 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:34:02 +0200 Subject: [PATCH 530/613] fmt --- src/vlog/blob_file/merge.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vlog/blob_file/merge.rs b/src/vlog/blob_file/merge.rs index 4891c7ee..56681c38 100644 --- a/src/vlog/blob_file/merge.rs +++ b/src/vlog/blob_file/merge.rs @@ -127,14 +127,14 @@ mod tests { (Slice::from(b"a"), Slice::from(b"1".repeat(100))), merger .next() - .map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) + .map(|result| result.map(|(entry, _)| (entry.key, entry.value))) .unwrap()?, ); assert_eq!( (Slice::from(b"a"), Slice::from(b"0".repeat(100))), merger .next() - .map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) + .map(|result| result.map(|(entry, _)| (entry.key, entry.value))) .unwrap()?, ); @@ -192,7 +192,7 @@ mod tests { (Slice::from(key), Slice::from(key.repeat(100))), merger .next() - .map(|result| result.map(|(entry, _)| { (entry.key, entry.value) })) + .map(|result| result.map(|(entry, _)| (entry.key, entry.value))) .unwrap()?, ); } From 6f83c59dd19e9f41c5e841bef131dbe88e4d8be6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:34:27 +0200 Subject: [PATCH 531/613] refactor: compactor blob drain logic --- src/compaction/flavour.rs | 137 ++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 73 deletions(-) diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index 49627693..c3d9e4f4 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -100,6 +100,49 @@ pub struct RelocatingCompaction { rewriting_blob_files: Vec, } +impl RelocatingCompaction { + pub fn new( + inner: StandardCompaction, + blob_scanner: Peekable, + blob_writer: BlobFileWriter, + rewriting_blob_file_ids: HashSet, + rewriting_blob_files: Vec, + ) -> Self { + Self { + inner, + blob_scanner, + blob_writer, + rewriting_blob_file_ids, + rewriting_blob_files, + } + } + + /// Drains all blobs that come "before" the given vptr. 
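+    ///
+    /// Scanner entries are consumed while their key, blob file ID or offset
+    /// do not yet match the indirection, leaving the scanner positioned at
+    /// the exact blob the vptr points to.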
+ fn drain_blobs(&mut self, key: &[u8], vptr: &BlobIndirection) -> crate::Result<()> { + loop { + let Some(blob) = self.blob_scanner.next_if(|x| match x { + Ok((entry, blob_file_id)) => { + entry.key != key + || (*blob_file_id != vptr.vhandle.blob_file_id) + || (entry.offset < vptr.vhandle.offset) + } + Err(_) => true, + }) else { + break; + }; + + match blob { + Ok((entry, _)) => { + assert!((entry.key <= key), "vptr was not matched with blob"); + } + Err(e) => return Err(e), + } + } + + Ok(()) + } +} + impl CompactionFlavour for RelocatingCompaction { fn write(&mut self, item: InternalValue) -> crate::Result<()> { if item.key.value_type.is_indirection() { @@ -110,7 +153,7 @@ impl CompactionFlavour for RelocatingCompaction { return Ok(()); }; - log::debug!( + log::trace!( "{:?}:{} => encountered indirection: {indirection:?}", item.key.user_key, item.key.seqno, @@ -120,66 +163,32 @@ impl CompactionFlavour for RelocatingCompaction { .rewriting_blob_file_ids .contains(&indirection.vhandle.blob_file_id) { - loop { - // TODO: uglyyyy - let blob = self - .blob_scanner - .peek() - .expect("should have enough blob entries"); - - if let Ok((entry, blob_file_id)) = blob { - if self.rewriting_blob_file_ids.contains(blob_file_id) { - // This blob is part of the rewritten blob files - if entry.key < item.key.user_key { - self.blob_scanner.next().expect("should exist")?; - continue; - } - - if entry.key == item.key.user_key { - if *blob_file_id < indirection.vhandle.blob_file_id { - self.blob_scanner.next().expect("should exist")?; - continue; - } - if entry.offset < indirection.vhandle.offset { - self.blob_scanner.next().expect("should exist")?; - continue; - } - if entry.offset == indirection.vhandle.offset { - // This is the blob we need - break; - } - } - assert!( - (entry.key > item.key.user_key), - "we passed vptr without getting blob", - ); - break; - } - - break; - } - - let e = self.blob_scanner.next().expect("should exist"); - return Err(e.expect_err("should be error")); - } + self.drain_blobs(&item.key.user_key, &indirection)?; - let blob = self.blob_scanner.next().expect("should have blob")?; + let (blob_entry, blob_file_id) = self + .blob_scanner + .next() + .expect("vptr was not matched with blob (scanner is unexpectedly exhausted)")?; - log::info!( + debug_assert_eq!(blob_file_id, indirection.vhandle.blob_file_id); + debug_assert_eq!(blob_entry.key, item.key.user_key); + debug_assert_eq!(blob_entry.offset, indirection.vhandle.offset); + + log::trace!( "=> use blob: {:?}:{} offset: {} from BF {}", - blob.0.key, - blob.0.seqno, - blob.0.offset, - blob.1, + blob_entry.key, + blob_entry.seqno, + blob_entry.offset, + blob_file_id, ); indirection.vhandle.blob_file_id = self.blob_writer.blob_file_id(); indirection.vhandle.offset = self.blob_writer.offset(); - log::debug!("RELOCATE to {indirection:?}"); + log::trace!("RELOCATE to {indirection:?}"); self.blob_writer - .write(&item.key.user_key, item.key.seqno, &blob.0.value)?; + .write(&item.key.user_key, item.key.seqno, &blob_entry.value)?; self.inner .table_writer @@ -232,7 +241,7 @@ impl CompactionFlavour for RelocatingCompaction { levels.atomic_swap( |current| { - current.with_merge( + Ok(current.with_merge( &payload.segment_ids.iter().copied().collect::>(), &created_tables, payload.dest_level as usize, @@ -243,7 +252,7 @@ impl CompactionFlavour for RelocatingCompaction { }, created_blob_files, blob_file_ids_to_drop, - ) + )) }, opts.eviction_seqno, )?; @@ -263,24 +272,6 @@ impl CompactionFlavour for RelocatingCompaction { } } -impl 
RelocatingCompaction { - pub fn new( - inner: StandardCompaction, - blob_scanner: Peekable, - blob_writer: BlobFileWriter, - rewriting_blob_file_ids: HashSet, - rewriting_blob_files: Vec, - ) -> Self { - Self { - inner, - blob_scanner, - blob_writer, - rewriting_blob_file_ids, - rewriting_blob_files, - } - } -} - /// Standard compaction worker that just passes through all its data pub struct StandardCompaction { start: Instant, @@ -364,7 +355,7 @@ impl CompactionFlavour for StandardCompaction { levels.atomic_swap( |current| { - current.with_merge( + Ok(current.with_merge( &payload.segment_ids.iter().copied().collect::>(), &created_segments, payload.dest_level as usize, @@ -378,7 +369,7 @@ impl CompactionFlavour for StandardCompaction { .iter() .map(BlobFile::id) .collect::>(), - ) + )) }, opts.eviction_seqno, )?; From eb873f7bd5fe77ce2d163796faccb22c261905b2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:35:02 +0200 Subject: [PATCH 532/613] don't rewrite dead blob files just drop them instead --- src/compaction/worker.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 49afb991..1f5e253b 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -253,6 +253,8 @@ fn merge_segments( let table_writer = super::flavour::prepare_table_writer(levels.current_version(), opts, payload)?; + let start = Instant::now(); + let mut compactor = match &opts.config.kv_separation_opts { Some(blob_opts) => { merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); @@ -278,6 +280,10 @@ fn merge_segments( .filter(|blob_file| { blob_file.is_stale(version.gc_stats(), 0.25 /* TODO: option */) }) + .filter(|blob_file| { + // NOTE: Dead blob files are dropped anyway during Version change commit + !blob_file.is_dead(version.gc_stats()) + }) .collect::>() .into_iter() .collect::>(); @@ -285,15 +291,6 @@ fn merge_segments( linked_blob_files.sort_by_key(|a| a.id()); // TODO: 3.0.0 ^- age cutoff - // TODO: 3.0.0 remove - log::debug!( - "Maybe rewrite blob files: {:#?}", - linked_blob_files - .iter() - .map(|bf| bf.id()) - .collect::>(), - ); - // NOTE: If there is any table not part of our compaction input // that also points to the blob file, we cannot rewrite the blob file for table in version.iter_segments() { @@ -362,10 +359,12 @@ fn merge_segments( None => Box::new(StandardCompaction::new(table_writer, segments)), }; + log::trace!("Blob file GC preparation done in {:?}", start.elapsed()); + levels.hide_segments(payload.segment_ids.iter().copied()); // IMPORTANT: Free lock so the compaction (which may go on for a while) - // does not block possible other compactions and reads + // does not block possible other compactions and writes/reads drop(levels); for (idx, item) in merge_iter.enumerate() { From 6be58e49e1b2eb996b3c1159fc62387e75d86107 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:42:09 +0200 Subject: [PATCH 533/613] wip --- src/config/mod.rs | 22 +++++++++++----------- src/segment/meta.rs | 17 +++++++++++++++++ src/segment/writer/mod.rs | 4 ++-- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 408c3f8e..70312243 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -280,17 +280,17 @@ impl Config { self } - /// Sets the restart interval inside index blocks. - /// - /// A higher restart interval saves space while increasing lookup times - /// inside index blocks. 
- /// - /// Default = 1 - #[must_use] - pub fn index_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self { - self.index_block_restart_interval_policy = policy; - self - } + // /// Sets the restart interval inside index blocks. + // /// + // /// A higher restart interval saves space while increasing lookup times + // /// inside index blocks. + // /// + // /// Default = 1 + // #[must_use] + // pub fn index_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self { + // self.index_block_restart_interval_policy = policy; + // self + // } /// Sets the filter construction policy. #[must_use] diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 983bccb3..dc99c622 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -49,6 +49,17 @@ pub struct ParsedMeta { pub index_block_compression: CompressionType, } +macro_rules! read_u8 { + ($block:expr, $name:expr) => {{ + let bytes = $block + .point_read($name, SeqNo::MAX) + .unwrap_or_else(|| panic!("meta property {:?} should exist", $name)); + + let mut bytes = &bytes.value[..]; + bytes.read_u8()? + }}; +} + macro_rules! read_u64 { ($block:expr, $name:expr) => {{ let bytes = $block @@ -119,6 +130,12 @@ impl ParsedMeta { ); } + assert_eq!( + read_u8!(block, b"#restart_interval#index"), + 1, + "index block restart intervals >1 are not supported for this version", + ); + let id = read_u64!(block, b"#id"); let item_count = read_u64!(block, b"#item_count"); let tombstone_count = read_u64!(block, b"#tombstone_count"); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index fcfb3524..95bc9413 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -418,8 +418,8 @@ impl Writer { self.meta.first_key.as_ref().expect("should exist"), ), meta("#key_count", &(self.meta.key_count as u64).to_le_bytes()), - meta("#prefix_truncation#data", &[1]), - meta("#prefix_truncation#index", &[0]), + meta("#prefix_truncation#data", &[1]), // NOTE: currently prefix truncation can not be disabled + meta("#prefix_truncation#index", &[1]), // NOTE: currently prefix truncation can not be disabled meta( "#restart_interval#data", &self.data_block_restart_interval.to_le_bytes(), From baed8b2358b0703bf890d2340170c957c6cd89b9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 5 Oct 2025 21:45:09 +0200 Subject: [PATCH 534/613] wip --- src/config/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/config/mod.rs b/src/config/mod.rs index 70312243..00cb47d0 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -280,6 +280,7 @@ impl Config { self } + // TODO: not supported yet in index blocks // /// Sets the restart interval inside index blocks. 
// /// // /// A higher restart interval saves space while increasing lookup times From 87c02544467e00ab3f1b4b3a939ab663703f26da Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 20:14:22 +0200 Subject: [PATCH 535/613] add blob checksum error logs --- src/vlog/blob_file/reader.rs | 4 ++++ src/vlog/blob_file/scanner.rs | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 0d206c68..01bec7b0 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -85,6 +85,10 @@ impl<'a> Reader<'a> { }; if expected_checksum != checksum { + log::error!( + "Checksum mismatch for blob {vhandle:?}, got={checksum}, expected={expected_checksum}", + ); + return Err(crate::Error::ChecksumMismatch { got: Checksum::from_raw(checksum), expected: Checksum::from_raw(expected_checksum), diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 19d3f39f..9a23311e 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -122,6 +122,11 @@ impl Iterator for Scanner { }; if expected_checksum != checksum { + log::error!( + "Checksum mismatch for blob>{}@{offset}, got={checksum}, expected={expected_checksum}", + self.blob_file_id, + ); + return Some(Err(crate::Error::ChecksumMismatch { got: Checksum::from_raw(checksum), expected: Checksum::from_raw(expected_checksum), From fbaf6e6c11ab872a8ecd00d74248aaf2412f236e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 21:53:43 +0200 Subject: [PATCH 536/613] adjust kv opts config --- src/blob_tree/mod.rs | 4 ++-- src/config/mod.rs | 51 ++++++++++++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index c33b7fa6..248c91a4 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -426,7 +426,7 @@ impl AbstractTree for BlobTree { .kv_separation_opts .as_ref() .expect("blob options should exist") - .blob_compression, + .compression, ); let iter = memtable.iter().map(Ok); @@ -441,7 +441,7 @@ impl AbstractTree for BlobTree { .kv_separation_opts .as_ref() .expect("kv separation options should exist") - .blob_file_separation_threshold; + .separation_threshold; for item in compaction_filter { let item = item?; diff --git a/src/config/mod.rs b/src/config/mod.rs index 00cb47d0..31b9e890 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -61,24 +61,35 @@ const DEFAULT_FILE_FOLDER: &str = ".lsm.data"; #[derive(Clone, Debug)] pub struct KvSeparationOptions { /// What type of compression is used for blobs - pub blob_compression: CompressionType, + pub compression: CompressionType, /// Blob file (value log segment) target size in bytes #[doc(hidden)] - pub blob_file_target_size: u64, + pub file_target_size: u64, /// Key-value separation threshold in bytes #[doc(hidden)] - pub blob_file_separation_threshold: u32, - // TODO: blob_file_staleness_threshold AND/OR space_amp_threshold + pub separation_threshold: u32, + + pub(crate) staleness_threshold: f32, + + pub(crate) age_cutoff: f32, // TODO: 3.0.0 } impl Default for KvSeparationOptions { fn default() -> Self { Self { - blob_compression: CompressionType::None, // TODO: LZ4 - blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, - blob_file_separation_threshold: /* 1 KiB */ 1_024, + #[cfg(feature="lz4")] + compression: CompressionType::Lz4, + + #[cfg(not(feature="lz4"))] + compression: CompressionType::None, + + file_target_size: /* 256 MiB */ 256 * 1_024 * 1_024, + 
separation_threshold: /* 1 KiB */ 1_024, + + staleness_threshold: 0.25, + age_cutoff: 0.25, } } } @@ -86,8 +97,8 @@ impl Default for KvSeparationOptions { impl KvSeparationOptions { /// Sets the blob compression method. #[must_use] - pub fn blob_compression(mut self, compression: CompressionType) -> Self { - self.blob_compression = compression; + pub fn compression(mut self, compression: CompressionType) -> Self { + self.compression = compression; self } @@ -100,11 +111,9 @@ impl KvSeparationOptions { /// overhead. /// /// Defaults to 64 MiB. - /// - /// This option has no effect when not used for opening a blob tree. #[must_use] - pub fn blob_file_target_size(mut self, bytes: u64) -> Self { - self.blob_file_target_size = bytes; + pub fn file_target_size(mut self, bytes: u64) -> Self { + self.file_target_size = bytes; self } @@ -114,11 +123,21 @@ impl KvSeparationOptions { /// at the cost of lower read performance. /// /// Defaults to 4KiB. + #[must_use] + pub fn separation_threshold(mut self, bytes: u32) -> Self { + self.separation_threshold = bytes; + self + } + + /// Sets the staleness threshold percentage. + /// + /// The staleness percentage determines how much a blob file needs to be fragmented to be + /// picked up by the garbage collection. /// - /// This option has no effect when not used for opening a blob tree. + /// Defaults to 25%. #[must_use] - pub fn blob_file_separation_threshold(mut self, bytes: u32) -> Self { - self.blob_file_separation_threshold = bytes; + pub fn staleness_threshold(mut self, ratio: f32) -> Self { + self.staleness_threshold = ratio; self } } From e127dcd154c60c3097df1399844398ba1dbe0c60 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 21:54:02 +0200 Subject: [PATCH 537/613] hide some stuff from config --- src/config/mod.rs | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 31b9e890..6b441857 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -333,22 +333,23 @@ impl Config { self } - /// Sets the number of levels of the LSM tree (depth of tree). - /// - /// Defaults to 7, like `LevelDB` and `RocksDB`. - /// - /// Cannot be changed once set. - /// - /// # Panics - /// - /// Panics if `n` is 0. - #[must_use] - pub fn level_count(mut self, n: u8) -> Self { - assert!(n > 0); + // TODO: level count is fixed to 7 right now + // /// Sets the number of levels of the LSM tree (depth of tree). + // /// + // /// Defaults to 7, like `LevelDB` and `RocksDB`. + // /// + // /// Cannot be changed once set. + // /// + // /// # Panics + // /// + // /// Panics if `n` is 0. + // #[must_use] + // pub fn level_count(mut self, n: u8) -> Self { + // assert!(n > 0); - self.level_count = n; - self - } + // self.level_count = n; + // self + // } /// Sets the data block size policy. #[must_use] @@ -357,12 +358,13 @@ impl Config { self } - /// Sets the index block size policy. - #[must_use] - pub fn index_block_size_policy(mut self, policy: BlockSizePolicy) -> Self { - self.index_block_size_policy = policy; - self - } + // TODO: 3.0.0 does nothing until we have partitioned indexes + // /// Sets the index block size policy. + // #[must_use] + // pub fn index_block_size_policy(mut self, policy: BlockSizePolicy) -> Self { + // self.index_block_size_policy = policy; + // self + // } /// Sets the hash ratio policy for data blocks. 
/// From 718b7818bf7948ddbe0ab5389aee36aacd09455c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 21:54:16 +0200 Subject: [PATCH 538/613] adjust tests --- tests/blob_sep_threshold.rs | 2 +- tests/mvcc_slab.rs | 4 ++-- tests/segment_point_reads.rs | 8 ++++---- tests/segment_range.rs | 6 +++--- tests/segment_range_oob.rs | 2 +- tests/segment_remove_weak.rs | 2 +- tests/snapshot_point_read.rs | 8 ++++---- tests/tree_different_block_size.rs | 8 ++++---- tests/tree_disjoint_point_read.rs | 8 ++++---- tests/tree_non_disjoint_point_read.rs | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index 1c26a11a..ab643d91 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -9,7 +9,7 @@ fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { let tree = lsm_tree::Config::new(path) .with_kv_separation(Some( - KvSeparationOptions::default().blob_file_separation_threshold(1_024), + KvSeparationOptions::default().separation_threshold(1_024), )) .open()?; diff --git a/tests/mvcc_slab.rs b/tests/mvcc_slab.rs index 45ef7ee3..51ce4638 100644 --- a/tests/mvcc_slab.rs +++ b/tests/mvcc_slab.rs @@ -9,7 +9,7 @@ fn segment_reader_mvcc_slab() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -46,7 +46,7 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .with_kv_separation(Some(Default::default())) .open()?; diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index 46c770a9..6c169dd3 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -9,7 +9,7 @@ fn segment_point_reads() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { @@ -33,7 +33,7 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { @@ -75,7 +75,7 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let keys = [0, 1, 2] @@ -122,7 +122,7 @@ fn blob_tree_segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .with_kv_separation(Some(Default::default())) .open()?; diff --git a/tests/segment_range.rs b/tests/segment_range.rs index 6824a876..907ad202 100644 --- a/tests/segment_range.rs +++ b/tests/segment_range.rs @@ -9,7 +9,7 @@ fn segment_ranges() -> lsm_tree::Result<()> { let tree = 
Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { @@ -67,7 +67,7 @@ fn segment_range_last_back() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let value = (0..2_000).map(|_| 0).collect::>(); @@ -102,7 +102,7 @@ fn segment_range_last_back_2() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let value = (0..2_000).map(|_| 0).collect::>(); diff --git a/tests/segment_range_oob.rs b/tests/segment_range_oob.rs index a9ca589e..c9c0d28f 100644 --- a/tests/segment_range_oob.rs +++ b/tests/segment_range_oob.rs @@ -31,7 +31,7 @@ fn segment_range_out_of_bounds_hi() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.keep(); let tree = Config::new(folder) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; for x in 0..ITEM_COUNT as u64 { diff --git a/tests/segment_remove_weak.rs b/tests/segment_remove_weak.rs index f1125819..bca86a02 100644 --- a/tests/segment_remove_weak.rs +++ b/tests/segment_remove_weak.rs @@ -7,7 +7,7 @@ fn segment_remove_weak_simple() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index d1e8cdd2..da747ddf 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -7,7 +7,7 @@ fn snapshot_404() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); @@ -42,7 +42,7 @@ fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let key = "abc"; @@ -81,7 +81,7 @@ fn snapshot_disk_point_reads() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -138,7 +138,7 @@ fn snapshot_disk_and_memtable_reads() -> lsm_tree::Result<()> { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; let seqno = SequenceNumberCounter::default(); diff --git a/tests/tree_different_block_size.rs b/tests/tree_different_block_size.rs index 6df806de..91258689 100644 --- a/tests/tree_different_block_size.rs +++ b/tests/tree_different_block_size.rs @@ -10,7 +10,7 @@ fn tree_block_size_after_recovery() -> 
lsm_tree::Result<()> { { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(2_048)) - .index_block_size_policy(BlockSizePolicy::all(2_048)) + // .index_block_size_policy(BlockSizePolicy::all(2_048)) .open()?; let seqno = SequenceNumberCounter::default(); @@ -29,7 +29,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(2_048)) - .index_block_size_policy(BlockSizePolicy::all(2_048)) + // .index_block_size_policy(BlockSizePolicy::all(2_048)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } @@ -37,7 +37,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(4_096)) - .index_block_size_policy(BlockSizePolicy::all(4_096)) + // .index_block_size_policy(BlockSizePolicy::all(4_096)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } @@ -45,7 +45,7 @@ fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { { let tree = Config::new(&folder) .data_block_size_policy(BlockSizePolicy::all(78_652)) - .index_block_size_policy(BlockSizePolicy::all(78_652)) + // .index_block_size_policy(BlockSizePolicy::all(78_652)) .open()?; assert_eq!(ITEM_COUNT, tree.len(SeqNo::MAX, None)?); } diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index 7356653d..a369c24b 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -8,7 +8,7 @@ fn tree_disjoint_point_read() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); @@ -37,7 +37,7 @@ fn tree_disjoint_point_read_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .with_kv_separation(Some(Default::default())) .open()?; @@ -68,7 +68,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("z", "z", 0); @@ -123,7 +123,7 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .with_kv_separation(Some(Default::default())) .open()?; diff --git a/tests/tree_non_disjoint_point_read.rs b/tests/tree_non_disjoint_point_read.rs index 7935c91c..c93555fc 100644 --- a/tests/tree_non_disjoint_point_read.rs +++ b/tests/tree_non_disjoint_point_read.rs @@ -7,7 +7,7 @@ fn tree_non_disjoint_point_read() -> lsm_tree::Result<()> { let tree = Config::new(folder) .data_block_size_policy(BlockSizePolicy::all(1_024)) - .index_block_size_policy(BlockSizePolicy::all(1_024)) + // .index_block_size_policy(BlockSizePolicy::all(1_024)) .open()?; tree.insert("a", "a", 0); From fc3308f5e96023c4e945710d33301f8bb2e7a2aa Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 21:54:53 +0200 Subject: [PATCH 539/613] use blob config in compaction --- 
src/compaction/worker.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 1f5e253b..bf04bf7a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -278,7 +278,7 @@ fn merge_segments( .expect("blob file should exist") }) .filter(|blob_file| { - blob_file.is_stale(version.gc_stats(), 0.25 /* TODO: option */) + blob_file.is_stale(version.gc_stats(), blob_opts.staleness_threshold) }) .filter(|blob_file| { // NOTE: Dead blob files are dropped anyway during Version change commit @@ -337,13 +337,12 @@ fn merge_segments( .collect::>>()?, ); - // TODO: we need to relocate blob files without decompressing - // TODO: BUT the meta needs to store the compression type let writer = BlobFileWriter::new( opts.blob_file_id_generator.clone(), - blob_opts.blob_file_target_size, + blob_opts.file_target_size, opts.config.path.join(BLOBS_FOLDER), - )?; + )? + .use_compression(blob_opts.compression); let inner = StandardCompaction::new(table_writer, segments); From fac206c495a867e62d0aaaf5f81dccbd1d8d6a59 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Oct 2025 21:55:00 +0200 Subject: [PATCH 540/613] test: add blob compression test case --- tests/blob_compression.rs | 104 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 tests/blob_compression.rs diff --git a/tests/blob_compression.rs b/tests/blob_compression.rs new file mode 100644 index 00000000..2796a74f --- /dev/null +++ b/tests/blob_compression.rs @@ -0,0 +1,104 @@ +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, KvSeparationOptions, SeqNo}; +use test_log::test; + +#[test] +#[cfg(feature = "lz4")] +fn blob_tree_compression() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some( + KvSeparationOptions::default() + .compression(lsm_tree::CompressionType::Lz4) + .separation_threshold(1) + .staleness_threshold(0.0000001), + )) + .open()?; + + let big_value = b"abc".repeat(128_000); + + tree.insert("a", &big_value, 0); + tree.insert("b", b"smol", 0); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let value = tree.get("a", SeqNo::MAX)?.unwrap(); + assert_eq!(&*value, big_value); + + let value = tree.get("b", SeqNo::MAX)?.unwrap(); + assert_eq!(&*value, b"smol"); + } + + tree.remove("b", 1); + tree.flush_active_memtable(0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let value = tree.get("a", SeqNo::MAX)?.unwrap(); + assert_eq!(&*value, big_value); + + assert!(!tree.contains_key("b", SeqNo::MAX)?); + } + + tree.major_compact(u64::MAX, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + .current_version() + .gc_stats() + .clone(); + + assert_eq!( + &{ + let mut map = lsm_tree::HashMap::default(); + map.insert( + 0, + FragmentationEntry::new(1, b"smol".len().try_into().unwrap()), + ); + map + }, + &*gc_stats, + ); + } + + { + let value = tree.get("a", SeqNo::MAX)?.unwrap(); + assert_eq!(&*value, big_value); + + assert!(!tree.contains_key("b", SeqNo::MAX)?); + } + + tree.major_compact(u64::MAX, 1_000)?; + assert_eq!(1, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + { + let gc_stats = tree + .manifest() + .read() + .expect("lock is poisoned") + 
.current_version() + .gc_stats() + .clone(); + + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + + { + let value = tree.get("a", SeqNo::MAX)?.unwrap(); + assert_eq!(&*value, big_value); + + assert!(!tree.contains_key("b", SeqNo::MAX)?); + } + + Ok(()) +} From 299e63bb9f8da518e14e6aac4b7f9d8687fb5774 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 00:05:11 +0200 Subject: [PATCH 541/613] flip fields in blocks to skip a varint read --- src/segment/block/decoder.rs | 3 +-- src/segment/block/encoder.rs | 18 ++++++++---------- src/segment/block/trailer.rs | 24 ++++++++++++++++++------ src/segment/data_block/mod.rs | 18 ++++++++---------- 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index a950d230..be16ae0c 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -100,9 +100,8 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa "invalid binary index step size", ); - // TODO: flip len, offset - let binary_index_offset = unwrap!(reader.read_u32::()); let binary_index_len = unwrap!(reader.read_u32::()); + let binary_index_offset = unwrap!(reader.read_u32::()); Self { block, diff --git a/src/segment/block/encoder.rs b/src/segment/block/encoder.rs index 682ac037..e1b404b6 100644 --- a/src/segment/block/encoder.rs +++ b/src/segment/block/encoder.rs @@ -69,8 +69,7 @@ pub struct Encoder<'a, Context: Default, Item: Encodable> { pub(crate) restart_count: usize, pub(crate) restart_interval: u8, - pub(crate) use_prefix_truncation: bool, - + // pub(crate) use_prefix_truncation: bool, // TODO: support non-prefix truncation pub(crate) binary_index_builder: BinaryIndexBuilder, pub(crate) hash_index_builder: HashIndexBuilder, @@ -102,8 +101,7 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> restart_count: 0, restart_interval, - use_prefix_truncation: true, - + // use_prefix_truncation: true, binary_index_builder, hash_index_builder, @@ -111,14 +109,14 @@ impl<'a, Context: Default, Item: Encodable> Encoder<'a, Context, Item> } } - /// Toggles prefix truncation. - pub fn use_prefix_truncation(mut self, flag: bool) -> Self { - assert!(flag, "prefix truncation is currently required to be true"); + // /// Toggles prefix truncation. 
+ // pub fn use_prefix_truncation(mut self, flag: bool) -> Self { + // assert!(flag, "prefix truncation is currently required to be true"); - self.use_prefix_truncation = flag; + // self.use_prefix_truncation = flag; - self - } + // self + // } pub fn write(&mut self, item: &'a Item) -> crate::Result<()> { // NOTE: Check if we are a restart marker diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 74fb7c5d..c6728b5e 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -11,7 +11,12 @@ use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; pub const TRAILER_START_MARKER: u8 = 255; -const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + (2 * std::mem::size_of::()); +const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + + (2 * std::mem::size_of::()) + // Fixed key size (unused) + + std::mem::size_of::() + // Fixed value size (unused) + + std::mem::size_of::(); /// Block trailer /// @@ -107,10 +112,6 @@ impl<'a> Trailer<'a> { encoder.writer.write_u8(binary_index_step_size)?; - encoder - .writer - .write_u32::(binary_index_offset)?; - // NOTE: Even with a dense index, there can't be more index pointers than items #[allow(clippy::cast_possible_truncation)] encoder @@ -119,7 +120,7 @@ impl<'a> Trailer<'a> { encoder .writer - .write_u32::(hash_index_offset)?; + .write_u32::(binary_index_offset)?; encoder .writer @@ -129,6 +130,17 @@ impl<'a> Trailer<'a> { 0 })?; + encoder + .writer + .write_u32::(hash_index_offset)?; + + // Fixed key size (unused) + encoder.writer.write_u16::(0)?; + + // TODO: 3.0.0 what if value is actually 0...? we need another byte prob + // Fixed value size (unused) + encoder.writer.write_u32::(0)?; + #[cfg(debug_assertions)] assert_eq!( TRAILER_SIZE, diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 1496de4d..7eca7d1a 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -315,9 +315,8 @@ impl DataBlock { "invalid binary index step size", ); - // TODO: 3.0.0 flip len and offset - let binary_index_offset = unwrap!(reader.read_u32::()); let binary_index_len = unwrap!(reader.read_u32::()); + let binary_index_offset = unwrap!(reader.read_u32::()); BinaryIndexReader::new( &self.inner.data, @@ -343,18 +342,18 @@ impl DataBlock { let mut reader = unwrap!(trailer.as_slice().get(offset..)); - // TODO: 3.0.0 flip offset and len, so we can terminate after len if == 0 - let hash_index_offset = unwrap!(reader.read_u32::()); let hash_index_len = unwrap!(reader.read_u32::()); - if hash_index_len > 0 { + if hash_index_len == 0 { + None + } else { + let hash_index_offset = unwrap!(reader.read_u32::()); + Some(HashIndexReader::new( &self.inner.data, hash_index_offset, hash_index_len, )) - } else { - None } } @@ -447,9 +446,8 @@ impl DataBlock { let trailer = Trailer::new(&self.inner); - // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8), - // and binary index offset (u32) - let offset = size_of::() + (2 * size_of::()) + size_of::(); + // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) + let offset = size_of::() + (2 * size_of::()); let mut reader = unwrap!(trailer.as_slice().get(offset..)); unwrap!(reader.read_u32::()) From 77e41b5f4f9a8b4e433911632f89568bd8ef2b1a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 00:12:02 +0200 Subject: [PATCH 542/613] clippy --- src/tree/mod.rs | 29 +++++++++++------------------ src/vlog/blob_file/reader.rs | 8 ++++++-- 2 files changed, 17 insertions(+), 20 
deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 74242eeb..5ba5a533 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -28,7 +28,7 @@ use std::{ io::Cursor, ops::{Bound, RangeBounds}, path::Path, - sync::{atomic::AtomicU64, Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, }; #[cfg(feature = "metrics")] @@ -223,7 +223,7 @@ impl AbstractTree for Tree { } fn drop_range, R: RangeBounds>(&self, range: R) -> crate::Result<()> { - let (bounds, is_empty) = Self::range_bounds_to_owned_bounds(&range)?; + let (bounds, is_empty) = Self::range_bounds_to_owned_bounds(&range); if is_empty { return Ok(()); @@ -627,7 +627,7 @@ impl Tree { /// while still having access to the normalized bounds for non-empty cases. fn range_bounds_to_owned_bounds, R: RangeBounds>( range: &R, - ) -> crate::Result<(OwnedBounds, bool)> { + ) -> (OwnedBounds, bool) { use Bound::{Excluded, Included, Unbounded}; let start = match range.start_bound() { @@ -642,17 +642,14 @@ impl Tree { Unbounded => Unbounded, }; - let is_empty = if let (Included(lo), Included(hi)) - | (Included(lo), Excluded(hi)) - | (Excluded(lo), Included(hi)) - | (Excluded(lo), Excluded(hi)) = (&start, &end) - { - lo.as_ref() > hi.as_ref() - } else { - false - }; + let is_empty = + if let (Included(lo) | Excluded(lo), Included(hi) | Excluded(hi)) = (&start, &end) { + lo.as_ref() > hi.as_ref() + } else { + false + }; - Ok((OwnedBounds { start, end }, is_empty)) + (OwnedBounds { start, end }, is_empty) } /// Opens an LSM-tree in the given directory. @@ -686,10 +683,6 @@ impl Tree { Ok(tree) } - pub(crate) fn read_lock_active_memtable(&self) -> RwLockReadGuard<'_, Arc> { - self.active_memtable.read().expect("lock is poisoned") - } - pub(crate) fn consume_writer( &self, writer: crate::segment::Writer, @@ -1088,7 +1081,7 @@ impl Tree { } let segment_file_name = file_name.to_str().ok_or_else(|| { - log::error!("invalid segment file name {file_name:?}"); + log::error!("invalid segment file name {}", file_name.display()); crate::Error::Unrecoverable })?; diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 01bec7b0..03abd79d 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -52,11 +52,15 @@ impl<'a> Reader<'a> { let expected_checksum = reader.read_u128::()?; let _seqno = reader.read_u64::()?; - let key_len = reader.read_u16::()? as usize; + let key_len = reader.read_u16::()?; + + // NOTE: Used in feature flagged branch + #[allow(unused)] let real_val_len = reader.read_u32::()? as usize; + let _on_disk_val_len = reader.read_u32::()? 
as usize; - reader.seek(std::io::SeekFrom::Current(key_len as i64))?; + reader.seek(std::io::SeekFrom::Current(key_len.into()))?; let raw_data = value.slice((add_size as usize)..); From 29b9e1d23755b5b09276634affce8f7f9907f383 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 15:18:35 +0200 Subject: [PATCH 543/613] adjust blob config options --- src/config/mod.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 6b441857..9267dd70 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -73,7 +73,7 @@ pub struct KvSeparationOptions { pub(crate) staleness_threshold: f32, - pub(crate) age_cutoff: f32, // TODO: 3.0.0 + pub(crate) age_cutoff: f32, } impl Default for KvSeparationOptions { @@ -85,11 +85,11 @@ impl Default for KvSeparationOptions { #[cfg(not(feature="lz4"))] compression: CompressionType::None, - file_target_size: /* 256 MiB */ 256 * 1_024 * 1_024, + file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024, separation_threshold: /* 1 KiB */ 1_024, - staleness_threshold: 0.25, - age_cutoff: 0.25, + staleness_threshold: 0.33, + age_cutoff: 0.20, } } } @@ -134,12 +134,21 @@ impl KvSeparationOptions { /// The staleness percentage determines how much a blob file needs to be fragmented to be /// picked up by the garbage collection. /// - /// Defaults to 25%. + /// Defaults to 33%. #[must_use] pub fn staleness_threshold(mut self, ratio: f32) -> Self { self.staleness_threshold = ratio; self } + + /// Sets the age cutoff threshold. + /// + /// Defaults to 20%. + #[must_use] + pub fn age_cutoff(mut self, ratio: f32) -> Self { + self.age_cutoff = ratio; + self + } } #[derive(Clone)] From 8ba35dc0f1d23b8dc170b78d5e5cda6b0901a98d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 17:14:03 +0200 Subject: [PATCH 544/613] remove commented code --- src/abstract.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 666e96da..b2408e5a 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -143,10 +143,6 @@ pub trait AbstractTree { #[cfg(feature = "metrics")] fn metrics(&self) -> &Arc; - // TODO:? - /* #[doc(hidden)] - fn verify(&self) -> crate::Result; */ - /// Synchronously flushes a memtable to a disk segment. 
/// /// This method will not make the segment immediately available, From b55bee22990fd0d6ee73e890f682f0f8916a86a2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:17:13 +0200 Subject: [PATCH 545/613] change abstract tree api --- src/abstract.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index b2408e5a..65cee716 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -4,15 +4,12 @@ use crate::{ blob_tree::FragmentationMap, compaction::CompactionStrategy, config::TreeType, - iter_guard::IterGuardImpl, level_manifest::LevelManifest, segment::Segment, - tree::inner::MemtableId, vlog::BlobFile, AnyTree, BlobTree, Config, Guard, InternalValue, - KvPair, Memtable, SegmentId, SeqNo, SequenceNumberCounter, Tree, TreeId, UserKey, UserValue, + iter_guard::IterGuardImpl, segment::Segment, tree::inner::MemtableId, version::Version, + vlog::BlobFile, AnyTree, BlobTree, Config, Guard, InternalValue, KvPair, Memtable, SegmentId, + SeqNo, SequenceNumberCounter, Tree, TreeId, UserKey, UserValue, }; use enum_dispatch::enum_dispatch; -use std::{ - ops::RangeBounds, - sync::{Arc, RwLock, RwLockWriteGuard}, -}; +use std::{ops::RangeBounds, sync::Arc}; pub type RangeItem = crate::Result; @@ -30,7 +27,7 @@ pub trait AbstractTree { fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result>; #[doc(hidden)] - fn manifest(&self) -> &Arc>; + fn current_version(&self) -> Version; /// Synchronously flushes the active memtable to a disk segment. /// @@ -172,9 +169,6 @@ pub trait AbstractTree { seqno_threshold: SeqNo, ) -> crate::Result<()>; - /// Write-locks the active memtable for exclusive access - fn lock_active_memtable(&self) -> RwLockWriteGuard<'_, Arc>; - /// Clears the active memtable atomically. fn clear_active_memtable(&self); From 99d3fb625526f109cfa931bec1216a1b84f644dc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:18:16 +0200 Subject: [PATCH 546/613] introduce super version and refactor sealed memtables list --- src/tree/inner.rs | 67 ++++++++++++++++++++-------------------------- src/tree/mod.rs | 5 ++-- src/tree/sealed.rs | 33 +++++++++++++++++++++++ 3 files changed, 65 insertions(+), 40 deletions(-) create mode 100644 src/tree/sealed.rs diff --git a/src/tree/inner.rs b/src/tree/inner.rs index 52d16abb..7fb99e62 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -4,9 +4,9 @@ use crate::{ config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal, - SegmentId, SequenceNumberCounter, + tree::sealed::SealedMemtables, SegmentId, SequenceNumberCounter, }; -use std::sync::{atomic::AtomicU64, Arc, RwLock}; +use std::sync::{atomic::AtomicU64, Arc, Mutex, RwLock}; #[cfg(feature = "metrics")] use crate::metrics::Metrics; @@ -21,37 +21,23 @@ pub type TreeId = u64; /// Memtable IDs map one-to-one to some segment. pub type MemtableId = u64; -/// Stores references to all sealed memtables -/// -/// Memtable IDs are monotonically increasing, so we don't really -/// need a search tree; also there are only a handful of them at most. 
-#[derive(Default)] -pub struct SealedMemtables(Vec<(MemtableId, Arc)>); - -impl SealedMemtables { - pub fn add(&mut self, id: MemtableId, memtable: Arc) { - self.0.push((id, memtable)); - } - - pub fn remove(&mut self, id_to_remove: MemtableId) { - self.0.retain(|(id, _)| *id != id_to_remove); - } - - pub fn iter(&self) -> impl DoubleEndedIterator)> { - self.0.iter() - } - - pub fn len(&self) -> usize { - self.0.len() - } -} - /// Hands out a unique (monotonically increasing) tree ID. pub fn get_next_tree_id() -> TreeId { static TREE_ID_COUNTER: AtomicU64 = AtomicU64::new(0); TREE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) } +pub struct SuperVersion { + /// Active memtable that is being written to + pub(crate) active_memtable: Arc, + + /// Frozen memtables that are being flushed + pub(crate) sealed_memtables: Arc, + + /// Current tree version + pub(crate) manifest: LevelManifest, +} + #[allow(clippy::module_name_repetitions)] pub struct TreeInner { /// Unique tree ID @@ -65,14 +51,7 @@ pub struct TreeInner { /// Hands out a unique (monotonically increasing) blob file ID pub(crate) blob_file_id_generator: SequenceNumberCounter, - /// Active memtable that is being written to - pub(crate) active_memtable: Arc>>, - - /// Frozen memtables that are being flushed - pub(crate) sealed_memtables: Arc>, - - /// Current tree version - pub(super) manifest: Arc>, + pub(crate) super_version: Arc>, /// Tree configuration pub config: Config, @@ -81,8 +60,16 @@ pub struct TreeInner { /// will interrupt the compaction and kill the worker. pub(crate) stop_signal: StopSignal, + /// Used by major compaction to ensure it is the only compaction running. + /// + /// Minor compactions take `major_compaction_lock.read()` instead, so they + /// can run concurrently alongside each other. 
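+ /// + /// A minimal sketch of the intended usage; the call sites below are + /// hypothetical, for illustration only: + /// + /// ```ignore + /// let _major = tree.major_compaction_lock.write().expect("lock is poisoned"); + /// let _minor = tree.major_compaction_lock.read().expect("lock is poisoned"); + /// ```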
pub(crate) major_compaction_lock: RwLock<()>, + // TODO: 3.0.0 compaction state + // Serializes compactions when they look at the tree levels and prepare compactions + pub(crate) compaction_lock: Arc>, + #[doc(hidden)] #[cfg(feature = "metrics")] pub metrics: Arc, @@ -97,11 +84,15 @@ impl TreeInner { segment_id_counter: Arc::new(AtomicU64::default()), blob_file_id_generator: SequenceNumberCounter::default(), config, - active_memtable: Arc::default(), - sealed_memtables: Arc::default(), - manifest: Arc::new(RwLock::new(manifest)), + super_version: Arc::new(RwLock::new(SuperVersion { + active_memtable: Arc::default(), + sealed_memtables: Arc::default(), + manifest, + })), stop_signal: StopSignal::default(), major_compaction_lock: RwLock::default(), + compaction_lock: Arc::default(), + #[cfg(feature = "metrics")] metrics: Metrics::default().into(), }) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 5ba5a533..4fc86ce9 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -4,6 +4,7 @@ pub mod ingest; pub mod inner; +mod sealed; use crate::{ blob_tree::FragmentationMap, @@ -23,12 +24,12 @@ use crate::{ AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, TreeType, UserKey, UserValue, ValueType, }; -use inner::{MemtableId, SealedMemtables, TreeId, TreeInner}; +use inner::{MemtableId, TreeId, TreeInner}; use std::{ io::Cursor, ops::{Bound, RangeBounds}, path::Path, - sync::{atomic::AtomicU64, Arc, RwLock, RwLockWriteGuard}, + sync::{atomic::AtomicU64, Arc, RwLock}, }; #[cfg(feature = "metrics")] diff --git a/src/tree/sealed.rs b/src/tree/sealed.rs new file mode 100644 index 00000000..fa48fbad --- /dev/null +++ b/src/tree/sealed.rs @@ -0,0 +1,33 @@ +use crate::{tree::inner::MemtableId, Memtable}; +use std::sync::Arc; + +/// Stores references to all sealed memtables +/// +/// Memtable IDs are monotonically increasing, so we don't really +/// need a search tree; also there are only a handful of them at most. +#[derive(Clone, Default)] +pub struct SealedMemtables(Vec<(MemtableId, Arc)>); + +impl SealedMemtables { + /// Copy-on-write: returns a new list with the additional memtable appended. + pub fn add(&self, id: MemtableId, memtable: Arc) -> Self { + let mut copy = self.clone(); + copy.0.push((id, memtable)); + copy + } + + /// Copy-on-write: returns a new list with the specified memtable removed. 
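+ /// + /// Like `add`, this leaves `self` untouched; the caller swaps the + /// returned list into the `SuperVersion` under its write lock + /// (see `register_segments`).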
+ pub fn remove(&self, id_to_remove: MemtableId) -> Self { + let mut copy = self.clone(); + copy.0.retain(|(id, _)| *id != id_to_remove); + copy + } + + pub fn iter(&self) -> impl DoubleEndedIterator)> { + self.0.iter() + } + + pub fn len(&self) -> usize { + self.0.len() + } +} From 93a763711ce515b08c594378c8f556df73b3213f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:19:02 +0200 Subject: [PATCH 547/613] adjust tests --- tests/blob_drop_range_gc_stats.rs | 8 +--- tests/blob_major_compact_drop_dead_files.rs | 24 ++--------- tests/blob_major_compact_gc_stats.rs | 26 ++--------- tests/blob_major_compact_relink.rs | 10 +---- tests/blob_major_compact_relocation.rs | 48 +++------------------ tests/blob_recover_gc_stats.rs | 16 +------ tests/compaction_readers_grouping.rs | 12 ------ tests/multi_trees.rs | 6 --- tests/mvcc_slab.rs | 10 ++--- tests/tree_disjoint_point_read.rs | 22 +--------- tests/tree_flush_eviction.rs | 10 +---- 11 files changed, 26 insertions(+), 166 deletions(-) diff --git a/tests/blob_drop_range_gc_stats.rs b/tests/blob_drop_range_gc_stats.rs index ae8b7d73..fba6a2ab 100644 --- a/tests/blob_drop_range_gc_stats.rs +++ b/tests/blob_drop_range_gc_stats.rs @@ -30,13 +30,7 @@ fn blob_tree_drop_range_gc_stats() -> lsm_tree::Result<()> { assert_eq!(0, tree.blob_file_count()); assert_eq!(0, tree.segment_count()); - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 was dropped assert_eq!( diff --git a/tests/blob_major_compact_drop_dead_files.rs b/tests/blob_major_compact_drop_dead_files.rs index c8e2414a..f25e447d 100644 --- a/tests/blob_major_compact_drop_dead_files.rs +++ b/tests/blob_major_compact_drop_dead_files.rs @@ -45,13 +45,7 @@ fn blob_tree_major_compact_drop_dead_files() -> lsm_tree::Result<()> { assert_eq!(&*value, new_big_value); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } @@ -64,13 +58,7 @@ fn blob_tree_major_compact_drop_dead_files() -> lsm_tree::Result<()> { assert_eq!(&*value, new_big_value); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!( &{ @@ -90,13 +78,7 @@ fn blob_tree_major_compact_drop_dead_files() -> lsm_tree::Result<()> { assert_eq!(1, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs index e01f5eb4..c9d2a16d 100644 --- a/tests/blob_major_compact_gc_stats.rs +++ b/tests/blob_major_compact_gc_stats.rs @@ -37,13 +37,7 @@ fn blob_tree_major_compact_gc_stats() -> lsm_tree::Result<()> { assert_eq!(1, tree.segment_count()); assert_eq!(2, tree.blob_file_count()); - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 is expired assert_eq!( @@ -95,10 +89,7 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> 
{ bytes: 2 * big_value.len() as u64, len: 2, }]), - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .iter_segments() .nth(1) .unwrap() @@ -111,13 +102,7 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { assert_eq!(1, tree.segment_count()); assert_eq!(1, tree.blob_file_count()); - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 is expired assert_eq!( @@ -135,10 +120,7 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { bytes: big_value.len() as u64, len: 1, }]), - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .iter_segments() .next() .unwrap() diff --git a/tests/blob_major_compact_relink.rs b/tests/blob_major_compact_relink.rs index c917dff7..d4b3d9d9 100644 --- a/tests/blob_major_compact_relink.rs +++ b/tests/blob_major_compact_relink.rs @@ -30,10 +30,7 @@ fn blob_tree_major_compact_relink() -> lsm_tree::Result<()> { bytes: big_value.len() as u64, len: 1, }]), - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .iter_segments() .next() .unwrap() @@ -52,10 +49,7 @@ fn blob_tree_major_compact_relink() -> lsm_tree::Result<()> { bytes: big_value.len() as u64, len: 1, }]), - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .iter_segments() .next() .unwrap() diff --git a/tests/blob_major_compact_relocation.rs b/tests/blob_major_compact_relocation.rs index 9cd32c8b..b37d8921 100644 --- a/tests/blob_major_compact_relocation.rs +++ b/tests/blob_major_compact_relocation.rs @@ -53,13 +53,7 @@ fn blob_tree_major_compact_relocation_simple() -> lsm_tree::Result<()> { assert_eq!(&*value, b"smol"); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 is expired assert_eq!( @@ -77,13 +71,7 @@ fn blob_tree_major_compact_relocation_simple() -> lsm_tree::Result<()> { assert_eq!(2, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } @@ -167,13 +155,7 @@ fn blob_tree_major_compact_relocation_repeated_key() -> lsm_tree::Result<()> { assert_eq!(&*value, big_value); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!( &{ @@ -190,13 +172,7 @@ fn blob_tree_major_compact_relocation_repeated_key() -> lsm_tree::Result<()> { assert_eq!(1, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } @@ -282,13 +258,7 @@ fn blob_tree_major_compact_relocation_interleaved() -> lsm_tree::Result<()> { let value = tree.get("e", SeqNo::MAX)?.expect("should exist"); assert_eq!(&*value, b"smol"); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let 
gc_stats = tree.current_version().gc_stats().clone(); assert_eq!( &{ @@ -305,13 +275,7 @@ fn blob_tree_major_compact_relocation_interleaved() -> lsm_tree::Result<()> { assert_eq!(1, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } diff --git a/tests/blob_recover_gc_stats.rs b/tests/blob_recover_gc_stats.rs index e0a9695b..558532ca 100644 --- a/tests/blob_recover_gc_stats.rs +++ b/tests/blob_recover_gc_stats.rs @@ -32,13 +32,7 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { tree.major_compact(64_000_000, 1_000)?; - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 is expired assert_eq!( @@ -56,13 +50,7 @@ fn blob_tree_recover_gc_stats() -> lsm_tree::Result<()> { .with_kv_separation(Some(Default::default())) .open()?; - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); // "big":0 is still expired assert_eq!( diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index dd7d5ea0..e32964a6 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -37,36 +37,24 @@ fn compaction_readers_grouping() -> lsm_tree::Result<()> { tree.compact(Arc::new(lsm_tree::compaction::PullDown(2, 3)), 0)?; assert!(!tree - .manifest() - .read() - .expect("asdasd") .current_version() .level(0) .expect("level should exist") .is_empty()); assert!(tree - .manifest() - .read() - .expect("asdasd") .current_version() .level(1) .expect("level should exist") .is_empty()); assert!(tree - .manifest() - .read() - .expect("asdasd") .current_version() .level(2) .expect("level should exist") .is_empty()); assert!(!tree - .manifest() - .read() - .expect("asdasd") .current_version() .level(3) .expect("level should exist") diff --git a/tests/multi_trees.rs b/tests/multi_trees.rs index 47dff0ec..5f94c245 100644 --- a/tests/multi_trees.rs +++ b/tests/multi_trees.rs @@ -19,9 +19,6 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { assert_eq!( 0, tree0 - .manifest() - .read() - .expect("lock is poisoned") .current_version() .level(0) .expect("level should exist") @@ -46,9 +43,6 @@ fn tree_multi_segment_ids() -> lsm_tree::Result<()> { assert_eq!( 0, tree1 - .manifest() - .read() - .expect("lock is poisoned") .current_version() .level(0) .expect("level should exist") diff --git a/tests/mvcc_slab.rs b/tests/mvcc_slab.rs index 51ce4638..77aecbf3 100644 --- a/tests/mvcc_slab.rs +++ b/tests/mvcc_slab.rs @@ -21,10 +21,9 @@ fn segment_reader_mvcc_slab() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.manifest().read().expect("lock is poisoned"); + let version = tree.current_version(); - let segment = level_manifest - .current_version() + let segment = version .level(0) .expect("level should exist") .first() @@ -59,10 +58,9 @@ fn segment_reader_mvcc_slab_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; - let level_manifest = tree.manifest().read().expect("lock is poisoned"); + let version = tree.current_version(); - let segment = level_manifest - .current_version() + let segment = version .level(0) .expect("level should 
exist") .first() diff --git a/tests/tree_disjoint_point_read.rs b/tests/tree_disjoint_point_read.rs index a369c24b..8d799263 100644 --- a/tests/tree_disjoint_point_read.rs +++ b/tests/tree_disjoint_point_read.rs @@ -84,16 +84,7 @@ fn tree_disjoint_point_read_multiple_levels() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; tree.compact(Arc::new(lsm_tree::compaction::SizeTiered::new(10, 8)), 1)?; - assert_eq!( - 1, - tree.manifest() - .read() - .expect("asdasd") - .current_version() - .level(1) - .unwrap() - .len() - ); + assert_eq!(1, tree.current_version().level(1).unwrap().len()); tree.insert("e", "e", 0); tree.flush_active_memtable(0)?; @@ -140,16 +131,7 @@ fn tree_disjoint_point_read_multiple_levels_blob() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; tree.compact(Arc::new(lsm_tree::compaction::SizeTiered::new(10, 8)), 1)?; - assert_eq!( - 1, - tree.manifest() - .read() - .expect("asdasd") - .current_version() - .level(1) - .unwrap() - .len() - ); + assert_eq!(1, tree.current_version().level(1).unwrap().len()); tree.insert("e", "e", 0); tree.flush_active_memtable(0)?; diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index a1ed93c7..95ea31f4 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -84,10 +84,7 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 1, - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .level(0) .expect("should exist") .first() @@ -104,10 +101,7 @@ fn tree_flush_eviction_4() -> lsm_tree::Result<()> { assert_eq!(1, tree.len(SeqNo::MAX, None)?); assert_eq!( 0, - tree.manifest() - .read() - .expect("lock is poisoned") - .current_version() + tree.current_version() .level(6) .expect("should exist") .first() From 3e3b25e0ea0b6eb6cc09ae802bf47a05754c917e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:19:08 +0200 Subject: [PATCH 548/613] wip --- src/tree/ingest.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 63d5d539..9523999c 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -141,12 +141,7 @@ impl<'a> Ingestion<'a> { self.tree .register_segments(&created_segments, None, None, 0)?; - let last_level_idx = self - .tree - .manifest - .read() - .expect("lock is poisoned") - .last_level_index(); + let last_level_idx = self.tree.config.level_count - 1; self.tree .compact(Arc::new(MoveDown(0, last_level_idx)), 0)?; From 0c78c63547d6c4a07796586a40bc4ddf5a5864ab Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:19:20 +0200 Subject: [PATCH 549/613] adjust tests --- src/multi_reader.rs | 3 --- src/run_reader.rs | 12 ++++-------- src/run_scanner.rs | 3 --- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/multi_reader.rs b/src/multi_reader.rs index bf32552c..250813ab 100644 --- a/src/multi_reader.rs +++ b/src/multi_reader.rs @@ -76,9 +76,6 @@ mod tests { } let segments = tree - .manifest() - .read() - .expect("lock is poisoned") .current_version() .iter_segments() .cloned() diff --git a/src/run_reader.rs b/src/run_reader.rs index 21b9c443..a6a292e9 100644 --- a/src/run_reader.rs +++ b/src/run_reader.rs @@ -154,10 +154,8 @@ mod tests { } let segments = tree - .manifest() - .read() - .expect("lock is poisoned") - .iter() + .current_version() + .iter_segments() .cloned() .collect::>(); @@ -196,10 +194,8 @@ mod tests { } let segments = tree - .manifest() - .read() - 
.expect("lock is poisoned") - .iter() + .current_version() + .iter_segments() .cloned() .collect::>(); diff --git a/src/run_scanner.rs b/src/run_scanner.rs index 1405b8d9..a522323c 100644 --- a/src/run_scanner.rs +++ b/src/run_scanner.rs @@ -90,9 +90,6 @@ mod tests { } let segments = tree - .manifest() - .read() - .expect("lock is poisoned") .current_version() .iter_segments() .cloned() From 57e0e1b0c3978a2fb459db8e5f9e2d58678ecd67 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:19:32 +0200 Subject: [PATCH 550/613] remove unnecessary function --- src/level_manifest/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 54a0abf1..8474961c 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -372,12 +372,6 @@ impl LevelManifest { } } - /// Returns the number of levels in the tree. - #[must_use] - pub fn last_level_index(&self) -> u8 { - DEFAULT_LEVEL_COUNT - 1 - } - /// Returns the number of segments, summed over all levels #[must_use] pub fn len(&self) -> usize { From c4c523b76204f7ddd46c849231c69810dc2c09bb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:19:41 +0200 Subject: [PATCH 551/613] adjust test --- src/level_manifest/mod.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 8474961c..5fe80a79 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -453,16 +453,27 @@ mod tests { let segment_count_before_major_compact = tree.segment_count(); - // NOTE: Purposefully change level manifest to have invalid path - // to force an I/O error - tree.manifest().write().expect("lock is poisoned").folder = "/invaliiid/asd".into(); + let crate::AnyTree::Standard(tree) = tree else { + unreachable!(); + }; + + { + // NOTE: Purposefully change level manifest to have invalid path + // to force an I/O error + tree.super_version + .write() + .expect("lock is poisoned") + .manifest + .folder = "/invaliiid/asd".into(); + } assert!(tree.major_compact(u64::MAX, 4).is_err()); assert!(tree - .manifest() + .super_version .read() .expect("lock is poisoned") + .manifest .hidden_set .is_empty()); From 8bb3933f1106b2a1bf391ea5d2dc58979512b8be Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:20:03 +0200 Subject: [PATCH 552/613] perf: use coarse lock (super version) --- src/tree/mod.rs | 244 +++++++++++++++++++----------------------------- 1 file changed, 98 insertions(+), 146 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 4fc86ce9..7cfc0601 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -19,7 +19,9 @@ use crate::{ memtable::Memtable, segment::Segment, slice::Slice, + tree::inner::SuperVersion, value::InternalValue, + version::Version, vlog::BlobFile, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, TreeType, UserKey, UserValue, ValueType, @@ -85,27 +87,31 @@ impl AbstractTree for Tree { } fn get_internal_entry(&self, key: &[u8], seqno: SeqNo) -> crate::Result> { - // TODO: consolidate memtable & sealed behind single RwLock + #[allow(clippy::significant_drop_tightening)] + let version_lock = self.super_version.read().expect("lock is poisoned"); - let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); - - if let Some(entry) = memtable_lock.get(key, seqno) { + if let Some(entry) = version_lock.active_memtable.get(key, seqno) { return Ok(ignore_tombstone_value(entry)); } - 
drop(memtable_lock); - // Now look in sealed memtables - if let Some(entry) = self.get_internal_entry_from_sealed_memtables(key, seqno) { + if let Some(entry) = + self.get_internal_entry_from_sealed_memtables(&version_lock, key, seqno) + { return Ok(ignore_tombstone_value(entry)); } // Now look in segments... this may involve disk I/O - self.get_internal_entry_from_segments(key, seqno) + self.get_internal_entry_from_segments(&version_lock, key, seqno) } - fn manifest(&self) -> &Arc> { - &self.manifest + fn current_version(&self) -> Version { + self.super_version + .read() + .expect("poisoned") + .manifest + .current_version() + .clone() } fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result> { @@ -131,9 +137,10 @@ impl AbstractTree for Tree { } fn version_free_list_len(&self) -> usize { - self.manifest + self.super_version .read() .expect("lock is poisoned") + .manifest .version_free_list .len() } @@ -164,10 +171,7 @@ impl AbstractTree for Tree { // TODO: doctest fn tombstone_count(&self) -> u64 { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::tombstone_count) .sum() @@ -182,16 +186,20 @@ impl AbstractTree for Tree { use crate::tree::ingest::Ingestion; use std::time::Instant; - // NOTE: Lock active memtable so nothing else can be going on while we are bulk loading - let memtable_lock = self.lock_active_memtable(); + // // TODO: 3.0.0 ... hmmmm + // let global_lock = self.super_version.write().expect("lock is poisoned"); let seqno = seqno_generator.next(); // TODO: allow ingestion always, by flushing memtable - assert!( - memtable_lock.is_empty(), - "can only perform bulk ingestion with empty memtable", - ); + // assert!( + // global_lock.active_memtable.is_empty(), + // "can only perform bulk ingestion with empty memtable(s)", + // ); + // assert!( + // global_lock.sealed_memtables.len() == 0, + // "can only perform bulk ingestion with empty memtable(s)", + // ); let mut writer = Ingestion::new(self)?.with_seqno(seqno); @@ -259,10 +267,7 @@ impl AbstractTree for Tree { } fn l0_run_count(&self) -> usize { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .level(0) .map(|x| x.run_count()) .unwrap_or_default() @@ -275,39 +280,31 @@ impl AbstractTree for Tree { } fn filter_size(&self) -> usize { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::filter_size) .sum() } fn pinned_filter_size(&self) -> usize { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::pinned_filter_size) .sum() } fn pinned_block_index_size(&self) -> usize { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::pinned_block_index_size) .sum() } fn sealed_memtable_count(&self) -> usize { - self.sealed_memtables + self.super_version .read() .expect("lock is poisoned") + .sealed_memtables .len() } @@ -403,17 +400,9 @@ impl AbstractTree for Tree { blob_files.map(<[BlobFile]>::len).unwrap_or_default(), ); - // NOTE: Mind lock order L -> M -> S - log::trace!("register: Acquiring levels manifest write lock"); - let mut manifest = self.manifest.write().expect("lock is poisoned"); - log::trace!("register: Acquired levels manifest write lock"); + let mut version_lock = self.super_version.write().expect("lock is poisoned"); - // NOTE: Mind 
lock order L -> M -> S - log::trace!("register: Acquiring sealed memtables write lock"); - let mut sealed_memtables = self.sealed_memtables.write().expect("lock is poisoned"); - log::trace!("register: Acquired sealed memtables write lock"); - - manifest.atomic_swap( + version_lock.manifest.atomic_swap( |version| { Ok(version.with_new_l0_run( segments, @@ -426,28 +415,29 @@ impl AbstractTree for Tree { for segment in segments { log::trace!("releasing sealed memtable {}", segment.id()); - sealed_memtables.remove(segment.id()); + + version_lock.sealed_memtables = + Arc::new(version_lock.sealed_memtables.remove(segment.id())); } Ok(()) } - fn lock_active_memtable(&self) -> RwLockWriteGuard<'_, Arc> { - self.active_memtable.write().expect("lock is poisoned") - } - fn clear_active_memtable(&self) { - *self.active_memtable.write().expect("lock is poisoned") = Arc::new(Memtable::default()); + self.super_version + .write() + .expect("lock is poisoned") + .active_memtable = Arc::new(Memtable::default()); } fn set_active_memtable(&self, memtable: Memtable) { - let mut memtable_lock = self.active_memtable.write().expect("lock is poisoned"); - *memtable_lock = Arc::new(memtable); + let mut version_lock = self.super_version.write().expect("lock is poisoned"); + version_lock.active_memtable = Arc::new(memtable); } fn add_sealed_memtable(&self, id: MemtableId, memtable: Arc) { - let mut memtable_lock = self.sealed_memtables.write().expect("lock is poisoned"); - memtable_lock.add(id, memtable); + let mut version_lock = self.super_version.write().expect("lock is poisoned"); + version_lock.sealed_memtables = Arc::new(version_lock.sealed_memtables.add(id, memtable)); } fn compact( @@ -478,9 +468,10 @@ impl AbstractTree for Tree { fn active_memtable_size(&self) -> u64 { use std::sync::atomic::Ordering::Acquire; - self.active_memtable + self.super_version .read() .expect("lock is poisoned") + .active_memtable .approximate_size .load(Acquire) } @@ -490,21 +481,22 @@ impl AbstractTree for Tree { } fn rotate_memtable(&self) -> Option<(MemtableId, Arc)> { - log::trace!("rotate: acquiring active memtable write lock"); - let mut active_memtable = self.lock_active_memtable(); - - log::trace!("rotate: acquiring sealed memtables write lock"); - let mut sealed_memtables = self.lock_sealed_memtables(); + let mut version_lock = self.super_version.write().expect("lock is poisoned"); - if active_memtable.is_empty() { + if version_lock.active_memtable.is_empty() { return None; } - let yanked_memtable = std::mem::take(&mut *active_memtable); + let yanked_memtable = std::mem::take(&mut version_lock.active_memtable); let yanked_memtable = yanked_memtable; let tmp_memtable_id = self.get_next_segment_id(); - sealed_memtables.add(tmp_memtable_id, yanked_memtable.clone()); + + version_lock.sealed_memtables = Arc::new( + version_lock + .sealed_memtables + .add(tmp_memtable_id, yanked_memtable.clone()), + ); log::trace!("rotate: added memtable id={tmp_memtable_id} to sealed memtables"); @@ -512,37 +504,29 @@ impl AbstractTree for Tree { } fn segment_count(&self) -> usize { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() - .segment_count() + self.current_version().segment_count() } fn level_segment_count(&self, idx: usize) -> Option { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() - .level(idx) - .map(|x| x.segment_count()) + self.current_version().level(idx).map(|x| x.segment_count()) } #[allow(clippy::significant_drop_tightening)] fn approximate_len(&self) -> usize { - // 
NOTE: Mind lock order L -> M -> S - let manifest = self.manifest.read().expect("lock is poisoned"); - let memtable = self.active_memtable.read().expect("lock is poisoned"); - let sealed = self.sealed_memtables.read().expect("lock is poisoned"); + let version = self.super_version.read().expect("lock is poisoned"); - let segments_item_count = manifest + let segments_item_count = self .current_version() .iter_segments() .map(|x| x.metadata.item_count) .sum::(); - let memtable_count = memtable.len() as u64; - let sealed_count = sealed.iter().map(|(_, mt)| mt.len()).sum::() as u64; + let memtable_count = version.active_memtable.len() as u64; + let sealed_count = version + .sealed_memtables + .iter() + .map(|(_, mt)| mt.len()) + .sum::() as u64; (memtable_count + sealed_count + segments_item_count) .try_into() @@ -550,26 +534,19 @@ impl AbstractTree for Tree { } fn disk_space(&self) -> u64 { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_levels() .map(super::version::Level::size) .sum() } fn get_highest_memtable_seqno(&self) -> Option { - let active = self - .active_memtable - .read() - .expect("lock is poisoned") - .get_highest_seqno(); + let version = self.super_version.read().expect("lock is poisoned"); - let sealed = self + let active = version.active_memtable.get_highest_seqno(); + + let sealed = version .sealed_memtables - .read() - .expect("Lock is poisoned") .iter() .map(|(_, table)| table.get_highest_seqno()) .max() @@ -579,10 +556,7 @@ impl AbstractTree for Tree { } fn get_highest_persisted_seqno(&self) -> Option { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::get_highest_seqno) .max() @@ -719,45 +693,20 @@ impl Tree { #[doc(hidden)] #[must_use] pub fn is_compacting(&self) -> bool { - self.manifest + self.super_version .read() .expect("lock is poisoned") + .manifest .is_compacting() } - /// Write-locks the sealed memtables for exclusive access - fn lock_sealed_memtables(&self) -> RwLockWriteGuard<'_, SealedMemtables> { - self.sealed_memtables.write().expect("lock is poisoned") - } - - // TODO: maybe not needed anyway - /// Used for [`BlobTree`] lookup - pub(crate) fn get_internal_entry_with_memtable( - &self, - memtable_lock: &Memtable, - key: &[u8], - seqno: SeqNo, - ) -> crate::Result> { - if let Some(entry) = memtable_lock.get(key, seqno) { - return Ok(ignore_tombstone_value(entry)); - } - - // Now look in sealed memtables - if let Some(entry) = self.get_internal_entry_from_sealed_memtables(key, seqno) { - return Ok(ignore_tombstone_value(entry)); - } - - self.get_internal_entry_from_segments(key, seqno) - } - fn get_internal_entry_from_sealed_memtables( &self, + super_version: &SuperVersion, key: &[u8], seqno: SeqNo, ) -> Option { - let memtable_lock = self.sealed_memtables.read().expect("lock is poisoned"); - - for (_, memtable) in memtable_lock.iter().rev() { + for (_, memtable) in super_version.sealed_memtables.iter().rev() { if let Some(entry) = memtable.get(key, seqno) { return Some(entry); } @@ -768,6 +717,7 @@ impl Tree { fn get_internal_entry_from_segments( &self, + super_version: &SuperVersion, key: &[u8], seqno: SeqNo, ) -> crate::Result> { @@ -775,9 +725,7 @@ impl Tree { // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ let key_hash = crate::segment::filter::standard_bloom::Builder::get_hash(key); - let manifest = self.manifest.read().expect("lock is poisoned"); - - for level in 
manifest.current_version().iter_levels() { + for level in super_version.manifest.current_version().iter_levels() { for run in level.iter() { // NOTE: Based on benchmarking, binary search is only worth it with ~4 segments if run.len() >= 4 { @@ -855,24 +803,21 @@ impl Tree { let bounds: (Bound, Bound) = (lo, hi); - // NOTE: Mind lock order L -> M -> S - log::trace!("range read: acquiring read locks"); - - let manifest = self.manifest.read().expect("lock is poisoned"); + let version_lock = self.super_version.write().expect("lock is poisoned"); let iter_state = { - let active = self.active_memtable.read().expect("lock is poisoned"); - let sealed = &self.sealed_memtables.read().expect("lock is poisoned"); + let active = &version_lock.active_memtable; + let sealed = &version_lock.sealed_memtables; IterState { active: active.clone(), sealed: sealed.iter().map(|(_, mt)| mt.clone()).collect(), ephemeral, - version: manifest.current_version().clone(), + version: version_lock.manifest.current_version().clone(), } }; - TreeIter::create_range(iter_state, bounds, seqno, &manifest) + TreeIter::create_range(iter_state, bounds, seqno, &version_lock.manifest) } #[doc(hidden)] @@ -908,8 +853,11 @@ impl Tree { #[doc(hidden)] #[must_use] pub fn append_entry(&self, value: InternalValue) -> (u64, u64) { - let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); - memtable_lock.insert(value) + self.super_version + .read() + .expect("lock is poisoned") + .active_memtable + .insert(value) } /// Recovers previous state, by loading the level manifest and segments. @@ -954,12 +902,16 @@ impl Tree { id: tree_id, segment_id_counter: Arc::new(AtomicU64::new(highest_segment_id + 1)), blob_file_id_generator: SequenceNumberCounter::default(), - active_memtable: Arc::default(), - sealed_memtables: Arc::default(), - manifest: Arc::new(RwLock::new(levels)), + super_version: Arc::new(RwLock::new(SuperVersion { + active_memtable: Arc::default(), + sealed_memtables: Arc::default(), + manifest: levels, + })), stop_signal: StopSignal::default(), config, major_compaction_lock: RwLock::default(), + compaction_lock: Arc::default(), + #[cfg(feature = "metrics")] metrics, }; From a56df3fcf82669a2858eb12fd636df50ec5a103e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:31:41 +0200 Subject: [PATCH 553/613] wip --- src/compaction/leveled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index ae50df5a..8ff6e005 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -201,7 +201,7 @@ impl CompactionStrategy for Strategy { .skip(1) .find(|(_, lvl)| !lvl.is_empty()) .map(|(idx, _)| idx) - .unwrap_or_else(|| usize::from(levels.last_level_index())); + .unwrap_or_else(|| usize::from(levels.level_count() - 1)); // Number of levels we have to shift to get from the actual level idx to the canonical let mut level_shift = canonical_l1_idx - 1; From 90352ee8f96713c2cb7bbf7dce36c0262378da9f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:32:32 +0200 Subject: [PATCH 554/613] adjust blob tree to use super version as well --- src/blob_tree/mod.rs | 83 ++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 60 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 248c91a4..fe91fed7 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -13,26 +13,20 @@ use crate::{ compaction::stream::CompactionStream, file::{fsync_directory, BLOBS_FOLDER}, 
iter_guard::{IterGuard, IterGuardImpl}, - level_manifest::LevelManifest, r#abstract::{AbstractTree, RangeItem}, segment::Segment, tree::inner::MemtableId, value::InternalValue, - vlog::{Accessor, BlobFile, BlobFileId, BlobFileWriter, ValueHandle}, + version::Version, + vlog::{Accessor, BlobFile, BlobFileWriter, ValueHandle}, Config, Memtable, SegmentId, SeqNo, SequenceNumberCounter, UserKey, UserValue, }; use handle::BlobIndirection; -use std::{ - collections::BTreeMap, - io::Cursor, - ops::RangeBounds, - path::PathBuf, - sync::{Arc, RwLock}, -}; +use std::{io::Cursor, ops::RangeBounds, path::PathBuf, sync::Arc}; pub struct Guard<'a> { blob_tree: &'a BlobTree, - vlog: Arc>, + version: Version, kv: crate::Result, } @@ -47,21 +41,17 @@ impl IterGuard for Guard<'_> { } fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { - resolve_value_handle(self.blob_tree, &self.vlog, self.kv?) + resolve_value_handle(self.blob_tree, &self.version, self.kv?) } } -fn resolve_value_handle( - tree: &BlobTree, - vlog: &BTreeMap, - item: InternalValue, -) -> RangeItem { +fn resolve_value_handle(tree: &BlobTree, version: &Version, item: InternalValue) -> RangeItem { if item.key.value_type.is_indirection() { let mut cursor = Cursor::new(item.value); let vptr = BlobIndirection::decode_from(&mut cursor)?; // Resolve indirection using value log - match Accessor::new(vlog).get( + match Accessor::new(&version.value_log).get( tree.id(), &tree.blobs_folder, &item.key.user_key, @@ -75,10 +65,10 @@ fn resolve_value_handle( } Ok(None) => { panic!( - "value handle ({:?} => {:?}) did not match any blob - this is a bug", - String::from_utf8_lossy(&item.key.user_key), - vptr.vhandle, - ) + "value handle ({:?} => {:?}) did not match any blob - this is a bug; version={}", + item.key.user_key, vptr.vhandle, + version.id(), + ); } Err(e) => Err(e), } @@ -112,9 +102,6 @@ impl BlobTree { fsync_directory(&blobs_folder)?; let blob_file_id_to_continue_with = index - .manifest() - .read() - .expect("lock is poisoned") .current_version() .value_log .values() @@ -148,8 +135,8 @@ impl AbstractTree for BlobTree { self.index.get_internal_entry(key, seqno) } - fn manifest(&self) -> &Arc> { - self.index.manifest() + fn current_version(&self) -> Version { + self.index.current_version() } fn flush_active_memtable(&self, eviction_seqno: SeqNo) -> crate::Result> { @@ -191,12 +178,7 @@ impl AbstractTree for BlobTree { let range = prefix_to_range(prefix.as_ref()); - let version = self - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .clone(); + let version = self.current_version(); Box::new( self.index @@ -204,7 +186,7 @@ impl AbstractTree for BlobTree { .map(move |kv| { IterGuardImpl::Blob(Guard { blob_tree: self, - vlog: version.value_log.clone(), // TODO: PERF: ugly Arc clone + version: version.clone(), // TODO: PERF: ugly Arc clone kv, }) }), @@ -217,12 +199,7 @@ impl AbstractTree for BlobTree { seqno: SeqNo, index: Option>, ) -> Box> + '_> { - let version = self - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .clone(); + let version = self.current_version(); // TODO: PERF: ugly Arc clone Box::new( @@ -231,7 +208,7 @@ impl AbstractTree for BlobTree { .map(move |kv| { IterGuardImpl::Blob(Guard { blob_tree: self, - vlog: version.value_log.clone(), // TODO: PERF: ugly Arc clone + version: version.clone(), // TODO: PERF: ugly Arc clone kv, }) }), @@ -335,11 +312,7 @@ impl AbstractTree for BlobTree { } fn blob_file_count(&self) -> usize { - self.manifest() - .read() - .expect("lock is 
poisoned") - .current_version() - .blob_file_count() + self.current_version().blob_file_count() } // NOTE: We skip reading from the value log @@ -363,12 +336,7 @@ impl AbstractTree for BlobTree { } fn stale_blob_bytes(&self) -> u64 { - self.manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .stale_bytes() + self.current_version().gc_stats().stale_bytes() } fn filter_size(&self) -> usize { @@ -520,10 +488,6 @@ impl AbstractTree for BlobTree { .register_segments(segments, blob_files, frag_map, seqno_threshold) } - fn lock_active_memtable(&self) -> std::sync::RwLockWriteGuard<'_, Arc> { - self.index.lock_active_memtable() - } - fn set_active_memtable(&self, memtable: Memtable) { self.index.set_active_memtable(memtable); } @@ -595,8 +559,7 @@ impl AbstractTree for BlobTree { } fn disk_space(&self) -> u64 { - let lock = self.manifest().read().expect("lock is poisoned"); - let version = lock.current_version(); + let version = self.current_version(); let vlog = crate::vlog::Accessor::new(&version.value_log); self.index.disk_space() + vlog.disk_space() } @@ -628,9 +591,9 @@ impl AbstractTree for BlobTree { return Ok(None); }; - let lock = self.manifest().read().expect("lock is poisoned"); - let version = lock.current_version(); - let (_, v) = resolve_value_handle(self, &version.value_log, item)?; + let version = self.current_version(); + let (_, v) = resolve_value_handle(self, &version, item)?; + Ok(Some(v)) } From df6daf75e4f3501d0227932cce32f85c967f5d00 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:32:42 +0200 Subject: [PATCH 555/613] wip --- src/compaction/major.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 6cb79e14..720728d5 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -37,7 +37,7 @@ impl CompactionStrategy for Strategy { "MajorCompaction" } - fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { + fn choose(&self, levels: &LevelManifest, cfg: &Config) -> Choice { let segment_ids: HashSet<_> = levels.iter().map(Segment::id).collect(); // NOTE: This should generally not occur because of the @@ -50,10 +50,12 @@ impl CompactionStrategy for Strategy { if some_hidden { Choice::DoNothing } else { + let last_level_idx = cfg.level_count - 1; + Choice::Merge(CompactionInput { segment_ids, - dest_level: levels.last_level_index(), - canonical_level: levels.last_level_index(), + dest_level: last_level_idx, + canonical_level: last_level_idx, target_size: self.target_size, }) } From 35e65ad51d5ee921ad4799c964b82efb7b53418e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:33:45 +0200 Subject: [PATCH 556/613] adjust compactor to use super version as well also implement blob file age cutoff --- src/compaction/worker.rs | 190 +++++++++++++++++++++++++++------------ 1 file changed, 131 insertions(+), 59 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index bf04bf7a..fd8cf00c 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -15,13 +15,12 @@ use crate::{ merge::Merger, run_scanner::RunScanner, stop_signal::StopSignal, - tree::inner::TreeId, + tree::inner::{SuperVersion, TreeId}, vlog::{BlobFileMergeScanner, BlobFileScanner, BlobFileWriter}, - AbstractTree, BlobFile, Config, HashSet, InternalValue, SegmentId, SeqNo, - SequenceNumberCounter, + BlobFile, Config, HashSet, InternalValue, SegmentId, SeqNo, SequenceNumberCounter, }; use std::{ - sync::{atomic::AtomicU64, 
Arc, RwLock, RwLockWriteGuard}, + sync::{atomic::AtomicU64, Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard}, time::Instant, }; @@ -41,8 +40,7 @@ pub struct Options { /// Configuration of tree. pub config: Config, - /// Levels manifest. - pub levels: Arc>, + pub super_version: Arc>, /// Compaction strategy to use. pub strategy: Arc, @@ -54,6 +52,11 @@ pub struct Options { /// Evicts items that are older than this seqno (MVCC GC). pub eviction_seqno: u64, + /// Compaction to lock out other compactions + /// + /// This is not the same lock as the major compaction lock in the `Tree`. + pub compaction_lock: Arc>, + #[cfg(feature = "metrics")] pub metrics: Arc, } @@ -65,10 +68,12 @@ impl Options { segment_id_generator: tree.segment_id_counter.clone(), blob_file_id_generator: tree.blob_file_id_generator.clone(), config: tree.config.clone(), - levels: tree.manifest().clone(), + super_version: tree.super_version.clone(), stop_signal: tree.stop_signal.clone(), strategy, eviction_seqno: 0, + compaction_lock: tree.compaction_lock.clone(), + #[cfg(feature = "metrics")] metrics: tree.metrics.clone(), } @@ -79,23 +84,25 @@ impl Options { /// /// This will block until the compactor is fully finished. pub fn do_compaction(opts: &Options) -> crate::Result<()> { - log::trace!("Acquiring levels manifest lock"); - let original_levels = opts.levels.write().expect("lock is poisoned"); + let lock = opts.compaction_lock.lock().expect("lock is poisoned"); + + let version = opts.super_version.read().expect("lock is poisoned"); let start = Instant::now(); log::trace!( "Consulting compaction strategy {:?}", opts.strategy.get_name(), ); - let choice = opts.strategy.choose(&original_levels, &opts.config); + let choice = opts.strategy.choose(&version.manifest, &opts.config); log::debug!("Compaction choice: {choice:?} in {:?}", start.elapsed()); match choice { - Choice::Merge(payload) => merge_segments(original_levels, opts, &payload), - Choice::Move(payload) => move_segments(original_levels, opts, &payload), + Choice::Merge(payload) => merge_segments(lock, version, opts, &payload), + Choice::Move(payload) => move_segments(lock, version, opts, &payload), Choice::Drop(payload) => drop_segments( - original_levels, + lock, + version, opts, &payload.into_iter().collect::>(), ), @@ -168,12 +175,20 @@ fn create_compaction_stream<'a>( } fn move_segments( - mut levels: RwLockWriteGuard<'_, LevelManifest>, + _compaction_lock: MutexGuard<'_, ()>, + super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, payload: &CompactionPayload, ) -> crate::Result<()> { + drop(super_version); + + let mut super_version = opts.super_version.write().expect("lock is poisoned"); + // Fail-safe for buggy compaction strategies - if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { + if super_version + .manifest + .should_decline_compaction(payload.segment_ids.iter().copied()) + { log::warn!( "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), @@ -183,12 +198,12 @@ fn move_segments( let segment_ids = payload.segment_ids.iter().copied().collect::>(); - levels.atomic_swap( + super_version.manifest.atomic_swap( |current| Ok(current.with_moved(&segment_ids, payload.dest_level as usize)), opts.eviction_seqno, )?; - if let Err(e) = levels.maintenance(opts.eviction_seqno) { + if let Err(e) = super_version.manifest.maintenance(opts.eviction_seqno) { log::error!("Manifest 
maintenance failed: {e:?}"); return Err(e); } @@ -198,7 +213,8 @@ fn move_segments( #[allow(clippy::too_many_lines)] fn merge_segments( - mut levels: RwLockWriteGuard<'_, LevelManifest>, + compaction_lock: MutexGuard<'_, ()>, + super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, payload: &CompactionPayload, ) -> crate::Result<()> { @@ -208,7 +224,10 @@ fn merge_segments( } // Fail-safe for buggy compaction strategies - if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { + if super_version + .manifest + .should_decline_compaction(payload.segment_ids.iter().copied()) + { log::warn!( "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), @@ -219,7 +238,13 @@ fn merge_segments( let Some(segments) = payload .segment_ids .iter() - .map(|&id| levels.current_version().get_segment(id).cloned()) + .map(|&id| { + super_version + .manifest + .current_version() + .get_segment(id) + .cloned() + }) .collect::>>() else { log::warn!( @@ -232,7 +257,7 @@ fn merge_segments( let mut blob_frag_map = FragmentationMap::default(); let Some(mut merge_iter) = create_compaction_stream( - &levels, + &super_version.manifest, &payload.segment_ids.iter().copied().collect::>(), opts.eviction_seqno, )? @@ -244,14 +269,15 @@ fn merge_segments( }; let dst_lvl = payload.canonical_level.into(); - let last_level = levels.last_level_index(); + let last_level = opts.config.level_count - 1; // NOTE: Only evict tombstones when reaching the last level, // That way we don't resurrect data beneath the tombstone let is_last_level = payload.dest_level == last_level; - let table_writer = - super::flavour::prepare_table_writer(levels.current_version(), opts, payload)?; + let current_version = super_version.manifest.current_version(); + + let table_writer = super::flavour::prepare_table_writer(current_version, opts, payload)?; let start = Instant::now(); @@ -259,8 +285,6 @@ fn merge_segments( Some(blob_opts) => { merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); - let version = levels.current_version(); - let blob_files_to_rewrite = { // TODO: 3.0.0 vvv if blob gc is disabled, skip this part vvv @@ -268,32 +292,38 @@ fn merge_segments( let mut linked_blob_files = payload .segment_ids .iter() - .map(|&id| version.get_segment(id).expect("table should exist")) + .map(|&id| current_version.get_segment(id).expect("table should exist")) .filter_map(|x| x.get_linked_blob_files().expect("handle error")) .flatten() .map(|blob_file_ref| { - version + current_version .value_log .get(&blob_file_ref.blob_file_id) .expect("blob file should exist") }) .filter(|blob_file| { - blob_file.is_stale(version.gc_stats(), blob_opts.staleness_threshold) + blob_file + .is_stale(current_version.gc_stats(), blob_opts.staleness_threshold) }) .filter(|blob_file| { - // NOTE: Dead blob files are dropped anyway during Version change commit - !blob_file.is_dead(version.gc_stats()) + // NOTE: Dead blob files are dropped anyway during current_version change commit + !blob_file.is_dead(current_version.gc_stats()) }) .collect::>() .into_iter() .collect::>(); linked_blob_files.sort_by_key(|a| a.id()); - // TODO: 3.0.0 ^- age cutoff + + let cutoff_point = { + let len = linked_blob_files.len() as f32; + (len * blob_opts.age_cutoff) as usize + }; + linked_blob_files.drain(cutoff_point..); // NOTE: If there is any table not part of our compaction input // that also 
points to the blob file, we cannot rewrite the blob file - for table in version.iter_segments() { + for table in current_version.iter_segments() { if payload.segment_ids.contains(&table.id()) { continue; } @@ -360,17 +390,27 @@ fn merge_segments( log::trace!("Blob file GC preparation done in {:?}", start.elapsed()); - levels.hide_segments(payload.segment_ids.iter().copied()); + drop(super_version); + + { + opts.super_version + .write() + .expect("lock is poisoned") + .manifest + .hide_segments(payload.segment_ids.iter().copied()); + } - // IMPORTANT: Free lock so the compaction (which may go on for a while) - // does not block possible other compactions and writes/reads - drop(levels); + // IMPORTANT: Unlock exclusive compaction lock as we are now doing the actual (CPU-intensive) compaction + drop(compaction_lock); for (idx, item) in merge_iter.enumerate() { let item = item.inspect_err(|_| { // IMPORTANT: We need to show tables again on error - let mut levels = opts.levels.write().expect("lock is poisoned"); - levels.show_segments(payload.segment_ids.iter().copied()); + let mut version_lock = opts.super_version.write().expect("lock is poisoned"); + + version_lock + .manifest + .show_segments(payload.segment_ids.iter().copied()); })?; // IMPORTANT: We can only drop tombstones when writing into last level @@ -380,8 +420,11 @@ fn merge_segments( compactor.write(item).inspect_err(|_| { // IMPORTANT: We need to show tables again on error - let mut levels = opts.levels.write().expect("lock is poisoned"); - levels.show_segments(payload.segment_ids.iter().copied()); + let mut version_lock = opts.super_version.write().expect("lock is poisoned"); + + version_lock + .manifest + .show_segments(payload.segment_ids.iter().copied()); })?; if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { @@ -390,25 +433,39 @@ fn merge_segments( } } - // NOTE: Mind lock order L -> M -> S - log::trace!("Acquiring levels manifest write lock"); - let mut levels = opts.levels.write().expect("lock is poisoned"); - log::trace!("Acquired levels manifest write lock"); + let compaction_lock = opts.compaction_lock.lock().expect("lock is poisoned"); + + log::trace!("Acquiring super version write lock"); + let mut super_version = opts.super_version.write().expect("lock is poisoned"); + log::trace!("Acquired super version write lock"); compactor - .finish(&mut levels, opts, payload, dst_lvl, blob_frag_map) + .finish( + &mut super_version.manifest, + opts, + payload, + dst_lvl, + blob_frag_map, + ) .inspect_err(|_| { - // IMPORTANT: We need to show tables again on error - levels.show_segments(payload.segment_ids.iter().copied()); + super_version + .manifest + .show_segments(payload.segment_ids.iter().copied()); })?; - levels.show_segments(payload.segment_ids.iter().copied()); + super_version + .manifest + .show_segments(payload.segment_ids.iter().copied()); - levels.maintenance(opts.eviction_seqno).inspect_err(|e| { - log::error!("Manifest maintenance failed: {e:?}"); - })?; + super_version + .manifest + .maintenance(opts.eviction_seqno) + .inspect_err(|e| { + log::error!("Manifest maintenance failed: {e:?}"); + })?; - drop(levels); + drop(super_version); + drop(compaction_lock); log::trace!("Compaction successful"); @@ -416,12 +473,20 @@ fn merge_segments( } fn drop_segments( - mut levels: RwLockWriteGuard<'_, LevelManifest>, + compaction_lock: MutexGuard<'_, ()>, + super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, ids_to_drop: &[SegmentId], ) -> crate::Result<()> { + drop(super_version); + + let mut 
super_version = opts.super_version.write().expect("lock is poisoned"); + // Fail-safe for buggy compaction strategies - if levels.should_decline_compaction(ids_to_drop.iter().copied()) { + if super_version + .manifest + .should_decline_compaction(ids_to_drop.iter().copied()) + { log::warn!( "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), @@ -431,7 +496,13 @@ fn drop_segments( let Some(segments) = ids_to_drop .iter() - .map(|&id| levels.current_version().get_segment(id).cloned()) + .map(|&id| { + super_version + .manifest + .current_version() + .get_segment(id) + .cloned() + }) .collect::>>() else { log::warn!( @@ -443,7 +514,7 @@ fn drop_segments( // IMPORTANT: Write the manifest with the removed segments first // Otherwise the segment files are deleted, but are still referenced! - levels.atomic_swap( + super_version.manifest.atomic_swap( |current| current.with_dropped(ids_to_drop), opts.eviction_seqno, // TODO: make naming in code base eviction_seqno vs watermark vs threshold consistent )?; @@ -458,12 +529,13 @@ fn drop_segments( // TODO: fwiw also add all dead blob files // TODO: look if any blob files can be trivially deleted as well - if let Err(e) = levels.maintenance(opts.eviction_seqno) { + if let Err(e) = super_version.manifest.maintenance(opts.eviction_seqno) { log::error!("Manifest maintenance failed: {e:?}"); return Err(e); } - drop(levels); + drop(super_version); + drop(compaction_lock); log::trace!("Dropped {} segments", ids_to_drop.len()); From 73c68413edc2aa6e47ac8be04b5d8e0e66dabd43 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 20:41:48 +0200 Subject: [PATCH 557/613] fix: blob tests --- tests/blob_compression.rs | 22 +++++----------------- tests/blob_major_compact_relocation.rs | 8 ++++---- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/tests/blob_compression.rs b/tests/blob_compression.rs index 2796a74f..3e7c20a3 100644 --- a/tests/blob_compression.rs +++ b/tests/blob_compression.rs @@ -1,9 +1,9 @@ -use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, KvSeparationOptions, SeqNo}; -use test_log::test; - #[test] #[cfg(feature = "lz4")] fn blob_tree_compression() -> lsm_tree::Result<()> { + use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, KvSeparationOptions, SeqNo}; + use test_log::test; + let folder = tempfile::tempdir()?; let path = folder.path(); @@ -49,13 +49,7 @@ fn blob_tree_compression() -> lsm_tree::Result<()> { assert_eq!(1, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!( &{ @@ -82,13 +76,7 @@ fn blob_tree_compression() -> lsm_tree::Result<()> { assert_eq!(1, tree.blob_file_count()); { - let gc_stats = tree - .manifest() - .read() - .expect("lock is poisoned") - .current_version() - .gc_stats() - .clone(); + let gc_stats = tree.current_version().gc_stats().clone(); assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); } diff --git a/tests/blob_major_compact_relocation.rs b/tests/blob_major_compact_relocation.rs index b37d8921..a6d07221 100644 --- a/tests/blob_major_compact_relocation.rs +++ b/tests/blob_major_compact_relocation.rs @@ -1,4 +1,4 @@ -use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; +use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, 
KvSeparationOptions, SeqNo}; use test_log::test; #[test] @@ -11,7 +11,7 @@ fn blob_tree_major_compact_relocation_simple() -> lsm_tree::Result<()> { { let tree = lsm_tree::Config::new(path) - .with_kv_separation(Some(Default::default())) + .with_kv_separation(Some(KvSeparationOptions::default().age_cutoff(1.0))) .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); @@ -97,7 +97,7 @@ fn blob_tree_major_compact_relocation_repeated_key() -> lsm_tree::Result<()> { { let tree = lsm_tree::Config::new(path) - .with_kv_separation(Some(Default::default())) + .with_kv_separation(Some(KvSeparationOptions::default().age_cutoff(1.0))) .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); @@ -201,7 +201,7 @@ fn blob_tree_major_compact_relocation_interleaved() -> lsm_tree::Result<()> { { let tree = lsm_tree::Config::new(path) - .with_kv_separation(Some(Default::default())) + .with_kv_separation(Some(KvSeparationOptions::default().age_cutoff(1.0))) .open()?; assert!(tree.get("big", SeqNo::MAX)?.is_none()); From fac10142a7e1b72ef65a50e684a340852b2dfc17 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Oct 2025 21:15:40 +0200 Subject: [PATCH 558/613] actually fix blob tests --- tests/blob_compression.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/blob_compression.rs b/tests/blob_compression.rs index 3e7c20a3..752535ca 100644 --- a/tests/blob_compression.rs +++ b/tests/blob_compression.rs @@ -12,7 +12,8 @@ fn blob_tree_compression() -> lsm_tree::Result<()> { KvSeparationOptions::default() .compression(lsm_tree::CompressionType::Lz4) .separation_threshold(1) - .staleness_threshold(0.0000001), + .staleness_threshold(0.0000001) + .age_cutoff(1.0), )) .open()?; From 2dc9d875116975a1afcfd1cb579e4dff22aef7f6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 9 Oct 2025 01:24:06 +0200 Subject: [PATCH 559/613] adjust tests --- tests/blob_drop_after_flush._rs | 51 ------ tests/blob_gc._rs | 148 ------------------ ..._gc_watermark._rs => blob_gc_watermark.rs} | 41 +++-- tests/blob_sep_threshold.rs | 1 - tests/blob_simple.rs | 44 ------ tests/blob_tombstone._rs | 43 ----- tests/blob_tree_flush._rs | 36 ----- ...rded_size.rs => blob_tree_guarded_size.rs} | 3 +- tests/blob_tree_reload_blob.rs | 2 - tests/compaction_readers_grouping.rs | 1 + tests/open_files.rs | 26 --- ...guarded_range.rs => tree_guarded_range.rs} | 5 +- tests/tree_range.rs | 1 - 13 files changed, 35 insertions(+), 367 deletions(-) delete mode 100644 tests/blob_drop_after_flush._rs delete mode 100644 tests/blob_gc._rs rename tests/{blob_gc_watermark._rs => blob_gc_watermark.rs} (69%) delete mode 100644 tests/blob_tombstone._rs delete mode 100644 tests/blob_tree_flush._rs rename tests/{experimental_blob_tree_guarded_size.rs => blob_tree_guarded_size.rs} (83%) delete mode 100644 tests/open_files.rs rename tests/{experimental_tree_guarded_range.rs => tree_guarded_range.rs} (91%) diff --git a/tests/blob_drop_after_flush._rs b/tests/blob_drop_after_flush._rs deleted file mode 100644 index fad0eaf3..00000000 --- a/tests/blob_drop_after_flush._rs +++ /dev/null @@ -1,51 +0,0 @@ -use lsm_tree::{config::CompressionPolicy, AbstractTree, Config, SeqNo}; -use std::time::Duration; -use test_log::test; - -// NOTE: This was a race condition in v2 that could drop a blob file -// before its corresponding segment was registered -// -// https://github.com/fjall-rs/lsm-tree/commit/a3a174ed9eff0755f671f793626d17f4ef3f5f57 -#[test] -#[ignore = "restore"] -fn blob_drop_after_flush() -> lsm_tree::Result<()> { - let folder = 
tempfile::tempdir()?; - - let tree = Config::new(&folder) - .data_block_compression_policy(CompressionPolicy::all(lsm_tree::CompressionType::None)) - .open_as_blob_tree()?; - - tree.insert("a", "neptune".repeat(10_000), 0); - let (id, memtable) = tree.rotate_memtable().unwrap(); - - let (segment, blob_file) = tree.flush_memtable(id, &memtable, 0).unwrap().unwrap(); - - // NOTE: Segment is now in-flight - - let gc_report = std::thread::spawn({ - let tree = tree.clone(); - - move || { - let report = tree.gc_scan_stats(1, 0)?; - Ok::<_, lsm_tree::Error>(report) - } - }); - - std::thread::sleep(Duration::from_secs(1)); - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, 0)?; - - tree.register_segments(&[segment], Some(&[blob_file.unwrap()]), 0)?; - - assert_eq!( - "neptune".repeat(10_000).as_bytes(), - &*tree.get("a", SeqNo::MAX)?.unwrap(), - ); - - let report = gc_report.join().unwrap()?; - assert_eq!(0, report.stale_blobs); - assert_eq!(1, report.total_blobs); - - Ok(()) -} diff --git a/tests/blob_gc._rs b/tests/blob_gc._rs deleted file mode 100644 index 6aa50d86..00000000 --- a/tests/blob_gc._rs +++ /dev/null @@ -1,148 +0,0 @@ -use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; -use test_log::test; - -#[test] -#[ignore] -fn blob_gc_1() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - - tree.insert("a", "neptune".repeat(10_000), seqno.next()); - tree.insert("b", "neptune".repeat(10_000), seqno.next()); - tree.insert("c", "neptune".repeat(10_000), seqno.next()); - - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blob_file_count()); - - tree.gc_scan_stats(seqno.get(), 0)?; - - assert_eq!(1.0, tree.space_amp()); - - tree.insert("a", "a", seqno.next()); - tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.space_amp()); - - tree.insert("b", "b", seqno.next()); - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.space_amp()); - - // NOTE: Everything is stale - tree.insert("c", "c", seqno.next()); - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(0.0, tree.space_amp()); - - tree.gc_drop_stale()?; - - assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a"); - assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b"); - assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), b"c"); - assert_eq!(0, tree.blob_file_count()); - assert_eq!(0.0, tree.space_amp()); - - Ok(()) -} - -#[test] -#[ignore] -fn blob_gc_2() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - - tree.insert("a", "neptune".repeat(10_000), seqno.next()); - tree.insert("b", "neptune".repeat(10_000), seqno.next()); - tree.insert("c", "neptune".repeat(10_000), seqno.next()); - - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blob_file_count()); - - tree.gc_scan_stats(seqno.get(), 0)?; - assert_eq!(1.0, tree.space_amp()); - - tree.insert("a", "a", seqno.next()); - tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.space_amp()); - - tree.insert("b", "b", seqno.next()); - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.space_amp()); - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, seqno.next())?; - - assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), b"a"); - 
assert_eq!(&*tree.get("b", SeqNo::MAX)?.unwrap(), b"b"); - assert_eq!( - &*tree.get("c", SeqNo::MAX)?.unwrap(), - "neptune".repeat(10_000).as_bytes() - ); - assert_eq!(1, tree.blob_file_count()); - assert_eq!(1.0, tree.space_amp()); - - tree.insert("c", "c", seqno.next()); - - tree.gc_scan_stats(seqno.get(), 1_000)?; - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blob_file_count()); - - Ok(()) -} - -#[test] -#[ignore] -fn blob_gc_3() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - - tree.insert("a", "neptune".repeat(10_000), seqno.next()); - tree.insert("b", "neptune".repeat(10_000), seqno.next()); - tree.insert("c", "neptune".repeat(10_000), seqno.next()); - - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blob_file_count()); - - tree.gc_scan_stats(seqno.get(), 0)?; - assert_eq!(1.0, tree.space_amp()); - - tree.remove("a", seqno.next()); - - tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(1.5, tree.space_amp()); - - tree.remove("b", seqno.next()); - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(3.0, tree.space_amp()); - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, seqno.next())?; - - assert!(tree.get("a", SeqNo::MAX)?.is_none()); - assert!(tree.get("b", SeqNo::MAX)?.is_none()); - assert_eq!( - &*tree.get("c", SeqNo::MAX)?.unwrap(), - "neptune".repeat(10_000).as_bytes() - ); - assert_eq!(1, tree.blob_file_count()); - assert_eq!(1.0, tree.space_amp()); - - tree.remove("c", seqno.next()); - assert!(tree.get("c", SeqNo::MAX)?.is_none()); - - tree.gc_scan_stats(seqno.get(), 1_000)?; - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(0, tree.blob_file_count()); - - Ok(()) -} diff --git a/tests/blob_gc_watermark._rs b/tests/blob_gc_watermark.rs similarity index 69% rename from tests/blob_gc_watermark._rs rename to tests/blob_gc_watermark.rs index 8c4b4f46..38d4f56a 100644 --- a/tests/blob_gc_watermark._rs +++ b/tests/blob_gc_watermark.rs @@ -1,4 +1,7 @@ -use lsm_tree::{config::CompressionPolicy, AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use lsm_tree::{ + config::CompressionPolicy, AbstractTree, Config, KvSeparationOptions, SeqNo, + SequenceNumberCounter, +}; use test_log::test; // NOTE: This was a logic/MVCC error in v2 that could drop @@ -6,13 +9,17 @@ use test_log::test; // // https://github.com/fjall-rs/lsm-tree/commit/79c6ead4b955051cbb4835913e21d08b8aeafba1 #[test] -#[ignore] fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(&folder) .data_block_compression_policy(CompressionPolicy::all(lsm_tree::CompressionType::None)) - .open_as_blob_tree()?; + .with_kv_separation(Some( + KvSeparationOptions::default() + .staleness_threshold(0.01) + .age_cutoff(1.0), + )) + .open()?; let seqno = SequenceNumberCounter::default(); tree.insert("a", "neptune".repeat(10_000), seqno.next()); @@ -58,18 +65,18 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { b"neptune3".repeat(10_000), ); - let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; - assert_eq!(2, report.stale_blobs); - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, 0)?; + tree.major_compact(u64::MAX, 0)?; + 
tree.major_compact(u64::MAX, 0)?; // IMPORTANT: We cannot drop any blobs yet // because the watermark is too low // // This would previously fail - let report = tree.gc_scan_stats(seqno.get() + 1, 0)?; - assert_eq!(2, report.stale_blobs); + + { + let gc_stats = tree.current_version().gc_stats().clone(); + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } assert_eq!( &*tree.get("a", snapshot_seqno)?.unwrap(), @@ -80,5 +87,19 @@ fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { b"neptune3".repeat(10_000), ); + tree.major_compact(u64::MAX, 1_000)?; + + { + let gc_stats = tree.current_version().gc_stats().clone(); + assert!(!gc_stats.is_empty()); + } + + tree.major_compact(u64::MAX, 1_000)?; + + { + let gc_stats = tree.current_version().gc_stats().clone(); + assert_eq!(&lsm_tree::HashMap::default(), &*gc_stats); + } + Ok(()) } diff --git a/tests/blob_sep_threshold.rs b/tests/blob_sep_threshold.rs index ab643d91..fb537053 100644 --- a/tests/blob_sep_threshold.rs +++ b/tests/blob_sep_threshold.rs @@ -2,7 +2,6 @@ use lsm_tree::{AbstractTree, KvSeparationOptions, SeqNo}; use test_log::test; #[test] -#[ignore] fn blob_tree_separation_threshold() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index 8557ed1d..d51eb03f 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -63,47 +63,3 @@ fn blob_tree_simple_flush_read() -> lsm_tree::Result<()> { Ok(()) } - -#[cfg(feature = "lz4")] -#[test] -#[ignore = "wip"] -fn blob_tree_simple_compressed() -> lsm_tree::Result<()> { - todo!() - - // let folder = tempfile::tempdir()?; - // let path = folder.path(); - - // let tree = lsm_tree::Config::new(path) - // .compression(lsm_tree::CompressionType::Lz4) - // .open_as_blob_tree()?; - - // let big_value = b"neptune!".repeat(128_000); - - // assert!(tree.get("big", SeqNo::MAX)?.is_none()); - // tree.insert("big", &big_value, 0); - // tree.insert("smol", "small value", 0); - - // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - // assert_eq!(&*value, big_value); - - // tree.flush_active_memtable(0)?; - - // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - // assert_eq!(&*value, big_value); - - // let value = tree.get("smol", SeqNo::MAX)?.expect("should exist"); - // assert_eq!(&*value, b"small value"); - - // let new_big_value = b"winter!".repeat(128_000); - // tree.insert("big", &new_big_value, 1); - - // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - // assert_eq!(&*value, new_big_value); - - // tree.flush_active_memtable(0)?; - - // let value = tree.get("big", SeqNo::MAX)?.expect("should exist"); - // assert_eq!(&*value, new_big_value); - - // Ok(()) -} diff --git a/tests/blob_tombstone._rs b/tests/blob_tombstone._rs deleted file mode 100644 index 6f767808..00000000 --- a/tests/blob_tombstone._rs +++ /dev/null @@ -1,43 +0,0 @@ -use lsm_tree::{AbstractTree, SeqNo}; -use test_log::test; - -#[test] -#[ignore] -fn blob_tree_tombstone() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - let path = folder.path(); - - let tree = lsm_tree::Config::new(path).open_as_blob_tree()?; - - let big_value = b"neptune!".repeat(128_000); - - tree.insert("a", &big_value, 0); - tree.insert("b", &big_value, 0); - tree.insert("c", &big_value, 0); - assert_eq!(3, tree.len(SeqNo::MAX, None)?); - - tree.flush_active_memtable(0)?; - assert_eq!(3, tree.len(SeqNo::MAX, None)?); - - tree.remove("b", 1); - assert_eq!(2, tree.len(SeqNo::MAX, None)?); - - 
tree.flush_active_memtable(0)?; - assert_eq!(2, tree.len(SeqNo::MAX, None)?); - - assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), big_value); - assert!(tree.get("b", SeqNo::MAX)?.is_none()); - assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), big_value); - - tree.gc_scan_stats(2, 0)?; - - let strategy = lsm_tree::gc::StaleThresholdStrategy::new(0.01); - tree.apply_gc_strategy(&strategy, 2)?; - assert_eq!(2, tree.len(SeqNo::MAX, None)?); - - assert_eq!(&*tree.get("a", SeqNo::MAX)?.unwrap(), big_value); - assert!(tree.get("b", SeqNo::MAX)?.is_none()); - assert_eq!(&*tree.get("c", SeqNo::MAX)?.unwrap(), big_value); - - Ok(()) -} diff --git a/tests/blob_tree_flush._rs b/tests/blob_tree_flush._rs deleted file mode 100644 index 1fa4792c..00000000 --- a/tests/blob_tree_flush._rs +++ /dev/null @@ -1,36 +0,0 @@ -use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; -use test_log::test; - -#[test] -#[ignore] -fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = Config::new(&folder).open_as_blob_tree()?; - - let seqno = SequenceNumberCounter::default(); - - tree.insert("a", "neptune".repeat(10_000), seqno.next()); - tree.insert("b", "neptune".repeat(10_000), seqno.next()); - tree.flush_active_memtable(0)?; - - tree.remove("b", seqno.next()); - - tree.gc_scan_stats(seqno.get(), /* simulate some time has passed */ 1_000)?; - assert_eq!(2.0, tree.space_amp()); - - let strategy = lsm_tree::gc::SpaceAmpStrategy::new(1.0); - tree.apply_gc_strategy(&strategy, seqno.next())?; - assert_eq!(1, tree.blob_file_count()); - - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(1.0, tree.space_amp()); - - tree.flush_active_memtable(0)?; - assert_eq!(1, tree.blob_file_count()); - - tree.gc_scan_stats(seqno.get(), 1_000)?; - assert_eq!(1.0, tree.space_amp()); - - Ok(()) -} diff --git a/tests/experimental_blob_tree_guarded_size.rs b/tests/blob_tree_guarded_size.rs similarity index 83% rename from tests/experimental_blob_tree_guarded_size.rs rename to tests/blob_tree_guarded_size.rs index df7dbaff..634301cc 100644 --- a/tests/experimental_blob_tree_guarded_size.rs +++ b/tests/blob_tree_guarded_size.rs @@ -2,8 +2,7 @@ use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; #[test] -#[ignore = "restore"] -fn experimental_blob_tree_guarded_size() -> lsm_tree::Result<()> { +fn blob_tree_guarded_size() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(folder) diff --git a/tests/blob_tree_reload_blob.rs b/tests/blob_tree_reload_blob.rs index 784eb115..7ed5ccd4 100644 --- a/tests/blob_tree_reload_blob.rs +++ b/tests/blob_tree_reload_blob.rs @@ -4,7 +4,6 @@ use test_log::test; const ITEM_COUNT: usize = 10_000; #[test] -#[ignore] fn blob_tree_reload_empty() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -68,7 +67,6 @@ fn blob_tree_reload_empty() -> lsm_tree::Result<()> { } #[test] -#[ignore] fn blob_tree_reload() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/compaction_readers_grouping.rs b/tests/compaction_readers_grouping.rs index e32964a6..79c71b28 100644 --- a/tests/compaction_readers_grouping.rs +++ b/tests/compaction_readers_grouping.rs @@ -2,6 +2,7 @@ use lsm_tree::{AbstractTree, Config, SeqNo, SequenceNumberCounter}; use std::sync::Arc; use test_log::test; +/// NOTE: Fix: https://github.com/fjall-rs/lsm-tree/commit/66a974ae6748646a40df475c291e04cf1dfbaece #[test] #[ignore] fn compaction_readers_grouping() -> lsm_tree::Result<()> { diff --git 
a/tests/open_files.rs b/tests/open_files.rs deleted file mode 100644 index e4466387..00000000 --- a/tests/open_files.rs +++ /dev/null @@ -1,26 +0,0 @@ -use lsm_tree::{AbstractTree, Cache, Config, SeqNo}; -use std::sync::Arc; -use test_log::test; - -#[test] -#[ignore = "this is a sanity check test, but the data it writes is impossible, so the range scan first_key_value is doing is crashing as of 2.1.1 lol"] -fn open_file_limit() -> lsm_tree::Result<()> { - std::fs::create_dir_all(".test_open_files")?; - let folder = tempfile::tempdir_in(".test_open_files")?; - - let tree = Config::new(folder) - .use_cache(Arc::new(Cache::with_capacity_bytes(0))) - .open()?; - - for _ in 0..2_048 { - let key = 0u64.to_be_bytes(); - tree.insert(key, key, 0); - tree.flush_active_memtable(0)?; - } - - for _ in 0..5 { - assert!(tree.first_key_value(SeqNo::MAX, None)?.is_some()); - } - - Ok(()) -} diff --git a/tests/experimental_tree_guarded_range.rs b/tests/tree_guarded_range.rs similarity index 91% rename from tests/experimental_tree_guarded_range.rs rename to tests/tree_guarded_range.rs index 0b18937d..a359828e 100644 --- a/tests/experimental_tree_guarded_range.rs +++ b/tests/tree_guarded_range.rs @@ -2,7 +2,7 @@ use lsm_tree::{AbstractTree, Config, Guard, SeqNo}; use test_log::test; #[test] -fn experimental_tree_guarded_range() -> lsm_tree::Result<()> { +fn tree_guarded_range() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(folder).open()?; @@ -32,8 +32,7 @@ fn experimental_tree_guarded_range() -> lsm_tree::Result<()> { } #[test] -#[ignore = "restore"] -fn experimental_blob_tree_guarded_range() -> lsm_tree::Result<()> { +fn blob_tree_guarded_range() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let tree = Config::new(folder) diff --git a/tests/tree_range.rs b/tests/tree_range.rs index 8375e24c..21c18128 100644 --- a/tests/tree_range.rs +++ b/tests/tree_range.rs @@ -62,7 +62,6 @@ fn tree_range_count() -> lsm_tree::Result<()> { } #[test] -#[ignore = "restore"] fn blob_tree_range_count() -> lsm_tree::Result<()> { use std::ops::Bound::{self, Excluded, Unbounded}; From 8e7b43f364f2cca4cc96cb1705c36dcd798082c3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 9 Oct 2025 01:24:16 +0200 Subject: [PATCH 560/613] add back another test --- src/version/optimize.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/version/optimize.rs b/src/version/optimize.rs index 2ab53184..c031f8b7 100644 --- a/src/version/optimize.rs +++ b/src/version/optimize.rs @@ -91,7 +91,6 @@ mod tests { } #[test] - #[ignore = "fix!!!"] fn optimize_runs_two_overlap_2() { let runs = vec![ Run::new(vec![s(0, "a", "z")]), From cc8f7a2254622266da038546cf0ad7a112cd9df9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 9 Oct 2025 01:24:31 +0200 Subject: [PATCH 561/613] fix: blob tree guard size of inline value --- src/blob_tree/mod.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index fe91fed7..2222815d 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -36,8 +36,16 @@ impl IterGuard for Guard<'_> { } fn size(self) -> crate::Result { - let mut cursor = Cursor::new(self.kv?.value); - Ok(BlobIndirection::decode_from(&mut cursor)?.size) + let kv = self.kv?; + + if kv.key.value_type.is_indirection() { + let mut cursor = Cursor::new(kv.value); + Ok(BlobIndirection::decode_from(&mut cursor)?.size) + } else { + // NOTE: We know that values are u32 max length + #[allow(clippy::cast_possible_truncation)] + 
Ok(kv.value.len() as u32) + } } fn into_inner(self) -> crate::Result<(UserKey, UserValue)> { From 76e3df174f04f1cac93fb188c76fbc712f5f0dac Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 9 Oct 2025 15:55:03 +0200 Subject: [PATCH 562/613] derive --- src/config/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 9267dd70..7d9d05c3 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -58,7 +58,7 @@ impl TryFrom for TreeType { const DEFAULT_FILE_FOLDER: &str = ".lsm.data"; /// Options for key-value separation -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct KvSeparationOptions { /// What type of compression is used for blobs pub compression: CompressionType, From 44e3bbeb47212eac6dee78be2d2bdaca87daa122 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 13:54:32 +0200 Subject: [PATCH 563/613] remove level manifest --- src/level_manifest/mod.rs | 484 -------------------------------------- src/lib.rs | 3 - 2 files changed, 487 deletions(-) delete mode 100644 src/level_manifest/mod.rs diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs deleted file mode 100644 index 5fe80a79..00000000 --- a/src/level_manifest/mod.rs +++ /dev/null @@ -1,484 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -pub(crate) mod hidden_set; - -use crate::{ - coding::Decode, - file::{fsync_directory, rewrite_atomic}, - segment::Segment, - version::{Level, Run, Version, VersionId, DEFAULT_LEVEL_COUNT}, - vlog::BlobFileId, - BlobFile, SegmentId, SeqNo, -}; -use byteorder::{LittleEndian, ReadBytesExt}; -use hidden_set::HiddenSet; -use std::{ - collections::VecDeque, - io::BufWriter, - path::{Path, PathBuf}, - sync::Arc, -}; - -pub struct Recovery { - pub curr_version_id: VersionId, - pub segment_ids: Vec>>, - pub blob_file_ids: Vec, - pub gc_stats: crate::blob_tree::FragmentationMap, -} - -/// Represents the levels of a log-structured merge tree -pub struct LevelManifest { - /// Path of tree folder. - folder: PathBuf, - - /// Current version. - current: Version, - - /// Set of segment IDs that are masked. - /// - /// While consuming segments (because of compaction) they will not appear in the list of segments - /// as to not cause conflicts between multiple compaction threads (compacting the same segments). - hidden_set: HiddenSet, - - /// Holds onto versions until they are safe to drop. - pub(crate) version_free_list: VecDeque, -} - -impl std::fmt::Display for LevelManifest { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for (idx, level) in self.current.iter_levels().enumerate() { - writeln!( - f, - "{idx} [{}], r={}: ", - match (level.is_empty(), level.is_disjoint()) { - (true, _) => ".", - (false, true) => "D", - (false, false) => "_", - }, - level.len(), - )?; - - for run in level.iter() { - write!(f, " ")?; - - if run.len() >= 30 { - #[allow(clippy::indexing_slicing)] - for segment in run.iter().take(2) { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( - f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - if is_hidden { ")" } else { "]" }, - )?; - } - write!(f, " . . . 
")?; - - #[allow(clippy::indexing_slicing)] - for segment in run.iter().rev().take(2).rev() { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( - f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - if is_hidden { ")" } else { "]" }, - )?; - } - - writeln!( - f, - " | # = {}, {} MiB", - run.len(), - run.iter().map(Segment::file_size).sum::() / 1_024 / 1_024, - )?; - } else { - for segment in run.iter() { - let id = segment.id(); - let is_hidden = self.hidden_set.is_hidden(id); - - write!( - f, - "{}{id}{}", - if is_hidden { "(" } else { "[" }, - if is_hidden { ")" } else { "]" }, - )?; - } - - writeln!( - f, - " | # = {}, {} MiB", - run.len(), - run.iter().map(Segment::file_size).sum::() / 1_024 / 1_024, - )?; - } - } - } - - Ok(()) - } -} - -impl LevelManifest { - #[must_use] - pub fn current_version(&self) -> &Version { - &self.current - } - - pub(crate) fn is_compacting(&self) -> bool { - !self.hidden_set.is_empty() - } - - pub(crate) fn create_new>(folder: P) -> crate::Result { - // assert!(level_count > 0, "level_count should be >= 1"); - - #[allow(unused_mut)] - let mut manifest = Self { - folder: folder.into(), - current: Version::new(0), - hidden_set: HiddenSet::default(), - version_free_list: VecDeque::default(), - }; - - Self::persist_version(&manifest.folder, &manifest.current)?; - - Ok(manifest) - } - - pub(crate) fn recover_ids(folder: &Path) -> crate::Result { - let curr_version_id = Self::get_current_version(folder)?; - let version_file_path = folder.join(format!("v{curr_version_id}")); - - log::info!( - "Recovering current manifest at {}", - version_file_path.display(), - ); - - let reader = sfa::Reader::new(&version_file_path)?; - let toc = reader.toc(); - - // // TODO: vvv move into Version::decode vvv - let mut levels = vec![]; - - { - let mut reader = toc - .section(b"tables") - .expect("tables should exist") - .buf_reader(&version_file_path)?; - - let level_count = reader.read_u8()?; - - for _ in 0..level_count { - let mut level = vec![]; - let run_count = reader.read_u8()?; - - for _ in 0..run_count { - let mut run = vec![]; - let segment_count = reader.read_u32::()?; - - for _ in 0..segment_count { - let id = reader.read_u64::()?; - run.push(id); - } - - level.push(run); - } - - levels.push(level); - } - } - - let blob_file_ids = { - let mut reader = toc - .section(b"blob_files") - .expect("blob_files should exist") - .buf_reader(&version_file_path)?; - - let blob_file_count = reader.read_u32::()?; - let mut blob_file_ids = Vec::with_capacity(blob_file_count as usize); - - for _ in 0..blob_file_count { - let id = reader.read_u64::()?; - blob_file_ids.push(id); - } - - blob_file_ids - }; - - let gc_stats = { - let mut reader = toc - .section(b"blob_gc_stats") - .expect("blob_gc_stats should exist") - .buf_reader(&version_file_path)?; - - crate::blob_tree::FragmentationMap::decode_from(&mut reader)? 
- }; - - Ok(Recovery { - curr_version_id, - segment_ids: levels, - blob_file_ids, - gc_stats, - }) - } - - pub fn get_current_version(folder: &Path) -> crate::Result { - std::fs::File::open(folder.join("current")) - .and_then(|mut f| f.read_u64::()) - .map_err(Into::into) - } - - pub(crate) fn recover>( - folder: P, - recovery: Recovery, - segments: &[Segment], - blob_files: &[BlobFile], - ) -> crate::Result { - let version_levels = recovery - .segment_ids - .iter() - .map(|level| { - let level_runs = level - .iter() - .map(|run| { - let run_segments = run - .iter() - .map(|segment_id| { - segments - .iter() - .find(|x| x.id() == *segment_id) - .cloned() - .ok_or(crate::Error::Unrecoverable) - }) - .collect::>>()?; - - Ok(Arc::new(Run::new(run_segments))) - }) - .collect::>>()?; - - Ok(Level::from_runs(level_runs)) - }) - .collect::>>()?; - - Ok(Self { - current: Version::from_levels( - recovery.curr_version_id, - version_levels, - blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect(), - recovery.gc_stats, - ), - folder: folder.into(), - hidden_set: HiddenSet::default(), - version_free_list: VecDeque::default(), // TODO: 3. create free list from versions that are N < CURRENT, or delete old versions eagerly... - }) - } - - fn persist_version(folder: &Path, version: &Version) -> crate::Result<()> { - log::trace!( - "Persisting version {} in {}", - version.id(), - folder.display(), - ); - - let path = folder.join(format!("v{}", version.id())); - let file = std::fs::File::create_new(path)?; - let writer = BufWriter::new(file); - let mut writer = sfa::Writer::into_writer(writer); - - version.encode_into(&mut writer)?; - - writer.finish().map_err(|e| match e { - sfa::Error::Io(e) => crate::Error::from(e), - _ => unreachable!(), - })?; - - // IMPORTANT: fsync folder on Unix - fsync_directory(folder)?; - - rewrite_atomic(&folder.join("current"), &version.id().to_le_bytes())?; - - Ok(()) - } - - /// Modifies the level manifest atomically. - /// - /// The function accepts a transition function that receives the current version - /// and returns a new version. - /// - /// The function takes care of persisting the version changes on disk. - pub(crate) fn atomic_swap crate::Result>( - &mut self, - f: F, - gc_watermark: SeqNo, - ) -> crate::Result<()> { - // NOTE: Copy-on-write... 
- // - // Create a copy of the levels we can operate on - // without mutating the current level manifest - // If persisting to disk fails, this way the level manifest - // is unchanged - let next_version = f(&self.current)?; - - Self::persist_version(&self.folder, &next_version)?; - - let mut old_version = std::mem::replace(&mut self.current, next_version); - old_version.seqno_watermark = gc_watermark; - - self.version_free_list.push_back(old_version); - - Ok(()) - } - - pub(crate) fn maintenance(&mut self, gc_watermark: SeqNo) -> crate::Result<()> { - log::debug!("Running manifest GC"); - - loop { - let Some(head) = self.version_free_list.front() else { - break; - }; - - if head.seqno_watermark < gc_watermark { - let path = self.folder.join(format!("v{}", head.id())); - std::fs::remove_file(path)?; - self.version_free_list.pop_front(); - } else { - break; - } - } - - log::debug!("Manifest GC done"); - - Ok(()) - } - - /// Returns `true` if there are no segments - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Returns the number of levels in the tree - #[must_use] - pub fn level_count(&self) -> u8 { - // NOTE: Level count is u8 - #[allow(clippy::cast_possible_truncation)] - { - self.current.level_count() as u8 - } - } - - /// Returns the number of segments, summed over all levels - #[must_use] - pub fn len(&self) -> usize { - self.current.segment_count() - } - - /// Returns the (compressed) size of all segments - #[must_use] - pub fn size(&self) -> u64 { - self.iter().map(Segment::file_size).sum() - } - - #[must_use] - pub fn level_is_busy(&self, idx: usize) -> bool { - self.current.level(idx).is_some_and(|level| { - level - .iter() - .flat_map(|run| run.iter()) - .any(|segment| self.hidden_set.is_hidden(segment.id())) - }) - } - - #[must_use] - pub fn as_slice(&self) -> &[Level] { - &self.current.levels - } - - pub fn iter(&self) -> impl Iterator { - self.current.iter_segments() - } - - pub(crate) fn should_decline_compaction>( - &self, - ids: T, - ) -> bool { - self.hidden_set().is_blocked(ids) - } - - pub(crate) fn hidden_set(&self) -> &HiddenSet { - &self.hidden_set - } - - pub(crate) fn hide_segments>(&mut self, keys: T) { - self.hidden_set.hide(keys); - } - - pub(crate) fn show_segments>(&mut self, keys: T) { - self.hidden_set.show(keys); - } -} - -#[cfg(test)] -#[allow(clippy::expect_used)] -mod tests { - use crate::AbstractTree; - use test_log::test; - - #[test] - fn level_manifest_atomicity() -> crate::Result<()> { - let folder = tempfile::tempdir()?; - - let tree = crate::Config::new(folder).open()?; - - tree.insert("a", "a", 0); - tree.flush_active_memtable(0)?; - tree.insert("a", "a", 1); - tree.flush_active_memtable(0)?; - tree.insert("a", "a", 2); - tree.flush_active_memtable(0)?; - - assert_eq!(3, tree.approximate_len()); - - tree.major_compact(u64::MAX, 3)?; - - assert_eq!(1, tree.segment_count()); - - tree.insert("a", "a", 3); - tree.flush_active_memtable(0)?; - - let segment_count_before_major_compact = tree.segment_count(); - - let crate::AnyTree::Standard(tree) = tree else { - unreachable!(); - }; - - { - // NOTE: Purposefully change level manifest to have invalid path - // to force an I/O error - tree.super_version - .write() - .expect("lock is poisoned") - .manifest - .folder = "/invaliiid/asd".into(); - } - - assert!(tree.major_compact(u64::MAX, 4).is_err()); - - assert!(tree - .super_version - .read() - .expect("lock is poisoned") - .manifest - .hidden_set - .is_empty()); - - assert_eq!(segment_count_before_major_compact, 
tree.segment_count());
-
-        Ok(())
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index cd2883eb..6328eb7a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -178,9 +178,6 @@ mod iter_guard;
 mod key;
 mod key_range;
 
-#[doc(hidden)]
-pub mod level_manifest;
-
 mod run_reader;
 mod run_scanner;
 
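With the level manifest deleted, its two responsibilities are rebuilt separately: recovery on top of `Version` (next patch) and compaction bookkeeping (the patch after). The intended recovery flow, pieced together from patches 564 and 566, looks roughly like this (a sketch; `open_segments` is a hypothetical stand-in for the actual table-loading code in `Tree::recover_levels`):

    // Read the `current` pointer and the referenced version file (v{N})
    let recovery = recover_ids(tree_path)?;
    // Open the table files listed in the recovered version...
    let segments = open_segments(tree_path, &recovery.segment_ids)?;
    // ...then rebuild the in-memory Version from the recovered IDs
    let version = Version::from_recovery(recovery, &segments, &blob_files)?;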
From 0bfdf2515ecdecca00037c71d763cace1b025dda Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 11 Oct 2025 13:54:48 +0200
Subject: [PATCH 564/613] version recovery

---
 src/version/mod.rs      | 135 +++++++++++++++++++++++++++++++++++++++-
 src/version/recovery.rs |  99 +++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 1 deletion(-)
 create mode 100644 src/version/recovery.rs

diff --git a/src/version/mod.rs b/src/version/mod.rs
index 4f47f6c6..2c27b92c 100644
--- a/src/version/mod.rs
+++ b/src/version/mod.rs
@@ -3,18 +3,22 @@
 // (found in the LICENSE-* files in the repository)
 
 mod optimize;
+pub(crate) mod recovery;
 pub mod run;
 
 pub use run::Run;
 
 use crate::blob_tree::{FragmentationEntry, FragmentationMap};
 use crate::coding::Encode;
+use crate::compaction::state::hidden_set::HiddenSet;
+use crate::version::recovery::Recovery;
 use crate::{
     vlog::{BlobFile, BlobFileId},
     HashSet, KeyRange, Segment, SegmentId, SeqNo,
 };
 use optimize::optimize_runs;
 use run::Ranged;
+use std::path::PathBuf;
 use std::{collections::BTreeMap, ops::Deref, sync::Arc};
 
 pub const DEFAULT_LEVEL_COUNT: u8 = 7;
@@ -142,7 +146,7 @@ pub struct VersionInner {
     id: VersionId,
 
     /// The individual LSM-tree levels which consist of runs of tables
-    pub(crate) levels: Vec<Level>,
+    levels: Vec<Level>,
 
     // TODO: 3.0.0 this should really be a newtype
     // NOTE: We purposefully use Arc<_> to avoid deep cloning the blob files again and again
@@ -190,6 +194,20 @@ impl Version {
         &self.gc_stats
     }
 
+    pub fn l0(&self) -> &Level {
+        self.levels.first().expect("L0 should exist")
+    }
+
+    #[must_use]
+    pub fn level_is_busy(&self, idx: usize, hidden_set: &HiddenSet) -> bool {
+        self.level(idx).is_some_and(|level| {
+            level
+                .iter()
+                .flat_map(|run: &Arc<Run<Segment>>| run.iter())
+                .any(|segment| hidden_set.is_hidden(segment.id()))
+        })
+    }
+
     /// Creates a new empty version.
     pub fn new(id: VersionId) -> Self {
         let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect();
@@ -205,6 +223,45 @@ impl Version {
         }
     }
 
+    pub(crate) fn from_recovery(
+        recovery: Recovery,
+        segments: &[Segment],
+        blob_files: &[BlobFile],
+    ) -> crate::Result<Self> {
+        let version_levels = recovery
+            .segment_ids
+            .iter()
+            .map(|level| {
+                let level_runs = level
+                    .iter()
+                    .map(|run| {
+                        let run_segments = run
+                            .iter()
+                            .map(|segment_id| {
+                                segments
+                                    .iter()
+                                    .find(|x| x.id() == *segment_id)
+                                    .cloned()
+                                    .ok_or(crate::Error::Unrecoverable)
+                            })
+                            .collect::<crate::Result<Vec<_>>>()?;
+
+                        Ok(Arc::new(Run::new(run_segments)))
+                    })
+                    .collect::<crate::Result<Vec<_>>>()?;
+
+                Ok(Level::from_runs(level_runs))
+            })
+            .collect::<crate::Result<Vec<_>>>()?;
+
+        Ok(Self::from_levels(
+            recovery.curr_version_id,
+            version_levels,
+            blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect(),
+            recovery.gc_stats,
+        ))
+    }
+
     /// Creates a new pre-populated version.
     pub fn from_levels(
         id: VersionId,
@@ -581,4 +638,80 @@ impl Version {
 
         Ok(())
     }
+
+    pub fn fmt(&self, f: &mut std::fmt::Formatter<'_>, hidden_set: &HiddenSet) -> std::fmt::Result {
+        for (idx, level) in self.iter_levels().enumerate() {
+            writeln!(
+                f,
+                "{idx} [{}], r={}: ",
+                match (level.is_empty(), level.is_disjoint()) {
+                    (true, _) => ".",
+                    (false, true) => "D",
+                    (false, false) => "_",
+                },
+                level.len(),
+            )?;
+
+            for run in level.iter() {
+                write!(f, " ")?;
+
+                if run.len() >= 30 {
+                    #[allow(clippy::indexing_slicing)]
+                    for segment in run.iter().take(2) {
+                        let id = segment.id();
+                        let is_hidden = hidden_set.is_hidden(id);
+
+                        write!(
+                            f,
+                            "{}{id}{}",
+                            if is_hidden { "(" } else { "[" },
+                            if is_hidden { ")" } else { "]" },
+                        )?;
+                    }
+                    write!(f, " . . . ")?;
+
+                    #[allow(clippy::indexing_slicing)]
+                    for segment in run.iter().rev().take(2).rev() {
+                        let id = segment.id();
+                        let is_hidden = hidden_set.is_hidden(id);
+
+                        write!(
+                            f,
+                            "{}{id}{}",
+                            if is_hidden { "(" } else { "[" },
+                            if is_hidden { ")" } else { "]" },
+                        )?;
+                    }
+
+                    writeln!(
+                        f,
+                        " | # = {}, {} MiB",
+                        run.len(),
+                        run.iter().map(Segment::file_size).sum::<u64>() / 1_024 / 1_024,
+                    )?;
+                } else {
+                    for segment in run.iter() {
+                        let id = segment.id();
+                        let is_hidden = hidden_set.is_hidden(id);
+
+                        write!(
+                            f,
+                            "{}{id}{}",
+                            if is_hidden { "(" } else { "[" },
+                            if is_hidden { ")" } else { "]" },
+                        )?;
+                    }
+
+                    writeln!(
+                        f,
+                        " | # = {}, {} MiB",
+                        run.len(),
+                        run.iter().map(Segment::file_size).sum::<u64>() / 1_024 / 1_024,
+                    )?;
+                }
+            }
+        }
+
+        Ok(())
+    }
 }
diff --git a/src/version/recovery.rs b/src/version/recovery.rs
new file mode 100644
index 00000000..2627ce8d
--- /dev/null
+++ b/src/version/recovery.rs
@@ -0,0 +1,99 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+use crate::{coding::Decode, version::VersionId, vlog::BlobFileId, SegmentId};
+use byteorder::{LittleEndian, ReadBytesExt};
+use std::path::Path;
+
+pub fn get_current_version(folder: &std::path::Path) -> crate::Result<VersionId> {
+    use byteorder::{LittleEndian, ReadBytesExt};
+
+    std::fs::File::open(folder.join("current"))
+        .and_then(|mut f| f.read_u64::<LittleEndian>())
+        .map_err(Into::into)
+}
+
+pub struct Recovery {
+    pub curr_version_id: VersionId,
+    pub segment_ids: Vec<Vec<Vec<SegmentId>>>,
+    pub blob_file_ids: Vec<BlobFileId>,
+    pub gc_stats: crate::blob_tree::FragmentationMap,
+}
+
+pub fn recover_ids(folder: &Path) -> crate::Result<Recovery> {
+    let curr_version_id = get_current_version(folder)?;
+    let version_file_path = folder.join(format!("v{curr_version_id}"));
+
+    log::info!(
+        "Recovering current manifest at {}",
+        version_file_path.display(),
+    );
+
+    let reader = sfa::Reader::new(&version_file_path)?;
+    let toc = reader.toc();
+
+    // TODO: vvv move into Version::decode vvv
+    let mut levels = vec![];
+
+    {
+        let mut reader = toc
+            .section(b"tables")
+            .expect("tables should exist")
+            .buf_reader(&version_file_path)?;
+
+        let level_count = reader.read_u8()?;
+
+        for _ in 0..level_count {
+            let mut level = vec![];
+            let run_count = reader.read_u8()?;
+
+            for _ in 0..run_count {
+                let mut run = vec![];
+                let segment_count = reader.read_u32::<LittleEndian>()?;
+
+                for _ in 0..segment_count {
+                    let id = reader.read_u64::<LittleEndian>()?;
+                    run.push(id);
+                }
+
+                level.push(run);
+            }
+
+            levels.push(level);
+        }
+    }
+
+    let blob_file_ids = {
+        let mut reader = toc
+            .section(b"blob_files")
+            .expect("blob_files should exist")
+            .buf_reader(&version_file_path)?;
+
+        let blob_file_count = reader.read_u32::<LittleEndian>()?;
+        let mut blob_file_ids = Vec::with_capacity(blob_file_count as usize);
+
+        for _ in 0..blob_file_count {
+            let id = reader.read_u64::<LittleEndian>()?;
+            blob_file_ids.push(id);
+        }
+
+        blob_file_ids
+    };
+
+    let gc_stats = {
+        let mut reader = toc
+            .section(b"blob_gc_stats")
+            .expect("blob_gc_stats should exist")
+            .buf_reader(&version_file_path)?;
+
+        crate::blob_tree::FragmentationMap::decode_from(&mut reader)?
+    };
+
+    Ok(Recovery {
+        curr_version_id,
+        segment_ids: levels,
+        blob_file_ids,
+        gc_stats,
+    })
+}
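The following patch then moves compaction bookkeeping into a dedicated `CompactionState`. Its central primitive, `upgrade_version`, persists the successor version before swapping it into the shared `SuperVersion`, so a failed write leaves the in-memory state untouched. Callers hold both the compaction-state and super-version locks, roughly like this (a sketch based on the `register_segments` change in patch 566; the closure's argument list is simplified):

    let mut state = tree.compaction_state.lock().expect("lock is poisoned");
    let mut super_version = tree.super_version.write().expect("lock is poisoned");

    state.upgrade_version(
        &mut super_version,
        |version| Ok(version.with_new_l0_run(segments, blob_files, frag_map)),
        seqno_threshold,
    )?;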
From 493d03c5660ee0add6eade57b6acc6dbbea98ec6 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Sat, 11 Oct 2025 13:55:24 +0200
Subject: [PATCH 565/613] compaction state

---
 src/compaction/mod.rs                       |   7 +-
 .../state}/hidden_set.rs                    |  11 +
 src/compaction/state/mod.rs                 | 203 ++++++++++++++++++
 3 files changed, 219 insertions(+), 2 deletions(-)
 rename src/{level_manifest => compaction/state}/hidden_set.rs (76%)
 create mode 100644 src/compaction/state/mod.rs

diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs
index f69938e4..f8720d4b 100644
--- a/src/compaction/mod.rs
+++ b/src/compaction/mod.rs
@@ -12,6 +12,7 @@ mod flavour;
 pub(crate) mod major;
 pub(crate) mod movedown;
 pub(crate) mod pulldown;
+pub(crate) mod state;
 pub(crate) mod stream;
 pub(crate) mod tiered;
 pub(crate) mod worker;
@@ -20,7 +21,9 @@ pub use fifo::Strategy as Fifo;
 pub use leveled::Strategy as Leveled;
 pub use tiered::Strategy as SizeTiered;
 
-use crate::{config::Config, level_manifest::LevelManifest, HashSet, SegmentId};
+use crate::{
+    compaction::state::CompactionState, config::Config, version::Version, HashSet, SegmentId,
+};
 
 /// Alias for `Leveled`
 pub type Levelled = Leveled;
@@ -83,5 +86,5 @@ pub trait CompactionStrategy {
    fn get_name(&self) -> &'static str;
 
    /// Decides on what to do based on the current state of the LSM-tree's levels
-    fn choose(&self, _: &LevelManifest, config: &Config) -> Choice;
+    fn choose(&self, version: &Version, config: &Config, state: &CompactionState) -> Choice;
 }
diff --git a/src/level_manifest/hidden_set.rs b/src/compaction/state/hidden_set.rs
similarity index 76%
rename from src/level_manifest/hidden_set.rs
rename to src/compaction/state/hidden_set.rs
index 90615e7a..1f07ae69 100644
--- a/src/level_manifest/hidden_set.rs
+++ b/src/compaction/state/hidden_set.rs
@@ -1,3 +1,7 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
 use crate::SegmentId;
 
 /// The hidden set keeps track of which segments are currently being compacted
@@ -33,4 +37,11 @@ impl HiddenSet {
     pub(crate) fn is_empty(&self) -> bool {
         self.set.is_empty()
     }
+
+    pub(crate) fn should_decline_compaction<T: IntoIterator<Item = SegmentId>>(
+        &self,
+        ids: T,
+    ) -> bool {
+        self.is_blocked(ids)
+    }
 }
diff --git a/src/compaction/state/mod.rs b/src/compaction/state/mod.rs
new file mode 100644
index 00000000..9bc304c1
--- /dev/null
+++ b/src/compaction/state/mod.rs
@@ -0,0 +1,203 @@
+// Copyright (c) 2024-present, fjall-rs
+// This source code is licensed under both the Apache 2.0 and MIT License
+// (found in the LICENSE-* files in the repository)
+
+pub mod hidden_set;
+
+use crate::{
+    file::{fsync_directory, rewrite_atomic},
+    tree::inner::SuperVersion,
+    version::Version,
+    SeqNo,
+};
+use hidden_set::HiddenSet;
+use std::{
+    collections::VecDeque,
+    io::BufWriter,
+    path::{Path, PathBuf},
+};
+
+pub fn persist_version(folder: &Path, version: &Version) -> crate::Result<()> {
+    log::trace!(
+        "Persisting version {} in {}",
+        version.id(),
+        folder.display(),
+    );
+
+    let path = folder.join(format!("v{}", version.id()));
+    let file = std::fs::File::create_new(path)?;
+    let writer = BufWriter::new(file);
+    let mut writer = sfa::Writer::into_writer(writer);
+
+    version.encode_into(&mut writer)?;
+
+    writer.finish().map_err(|e| match e {
+        sfa::Error::Io(e) => crate::Error::from(e),
+        _ => unreachable!(),
+    })?;
+
+    // IMPORTANT: fsync folder on Unix
+    fsync_directory(folder)?;
+
+    rewrite_atomic(&folder.join("current"), &version.id().to_le_bytes())?;
+
+    Ok(())
+}
+
+pub struct CompactionState {
+    /// Path of tree folder.
+    folder: PathBuf,
+
+    /// Set of segment IDs that are masked.
+    ///
+    /// While consuming segments (because of compaction) they will not appear in the list of segments
+    /// as to not cause conflicts between multiple compaction threads (compacting the same segments).
+    hidden_set: HiddenSet,
+
+    /// Holds onto versions until they are safe to drop.
+    version_free_list: VecDeque<Version>,
+}
+
+impl CompactionState {
+    pub fn new(folder: impl Into<PathBuf>) -> Self {
+        Self {
+            folder: folder.into(),
+            hidden_set: HiddenSet::default(),
+            version_free_list: VecDeque::default(),
+        }
+    }
+
+    pub fn create_new(folder: impl Into<PathBuf>) -> crate::Result<Self> {
+        let folder = folder.into();
+
+        persist_version(&folder, &Version::new(0))?;
+
+        Ok(Self::new(folder))
+    }
+
+    /// Modifies the current version atomically.
+    ///
+    /// The function accepts a transition function that receives the current version
+    /// and returns a new version.
+    ///
+    /// The function takes care of persisting the version change to disk.
+    pub(crate) fn upgrade_version<F: FnOnce(&Version) -> crate::Result<Version>>(
+        &mut self,
+        super_version: &mut SuperVersion,
+        f: F,
+        gc_watermark: SeqNo,
+    ) -> crate::Result<()> {
+        // NOTE: Copy-on-write...
+ // + // Create a copy of the levels we can operate on + // without mutating the current level manifest + // If persisting to disk fails, this way the level manifest + // is unchanged + let next_version = f(&super_version.version)?; + + persist_version(&self.folder, &next_version)?; + + let mut old_version = std::mem::replace(&mut super_version.version, next_version); + old_version.seqno_watermark = gc_watermark; + + self.push_old_version(old_version); + + Ok(()) + } + + fn push_old_version(&mut self, version: Version) { + self.version_free_list.push_back(version); + } + + pub fn version_free_list_len(&self) -> usize { + self.version_free_list.len() + } + + pub fn hidden_set(&self) -> &HiddenSet { + &self.hidden_set + } + + pub fn hidden_set_mut(&mut self) -> &mut HiddenSet { + &mut self.hidden_set + } + + pub(crate) fn maintenance(&mut self, gc_watermark: SeqNo) -> crate::Result<()> { + log::debug!("Running manifest GC"); + + loop { + let Some(head) = self.version_free_list.front() else { + break; + }; + + if head.seqno_watermark < gc_watermark { + let path = self.folder.join(format!("v{}", head.id())); + std::fs::remove_file(path)?; + self.version_free_list.pop_front(); + } else { + break; + } + } + + log::debug!("Manifest GC done"); + + Ok(()) + } +} + +#[cfg(test)] +#[allow(clippy::expect_used)] +mod tests { + use crate::AbstractTree; + use test_log::test; + + #[test] + fn level_manifest_atomicity() -> crate::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = crate::Config::new(folder).open()?; + + tree.insert("a", "a", 0); + tree.flush_active_memtable(0)?; + tree.insert("a", "a", 1); + tree.flush_active_memtable(0)?; + tree.insert("a", "a", 2); + tree.flush_active_memtable(0)?; + + assert_eq!(3, tree.approximate_len()); + + tree.major_compact(u64::MAX, 3)?; + + assert_eq!(1, tree.segment_count()); + + tree.insert("a", "a", 3); + tree.flush_active_memtable(0)?; + + let segment_count_before_major_compact = tree.segment_count(); + + let crate::AnyTree::Standard(tree) = tree else { + unreachable!(); + }; + + { + // NOTE: Purposefully change level manifest to have invalid path + // to force an I/O error + tree.compaction_state + .lock() + .expect("lock is poisoned") + .folder = "/invaliiid/asd".into(); + } + + assert!(tree.major_compact(u64::MAX, 4).is_err()); + + assert!(tree + .compaction_state + .lock() + .expect("lock is poisoned") + .hidden_set() + .is_empty()); + + assert_eq!(segment_count_before_major_compact, tree.segment_count()); + + Ok(()) + } +} From e5fd9587f6024fa61bacf6b248effa3da5a5001b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 13:56:40 +0200 Subject: [PATCH 566/613] implement compaction state in tree --- src/range.rs | 9 +-- src/segment/index_block/block_handle.rs | 2 +- src/tree/inner.rs | 26 +++++---- src/tree/mod.rs | 73 +++++++++++++------------ 4 files changed, 56 insertions(+), 54 deletions(-) diff --git a/src/range.rs b/src/range.rs index 7b5034dd..8b56d233 100644 --- a/src/range.rs +++ b/src/range.rs @@ -4,7 +4,6 @@ use crate::{ key::InternalKey, - level_manifest::LevelManifest, memtable::Memtable, merge::Merger, mvcc_stream::MvccStream, @@ -144,7 +143,7 @@ impl TreeIter { guard: IterState, range: R, seqno: SeqNo, - level_manifest: &LevelManifest, + version: &Version, ) -> Self { Self::new(guard, |lock| { let lo = match range.start_bound() { @@ -209,11 +208,7 @@ impl TreeIter { // }; #[allow(clippy::needless_continue)] - for run in level_manifest - .current_version() - .iter_levels() - .flat_map(|lvl| lvl.iter()) - { + for run 
in version.iter_levels().flat_map(|lvl| lvl.iter()) { match run.len() { 0 => continue, 1 => { diff --git a/src/segment/index_block/block_handle.rs b/src/segment/index_block/block_handle.rs index 83f6ec4e..7b4c194c 100644 --- a/src/segment/index_block/block_handle.rs +++ b/src/segment/index_block/block_handle.rs @@ -263,6 +263,6 @@ impl Decodable for KeyedBlockHandle { offset: usize, base_key_offset: usize, ) -> Option { - todo!() + unimplemented!() } } diff --git a/src/tree/inner.rs b/src/tree/inner.rs index 7fb99e62..79bb08b6 100644 --- a/src/tree/inner.rs +++ b/src/tree/inner.rs @@ -3,8 +3,13 @@ // (found in the LICENSE-* files in the repository) use crate::{ - config::Config, level_manifest::LevelManifest, memtable::Memtable, stop_signal::StopSignal, - tree::sealed::SealedMemtables, SegmentId, SequenceNumberCounter, + compaction::state::{persist_version, CompactionState}, + config::Config, + memtable::Memtable, + stop_signal::StopSignal, + tree::sealed::SealedMemtables, + version::Version, + SegmentId, SequenceNumberCounter, }; use std::sync::{atomic::AtomicU64, Arc, Mutex, RwLock}; @@ -35,7 +40,7 @@ pub struct SuperVersion { pub(crate) sealed_memtables: Arc, /// Current tree version - pub(crate) manifest: LevelManifest, + pub(crate) version: Version, } #[allow(clippy::module_name_repetitions)] @@ -53,6 +58,8 @@ pub struct TreeInner { pub(crate) super_version: Arc>, + pub(crate) compaction_state: Arc>, + /// Tree configuration pub config: Config, @@ -66,10 +73,6 @@ pub struct TreeInner { /// can be concurrent next to each other. pub(crate) major_compaction_lock: RwLock<()>, - // TODO: 3.0.0 compaction state - // Serializes compactions when they look at the tree levels and prepare compactions - pub(crate) compaction_lock: Arc>, - #[doc(hidden)] #[cfg(feature = "metrics")] pub metrics: Arc, @@ -77,7 +80,10 @@ pub struct TreeInner { impl TreeInner { pub(crate) fn create_new(config: Config) -> crate::Result { - let manifest = LevelManifest::create_new(&config.path)?; + let version = Version::new(0); + persist_version(&config.path, &version)?; + + let path = config.path.clone(); Ok(Self { id: get_next_tree_id(), @@ -87,11 +93,11 @@ impl TreeInner { super_version: Arc::new(RwLock::new(SuperVersion { active_memtable: Arc::default(), sealed_memtables: Arc::default(), - manifest, + version, })), stop_signal: StopSignal::default(), major_compaction_lock: RwLock::default(), - compaction_lock: Arc::default(), + compaction_state: Arc::new(Mutex::new(CompactionState::new(path))), #[cfg(feature = "metrics")] metrics: Metrics::default().into(), diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 7cfc0601..eb374341 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -9,19 +9,18 @@ mod sealed; use crate::{ blob_tree::FragmentationMap, coding::{Decode, Encode}, - compaction::{drop_range::OwnedBounds, CompactionStrategy}, + compaction::{drop_range::OwnedBounds, state::CompactionState, CompactionStrategy}, config::Config, file::BLOBS_FOLDER, format_version::FormatVersion, iter_guard::{IterGuard, IterGuardImpl}, - level_manifest::LevelManifest, manifest::Manifest, memtable::Memtable, segment::Segment, slice::Slice, tree::inner::SuperVersion, value::InternalValue, - version::Version, + version::{recovery::recover_ids, Version}, vlog::BlobFile, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, TreeType, UserKey, UserValue, ValueType, @@ -31,7 +30,7 @@ use std::{ io::Cursor, ops::{Bound, RangeBounds}, path::Path, - sync::{atomic::AtomicU64, Arc, RwLock}, + 
sync::{atomic::AtomicU64, Arc, Mutex, RwLock}, }; #[cfg(feature = "metrics")] @@ -106,12 +105,7 @@ impl AbstractTree for Tree { } fn current_version(&self) -> Version { - self.super_version - .read() - .expect("poisoned") - .manifest - .current_version() - .clone() + self.super_version.read().expect("poisoned").version.clone() } fn flush_active_memtable(&self, seqno_threshold: SeqNo) -> crate::Result> { @@ -137,12 +131,10 @@ impl AbstractTree for Tree { } fn version_free_list_len(&self) -> usize { - self.super_version - .read() + self.compaction_state + .lock() .expect("lock is poisoned") - .manifest - .version_free_list - .len() + .version_free_list_len() } fn prefix>( @@ -400,9 +392,11 @@ impl AbstractTree for Tree { blob_files.map(<[BlobFile]>::len).unwrap_or_default(), ); - let mut version_lock = self.super_version.write().expect("lock is poisoned"); + let mut compaction_state = self.compaction_state.lock().expect("lock is poisoned"); + let mut super_version = self.super_version.write().expect("lock is poisoned"); - version_lock.manifest.atomic_swap( + compaction_state.upgrade_version( + &mut super_version, |version| { Ok(version.with_new_l0_run( segments, @@ -416,8 +410,8 @@ impl AbstractTree for Tree { for segment in segments { log::trace!("releasing sealed memtable {}", segment.id()); - version_lock.sealed_memtables = - Arc::new(version_lock.sealed_memtables.remove(segment.id())); + super_version.sealed_memtables = + Arc::new(super_version.sealed_memtables.remove(segment.id())); } Ok(()) @@ -693,11 +687,12 @@ impl Tree { #[doc(hidden)] #[must_use] pub fn is_compacting(&self) -> bool { - self.super_version - .read() + !self + .compaction_state + .lock() .expect("lock is poisoned") - .manifest - .is_compacting() + .hidden_set() + .is_empty() } fn get_internal_entry_from_sealed_memtables( @@ -725,7 +720,7 @@ impl Tree { // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ let key_hash = crate::segment::filter::standard_bloom::Builder::get_hash(key); - for level in super_version.manifest.current_version().iter_levels() { + for level in super_version.version.iter_levels() { for run in level.iter() { // NOTE: Based on benchmarking, binary search is only worth it with ~4 segments if run.len() >= 4 { @@ -803,21 +798,21 @@ impl Tree { let bounds: (Bound, Bound) = (lo, hi); - let version_lock = self.super_version.write().expect("lock is poisoned"); + let super_version = self.super_version.write().expect("lock is poisoned"); let iter_state = { - let active = &version_lock.active_memtable; - let sealed = &version_lock.sealed_memtables; + let active = &super_version.active_memtable; + let sealed = &super_version.sealed_memtables; IterState { active: active.clone(), sealed: sealed.iter().map(|(_, mt)| mt.clone()).collect(), ephemeral, - version: version_lock.manifest.current_version().clone(), + version: super_version.version.clone(), } }; - TreeIter::create_range(iter_state, bounds, seqno, &version_lock.manifest) + TreeIter::create_range(iter_state, bounds, seqno, &super_version.version) } #[doc(hidden)] @@ -887,7 +882,7 @@ impl Tree { #[cfg(feature = "metrics")] let metrics = Arc::new(Metrics::default()); - let levels = Self::recover_levels( + let version = Self::recover_levels( &config.path, tree_id, &config.cache, @@ -896,7 +891,13 @@ impl Tree { &metrics, )?; - let highest_segment_id = levels.iter().map(Segment::id).max().unwrap_or_default(); + let highest_segment_id = version + .iter_segments() + .map(Segment::id) + .max() + .unwrap_or_default(); + + let path = config.path.clone(); 
let inner = TreeInner { id: tree_id, @@ -905,12 +906,12 @@ impl Tree { super_version: Arc::new(RwLock::new(SuperVersion { active_memtable: Arc::default(), sealed_memtables: Arc::default(), - manifest: levels, + version, })), stop_signal: StopSignal::default(), config, major_compaction_lock: RwLock::default(), - compaction_lock: Arc::default(), + compaction_state: Arc::new(Mutex::new(CompactionState::new(path))), #[cfg(feature = "metrics")] metrics, @@ -965,12 +966,12 @@ impl Tree { cache: &Arc, descriptor_table: &Arc, #[cfg(feature = "metrics")] metrics: &Arc, - ) -> crate::Result { + ) -> crate::Result { use crate::{file::fsync_directory, file::SEGMENTS_FOLDER, SegmentId}; let tree_path = tree_path.as_ref(); - let recovery = LevelManifest::recover_ids(tree_path)?; + let recovery = recover_ids(tree_path)?; let segment_id_map = { let mut result: crate::HashMap = @@ -1091,6 +1092,6 @@ impl Tree { &recovery.blob_file_ids, )?; - LevelManifest::recover(tree_path, recovery, &segments, &blob_files) + Version::from_recovery(recovery, &segments, &blob_files) } } From 9f143c4dedb569b4e5be99e08c6600787bd10879 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 13:57:26 +0200 Subject: [PATCH 567/613] adjust compaction worker --- src/compaction/drop_range.rs | 13 ++-- src/compaction/fifo.rs | 11 ++- src/compaction/flavour.rs | 27 +++++--- src/compaction/leveled.rs | 49 +++++++------ src/compaction/major.rs | 10 +-- src/compaction/movedown.rs | 8 +-- src/compaction/pulldown.rs | 6 +- src/compaction/tiered.rs | 14 ++-- src/compaction/worker.rs | 129 ++++++++++++++++------------------- 9 files changed, 134 insertions(+), 133 deletions(-) diff --git a/src/compaction/drop_range.rs b/src/compaction/drop_range.rs index 7374804a..d1c40f97 100644 --- a/src/compaction/drop_range.rs +++ b/src/compaction/drop_range.rs @@ -3,9 +3,9 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{ - config::Config, level_manifest::LevelManifest, slice::Slice, version::run::Ranged, KeyRange, -}; +use crate::compaction::state::CompactionState; +use crate::version::Version; +use crate::{config::Config, slice::Slice, version::run::Ranged, KeyRange}; use crate::{HashSet, Segment}; use std::ops::{Bound, RangeBounds}; @@ -73,9 +73,8 @@ impl CompactionStrategy for Strategy { "DropRangeCompaction" } - fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let segment_ids: HashSet<_> = levels - .current_version() + fn choose(&self, version: &Version, _: &Config, state: &CompactionState) -> Choice { + let segment_ids: HashSet<_> = version .iter_levels() .flat_map(|lvl| lvl.iter()) .flat_map(|run| { @@ -93,7 +92,7 @@ impl CompactionStrategy for Strategy { // But just as a fail-safe... 
let some_hidden = segment_ids .iter() - .any(|&id| levels.hidden_set().is_hidden(id)); + .any(|&id| state.hidden_set().is_hidden(id)); if some_hidden { Choice::DoNothing diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 0d6cc78b..de2d873f 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy}; -use crate::{config::Config, level_manifest::LevelManifest, HashSet}; +use crate::{compaction::state::CompactionState, config::Config, version::Version, HashSet}; /// FIFO-style compaction /// @@ -45,13 +45,18 @@ impl CompactionStrategy for Strategy { } // TODO: TTL - fn choose(&self, levels: &LevelManifest, _config: &Config) -> Choice { + fn choose(&self, version: &Version, _config: &Config, state: &CompactionState) -> Choice { // NOTE: We always have at least one level #[allow(clippy::expect_used)] - let first_level = levels.as_slice().first().expect("should have first level"); + let first_level = version.l0(); assert!(first_level.is_disjoint(), "L0 needs to be disjoint"); + assert!( + !version.level_is_busy(0, state.hidden_set()), + "FIFO compaction never compacts", + ); + let l0_size = first_level.size(); if l0_size > self.limit { diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index c3d9e4f4..c03f8d32 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -4,11 +4,12 @@ use std::time::Instant; use crate::blob_tree::handle::BlobIndirection; use crate::blob_tree::FragmentationMap; use crate::coding::{Decode, Encode}; +use crate::compaction::state::CompactionState; use crate::compaction::worker::Options; use crate::compaction::Input as CompactionPayload; use crate::file::SEGMENTS_FOLDER; -use crate::level_manifest::LevelManifest; use crate::segment::multi_writer::MultiWriter; +use crate::tree::inner::SuperVersion; use crate::version::Version; use crate::vlog::{BlobFileId, BlobFileMergeScanner, BlobFileWriter}; use crate::{BlobFile, HashSet, InternalValue, Segment}; @@ -81,9 +82,11 @@ pub(super) fn prepare_table_writer( pub(super) trait CompactionFlavour { fn write(&mut self, item: InternalValue) -> crate::Result<()>; + #[warn(clippy::too_many_arguments)] fn finish( self: Box, - levels: &mut LevelManifest, + super_version: &mut SuperVersion, + state: &mut CompactionState, opts: &Options, payload: &CompactionPayload, dst_lvl: usize, @@ -214,7 +217,8 @@ impl CompactionFlavour for RelocatingCompaction { fn finish( mut self: Box, - levels: &mut LevelManifest, + super_version: &mut SuperVersion, + state: &mut CompactionState, opts: &Options, payload: &CompactionPayload, dst_lvl: usize, @@ -232,14 +236,15 @@ impl CompactionFlavour for RelocatingCompaction { let mut blob_file_ids_to_drop = self.rewriting_blob_file_ids; - for blob_file in levels.current_version().value_log.values() { - if blob_file.is_dead(levels.current_version().gc_stats()) { + for blob_file in super_version.version.value_log.values() { + if blob_file.is_dead(super_version.version.gc_stats()) { blob_file_ids_to_drop.insert(blob_file.id()); self.rewriting_blob_files.push(blob_file.clone()); } } - levels.atomic_swap( + state.upgrade_version( + super_version, |current| { Ok(current.with_merge( &payload.segment_ids.iter().copied().collect::>(), @@ -333,7 +338,8 @@ impl CompactionFlavour for StandardCompaction { fn finish( mut self: Box, - levels: &mut LevelManifest, + super_version: &mut SuperVersion, + state: &mut CompactionState, opts: &Options, payload: 
&CompactionPayload, dst_lvl: usize, @@ -347,13 +353,14 @@ impl CompactionFlavour for StandardCompaction { let mut blob_files_to_drop = Vec::default(); - for blob_file in levels.current_version().value_log.values() { - if blob_file.is_dead(levels.current_version().gc_stats()) { + for blob_file in super_version.version.value_log.values() { + if blob_file.is_dead(super_version.version.gc_stats()) { blob_files_to_drop.push(blob_file.clone()); } } - levels.atomic_swap( + state.upgrade_version( + super_version, |current| { Ok(current.with_merge( &payload.segment_ids.iter().copied().collect::>(), diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 8ff6e005..d40f428d 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -4,11 +4,11 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ + compaction::state::{hidden_set::HiddenSet, CompactionState}, config::Config, - level_manifest::{hidden_set::HiddenSet, LevelManifest}, segment::Segment, slice_windows::{GrowingWindowsExt, ShrinkingWindowsExt}, - version::{run::Ranged, Run}, + version::{run::Ranged, Run, Version}, HashSet, KeyRange, SegmentId, }; @@ -189,27 +189,25 @@ impl CompactionStrategy for Strategy { } #[allow(clippy::too_many_lines)] - fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - assert!(levels.as_slice().len() == 7, "should have exactly 7 levels"); + fn choose(&self, version: &Version, _: &Config, state: &CompactionState) -> Choice { + assert!(version.level_count() == 7, "should have exactly 7 levels"); // Find the level that corresponds to L1 #[allow(clippy::map_unwrap_or)] - let mut canonical_l1_idx = levels - .as_slice() - .iter() + let mut canonical_l1_idx = version + .iter_levels() .enumerate() .skip(1) .find(|(_, lvl)| !lvl.is_empty()) .map(|(idx, _)| idx) - .unwrap_or_else(|| usize::from(levels.level_count() - 1)); + .unwrap_or_else(|| version.level_count() - 1); // Number of levels we have to shift to get from the actual level idx to the canonical let mut level_shift = canonical_l1_idx - 1; - if canonical_l1_idx > 1 && levels.as_slice().iter().skip(1).any(|lvl| !lvl.is_empty()) { - let need_new_l1 = levels - .as_slice() - .iter() + if canonical_l1_idx > 1 && version.iter_levels().skip(1).any(|lvl| !lvl.is_empty()) { + let need_new_l1 = version + .iter_levels() .enumerate() .skip(1) .filter(|(_, lvl)| !lvl.is_empty()) @@ -219,7 +217,7 @@ impl CompactionStrategy for Strategy { .flat_map(|x| x.iter()) // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .filter(|x| !state.hidden_set().is_hidden(x.id())) .map(Segment::file_size) .sum::(); @@ -243,7 +241,7 @@ impl CompactionStrategy for Strategy { // NOTE: We always have at least one level #[allow(clippy::expect_used)] - let first_level = levels.as_slice().first().expect("first level should exist"); + let first_level = version.l0(); // TODO: use run_count instead? 
but be careful because of version free list GC thingy if first_level.segment_count() >= usize::from(self.l0_threshold) { @@ -252,7 +250,7 @@ impl CompactionStrategy for Strategy { } // Score L1+ - for (idx, level) in levels.as_slice().iter().enumerate().skip(1) { + for (idx, level) in version.iter_levels().enumerate().skip(1) { if level.is_empty() { continue; } @@ -262,7 +260,7 @@ impl CompactionStrategy for Strategy { .flat_map(|x| x.iter()) // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .filter(|x| !state.hidden_set().is_hidden(x.id())) .map(Segment::file_size) .sum::(); @@ -277,9 +275,8 @@ impl CompactionStrategy for Strategy { ); // NOTE: Force a trivial move - if levels - .as_slice() - .get(idx + 1) + if version + .level(idx + 1) .is_some_and(|next_level| next_level.is_empty()) { scores[idx] = (99.99, 999); @@ -313,15 +310,17 @@ impl CompactionStrategy for Strategy { // We choose L0->L1 compaction if level_idx_with_highest_score == 0 { - let Some(first_level) = levels.current_version().level(0) else { + let Some(first_level) = version.level(0) else { return Choice::DoNothing; }; - if levels.level_is_busy(0) || levels.level_is_busy(canonical_l1_idx) { + if version.level_is_busy(0, state.hidden_set()) + || version.level_is_busy(canonical_l1_idx, state.hidden_set()) + { return Choice::DoNothing; } - let Some(target_level) = &levels.current_version().level(canonical_l1_idx) else { + let Some(target_level) = &version.level(canonical_l1_idx) else { return Choice::DoNothing; }; @@ -365,11 +364,11 @@ impl CompactionStrategy for Strategy { let next_level_index = curr_level_index + 1; - let Some(level) = levels.current_version().level(level_idx_with_highest_score) else { + let Some(level) = version.level(level_idx_with_highest_score) else { return Choice::DoNothing; }; - let Some(next_level) = levels.current_version().level(next_level_index as usize) else { + let Some(next_level) = version.level(next_level_index as usize) else { return Choice::DoNothing; }; @@ -379,7 +378,7 @@ impl CompactionStrategy for Strategy { let Some((segment_ids, can_trivial_move)) = pick_minimal_compaction( level.first_run().expect("should have exactly one run"), next_level.first_run().map(std::ops::Deref::deref), - levels.hidden_set(), + state.hidden_set(), overshoot_bytes, u64::from(self.target_size), ) else { diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 720728d5..580900cc 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -3,7 +3,9 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{config::Config, level_manifest::LevelManifest, segment::Segment, HashSet}; +use crate::{ + compaction::state::CompactionState, config::Config, segment::Segment, version::Version, HashSet, +}; /// Compacts all segments into the last level pub struct Strategy { @@ -37,15 +39,15 @@ impl CompactionStrategy for Strategy { "MajorCompaction" } - fn choose(&self, levels: &LevelManifest, cfg: &Config) -> Choice { - let segment_ids: HashSet<_> = levels.iter().map(Segment::id).collect(); + fn choose(&self, version: &Version, cfg: &Config, state: &CompactionState) -> Choice { + let segment_ids: HashSet<_> = version.iter_segments().map(Segment::id).collect(); // NOTE: This should generally not occur because of the // tree-level major compaction lock // But just as a fail-safe... 
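This fail-safe (like the analogous one in `drop_range.rs` above) consults the hidden set, which this series moves out of the level manifest into `CompactionState`. A minimal sketch of the idea, assuming segment IDs are plain `u64`s; the method names mirror the diffs, while the bodies are illustrative:

```rust
use std::collections::HashSet;

// Segments picked by an in-flight compaction are "hidden" so that no
// concurrent compaction can pick them again.
#[derive(Default)]
struct HiddenSet(HashSet<u64>);

impl HiddenSet {
    fn hide(&mut self, ids: impl IntoIterator<Item = u64>) {
        self.0.extend(ids);
    }

    fn show(&mut self, ids: impl IntoIterator<Item = u64>) {
        for id in ids {
            self.0.remove(&id);
        }
    }

    fn is_hidden(&self, id: u64) -> bool {
        self.0.contains(&id)
    }

    // Fail-safe used by the strategies and the worker: refuse a compaction
    // that touches any segment that is already being compacted.
    fn should_decline_compaction(&self, ids: impl IntoIterator<Item = u64>) -> bool {
        ids.into_iter().any(|id| self.is_hidden(id))
    }
}
```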
let some_hidden = segment_ids .iter() - .any(|&id| levels.hidden_set().is_hidden(id)); + .any(|&id| state.hidden_set().is_hidden(id)); if some_hidden { Choice::DoNothing diff --git a/src/compaction/movedown.rs b/src/compaction/movedown.rs index 5a78b68c..e7f2ec76 100644 --- a/src/compaction/movedown.rs +++ b/src/compaction/movedown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, segment::Segment, Config}; +use crate::{compaction::state::CompactionState, segment::Segment, version::Version, Config}; /// Moves down a level into the destination level. pub struct Strategy(pub u8, pub u8); @@ -14,12 +14,12 @@ impl CompactionStrategy for Strategy { } #[allow(clippy::expect_used)] - fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - if levels.level_is_busy(usize::from(self.0)) { + fn choose(&self, version: &Version, _: &Config, state: &CompactionState) -> Choice { + if version.level_is_busy(usize::from(self.0), state.hidden_set()) { return Choice::DoNothing; } - let Some(level) = levels.as_slice().get(self.0 as usize) else { + let Some(level) = version.level(self.0.into()) else { return Choice::DoNothing; }; diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index d88ffd2a..af838be7 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -2,8 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; +use super::{Choice, CompactionStrategy}; +use crate::{compaction::state::CompactionState, version::Version, Config}; /// Pulls down and merges a level into the destination level. 
/// @@ -16,7 +16,7 @@ impl CompactionStrategy for Strategy { } #[allow(clippy::expect_used)] - fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { + fn choose(&self, version: &Version, _: &Config, state: &CompactionState) -> Choice { todo!() } } diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 638f78b4..0e9cb1f7 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -2,12 +2,12 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{level_manifest::LevelManifest, segment::Segment, Config, HashSet}; +use super::{Choice, CompactionStrategy}; +use crate::{compaction::state::CompactionState, version::Version, Config}; -fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, base_size: u32) -> usize { - (ratio as usize).pow(u32::from(level_idx + 1)) * (base_size as usize) -} +// fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, base_size: u32) -> usize { +// (ratio as usize).pow(u32::from(level_idx + 1)) * (base_size as usize) +// } /// Size-tiered compaction strategy (STCS) /// @@ -54,8 +54,8 @@ impl CompactionStrategy for Strategy { "TieredStrategy" } - fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { - todo!() + fn choose(&self, version: &Version, _config: &Config, state: &CompactionState) -> Choice { + unimplemented!() } } /* diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index fd8cf00c..576e295a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -7,15 +7,16 @@ use crate::{ blob_tree::FragmentationMap, compaction::{ flavour::{RelocatingCompaction, StandardCompaction}, + state::CompactionState, stream::CompactionStream, Choice, }, file::BLOBS_FOLDER, - level_manifest::LevelManifest, merge::Merger, run_scanner::RunScanner, stop_signal::StopSignal, tree::inner::{SuperVersion, TreeId}, + version::Version, vlog::{BlobFileMergeScanner, BlobFileScanner, BlobFileWriter}, BlobFile, Config, HashSet, InternalValue, SegmentId, SeqNo, SequenceNumberCounter, }; @@ -52,10 +53,7 @@ pub struct Options { /// Evicts items that are older than this seqno (MVCC GC). pub eviction_seqno: u64, - /// Compaction to lock out other compactions - /// - /// This is not the same lock as the major compaction lock in the `Tree`. - pub compaction_lock: Arc>, + pub compaction_state: Arc>, #[cfg(feature = "metrics")] pub metrics: Arc, @@ -72,7 +70,8 @@ impl Options { stop_signal: tree.stop_signal.clone(), strategy, eviction_seqno: 0, - compaction_lock: tree.compaction_lock.clone(), + + compaction_state: tree.compaction_state.clone(), #[cfg(feature = "metrics")] metrics: tree.metrics.clone(), @@ -84,25 +83,27 @@ impl Options { /// /// This will block until the compactor is fully finished. 
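`do_compaction` now acquires the compaction-state mutex before the super-version lock, and drops both before the CPU-heavy merge work begins. A condensed sketch of that locking protocol, with the actual work elided:

```rust
use std::sync::{Mutex, RwLock};

struct CompactionState;
struct SuperVersion;

// Lock order: compaction state first, super version second. Both guards
// are released before the expensive merge loop, and re-acquired in the
// same order afterwards to publish the result.
fn do_compaction(
    state: &Mutex<CompactionState>,
    sv: &RwLock<SuperVersion>,
) -> Result<(), Box<dyn std::error::Error>> {
    let state_guard = state.lock().expect("lock is poisoned");
    let sv_guard = sv.read().expect("lock is poisoned");

    // ... consult the strategy with (&sv_guard, &state_guard) ...

    drop(sv_guard);
    drop(state_guard); // unlock before doing the actual compaction work

    // ... merge loop, then re-lock to swap in the new version ...

    Ok(())
}
```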
pub fn do_compaction(opts: &Options) -> crate::Result<()> { - let lock = opts.compaction_lock.lock().expect("lock is poisoned"); + let compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); - let version = opts.super_version.read().expect("lock is poisoned"); + let super_version = opts.super_version.read().expect("lock is poisoned"); let start = Instant::now(); log::trace!( "Consulting compaction strategy {:?}", opts.strategy.get_name(), ); - let choice = opts.strategy.choose(&version.manifest, &opts.config); + let choice = opts + .strategy + .choose(&super_version.version, &opts.config, &compaction_state); log::debug!("Compaction choice: {choice:?} in {:?}", start.elapsed()); match choice { - Choice::Merge(payload) => merge_segments(lock, version, opts, &payload), - Choice::Move(payload) => move_segments(lock, version, opts, &payload), + Choice::Merge(payload) => merge_segments(compaction_state, super_version, opts, &payload), + Choice::Move(payload) => move_segments(compaction_state, super_version, opts, &payload), Choice::Drop(payload) => drop_segments( - lock, - version, + compaction_state, + super_version, opts, &payload.into_iter().collect::>(), ), @@ -114,14 +115,14 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { } fn create_compaction_stream<'a>( - levels: &LevelManifest, + version: &Version, to_compact: &[SegmentId], eviction_seqno: SeqNo, ) -> crate::Result>>>> { let mut readers: Vec> = vec![]; let mut found = 0; - for level in levels.current_version().iter_levels() { + for level in version.iter_levels() { if level.is_empty() { continue; } @@ -175,7 +176,7 @@ fn create_compaction_stream<'a>( } fn move_segments( - _compaction_lock: MutexGuard<'_, ()>, + mut compaction_state: MutexGuard<'_, CompactionState>, super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, payload: &CompactionPayload, @@ -185,8 +186,8 @@ fn move_segments( let mut super_version = opts.super_version.write().expect("lock is poisoned"); // Fail-safe for buggy compaction strategies - if super_version - .manifest + if compaction_state + .hidden_set() .should_decline_compaction(payload.segment_ids.iter().copied()) { log::warn!( @@ -198,12 +199,13 @@ fn move_segments( let segment_ids = payload.segment_ids.iter().copied().collect::>(); - super_version.manifest.atomic_swap( + compaction_state.upgrade_version( + &mut super_version, |current| Ok(current.with_moved(&segment_ids, payload.dest_level as usize)), opts.eviction_seqno, )?; - if let Err(e) = super_version.manifest.maintenance(opts.eviction_seqno) { + if let Err(e) = compaction_state.maintenance(opts.eviction_seqno) { log::error!("Manifest maintenance failed: {e:?}"); return Err(e); } @@ -213,7 +215,7 @@ fn move_segments( #[allow(clippy::too_many_lines)] fn merge_segments( - compaction_lock: MutexGuard<'_, ()>, + mut compaction_state: MutexGuard<'_, CompactionState>, super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, payload: &CompactionPayload, @@ -224,8 +226,8 @@ fn merge_segments( } // Fail-safe for buggy compaction strategies - if super_version - .manifest + if compaction_state + .hidden_set() .should_decline_compaction(payload.segment_ids.iter().copied()) { log::warn!( @@ -238,13 +240,7 @@ fn merge_segments( let Some(segments) = payload .segment_ids .iter() - .map(|&id| { - super_version - .manifest - .current_version() - .get_segment(id) - .cloned() - }) + .map(|&id| super_version.version.get_segment(id).cloned()) .collect::>>() else { log::warn!( @@ -257,7 +253,7 @@ fn merge_segments( let mut 
blob_frag_map = FragmentationMap::default(); let Some(mut merge_iter) = create_compaction_stream( - &super_version.manifest, + &super_version.version, &payload.segment_ids.iter().copied().collect::>(), opts.eviction_seqno, )? @@ -275,7 +271,7 @@ fn merge_segments( // That way we don't resurrect data beneath the tombstone let is_last_level = payload.dest_level == last_level; - let current_version = super_version.manifest.current_version(); + let current_version = &super_version.version; let table_writer = super::flavour::prepare_table_writer(current_version, opts, payload)?; @@ -393,24 +389,22 @@ fn merge_segments( drop(super_version); { - opts.super_version - .write() - .expect("lock is poisoned") - .manifest - .hide_segments(payload.segment_ids.iter().copied()); + compaction_state + .hidden_set_mut() + .hide(payload.segment_ids.iter().copied()); } // IMPORTANT: Unlock exclusive compaction lock as we are now doing the actual (CPU-intensive) compaction - drop(compaction_lock); + drop(compaction_state); for (idx, item) in merge_iter.enumerate() { let item = item.inspect_err(|_| { // IMPORTANT: We need to show tables again on error - let mut version_lock = opts.super_version.write().expect("lock is poisoned"); + let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); - version_lock - .manifest - .show_segments(payload.segment_ids.iter().copied()); + compaction_state + .hidden_set_mut() + .show(payload.segment_ids.iter().copied()); })?; // IMPORTANT: We can only drop tombstones when writing into last level @@ -420,11 +414,11 @@ fn merge_segments( compactor.write(item).inspect_err(|_| { // IMPORTANT: We need to show tables again on error - let mut version_lock = opts.super_version.write().expect("lock is poisoned"); + let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); - version_lock - .manifest - .show_segments(payload.segment_ids.iter().copied()); + compaction_state + .hidden_set_mut() + .show(payload.segment_ids.iter().copied()); })?; if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { @@ -433,7 +427,7 @@ fn merge_segments( } } - let compaction_lock = opts.compaction_lock.lock().expect("lock is poisoned"); + let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); log::trace!("Acquiring super version write lock"); let mut super_version = opts.super_version.write().expect("lock is poisoned"); @@ -441,31 +435,31 @@ fn merge_segments( compactor .finish( - &mut super_version.manifest, + &mut super_version, + &mut compaction_state, opts, payload, dst_lvl, blob_frag_map, ) .inspect_err(|_| { - super_version - .manifest - .show_segments(payload.segment_ids.iter().copied()); + compaction_state + .hidden_set_mut() + .show(payload.segment_ids.iter().copied()); })?; - super_version - .manifest - .show_segments(payload.segment_ids.iter().copied()); + compaction_state + .hidden_set_mut() + .show(payload.segment_ids.iter().copied()); - super_version - .manifest + compaction_state .maintenance(opts.eviction_seqno) .inspect_err(|e| { log::error!("Manifest maintenance failed: {e:?}"); })?; drop(super_version); - drop(compaction_lock); + drop(compaction_state); log::trace!("Compaction successful"); @@ -473,7 +467,7 @@ fn merge_segments( } fn drop_segments( - compaction_lock: MutexGuard<'_, ()>, + mut compaction_state: MutexGuard<'_, CompactionState>, super_version: RwLockReadGuard<'_, SuperVersion>, opts: &Options, ids_to_drop: &[SegmentId], @@ -483,8 +477,8 @@ fn drop_segments( let mut super_version = 
opts.super_version.write().expect("lock is poisoned"); // Fail-safe for buggy compaction strategies - if super_version - .manifest + if compaction_state + .hidden_set() .should_decline_compaction(ids_to_drop.iter().copied()) { log::warn!( @@ -496,13 +490,7 @@ fn drop_segments( let Some(segments) = ids_to_drop .iter() - .map(|&id| { - super_version - .manifest - .current_version() - .get_segment(id) - .cloned() - }) + .map(|&id| super_version.version.get_segment(id).cloned()) .collect::>>() else { log::warn!( @@ -514,7 +502,8 @@ fn drop_segments( // IMPORTANT: Write the manifest with the removed segments first // Otherwise the segment files are deleted, but are still referenced! - super_version.manifest.atomic_swap( + compaction_state.upgrade_version( + &mut super_version, |current| current.with_dropped(ids_to_drop), opts.eviction_seqno, // TODO: make naming in code base eviction_seqno vs watermark vs threshold consistent )?; @@ -529,13 +518,13 @@ fn drop_segments( // TODO: fwiw also add all dead blob files // TODO: look if any blob files can be trivially deleted as well - if let Err(e) = super_version.manifest.maintenance(opts.eviction_seqno) { + if let Err(e) = compaction_state.maintenance(opts.eviction_seqno) { log::error!("Manifest maintenance failed: {e:?}"); return Err(e); } drop(super_version); - drop(compaction_lock); + drop(compaction_state); log::trace!("Dropped {} segments", ids_to_drop.len()); From 37c53985e5a9406abcc835a3d0339e9d147aa3b6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 16:32:46 +0200 Subject: [PATCH 568/613] refactor: closes #87 --- src/compaction/worker.rs | 63 +++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 576e295a..f236cad8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -213,6 +213,23 @@ fn move_segments( Ok(()) } +fn hidden_guard( + payload: &CompactionPayload, + opts: &Options, + f: impl FnOnce() -> crate::Result<()>, +) -> crate::Result<()> { + f().inspect_err(|e| { + log::error!("Compaction failed: {e:?}"); + + // IMPORTANT: We need to show tables again on error + let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); + + compaction_state + .hidden_set_mut() + .show(payload.segment_ids.iter().copied()); + }) +} + #[allow(clippy::too_many_lines)] fn merge_segments( mut compaction_state: MutexGuard<'_, CompactionState>, @@ -397,35 +414,25 @@ fn merge_segments( // IMPORTANT: Unlock exclusive compaction lock as we are now doing the actual (CPU-intensive) compaction drop(compaction_state); - for (idx, item) in merge_iter.enumerate() { - let item = item.inspect_err(|_| { - // IMPORTANT: We need to show tables again on error - let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); - - compaction_state - .hidden_set_mut() - .show(payload.segment_ids.iter().copied()); - })?; + hidden_guard(payload, opts, || { + for (idx, item) in merge_iter.enumerate() { + let item = item?; - // IMPORTANT: We can only drop tombstones when writing into last level - if is_last_level && item.is_tombstone() { - continue; - } - - compactor.write(item).inspect_err(|_| { - // IMPORTANT: We need to show tables again on error - let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); + // IMPORTANT: We can only drop tombstones when writing into last level + if is_last_level && item.is_tombstone() { + continue; + } - compaction_state - .hidden_set_mut() 
- .show(payload.segment_ids.iter().copied()); - })?; + compactor.write(item)?; - if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { - log::debug!("Stopping amidst compaction because of stop signal"); - return Ok(()); + if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { + log::debug!("Stopping amidst compaction because of stop signal"); + return Ok(()); + } } - } + + Ok(()) + })?; let mut compaction_state = opts.compaction_state.lock().expect("lock is poisoned"); @@ -442,7 +449,11 @@ fn merge_segments( dst_lvl, blob_frag_map, ) - .inspect_err(|_| { + .inspect_err(|e| { + // NOTE: We cannot use hidden_guard here because we already locked the compaction state + + log::error!("Compaction failed: {e:?}"); + compaction_state .hidden_set_mut() .show(payload.segment_ids.iter().copied()); From 05247dccaf43f89c37647098fc1f0d88c28939ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 16:33:00 +0200 Subject: [PATCH 569/613] refactor --- src/config/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 7d9d05c3..bb1f477c 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -17,7 +17,8 @@ pub use pinning::PinningPolicy; pub use restart_interval::RestartIntervalPolicy; use crate::{ - path::absolute_path, AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, Tree, + path::absolute_path, version::DEFAULT_LEVEL_COUNT, AnyTree, BlobTree, Cache, CompressionType, + DescriptorTable, Tree, }; use std::{ path::{Path, PathBuf}, @@ -222,7 +223,7 @@ impl Default for Config { data_block_restart_interval_policy: RestartIntervalPolicy::all(16), index_block_restart_interval_policy: RestartIntervalPolicy::all(1), - level_count: 7, + level_count: DEFAULT_LEVEL_COUNT, data_block_size_policy: BlockSizePolicy::default(), index_block_size_policy: BlockSizePolicy::default(), From f0f055a940b9c72de8e45cd9c7b1a192117c362e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 11 Oct 2025 18:37:11 +0200 Subject: [PATCH 570/613] update recovery cleanup --- src/segment/writer/mod.rs | 2 - src/tree/mod.rs | 112 +++++++++++------- src/version/mod.rs | 3 +- src/vlog/mod.rs | 5 +- ...ange_gc_stats.rs => blob_nuke_gc_stats.rs} | 2 +- tests/tree_recovery_versions.rs | 36 ++++++ 6 files changed, 108 insertions(+), 52 deletions(-) rename tests/{blob_drop_range_gc_stats.rs => blob_nuke_gc_stats.rs} (95%) create mode 100644 tests/tree_recovery_versions.rs diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 95bc9413..84fb0843 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -452,8 +452,6 @@ impl Writer { assert!(is_sorted, "meta items not sorted correctly"); } - log::trace!("Encoding metadata block: {meta_items:#?}"); - self.block_buffer.clear(); // TODO: no binary index diff --git a/src/tree/mod.rs b/src/tree/mod.rs index eb374341..43a48dd8 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -20,7 +20,7 @@ use crate::{ slice::Slice, tree::inner::SuperVersion, value::InternalValue, - version::{recovery::recover_ids, Version}, + version::{recovery::recover_ids, Version, VersionId}, vlog::BlobFile, AbstractTree, Cache, DescriptorTable, KvPair, SegmentId, SeqNo, SequenceNumberCounter, TreeType, UserKey, UserValue, ValueType, @@ -959,7 +959,7 @@ impl Tree { Ok(Self(Arc::new(inner))) } - /// Recovers the level manifest, loading all segments from disk. + /// Recovers the level manifest, loading all tables from disk. 
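The recovery rework in the patch below follows a collect-then-delete discipline: table files that the manifest does not reference are only remembered during the directory scan and deleted after the newest version has been rebuilt, so a failed recovery never removes files it might still need. A condensed sketch of that ordering; the predicate and error handling are simplified stand-ins:

```rust
use std::path::{Path, PathBuf};

// Unreferenced files are deferred during the scan and removed only once
// recovery of the latest version has definitely succeeded.
fn recover(dir: &Path, is_referenced: impl Fn(&Path) -> bool) -> std::io::Result<()> {
    let mut orphaned: Vec<PathBuf> = Vec::new();

    for dirent in std::fs::read_dir(dir)? {
        let path = dirent?.path();
        if !is_referenced(&path) {
            orphaned.push(path); // defer deletion
        }
    }

    // ... rebuild the Version from the referenced files; bail with `?` on error ...

    for path in orphaned {
        std::fs::remove_file(path)?;
    }

    Ok(())
}
```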
fn recover_levels>( tree_path: P, tree_id: TreeId, @@ -973,17 +973,17 @@ impl Tree { let recovery = recover_ids(tree_path)?; - let segment_id_map = { + let table_id_map = { let mut result: crate::HashMap = crate::HashMap::default(); - for (level_idx, segment_ids) in recovery.segment_ids.iter().enumerate() { - for run in segment_ids { - for segment_id in run { + for (level_idx, table_ids) in recovery.segment_ids.iter().enumerate() { + for run in table_ids { + for table_id in run { // NOTE: We know there are always less than 256 levels #[allow(clippy::expect_used)] result.insert( - *segment_id, + *table_id, level_idx .try_into() .expect("there are less than 256 levels"), @@ -995,12 +995,9 @@ impl Tree { result }; - let cnt = segment_id_map.len(); + let cnt = table_id_map.len(); - log::debug!( - "Recovering {cnt} disk segments from {}", - tree_path.display(), - ); + log::debug!("Recovering {cnt} tables from {}", tree_path.display()); let progress_mod = match cnt { _ if cnt <= 20 => 1, @@ -1008,19 +1005,18 @@ impl Tree { _ => 100, }; - let mut segments = vec![]; + let mut tables = vec![]; - let segment_base_folder = tree_path.join(SEGMENTS_FOLDER); + let table_base_folder = tree_path.join(SEGMENTS_FOLDER); - if !segment_base_folder.try_exists()? { - std::fs::create_dir_all(&segment_base_folder)?; - fsync_directory(&segment_base_folder)?; + if !table_base_folder.try_exists()? { + std::fs::create_dir_all(&table_base_folder)?; + fsync_directory(&table_base_folder)?; } - // TODO: 3.0.0 only remove unreferenced segments once we have successfully recovered the most recent version - // TODO: same for blob files + let mut orphaned_tables = vec![]; - for (idx, dirent) in std::fs::read_dir(&segment_base_folder)?.enumerate() { + for (idx, dirent) in std::fs::read_dir(&table_base_folder)?.enumerate() { let dirent = dirent?; let file_name = dirent.file_name(); @@ -1034,24 +1030,24 @@ impl Tree { continue; } - let segment_file_name = file_name.to_str().ok_or_else(|| { - log::error!("invalid segment file name {}", file_name.display()); + let table_file_name = file_name.to_str().ok_or_else(|| { + log::error!("invalid table file name {}", file_name.display()); crate::Error::Unrecoverable })?; - let segment_file_path = dirent.path(); - assert!(!segment_file_path.is_dir()); + let table_file_path = dirent.path(); + assert!(!table_file_path.is_dir()); - log::debug!("Recovering segment from {}", segment_file_path.display()); + log::debug!("Recovering table from {}", table_file_path.display()); - let segment_id = segment_file_name.parse::().map_err(|e| { - log::error!("invalid segment file name {segment_file_name:?}: {e:?}"); + let table_id = table_file_name.parse::().map_err(|e| { + log::error!("invalid table file name {table_file_name:?}: {e:?}"); crate::Error::Unrecoverable })?; - if let Some(&level_idx) = segment_id_map.get(&segment_id) { - let segment = Segment::recover( - segment_file_path, + if let Some(&level_idx) = table_id_map.get(&table_id) { + let table = Segment::recover( + table_file_path, tree_id, cache.clone(), descriptor_table.clone(), @@ -1061,37 +1057,67 @@ impl Tree { metrics.clone(), )?; - log::debug!("Recovered segment from {:?}", segment.path); + log::debug!("Recovered table from {:?}", table.path); - segments.push(segment); + tables.push(table); if idx % progress_mod == 0 { - log::debug!("Recovered {idx}/{cnt} disk segments"); + log::debug!("Recovered {idx}/{cnt} tables"); } } else { - log::debug!( - "Deleting unfinished segment: {}", - segment_file_path.display(), - ); - 
std::fs::remove_file(&segment_file_path)?; + orphaned_tables.push(table_file_path); } } - if segments.len() < cnt { + if tables.len() < cnt { log::error!( - "Recovered less segments than expected: {:?}", - segment_id_map.keys(), + "Recovered less tables than expected: {:?}", + table_id_map.keys(), ); return Err(crate::Error::Unrecoverable); } - log::debug!("Successfully recovered {} segments", segments.len()); + log::debug!("Successfully recovered {} tables", tables.len()); let blob_files = crate::vlog::recover_blob_files( &tree_path.join(BLOBS_FOLDER), &recovery.blob_file_ids, )?; - Version::from_recovery(recovery, &segments, &blob_files) + let version = Version::from_recovery(recovery, &tables, &blob_files)?; + + // NOTE: Cleanup old versions + // But only after we definitely recovered the latest version + Self::cleanup_orphaned_version(tree_path, version.id())?; + + for table_path in orphaned_tables { + log::debug!("Deleting orphaned table {}", table_path.display()); + std::fs::remove_file(&table_path)?; + } + + // TODO: remove orphaned blob files as well -> unit test + + Ok(version) + } + + fn cleanup_orphaned_version(path: &Path, latest_version_id: VersionId) -> crate::Result<()> { + let version_str = format!("v{latest_version_id}"); + + for file in std::fs::read_dir(path)? { + let dirent = file?; + + if dirent.file_type()?.is_dir() { + continue; + } + + let name = dirent.file_name(); + + if name.to_string_lossy().starts_with('v') && *name != *version_str { + log::trace!("Cleanup orphaned version {}", name.display()); + std::fs::remove_file(dirent.path())?; + } + } + + Ok(()) } } diff --git a/src/version/mod.rs b/src/version/mod.rs index 2c27b92c..583f67a5 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) mod optimize; -pub(crate) mod recovery; +pub mod recovery; pub mod run; pub use run::Run; @@ -18,7 +18,6 @@ use crate::{ }; use optimize::optimize_runs; use run::Ranged; -use std::path::PathBuf; use std::{collections::BTreeMap, ops::Deref, sync::Arc}; pub const DEFAULT_LEVEL_COUNT: u8 = 7; diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 1ba77c9b..ad39f168 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -30,10 +30,7 @@ pub fn recover_blob_files(folder: &Path, ids: &[BlobFileId]) -> crate::Result 100, }; - log::debug!("Recovering {cnt} blob files from {:?}", folder.display(),); - - // TODO: - // Self::remove_unfinished_blob_files(&folder, &ids)?; + log::debug!("Recovering {cnt} blob files from {:?}", folder.display()); let mut blob_files = Vec::with_capacity(ids.len()); diff --git a/tests/blob_drop_range_gc_stats.rs b/tests/blob_nuke_gc_stats.rs similarity index 95% rename from tests/blob_drop_range_gc_stats.rs rename to tests/blob_nuke_gc_stats.rs index fba6a2ab..1a90c118 100644 --- a/tests/blob_drop_range_gc_stats.rs +++ b/tests/blob_nuke_gc_stats.rs @@ -2,7 +2,7 @@ use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, SeqNo}; use test_log::test; #[test] -fn blob_tree_drop_range_gc_stats() -> lsm_tree::Result<()> { +fn blob_tree_nuke_gc_stats() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); diff --git a/tests/tree_recovery_versions.rs b/tests/tree_recovery_versions.rs new file mode 100644 index 00000000..0fc742a4 --- /dev/null +++ b/tests/tree_recovery_versions.rs @@ -0,0 +1,36 @@ +use lsm_tree::{AbstractTree, Config}; +use test_log::test; + +#[test] +fn tree_recovery_version_free_list() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + 
let path = folder.path(); + + { + let tree = Config::new(path).open()?; + assert!(path.join("v0").try_exists()?); + + tree.insert("a", "a", 0); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.version_free_list_len()); + assert!(path.join("v1").try_exists()?); + + tree.insert("b", "b", 0); + tree.flush_active_memtable(0)?; + assert_eq!(2, tree.version_free_list_len()); + assert!(path.join("v2").try_exists()?); + } + + { + let tree = Config::new(&folder).open()?; + assert_eq!(0, tree.version_free_list_len()); + assert!(!path.join("v0").try_exists()?); + assert!(!path.join("v1").try_exists()?); + assert!(path.join("v2").try_exists()?); + + assert!(tree.contains_key("a", 1)?); + assert!(tree.contains_key("b", 1)?); + } + + Ok(()) +} From 33ecaaf7da14d4f43a137a7c10cb2010b1200a3f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 14:57:12 +0200 Subject: [PATCH 571/613] wip --- src/segment/block/header.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index dce3ff9f..0d056d51 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -2,7 +2,6 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::offset::BlockOffset; use super::Checksum; use crate::coding::{Decode, DecodeError, Encode, EncodeError}; use crate::file::MAGIC_BYTES; @@ -54,9 +53,6 @@ pub struct Header { /// Checksum value to verify integrity of data pub checksum: Checksum, - /// File offset of previous block - only used for data blocks - pub previous_block_offset: BlockOffset, // TODO: 3.0.0 remove? - /// On-disk size of data segment pub data_length: u32, @@ -72,8 +68,6 @@ impl Header { + std::mem::size_of::() // Checksum + std::mem::size_of::() - // Backlink - + std::mem::size_of::() // On-disk size + std::mem::size_of::() // Uncompressed data length @@ -92,9 +86,6 @@ impl Encode for Header { // Write checksum writer.write_u128::(*self.checksum)?; - // Write prev offset - writer.write_u64::(*self.previous_block_offset)?; - // Write on-disk size length writer.write_u32::(self.data_length)?; @@ -122,9 +113,6 @@ impl Decode for Header { // Read checksum let checksum = reader.read_u128::()?; - // Read prev offset - let previous_block_offset = reader.read_u64::()?; - // Read data length let data_length = reader.read_u32::()?; @@ -134,7 +122,6 @@ impl Decode for Header { Ok(Self { block_type, checksum: Checksum::from_raw(checksum), - previous_block_offset: BlockOffset(previous_block_offset), data_length, uncompressed_length, }) @@ -152,7 +139,6 @@ mod tests { block_type: BlockType::Data, checksum: Checksum::from_raw(5), data_length: 252_356, - previous_block_offset: BlockOffset(35), uncompressed_length: 124_124_124, }; From 4263e09113841f2d26f06bc33cea12cdfae4305d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 14:57:24 +0200 Subject: [PATCH 572/613] add value type sanity check --- Cargo.toml | 1 + src/segment/block/trailer.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 4629144e..1cddfaac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] } fs_extra = "1.3.0" nanoid = "0.4.0" rand = "0.9.2" +strum = { version = "0.27.2", features = ["derive"] } test-log = "0.2.18" [package.metadata.cargo-all-features] diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 74fb7c5d..4f128062 
100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -139,3 +139,18 @@ impl<'a> Trailer<'a> { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::segment::block::TRAILER_START_MARKER; + use strum::IntoEnumIterator; + use test_log::test; + + #[test] + fn value_type_never_block_trailer_start_marker() { + for variant in crate::ValueType::iter() { + let n: u8 = variant.into(); + assert_ne!(n, TRAILER_START_MARKER); + } + } +} From f4b81ee538942d167aece1e6c0d805ad1aeeb406 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 14:57:32 +0200 Subject: [PATCH 573/613] change value type --- src/value_type.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/value_type.rs b/src/value_type.rs index 2fabea55..efc1bbb7 100644 --- a/src/value_type.rs +++ b/src/value_type.rs @@ -5,6 +5,7 @@ /// Value type (regular value or tombstone) #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[allow(clippy::module_name_repetitions)] +#[cfg_attr(test, derive(strum::EnumIter))] pub enum ValueType { /// Existing value Value, @@ -41,7 +42,7 @@ impl TryFrom for ValueType { 0 => Ok(Self::Value), 0x0000_0001 => Ok(Self::Tombstone), 0x0000_0011 => Ok(Self::WeakTombstone), - 0b1000_0000 => Ok(Self::Indirection), + 0b0000_0100 => Ok(Self::Indirection), _ => Err(()), } } @@ -53,7 +54,7 @@ impl From for u8 { ValueType::Value => 0, ValueType::Tombstone => 0x0000_0001, ValueType::WeakTombstone => 0x0000_0011, - ValueType::Indirection => 0b1000_0000, + ValueType::Indirection => 0b0000_0100, } } } From b3c1ada1f57023ba82a4be1ab837058accf16062 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 14:58:06 +0200 Subject: [PATCH 574/613] checksum before decompression --- src/segment/block/mod.rs | 79 ++++++++++++++++++--------------- src/segment/data_block/iter.rs | 21 --------- src/segment/data_block/mod.rs | 16 ------- src/segment/index_block/iter.rs | 10 ----- src/segment/writer/mod.rs | 8 ---- src/vlog/blob_file/reader.rs | 34 +++++++------- src/vlog/blob_file/scanner.rs | 41 ++++++++++------- src/vlog/blob_file/writer.rs | 21 +++++---- 8 files changed, 95 insertions(+), 135 deletions(-) diff --git a/src/segment/block/mod.rs b/src/segment/block/mod.rs index 10edc744..b332ae27 100644 --- a/src/segment/block/mod.rs +++ b/src/segment/block/mod.rs @@ -50,10 +50,9 @@ impl Block { ) -> crate::Result
{ let mut header = Header { block_type, - checksum: Checksum::from_raw(crate::hash::hash128(data)), - data_length: 0, // <-- NOTE: Is set later on + checksum: Checksum::from_raw(0), // <-- NOTE: Is set later on + data_length: 0, // <-- NOTE: Is set later on uncompressed_length: data.len() as u32, - previous_block_offset: BlockOffset(0), // <-- TODO: }; let data = match compression { @@ -63,6 +62,7 @@ impl Block { CompressionType::Lz4 => &lz4_flex::compress(data), }; header.data_length = data.len() as u32; + header.checksum = Checksum::from_raw(crate::hash::hash128(data)); header.encode_into(&mut writer)?; writer.write_all(data)?; @@ -85,6 +85,20 @@ impl Block { let header = Header::decode_from(reader)?; let raw_data = Slice::from_reader(reader, header.data_length as usize)?; + let checksum = Checksum::from_raw(crate::hash::hash128(&raw_data)); + if checksum != header.checksum { + log::error!( + "Checksum mismatch for , got={}, expected={}", + *checksum, + *header.checksum, + ); + + return Err(crate::Error::ChecksumMismatch { + got: checksum, + expected: header.checksum, + }); + } + let data = match compression { CompressionType::None => raw_data, @@ -108,20 +122,6 @@ impl Block { } }); - let checksum = Checksum::from_raw(crate::hash::hash128(&data)); - if checksum != header.checksum { - log::error!( - "Checksum mismatch for , got={}, expected={}", - *checksum, - *header.checksum, - ); - - return Err(crate::Error::ChecksumMismatch { - got: checksum, - expected: header.checksum, - }); - } - Ok(Self { header, data }) } @@ -135,8 +135,32 @@ impl Block { let header = Header::decode_from(&mut &buf[..])?; + #[allow(clippy::indexing_slicing)] + let checksum = Checksum::from_raw(crate::hash::hash128(&buf[Header::serialized_len()..])); + if checksum != header.checksum { + log::error!( + "Checksum mismatch for block {handle:?}, got={}, expected={}", + *checksum, + *header.checksum, + ); + + return Err(crate::Error::ChecksumMismatch { + got: checksum, + expected: header.checksum, + }); + } + let buf = match compression { - CompressionType::None => buf.slice(Header::serialized_len()..), + CompressionType::None => { + let value = buf.slice(Header::serialized_len()..); + + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + debug_assert_eq!(header.uncompressed_length, value.len() as u32); + } + + value + } #[cfg(feature = "lz4")] CompressionType::Lz4 => { @@ -156,25 +180,6 @@ impl Block { } }; - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] - { - debug_assert_eq!(header.uncompressed_length, buf.len() as u32); - } - - let checksum = Checksum::from_raw(crate::hash::hash128(&buf)); - if checksum != header.checksum { - log::error!( - "Checksum mismatch for block {handle:?}, got={}, expected={}", - *checksum, - *header.checksum, - ); - - return Err(crate::Error::ChecksumMismatch { - got: checksum, - expected: header.checksum, - }); - } - Ok(Self { header, data: buf }) } } diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index f02b45a7..19232ec9 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -187,7 +187,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -262,7 +261,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -337,7 +335,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - 
previous_block_offset: BlockOffset(0), }, }); @@ -390,7 +387,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -426,7 +422,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -466,7 +461,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -507,7 +501,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -589,7 +582,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -631,7 +623,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -672,7 +663,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -776,7 +766,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -848,7 +837,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -886,7 +874,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -927,7 +914,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -968,7 +954,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1006,7 +991,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1040,7 +1024,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1129,7 +1112,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1220,7 +1202,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1320,7 +1301,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1377,7 +1357,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 1496de4d..3b99ffe0 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -550,7 +550,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -612,7 +611,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -654,7 +652,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -694,7 +691,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, 
- previous_block_offset: BlockOffset(0), }, }); @@ -728,7 +724,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -757,7 +752,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -799,7 +793,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -836,7 +829,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -872,7 +864,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -919,7 +910,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -965,7 +955,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1015,7 +1004,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1065,7 +1053,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1102,7 +1089,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1140,7 +1126,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -1184,7 +1169,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index f3c44600..f4da5681 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -76,7 +76,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -113,7 +112,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -148,7 +146,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -186,7 +183,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -221,7 +217,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -256,7 +251,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -298,7 +292,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -333,7 +326,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -384,7 +376,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); @@ -434,7 +425,6 @@ mod tests { checksum: Checksum::from_raw(0), data_length: 0, 
uncompressed_length: 0, - previous_block_offset: BlockOffset(0), }, }); diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 84fb0843..adc51c3c 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -247,14 +247,6 @@ impl Writer { self.data_block_hash_ratio, )?; - // log::warn!("encoding {:?}", self.chunk); - // log::warn!( - // "encoded 0x{:#X?} -> {:?}", - // self.meta.file_pos, - // self.block_buffer - // ); - - // TODO: prev block offset let header = Block::write_into( &mut self.block_writer, &self.block_buffer, diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 03abd79d..ba898aa2 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -64,27 +64,11 @@ impl<'a> Reader<'a> { let raw_data = value.slice((add_size as usize)..); - #[warn(clippy::match_single_binding)] - let value = match &self.blob_file.0.meta.compression { - CompressionType::None => raw_data, - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => { - #[warn(unsafe_code)] - let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; - - lz4_flex::decompress_into(&raw_data, &mut builder) - .map_err(|_| crate::Error::Decompress(self.blob_file.0.meta.compression))?; - - builder.freeze().into() - } - }; - { let checksum = { let mut hasher = xxhash_rust::xxh3::Xxh3::default(); hasher.update(key); - hasher.update(&value); + hasher.update(&raw_data); hasher.digest128() }; @@ -100,6 +84,22 @@ impl<'a> Reader<'a> { } } + #[warn(clippy::match_single_binding)] + let value = match &self.blob_file.0.meta.compression { + CompressionType::None => raw_data, + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => { + #[warn(unsafe_code)] + let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; + + lz4_flex::decompress_into(&raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(self.blob_file.0.meta.compression))?; + + builder.freeze().into() + } + }; + Ok(value) } } diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 9a23311e..be3e28a7 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -97,27 +97,11 @@ impl Iterator for Scanner { on_disk_val_len as usize )); - #[warn(clippy::match_single_binding)] - let value = match &self.compression { - CompressionType::None => raw_data, - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => { - #[warn(unsafe_code)] - let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; - - fail_iter!(lz4_flex::decompress_into(&raw_data, &mut builder) - .map_err(|_| crate::Error::Decompress(self.compression))); - - builder.freeze().into() - } - }; - { let checksum = { let mut hasher = xxhash_rust::xxh3::Xxh3::default(); hasher.update(&key); - hasher.update(&value); + hasher.update(&raw_data); hasher.digest128() }; @@ -134,6 +118,29 @@ impl Iterator for Scanner { } } + #[warn(clippy::match_single_binding)] + let value = match &self.compression { + CompressionType::None => { + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] + { + debug_assert_eq!(real_val_len, raw_data.len() as u32); + } + + raw_data + } + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => { + #[warn(unsafe_code)] + let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; + + fail_iter!(lz4_flex::decompress_into(&raw_data, &mut builder) + .map_err(|_| crate::Error::Decompress(self.compression))); + + builder.freeze().into() + } + }; + Some(Ok(ScanEntry { key, seqno, diff --git 
a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index 8354f532..19e1dc0a 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -126,10 +126,19 @@ impl Writer { // Write header self.writer.write_all(BLOB_HEADER_MAGIC)?; + let uncompressed_len = value.len(); + + let value = match &self.compression { + CompressionType::None => std::borrow::Cow::Borrowed(value), + + #[cfg(feature = "lz4")] + CompressionType::Lz4 => std::borrow::Cow::Owned(lz4_flex::compress(value)), + }; + let checksum = { let mut hasher = xxhash_rust::xxh3::Xxh3::default(); hasher.update(key); - hasher.update(value); + hasher.update(&value); hasher.digest128() }; @@ -147,14 +156,8 @@ impl Writer { // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.writer.write_u32::(value.len() as u32)?; - - let value = match &self.compression { - CompressionType::None => std::borrow::Cow::Borrowed(value), - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => std::borrow::Cow::Owned(lz4_flex::compress(value)), - }; + self.writer + .write_u32::(uncompressed_len as u32)?; // Write compressed (on-disk) value length From 9ed50cccd673b6a4684b95130cc33cfe5da65147 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 16:08:47 +0200 Subject: [PATCH 575/613] relocation skip recompressing --- Cargo.toml | 2 +- src/compaction/flavour.rs | 8 +++-- src/compaction/worker.rs | 7 ++--- src/config/mod.rs | 2 +- src/vlog/blob_file/multi_writer.rs | 48 +++++++++++++++++++++++++++--- src/vlog/blob_file/scanner.rs | 14 ++++----- src/vlog/blob_file/writer.rs | 41 ++++++++++++++----------- tests/blob_compression.rs | 5 ++-- 8 files changed, 86 insertions(+), 41 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1cddfaac..7d078534 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "3.0.0-pre.0" +version = "3.0.0-pre.1" edition = "2021" rust-version = "1.87.0" readme = "README.md" diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index c03f8d32..eb8b96b4 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -190,8 +190,12 @@ impl CompactionFlavour for RelocatingCompaction { log::trace!("RELOCATE to {indirection:?}"); - self.blob_writer - .write(&item.key.user_key, item.key.seqno, &blob_entry.value)?; + self.blob_writer.write_raw( + &item.key.user_key, + item.key.seqno, + &blob_entry.value, + blob_entry.uncompressed_len, + )?; self.inner .table_writer diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index f236cad8..5fe36fb9 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -373,10 +373,7 @@ fn merge_segments( let scanner = BlobFileMergeScanner::new( blob_files_to_rewrite .iter() - .map(|bf| { - Ok(BlobFileScanner::new(&bf.0.path, bf.id())? - .use_compression(bf.0.meta.compression)) - }) + .map(|bf| BlobFileScanner::new(&bf.0.path, bf.id())) .collect::>>()?, ); @@ -385,7 +382,7 @@ fn merge_segments( blob_opts.file_target_size, opts.config.path.join(BLOBS_FOLDER), )? 
- .use_compression(blob_opts.compression); + .use_passthrough_compression(blob_opts.compression); let inner = StandardCompaction::new(table_writer, segments); diff --git a/src/config/mod.rs b/src/config/mod.rs index bb1f477c..c48c89e7 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -123,7 +123,7 @@ impl KvSeparationOptions { /// Smaller value will reduce compaction overhead and thus write amplification, /// at the cost of lower read performance. /// - /// Defaults to 4KiB. + /// Defaults to 1 KiB. #[must_use] pub fn separation_threshold(mut self, bytes: u32) -> Self { self.separation_threshold = bytes; diff --git a/src/vlog/blob_file/multi_writer.rs b/src/vlog/blob_file/multi_writer.rs index 51e81eb0..31c018b5 100644 --- a/src/vlog/blob_file/multi_writer.rs +++ b/src/vlog/blob_file/multi_writer.rs @@ -27,6 +27,7 @@ pub struct MultiWriter { id_generator: SequenceNumberCounter, compression: CompressionType, + passthrough_compression: CompressionType, } impl MultiWriter { @@ -56,6 +57,7 @@ impl MultiWriter { results: Vec::new(), compression: CompressionType::None, + passthrough_compression: CompressionType::None, }) } @@ -66,6 +68,16 @@ impl MultiWriter { self } + /// Sets the compression method in blob file writer metadata, but does not actually compress blobs. + /// + /// This is used in garbage collection to pass through already-compressed blobs, but correctly + /// set the compression type in the metadata. + pub(crate) fn use_passthrough_compression(mut self, compression: CompressionType) -> Self { + assert_eq!(self.compression, CompressionType::None); + self.passthrough_compression = compression; + self + } + /// Sets the compression method. #[must_use] #[doc(hidden)] @@ -96,13 +108,16 @@ impl MultiWriter { Writer::new(blob_file_path, new_blob_file_id)?.use_compression(self.compression); let old_writer = std::mem::replace(&mut self.active_writer, new_writer); - let blob_file = Self::consume_writer(old_writer)?; + let blob_file = Self::consume_writer(old_writer, self.passthrough_compression)?; self.results.extend(blob_file); Ok(()) } - fn consume_writer(writer: Writer) -> crate::Result> { + fn consume_writer( + writer: Writer, + passthrough_compression: CompressionType, + ) -> crate::Result> { if writer.item_count > 0 { let blob_file_id = writer.blob_file_id; @@ -134,7 +149,11 @@ impl MultiWriter { .clone() .expect("should have written at least 1 item"), )), - compression: writer.compression, + compression: if passthrough_compression == CompressionType::None { + writer.compression + } else { + passthrough_compression + }, }, })); @@ -178,9 +197,30 @@ impl MultiWriter { Ok(bytes_written) } + pub(crate) fn write_raw( + &mut self, + key: &[u8], + seqno: SeqNo, + value: &[u8], + uncompressed_len: u32, + ) -> crate::Result { + let target_size = self.target_size; + + // Write actual value into blob file + let writer = &mut self.active_writer; + let bytes_written = writer.write_raw(key, seqno, value, uncompressed_len)?; + + // Check for blob file size target, maybe rotate to next writer + if writer.offset() >= target_size { + self.rotate()?; + } + + Ok(bytes_written) + } + pub(crate) fn finish(mut self) -> crate::Result> { if self.active_writer.item_count > 0 { - let blob_file = Self::consume_writer(self.active_writer)?; + let blob_file = Self::consume_writer(self.active_writer, self.passthrough_compression)?; self.results.extend(blob_file); } diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index be3e28a7..76a186a9 100644 --- a/src/vlog/blob_file/scanner.rs 
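The `use_passthrough_compression` path above changes metadata only; the blob bytes themselves are never re-encoded. A reduced sketch of the idea with illustrative types: during garbage collection the scanner yields the still-compressed value together with its original uncompressed length, and `write_raw` trusts that length instead of recomputing it from `value.len()`:

```rust
struct RelocatedBlob {
    key: Vec<u8>,
    value: Vec<u8>,        // on-disk bytes, possibly already LZ4-compressed
    uncompressed_len: u32, // logical size, carried along for accounting
}

#[derive(Default)]
struct BlobWriterModel {
    uncompressed_bytes: u64,
    out: Vec<u8>,
}

impl BlobWriterModel {
    // Analogue of `write_raw`: no decompress/recompress round-trip.
    fn write_raw(&mut self, blob: &RelocatedBlob) {
        // Space statistics use the caller-supplied *logical* length...
        self.uncompressed_bytes += u64::from(blob.uncompressed_len);
        // ...while the payload is copied through byte-for-byte.
        self.out.extend_from_slice(&blob.key);
        self.out.extend_from_slice(&blob.value);
    }
}
```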
+++ b/src/vlog/blob_file/scanner.rs @@ -55,6 +55,7 @@ pub struct ScanEntry { pub seqno: SeqNo, pub value: UserValue, pub offset: u64, + pub uncompressed_len: u32, } impl Iterator for Scanner { @@ -87,7 +88,10 @@ impl Iterator for Scanner { let seqno = fail_iter!(self.inner.read_u64::()); let key_len = fail_iter!(self.inner.read_u16::()); + + #[allow(unused)] let real_val_len = fail_iter!(self.inner.read_u32::()); + let on_disk_val_len = fail_iter!(self.inner.read_u32::()); let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize)); @@ -120,14 +124,7 @@ impl Iterator for Scanner { #[warn(clippy::match_single_binding)] let value = match &self.compression { - CompressionType::None => { - #[allow(clippy::expect_used, clippy::cast_possible_truncation)] - { - debug_assert_eq!(real_val_len, raw_data.len() as u32); - } - - raw_data - } + CompressionType::None => raw_data, #[cfg(feature = "lz4")] CompressionType::Lz4 => { @@ -146,6 +143,7 @@ impl Iterator for Scanner { seqno, value, offset, + uncompressed_len: real_val_len, })) } } diff --git a/src/vlog/blob_file/writer.rs b/src/vlog/blob_file/writer.rs index 19e1dc0a..46135408 100644 --- a/src/vlog/blob_file/writer.rs +++ b/src/vlog/blob_file/writer.rs @@ -88,18 +88,13 @@ impl Writer { self.blob_file_id } - /// Writes an item into the file. - /// - /// Items need to be written in key order. - /// - /// # Errors - /// - /// Will return `Err` if an IO error occurs. - /// - /// # Panics - /// - /// Panics if the key length is empty or greater than 2^16, or the value length is greater than 2^32. - pub fn write(&mut self, key: &[u8], seqno: SeqNo, value: &[u8]) -> crate::Result { + pub(crate) fn write_raw( + &mut self, + key: &[u8], + seqno: SeqNo, + value: &[u8], + uncompressed_len: u32, + ) -> crate::Result { assert!(!key.is_empty()); assert!(u16::try_from(key.len()).is_ok()); assert!(u32::try_from(value.len()).is_ok()); @@ -109,7 +104,7 @@ impl Writer { } self.last_key = Some(key.into()); - self.uncompressed_bytes += value.len() as u64; + self.uncompressed_bytes += u64::from(uncompressed_len); // NOTE: // BLOB HEADER LAYOUT @@ -126,8 +121,6 @@ impl Writer { // Write header self.writer.write_all(BLOB_HEADER_MAGIC)?; - let uncompressed_len = value.len(); - let value = match &self.compression { CompressionType::None => std::borrow::Cow::Borrowed(value), @@ -156,8 +149,7 @@ impl Writer { // NOTE: Truncation is okay and actually needed #[allow(clippy::cast_possible_truncation)] - self.writer - .write_u32::(uncompressed_len as u32)?; + self.writer.write_u32::(uncompressed_len)?; // Write compressed (on-disk) value length @@ -192,6 +184,21 @@ impl Writer { Ok(value.len() as u32) } + /// Writes an item into the file. + /// + /// Items need to be written in key order. + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + /// + /// # Panics + /// + /// Panics if the key length is empty or greater than 2^16, or the value length is greater than 2^32. 
+ pub fn write(&mut self, key: &[u8], seqno: SeqNo, value: &[u8]) -> crate::Result { + self.write_raw(key, seqno, value, value.len() as u32) + } + pub(crate) fn finish(mut self) -> crate::Result<()> { self.writer.start("meta")?; diff --git a/tests/blob_compression.rs b/tests/blob_compression.rs index 752535ca..b191c800 100644 --- a/tests/blob_compression.rs +++ b/tests/blob_compression.rs @@ -1,8 +1,7 @@ -#[test] +#[test_log::test] #[cfg(feature = "lz4")] fn blob_tree_compression() -> lsm_tree::Result<()> { use lsm_tree::{blob_tree::FragmentationEntry, AbstractTree, KvSeparationOptions, SeqNo}; - use test_log::test; let folder = tempfile::tempdir()?; let path = folder.path(); @@ -17,7 +16,7 @@ fn blob_tree_compression() -> lsm_tree::Result<()> { )) .open()?; - let big_value = b"abc".repeat(128_000); + let big_value = b"abc".repeat(50); tree.insert("a", &big_value, 0); tree.insert("b", b"smol", 0); From 65ceaa0aee078435455f903c1449a31040732c2f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 16:14:06 +0200 Subject: [PATCH 576/613] refactor --- src/vlog/blob_file/scanner.rs | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 76a186a9..18571d94 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -3,9 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{meta::METADATA_HEADER_MAGIC, writer::BLOB_HEADER_MAGIC}; -use crate::{ - coding::DecodeError, vlog::BlobFileId, Checksum, CompressionType, SeqNo, UserKey, UserValue, -}; +use crate::{coding::DecodeError, vlog::BlobFileId, Checksum, SeqNo, UserKey, UserValue}; use byteorder::{LittleEndian, ReadBytesExt}; use std::{ fs::File, @@ -18,7 +16,6 @@ pub struct Scanner { pub(crate) blob_file_id: BlobFileId, // TODO: remove unused? 
inner: BufReader, is_terminated: bool, - compression: CompressionType, } impl Scanner { @@ -39,14 +36,8 @@ impl Scanner { blob_file_id, inner: file_reader, is_terminated: false, - compression: CompressionType::None, } } - - pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { - self.compression = compression; - self - } } #[derive(Debug)] @@ -96,7 +87,7 @@ impl Iterator for Scanner { let key = fail_iter!(UserKey::from_reader(&mut self.inner, key_len as usize)); - let raw_data = fail_iter!(UserValue::from_reader( + let value = fail_iter!(UserValue::from_reader( &mut self.inner, on_disk_val_len as usize )); @@ -105,7 +96,7 @@ impl Iterator for Scanner { let checksum = { let mut hasher = xxhash_rust::xxh3::Xxh3::default(); hasher.update(&key); - hasher.update(&raw_data); + hasher.update(&value); hasher.digest128() }; @@ -122,22 +113,6 @@ impl Iterator for Scanner { } } - #[warn(clippy::match_single_binding)] - let value = match &self.compression { - CompressionType::None => raw_data, - - #[cfg(feature = "lz4")] - CompressionType::Lz4 => { - #[warn(unsafe_code)] - let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; - - fail_iter!(lz4_flex::decompress_into(&raw_data, &mut builder) - .map_err(|_| crate::Error::Decompress(self.compression))); - - builder.freeze().into() - } - }; - Some(Ok(ScanEntry { key, seqno, From a45791c75f3820b7a33c2c215651e8e339cc3911 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 16:16:44 +0200 Subject: [PATCH 577/613] wip --- src/vlog/blob_file/scanner.rs | 39 ----------------------------------- 1 file changed, 39 deletions(-) diff --git a/src/vlog/blob_file/scanner.rs b/src/vlog/blob_file/scanner.rs index 18571d94..4c93dd24 100644 --- a/src/vlog/blob_file/scanner.rs +++ b/src/vlog/blob_file/scanner.rs @@ -166,43 +166,4 @@ mod tests { Ok(()) } - - #[test] - #[cfg(feature = "lz4")] - fn blob_scanner_lz4() -> crate::Result<()> { - let dir = tempdir()?; - let blob_file_path = dir.path().join("0"); - - let keys = [b"a", b"b", b"c", b"d", b"e"]; - - { - let mut writer = - BlobFileWriter::new(&blob_file_path, 0)?.use_compression(CompressionType::Lz4); - - for key in keys { - writer.write(key, 0, &key.repeat(100))?; - } - - writer.finish()?; - } - - { - let mut scanner = - Scanner::new(&blob_file_path, 0)?.use_compression(CompressionType::Lz4); - - for key in keys { - assert_eq!( - (Slice::from(key), Slice::from(key.repeat(100))), - scanner - .next() - .map(|result| result.map(|entry| { (entry.key, entry.value) })) - .unwrap()?, - ); - } - - assert!(scanner.next().is_none()); - } - - Ok(()) - } } From 4ea2daf30aed46e665c2f4f87af103bf5de8f840 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 12 Oct 2025 16:29:15 +0200 Subject: [PATCH 578/613] fmt --- src/segment/writer/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 01d2648e..479808fb 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -7,8 +7,8 @@ use super::{ }; use crate::{ coding::Encode, file::fsync_directory, segment::filter::standard_bloom::Builder, - time::unix_timestamp, CompressionType, InternalValue, SegmentId, UserKey, ValueType, - vlog::BlobFileId, + time::unix_timestamp, vlog::BlobFileId, CompressionType, InternalValue, SegmentId, UserKey, + ValueType, }; use index::{BlockIndexWriter, FullIndexWriter}; use std::{fs::File, io::BufWriter, path::PathBuf}; From dea3753e4dff9036936689944e643da923825f9e Mon Sep 17 00:00:00 2001 From: 
marvin-j97 Date: Sun, 12 Oct 2025 16:33:12 +0200 Subject: [PATCH 579/613] fix --- src/tree/mod.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 8e85fc43..48572030 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -161,7 +161,7 @@ impl AbstractTree for Tree { ) } - // TODO: doctest + /// Returns the number of tombstones in the tree. fn tombstone_count(&self) -> u64 { self.current_version() .iter_segments() @@ -170,24 +170,16 @@ impl AbstractTree for Tree { } /// Returns the number of weak tombstones (single deletes) in the tree. - #[must_use] fn weak_tombstone_count(&self) -> u64 { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::weak_tombstone_count) .sum() } /// Returns the number of value entries that become reclaimable once weak tombstones can be GC'd. - #[must_use] fn weak_tombstone_reclaimable_count(&self) -> u64 { - self.manifest - .read() - .expect("lock is poisoned") - .current_version() + self.current_version() .iter_segments() .map(Segment::weak_tombstone_reclaimable) .sum() From 78b5ced73b66a733d6c2d76cd18b4730aaaed3ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 13 Oct 2025 17:14:11 +0200 Subject: [PATCH 580/613] doc --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6328eb7a..a109373a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -221,13 +221,13 @@ mod value_type; mod version; mod vlog; -/// User defined key +/// User defined key (byte array) pub type UserKey = Slice; /// User defined data (byte array) pub type UserValue = Slice; -/// KV-tuple, typically returned by an iterator +/// KV-tuple (key + value) pub type KvPair = (UserKey, UserValue); #[doc(hidden)] From 26ebb83166c8397334307b65ff0067980fa1edbe Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 13 Oct 2025 18:08:26 +0200 Subject: [PATCH 581/613] add test case --- tests/model_1.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/model_1.rs diff --git a/tests/model_1.rs b/tests/model_1.rs new file mode 100644 index 00000000..d0869e09 --- /dev/null +++ b/tests/model_1.rs @@ -0,0 +1,36 @@ +// Found by model testing + +use lsm_tree::{AbstractTree, Result}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_1() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path).open()?; + let compaction = Arc::new(lsm_tree::compaction::Leveled::default()); + + let value = b"hellohello"; + + tree.insert(b"a", value, 0); + tree.flush_active_memtable(0)?; + + tree.insert(b"b", value, 1); + tree.flush_active_memtable(0)?; + + tree.remove(b"b", 2); + tree.flush_active_memtable(0)?; + + tree.insert(b"c", value, 3); + tree.flush_active_memtable(0)?; + tree.compact(compaction.clone(), 0)?; + + { + let seqno = 4; + assert!(!tree.contains_key(b"b", seqno)?); + } + + Ok(()) +} From adc3a6c5fdf4a64ca117c112ad5db9e1d9e86f46 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 13 Oct 2025 18:08:33 +0200 Subject: [PATCH 582/613] update log levels --- src/segment/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index b20f2460..7510b544 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -420,11 +420,11 @@ impl Segment { let trailer = sfa::Reader::from_reader(&mut file)?; let regions = 
ParsedRegions::parse_from_toc(trailer.toc())?; - log::debug!("Reading meta block, with meta_ptr={:?}", regions.metadata); + log::trace!("Reading meta block, with meta_ptr={:?}", regions.metadata); let metadata = ParsedMeta::load_with_handle(&file, ®ions.metadata)?; let block_index = if let Some(index_block_handle) = regions.index { - log::debug!( + log::trace!( "Creating partitioned block index, with tli_ptr={:?}, index_block_ptr={index_block_handle:?}", regions.tli, ); @@ -434,7 +434,7 @@ impl Segment { // BlockIndexImpl::TwoLevel(tli_block, todo!()) } else if pin_index { let tli_block = { - log::debug!("Reading TLI block, with tli_ptr={:?}", regions.tli); + log::trace!("Reading TLI block, with tli_ptr={:?}", regions.tli); let block = Block::from_file(&file, regions.tli, metadata.index_block_compression)?; @@ -448,13 +448,13 @@ impl Segment { IndexBlock::new(block) }; - log::debug!( + log::trace!( "Creating pinned block index, with tli_ptr={:?}", regions.tli, ); BlockIndexImpl::Full(FullBlockIndex::new(tli_block)) } else { - log::debug!("Creating volatile block index"); + log::trace!("Creating volatile block index"); BlockIndexImpl::VolatileFull }; From d0395387bd1f8f70b32e1fb4dea007b305ba8b36 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 13 Oct 2025 20:11:43 +0200 Subject: [PATCH 583/613] fix: tombstone dropping --- src/compaction/stream.rs | 22 +++++++++++++++++++++- src/compaction/worker.rs | 11 ++++------- src/segment/data_block/mod.rs | 2 +- src/version/mod.rs | 2 +- tests/model_1.rs | 3 ++- 5 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index 39c77afe..d1ae4513 100644 --- a/src/compaction/stream.rs +++ b/src/compaction/stream.rs @@ -28,6 +28,8 @@ pub struct CompactionStream<'a, I: Iterator> { /// Event emitter that receives all expired KVs expiration_callback: Option<&'a mut dyn ExpiredKvCallback>, + + evict_tombstones: bool, } impl<'a, I: Iterator> CompactionStream<'a, I> { @@ -40,9 +42,15 @@ impl<'a, I: Iterator> CompactionStream<'a, I> { inner: iter, gc_seqno_threshold, expiration_callback: None, + evict_tombstones: false, } } + pub fn evict_tombstones(mut self, b: bool) -> Self { + self.evict_tombstones = b; + self + } + /// Installs a callback that receives all expired KVs. pub fn with_expiration_callback(mut self, cb: &'a mut dyn ExpiredKvCallback) -> Self { self.expiration_callback = Some(cb); @@ -80,7 +88,7 @@ impl> Iterator for CompactionStream<'_, I> { fn next(&mut self) -> Option { loop { - let mut head = fail_iter!(self.inner.next()?); + let head = fail_iter!(self.inner.next()?); if let Some(peeked) = self.inner.peek() { let Ok(peeked) = peeked else { @@ -94,8 +102,18 @@ impl> Iterator for CompactionStream<'_, I> { }; if peeked.key.user_key > head.key.user_key { + if head.is_tombstone() && self.evict_tombstones { + continue; + } + // NOTE: Only item of this key and thus latest version, so return it no matter what + // ... 
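For context on the `evict_tombstones` flag introduced above: dropping a tombstone is only safe when every older version it shadows is dropped with it, which is what the `drain_key` call in the hunk just below does. A reduced model of that rule, ignoring the GC seqno threshold and snapshot visibility (types are illustrative; input is sorted by key ascending, then seqno descending):

```rust
type Item<'a> = (&'a str, u64, Option<&'a str>); // (key, seqno, value); None = tombstone

fn compact<'a>(input: &[Item<'a>], evict_tombstones: bool) -> Vec<Item<'a>> {
    let mut out = Vec::new();
    let mut i = 0;

    while i < input.len() {
        let (key, _, value) = input[i];

        if value.is_none() && evict_tombstones {
            // Drain *all* versions of this key, not just the tombstone;
            // skipping only the tombstone would resurrect the old value.
            while i < input.len() && input[i].0 == key {
                i += 1;
            }
        } else {
            out.push(input[i]);
            i += 1;
        }
    }

    out
}

// compact(&[("b", 2, None), ("b", 1, Some("x"))], true) == []:
// the shadowed "x" does not come back, matching the model_1 test case.
```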
} else if peeked.key.seqno < self.gc_seqno_threshold { + if head.key.value_type == ValueType::Tombstone && self.evict_tombstones { + fail_iter!(self.drain_key(&head.key.user_key)); + continue; + } + // NOTE: If next item is an actual value, and current value is weak tombstone, // drop the tombstone let drop_weak_tombstone = peeked.key.value_type == ValueType::Value @@ -109,6 +127,8 @@ impl> Iterator for CompactionStream<'_, I> { continue; } } + } else if head.is_tombstone() && self.evict_tombstones { + continue; } // TODO: look at how this plays with blob GC diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 5fe36fb9..5dd293d8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -288,6 +288,8 @@ fn merge_segments( // That way we don't resurrect data beneath the tombstone let is_last_level = payload.dest_level == last_level; + merge_iter = merge_iter.evict_tombstones(is_last_level); + let current_version = &super_version.version; let table_writer = super::flavour::prepare_table_writer(current_version, opts, payload)?; @@ -362,8 +364,8 @@ fn merge_segments( Box::new(StandardCompaction::new(table_writer, segments)) as Box } else { - log::debug!( - "Relocate blob files: {:#?}", + log::warn!( + "Relocate blob files: {:?}", blob_files_to_rewrite .iter() .map(BlobFile::id) @@ -415,11 +417,6 @@ fn merge_segments( for (idx, item) in merge_iter.enumerate() { let item = item?; - // IMPORTANT: We can only drop tombstones when writing into last level - if is_last_level && item.is_tombstone() { - continue; - } - compactor.write(item)?; if idx % 1_000_000 == 0 && opts.stop_signal.is_stopped() { diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 3b99ffe0..02c880df 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -515,7 +515,7 @@ mod tests { use crate::{ segment::{ block::{BlockType, Header, ParsedItem}, - Block, BlockOffset, DataBlock, + Block, DataBlock, }, Checksum, InternalValue, SeqNo, Slice, ValueType::{Tombstone, Value}, diff --git a/src/version/mod.rs b/src/version/mod.rs index 583f67a5..b0a7a41e 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -638,7 +638,7 @@ impl Version { Ok(()) } - pub fn fmt(&self, f: &mut std::fmt::Formatter<'_>, hidden_set: &HiddenSet) -> std::fmt::Result { + pub fn fmt(&self, mut f: impl std::io::Write, hidden_set: &HiddenSet) -> std::io::Result<()> { for (idx, level) in self.iter_levels().enumerate() { writeln!( f, diff --git a/tests/model_1.rs b/tests/model_1.rs index d0869e09..7b9d400e 100644 --- a/tests/model_1.rs +++ b/tests/model_1.rs @@ -28,7 +28,8 @@ fn model_1() -> Result<()> { tree.compact(compaction.clone(), 0)?; { - let seqno = 4; + log::info!(r#"Getting "b""#); + let seqno = 5; assert!(!tree.contains_key(b"b", seqno)?); } From b1b682eebeb359fae8f1f6cc05ab358b6724a602 Mon Sep 17 00:00:00 2001 From: zaidoon Date: Mon, 13 Oct 2025 15:55:22 -0400 Subject: [PATCH 584/613] handle edge cases in block decoder partition point and range iteration --- src/segment/block/decoder.rs | 72 +++++++++++++++++++++------ src/segment/index_block/iter.rs | 36 ++++++++++++-- src/segment/iter.rs | 87 ++++++++++++++++++++++++++------- 3 files changed, 160 insertions(+), 35 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index a950d230..6d4b9678 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -166,10 +166,19 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa Item::parse_restart_key(&mut 
cursor, pos, bytes).expect("should exist") } - fn partition_point( - &self, - pred: impl Fn(&[u8]) -> bool, - ) -> Option<(/* offset */ usize, /* idx */ usize)> { + fn partition_point(&self, pred: &mut F) -> Option<(/* offset */ usize, /* idx */ usize)> + where + F: FnMut(&[u8]) -> bool, + { + // The first pass over the binary index emulates `Iterator::partition_point` over the + // restart heads that are in natural key order. We keep track of both the byte offset and + // the restart index because callers need the offset to seed the linear scanner, while the + // index is sometimes reused (for example by `seek_upper`). + // + // In contrast to the usual `partition_point`, we intentionally return the *last* restart + // entry when the predicate continues to hold for every head key. Forward scans rely on + // this behaviour to land on the final restart interval and resume the linear scan there + // instead of erroneously reporting "not found". let binary_index = self.get_binary_index_reader(); debug_assert!( @@ -202,16 +211,27 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa return Some((0, 0)); } + if left == binary_index.len() { + let idx = binary_index.len() - 1; + let offset = binary_index.get(idx); + return Some((offset, idx)); + } + let offset = binary_index.get(left - 1); Some((offset, left - 1)) } // TODO: - fn partition_point_2( - &self, - pred: impl Fn(&[u8]) -> bool, - ) -> Option<(/* offset */ usize, /* idx */ usize)> { + fn partition_point_2(&self, pred: &mut F) -> Option<(/* offset */ usize, /* idx */ usize)> + where + F: FnMut(&[u8]) -> bool, + { + // `partition_point_2` mirrors `partition_point` but keeps the *next* restart entry instead + // of the previous one. This variant is used exclusively by reverse scans (`seek_upper`) + // that want the first restart whose head key exceeds the predicate. Returning the raw + // offset preserves the ability to reuse linear scanning infrastructure without duplicating + // decoder logic. let binary_index = self.get_binary_index_reader(); debug_assert!( @@ -258,12 +278,12 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Seeks using the given predicate. /// /// Returns `false` if the key does not possible exist. - pub fn seek(&mut self, pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool { + pub fn seek(&mut self, mut pred: impl FnMut(&[u8]) -> bool, second_partition: bool) -> bool { // TODO: make this nicer, maybe predicate that can affect the resulting index...? let result = if second_partition { - self.partition_point_2(pred) + self.partition_point_2(&mut pred) } else { - self.partition_point(pred) + self.partition_point(&mut pred) }; // Binary index lookup @@ -271,6 +291,26 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa return false; }; + if second_partition && pred(self.get_key_at(offset)) { + // `second_partition == true` means we ran the "look one restart ahead" search used by + // index blocks. When the predicate is still true at the chosen restart head it means + // the caller asked us to seek strictly beyond the last entry. In that case we skip any + // costly parsing and flip both scanners into an "exhausted" state so the outer iterator + // immediately reports EOF. 
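The clamping rule described in the comments above can be modeled on a plain sorted slice. A sketch mirroring the `left == 0` / `left == len` handling (names and `&str` heads are illustrative):

```rust
/// Returns the restart index a forward seek should scan from: if `pred`
/// holds for every head, clamp onto the *last* interval instead of
/// reporting "not found".
fn seek_restart_idx(heads: &[&str], pred: impl Fn(&str) -> bool) -> Option<usize> {
    if heads.is_empty() {
        return None;
    }

    let left = heads.partition_point(|h| pred(h));

    if left == 0 {
        Some(0)
    } else {
        // `min` folds the "ran past the end" case onto the final interval.
        Some((left - 1).min(heads.len() - 1))
    }
}

// seek_restart_idx(&["b", "d", "f"], |h| h < "x") == Some(2):
// the linear scan resumes inside the last restart interval.
```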
+ let end = self.block.data.len(); + + self.lo_scanner.offset = end; + self.lo_scanner.remaining_in_interval = 0; + self.lo_scanner.base_key_offset = None; + + self.hi_scanner.offset = end; + self.hi_scanner.ptr_idx = usize::MAX; + self.hi_scanner.stack.clear(); + self.hi_scanner.base_key_offset = Some(0); + + return false; + } + self.lo_scanner.offset = offset; true @@ -279,11 +319,15 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Seeks the upper bound using the given predicate. /// /// Returns `false` if the key does not possible exist. - pub fn seek_upper(&mut self, pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool { + pub fn seek_upper( + &mut self, + mut pred: impl FnMut(&[u8]) -> bool, + second_partition: bool, + ) -> bool { let result = if second_partition { - self.partition_point_2(pred) + self.partition_point_2(&mut pred) } else { - self.partition_point(pred) + self.partition_point(&mut pred) }; // Binary index lookup diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index f4da5681..e12680ed 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -271,11 +271,7 @@ mod tests { Ok(()) } - // TODO: seek and seek_upper need separate binary search routines...? - // TODO: because seeking in [a,b,c] to e should return None for seek, - // TODO: but not for seek_upper #[test] - #[ignore = "should not seek"] fn v3_index_block_iter_too_far() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), @@ -309,6 +305,38 @@ mod tests { Ok(()) } + #[test] + fn v3_index_block_iter_too_far_next_back() -> crate::Result<()> { + let items = [ + KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), + KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), + KeyedBlockHandle::new(b"def".into(), BlockOffset(13_000), 5_000), + ]; + + let bytes = IndexBlock::encode_into_vec(&items)?; + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Index, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + }, + }); + + let mut iter = index_block.iter(); + assert!(!iter.seek(b"zzz"), "should not seek"); + + assert!(iter.next().is_none(), "iterator should be exhausted"); + assert!( + iter.next_back().is_none(), + "reverse iterator should also be exhausted" + ); + + Ok(()) + } + #[test] fn v3_index_block_iter_span() -> crate::Result<()> { let items = [ diff --git a/src/segment/iter.rs b/src/segment/iter.rs index 66ac0a3d..dead432f 100644 --- a/src/segment/iter.rs +++ b/src/segment/iter.rs @@ -130,6 +130,8 @@ impl Iterator for Iter { type Item = crate::Result; fn next(&mut self) -> Option { + // Always try to keep iterating inside the already-materialized low data block first; this + // lets callers consume multiple entries without touching the index or cache again. if let Some(block) = &mut self.lo_data_block { if let Some(item) = block.next().map(Ok) { return Some(item); @@ -137,32 +139,54 @@ impl Iterator for Iter { } if !self.index_initialized { + // The index iterator is lazy-initialized on the first call so that the constructor does + // not eagerly seek. This is important because range bounds might be configured *after* + // `Iter::new`, and we only want to pay the seek cost if iteration actually happens. + let mut ok = true; + if let Some(key) = &self.range.0 { - self.index_iter.seek_lower(key); + // Seek to the first block whose end key is ≥ lower bound. 
If this fails we can + // immediately conclude the range is empty. + ok = self.index_iter.seek_lower(key); } - if let Some(key) = &self.range.1 { - self.index_iter.seek_upper(key); + + if ok { + if let Some(key) = &self.range.1 { + // Narrow the iterator further by skipping any blocks strictly above the upper + // bound. Again, a miss means the range is empty. + ok = self.index_iter.seek_upper(key); + } } + self.index_initialized = true; + + if !ok { + // No block in the index overlaps the requested window, so we clear state and return + // EOF without attempting to touch any data blocks. + self.lo_data_block = None; + self.hi_data_block = None; + return None; + } } loop { let Some(handle) = self.index_iter.next() else { - // NOTE: No more block handles from index, - // Now check hi buffer if it exists + // No more block handles coming from the index. Flush any pending items buffered on + // the high side (used by reverse iteration) before signalling completion. if let Some(block) = &mut self.hi_data_block { if let Some(item) = block.next().map(Ok) { return Some(item); } } - // NOTE: If there is no more item, we are done + // Nothing left to serve; drop both buffers so the iterator can be reused safely. self.lo_data_block = None; self.hi_data_block = None; return None; }; - // NOTE: Load next lo block + // Load the next data block referenced by the index handle. We try the shared block + // cache first to avoid hitting the filesystem, and fall back to `load_block` on miss. #[allow(clippy::single_match_else)] let block = match self.cache.get_block(self.segment_id, handle.offset()) { Some(block) => block, @@ -185,9 +209,12 @@ impl Iterator for Iter { let mut reader = create_data_block_reader(block); if let Some(key) = &self.range.0 { + // Each block is self-contained, so we have to apply range bounds again to discard + // entries that precede the requested lower key. reader.seek_lower(key, SeqNo::MAX); } if let Some(key) = &self.range.1 { + // Ditto for the upper bound: advance the block-local iterator to the right spot. reader.seek_upper(key, SeqNo::MAX); } @@ -197,6 +224,8 @@ impl Iterator for Iter { self.lo_data_block = Some(reader); if let Some(item) = item { + // Serving the first item immediately avoids stashing it in a temporary buffer and + // keeps block iteration semantics identical to the simple case at the top. return Some(Ok(item)); } } @@ -205,6 +234,8 @@ impl Iterator for Iter { impl DoubleEndedIterator for Iter { fn next_back(&mut self) -> Option { + // Mirror the forward iterator: prefer consuming buffered items from the high data block to + // avoid touching the index once a block has been materialized. if let Some(block) = &mut self.hi_data_block { if let Some(item) = block.next_back().map(Ok) { return Some(item); @@ -212,32 +243,48 @@ impl DoubleEndedIterator for Iter { } if !self.index_initialized { + // As in `next`, set up the index iterator lazily so that callers can configure range + // bounds before we spend time seeking or loading blocks. + let mut ok = true; + if let Some(key) = &self.range.0 { - self.index_iter.seek_lower(key); + ok = self.index_iter.seek_lower(key); } - if let Some(key) = &self.range.1 { - self.index_iter.seek_upper(key); + + if ok { + if let Some(key) = &self.range.1 { + ok = self.index_iter.seek_upper(key); + } } + self.index_initialized = true; + + if !ok { + // No index span overlaps the requested window; clear both buffers and finish early. 
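A reduced model of the lazy bound handling above, with a sorted slice standing in for the block index (all names illustrative): range bounds are resolved exactly once, and an empty window means EOF before any data block would be fetched from cache or disk:

```rust
/// Resolves inclusive lower/upper bounds to a half-open window.
/// `None` plays the role of the "exhausted" early-out above: the caller
/// reports EOF without ever materializing a block.
fn range_window(items: &[i32], lo: Option<i32>, hi: Option<i32>) -> Option<(usize, usize)> {
    let start = lo.map_or(0, |lo| items.partition_point(|&x| x < lo));
    let end = hi.map_or(items.len(), |hi| items.partition_point(|&x| x <= hi));

    (start < end).then_some((start, end))
}

// range_window(&[1, 3, 5], Some(7), None) == None: nothing overlaps,
// so iteration finishes without touching storage.
```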
+ self.lo_data_block = None; + self.hi_data_block = None; + return None; + } } loop { let Some(handle) = self.index_iter.next_back() else { - // NOTE: No more block handles from index, - // Now check lo buffer if it exists + // Once we exhaust the index in reverse order, flush any items that were buffered on + // the low side (set when iterating forward first) before signalling completion. if let Some(block) = &mut self.lo_data_block { if let Some(item) = block.next_back().map(Ok) { return Some(item); } } - // NOTE: If there is no more item, we are done + // Nothing left to produce; reset both buffers to keep the iterator reusable. self.lo_data_block = None; self.hi_data_block = None; return None; }; - // NOTE: Load next hi block + // Retrieve the next data block from the cache (or disk on miss) so the high-side reader + // can serve entries in reverse order. #[allow(clippy::single_match_else)] let block = match self.cache.get_block(self.segment_id, handle.offset()) { Some(block) => block, @@ -259,12 +306,16 @@ impl DoubleEndedIterator for Iter { let mut reader = create_data_block_reader(block); - if let Some(key) = &self.range.0 { - reader.seek_lower(key, SeqNo::MAX); - } if let Some(key) = &self.range.1 { + // Reverse iteration needs to clamp the upper bound first so that `next_back` only + // sees entries ≤ the requested high key. reader.seek_upper(key, SeqNo::MAX); } + if let Some(key) = &self.range.0 { + // Apply the lower bound as well so that we never step past the low key when + // iterating backwards through the block. + reader.seek_lower(key, SeqNo::MAX); + } let item = reader.next_back(); @@ -272,6 +323,8 @@ impl DoubleEndedIterator for Iter { self.hi_data_block = Some(reader); if let Some(item) = item { + // Emit the first materialized entry immediately to match the forward path and avoid + // storing it in a temporary buffer. 
return Some(Ok(item)); } } From a08b5dc35e5e9c3759594a848d27f98b1b8bb421 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:03:06 +0200 Subject: [PATCH 585/613] fix: 2 more blob tests and fixes found through model based testing --- src/compaction/flavour.rs | 8 ++-- src/compaction/worker.rs | 17 ++++--- src/version/mod.rs | 44 +++++++++++++++--- tests/model_2.rs | 43 ++++++++++++++++++ tests/model_3.rs | 96 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 190 insertions(+), 18 deletions(-) create mode 100644 tests/model_2.rs create mode 100644 tests/model_3.rs diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index eb8b96b4..2ca07c86 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -197,6 +197,8 @@ impl CompactionFlavour for RelocatingCompaction { blob_entry.uncompressed_len, )?; + self.inner.table_writer.register_blob(indirection); + self.inner .table_writer .write(InternalValue::from_components( @@ -297,10 +299,6 @@ impl StandardCompaction { } } - fn register_blob(&mut self, indirection: BlobIndirection) { - self.table_writer.register_blob(indirection); - } - fn consume_writer(self, opts: &Options, dst_lvl: usize) -> crate::Result> { let segments_base_folder = self.table_writer.base_path.clone(); @@ -334,7 +332,7 @@ impl CompactionFlavour for StandardCompaction { if item.key.value_type.is_indirection() { let mut reader = &item.value[..]; let indirection = BlobIndirection::decode_from(&mut reader)?; - self.register_blob(indirection); + self.table_writer.register_blob(indirection); } self.table_writer.write(item) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 5dd293d8..f595f1a8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -499,17 +499,21 @@ fn drop_segments( .collect::>>() else { log::warn!( - "Compaction task created by {:?} contained segments not referenced in the level manifest", - opts.strategy.get_name(), - ); + "Compaction task created by {:?} contained segments not referenced in the level manifest", + opts.strategy.get_name(), + ); return Ok(()); }; + log::debug!("Dropping tables: {ids_to_drop:?}"); + + let mut dropped_blob_files = vec![]; + // IMPORTANT: Write the manifest with the removed segments first // Otherwise the segment files are deleted, but are still referenced! 
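A sketch of the ordering contract stated in the comment above, with illustrative types: dead blob files are only collected while the new version is built, and are marked deleted strictly after the manifest that stops referencing them has been persisted:

```rust
use std::collections::BTreeMap;

#[derive(Clone)]
struct BlobFileModel {
    live_bytes: u64,
}

impl BlobFileModel {
    fn is_dead(&self) -> bool {
        self.live_bytes == 0
    }
}

/// Analogue of the `retain` loop in `Version::with_dropped`: dead files
/// are collected, not deleted, so the caller controls when deletion
/// actually happens (after the manifest write, via `mark_as_deleted`).
fn drop_dead(value_log: &mut BTreeMap<u64, BlobFileModel>) -> Vec<BlobFileModel> {
    let mut dropped = Vec::new();

    value_log.retain(|_, bf| {
        if bf.is_dead() {
            dropped.push(bf.clone());
            false
        } else {
            true
        }
    });

    dropped
}
```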
compaction_state.upgrade_version( &mut super_version, - |current| current.with_dropped(ids_to_drop), + |current| current.with_dropped(ids_to_drop, &mut dropped_blob_files), opts.eviction_seqno, // TODO: make naming in code base eviction_seqno vs watermark vs threshold consistent )?; @@ -520,8 +524,9 @@ fn drop_segments( segment.mark_as_deleted(); } - // TODO: fwiw also add all dead blob files - // TODO: look if any blob files can be trivially deleted as well + for blob_file in dropped_blob_files { + blob_file.mark_as_deleted(); + } if let Err(e) = compaction_state.maintenance(opts.eviction_seqno) { log::error!("Manifest maintenance failed: {e:?}"); diff --git a/src/version/mod.rs b/src/version/mod.rs index b0a7a41e..d8527892 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -154,7 +154,7 @@ pub struct VersionInner { // LSM-tree // /// Blob files for large values (value log) - pub(crate) value_log: Arc>, + pub value_log: Arc>, /// Blob file fragmentation gc_stats: Arc, @@ -194,6 +194,7 @@ impl Version { } pub fn l0(&self) -> &Level { + #[allow(clippy::expect_used)] self.levels.first().expect("L0 should exist") } @@ -384,15 +385,19 @@ impl Version { } } - /// Returns a new version with a list of segments removed. + /// Returns a new version with a list of tables removed. /// - /// The segment files are not immediately deleted, this is handled by the version system's free list. - pub fn with_dropped(&self, ids: &[SegmentId]) -> crate::Result { + /// The table files are not immediately deleted, this is handled by the version system's free list. + pub fn with_dropped( + &self, + ids: &[SegmentId], + dropped_blob_files: &mut Vec, + ) -> crate::Result { let id = self.id + 1; let mut levels = vec![]; - let mut dropped_segments = vec![]; + let mut dropped_segments: Vec = vec![]; for level in &self.levels { let runs = level @@ -406,7 +411,7 @@ impl Version { .inner_mut() .extract_if(.., |x| ids.contains(&x.metadata.id)); - dropped_segments = removed_segments.collect(); + dropped_segments.extend(removed_segments); run }) @@ -444,7 +449,20 @@ impl Version { } else { // TODO: 3.0.0 this should really be a newtype let mut copy = self.value_log.deref().clone(); - copy.retain(|_, blob_file| !blob_file.is_dead(&gc_stats)); + + // TODO: 3.0.0 1.91 + // copy.extract_if(.., |_, blob_file| blob_file.is_dead(&gc_stats)); + + copy.retain(|_, blob_file| { + if blob_file.is_dead(&gc_stats) { + log::debug!("Dropping blob file: {}", blob_file.id()); + dropped_blob_files.push(blob_file.clone()); + false + } else { + true + } + }); + Arc::new(copy) }; @@ -711,6 +729,18 @@ impl Version { } } + if !self.value_log.is_empty() { + writeln!(f)?; + writeln!( + f, + "BLOB: {:?}", + self.value_log + .values() + .map(BlobFile::id) + .collect::>(), + )?; + } + Ok(()) } } diff --git a/tests/model_2.rs b/tests/model_2.rs new file mode 100644 index 00000000..4ed688f8 --- /dev/null +++ b/tests/model_2.rs @@ -0,0 +1,43 @@ +// Found by model testing + +use lsm_tree::{AbstractTree, KvSeparationOptions, Result}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_2() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(5))) + .open()?; + let compaction = Arc::new(lsm_tree::compaction::Leveled::default()); + + let value = b"hellohello"; + + tree.insert("a", value, 3); + tree.flush_active_memtable(0)?; + tree.compact(compaction.clone(), 0)?; + assert_eq!(1, 
tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + tree.insert("b", value, 4); + tree.flush_active_memtable(0)?; + tree.compact(compaction.clone(), 0)?; + assert_eq!(2, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + tree.insert("a", value, 5); + tree.flush_active_memtable(0)?; + tree.compact(compaction.clone(), 0)?; + assert_eq!(3, tree.segment_count()); + assert_eq!(3, tree.blob_file_count()); + + tree.drop_range::<&[u8], _>(..)?; + + assert_eq!(0, tree.segment_count()); + assert_eq!(0, tree.blob_file_count()); + + Ok(()) +} diff --git a/tests/model_3.rs b/tests/model_3.rs new file mode 100644 index 00000000..a565fa0b --- /dev/null +++ b/tests/model_3.rs @@ -0,0 +1,96 @@ +// Found by model testing + +use lsm_tree::{AbstractTree, KvSeparationOptions, Result}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_3() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(5))) + .open()?; + let compaction = Arc::new(lsm_tree::compaction::Leveled::default()); + + let value = b"hellohello"; + + tree.insert("a", value, 1); + tree.insert("i", value, 1); + tree.flush_active_memtable(0)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 2); + tree.insert("f", value, 2); + tree.flush_active_memtable(0)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 3); + tree.insert("h", value, 3); + tree.flush_active_memtable(0)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 4); + tree.insert("b", value, 4); + tree.flush_active_memtable(0)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + tree.insert("c", value, 5); + tree.insert("g", value, 5); + tree.flush_active_memtable(0)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + tree.insert("b", value, 6); + tree.insert("c", value, 6); + tree.insert("d", value, 6); + tree.insert("e", value, 6); + tree.flush_active_memtable(15)?; + tree.compact(compaction.clone(), 41)?; + eprintln!("=========="); + eprintln!("{:#?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 7); + tree.flush_active_memtable(16)?; + eprintln!("=========="); + eprintln!("{:#?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 8); + tree.flush_active_memtable(17)?; + eprintln!("=========="); + eprintln!("{:#?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 9); + tree.flush_active_memtable(18)?; + eprintln!("=========="); + eprintln!("{:#?}", tree.current_version().gc_stats()); + + tree.insert("a", value, 10); + tree.flush_active_memtable(19)?; + tree.compact(compaction.clone(), 19)?; + eprintln!("=========="); + eprintln!("{:#?}", tree.current_version().gc_stats()); + + tree.drop_range::<&[u8], _>(..)?; + eprintln!("=========="); + eprintln!("{:?}", tree.current_version().gc_stats()); + + eprintln!( + "{:?}", + tree.current_version() + .value_log + .values() + .map(|x| x.id()) + .collect::>(), + ); + + assert_eq!(0, tree.segment_count()); + assert_eq!(0, tree.blob_file_count()); + + Ok(()) +} From 6cfae627c026363be8d7fc5729bf122ca25d4fcc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:04:08 +0200 Subject: [PATCH 586/613] wip --- src/version/mod.rs | 2 +- tests/model_3.rs 
| 31 ------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/src/version/mod.rs b/src/version/mod.rs index d8527892..fdf9c79b 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -154,7 +154,7 @@ pub struct VersionInner { // LSM-tree // /// Blob files for large values (value log) - pub value_log: Arc>, + pub(crate) value_log: Arc>, /// Blob file fragmentation gc_stats: Arc, diff --git a/tests/model_3.rs b/tests/model_3.rs index a565fa0b..bd212b33 100644 --- a/tests/model_3.rs +++ b/tests/model_3.rs @@ -19,32 +19,22 @@ fn model_3() -> Result<()> { tree.insert("a", value, 1); tree.insert("i", value, 1); tree.flush_active_memtable(0)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); tree.insert("a", value, 2); tree.insert("f", value, 2); tree.flush_active_memtable(0)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); tree.insert("a", value, 3); tree.insert("h", value, 3); tree.flush_active_memtable(0)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); tree.insert("a", value, 4); tree.insert("b", value, 4); tree.flush_active_memtable(0)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); tree.insert("c", value, 5); tree.insert("g", value, 5); tree.flush_active_memtable(0)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); tree.insert("b", value, 6); tree.insert("c", value, 6); @@ -52,42 +42,21 @@ fn model_3() -> Result<()> { tree.insert("e", value, 6); tree.flush_active_memtable(15)?; tree.compact(compaction.clone(), 41)?; - eprintln!("=========="); - eprintln!("{:#?}", tree.current_version().gc_stats()); tree.insert("a", value, 7); tree.flush_active_memtable(16)?; - eprintln!("=========="); - eprintln!("{:#?}", tree.current_version().gc_stats()); tree.insert("a", value, 8); tree.flush_active_memtable(17)?; - eprintln!("=========="); - eprintln!("{:#?}", tree.current_version().gc_stats()); tree.insert("a", value, 9); tree.flush_active_memtable(18)?; - eprintln!("=========="); - eprintln!("{:#?}", tree.current_version().gc_stats()); tree.insert("a", value, 10); tree.flush_active_memtable(19)?; tree.compact(compaction.clone(), 19)?; - eprintln!("=========="); - eprintln!("{:#?}", tree.current_version().gc_stats()); tree.drop_range::<&[u8], _>(..)?; - eprintln!("=========="); - eprintln!("{:?}", tree.current_version().gc_stats()); - - eprintln!( - "{:?}", - tree.current_version() - .value_log - .values() - .map(|x| x.id()) - .collect::>(), - ); assert_eq!(0, tree.segment_count()); assert_eq!(0, tree.blob_file_count()); From 30149695f8f13d4a63b58b3500f8916e67ebd7a5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:05:53 +0200 Subject: [PATCH 587/613] make block decoder predicates non-mutable Fn --- src/segment/block/decoder.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index 6d4b9678..f0df5be9 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -166,9 +166,9 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa Item::parse_restart_key(&mut cursor, pos, bytes).expect("should exist") } - fn partition_point(&self, pred: &mut F) -> Option<(/* offset */ usize, /* idx */ usize)> + fn partition_point(&self, pred: F) -> Option<(/* offset */ usize, /* idx */ usize)> where - F: FnMut(&[u8]) -> bool, + F: Fn(&[u8]) -> bool, { // 
The first pass over the binary index emulates `Iterator::partition_point` over the // restart heads that are in natural key order. We keep track of both the byte offset and @@ -223,9 +223,9 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa } // TODO: - fn partition_point_2(&self, pred: &mut F) -> Option<(/* offset */ usize, /* idx */ usize)> + fn partition_point_2(&self, pred: F) -> Option<(/* offset */ usize, /* idx */ usize)> where - F: FnMut(&[u8]) -> bool, + F: Fn(&[u8]) -> bool, { // `partition_point_2` mirrors `partition_point` but keeps the *next* restart entry instead // of the previous one. This variant is used exclusively by reverse scans (`seek_upper`) @@ -278,12 +278,12 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Seeks using the given predicate. /// /// Returns `false` if the key does not possible exist. - pub fn seek(&mut self, mut pred: impl FnMut(&[u8]) -> bool, second_partition: bool) -> bool { + pub fn seek(&mut self, pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool { // TODO: make this nicer, maybe predicate that can affect the resulting index...? let result = if second_partition { - self.partition_point_2(&mut pred) + self.partition_point_2(&pred) } else { - self.partition_point(&mut pred) + self.partition_point(&pred) }; // Binary index lookup @@ -319,15 +319,11 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa /// Seeks the upper bound using the given predicate. /// /// Returns `false` if the key does not possible exist. - pub fn seek_upper( - &mut self, - mut pred: impl FnMut(&[u8]) -> bool, - second_partition: bool, - ) -> bool { + pub fn seek_upper(&mut self, mut pred: impl Fn(&[u8]) -> bool, second_partition: bool) -> bool { let result = if second_partition { - self.partition_point_2(&mut pred) + self.partition_point_2(&pred) } else { - self.partition_point(&mut pred) + self.partition_point(&pred) }; // Binary index lookup From e2c64f9fa57a391ab4cf2ef7e45d1947a23ebcc4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:09:27 +0200 Subject: [PATCH 588/613] guard index block seek early-return behind restart_interval=1 restart_interval > 1 is currently not implemented for index blocks, but if it is, we can not rely on the restart head itself, but also must do the linear scan --- src/segment/block/decoder.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index f0df5be9..e2146b2a 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -228,8 +228,8 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa F: Fn(&[u8]) -> bool, { // `partition_point_2` mirrors `partition_point` but keeps the *next* restart entry instead - // of the previous one. This variant is used exclusively by reverse scans (`seek_upper`) - // that want the first restart whose head key exceeds the predicate. Returning the raw + // of the previous one. This variant is used exclusively by reverse scans (`seek_upper`) + // that want the first restart whose head key exceeds the predicate. Returning the raw // offset preserves the ability to reuse linear scanning infrastructure without duplicating // decoder logic. 
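A small illustration of why the predicate bound is relaxed back to `Fn` in this commit (all names made up): a shared `&pred` can be handed to either partition-point variant and still be called again afterwards, which `FnMut` would make awkward because it demands exclusive borrows:

```rust
fn variant_a<F: Fn(&[u8]) -> bool>(pred: &F, heads: &[&[u8]]) -> usize {
    heads.partition_point(|h| pred(h))
}

fn variant_b<F: Fn(&[u8]) -> bool>(pred: &F, heads: &[&[u8]]) -> usize {
    heads.partition_point(|h| !pred(h))
}

fn seek(heads: &[&[u8]], pred: impl Fn(&[u8]) -> bool, second: bool) -> bool {
    let idx = if second {
        variant_b(&pred, heads)
    } else {
        variant_a(&pred, heads)
    };

    // The same shared predicate is consulted once more, mirroring the
    // `pred(self.get_key_at(offset))` early-exhaust check in `seek`.
    heads.get(idx).is_some_and(|h| pred(h))
}
```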
let binary_index = self.get_binary_index_reader(); @@ -291,10 +291,10 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa return false; }; - if second_partition && pred(self.get_key_at(offset)) { + if second_partition && self.restart_interval == 1 && pred(self.get_key_at(offset)) { // `second_partition == true` means we ran the "look one restart ahead" search used by - // index blocks. When the predicate is still true at the chosen restart head it means - // the caller asked us to seek strictly beyond the last entry. In that case we skip any + // index blocks. When the predicate is still true at the chosen restart head it means + // the caller asked us to seek strictly beyond the last entry. In that case we skip any // costly parsing and flip both scanners into an "exhausted" state so the outer iterator // immediately reports EOF. let end = self.block.data.len(); From b478c9cb372f7a73ba6e141b83b15b88cb75a0eb Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:20:14 +0200 Subject: [PATCH 589/613] move item count in block trailer honestly item count is not needed but whatever --- src/segment/block/decoder.rs | 5 +---- src/segment/block/trailer.rs | 20 ++++++++++++-------- src/segment/data_block/mod.rs | 16 ++++++---------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/segment/block/decoder.rs b/src/segment/block/decoder.rs index be16ae0c..59bc74a5 100644 --- a/src/segment/block/decoder.rs +++ b/src/segment/block/decoder.rs @@ -76,8 +76,7 @@ pub struct Decoder<'a, Item: Decodable, Parsed: ParsedItem> { hi_scanner: HiScanner, // Cached metadata - pub(crate) restart_interval: u8, - + restart_interval: u8, binary_index_step_size: u8, binary_index_offset: u32, binary_index_len: u32, @@ -89,8 +88,6 @@ impl<'a, Item: Decodable, Parsed: ParsedItem> Decoder<'a, Item, Pa let trailer = Trailer::new(block); let mut reader = trailer.as_slice(); - let _item_count = reader.read_u32::().expect("should read"); - let restart_interval = unwrap!(reader.read_u8()); let binary_index_step_size = unwrap!(reader.read_u8()); diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index 4827e0fe..f308ad3a 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -22,7 +22,7 @@ const TRAILER_SIZE: usize = 5 * std::mem::size_of::() /// /// ## Format /// -/// \[item_count\] \[restart_interval\] \[binary_index_offset\] \[binary_index_len\] \[hash_index_offset\] \[hash_index_len\] +/// \[restart_interval\] \[binary_index_offset\] \[binary_index_len\] \[hash_index_offset\] \[hash_index_len\] \[item_count\] #[allow(clippy::doc_markdown)] pub struct Trailer<'a> { block: &'a Block, @@ -41,7 +41,11 @@ impl<'a> Trailer<'a> { /// Returns the number of items in the block #[must_use] pub fn item_count(&self) -> usize { - let mut reader = self.as_slice(); + let reader = self.as_slice(); + + // NOTE: We know that the item count is at the end and is a u32 + #[allow(clippy::indexing_slicing)] + let reader = &mut &reader[(TRAILER_SIZE - std::mem::size_of::())..]; // NOTE: We know the trailer offset is valid, and the trailer has a fixed size // so the next item must be the item count @@ -102,12 +106,6 @@ impl<'a> Trailer<'a> { #[cfg(debug_assertions)] let bytes_before = encoder.writer.len(); - // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either #[allow(clippy::cast_possible_truncation)] - encoder - .writer - .write_u32::(encoder.item_count as u32)?;
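Reconstructed from the hunks above, the assumed trailer layout after this commit is `[restart_interval: u8] [binary_index_step_size: u8] [binary_index_offset: u32] [binary_index_len: u32] [hash_index_offset: u32] [hash_index_len: u32] [fixed_value_size: u32, unused] [item_count: u32]` (widths inferred from the reader and writer hunks). Because the trailer is fixed-size, the count can be sliced straight off the tail; byte order is assumed little-endian in this sketch:

```rust
/// Reads the item count from the last four bytes of a fixed-size
/// trailer slice, like the reworked `Trailer::item_count` does.
fn item_count(trailer: &[u8]) -> u32 {
    let tail = &trailer[trailer.len() - std::mem::size_of::<u32>()..];
    u32::from_le_bytes(tail.try_into().expect("trailer ends in a u32"))
}
```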
encoder.writer.write_u8(encoder.restart_interval)?; encoder.writer.write_u8(binary_index_step_size)?; @@ -141,6 +141,12 @@ impl<'a> Trailer<'a> { // Fixed value size (unused) encoder.writer.write_u32::(0)?; + // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either + #[allow(clippy::cast_possible_truncation)] + encoder + .writer + .write_u32::(encoder.item_count as u32)?; + #[cfg(debug_assertions)] assert_eq!( TRAILER_SIZE, diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index f27e6ee5..e762194c 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -303,8 +303,8 @@ impl DataBlock { let trailer = Trailer::new(&self.inner); - // NOTE: Skip item count (u32) and restart interval (u8) - let offset = size_of::() + size_of::(); + // NOTE: Skip restart interval (u8) + let offset = size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); @@ -332,13 +332,9 @@ impl DataBlock { let trailer = Trailer::new(&self.inner); - // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) + // NOTE: Skip restart interval (u8), binary index step size (u8) // and binary stuff (2x u32) - let offset = size_of::() - + size_of::() - + size_of::() - + size_of::() - + size_of::(); + let offset = size_of::() + size_of::() + size_of::() + size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); @@ -446,8 +442,8 @@ impl DataBlock { let trailer = Trailer::new(&self.inner); - // NOTE: Skip item count (u32), restart interval (u8), binary index step size (u8) - let offset = size_of::() + (2 * size_of::()); + // NOTE: Skip restart interval (u8) and binary index step size (u8) + let offset = 2 * size_of::(); let mut reader = unwrap!(trailer.as_slice().get(offset..)); unwrap!(reader.read_u32::()) From f8d269bfe111b5d1629ad5e87f9c5d96c1675b3d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:21:59 +0200 Subject: [PATCH 590/613] remove old fuzz tests --- fuzz/.gitignore | 2 - fuzz/Cargo.toml | 29 --- fuzz/fuzz_targets/data_block.rs | 260 --------------------------- fuzz/fuzz_targets/partition_point.rs | 21 --- 4 files changed, 312 deletions(-) delete mode 100644 fuzz/.gitignore delete mode 100644 fuzz/Cargo.toml delete mode 100644 fuzz/fuzz_targets/data_block.rs delete mode 100644 fuzz/fuzz_targets/partition_point.rs diff --git a/fuzz/.gitignore b/fuzz/.gitignore deleted file mode 100644 index b400c278..00000000 --- a/fuzz/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -corpus -artifacts diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml deleted file mode 100644 index 10cbfbe4..00000000 --- a/fuzz/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "lsm-tree-fuzz" -version = "0.0.0" -publish = false -edition = "2021" - -[package.metadata] -cargo-fuzz = true - -[dependencies] -arbitrary = { version = "1", features = ["derive"] } -libfuzzer-sys = "0.4" -lsm-tree = { path = ".." 
} -rand_chacha = "0.9" -rand = "0.9" - -[[bin]] -name = "data_block" -path = "fuzz_targets/data_block.rs" -test = false -doc = false -bench = false - -[[bin]] -name = "partition_point" -path = "fuzz_targets/partition_point.rs" -test = false -doc = false -bench = false diff --git a/fuzz/fuzz_targets/data_block.rs b/fuzz/fuzz_targets/data_block.rs deleted file mode 100644 index 1b4e67d5..00000000 --- a/fuzz/fuzz_targets/data_block.rs +++ /dev/null @@ -1,260 +0,0 @@ -#![no_main] -use arbitrary::{Arbitrary, Result, Unstructured}; -use libfuzzer_sys::fuzz_target; -use lsm_tree::{ - segment::{block::BlockOffset, Block, DataBlock}, - InternalValue, SeqNo, ValueType, -}; - -#[derive(Arbitrary, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] -enum FuzzyValueType { - Value, - Tombstone, - // TODO: single delete -} - -impl Into for FuzzyValueType { - fn into(self) -> ValueType { - match self { - Self::Value => ValueType::Value, - Self::Tombstone => ValueType::Tombstone, - } - } -} - -#[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] -struct FuzzyValue(InternalValue); - -impl<'a> Arbitrary<'a> for FuzzyValue { - fn arbitrary(u: &mut Unstructured<'a>) -> Result { - let key = Vec::::arbitrary(u)?; - let value = Vec::::arbitrary(u)?; - let seqno = u64::arbitrary(u)?; - let vtype = FuzzyValueType::arbitrary(u)?; - - let key = if key.is_empty() { vec![0] } else { key }; - - Ok(Self(InternalValue::from_components( - key, - value, - seqno, - vtype.into(), - ))) - } -} - -fn generate_ping_pong_code(seed: u64, len: usize) -> Vec { - use rand::prelude::*; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; - - let mut rng = ChaCha8Rng::seed_from_u64(seed); - (0..len).map(|_| rng.random_range(0..=1)).collect() -} - -fuzz_target!(|data: &[u8]| { - let mut unstructured = Unstructured::new(data); - - let restart_interval = u8::arbitrary(&mut unstructured).unwrap().max(1); - - let seed = u64::arbitrary(&mut unstructured).unwrap(); - - // eprintln!("restart_interval={restart_interval}, hash_ratio={hash_ratio}"); - - let item_count = { - use rand::prelude::*; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; - - let mut rng = ChaCha8Rng::seed_from_u64(seed); - rng.random_range(1..1_000) - }; - - let hash_ratio = { - use rand::prelude::*; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; - - let mut rng = ChaCha8Rng::seed_from_u64(seed); - rng.random_range(0.0..4.0) - }; - - let mut items = (0..item_count) - .map(|_| FuzzyValue::arbitrary(&mut unstructured).unwrap()) - .collect::>(); - - assert!(!items.is_empty()); - - items.sort(); - items.dedup(); - - /* eprintln!("-- items --"); - for item in items.iter().map(|value| &value.0) { - eprintln!( - r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, - item.key.user_key, item.value, item.key.seqno, item.key.value_type, - ); - } */ - - /* if items.len() > 100 { - eprintln!("================== {}. 
", items.len()); - } */ - - let items = items.into_iter().map(|value| value.0).collect::>(); - - for restart_interval in 1..=u8::MAX { - let bytes = DataBlock::encode_items(&items, restart_interval.into(), hash_ratio).unwrap(); - - let data_block = DataBlock::new(Block { - data: bytes.into(), - header: lsm_tree::segment::block::Header { - checksum: lsm_tree::segment::Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - }, - }); - - assert_eq!(data_block.len(), items.len()); - assert!(!data_block.is_empty()); - - if data_block.binary_index_len() > 254 { - assert!(data_block.hash_bucket_count().is_none()); - } else if hash_ratio > 0.0 { - assert!(data_block.hash_bucket_count().unwrap() > 0); - } - - // eprintln!("{items:?}"); - - for needle in &items { - if needle.key.seqno == SeqNo::MAX { - continue; - } - - // eprintln!("needle: {:?}", needle.key); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&needle.key.user_key, Some(needle.key.seqno + 1)), - ); - - assert_eq!( - data_block.point_read(&needle.key.user_key, None).unwrap(), - items - .iter() - .find(|item| item.key.user_key == needle.key.user_key) - .cloned() - .unwrap(), - ); - } - - assert_eq!(items, data_block.iter().collect::>()); - - assert_eq!( - items.iter().rev().cloned().collect::>(), - data_block.iter().rev().collect::>(), - ); - - { - let ping_pongs = generate_ping_pong_code(seed, items.len()); - - let expected_ping_ponged_items = { - let mut iter = items.iter(); - let mut v = vec![]; - - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().cloned().unwrap()); - } else { - v.push(iter.next_back().cloned().unwrap()); - } - } - - v - }; - - let real_ping_ponged_items = { - let mut iter = data_block.iter(); - let mut v = vec![]; - - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().unwrap()); - } else { - v.push(iter.next_back().unwrap()); - } - } - - v - }; - - assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); - } - - { - let ping_pongs = generate_ping_pong_code(seed, items.len()); - - let expected_ping_ponged_items = { - let mut iter = items.iter().rev(); - let mut v = vec![]; - - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().cloned().unwrap()); - } else { - v.push(iter.next_back().cloned().unwrap()); - } - } - - v - }; - - let real_ping_ponged_items = { - let mut iter = data_block.iter().rev(); - let mut v = vec![]; - - for &x in &ping_pongs { - if x == 0 { - v.push(iter.next().unwrap()); - } else { - v.push(iter.next_back().unwrap()); - } - } - - v - }; - - assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); - } - - { - use rand::prelude::*; - use rand::SeedableRng; - use rand_chacha::ChaCha8Rng; - - let mut rng = ChaCha8Rng::seed_from_u64(seed); - let mut lo = rng.random_range(0..items.len()); - let mut hi = rng.random_range(0..items.len()); - - if lo > hi { - std::mem::swap(&mut lo, &mut hi); - } - - let lo_key = &items[lo].key.user_key; - let hi_key = &items[hi].key.user_key; - - let expected_range: Vec<_> = items - .iter() - .filter(|kv| kv.key.user_key >= lo_key && kv.key.user_key <= hi_key) - .cloned() - .collect(); - - assert_eq!( - expected_range, - data_block - .range::<&[u8], _>(&(lo_key.as_ref()..=hi_key.as_ref())) - .collect::>(), - ); - } - } -}); diff --git a/fuzz/fuzz_targets/partition_point.rs b/fuzz/fuzz_targets/partition_point.rs deleted file mode 100644 index 65ed01ee..00000000 --- a/fuzz/fuzz_targets/partition_point.rs +++ /dev/null @@ -1,21 +0,0 @@ -#![no_main] -use 
libfuzzer_sys::{ - arbitrary::{Arbitrary, Unstructured}, - fuzz_target, -}; -use lsm_tree::binary_search::partition_point; - -fuzz_target!(|data: &[u8]| { - let mut unstructured = Unstructured::new(data); - - if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { - items.sort(); - items.dedup(); - - let mut index = ::arbitrary(&mut unstructured).unwrap(); - - let idx = partition_point(&items, |&x| x < index); - let std_pp_idx = items.partition_point(|&x| x < index); - assert_eq!(std_pp_idx, idx); - } -}); From f0f062b468f05fab2465823d72668ca618dc9cff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 17:43:11 +0200 Subject: [PATCH 591/613] test: data block afl fuzz --- UNSAFE.md | 7 +- fuzz/.gitignore | 1 + fuzz/data_block/.gitignore | 2 + fuzz/data_block/Cargo.toml | 12 ++ fuzz/data_block/src/main.rs | 326 ++++++++++++++++++++++++++++++++++++ 5 files changed, 345 insertions(+), 3 deletions(-) create mode 100644 fuzz/.gitignore create mode 100644 fuzz/data_block/.gitignore create mode 100644 fuzz/data_block/Cargo.toml create mode 100644 fuzz/data_block/src/main.rs diff --git a/UNSAFE.md b/UNSAFE.md index 773d3e44..a25a8bae 100644 --- a/UNSAFE.md +++ b/UNSAFE.md @@ -5,9 +5,10 @@ ## Run fuzz testing ```bash -cargo +nightly fuzz run data_block -- -max_len=8000000 -cargo +nightly fuzz run index_block -- -max_len=8000000 -cargo +nightly fuzz run partition_point -- -max_len=1000000 +cd fuzz/data_block +mkdir in +cat /dev/random | head -n 100 > in/input +cargo afl build && cargo afl fuzz -i in -o out target/debug/data_block ``` ## Run mutation testing diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 00000000..fe709903 --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1 @@ +out* diff --git a/fuzz/data_block/.gitignore b/fuzz/data_block/.gitignore new file mode 100644 index 00000000..f4ee534d --- /dev/null +++ b/fuzz/data_block/.gitignore @@ -0,0 +1,2 @@ +in* +out* diff --git a/fuzz/data_block/Cargo.toml b/fuzz/data_block/Cargo.toml new file mode 100644 index 00000000..a1c3b4a4 --- /dev/null +++ b/fuzz/data_block/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "data_block" +version = "0.1.0" +edition = "2024" + +[dependencies] +afl = "*" +arbitrary = { version = "1", features = ["derive"] } +libfuzzer-sys = "0.4" +lsm-tree = { path = "../.." 
} +rand_chacha = "0.9" +rand = "0.9" diff --git a/fuzz/data_block/src/main.rs b/fuzz/data_block/src/main.rs new file mode 100644 index 00000000..577f3a26 --- /dev/null +++ b/fuzz/data_block/src/main.rs @@ -0,0 +1,326 @@ +#[macro_use] +extern crate afl; + +use arbitrary::{Arbitrary, Result, Unstructured}; +use lsm_tree::{ + segment::{ + block::{decoder::ParsedItem, BlockOffset}, + Block, DataBlock, + }, + InternalValue, SeqNo, ValueType, +}; + +#[derive(Arbitrary, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +enum FuzzyValueType { + Value, + Tombstone, + // TODO: single delete +} + +impl Into for FuzzyValueType { + fn into(self) -> ValueType { + match self { + Self::Value => ValueType::Value, + Self::Tombstone => ValueType::Tombstone, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +struct FuzzyValue(InternalValue); + +impl<'a> Arbitrary<'a> for FuzzyValue { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let key = Vec::::arbitrary(u)?; + let value = Vec::::arbitrary(u)?; + let seqno = u64::arbitrary(u)?; + let vtype = FuzzyValueType::arbitrary(u)?; + + let key = if key.is_empty() { vec![0] } else { key }; + + Ok(Self(InternalValue::from_components( + key, + value, + seqno, + vtype.into(), + ))) + } +} + +fn generate_ping_pong_code(seed: u64, len: usize) -> Vec { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.random_range(0..=1)).collect() +} + +fn main() { + fuzz!(|data: &[u8]| { + /* let data = &[ + 117, 3, 0, 42, 117, 147, 87, 255, 253, 43, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 251, 45, + ]; */ + + let mut unstructured = Unstructured::new(data); + + let seed = u64::arbitrary(&mut unstructured).unwrap(); + + let restart_interval = u8::arbitrary(&mut unstructured).unwrap(); + let restart_interval = restart_interval.max(1); + + let item_count = { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.random_range(1..100) + }; + + let hash_ratio = { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.random_range(0.0..8.0) + }; + + // eprintln!("restart_interval={restart_interval}, hash_ratio={hash_ratio}"); + + let mut items = (0..item_count) + .map(|_| FuzzyValue::arbitrary(&mut unstructured).unwrap()) + .collect::>(); + + assert!(!items.is_empty()); + + items.sort(); + items.dedup(); + + /* eprintln!("-- items --"); + for item in items.iter().map(|value| &value.0) { + eprintln!( + r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + item.key.user_key, item.value, item.key.seqno, item.key.value_type, + ); + } + + if items.len() > 100 { + eprintln!("================== {}. 
", items.len()); + } */ + + let items = items.into_iter().map(|value| value.0).collect::>(); + + // for restart_interval in 1..=16 { + let bytes = + DataBlock::encode_into_vec(&items, restart_interval.into(), hash_ratio).unwrap(); + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: lsm_tree::segment::block::Header { + checksum: lsm_tree::segment::Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + block_type: lsm_tree::segment::block::BlockType::Data, + }, + }); + + assert_eq!(data_block.len(), items.len()); + + if data_block.binary_index_len() > 254 { + assert!(data_block.hash_bucket_count().is_none()); + } else if hash_ratio > 0.0 { + assert!(data_block.hash_bucket_count().unwrap() > 0); + } + + /* + eprintln!("{items:?}"); */ + + for needle in &items { + if needle.key.seqno == SeqNo::MAX { + continue; + } + + // eprintln!("needle: {:?}", needle.key); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&needle.key.user_key, needle.key.seqno + 1), + ); + + assert_eq!( + data_block + .point_read(&needle.key.user_key, u64::MAX) + .unwrap(), + items + .iter() + .find(|item| item.key.user_key == needle.key.user_key + && item.key.seqno < u64::MAX) + .cloned() + .unwrap(), + ); + } + + assert_eq!( + items, + data_block + .iter() + .map(|x| x.materialize(data_block.as_slice())) + .collect::>() + ); + + // assert_eq!(items, data_block.scan().collect::>()); + + assert_eq!( + items.iter().rev().cloned().collect::>(), + data_block + .iter() + .map(|x| x.materialize(data_block.as_slice())) + .rev() + .collect::>(), + ); + + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); + + // eprintln!("{ping_pongs:?}"); + + let expected_ping_ponged_items = { + let mut iter = items.iter(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } + + v + }; + + let real_ping_ponged_items = { + let mut iter = data_block + .iter() + .map(|x| x.materialize(data_block.as_slice())); + + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } + } + + v + }; + + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } + + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); + + let expected_ping_ponged_items = { + let mut iter = items.iter().rev(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } + + v + }; + + let real_ping_ponged_items = { + let mut iter = data_block + .iter() + .rev() + .map(|x| x.materialize(data_block.as_slice())); + + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } + } + + v + }; + + assert_eq!(expected_ping_ponged_items, real_ping_ponged_items); + } + + { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut lo = rng.random_range(0..items.len()); + let mut hi = rng.random_range(0..items.len()); + + if lo > hi { + std::mem::swap(&mut lo, &mut hi); + } + + // NOTE: If there is A:1, A:2, B:1 + // And we select lo as A:2 + // Our data block will seek to A:1 (correct) + // But our model won't... 
+            // So seek to the first occurrence of a key
+            loop {
+                if lo == 0 {
+                    break;
+                }
+
+                if items[lo - 1].key.user_key == items[lo].key.user_key {
+                    lo -= 1;
+                } else {
+                    break;
+                }
+            }
+
+            // NOTE: Similar to lo
+            loop {
+                if hi == items.len() - 1 {
+                    break;
+                }
+
+                if items[hi + 1].key.user_key == items[hi].key.user_key {
+                    hi += 1;
+                } else {
+                    break;
+                }
+            }
+
+            let lo_key = &items[lo].key.user_key;
+            let hi_key = &items[hi].key.user_key;
+
+            let expected_range: Vec<_> = items[lo..=hi].iter().cloned().collect();
+
+            let mut iter = data_block.iter();
+            assert!(iter.seek(&lo_key), "should seek");
+            assert!(iter.seek_upper(hi_key), "should seek");
+
+            assert_eq!(
+                expected_range,
+                iter.map(|x| x.materialize(data_block.as_slice()))
+                    .collect::<Vec<_>>(),
+            );
+        }
+        // }
+    });
+}
From eb4b57acefb60dbc56ad98eae11e245697ef5b17 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Tue, 14 Oct 2025 18:01:15 +0200
Subject: [PATCH 592/613] test: prepare index block fuzzing

---
 UNSAFE.md                      |   5 ++
 fuzz/data_block/nuclear.sh     |  30 +++++++++
 fuzz/index_block/.gitignore    |   2 +
 fuzz/index_block/Cargo.toml    |  12 ++++
 fuzz/index_block/src/main.rs   | 117 +++++++++++++++++++++++++++++++++
 src/segment/index_block/mod.rs |   1 -
 6 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 fuzz/data_block/nuclear.sh
 create mode 100644 fuzz/index_block/.gitignore
 create mode 100644 fuzz/index_block/Cargo.toml
 create mode 100644 fuzz/index_block/src/main.rs

diff --git a/UNSAFE.md b/UNSAFE.md
index a25a8bae..79ecabd1 100644
--- a/UNSAFE.md
+++ b/UNSAFE.md
@@ -9,6 +9,11 @@ cd fuzz/data_block
 mkdir in
 cat /dev/random | head -n 100 > in/input
 cargo afl build && cargo afl fuzz -i in -o out target/debug/data_block
+
+cd fuzz/index_block
+mkdir in
+cat /dev/random | head -n 100 > in/input
+cargo afl build && cargo afl fuzz -i in -o out target/debug/index_block
 ```
 
 ## Run mutation testing
diff --git a/fuzz/data_block/nuclear.sh b/fuzz/data_block/nuclear.sh
new file mode 100644
index 00000000..c6d6a53a
--- /dev/null
+++ b/fuzz/data_block/nuclear.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+cargo afl build -r
+cat /dev/random | head -n 1024 > in1/in1
+cat /dev/random | head -n 1024 > in2/in2
+cat /dev/random | head -n 1024 > in3/in3
+cat /dev/random | head -n 1024 > in4/in4
+
+# Set session name
+SESSION_NAME="my_session"
+
+# Start a new tmux session in detached mode
+tmux new-session -d -s $SESSION_NAME -c "w"
+
+# Split the first window vertically
+tmux split-window -h -p 25 -t $SESSION_NAME -c $1
+
+# Focus on the left pane and start helix
+tmux select-pane -t 1
+# tmux send-keys "cargo afl fuzz -i in1 -o out1 target/release/data_block" C-m
+
+# Switch focus to the right pane
+tmux select-pane -t 2
+# tmux send-keys "cargo afl fuzz -i in2 -o out2 target/release/data_block" C-m
+
+# Create a new window for RSB
+# tmux new-window -t $SESSION_NAME -n "2" -c "/devssd/code/rust/rust-storage-bench"
+
+# Attach to the tmux session
+tmux attach -t $SESSION_NAME
diff --git a/fuzz/index_block/.gitignore b/fuzz/index_block/.gitignore
new file mode 100644
index 00000000..f4ee534d
--- /dev/null
+++ b/fuzz/index_block/.gitignore
@@ -0,0 +1,2 @@
+in*
+out*
diff --git a/fuzz/index_block/Cargo.toml b/fuzz/index_block/Cargo.toml
new file mode 100644
index 00000000..6fb9cb4d
--- /dev/null
+++ b/fuzz/index_block/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "index_block"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+afl = "*"
+arbitrary = { version = "1", features = ["derive"] }
+libfuzzer-sys = "0.4"
+lsm-tree = { path = "../.." 
} +rand_chacha = "0.9" +rand = "0.9" diff --git a/fuzz/index_block/src/main.rs b/fuzz/index_block/src/main.rs new file mode 100644 index 00000000..4f6ba9ca --- /dev/null +++ b/fuzz/index_block/src/main.rs @@ -0,0 +1,117 @@ +#[macro_use] +extern crate afl; + +use arbitrary::{Arbitrary, Result, Unstructured}; +use lsm_tree::segment::{block::BlockOffset, Block, IndexBlock, KeyedBlockHandle}; + +#[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] +struct FuzzyValue(KeyedBlockHandle); + +impl<'a> Arbitrary<'a> for FuzzyValue { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let key = Vec::::arbitrary(u)?; + + let key = if key.is_empty() { vec![0] } else { key }; + + Ok(Self(KeyedBlockHandle::new( + key.into(), + BlockOffset(0), + u32::arbitrary(u)?, + ))) + } +} + +fn generate_ping_pong_code(seed: u64, len: usize) -> Vec { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.random_range(0..=1)).collect() +} + +fn main() { + fuzz!(|data: &[u8]| { + let mut unstructured = Unstructured::new(data); + + // eprintln!("restart_interval={restart_interval}"); + + if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { + // let mut items = items.to_vec(); + + if !items.is_empty() { + items.sort(); + items.dedup(); + + /* eprintln!("-- items --"); + for item in items.iter().map(|value| &value.0) { + eprintln!( + r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, + item.key.user_key, item.value, item.key.seqno, item.key.value_type, + ); + } */ + + /* if items.len() > 100 { + eprintln!("================== {}. ", items.len()); + } */ + + let items = items.into_iter().map(|value| value.0).collect::>(); + + // for restart_interval in 1..=u8::MAX { + let bytes = IndexBlock::encode_into_vec( + &items, + // restart_interval.into(), + ) + .unwrap(); + + let index_block = IndexBlock::new(Block { + data: bytes.into(), + header: lsm_tree::segment::block::Header { + checksum: lsm_tree::segment::block::Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + block_type: lsm_tree::segment::block::BlockType::Index, + }, + }); + + assert_eq!(index_block.len(), items.len()); + + /* if data_block.binary_index_len() > 254 { + assert!(data_block.hash_bucket_count().is_none()); + } else if hash_ratio > 0.0 { + assert!(data_block.hash_bucket_count().unwrap() > 0); + } */ + + // eprintln!("{items:?}"); + + /* for handle in &items { + // eprintln!("needle: {:?}", needle.key); + + assert_eq!( + Some(needle.clone()), + data_block.point_read(&handle.end_key).unwrap(), + ); + } */ + + /* assert_eq!( + items, + data_block.iter().map(|x| x.unwrap()).collect::>(), + ); + + assert_eq!( + items.iter().rev().cloned().collect::>(), + data_block + .iter() + .rev() + .map(|x| x.unwrap()) + .collect::>(), + ); */ + + // TODO: add ping-pong iters + + // TODO: add range iter too + // } + } + } + }); +} diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 495a909d..380a05ac 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -84,7 +84,6 @@ impl IndexBlock { )) } - #[cfg(test)] pub fn encode_into_vec(items: &[KeyedBlockHandle]) -> crate::Result> { let mut buf = vec![]; From 3a10d32978b9254b28458a2672dd0481dc6dba28 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 18:30:21 +0200 Subject: [PATCH 593/613] feat: prefixed_range https://github.com/fjall-rs/fjall/pull/143 --- src/lib.rs | 4 ++ src/range.rs | 25 +++++--- 
 src/util.rs  | 158 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 7 deletions(-)
 create mode 100644 src/util.rs

diff --git a/src/lib.rs b/src/lib.rs
index a109373a..788693e3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -216,6 +216,10 @@ pub mod stop_signal;
 mod format_version;
 mod time;
 mod tree;
+
+/// Utility functions
+pub mod util;
+
 mod value;
 mod value_type;
 mod version;
diff --git a/src/range.rs b/src/range.rs
index 8b56d233..adc4a860 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -24,13 +24,11 @@ pub fn seqno_filter(item_seqno: SeqNo, seqno: SeqNo) -> bool {
     item_seqno < seqno
 }
 
-#[must_use]
-#[allow(clippy::module_name_repetitions)]
-pub fn prefix_to_range(prefix: &[u8]) -> (Bound<UserKey>, Bound<UserKey>) {
-    use std::ops::Bound::{Excluded, Included, Unbounded};
+pub(crate) fn prefix_upper_range(prefix: &[u8]) -> Bound<UserKey> {
+    use std::ops::Bound::{Excluded, Unbounded};
 
     if prefix.is_empty() {
-        return (Unbounded, Unbounded);
+        return Unbounded;
     }
 
     let mut end = prefix.to_vec();
@@ -42,11 +40,24 @@ pub fn prefix_to_range(prefix: &[u8]) -> (Bound<UserKey>, Bound<UserKey>) {
         if *byte < 255 {
             *byte += 1;
             end.truncate(idx + 1);
-            return (Included(prefix.into()), Excluded(end.into()));
+            return Excluded(end.into());
         }
     }
 
-    (Included(prefix.into()), Unbounded)
+    Unbounded
+}
+
+/// Converts a prefix to range bounds.
+#[must_use]
+#[allow(clippy::module_name_repetitions)]
+pub fn prefix_to_range(prefix: &[u8]) -> (Bound<UserKey>, Bound<UserKey>) {
+    use std::ops::Bound::{Included, Unbounded};
+
+    if prefix.is_empty() {
+        return (Unbounded, Unbounded);
+    }
+
+    (Included(prefix.into()), prefix_upper_range(prefix))
 }
 
 /// The iter state references the memtables used while the range is open
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 00000000..f22fe2e3
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,158 @@
+use crate::range::prefix_upper_range;
+use crate::UserKey;
+use byteview::ByteView;
+use std::ops::RangeBounds;
+
+pub use crate::range::prefix_to_range;
+
+/// Helper function to create a prefixed range.
+///
+/// Made for phil.
+///
+/// # Panics
+///
+/// Panics if the prefix is empty.
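+///
+/// # Example
+///
+/// A minimal usage sketch, mirroring the unit tests below (the fixed-width
+/// `to_be_bytes` suffix encoding is just an illustration, not a requirement):
+///
+/// ```ignore
+/// // All keys of the form "abc" ++ suffix, with 5 <= suffix <= 9
+/// let range = prefixed_range("abc", 5u8.to_be_bytes()..=9u8.to_be_bytes());
+/// ```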
+pub fn prefixed_range<P: AsRef<[u8]>, K: AsRef<[u8]>, R: RangeBounds<K>>(
+    prefix: P,
+    range: R,
+) -> impl RangeBounds<UserKey> {
+    use std::ops::Bound::{Excluded, Included, Unbounded};
+
+    let prefix = prefix.as_ref();
+
+    assert!(!prefix.is_empty(), "prefix may not be empty");
+
+    match (range.start_bound(), range.end_bound()) {
+        (Unbounded, Unbounded) => prefix_to_range(prefix),
+        (lower, Unbounded) => {
+            let lower = lower.map(|k| UserKey::from(ByteView::fused(prefix, k.as_ref())));
+            let upper = prefix_upper_range(prefix);
+            (lower, upper)
+        }
+        (Unbounded, upper) => {
+            let upper = match upper {
+                Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Unbounded => unreachable!(),
+            };
+
+            (Included(prefix.into()), upper)
+        }
+        (lower, upper) => {
+            let lower = match lower {
+                Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Unbounded => unreachable!(),
+            };
+
+            let upper = match upper {
+                Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))),
+                Unbounded => unreachable!(),
+            };
+
+            (lower, upper)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::prefixed_range;
+    use crate::UserKey;
+    use std::ops::Bound::{Excluded, Included};
+    use std::ops::RangeBounds;
+    use test_log::test;
+
+    #[test]
+    fn prefixed_range_1() {
+        let prefix = "abc";
+        let min = 5u8.to_be_bytes();
+        let max = 9u8.to_be_bytes();
+
+        let range = prefixed_range(prefix, min..=max);
+
+        assert_eq!(
+            range.start_bound(),
+            Included(&UserKey::new(&[b'a', b'b', b'c', 5]))
+        );
+        assert_eq!(
+            range.end_bound(),
+            Included(&UserKey::new(&[b'a', b'b', b'c', 9]))
+        );
+    }
+
+    #[test]
+    fn prefixed_range_2() {
+        let prefix = "abc";
+        let min = 5u8.to_be_bytes();
+        let max = 9u8.to_be_bytes();
+
+        let range = prefixed_range(prefix, min..max);
+
+        assert_eq!(
+            range.start_bound(),
+            Included(&UserKey::new(&[b'a', b'b', b'c', 5]))
+        );
+        assert_eq!(
+            range.end_bound(),
+            Excluded(&UserKey::new(&[b'a', b'b', b'c', 9]))
+        );
+    }
+
+    #[test]
+    fn prefixed_range_3() {
+        let prefix = "abc";
+        let min = 5u8.to_be_bytes();
+
+        let range = prefixed_range(prefix, min..);
+
+        assert_eq!(
+            range.start_bound(),
+            Included(&UserKey::new(&[b'a', b'b', b'c', 5]))
+        );
+        assert_eq!(range.end_bound(), Excluded(&UserKey::new(b"abd")));
+    }
+
+    #[test]
+    fn prefixed_range_4() {
+        let prefix = "abc";
+        let max = 9u8.to_be_bytes();
+
+        let range = prefixed_range(prefix, ..max);
+
+        assert_eq!(range.start_bound(), Included(&UserKey::new(b"abc")));
+        assert_eq!(
+            range.end_bound(),
+            Excluded(&UserKey::new(&[b'a', b'b', b'c', 9]))
+        );
+    }
+
+    #[test]
+    fn prefixed_range_5() {
+        let prefix = "abc";
+        let max = u8::MAX.to_be_bytes();
+
+        let range = prefixed_range(prefix, ..=max);
+
+        assert_eq!(range.start_bound(), Included(&UserKey::new(b"abc")));
+        assert_eq!(
+            range.end_bound(),
+            Included(&UserKey::new(&[b'a', b'b', b'c', u8::MAX]))
+        );
+    }
+
+    #[test]
+    fn prefixed_range_6() {
+        let prefix = "abc";
+        let max = u8::MAX.to_be_bytes();
+
+        let range = prefixed_range(prefix, ..max);
+
+        assert_eq!(range.start_bound(), Included(&UserKey::new(b"abc")));
+        assert_eq!(
+            range.end_bound(),
+            Excluded(&UserKey::new(&[b'a', b'b', b'c', u8::MAX]))
+        );
+    }
+}
From 5f79a7b0063d1d3449516024a4b5dc3f4ee502c9 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Tue, 14 Oct 
2025 18:36:02 +0200 Subject: [PATCH 594/613] fix --- src/util.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/util.rs b/src/util.rs index f22fe2e3..23ea08ad 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,6 +1,5 @@ use crate::range::prefix_upper_range; use crate::UserKey; -use byteview::ByteView; use std::ops::RangeBounds; pub use crate::range::prefix_to_range; @@ -25,14 +24,14 @@ pub fn prefixed_range, K: AsRef<[u8]>, R: RangeBounds>( match (range.start_bound(), range.end_bound()) { (Unbounded, Unbounded) => prefix_to_range(prefix), (lower, Unbounded) => { - let lower = lower.map(|k| UserKey::from(ByteView::fused(prefix, k.as_ref()))); + let lower = lower.map(|k| UserKey::fused(prefix, k.as_ref())); let upper = prefix_upper_range(prefix); (lower, upper) } (Unbounded, upper) => { let upper = match upper { - Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))), - Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))), + Included(k) => Included(UserKey::fused(prefix, k.as_ref())), + Excluded(k) => Excluded(UserKey::fused(prefix, k.as_ref())), Unbounded => unreachable!(), }; @@ -40,14 +39,14 @@ pub fn prefixed_range, K: AsRef<[u8]>, R: RangeBounds>( } (lower, upper) => { let lower = match lower { - Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))), - Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))), + Included(k) => Included(UserKey::fused(prefix, k.as_ref())), + Excluded(k) => Excluded(UserKey::fused(prefix, k.as_ref())), Unbounded => unreachable!(), }; let upper = match upper { - Included(k) => Included(UserKey::from(ByteView::fused(prefix, k.as_ref()))), - Excluded(k) => Excluded(UserKey::from(ByteView::fused(prefix, k.as_ref()))), + Included(k) => Included(UserKey::fused(prefix, k.as_ref())), + Excluded(k) => Excluded(UserKey::fused(prefix, k.as_ref())), Unbounded => unreachable!(), }; From 0d882d4d432d911974252a3aa0d056e068312def Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 18:36:55 +0200 Subject: [PATCH 595/613] remove old microbench (for now) --- microbench/block_bin_index/Cargo.toml | 20 -- microbench/block_bin_index/run.nu | 6 - microbench/block_bin_index/src/main.rs | 169 ---------------- microbench/block_bin_index/template.py | 68 ------- .../block_bin_index/template3d_space.py | 37 ---- .../block_bin_index/template3d_speed.py | 36 ---- microbench/block_hash_index/Cargo.toml | 19 -- microbench/block_hash_index/run.nu | 5 - microbench/block_hash_index/src/main.rs | 143 ------------- microbench/block_hash_index/template.py | 64 ------ microbench/block_load/.gitignore | 1 - microbench/block_load/Cargo.toml | 19 -- microbench/block_load/run.nu | 4 - microbench/block_load/src/main.rs | 75 ------- microbench/block_load/template.py | 68 ------- microbench/bloom_fpr/Cargo.toml | 11 - microbench/bloom_fpr/run.nu | 3 - microbench/bloom_fpr/src/main.rs | 90 --------- microbench/bloom_fpr/template.py | 93 --------- microbench/bloom_speed/Cargo.toml | 12 -- microbench/bloom_speed/run.nu | 4 - microbench/bloom_speed/src/main.rs | 113 ----------- microbench/bloom_speed/template.py | 58 ------ microbench/fractional_cascading/Cargo.toml | 14 -- microbench/fractional_cascading/run.nu | 8 - microbench/fractional_cascading/run.py | 69 ------- microbench/fractional_cascading/src/main.rs | 190 ------------------ microbench/fractional_cascading/template.py | 46 ----- microbench/hash_fns/.gitignore | 1 - microbench/hash_fns/Cargo.toml | 21 -- 
microbench/hash_fns/run.nu | 2 - microbench/hash_fns/src/lib.rs | 150 -------------- microbench/hash_fns/src/main.rs | 102 ---------- microbench/hash_fns/template.py | 60 ------ microbench/run.nu | 18 -- 35 files changed, 1799 deletions(-) delete mode 100644 microbench/block_bin_index/Cargo.toml delete mode 100644 microbench/block_bin_index/run.nu delete mode 100644 microbench/block_bin_index/src/main.rs delete mode 100644 microbench/block_bin_index/template.py delete mode 100644 microbench/block_bin_index/template3d_space.py delete mode 100644 microbench/block_bin_index/template3d_speed.py delete mode 100644 microbench/block_hash_index/Cargo.toml delete mode 100644 microbench/block_hash_index/run.nu delete mode 100644 microbench/block_hash_index/src/main.rs delete mode 100644 microbench/block_hash_index/template.py delete mode 100644 microbench/block_load/.gitignore delete mode 100644 microbench/block_load/Cargo.toml delete mode 100644 microbench/block_load/run.nu delete mode 100644 microbench/block_load/src/main.rs delete mode 100644 microbench/block_load/template.py delete mode 100644 microbench/bloom_fpr/Cargo.toml delete mode 100644 microbench/bloom_fpr/run.nu delete mode 100644 microbench/bloom_fpr/src/main.rs delete mode 100644 microbench/bloom_fpr/template.py delete mode 100644 microbench/bloom_speed/Cargo.toml delete mode 100644 microbench/bloom_speed/run.nu delete mode 100644 microbench/bloom_speed/src/main.rs delete mode 100644 microbench/bloom_speed/template.py delete mode 100644 microbench/fractional_cascading/Cargo.toml delete mode 100644 microbench/fractional_cascading/run.nu delete mode 100644 microbench/fractional_cascading/run.py delete mode 100644 microbench/fractional_cascading/src/main.rs delete mode 100644 microbench/fractional_cascading/template.py delete mode 100644 microbench/hash_fns/.gitignore delete mode 100644 microbench/hash_fns/Cargo.toml delete mode 100644 microbench/hash_fns/run.nu delete mode 100644 microbench/hash_fns/src/lib.rs delete mode 100644 microbench/hash_fns/src/main.rs delete mode 100644 microbench/hash_fns/template.py delete mode 100644 microbench/run.nu diff --git a/microbench/block_bin_index/Cargo.toml b/microbench/block_bin_index/Cargo.toml deleted file mode 100644 index b026ed17..00000000 --- a/microbench/block_bin_index/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "block_bin_index_bench" -version = "1.0.0" -edition = "2021" -publish = false - -[profile.release] -debug = true - -[features] -default = [] -use_unsafe = ["lsm-tree/use_unsafe"] - -[dependencies] -env_logger = "0.11.8" -lsm-tree = { path = "../..", features = ["lz4"] } -lz4_flex = "0.11.3" -rand = "0.9" -serde_json = "1.0.140" -scru128 = "3.1.0" diff --git a/microbench/block_bin_index/run.nu b/microbench/block_bin_index/run.nu deleted file mode 100644 index ada20241..00000000 --- a/microbench/block_bin_index/run.nu +++ /dev/null @@ -1,6 +0,0 @@ -rm -f data.jsonl -cargo run -r | save --append data.jsonl -cargo run -r --features use_unsafe | save --append data.jsonl -python3 template3d_speed.py -python3 template3d_space.py -python3 template.py diff --git a/microbench/block_bin_index/src/main.rs b/microbench/block_bin_index/src/main.rs deleted file mode 100644 index 5807ba07..00000000 --- a/microbench/block_bin_index/src/main.rs +++ /dev/null @@ -1,169 +0,0 @@ -use lsm_tree::{InternalValue, SeqNo}; -use rand::{Rng, RngCore}; -use std::io::Write; - -fn generate_key(primary_key: u64, secondary_key: u64) -> [u8; 16] { - scru128::new().into() -} - -pub fn main() -> 
lsm_tree::Result<()> { - env_logger::Builder::from_default_env().init(); - - let mut rng = rand::rng(); - - #[cfg(feature = "use_unsafe")] - let used_unsafe = true; - - #[cfg(not(feature = "use_unsafe"))] - let used_unsafe = false; - - for item_count in [10, 50, 100, 250, 500, 1_000, 2_000, 4_000] { - let mut items = vec![]; - - { - let mut buf = [0u8; 16]; - - for item in 0u64..item_count { - let key = generate_key(item, 0); - rng.fill_bytes(&mut buf); - - items.push(InternalValue::from_components( - &key, - &buf, - 0, - lsm_tree::ValueType::Value, - )); - } - } - - let intervals: &[u8] = if std::env::var("DEFAULT_RESTART_INTERVAL_ONLY").is_ok() { - &[16] - } else { - &[ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - ] - }; - - for &restart_interval in intervals { - // eprintln!("hash_ratio={hash_ratio}"); - - use lsm_tree::segment::{ - block::{BlockType, Header}, - BlockOffset, Checksum, DataBlock, - }; - - let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", String::from_utf8_lossy(&bytes)); - // eprintln!("encoded into {} bytes", bytes.len()); - - { - use lsm_tree::segment::Block; - use std::time::Instant; - - let block = DataBlock::new(Block { - data: lsm_tree::Slice::new(&bytes), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - block_type: BlockType::Data, - }, - }); - - /* eprintln!( - "hash index conflicts: {:?} / {:?}", - block.hash_bucket_conflict_count(), - block.hash_bucket_count(), - ); - eprintln!( - "hash index free slots: {:?} / {:?}", - block.hash_bucket_free_count(), - block.hash_bucket_count(), - ); */ - - { - const NUM_RUNS: u128 = 10_000_000; - - let start = Instant::now(); - for _ in 0..NUM_RUNS { - let needle = rng.random_range(0..item_count as usize); - let needle = &items[needle].key.user_key; - - let mut iter = block.iter(); - - assert!( - iter.seek(&needle /* TODO: , SeqNo::MAX */), - "did not find key", - ); - // block.point_read(&needle, None); - } - - let rps_ns = { - let ns = start.elapsed().as_nanos(); - ns / NUM_RUNS - }; - - /* eprintln!("one read took {:?}ns",); */ - - println!( - "{}", - serde_json::json!({ - "block_size": bytes.len(), - "restart_interval": restart_interval, - "rps_ns": rps_ns, - "item_count": item_count, - "unsafe": used_unsafe, - }) - .to_string(), - ); - } - - /* { - let start = Instant::now(); - for _ in 0..25_000 { - assert_eq!(items.len(), block.iter().count()); - } - - eprintln!("one iter() took {:?}ns", { - let ns = start.elapsed().as_nanos() as usize; - ns / 25_000 / items.len() - }); - } */ - - /* { - let start = Instant::now(); - for _ in 0..25_000 { - assert_eq!(items.len(), block.iter().rev().count()); - } - - eprintln!("one iter().rev() took {:?}ns", { - let ns = start.elapsed().as_nanos() as usize; - ns / 25_000 / items.len() - }); - } */ - } - - /* { - let mut writer = vec![]; - header.encode_into(&mut writer)?; - writer.write_all(&bytes)?; - - eprintln!("V3 format (uncompressed): {}B", writer.len()); - } - - { - let mut writer = vec![]; - header.encode_into(&mut writer)?; - - let bytes = lz4_flex::compress_prepend_size(&bytes); - writer.write_all(&bytes)?; - - eprintln!("V3 format (LZ4): {}B", writer.len()); - } */ - } - } - - Ok(()) -} diff --git a/microbench/block_bin_index/template.py b/microbench/block_bin_index/template.py deleted file mode 100644 index 533936d0..00000000 --- a/microbench/block_bin_index/template.py +++ /dev/null 
@@ -1,68 +0,0 @@ -import json -from pathlib import Path -import matplotlib.pyplot as plt -from palettable.tableau import PurpleGray_6 - -colors = PurpleGray_6.mpl_colors - -# Path to your data file -data_path = Path("data.jsonl") - -# Read and parse the data -safe_data = [] -unsafe_data = [] - -with data_path.open() as f: - for line in f: - if len(line) == 0: - continue - - entry = json.loads(line) - - if entry.get("item_count") != 1000: - continue - - if entry.get("unsafe") == False: - safe_data.append(entry) - else: - unsafe_data.append(entry) - -# Sort by restart_interval to ensure smooth lines -safe_data.sort(key=lambda x: x["restart_interval"]) -unsafe_data.sort(key=lambda x: x["restart_interval"]) - -# Extract data for plotting -restart_interval_safe = [d["restart_interval"] for d in safe_data] -rps_ns_safe = [d["rps_ns"] for d in safe_data] -block_size = [d["block_size"] for d in safe_data] - -restart_interval_unsafe = [d["restart_interval"] for d in unsafe_data] -rps_ns_unsafe = [d["rps_ns"] for d in unsafe_data] - -# Create figure and first Y-axis -fig, ax1 = plt.subplots(figsize=(6, 4)) - -# Plot rps_ns (left Y-axis) -ax1.plot(restart_interval_safe, rps_ns_safe, label='Read latency (safe)', marker='o', color = colors[0]) -ax1.plot(restart_interval_unsafe, rps_ns_unsafe, label='Read latency (unsafe)', marker='x', color = colors[1]) -ax1.set_xlabel('Restart interval') -ax1.set_ylabel('Point read latency [ns]') -ax1.tick_params(axis='y') - -# Create second Y-axis for block size -ax2 = ax1.twinx() -ax2.plot(restart_interval_safe, block_size, label='Block size', linestyle='--', marker='d', color = colors[2]) -ax2.set_ylabel('Block size [bytes]') -ax2.tick_params(axis='y') - -# Combine legends from both axes -lines1, labels1 = ax1.get_legend_handles_labels() -lines2, labels2 = ax2.get_legend_handles_labels() -ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=2) - -# Grid and title -ax1.grid(color="0.9", linestyle='--', linewidth=1) -# plt.title('Safe vs Unsafe: rps_ns and Block Size vs Hash Ratio') -plt.tight_layout() - -plt.savefig("block_binary_index.svg") diff --git a/microbench/block_bin_index/template3d_space.py b/microbench/block_bin_index/template3d_space.py deleted file mode 100644 index dd18dac0..00000000 --- a/microbench/block_bin_index/template3d_space.py +++ /dev/null @@ -1,37 +0,0 @@ -from pathlib import Path -import json -import matplotlib.pyplot as plt - -# Read JSONL file using Path API -data_file = Path("data.jsonl") -lines = [line for line in data_file.read_text().splitlines() if line.strip()] -data_points = [json.loads(line) for line in lines] -filtered_data = [point for point in data_points if not point.get("unsafe", False)] - -# Extract the axes -x_vals = [point["item_count"] for point in filtered_data] -y_vals = [point["restart_interval"] for point in filtered_data] -z_vals = [point["block_size"] / 1024 for point in filtered_data] - -# Plotting -fig = plt.figure(figsize=(6, 4)) -ax = fig.add_subplot(111, projection='3d') - -trisurf = ax.plot_trisurf(x_vals, y_vals, z_vals, cmap='viridis', edgecolor='none', alpha=0.8) - -cbar = fig.colorbar(trisurf, ax=ax, pad=0.1, shrink=0.8, aspect=15) -cbar.set_label("", labelpad=10) - -ax.set_xlabel("# KV tuples") -ax.set_ylabel("Restart interval") -ax.set_zlabel("Block size [KiB]") - -ax.set_zlim(bottom=0) - -ax.invert_xaxis() -ax.invert_yaxis() - -fig.subplots_adjust(left=-0.3, right=0.99, top=0.99, bottom=0.08) - -# plt.tight_layout() 
-plt.savefig("binary_index_3d_space.svg") diff --git a/microbench/block_bin_index/template3d_speed.py b/microbench/block_bin_index/template3d_speed.py deleted file mode 100644 index 969f675e..00000000 --- a/microbench/block_bin_index/template3d_speed.py +++ /dev/null @@ -1,36 +0,0 @@ -from pathlib import Path -import json -import matplotlib.pyplot as plt - -# Read JSONL file using Path API -data_file = Path("data.jsonl") -lines = [line for line in data_file.read_text().splitlines() if line.strip()] -data_points = [json.loads(line) for line in lines] -filtered_data = [point for point in data_points if not point.get("unsafe", False)] - -# Extract the axes -x_vals = [point["item_count"] for point in filtered_data] -y_vals = [point["restart_interval"] for point in filtered_data] -z_vals = [point["rps_ns"] for point in filtered_data] - -# Plotting -fig = plt.figure(figsize=(6, 4)) -ax = fig.add_subplot(111, projection='3d') - -trisurf = ax.plot_trisurf(x_vals, y_vals, z_vals, cmap='viridis', edgecolor='none', alpha=0.8) - -cbar = fig.colorbar(trisurf, ax=ax, pad=0.1, shrink=0.8, aspect=15) -cbar.set_label("", labelpad=10) - -ax.set_xlabel("# KV tuples") -ax.set_ylabel("Restart interval") -ax.set_zlabel("Read latency [ns]") - -ax.set_zlim(bottom=0) - -ax.invert_xaxis() - -fig.subplots_adjust(left=-0.3, right=0.99, top=0.99, bottom=0.08) - -# plt.tight_layout() -plt.savefig("binary_index_3d_speed.svg") diff --git a/microbench/block_hash_index/Cargo.toml b/microbench/block_hash_index/Cargo.toml deleted file mode 100644 index 20976422..00000000 --- a/microbench/block_hash_index/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "block_hash_index_bench" -version = "1.0.0" -edition = "2021" -publish = false - -[profile.release] -debug = true - -[features] -default = [] -use_unsafe = ["lsm-tree/use_unsafe"] - -[dependencies] -env_logger = "0.11.8" -lsm-tree = { path = "../..", features = ["lz4"] } -lz4_flex = "0.11.3" -rand = "0.9" -serde_json = "1.0.140" diff --git a/microbench/block_hash_index/run.nu b/microbench/block_hash_index/run.nu deleted file mode 100644 index 92a918e1..00000000 --- a/microbench/block_hash_index/run.nu +++ /dev/null @@ -1,5 +0,0 @@ -rm -f data.jsonl -cargo run -r | save data.jsonl --append -cargo run -r --features use_unsafe | save data.jsonl --append -python3 template.py - diff --git a/microbench/block_hash_index/src/main.rs b/microbench/block_hash_index/src/main.rs deleted file mode 100644 index 51c4498e..00000000 --- a/microbench/block_hash_index/src/main.rs +++ /dev/null @@ -1,143 +0,0 @@ -use lsm_tree::segment::DataBlock; -use lsm_tree::{coding::Encode, InternalValue, ValueType}; -use rand::Rng; -use std::io::Write; -use std::time::Instant; - -pub fn main() -> lsm_tree::Result<()> { - env_logger::Builder::from_default_env().init(); - - #[cfg(feature = "use_unsafe")] - let use_unsafe = true; - - #[cfg(not(feature = "use_unsafe"))] - let use_unsafe = false; - - let mut rng = rand::rng(); - - let mut items = vec![]; - let item_count = 500; - - for item in 0u128..item_count { - items.push(InternalValue::from_components( - item.to_be_bytes(), - b"asevrasevfbss4b4n6tuziwernwawrbg", - 0, - lsm_tree::ValueType::Value, - )); - } - - for hash_ratio in [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0] { - // eprintln!("hash_ratio={hash_ratio}"); - - use lsm_tree::segment::{ - block::{BlockType, Header}, - BlockOffset, Checksum, DataBlock, - }; - - let bytes = DataBlock::encode_into_vec(&items, 16, hash_ratio)?; - // eprintln!("{bytes:?}"); - // eprintln!("{}", 
String::from_utf8_lossy(&bytes)); - // eprintln!("encoded into {} bytes", bytes.len()); - - { - use lsm_tree::segment::Block; - - let block = DataBlock::new(Block { - data: lsm_tree::Slice::new(&bytes), - header: Header { - checksum: Checksum::from_raw(0), - data_length: 0, - uncompressed_length: 0, - previous_block_offset: BlockOffset(0), - block_type: BlockType::Data, - }, - }); - - /* eprintln!( - "hash index conflicts: {:?} / {:?}", - block.hash_bucket_conflict_count(), - block.hash_bucket_count(), - ); - eprintln!( - "hash index free slots: {:?} / {:?}", - block.hash_bucket_free_count(), - block.hash_bucket_count(), - ); */ - - { - const NUM_RUNS: u128 = 50_000_000; - - let start = Instant::now(); - for _ in 0..NUM_RUNS { - let needle = rng.random_range(0..item_count).to_be_bytes(); - block.point_read(&needle, u64::MAX).unwrap(); - } - - let rps_ns = { - let ns = start.elapsed().as_nanos(); - ns / NUM_RUNS - }; - - /* eprintln!("one read took {:?}ns",); */ - - println!( - "{}", - serde_json::json!({ - "block_size": bytes.len(), - "hash_ratio": format!("{hash_ratio:.1?}"), - "rps_ns": rps_ns, - "conflicts": block.get_hash_index_reader().map(|x| x.conflict_count()).unwrap_or_default(), - "free": block.get_hash_index_reader().map(|x| x.free_count()).unwrap_or_default(), - "use_unsafe": use_unsafe, - }) - .to_string(), - ); - } - - /* { - let start = Instant::now(); - for _ in 0..25_000 { - assert_eq!(items.len(), block.iter().count()); - } - - eprintln!("one iter() took {:?}ns", { - let ns = start.elapsed().as_nanos() as usize; - ns / 25_000 / items.len() - }); - } */ - - /* { - let start = Instant::now(); - for _ in 0..25_000 { - assert_eq!(items.len(), block.iter().rev().count()); - } - - eprintln!("one iter().rev() took {:?}ns", { - let ns = start.elapsed().as_nanos() as usize; - ns / 25_000 / items.len() - }); - } */ - } - - /* { - let mut writer = vec![]; - header.encode_into(&mut writer)?; - writer.write_all(&bytes)?; - - eprintln!("V3 format (uncompressed): {}B", writer.len()); - } - - { - let mut writer = vec![]; - header.encode_into(&mut writer)?; - - let bytes = lz4_flex::compress_prepend_size(&bytes); - writer.write_all(&bytes)?; - - eprintln!("V3 format (LZ4): {}B", writer.len()); - } */ - } - - Ok(()) -} diff --git a/microbench/block_hash_index/template.py b/microbench/block_hash_index/template.py deleted file mode 100644 index ae8796b5..00000000 --- a/microbench/block_hash_index/template.py +++ /dev/null @@ -1,64 +0,0 @@ -import json -from pathlib import Path -import matplotlib.pyplot as plt -from palettable.tableau import PurpleGray_6 - -colors = PurpleGray_6.mpl_colors - -# Path to your data file -data_path = Path("data.jsonl") - -# Read and parse the data -safe_data = [] -unsafe_data = [] - -with data_path.open() as f: - for line in f: - if len(line) == 0: - continue - - entry = json.loads(line) - if entry.get("use_unsafe") == False: - safe_data.append(entry) - else: - unsafe_data.append(entry) - -# Sort by hash_ratio to ensure smooth lines -safe_data.sort(key=lambda x: x["hash_ratio"]) -unsafe_data.sort(key=lambda x: x["hash_ratio"]) - -# Extract data for plotting -hash_ratio_safe = [d["hash_ratio"] for d in safe_data] -rps_ns_safe = [d["rps_ns"] for d in safe_data] -block_size = [d["block_size"] for d in safe_data] - -hash_ratio_unsafe = [d["hash_ratio"] for d in unsafe_data] -rps_ns_unsafe = [d["rps_ns"] for d in unsafe_data] - -# Create figure and first Y-axis -fig, ax1 = plt.subplots(figsize=(6, 4)) - -# Plot rps_ns (left Y-axis) -ax1.plot(hash_ratio_safe, 
rps_ns_safe, label='Read latency (safe)', marker='o', color = colors[0]) -ax1.plot(hash_ratio_unsafe, rps_ns_unsafe, label='Read latency (unsafe)', marker='x', color = colors[1]) -ax1.set_xlabel('Hash ratio [bytes per KV]') -ax1.set_ylabel('Point read latency [ns]') -ax1.tick_params(axis='y') - -# Create second Y-axis for block size -ax2 = ax1.twinx() -ax2.plot(hash_ratio_safe, block_size, label='Block size', linestyle='--', marker='d', color = colors[2]) -ax2.set_ylabel('Block size [bytes]') -ax2.tick_params(axis='y') - -# Combine legends from both axes -lines1, labels1 = ax1.get_legend_handles_labels() -lines2, labels2 = ax2.get_legend_handles_labels() -ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=2) - -# Grid and title -ax1.grid(color="0.9", linestyle='--', linewidth=1) -# plt.title('Safe vs Unsafe: rps_ns and Block Size vs Hash Ratio') -plt.tight_layout() - -plt.savefig("block_hash_index.svg") diff --git a/microbench/block_load/.gitignore b/microbench/block_load/.gitignore deleted file mode 100644 index 48cf2ad1..00000000 --- a/microbench/block_load/.gitignore +++ /dev/null @@ -1 +0,0 @@ -block diff --git a/microbench/block_load/Cargo.toml b/microbench/block_load/Cargo.toml deleted file mode 100644 index e16002a0..00000000 --- a/microbench/block_load/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "block_load_bench" -version = "1.0.0" -edition = "2021" -publish = false - -[profile.release] -debug = true - -[features] -default = [] -use_unsafe = ["lsm-tree/use_unsafe"] - -[dependencies] -env_logger = "0.11.8" -lsm-tree = { path = "../..", features = ["lz4"] } -lz4_flex = "0.11.3" -rand = "0.9" -serde_json = "1.0.140" diff --git a/microbench/block_load/run.nu b/microbench/block_load/run.nu deleted file mode 100644 index 11240364..00000000 --- a/microbench/block_load/run.nu +++ /dev/null @@ -1,4 +0,0 @@ -rm -f data.jsonl -cargo run -r --features use_unsafe | save data.jsonl --append -cargo run -r --no-default-features | save data.jsonl --append -python3 template.py diff --git a/microbench/block_load/src/main.rs b/microbench/block_load/src/main.rs deleted file mode 100644 index 04e7265c..00000000 --- a/microbench/block_load/src/main.rs +++ /dev/null @@ -1,75 +0,0 @@ -use lsm_tree::{ - segment::{ - block::{Block, BlockType, Header as BlockHeader}, - BlockHandle, BlockOffset, DataBlock, - }, - CompressionType, InternalValue, -}; -use std::time::Instant; - -pub fn main() -> lsm_tree::Result<()> { - env_logger::Builder::from_default_env().init(); - - #[cfg(feature = "use_unsafe")] - let used_unsafe = true; - - #[cfg(not(feature = "use_unsafe"))] - let used_unsafe = false; - - for item_count in [100, 200, 400, 1_000, 2_000] { - let mut items = vec![]; - - for item in 0u64..item_count { - items.push(InternalValue::from_components( - item.to_be_bytes(), - b"1asdabawerbwqerbqwr", - 0, - lsm_tree::ValueType::Value, - )); - } - - let mut file = std::fs::File::create("block")?; - - let bytes = DataBlock::encode_into_vec(&items, 16, 1.33)?; - let header = Block::write_into(&mut file, &bytes, BlockType::Data, CompressionType::None)?; - let bytes_written = BlockHeader::serialized_len() as u32 + header.data_length; - - file.sync_all()?; - drop(file); - - { - let file = std::fs::File::open("block")?; - - { - const NUM_RUNS: u128 = 10_000_000; - - let start = Instant::now(); - for _ in 0..NUM_RUNS { - let _block = lsm_tree::segment::Block::from_file( - &file, - BlockHandle::new(BlockOffset(0), bytes_written as 
u32), - BlockType::Data, - CompressionType::None, - )?; - } - - let rps_ns = { - let ns = start.elapsed().as_nanos(); - ns / NUM_RUNS - }; - - println!( - "{}", - serde_json::json!({ - "block_size": bytes.len(), - "rps_ns": rps_ns, - "unsafe": used_unsafe, - }) - .to_string(), - ); - } - } - } - - Ok(()) -} diff --git a/microbench/block_load/template.py b/microbench/block_load/template.py deleted file mode 100644 index a0ffb812..00000000 --- a/microbench/block_load/template.py +++ /dev/null @@ -1,68 +0,0 @@ -import matplotlib.pyplot as plt -import json -from palettable.tableau import PurpleGray_6 -from pathlib import Path - -colors = PurpleGray_6.mpl_colors - -data = Path('data.jsonl').read_text() - -# Parse the data -data_list = [json.loads(line) for line in data.strip().split('\n')] - -# Separate data based on the 'unsafe' field -safe_data = [item for item in data_list if not item["unsafe"]] -unsafe_data = [item for item in data_list if item["unsafe"]] - -# Extract x and y values for each category -safe_block_sizes = [item["block_size"] for item in safe_data] -safe_latencies = [item["rps_ns"] for item in safe_data] - -unsafe_block_sizes = [item["block_size"] for item in unsafe_data] -unsafe_latencies = [item["rps_ns"] for item in unsafe_data] - -plt.rcParams.update({ - 'axes.labelsize': 8, - 'font.size': 8, - 'legend.fontsize': 10, - 'xtick.labelsize': 10, - 'ytick.labelsize': 10, - 'text.usetex': False, - 'figure.figsize': [4.5, 4.5] -}) - -# Create the plot -plt.figure(figsize=(6, 4)) - -# Plot the data for 'unsafe' = False -plt.plot( - safe_block_sizes, - safe_latencies, - marker="o", - linestyle="-", - label="safe", - color=colors[0], -) - -# Plot the data for 'unsafe' = True -plt.plot( - unsafe_block_sizes, - unsafe_latencies, - marker="s", - linestyle="--", - label="unsafe", - color=colors[1], -) - -# Add labels and title -plt.xscale("log") -plt.yscale("log") -# plt.ylim(bottom=0) -plt.xlabel("Block size [bytes]") -plt.ylabel("Read latency [ns/op]") -plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.05), shadow=True, ncol=2) -plt.grid(color="0.9", linestyle='--', linewidth=1) -plt.tight_layout() - -# Show the plot -plt.savefig("block_load.svg") diff --git a/microbench/bloom_fpr/Cargo.toml b/microbench/bloom_fpr/Cargo.toml deleted file mode 100644 index a5a99caa..00000000 --- a/microbench/bloom_fpr/Cargo.toml +++ /dev/null @@ -1,11 +0,0 @@ -[package] -name = "bloom_fpr" -version = "0.1.0" -edition = "2024" - -[features] -default = [] - -[dependencies] -lsm-tree = { path = "../..", features = ["lz4"] } -rand = "0.9.0" diff --git a/microbench/bloom_fpr/run.nu b/microbench/bloom_fpr/run.nu deleted file mode 100644 index 7ee48ce9..00000000 --- a/microbench/bloom_fpr/run.nu +++ /dev/null @@ -1,3 +0,0 @@ -rm -f data.jsonl -cargo run -r | save data.jsonl --append -python3 template.py diff --git a/microbench/bloom_fpr/src/main.rs b/microbench/bloom_fpr/src/main.rs deleted file mode 100644 index 97b7cd42..00000000 --- a/microbench/bloom_fpr/src/main.rs +++ /dev/null @@ -1,90 +0,0 @@ -use rand::RngCore; -use std::time::Instant; - -const NUM_READS: usize = 100_000_000; - -pub fn main() { - let mut rng = rand::rng(); - - let keys = (0..100_000_000u64) - .map(|x| x.to_be_bytes()) - .collect::>(); - - for fpr in [0.25, 0.1, 0.01, 0.001, 0.000_1, 0.000_01, 0.000_001] { - let n = keys.len(); - - { - use lsm_tree::segment::filter::standard_bloom::Builder; - use lsm_tree::segment::filter::standard_bloom::StandardBloomFilterReader as Reader; - - let mut filter = 
Builder::with_fp_rate(n, fpr); - - for key in &keys { - filter.set_with_hash(Builder::get_hash(key)); - } - - let filter_bytes = filter.build(); - let filter = Reader::new(&filter_bytes).unwrap(); - - eprintln!("-- standard n={n} e={fpr} --"); - - { - let mut hits = 0; - - for _ in 0..NUM_READS { - let mut key = [0; 16]; - rng.fill_bytes(&mut key); - let hash = Builder::get_hash(&key); - - if filter.contains_hash(hash) { - hits += 1; - } - } - - let real_fpr = hits as f64 / NUM_READS as f64; - - let filter_size_bytes = filter_bytes.len(); - println!( - r#"{{"real_fpr":{real_fpr},"key_count":{n},"target_fpr":{fpr},"impl":"standard","false_hits":{hits},"bytes":{filter_size_bytes}}}"# - ); - } - } - - { - use lsm_tree::segment::filter::blocked_bloom::Builder; - use lsm_tree::segment::filter::blocked_bloom::BlockedBloomFilterReader as Reader; - - let mut filter = Builder::with_fp_rate(n, fpr); - - for key in &keys { - filter.set_with_hash(Builder::get_hash(key)); - } - - let filter_bytes = filter.build(); - let filter = Reader::new(&filter_bytes).unwrap(); - - eprintln!("-- blocked n={n} e={fpr} --"); - - { - let mut hits = 0; - - for _ in 0..NUM_READS { - let mut key = [0; 16]; - rng.fill_bytes(&mut key); - let hash = Builder::get_hash(&key); - - if filter.contains_hash(hash) { - hits += 1; - } - } - - let real_fpr = hits as f64 / NUM_READS as f64; - - let filter_size_bytes = filter_bytes.len(); - println!( - r#"{{"real_fpr":{real_fpr},"key_count":{n},"target_fpr":{fpr},"impl":"blocked","false_hits":{hits},"bytes":{filter_size_bytes}}}"# - ); - } - } - } -} diff --git a/microbench/bloom_fpr/template.py b/microbench/bloom_fpr/template.py deleted file mode 100644 index 11e84d70..00000000 --- a/microbench/bloom_fpr/template.py +++ /dev/null @@ -1,93 +0,0 @@ -import json -import matplotlib.pyplot as plt -from collections import defaultdict -from pathlib import Path -from palettable.tableau import PurpleGray_6 - -colors = PurpleGray_6.mpl_colors - -jsonl_path = Path('data.jsonl') - -fpr_data = defaultdict(list) -size_data = defaultdict(list) - -for line in jsonl_path.read_text().splitlines(): - obj = json.loads(line) - impl = obj['impl'] - fpr_data[impl].append((obj['target_fpr'], obj['real_fpr'])) - size_data[impl].append((obj['target_fpr'], obj['bytes'])) - -plt.rcParams.update({ - 'axes.labelsize': 8, - 'font.size': 8, - 'legend.fontsize': 10, - 'xtick.labelsize': 10, - 'ytick.labelsize': 10, - 'text.usetex': False, - 'figure.figsize': [4.5, 4.5] -}) - -fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) - -# --- Plot 1: Real FPR vs Target FPR --- -i = 0 - -for impl, values in fpr_data.items(): - values.sort() - x_vals = [x for x, y in values] - y_vals = [y for x, y in values] - marker = "v" if impl == "blocked" else "o" - label = impl - ax1.plot(x_vals, y_vals, marker=marker, label=label, color=colors[i], linestyle="-") - i += 1 - -# --- Plot 2: Filter Size vs Target FPR --- -i = 0 -for impl, values in size_data.items(): - values.sort() - x_vals = [x for x, y in values] - y_vals = [y / 1_024 / 1_024 for x, y in values] - marker = "v" if impl == "blocked" else "o" - ax2.plot(x_vals, y_vals, marker=marker, label=impl, color=colors[i], linestyle="-") - i += 1 - -# --- Secondary Y-axis: Size difference --- -ax2b = ax2.twinx() - -# Compute difference (impls[1] - impls[0]) assuming same target_fpr -impl1_vals = sorted(size_data["standard"]) -impl2_vals = sorted(size_data["blocked"]) - -# Make sure lengths and x-values match -percent_diff_x = [] -percent_diff_y = [] -for (x1, y1), (x2, y2) in 
zip(impl1_vals, impl2_vals):
-    percent_diff_x.append(x1)
-    percent_diff_y.append(100.0 * (y2 - y1) / y1)
-
-ax2b.plot(percent_diff_x, percent_diff_y, color='#a0a0a0', linestyle='dotted', marker='x', label="Diff")
-ax2b.set_ylabel("Size difference [%]")
-ax2b.invert_yaxis()
-ax2b.set_ylim(top=0, bottom=33)
-ax2b.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{int(x)}"))
-
-ax1.set_title("A", loc='left')
-ax1.set_xscale("log")
-ax1.set_yscale("log")
-ax1.set_xlabel("Target false positive rate")
-ax1.set_ylabel("Real false positive rate")
-ax1.grid(color="0.9", linestyle='--', linewidth=1)
-ax1.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2)
-
-ax2.set_title("B", loc='left')
-ax2.set_xscale("log")
-ax2.set_ylim(bottom=0)
-ax2.set_xlabel("Target false positive rate")
-ax2.set_ylabel("Filter size [MiB]")
-ax2.grid(color="0.9", linestyle='--', linewidth=1)
-lines1, labels1 = ax2.get_legend_handles_labels()
-lines2, labels2 = ax2b.get_legend_handles_labels()
-ax2b.legend(lines1 + lines2, labels1 + labels2, loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2)
-
-plt.tight_layout()
-plt.savefig("bloom_fpr.svg")
diff --git a/microbench/bloom_speed/Cargo.toml b/microbench/bloom_speed/Cargo.toml
deleted file mode 100644
index ba88f204..00000000
--- a/microbench/bloom_speed/Cargo.toml
+++ /dev/null
@@ -1,12 +0,0 @@
-[package]
-name = "bloom_speed"
-version = "0.1.0"
-edition = "2024"
-
-[features]
-default = []
-use_unsafe = ["lsm-tree/bloom_use_unsafe"]
-
-[dependencies]
-lsm-tree = { path = "../..", features = ["lz4"] }
-rand = "0.9.0"
diff --git a/microbench/bloom_speed/run.nu b/microbench/bloom_speed/run.nu
deleted file mode 100644
index 36f75fca..00000000
--- a/microbench/bloom_speed/run.nu
+++ /dev/null
@@ -1,4 +0,0 @@
-rm -f data.jsonl
-cargo run -r | save data.jsonl --append
-cargo run -r --features use_unsafe | save data.jsonl --append
-python3 template.py
diff --git a/microbench/bloom_speed/src/main.rs b/microbench/bloom_speed/src/main.rs
deleted file mode 100644
index 55de258e..00000000
--- a/microbench/bloom_speed/src/main.rs
+++ /dev/null
@@ -1,113 +0,0 @@
-use rand::{Rng, RngCore};
-use std::time::Instant;
-
-const NUM_READS: usize = 200_000_000;
-
-pub fn main() {
-    let mut rng = rand::rng();
-
-    let keys = (0..100_000_000u128)
-        .map(|x| x.to_be_bytes())
-        .collect::<Vec<_>>();
-
-    for fpr in [0.25, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001] {
-        let n = keys.len();
-
-        {
-            use lsm_tree::segment::filter::standard_bloom::Builder;
-            use lsm_tree::segment::filter::standard_bloom::StandardBloomFilterReader as Reader;
-
-            let mut filter = Builder::with_fp_rate(n, fpr);
-
-            for key in &keys {
-                filter.set_with_hash(Builder::get_hash(key));
-            }
-
-            let filter_bytes = filter.build();
-            let filter = Reader::new(&filter_bytes).unwrap();
-
-            eprintln!("-- standard n={n} e={fpr} --");
-
-            {
-                let start = Instant::now();
-
-                for _ in 0..NUM_READS {
-                    use rand::seq::IndexedRandom;
-
-                    // let sample = keys.choose(&mut rng).unwrap();
-
-                    let mut sample = [0; 8];
-                    rng.fill(&mut sample);
-
-                    let hash = Builder::get_hash(&sample);
-                    filter.contains_hash(hash);
-                    // assert!(filter.contains_hash(hash));
-                }
-
-                let ns = start.elapsed().as_nanos();
-                let per_read = ns / NUM_READS as u128;
-                eprintln!("  true positive in {per_read}ns");
-
-                #[cfg(feature = "use_unsafe")]
-                let use_unsafe = true;
-
-                #[cfg(not(feature = "use_unsafe"))]
-                let use_unsafe = false;
-
-                let filter_size_bytes = filter_bytes.len();
-                println!(
-                    
r#"{{"key_count":{n},"fpr":{fpr},"impl":"standard","ns":{per_read},"bytes":{filter_size_bytes},"unsafe":{use_unsafe}}}"# - ); - } - } - - { - use lsm_tree::segment::filter::blocked_bloom::BlockedBloomFilterReader as Reader; - use lsm_tree::segment::filter::blocked_bloom::Builder; - - let mut filter = Builder::with_fp_rate(n, fpr); - - for key in &keys { - filter.set_with_hash(Builder::get_hash(key)); - } - - let filter_bytes = filter.build(); - let filter = Reader::new(&filter_bytes).unwrap(); - - eprintln!("-- blocked n={n} e={fpr} --"); - - { - let start = Instant::now(); - - for _ in 0..NUM_READS { - use rand::seq::IndexedRandom; - - // let sample = keys.choose(&mut rng).unwrap(); - - let mut sample = [0; 8]; - rng.fill(&mut sample); - - let hash = Builder::get_hash(&sample); - filter.contains_hash(hash); - - // assert!(filter.contains_hash(hash)); - } - - let ns = start.elapsed().as_nanos(); - let per_read = ns / NUM_READS as u128; - eprintln!(" true positive in {per_read}ns"); - - #[cfg(feature = "use_unsafe")] - let use_unsafe = true; - - #[cfg(not(feature = "use_unsafe"))] - let use_unsafe = false; - - let filter_size_bytes = filter_bytes.len(); - println!( - r#"{{"key_count":{n},"fpr":{fpr},"impl":"blocked","ns":{per_read},"bytes":{filter_size_bytes},"unsafe":{use_unsafe}}}"# - ); - } - } - } -} diff --git a/microbench/bloom_speed/template.py b/microbench/bloom_speed/template.py deleted file mode 100644 index 83b995f8..00000000 --- a/microbench/bloom_speed/template.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -import matplotlib.pyplot as plt -from collections import defaultdict -from pathlib import Path -from palettable.tableau import PurpleGray_6 - -colors = PurpleGray_6.mpl_colors - -# Path to the JSONL file -jsonl_path = Path('data.jsonl') - -# Data structure: {(impl, unsafe): [(fpr, ns), ...]} -data = defaultdict(list) - -# Read the JSONL file -for line in jsonl_path.read_text().splitlines(): - obj = json.loads(line) - key = (obj['impl'], obj['unsafe']) - data[key].append((obj['fpr'], obj['ns'])) - -plt.rcParams.update({ - 'axes.labelsize': 8, - 'font.size': 8, - 'legend.fontsize': 10, - 'xtick.labelsize': 10, - 'ytick.labelsize': 10, - 'text.usetex': False, - 'figure.figsize': [4.5, 4.5] -}) - -# Plotting -plt.figure(figsize=(6, 4)) - -i = 0 - -for (impl, unsafe), values in data.items(): - # Sort by FPR for consistent line plots - values.sort() - fprs = [fpr for fpr, ns in values] - ns_vals = [ns for fpr, ns in values] - safe_label = "unsafe" if unsafe else "safe" - label = f"{impl}, {safe_label}" - stroke = "-." 
if unsafe else "-"
-    marker = "v" if impl == "blocked" else "o"
-    plt.plot(fprs, ns_vals, marker=marker, label=label, color=colors[i], linestyle=stroke)
-    i += 1
-
-plt.xscale("log")
-plt.ylim(bottom=0)
-plt.xlabel("False positive rate")
-plt.ylabel("Latency [ns]")
-# plt.title("Read Performance vs False Positive Rate")
-plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=2)
-plt.grid(color="0.9", linestyle='--', linewidth=1)
-plt.tight_layout()
-# plt.show()
-plt.savefig("bloom_speed.svg")
-
diff --git a/microbench/fractional_cascading/Cargo.toml b/microbench/fractional_cascading/Cargo.toml
deleted file mode 100644
index 67da0adc..00000000
--- a/microbench/fractional_cascading/Cargo.toml
+++ /dev/null
@@ -1,14 +0,0 @@
-[package]
-name = "fractional_bench"
-version = "0.1.0"
-edition = "2024"
-
-[features]
-default = ["fast_partition_point"]
-cascading = []
-fast_partition_point = []
-use_unsafe = ["lsm-tree/use_unsafe"]
-
-[dependencies]
-lsm-tree = { path = "../.." }
-rand = "0.9.1"
diff --git a/microbench/fractional_cascading/run.nu b/microbench/fractional_cascading/run.nu
deleted file mode 100644
index f57c5e71..00000000
--- a/microbench/fractional_cascading/run.nu
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/nu
-
-rm -f data.jsonl
-cargo run -r | save data.jsonl --append
-cargo run -r --features use_unsafe | save data.jsonl --append
-cargo run -r --features cascading | save data.jsonl --append
-cargo run -r --features cascading,use_unsafe | save data.jsonl --append
-python3 template.py
diff --git a/microbench/fractional_cascading/run.py b/microbench/fractional_cascading/run.py
deleted file mode 100644
index 97b25ad7..00000000
--- a/microbench/fractional_cascading/run.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import itertools
-import subprocess
-
-features_list = [
-    "cascading",
-    # "fast_partition_point",
-    "use_unsafe"
-]
-
-def run_with_features(features):
-    """
-    Constructs the cargo command and runs it with the specified features.
-    Prints the command being run and its output.
-    """
-    if not features:
-        # Handle the case of no features (though technically a combination of size 0)
-        # If you want to run with no features, you might adjust this
-        features_arg = ""
-        command = ["cargo", "run", "-r"]
-        # print("--- Running command: cargo run -r (no features) ---")
-    else:
-        features_arg = ",".join(features)
-        command = ["cargo", "run", "-r", "--features", features_arg]
-        # print(f"--- Running command: {' '.join(command)} ---")
-
-    try:
-        # Run the command and capture output
-        result = subprocess.run(
-            command,
-            capture_output=True,
-            text=True,  # Capture output as text
-            check=True  # Raise an exception if the command fails
-        )
-        # print("--- Output ---")
-        print(result.stdout.strip())
-        # if result.stderr:
-        #     print("--- Stderr ---")
-        #     print(result.stderr)
-
-    except subprocess.CalledProcessError as e:
-        print(f"--- Command failed with error: {e} ---")
-        print(f"--- Stderr ---")
-        print(e.stderr)
-    except FileNotFoundError:
-        print("--- Error: 'cargo' command not found. Is Cargo installed and in your PATH? ---")
-    except Exception as e:
-        print(f"--- An unexpected error occurred: {e} ---")
-
-if __name__ == "__main__":
-    # Generate combinations of different lengths (from 1 to the total number of features)
-    all_combinations = []
-
-    for i in range(1, len(features_list) + 1):
-        combinations_of_length_i = itertools.combinations(features_list, i)
-        all_combinations.extend(list(combinations_of_length_i))
-
-    all_combinations.append(tuple())
-    all_combinations.sort(key=len)
-
-    # Include the case with no features (an empty combination) if desired
-    # all_combinations.append(tuple())  # Uncomment this line to include running with no features
-
-    # Loop over each combination
-    for combination in all_combinations:
-        # Run the cargo command with the current combination's features
-        # Convert the tuple to a list for the join operation
-        run_with_features(list(combination))
-        # print("\n" + "="*50 + "\n")  # Separator for clarity
-
diff --git a/microbench/fractional_cascading/src/main.rs b/microbench/fractional_cascading/src/main.rs
deleted file mode 100644
index 3b00fc3b..00000000
--- a/microbench/fractional_cascading/src/main.rs
+++ /dev/null
@@ -1,190 +0,0 @@
-use lsm_tree::{KeyRange, UserKey};
-use rand::Rng;
-use std::{sync::Arc, time::Instant};
-
-#[cfg(feature = "fast_partition_point")]
-pub fn partition_point<T, F>(slice: &[T], pred: F) -> usize
-where
-    F: Fn(&T) -> bool,
-{
-    let mut left = 0;
-    let mut right = slice.len();
-
-    if right == 0 {
-        return 0;
-    }
-
-    while left < right {
-        let mid = (left + right) / 2;
-
-        // SAFETY: See https://github.com/rust-lang/rust/blob/ebf0cf75d368c035f4c7e7246d203bd469ee4a51/library/core/src/slice/mod.rs#L2834-L2836
-        #[warn(unsafe_code)]
-        #[cfg(feature = "use_unsafe")]
-        let item = unsafe { slice.get_unchecked(mid) };
-
-        #[cfg(not(feature = "use_unsafe"))]
-        let item = slice.get(mid).unwrap();
-
-        if pred(item) {
-            left = mid + 1;
-        } else {
-            right = mid;
-        }
-    }
-
-    left
-}
-
-pub fn get_segment_containing_key(segments: &[Arc<Segment>], key: &[u8]) -> Option<Arc<Segment>> {
-    #[cfg(feature = "fast_partition_point")]
-    let idx = partition_point(segments, |segment| segment.key_range.max() < &key);
-
-    #[cfg(not(feature = "fast_partition_point"))]
-    let idx = segments.partition_point(|segment| segment.key_range.max() < &key);
-
-    segments
-        .get(idx)
-        .filter(|x| x.key_range.min() <= &key)
-        .cloned()
-}
-
-#[derive(Clone, Debug)]
-struct Segment {
-    // id: String,
-    is_lmax: bool,
-    key_range: KeyRange,
-    next: (u32, u32),
-}
-
-fn run(num_sst: usize) {
-    eprintln!("Benchmarking {num_sst} SSTs");
-
-    let keys = (0..num_sst * 2)
-        .map(|x| x.to_be_bytes().to_vec())
-        .collect::<Vec<_>>();
-
-    let lowest_level = keys
-        .chunks(2)
-        .map(|x| KeyRange::new((UserKey::new(&x[0]), UserKey::new(&x[1]))))
-        .enumerate()
-        .map(|(idx, key_range)| {
-            Arc::new(Segment {
-                // id: format!("Lmax-{idx}"),
-                is_lmax: true,
-                key_range,
-                next: (u32::MAX, u32::MAX),
-            })
-        })
-        .collect::<Vec<_>>();
-
-    let mut levels = vec![lowest_level];
-
-    for _ in 0..10 {
-        let next_level = &levels[0];
-
-        if next_level.len() <= 10 {
-            break;
-        }
-
-        let new_upper_level = next_level
-            .chunks(10)
-            .enumerate()
-            .map(|(idx, x)| {
-                let idx = idx as u32;
-                let key_range = KeyRange::aggregate(x.iter().map(|x| &x.key_range));
-                Arc::new(Segment {
-                    // id: format!("L3-{idx}"),
-                    is_lmax: false,
-                    key_range,
-                    next: (idx * 10, idx * 10 + 9),
-                })
-            })
-            .collect::<Vec<_>>();
-
-        levels.insert(0, new_upper_level);
-    }
-
-    for (idx, level) in levels.iter().enumerate() {
-        eprintln!("L{:?} = {}", idx + 1, level.len());
-    }
-
-    
let mut rng = rand::rng(); - - const RUNS: usize = 25_000_000; - - let start = Instant::now(); - - for _ in 0..RUNS { - let idx = rng.random_range(0..keys.len()); - let key = &keys[idx]; - - // NOTE: Naive search - #[cfg(not(feature = "cascading"))] - { - for (_idx, level) in levels.iter().enumerate() { - let _segment = get_segment_containing_key(&level, &*key).unwrap(); - // eprintln!("found {segment:?} in L{}", idx + 1); - } - } - - // NOTE: Search with fractional cascading - #[cfg(feature = "cascading")] - { - let mut bounds: (u32, u32) = (u32::MAX, u32::MAX); - - for (idx, level) in levels.iter().enumerate() { - let segment = if idx == 0 { - get_segment_containing_key(&level, &*key).expect("should find segment") - } else { - let (lo, hi) = bounds; - let lo = lo as usize; - let hi = hi as usize; - - #[cfg(feature = "use_unsafe")] - let slice = unsafe { level.get_unchecked(lo..=hi) }; - - #[cfg(not(feature = "use_unsafe"))] - let slice = level.get(lo..=hi).unwrap(); - - get_segment_containing_key(slice, &*key).expect("should find segment") - }; - // eprintln!("found {segment:?} in L{}", idx + 1); - - bounds = segment.next; - } - } - } - - let elapsed = start.elapsed(); - let ns = elapsed.as_nanos(); - let per_run = ns / RUNS as u128; - - #[cfg(feature = "cascading")] - let cascading = true; - - #[cfg(not(feature = "cascading"))] - let cascading = false; - - #[cfg(feature = "fast_partition_point")] - let fast_partition_point = true; - - #[cfg(not(feature = "fast_partition_point"))] - let fast_partition_point = false; - - #[cfg(feature = "use_unsafe")] - let used_unsafe = true; - - #[cfg(not(feature = "use_unsafe"))] - let used_unsafe = false; - - println!( - "{{\"lmax_ssts\": {num_sst}, \"ns\":{per_run}, \"unsafe\":{used_unsafe}, \"std_partition_point\":{}, \"cascading\":{cascading} }}", - !fast_partition_point, - ); -} - -fn main() { - for lmax_sst_count in [100, 500, 1_000, 2_000, 4_000, 10_000] { - run(lmax_sst_count); - } -} diff --git a/microbench/fractional_cascading/template.py b/microbench/fractional_cascading/template.py deleted file mode 100644 index cf600454..00000000 --- a/microbench/fractional_cascading/template.py +++ /dev/null @@ -1,46 +0,0 @@ -import matplotlib.pyplot as plt -import json -from palettable.tableau import PurpleGray_6 -from pathlib import Path - -colors = PurpleGray_6.mpl_colors - -data = Path("data.jsonl").read_text() - -# Parse the data -data_list = [json.loads(line) for line in data.strip().split('\n')] - -# Organize data by boolean key -from collections import defaultdict - -grouped = defaultdict(list) -for entry in data_list: - key = (entry["unsafe"], entry["std_partition_point"], entry["cascading"]) - grouped[key].append((entry["lmax_ssts"], entry["ns"])) - -# Plot -plt.figure(figsize=(6, 4)) - -markers = ["*", "o", "d", ".", "v", "^"] -i = 0 - -for key, values in grouped.items(): - values.sort() - x = [v[0] for v in values] - y = [v[1] for v in values] - label = "Cascading" if key[2] else "No cascading" - label += " unsafe" if key[0] else "" - plt.plot(x, y, label=label, color=colors[i], marker=markers[i]) - i += 1 - -plt.xscale("log") - -plt.xlabel("Segments in last level") -plt.ylabel("lookup latency [ns]") - -plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.10), shadow=True, ncol=2) -plt.grid(color="0.9", linestyle='--', linewidth=1) -plt.tight_layout() - -plt.savefig("segment_indexing.svg") - diff --git a/microbench/hash_fns/.gitignore b/microbench/hash_fns/.gitignore deleted file mode 100644 index 291a5fe2..00000000 --- 
a/microbench/hash_fns/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data.jsonl diff --git a/microbench/hash_fns/Cargo.toml b/microbench/hash_fns/Cargo.toml deleted file mode 100644 index 1b9c3654..00000000 --- a/microbench/hash_fns/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "hash_bench" -version = "0.1.0" -edition = "2024" - -[lib] - -[dependencies] -cityhasher = "0.1.0" -fnv = "1.0.7" -foldhash = "0.1.5" -fxhash = "0.2.1" -gxhash = "3.5.0" -metrohash = "1.0.7" -rand = "0.9.1" -rapidhash = "3.0.0" -rustc-hash = "2.1.1" -seahash = "4.1.0" -wyhash = "0.5.0" -xxhash-rust = { version = "0.8.15", features = ["xxh3", "xxh64"] } -twox-hash = { version = "2.1.0" } diff --git a/microbench/hash_fns/run.nu b/microbench/hash_fns/run.nu deleted file mode 100644 index c45a4c16..00000000 --- a/microbench/hash_fns/run.nu +++ /dev/null @@ -1,2 +0,0 @@ -RUSTFLAGS="-C target-cpu=native" cargo run -r - diff --git a/microbench/hash_fns/src/lib.rs b/microbench/hash_fns/src/lib.rs deleted file mode 100644 index 38dceb15..00000000 --- a/microbench/hash_fns/src/lib.rs +++ /dev/null @@ -1,150 +0,0 @@ -use std::hash::{BuildHasher, Hasher}; - -/// Calculates a 64-bit hash from a byte slice. -pub trait Hash64 { - /// Gets the readable hash function name (e.g. "metrohash") - fn name(&self) -> &'static str; - - /// Hashes a byte slice to a 64-bit digest - fn hash64(&self, bytes: &[u8]) -> u64; -} - -pub struct Fnv; -impl Hash64 for Fnv { - fn name(&self) -> &'static str { - "FNV" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - let mut hasher = fnv::FnvHasher::default(); - hasher.write(bytes); - hasher.finish() - } -} - -pub struct Xxh64; -impl Hash64 for Xxh64 { - fn name(&self) -> &'static str { - "XXH64" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - let mut hasher = xxhash_rust::xxh64::Xxh64::default(); - hasher.write(bytes); - hasher.finish() - } -} - -pub struct Xxh3; -impl Hash64 for Xxh3 { - fn name(&self) -> &'static str { - "XXH3" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - xxhash_rust::xxh3::xxh3_64(bytes) - } -} - -pub struct Xxh3_B; -impl Hash64 for Xxh3_B { - fn name(&self) -> &'static str { - "XXH3_B" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - twox_hash::XxHash3_64::oneshot(bytes) - } -} - -pub struct CityHash; -impl Hash64 for CityHash { - fn name(&self) -> &'static str { - "CityHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - cityhasher::hash(bytes) - } -} - -pub struct MetroHash; -impl Hash64 for MetroHash { - fn name(&self) -> &'static str { - "MetroHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - let mut hasher = metrohash::MetroHash64::default(); - hasher.write(bytes); - hasher.finish() - } -} - -pub struct WyHash; -impl Hash64 for WyHash { - fn name(&self) -> &'static str { - "WyHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - wyhash::wyhash(bytes, 0) - } -} - -pub struct RapidHash; -impl Hash64 for RapidHash { - fn name(&self) -> &'static str { - "RapidHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - let mut hasher = rapidhash::fast::RapidHasher::default(); - hasher.write(bytes); - hasher.finish() - } -} - -pub struct SeaHash; -impl Hash64 for SeaHash { - fn name(&self) -> &'static str { - "SeaHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - seahash::hash(bytes) - } -} - -pub struct RustcHash; -impl Hash64 for RustcHash { - fn name(&self) -> &'static str { - "RustcHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - rustc_hash::FxBuildHasher::default().hash_one(bytes) - } -} - -pub struct FxHash; -impl Hash64 for FxHash { - fn 
name(&self) -> &'static str { - "FxHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - fxhash::hash64(bytes) - } -} - -pub struct GxHash; -impl Hash64 for GxHash { - fn name(&self) -> &'static str { - "GxHash" - } - - fn hash64(&self, bytes: &[u8]) -> u64 { - gxhash::gxhash64(bytes, 123) - } -} diff --git a/microbench/hash_fns/src/main.rs b/microbench/hash_fns/src/main.rs deleted file mode 100644 index 55d88742..00000000 --- a/microbench/hash_fns/src/main.rs +++ /dev/null @@ -1,102 +0,0 @@ -use hash_bench::*; -use rand::RngCore; -use std::{path::Path, time::Instant}; - -fn main() { - let hashers: &[&dyn Hash64] = &[ - /* NOTE: GxHash needs AES instructions and a manual build flag, so a bit annoying to compile - but it's very fast: - RUSTFLAGS="-C target-cpu=native" cargo run -r - */ - // &GxHash, - &Xxh64, - &Xxh3, - // &Xxh3_B, // NOTE: twox-hash is slower than xxhash-rust - &RapidHash, &CityHash, &MetroHash, - &WyHash, - // &Fnv, - // &RustcHash, // NOTE: rustc_hash is supposedly stable, but stability is a non-goal: https://github.com/rust-lang/rustc-hash/pull/56#issuecomment-2667670854 - // &FxHash, // NOTE: ^ same for fxhash - // &SeaHash, // NOTE: seahash is pretty slow - ]; - - let mut rng = rand::rng(); - - let mut output = Vec::with_capacity(hashers.len()); - - for hasher in hashers { - for (byte_len, invocations) in [ - (4, 1_000_000_000), - (8, 1_000_000_000), - (16, 1_000_000_000), - (32, 1_000_000_000), - (64, 1_000_000_000), - (128, 500_000_000), - (256, 250_000_000), - (512, 125_000_000), - (1_024, 64_000_000), - (4 * 1_024, 16_000_000), - (8 * 1_024, 8_000_000), - (16 * 1_024, 4_000_000), - (32 * 1_024, 2_000_000), - (64 * 1_024, 1_000_000), - ] { - let invocations = if hasher.name() == "FNV" { - invocations / 4 / 10 - } else { - invocations / 4 - }; - - let mut bytes = vec![0; byte_len]; - rng.fill_bytes(&mut bytes); - eprint!("{} ({} bytes): ", hasher.name(), bytes.len()); - - let start = Instant::now(); - for _ in 0..invocations { - std::hint::black_box(hasher.hash64(&bytes)); - } - let elapsed = start.elapsed(); - let ns = elapsed.as_nanos(); - let per_call = ns as f64 / invocations as f64; - - eprintln!("{elapsed:?} - {per_call}ns per invocation"); - - output.push(format!( - "{{\"hash\": {:?},\"byte_len\": {byte_len}, \"ns\": {per_call}}}", - hasher.name(), - )); - } - - eprintln!(); - } - - { - let output = Path::new("hash.png"); - - if output.exists() { - std::fs::remove_file(&output).unwrap(); - } - } - - { - let data = output.join("\n"); - - let template = std::fs::read_to_string("template.py").unwrap(); - let template = template.replace("% data %", &data); - std::fs::write("tmp.py", &template).unwrap(); - std::fs::write("data.jsonl", &data).unwrap(); - } - - { - let status = std::process::Command::new("python3") - .arg("tmp.py") - .status() - .unwrap(); - - std::fs::remove_file("tmp.py").unwrap(); - - assert!(status.success(), "python failed"); - } - - // TODO: bench conflicts 1B keys - if >0, create matplotlib python image as well, also save JSON -} diff --git a/microbench/hash_fns/template.py b/microbench/hash_fns/template.py deleted file mode 100644 index b042dc3d..00000000 --- a/microbench/hash_fns/template.py +++ /dev/null @@ -1,60 +0,0 @@ -import matplotlib.pyplot as plt -import json -from palettable.tableau import BlueRed_6 -from pathlib import Path - -colors = BlueRed_6.mpl_colors - -data = Path('data.jsonl').read_text() - -# Parse the data -data_list = [json.loads(line) for line in data.strip().split('\n')] - -# Calculate throughput (hashes 
per second) -for entry in data_list: - # Convert ns to seconds and calculate throughput (1 hash per measurement) - time_in_seconds = entry["ns"] / 1e9 - entry["throughput"] = 1 / time_in_seconds # 1 hash / time in seconds - -# Group data by hash type -grouped_data = {} -for entry in data_list: - hash_type = entry["hash"] - if hash_type not in grouped_data: - grouped_data[hash_type] = {"byte_len": [], "throughput": []} - grouped_data[hash_type]["byte_len"].append(entry["byte_len"]) - grouped_data[hash_type]["throughput"].append(entry["throughput"]) - -plt.rcParams.update({ - 'axes.labelsize': 8, - 'font.size': 8, - 'legend.fontsize': 10, - 'xtick.labelsize': 10, - 'ytick.labelsize': 10, - 'text.usetex': False, - 'figure.figsize': [4.5, 4.5] -}) - -# Create the plot -plt.figure(figsize=(6, 4)) - -i = 0 -markers = ["*", "o", "d", ".", "v", "^"] - -for hash_type, values in grouped_data.items(): - plt.plot(values["byte_len"], values["throughput"], marker=markers[i], - linestyle='-', label=hash_type, color=colors[i]) - i += 1 - -plt.xlabel("Input length [bytes]") -plt.ylabel("Throughput [op/s]") - -plt.xscale('log') -plt.yscale('log') - -plt.legend(loc='upper center', fancybox=True, bbox_to_anchor=(0.5, 1.25), shadow=True, ncol=3) -plt.grid(color="0.9", linestyle='--', linewidth=1) -plt.tight_layout() - -# Save the plot to a file -plt.savefig("hash_fns.svg") diff --git a/microbench/run.nu b/microbench/run.nu deleted file mode 100644 index c54af228..00000000 --- a/microbench/run.nu +++ /dev/null @@ -1,18 +0,0 @@ -let benchmarks = [ - "block_bin_index", - "block_hash_index", - "block_load", - "bloom_fpr", - "bloom_speed", - "fractional_cascading", - "hash_fns", -] - -print "===== Running all benchmarks, this will take a while =====" - -for bench in $benchmarks { - print $"=== Running ($bench) function benchmark ===" - cd $bench - nu run.nu - cd .. 
-} From 94a407d8186944a2520926f284f03eb176e70b2c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 14 Oct 2025 20:40:10 +0200 Subject: [PATCH 596/613] wip --- fuzz/data_block/nuclear.sh | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 fuzz/data_block/nuclear.sh diff --git a/fuzz/data_block/nuclear.sh b/fuzz/data_block/nuclear.sh deleted file mode 100644 index c6d6a53a..00000000 --- a/fuzz/data_block/nuclear.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -cargo afl build -r -cat /dev/random | head -n 1024 > in1/in1 -cat /dev/random | head -n 1024 > in2/in2 -cat /dev/random | head -n 1024 > in3/in3 -cat /dev/random | head -n 1024 > in4/in4 - -# Set session name -SESSION_NAME="my_session" - -# Start a new tmux session in detached mode -tmux new-session -d -s $SESSION_NAME -c "w" - -# Split the first window vertically -tmux split-window -h -p 25 -t $SESSION_NAME -c $1 - -# Focus on the left pane and start helix -tmux select-pane -t 1 -# tmux send-keys "cargo afl fuzz -i in1 -o out1 target/release/data_block" C-m - -# Switch focus to the right pane -tmux select-pane -t 2 -# tmux send-keys "cargo afl fuzz -i in2 -o out2 target/release/data_block" C-m - -# Create a new window for RSB -# tmux new-window -t $SESSION_NAME -n "2" -c "/devssd/code/rust/rust-storage-bench" - -# Attach to the tmux session -tmux attach -t $SESSION_NAME From 00534b844684b760274ffcfbe34d0c5e275fab7d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 15 Oct 2025 21:33:02 +0200 Subject: [PATCH 597/613] test: another model finding --- tests/model_6.rs | 1291 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1291 insertions(+) create mode 100644 tests/model_6.rs diff --git a/tests/model_6.rs b/tests/model_6.rs new file mode 100644 index 00000000..f365adb9 --- /dev/null +++ b/tests/model_6.rs @@ -0,0 +1,1291 @@ +// Found by model testing + +use lsm_tree::{config::BlockSizePolicy, AbstractTree, KvSeparationOptions, Result, config::CompressionPolicy}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_6() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .data_block_compression_policy(CompressionPolicy::disabled()) + .index_block_compression_policy(CompressionPolicy::disabled()) + .data_block_size_policy(BlockSizePolicy::all(100)) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(10))) + .open()?; + + let compaction = Arc::new(lsm_tree::compaction::Leveled { + target_size: 1_024, + ..Default::default() + }); + + let value = b"hellohello"; + + tree.insert([0, 0, 0, 0, 0, 0, 3, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170598); +tree.insert([0, 0, 0, 0, 0, 0, 0, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170599); +tree.insert([0, 0, 0, 0, 0, 0, 1, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170600); +tree.flush_active_memtable(170501)?; +tree.insert([0, 0, 0, 0, 0, 0, 3, 1], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170601); +tree.insert([0, 0, 0, 0, 0, 0, 2, 31], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170602); +tree.insert([0, 0, 0, 0, 0, 0, 3, 95], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170603); +tree.insert([0, 0, 0, 0, 0, 0, 1, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170604); +tree.insert([0, 0, 0, 0, 0, 0, 0, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170605); +tree.insert([0, 0, 0, 0, 0, 0, 3, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 
111], 170606); +tree.insert([0, 0, 0, 0, 0, 0, 2, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170607); +tree.insert([0, 0, 0, 0, 0, 0, 0, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170608); +tree.insert([0, 0, 0, 0, 0, 0, 2, 151], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170609); +tree.insert([0, 0, 0, 0, 0, 0, 2, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170610); +tree.insert([0, 0, 0, 0, 0, 0, 2, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170611); +tree.insert([0, 0, 0, 0, 0, 0, 2, 133], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170612); +tree.insert([0, 0, 0, 0, 0, 0, 3, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170613); +tree.insert([0, 0, 0, 0, 0, 0, 2, 189], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170614); +tree.insert([0, 0, 0, 0, 0, 0, 3, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170615); +tree.insert([0, 0, 0, 0, 0, 0, 1, 157], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170616); +tree.insert([0, 0, 0, 0, 0, 0, 2, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170617); +tree.insert([0, 0, 0, 0, 0, 0, 3, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170619); +tree.insert([0, 0, 0, 0, 0, 0, 2, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170620); +tree.insert([0, 0, 0, 0, 0, 0, 0, 188], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170621); +tree.insert([0, 0, 0, 0, 0, 0, 0, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170624); +tree.insert([0, 0, 0, 0, 0, 0, 1, 140], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170625); +tree.insert([0, 0, 0, 0, 0, 0, 0, 255], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170626); +tree.insert([0, 0, 0, 0, 0, 0, 1, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170628); +tree.insert([0, 0, 0, 0, 0, 0, 2, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170629); +tree.insert([0, 0, 0, 0, 0, 0, 0, 103], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170630); +tree.insert([0, 0, 0, 0, 0, 0, 0, 225], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170631); +tree.insert([0, 0, 0, 0, 0, 0, 0, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170633); +tree.insert([0, 0, 0, 0, 0, 0, 0, 242], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170634); +tree.insert([0, 0, 0, 0, 0, 0, 0, 122], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170635); +tree.insert([0, 0, 0, 0, 0, 0, 0, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170636); +tree.insert([0, 0, 0, 0, 0, 0, 1, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170638); +tree.insert([0, 0, 0, 0, 0, 0, 0, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170639); +tree.insert([0, 0, 0, 0, 0, 0, 1, 213], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170641); +tree.insert([0, 0, 0, 0, 0, 0, 2, 150], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170643); +tree.insert([0, 0, 0, 0, 0, 0, 0, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170644); +tree.insert([0, 0, 0, 0, 0, 0, 1, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170645); +tree.insert([0, 0, 0, 0, 0, 0, 0, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170647); +tree.insert([0, 0, 0, 0, 0, 0, 0, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170648); +tree.insert([0, 0, 0, 0, 0, 0, 0, 73], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170649); +tree.insert([0, 0, 0, 0, 0, 0, 0, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 
111], 170650); +tree.insert([0, 0, 0, 0, 0, 0, 1, 1], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170651); +tree.insert([0, 0, 0, 0, 0, 0, 2, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170653); +tree.insert([0, 0, 0, 0, 0, 0, 1, 204], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170654); +tree.insert([0, 0, 0, 0, 0, 0, 1, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170655); +tree.insert([0, 0, 0, 0, 0, 0, 2, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170656); +tree.insert([0, 0, 0, 0, 0, 0, 1, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170657); +tree.insert([0, 0, 0, 0, 0, 0, 2, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170658); +tree.insert([0, 0, 0, 0, 0, 0, 0, 147], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170659); +tree.insert([0, 0, 0, 0, 0, 0, 1, 248], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170660); +tree.insert([0, 0, 0, 0, 0, 0, 2, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170661); +tree.insert([0, 0, 0, 0, 0, 0, 1, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170662); +tree.insert([0, 0, 0, 0, 0, 0, 0, 79], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170664); +tree.insert([0, 0, 0, 0, 0, 0, 1, 216], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170665); +tree.insert([0, 0, 0, 0, 0, 0, 0, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170666); +tree.insert([0, 0, 0, 0, 0, 0, 0, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170669); +tree.insert([0, 0, 0, 0, 0, 0, 1, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170671); +tree.insert([0, 0, 0, 0, 0, 0, 0, 38], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170675); +tree.insert([0, 0, 0, 0, 0, 0, 1, 156], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170676); +tree.insert([0, 0, 0, 0, 0, 0, 0, 123], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170677); +tree.insert([0, 0, 0, 0, 0, 0, 0, 238], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170679); +tree.insert([0, 0, 0, 0, 0, 0, 0, 164], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170680); +tree.insert([0, 0, 0, 0, 0, 0, 1, 244], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170681); +tree.insert([0, 0, 0, 0, 0, 0, 0, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170687); +tree.insert([0, 0, 0, 0, 0, 0, 0, 201], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170689); +tree.insert([0, 0, 0, 0, 0, 0, 1, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170690); +tree.insert([0, 0, 0, 0, 0, 0, 0, 247], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170692); +tree.insert([0, 0, 0, 0, 0, 0, 1, 29], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170694); +tree.insert([0, 0, 0, 0, 0, 0, 1, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170695); +tree.insert([0, 0, 0, 0, 0, 0, 1, 158], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170699); +tree.insert([0, 0, 0, 0, 0, 0, 0, 236], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170702); +tree.insert([0, 0, 0, 0, 0, 0, 2, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170705); +tree.insert([0, 0, 0, 0, 0, 0, 0, 107], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170707); +tree.insert([0, 0, 0, 0, 0, 0, 1, 167], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170709); +tree.insert([0, 0, 0, 0, 0, 0, 0, 180], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170710); +tree.insert([0, 0, 0, 0, 0, 0, 3, 171], [104, 101, 108, 108, 111, 104, 101, 108, 
108, 111], 170711); +tree.insert([0, 0, 0, 0, 0, 0, 3, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170712); +tree.insert([0, 0, 0, 0, 0, 0, 1, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170713); +tree.insert([0, 0, 0, 0, 0, 0, 3, 66], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170714); +tree.insert([0, 0, 0, 0, 0, 0, 1, 90], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170715); +tree.insert([0, 0, 0, 0, 0, 0, 1, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170716); +tree.insert([0, 0, 0, 0, 0, 0, 1, 166], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170717); +tree.insert([0, 0, 0, 0, 0, 0, 3, 222], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170718); +tree.insert([0, 0, 0, 0, 0, 0, 0, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170719); +tree.insert([0, 0, 0, 0, 0, 0, 3, 209], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170720); +tree.insert([0, 0, 0, 0, 0, 0, 1, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170721); +tree.insert([0, 0, 0, 0, 0, 0, 3, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170722); +tree.insert([0, 0, 0, 0, 0, 0, 0, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170723); +tree.insert([0, 0, 0, 0, 0, 0, 2, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170724); +tree.insert([0, 0, 0, 0, 0, 0, 3, 44], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170725); +tree.insert([0, 0, 0, 0, 0, 0, 2, 158], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170726); +tree.insert([0, 0, 0, 0, 0, 0, 3, 67], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170727); +tree.insert([0, 0, 0, 0, 0, 0, 0, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170728); +tree.insert([0, 0, 0, 0, 0, 0, 3, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170729); +tree.insert([0, 0, 0, 0, 0, 0, 0, 75], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170730); +tree.insert([0, 0, 0, 0, 0, 0, 1, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170731); +tree.insert([0, 0, 0, 0, 0, 0, 1, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170732); +tree.insert([0, 0, 0, 0, 0, 0, 1, 143], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170733); +tree.insert([0, 0, 0, 0, 0, 0, 1, 254], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170735); +tree.insert([0, 0, 0, 0, 0, 0, 2, 183], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170736); +tree.insert([0, 0, 0, 0, 0, 0, 1, 236], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170737); +tree.insert([0, 0, 0, 0, 0, 0, 3, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170738); +tree.insert([0, 0, 0, 0, 0, 0, 2, 6], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170739); +tree.insert([0, 0, 0, 0, 0, 0, 2, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170740); +tree.insert([0, 0, 0, 0, 0, 0, 2, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170741); +tree.insert([0, 0, 0, 0, 0, 0, 2, 70], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170742); +tree.insert([0, 0, 0, 0, 0, 0, 3, 29], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170743); +tree.insert([0, 0, 0, 0, 0, 0, 0, 101], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170744); +tree.insert([0, 0, 0, 0, 0, 0, 1, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170745); +tree.insert([0, 0, 0, 0, 0, 0, 3, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170746); +tree.insert([0, 0, 0, 0, 0, 0, 0, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 
111], 170747); +tree.insert([0, 0, 0, 0, 0, 0, 1, 215], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170748); +tree.insert([0, 0, 0, 0, 0, 0, 1, 3], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170749); +tree.insert([0, 0, 0, 0, 0, 0, 3, 65], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170751); +tree.insert([0, 0, 0, 0, 0, 0, 1, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170752); +tree.insert([0, 0, 0, 0, 0, 0, 3, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170754); +tree.insert([0, 0, 0, 0, 0, 0, 0, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170755); +tree.insert([0, 0, 0, 0, 0, 0, 0, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170756); +tree.insert([0, 0, 0, 0, 0, 0, 1, 100], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170757); +tree.insert([0, 0, 0, 0, 0, 0, 2, 134], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170758); +tree.insert([0, 0, 0, 0, 0, 0, 1, 86], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170759); +tree.insert([0, 0, 0, 0, 0, 0, 0, 95], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170760); +tree.insert([0, 0, 0, 0, 0, 0, 1, 176], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170761); +tree.insert([0, 0, 0, 0, 0, 0, 2, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170764); +tree.insert([0, 0, 0, 0, 0, 0, 3, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170765); +tree.insert([0, 0, 0, 0, 0, 0, 3, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170766); +tree.insert([0, 0, 0, 0, 0, 0, 1, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170767); +tree.insert([0, 0, 0, 0, 0, 0, 2, 248], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170768); +tree.insert([0, 0, 0, 0, 0, 0, 1, 129], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170769); +tree.insert([0, 0, 0, 0, 0, 0, 0, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170770); +tree.insert([0, 0, 0, 0, 0, 0, 1, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170771); +tree.insert([0, 0, 0, 0, 0, 0, 1, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170772); +tree.insert([0, 0, 0, 0, 0, 0, 3, 210], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170773); +tree.insert([0, 0, 0, 0, 0, 0, 1, 81], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170774); +tree.insert([0, 0, 0, 0, 0, 0, 0, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170775); +tree.insert([0, 0, 0, 0, 0, 0, 1, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170776); +tree.insert([0, 0, 0, 0, 0, 0, 1, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170777); +tree.insert([0, 0, 0, 0, 0, 0, 3, 75], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170778); +tree.insert([0, 0, 0, 0, 0, 0, 3, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170779); +tree.insert([0, 0, 0, 0, 0, 0, 2, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170780); +tree.insert([0, 0, 0, 0, 0, 0, 3, 46], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170781); +tree.insert([0, 0, 0, 0, 0, 0, 1, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170782); +tree.insert([0, 0, 0, 0, 0, 0, 0, 144], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170784); +tree.insert([0, 0, 0, 0, 0, 0, 2, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170785); +tree.insert([0, 0, 0, 0, 0, 0, 2, 236], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170786); +tree.insert([0, 0, 0, 0, 0, 0, 1, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 
111], 170787); +tree.insert([0, 0, 0, 0, 0, 0, 3, 18], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170788); +tree.insert([0, 0, 0, 0, 0, 0, 3, 175], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170789); +tree.insert([0, 0, 0, 0, 0, 0, 1, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170790); +tree.insert([0, 0, 0, 0, 0, 0, 2, 255], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170791); +tree.insert([0, 0, 0, 0, 0, 0, 0, 109], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170793); +tree.insert([0, 0, 0, 0, 0, 0, 1, 148], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170794); +tree.insert([0, 0, 0, 0, 0, 0, 0, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170795); +tree.insert([0, 0, 0, 0, 0, 0, 2, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170796); +tree.insert([0, 0, 0, 0, 0, 0, 3, 91], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170797); +tree.insert([0, 0, 0, 0, 0, 0, 0, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170798); +tree.insert([0, 0, 0, 0, 0, 0, 1, 247], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170799); +tree.flush_active_memtable(170701)?; +tree.insert([0, 0, 0, 0, 0, 0, 1, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170801); +tree.insert([0, 0, 0, 0, 0, 0, 1, 165], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170802); +tree.insert([0, 0, 0, 0, 0, 0, 0, 116], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170804); +tree.insert([0, 0, 0, 0, 0, 0, 0, 32], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170805); +tree.insert([0, 0, 0, 0, 0, 0, 0, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170809); +tree.insert([0, 0, 0, 0, 0, 0, 1, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170812); +tree.insert([0, 0, 0, 0, 0, 0, 2, 145], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170813); +tree.insert([0, 0, 0, 0, 0, 0, 1, 251], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170814); +tree.insert([0, 0, 0, 0, 0, 0, 0, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170815); +tree.insert([0, 0, 0, 0, 0, 0, 3, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170816); +tree.insert([0, 0, 0, 0, 0, 0, 3, 189], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170817); +tree.insert([0, 0, 0, 0, 0, 0, 2, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170818); +tree.insert([0, 0, 0, 0, 0, 0, 3, 145], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170819); +tree.insert([0, 0, 0, 0, 0, 0, 3, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170820); +tree.insert([0, 0, 0, 0, 0, 0, 0, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170822); +tree.insert([0, 0, 0, 0, 0, 0, 2, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170823); +tree.insert([0, 0, 0, 0, 0, 0, 0, 211], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170824); +tree.insert([0, 0, 0, 0, 0, 0, 3, 92], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170825); +tree.insert([0, 0, 0, 0, 0, 0, 0, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170826); +tree.insert([0, 0, 0, 0, 0, 0, 2, 171], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170827); +tree.insert([0, 0, 0, 0, 0, 0, 0, 133], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170828); +tree.insert([0, 0, 0, 0, 0, 0, 2, 167], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170829); +tree.insert([0, 0, 0, 0, 0, 0, 0, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170830); +tree.insert([0, 0, 0, 0, 0, 0, 1, 242], 
[104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170831); +tree.insert([0, 0, 0, 0, 0, 0, 0, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170832); +tree.insert([0, 0, 0, 0, 0, 0, 1, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170833); +tree.insert([0, 0, 0, 0, 0, 0, 1, 240], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170834); +tree.insert([0, 0, 0, 0, 0, 0, 2, 84], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170836); +tree.insert([0, 0, 0, 0, 0, 0, 3, 185], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170837); +tree.insert([0, 0, 0, 0, 0, 0, 0, 145], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170838); +tree.insert([0, 0, 0, 0, 0, 0, 2, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170840); +tree.insert([0, 0, 0, 0, 0, 0, 3, 143], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170841); +tree.insert([0, 0, 0, 0, 0, 0, 0, 227], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170842); +tree.insert([0, 0, 0, 0, 0, 0, 2, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170843); +tree.insert([0, 0, 0, 0, 0, 0, 1, 151], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170844); +tree.insert([0, 0, 0, 0, 0, 0, 1, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170846); +tree.insert([0, 0, 0, 0, 0, 0, 2, 240], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170847); +tree.insert([0, 0, 0, 0, 0, 0, 3, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170848); +tree.insert([0, 0, 0, 0, 0, 0, 1, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170850); +tree.insert([0, 0, 0, 0, 0, 0, 0, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170851); +tree.insert([0, 0, 0, 0, 0, 0, 1, 101], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170852); +tree.insert([0, 0, 0, 0, 0, 0, 3, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170853); +tree.insert([0, 0, 0, 0, 0, 0, 2, 76], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170855); +tree.insert([0, 0, 0, 0, 0, 0, 1, 234], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170856); +tree.insert([0, 0, 0, 0, 0, 0, 1, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170857); +tree.insert([0, 0, 0, 0, 0, 0, 0, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170859); +tree.insert([0, 0, 0, 0, 0, 0, 1, 228], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170860); +tree.insert([0, 0, 0, 0, 0, 0, 2, 159], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170861); +tree.insert([0, 0, 0, 0, 0, 0, 2, 233], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170862); +tree.insert([0, 0, 0, 0, 0, 0, 3, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170863); +tree.insert([0, 0, 0, 0, 0, 0, 1, 95], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170864); +tree.insert([0, 0, 0, 0, 0, 0, 0, 254], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170865); +tree.insert([0, 0, 0, 0, 0, 0, 1, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170866); +tree.insert([0, 0, 0, 0, 0, 0, 2, 61], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170868); +tree.insert([0, 0, 0, 0, 0, 0, 3, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170869); +tree.insert([0, 0, 0, 0, 0, 0, 3, 151], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170871); +tree.insert([0, 0, 0, 0, 0, 0, 0, 239], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170873); +tree.insert([0, 0, 0, 0, 0, 0, 0, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170874); +tree.insert([0, 0, 0, 0, 0, 0, 1, 137], 
[104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170875); +tree.insert([0, 0, 0, 0, 0, 0, 2, 36], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170877); +tree.insert([0, 0, 0, 0, 0, 0, 2, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170879); +tree.insert([0, 0, 0, 0, 0, 0, 2, 230], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170880); +tree.insert([0, 0, 0, 0, 0, 0, 3, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170881); +tree.insert([0, 0, 0, 0, 0, 0, 3, 137], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170882); +tree.insert([0, 0, 0, 0, 0, 0, 2, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170883); +tree.insert([0, 0, 0, 0, 0, 0, 0, 229], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170885); +tree.insert([0, 0, 0, 0, 0, 0, 2, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170886); +tree.insert([0, 0, 0, 0, 0, 0, 0, 31], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170887); +tree.insert([0, 0, 0, 0, 0, 0, 0, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170888); +tree.insert([0, 0, 0, 0, 0, 0, 2, 91], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170889); +tree.insert([0, 0, 0, 0, 0, 0, 3, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170890); +tree.insert([0, 0, 0, 0, 0, 0, 1, 222], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170891); +tree.insert([0, 0, 0, 0, 0, 0, 1, 136], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170892); +tree.insert([0, 0, 0, 0, 0, 0, 2, 44], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170894); +tree.insert([0, 0, 0, 0, 0, 0, 1, 180], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170896); +tree.insert([0, 0, 0, 0, 0, 0, 0, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170897); +tree.insert([0, 0, 0, 0, 0, 0, 2, 107], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170898); +tree.insert([0, 0, 0, 0, 0, 0, 3, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170899); +tree.insert([0, 0, 0, 0, 0, 0, 3, 135], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170900); +tree.insert([0, 0, 0, 0, 0, 0, 2, 251], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170901); +tree.insert([0, 0, 0, 0, 0, 0, 0, 167], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170902); +tree.insert([0, 0, 0, 0, 0, 0, 3, 32], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170903); +tree.insert([0, 0, 0, 0, 0, 0, 1, 104], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170904); +tree.insert([0, 0, 0, 0, 0, 0, 3, 204], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170905); +tree.insert([0, 0, 0, 0, 0, 0, 1, 232], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170906); +tree.insert([0, 0, 0, 0, 0, 0, 3, 134], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170907); +tree.insert([0, 0, 0, 0, 0, 0, 0, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170908); +tree.insert([0, 0, 0, 0, 0, 0, 2, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170909); +tree.insert([0, 0, 0, 0, 0, 0, 3, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170911); +tree.insert([0, 0, 0, 0, 0, 0, 3, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170914); +tree.insert([0, 0, 0, 0, 0, 0, 2, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170915); +tree.insert([0, 0, 0, 0, 0, 0, 3, 20], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170918); +tree.insert([0, 0, 0, 0, 0, 0, 0, 231], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170919); +tree.insert([0, 0, 0, 0, 0, 0, 2, 
213], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170920); +tree.insert([0, 0, 0, 0, 0, 0, 1, 141], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170921); +tree.insert([0, 0, 0, 0, 0, 0, 0, 128], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170922); +tree.insert([0, 0, 0, 0, 0, 0, 0, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170923); +tree.insert([0, 0, 0, 0, 0, 0, 2, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170924); +tree.insert([0, 0, 0, 0, 0, 0, 1, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170926); +tree.insert([0, 0, 0, 0, 0, 0, 3, 173], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170927); +tree.insert([0, 0, 0, 0, 0, 0, 3, 106], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170928); +tree.insert([0, 0, 0, 0, 0, 0, 2, 59], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170929); +tree.insert([0, 0, 0, 0, 0, 0, 1, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170930); +tree.insert([0, 0, 0, 0, 0, 0, 1, 205], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170932); +tree.insert([0, 0, 0, 0, 0, 0, 0, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170933); +tree.insert([0, 0, 0, 0, 0, 0, 1, 51], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170935); +tree.insert([0, 0, 0, 0, 0, 0, 3, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170937); +tree.insert([0, 0, 0, 0, 0, 0, 1, 246], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170938); +tree.insert([0, 0, 0, 0, 0, 0, 3, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170939); +tree.insert([0, 0, 0, 0, 0, 0, 1, 160], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170940); +tree.insert([0, 0, 0, 0, 0, 0, 1, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170943); +tree.insert([0, 0, 0, 0, 0, 0, 1, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170945); +tree.insert([0, 0, 0, 0, 0, 0, 0, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170946); +tree.insert([0, 0, 0, 0, 0, 0, 2, 204], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170947); +tree.insert([0, 0, 0, 0, 0, 0, 2, 216], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170949); +tree.insert([0, 0, 0, 0, 0, 0, 2, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170953); +tree.insert([0, 0, 0, 0, 0, 0, 2, 94], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170956); +tree.insert([0, 0, 0, 0, 0, 0, 1, 128], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170957); +tree.insert([0, 0, 0, 0, 0, 0, 1, 15], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170958); +tree.insert([0, 0, 0, 0, 0, 0, 1, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170959); +tree.insert([0, 0, 0, 0, 0, 0, 1, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170961); +tree.insert([0, 0, 0, 0, 0, 0, 0, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170964); +tree.insert([0, 0, 0, 0, 0, 0, 0, 150], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170965); +tree.insert([0, 0, 0, 0, 0, 0, 1, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170966); +tree.insert([0, 0, 0, 0, 0, 0, 3, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170967); +tree.insert([0, 0, 0, 0, 0, 0, 1, 57], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170968); +tree.insert([0, 0, 0, 0, 0, 0, 2, 188], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170970); +tree.insert([0, 0, 0, 0, 0, 0, 3, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170972); +tree.insert([0, 0, 0, 0, 0, 0, 3, 
193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170973); +tree.insert([0, 0, 0, 0, 0, 0, 3, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170974); +tree.insert([0, 0, 0, 0, 0, 0, 2, 212], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170976); +tree.insert([0, 0, 0, 0, 0, 0, 3, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170978); +tree.insert([0, 0, 0, 0, 0, 0, 1, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170980); +tree.insert([0, 0, 0, 0, 0, 0, 3, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170981); +tree.insert([0, 0, 0, 0, 0, 0, 1, 214], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170982); +tree.insert([0, 0, 0, 0, 0, 0, 0, 19], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170983); +tree.insert([0, 0, 0, 0, 0, 0, 0, 246], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170985); +tree.insert([0, 0, 0, 0, 0, 0, 0, 77], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170986); +tree.insert([0, 0, 0, 0, 0, 0, 3, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170987); +tree.insert([0, 0, 0, 0, 0, 0, 2, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170988); +tree.insert([0, 0, 0, 0, 0, 0, 0, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170989); +tree.insert([0, 0, 0, 0, 0, 0, 2, 127], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170990); +tree.insert([0, 0, 0, 0, 0, 0, 0, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170991); +tree.insert([0, 0, 0, 0, 0, 0, 1, 173], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170993); +tree.insert([0, 0, 0, 0, 0, 0, 3, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170994); +tree.insert([0, 0, 0, 0, 0, 0, 0, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170995); +tree.insert([0, 0, 0, 0, 0, 0, 2, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170996); +tree.insert([0, 0, 0, 0, 0, 0, 1, 38], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170998); +tree.insert([0, 0, 0, 0, 0, 0, 1, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170999); +tree.insert([0, 0, 0, 0, 0, 0, 0, 90], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171000); +tree.flush_active_memtable(170901)?; +tree.insert([0, 0, 0, 0, 0, 0, 2, 163], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171001); +tree.insert([0, 0, 0, 0, 0, 0, 2, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171003); +tree.insert([0, 0, 0, 0, 0, 0, 3, 86], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171004); +tree.insert([0, 0, 0, 0, 0, 0, 1, 8], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171005); +tree.insert([0, 0, 0, 0, 0, 0, 0, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171006); +tree.insert([0, 0, 0, 0, 0, 0, 3, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171007); +tree.insert([0, 0, 0, 0, 0, 0, 1, 208], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171008); +tree.insert([0, 0, 0, 0, 0, 0, 0, 237], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171010); +tree.insert([0, 0, 0, 0, 0, 0, 1, 190], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171012); +tree.insert([0, 0, 0, 0, 0, 0, 2, 221], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171013); +tree.insert([0, 0, 0, 0, 0, 0, 2, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171014); +tree.insert([0, 0, 0, 0, 0, 0, 1, 231], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171015); +tree.insert([0, 0, 0, 0, 0, 0, 0, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
171016); +tree.insert([0, 0, 0, 0, 0, 0, 1, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171017); +tree.insert([0, 0, 0, 0, 0, 0, 2, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171018); +tree.insert([0, 0, 0, 0, 0, 0, 0, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171019); +tree.insert([0, 0, 0, 0, 0, 0, 3, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171022); +tree.insert([0, 0, 0, 0, 0, 0, 0, 235], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171024); +tree.insert([0, 0, 0, 0, 0, 0, 1, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171027); +tree.insert([0, 0, 0, 0, 0, 0, 1, 107], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171034); +tree.insert([0, 0, 0, 0, 0, 0, 1, 233], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171042); +tree.insert([0, 0, 0, 0, 0, 0, 1, 0], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171044); +tree.insert([0, 0, 0, 0, 0, 0, 2, 77], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171045); +tree.insert([0, 0, 0, 0, 0, 0, 0, 253], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171050); +tree.insert([0, 0, 0, 0, 0, 0, 0, 241], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171051); +tree.insert([0, 0, 0, 0, 0, 0, 2, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171052); +tree.insert([0, 0, 0, 0, 0, 0, 0, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171053); +tree.insert([0, 0, 0, 0, 0, 0, 0, 24], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171063); +tree.insert([0, 0, 0, 0, 0, 0, 1, 24], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171065); +tree.insert([0, 0, 0, 0, 0, 0, 3, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171070); +tree.insert([0, 0, 0, 0, 0, 0, 3, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171071); +tree.insert([0, 0, 0, 0, 0, 0, 3, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171075); +tree.insert([0, 0, 0, 0, 0, 0, 3, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171076); +tree.insert([0, 0, 0, 0, 0, 0, 1, 146], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171078); +tree.insert([0, 0, 0, 0, 0, 0, 3, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171080); +tree.insert([0, 0, 0, 0, 0, 0, 0, 233], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171082); +tree.insert([0, 0, 0, 0, 0, 0, 0, 1], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171085); +tree.insert([0, 0, 0, 0, 0, 0, 2, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171088); +tree.insert([0, 0, 0, 0, 0, 0, 0, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171089); +tree.insert([0, 0, 0, 0, 0, 0, 1, 202], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171091); +tree.insert([0, 0, 0, 0, 0, 0, 1, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171092); +tree.insert([0, 0, 0, 0, 0, 0, 2, 193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171094); +tree.insert([0, 0, 0, 0, 0, 0, 1, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171095); +tree.insert([0, 0, 0, 0, 0, 0, 1, 37], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171097); +tree.insert([0, 0, 0, 0, 0, 0, 2, 191], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171098); +tree.insert([0, 0, 0, 0, 0, 0, 0, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171099); +tree.insert([0, 0, 0, 0, 0, 0, 2, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171103); +tree.insert([0, 0, 0, 0, 0, 0, 1, 237], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
171104); +tree.insert([0, 0, 0, 0, 0, 0, 0, 67], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171105); +tree.insert([0, 0, 0, 0, 0, 0, 0, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171110); +tree.insert([0, 0, 0, 0, 0, 0, 2, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171111); +tree.insert([0, 0, 0, 0, 0, 0, 1, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171113); +tree.insert([0, 0, 0, 0, 0, 0, 2, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171114); +tree.insert([0, 0, 0, 0, 0, 0, 1, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171115); +tree.insert([0, 0, 0, 0, 0, 0, 1, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171116); +tree.insert([0, 0, 0, 0, 0, 0, 0, 166], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171117); +tree.insert([0, 0, 0, 0, 0, 0, 2, 249], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171118); +tree.insert([0, 0, 0, 0, 0, 0, 2, 223], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171122); +tree.insert([0, 0, 0, 0, 0, 0, 0, 108], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171133); +tree.insert([0, 0, 0, 0, 0, 0, 1, 84], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171173); +tree.insert([0, 0, 0, 0, 0, 0, 0, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171174); +tree.insert([0, 0, 0, 0, 0, 0, 1, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171175); +tree.insert([0, 0, 0, 0, 0, 0, 2, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171180); +tree.flush_active_memtable(171101)?; +tree.compact(compaction.clone(), 171101)?; + +tree.insert([0, 0, 0, 0, 0, 0, 3, 6], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171201); +tree.insert([0, 0, 0, 0, 0, 0, 1, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171202); +tree.insert([0, 0, 0, 0, 0, 0, 3, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171203); +tree.insert([0, 0, 0, 0, 0, 0, 2, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171204); +tree.insert([0, 0, 0, 0, 0, 0, 0, 232], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171205); +tree.insert([0, 0, 0, 0, 0, 0, 3, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171206); +tree.insert([0, 0, 0, 0, 0, 0, 3, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171207); +tree.insert([0, 0, 0, 0, 0, 0, 3, 216], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171208); +tree.insert([0, 0, 0, 0, 0, 0, 1, 161], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171209); +tree.insert([0, 0, 0, 0, 0, 0, 0, 106], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171210); +tree.insert([0, 0, 0, 0, 0, 0, 0, 6], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171211); +tree.insert([0, 0, 0, 0, 0, 0, 2, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171212); +tree.insert([0, 0, 0, 0, 0, 0, 2, 229], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171213); +tree.insert([0, 0, 0, 0, 0, 0, 0, 210], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171214); +tree.insert([0, 0, 0, 0, 0, 0, 0, 65], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171215); +tree.insert([0, 0, 0, 0, 0, 0, 0, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171216); +tree.insert([0, 0, 0, 0, 0, 0, 1, 203], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171217); +tree.insert([0, 0, 0, 0, 0, 0, 2, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171218); +tree.insert([0, 0, 0, 0, 0, 0, 0, 209], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171219); 
+tree.insert([0, 0, 0, 0, 0, 0, 0, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171220); +tree.insert([0, 0, 0, 0, 0, 0, 3, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171221); +tree.insert([0, 0, 0, 0, 0, 0, 1, 46], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171223); +tree.insert([0, 0, 0, 0, 0, 0, 0, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171224); +tree.insert([0, 0, 0, 0, 0, 0, 1, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171225); +tree.insert([0, 0, 0, 0, 0, 0, 0, 37], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171226); +tree.insert([0, 0, 0, 0, 0, 0, 2, 200], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171227); +tree.insert([0, 0, 0, 0, 0, 0, 3, 200], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171228); +tree.insert([0, 0, 0, 0, 0, 0, 2, 46], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171229); +tree.insert([0, 0, 0, 0, 0, 0, 1, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171230); +tree.insert([0, 0, 0, 0, 0, 0, 2, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171231); +tree.insert([0, 0, 0, 0, 0, 0, 2, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171232); +tree.insert([0, 0, 0, 0, 0, 0, 2, 235], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171233); +tree.insert([0, 0, 0, 0, 0, 0, 0, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171234); +tree.insert([0, 0, 0, 0, 0, 0, 0, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171235); +tree.insert([0, 0, 0, 0, 0, 0, 1, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171236); +tree.insert([0, 0, 0, 0, 0, 0, 1, 171], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171237); +tree.insert([0, 0, 0, 0, 0, 0, 1, 242], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171238); +tree.insert([0, 0, 0, 0, 0, 0, 0, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171240); +tree.insert([0, 0, 0, 0, 0, 0, 1, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171241); +tree.insert([0, 0, 0, 0, 0, 0, 3, 143], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171242); +tree.insert([0, 0, 0, 0, 0, 0, 0, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171243); +tree.insert([0, 0, 0, 0, 0, 0, 3, 37], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171244); +tree.insert([0, 0, 0, 0, 0, 0, 0, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171245); +tree.insert([0, 0, 0, 0, 0, 0, 2, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171246); +tree.insert([0, 0, 0, 0, 0, 0, 3, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171247); +tree.insert([0, 0, 0, 0, 0, 0, 1, 135], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171248); +tree.insert([0, 0, 0, 0, 0, 0, 2, 103], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171249); +tree.insert([0, 0, 0, 0, 0, 0, 1, 3], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171250); +tree.insert([0, 0, 0, 0, 0, 0, 1, 245], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171251); +tree.insert([0, 0, 0, 0, 0, 0, 1, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171252); +tree.insert([0, 0, 0, 0, 0, 0, 0, 206], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171253); +tree.insert([0, 0, 0, 0, 0, 0, 0, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171254); +tree.insert([0, 0, 0, 0, 0, 0, 3, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171255); +tree.insert([0, 0, 0, 0, 0, 0, 2, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171256); 
+tree.insert([0, 0, 0, 0, 0, 0, 3, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171257); +tree.insert([0, 0, 0, 0, 0, 0, 0, 140], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171258); +tree.insert([0, 0, 0, 0, 0, 0, 2, 59], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171259); +tree.insert([0, 0, 0, 0, 0, 0, 2, 171], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171260); +tree.insert([0, 0, 0, 0, 0, 0, 0, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171262); +tree.insert([0, 0, 0, 0, 0, 0, 1, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171263); +tree.insert([0, 0, 0, 0, 0, 0, 3, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171264); +tree.insert([0, 0, 0, 0, 0, 0, 0, 161], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171265); +tree.insert([0, 0, 0, 0, 0, 0, 2, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171266); +tree.insert([0, 0, 0, 0, 0, 0, 2, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171267); +tree.insert([0, 0, 0, 0, 0, 0, 1, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171268); +tree.insert([0, 0, 0, 0, 0, 0, 3, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171269); +tree.insert([0, 0, 0, 0, 0, 0, 3, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171270); +tree.insert([0, 0, 0, 0, 0, 0, 1, 241], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171271); +tree.insert([0, 0, 0, 0, 0, 0, 1, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171272); +tree.insert([0, 0, 0, 0, 0, 0, 2, 104], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171273); +tree.insert([0, 0, 0, 0, 0, 0, 0, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171274); +tree.insert([0, 0, 0, 0, 0, 0, 3, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171275); +tree.insert([0, 0, 0, 0, 0, 0, 0, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171276); +tree.insert([0, 0, 0, 0, 0, 0, 0, 101], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171277); +tree.insert([0, 0, 0, 0, 0, 0, 0, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171278); +tree.insert([0, 0, 0, 0, 0, 0, 0, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171279); +tree.insert([0, 0, 0, 0, 0, 0, 3, 161], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171280); +tree.insert([0, 0, 0, 0, 0, 0, 1, 122], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171281); +tree.insert([0, 0, 0, 0, 0, 0, 0, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171282); +tree.insert([0, 0, 0, 0, 0, 0, 3, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171283); +tree.insert([0, 0, 0, 0, 0, 0, 0, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171284); +tree.insert([0, 0, 0, 0, 0, 0, 0, 185], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171285); +tree.insert([0, 0, 0, 0, 0, 0, 2, 56], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171286); +tree.insert([0, 0, 0, 0, 0, 0, 0, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171287); +tree.insert([0, 0, 0, 0, 0, 0, 1, 216], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171288); +tree.insert([0, 0, 0, 0, 0, 0, 2, 185], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171289); +tree.insert([0, 0, 0, 0, 0, 0, 0, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171290); +tree.insert([0, 0, 0, 0, 0, 0, 2, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171291); +tree.insert([0, 0, 0, 0, 0, 0, 1, 236], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171292); 
+tree.insert([0, 0, 0, 0, 0, 0, 0, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171293); +tree.insert([0, 0, 0, 0, 0, 0, 3, 123], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171294); +tree.insert([0, 0, 0, 0, 0, 0, 0, 238], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171295); +tree.insert([0, 0, 0, 0, 0, 0, 0, 171], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171296); +tree.insert([0, 0, 0, 0, 0, 0, 0, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171297); +tree.insert([0, 0, 0, 0, 0, 0, 3, 81], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171298); +tree.insert([0, 0, 0, 0, 0, 0, 0, 76], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171300); +tree.insert([0, 0, 0, 0, 0, 0, 2, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171301); +tree.insert([0, 0, 0, 0, 0, 0, 3, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171302); +tree.insert([0, 0, 0, 0, 0, 0, 3, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171303); +tree.insert([0, 0, 0, 0, 0, 0, 2, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171304); +tree.insert([0, 0, 0, 0, 0, 0, 0, 218], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171305); +tree.insert([0, 0, 0, 0, 0, 0, 3, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171306); +tree.insert([0, 0, 0, 0, 0, 0, 3, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171307); +tree.insert([0, 0, 0, 0, 0, 0, 2, 94], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171308); +tree.insert([0, 0, 0, 0, 0, 0, 0, 207], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171309); +tree.insert([0, 0, 0, 0, 0, 0, 0, 92], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171310); +tree.insert([0, 0, 0, 0, 0, 0, 0, 143], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171311); +tree.insert([0, 0, 0, 0, 0, 0, 0, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171312); +tree.insert([0, 0, 0, 0, 0, 0, 1, 168], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171313); +tree.insert([0, 0, 0, 0, 0, 0, 1, 232], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171314); +tree.insert([0, 0, 0, 0, 0, 0, 2, 122], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171315); +tree.insert([0, 0, 0, 0, 0, 0, 1, 108], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171316); +tree.insert([0, 0, 0, 0, 0, 0, 3, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171317); +tree.insert([0, 0, 0, 0, 0, 0, 0, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171318); +tree.insert([0, 0, 0, 0, 0, 0, 3, 106], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171320); +tree.insert([0, 0, 0, 0, 0, 0, 2, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171321); +tree.insert([0, 0, 0, 0, 0, 0, 3, 106], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171322); +tree.insert([0, 0, 0, 0, 0, 0, 0, 91], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171323); +tree.insert([0, 0, 0, 0, 0, 0, 3, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171324); +tree.insert([0, 0, 0, 0, 0, 0, 0, 73], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171325); +tree.insert([0, 0, 0, 0, 0, 0, 3, 157], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171326); +tree.insert([0, 0, 0, 0, 0, 0, 0, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171327); +tree.insert([0, 0, 0, 0, 0, 0, 3, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171328); +tree.insert([0, 0, 0, 0, 0, 0, 1, 59], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171329); 
+tree.insert([0, 0, 0, 0, 0, 0, 1, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171330); +tree.insert([0, 0, 0, 0, 0, 0, 1, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171331); +tree.insert([0, 0, 0, 0, 0, 0, 0, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171332); +tree.insert([0, 0, 0, 0, 0, 0, 0, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171333); +tree.insert([0, 0, 0, 0, 0, 0, 3, 136], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171334); +tree.insert([0, 0, 0, 0, 0, 0, 2, 61], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171336); +tree.insert([0, 0, 0, 0, 0, 0, 1, 251], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171337); +tree.insert([0, 0, 0, 0, 0, 0, 1, 243], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171338); +tree.insert([0, 0, 0, 0, 0, 0, 1, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171339); +tree.insert([0, 0, 0, 0, 0, 0, 0, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171340); +tree.insert([0, 0, 0, 0, 0, 0, 3, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171341); +tree.insert([0, 0, 0, 0, 0, 0, 2, 139], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171343); +tree.insert([0, 0, 0, 0, 0, 0, 2, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171344); +tree.insert([0, 0, 0, 0, 0, 0, 3, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171345); +tree.insert([0, 0, 0, 0, 0, 0, 3, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171346); +tree.insert([0, 0, 0, 0, 0, 0, 1, 235], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171347); +tree.insert([0, 0, 0, 0, 0, 0, 0, 245], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171348); +tree.insert([0, 0, 0, 0, 0, 0, 2, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171349); +tree.insert([0, 0, 0, 0, 0, 0, 1, 24], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171350); +tree.insert([0, 0, 0, 0, 0, 0, 2, 253], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171351); +tree.insert([0, 0, 0, 0, 0, 0, 3, 75], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171352); +tree.insert([0, 0, 0, 0, 0, 0, 2, 177], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171353); +tree.insert([0, 0, 0, 0, 0, 0, 1, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171354); +tree.insert([0, 0, 0, 0, 0, 0, 3, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171355); +tree.insert([0, 0, 0, 0, 0, 0, 2, 147], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171356); +tree.insert([0, 0, 0, 0, 0, 0, 1, 61], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171357); +tree.insert([0, 0, 0, 0, 0, 0, 0, 251], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171358); +tree.insert([0, 0, 0, 0, 0, 0, 2, 221], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171359); +tree.insert([0, 0, 0, 0, 0, 0, 1, 147], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171360); +tree.insert([0, 0, 0, 0, 0, 0, 3, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171361); +tree.insert([0, 0, 0, 0, 0, 0, 3, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171362); +tree.insert([0, 0, 0, 0, 0, 0, 1, 31], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171363); +tree.insert([0, 0, 0, 0, 0, 0, 2, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171364); +tree.insert([0, 0, 0, 0, 0, 0, 2, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171365); +tree.insert([0, 0, 0, 0, 0, 0, 2, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
171366); +tree.insert([0, 0, 0, 0, 0, 0, 2, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171367); +tree.insert([0, 0, 0, 0, 0, 0, 2, 88], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171368); +tree.insert([0, 0, 0, 0, 0, 0, 0, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171369); +tree.insert([0, 0, 0, 0, 0, 0, 1, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171370); +tree.insert([0, 0, 0, 0, 0, 0, 2, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171371); +tree.insert([0, 0, 0, 0, 0, 0, 3, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171372); +tree.insert([0, 0, 0, 0, 0, 0, 1, 251], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171373); +tree.insert([0, 0, 0, 0, 0, 0, 2, 207], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171374); +tree.insert([0, 0, 0, 0, 0, 0, 1, 127], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171375); +tree.insert([0, 0, 0, 0, 0, 0, 0, 193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171376); +tree.insert([0, 0, 0, 0, 0, 0, 3, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171377); +tree.insert([0, 0, 0, 0, 0, 0, 3, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171378); +tree.insert([0, 0, 0, 0, 0, 0, 2, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171379); +tree.insert([0, 0, 0, 0, 0, 0, 3, 176], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171380); +tree.insert([0, 0, 0, 0, 0, 0, 3, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171381); +tree.insert([0, 0, 0, 0, 0, 0, 2, 230], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171382); +tree.insert([0, 0, 0, 0, 0, 0, 3, 31], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171383); +tree.insert([0, 0, 0, 0, 0, 0, 0, 129], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171385); +tree.insert([0, 0, 0, 0, 0, 0, 2, 201], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171387); +tree.insert([0, 0, 0, 0, 0, 0, 0, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171388); +tree.insert([0, 0, 0, 0, 0, 0, 0, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171389); +tree.insert([0, 0, 0, 0, 0, 0, 1, 210], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171390); +tree.insert([0, 0, 0, 0, 0, 0, 3, 92], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171391); +tree.insert([0, 0, 0, 0, 0, 0, 3, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171392); +tree.insert([0, 0, 0, 0, 0, 0, 2, 91], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171393); +tree.insert([0, 0, 0, 0, 0, 0, 3, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171394); +tree.insert([0, 0, 0, 0, 0, 0, 1, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171395); +tree.insert([0, 0, 0, 0, 0, 0, 2, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171396); +tree.insert([0, 0, 0, 0, 0, 0, 2, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171397); +tree.insert([0, 0, 0, 0, 0, 0, 1, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171398); +tree.insert([0, 0, 0, 0, 0, 0, 1, 128], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171400); +tree.flush_active_memtable(171301)?; +tree.insert([0, 0, 0, 0, 0, 0, 3, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171401); +tree.insert([0, 0, 0, 0, 0, 0, 3, 2], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171402); +tree.insert([0, 0, 0, 0, 0, 0, 1, 139], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171403); +tree.insert([0, 0, 0, 0, 0, 0, 0, 210], [104, 101, 
108, 108, 111, 104, 101, 108, 108, 111], 171404); +tree.insert([0, 0, 0, 0, 0, 0, 0, 183], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171405); +tree.insert([0, 0, 0, 0, 0, 0, 0, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171406); +tree.insert([0, 0, 0, 0, 0, 0, 1, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171407); +tree.insert([0, 0, 0, 0, 0, 0, 2, 249], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171408); +tree.insert([0, 0, 0, 0, 0, 0, 3, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171409); +tree.insert([0, 0, 0, 0, 0, 0, 0, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171410); +tree.insert([0, 0, 0, 0, 0, 0, 1, 240], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171411); +tree.insert([0, 0, 0, 0, 0, 0, 0, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171412); +tree.insert([0, 0, 0, 0, 0, 0, 2, 161], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171413); +tree.insert([0, 0, 0, 0, 0, 0, 0, 219], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171414); +tree.insert([0, 0, 0, 0, 0, 0, 1, 200], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171415); +tree.insert([0, 0, 0, 0, 0, 0, 1, 73], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171417); +tree.insert([0, 0, 0, 0, 0, 0, 0, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171418); +tree.insert([0, 0, 0, 0, 0, 0, 0, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171419); +tree.insert([0, 0, 0, 0, 0, 0, 0, 3], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171420); +tree.insert([0, 0, 0, 0, 0, 0, 1, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171421); +tree.insert([0, 0, 0, 0, 0, 0, 0, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171422); +tree.insert([0, 0, 0, 0, 0, 0, 0, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171425); +tree.insert([0, 0, 0, 0, 0, 0, 0, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171426); +tree.insert([0, 0, 0, 0, 0, 0, 1, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171427); +tree.insert([0, 0, 0, 0, 0, 0, 1, 242], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171428); +tree.insert([0, 0, 0, 0, 0, 0, 2, 177], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171431); +tree.insert([0, 0, 0, 0, 0, 0, 1, 87], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171433); +tree.insert([0, 0, 0, 0, 0, 0, 0, 176], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171434); +tree.insert([0, 0, 0, 0, 0, 0, 0, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171436); +tree.insert([0, 0, 0, 0, 0, 0, 1, 15], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171437); +tree.insert([0, 0, 0, 0, 0, 0, 1, 110], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171438); +tree.insert([0, 0, 0, 0, 0, 0, 1, 197], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171439); +tree.insert([0, 0, 0, 0, 0, 0, 0, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171440); +tree.insert([0, 0, 0, 0, 0, 0, 1, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171443); +tree.insert([0, 0, 0, 0, 0, 0, 1, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171444); +tree.insert([0, 0, 0, 0, 0, 0, 3, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171445); +tree.insert([0, 0, 0, 0, 0, 0, 0, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171446); +tree.insert([0, 0, 0, 0, 0, 0, 3, 31], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171449); +tree.insert([0, 0, 0, 0, 0, 0, 3, 90], [104, 101, 
108, 108, 111, 104, 101, 108, 108, 111], 171455); +tree.insert([0, 0, 0, 0, 0, 0, 0, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171456); +tree.insert([0, 0, 0, 0, 0, 0, 2, 157], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171457); +tree.insert([0, 0, 0, 0, 0, 0, 2, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171458); +tree.insert([0, 0, 0, 0, 0, 0, 3, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171459); +tree.insert([0, 0, 0, 0, 0, 0, 3, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171460); +tree.insert([0, 0, 0, 0, 0, 0, 0, 163], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171461); +tree.insert([0, 0, 0, 0, 0, 0, 3, 32], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171462); +tree.insert([0, 0, 0, 0, 0, 0, 2, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171463); +tree.insert([0, 0, 0, 0, 0, 0, 0, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171464); +tree.insert([0, 0, 0, 0, 0, 0, 3, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171465); +tree.insert([0, 0, 0, 0, 0, 0, 2, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171466); +tree.insert([0, 0, 0, 0, 0, 0, 3, 88], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171467); +tree.insert([0, 0, 0, 0, 0, 0, 3, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171468); +tree.insert([0, 0, 0, 0, 0, 0, 1, 148], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171469); +tree.insert([0, 0, 0, 0, 0, 0, 3, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171470); +tree.insert([0, 0, 0, 0, 0, 0, 2, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171471); +tree.insert([0, 0, 0, 0, 0, 0, 1, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171472); +tree.insert([0, 0, 0, 0, 0, 0, 2, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171473); +tree.insert([0, 0, 0, 0, 0, 0, 1, 84], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171474); +tree.insert([0, 0, 0, 0, 0, 0, 0, 84], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171475); +tree.insert([0, 0, 0, 0, 0, 0, 2, 225], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171476); +tree.insert([0, 0, 0, 0, 0, 0, 1, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171478); +tree.insert([0, 0, 0, 0, 0, 0, 0, 134], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171479); +tree.insert([0, 0, 0, 0, 0, 0, 3, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171480); +tree.insert([0, 0, 0, 0, 0, 0, 3, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171481); +tree.insert([0, 0, 0, 0, 0, 0, 0, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171482); +tree.insert([0, 0, 0, 0, 0, 0, 3, 61], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171483); +tree.insert([0, 0, 0, 0, 0, 0, 3, 24], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171484); +tree.insert([0, 0, 0, 0, 0, 0, 3, 141], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171485); +tree.insert([0, 0, 0, 0, 0, 0, 2, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171486); +tree.insert([0, 0, 0, 0, 0, 0, 2, 90], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171488); +tree.insert([0, 0, 0, 0, 0, 0, 0, 255], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171489); +tree.insert([0, 0, 0, 0, 0, 0, 0, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171491); +tree.insert([0, 0, 0, 0, 0, 0, 0, 234], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171492); +tree.insert([0, 0, 0, 0, 0, 0, 2, 130], [104, 101, 
108, 108, 111, 104, 101, 108, 108, 111], 171493); +tree.insert([0, 0, 0, 0, 0, 0, 0, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171494); +tree.insert([0, 0, 0, 0, 0, 0, 2, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171495); +tree.insert([0, 0, 0, 0, 0, 0, 3, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171496); +tree.insert([0, 0, 0, 0, 0, 0, 3, 38], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171497); +tree.insert([0, 0, 0, 0, 0, 0, 2, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171498); +tree.insert([0, 0, 0, 0, 0, 0, 1, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171499); +tree.insert([0, 0, 0, 0, 0, 0, 0, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171500); +tree.insert([0, 0, 0, 0, 0, 0, 3, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171502); +tree.insert([0, 0, 0, 0, 0, 0, 0, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171503); +tree.insert([0, 0, 0, 0, 0, 0, 3, 132], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171504); +tree.insert([0, 0, 0, 0, 0, 0, 1, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171505); +tree.insert([0, 0, 0, 0, 0, 0, 0, 20], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171506); +tree.insert([0, 0, 0, 0, 0, 0, 1, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171507); +tree.insert([0, 0, 0, 0, 0, 0, 3, 100], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171508); +tree.insert([0, 0, 0, 0, 0, 0, 3, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171509); +tree.insert([0, 0, 0, 0, 0, 0, 0, 242], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171511); +tree.insert([0, 0, 0, 0, 0, 0, 2, 230], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171512); +tree.insert([0, 0, 0, 0, 0, 0, 3, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171514); +tree.insert([0, 0, 0, 0, 0, 0, 1, 239], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171515); +tree.insert([0, 0, 0, 0, 0, 0, 0, 83], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171516); +tree.insert([0, 0, 0, 0, 0, 0, 3, 72], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171517); +tree.insert([0, 0, 0, 0, 0, 0, 1, 108], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171518); +tree.insert([0, 0, 0, 0, 0, 0, 0, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171519); +tree.insert([0, 0, 0, 0, 0, 0, 2, 239], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171520); +tree.insert([0, 0, 0, 0, 0, 0, 2, 3], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171521); +tree.insert([0, 0, 0, 0, 0, 0, 0, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171522); +tree.insert([0, 0, 0, 0, 0, 0, 2, 215], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171524); +tree.insert([0, 0, 0, 0, 0, 0, 2, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171525); +tree.insert([0, 0, 0, 0, 0, 0, 2, 209], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171526); +tree.insert([0, 0, 0, 0, 0, 0, 0, 227], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171527); +tree.insert([0, 0, 0, 0, 0, 0, 1, 116], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171528); +tree.insert([0, 0, 0, 0, 0, 0, 3, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171530); +tree.insert([0, 0, 0, 0, 0, 0, 1, 214], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171531); +tree.insert([0, 0, 0, 0, 0, 0, 3, 177], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171532); +tree.insert([0, 0, 0, 0, 0, 0, 0, 147], [104, 
101, 108, 108, 111, 104, 101, 108, 108, 111], 171533); +tree.insert([0, 0, 0, 0, 0, 0, 1, 79], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171534); +tree.insert([0, 0, 0, 0, 0, 0, 0, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171535); +tree.insert([0, 0, 0, 0, 0, 0, 3, 176], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171536); +tree.insert([0, 0, 0, 0, 0, 0, 3, 86], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171537); +tree.insert([0, 0, 0, 0, 0, 0, 3, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171538); +tree.insert([0, 0, 0, 0, 0, 0, 1, 173], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171539); +tree.insert([0, 0, 0, 0, 0, 0, 0, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171540); +tree.insert([0, 0, 0, 0, 0, 0, 1, 157], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171541); +tree.insert([0, 0, 0, 0, 0, 0, 2, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171542); +tree.insert([0, 0, 0, 0, 0, 0, 2, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171545); +tree.insert([0, 0, 0, 0, 0, 0, 0, 254], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171546); +tree.insert([0, 0, 0, 0, 0, 0, 1, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171548); +tree.insert([0, 0, 0, 0, 0, 0, 3, 123], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171550); +tree.insert([0, 0, 0, 0, 0, 0, 2, 158], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171551); +tree.insert([0, 0, 0, 0, 0, 0, 3, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171552); +tree.insert([0, 0, 0, 0, 0, 0, 1, 149], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171553); +tree.insert([0, 0, 0, 0, 0, 0, 3, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171555); +tree.insert([0, 0, 0, 0, 0, 0, 3, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171556); +tree.insert([0, 0, 0, 0, 0, 0, 2, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171557); +tree.insert([0, 0, 0, 0, 0, 0, 2, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171561); +tree.insert([0, 0, 0, 0, 0, 0, 1, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171563); +tree.insert([0, 0, 0, 0, 0, 0, 0, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171564); +tree.insert([0, 0, 0, 0, 0, 0, 0, 193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171565); +tree.insert([0, 0, 0, 0, 0, 0, 1, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171568); +tree.insert([0, 0, 0, 0, 0, 0, 0, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171570); +tree.insert([0, 0, 0, 0, 0, 0, 1, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171571); +tree.insert([0, 0, 0, 0, 0, 0, 2, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171572); +tree.insert([0, 0, 0, 0, 0, 0, 0, 165], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171573); +tree.insert([0, 0, 0, 0, 0, 0, 0, 141], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171574); +tree.insert([0, 0, 0, 0, 0, 0, 2, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171575); +tree.insert([0, 0, 0, 0, 0, 0, 2, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171576); +tree.insert([0, 0, 0, 0, 0, 0, 1, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171577); +tree.insert([0, 0, 0, 0, 0, 0, 0, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171578); +tree.insert([0, 0, 0, 0, 0, 0, 2, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171581); +tree.insert([0, 0, 0, 0, 0, 0, 1, 114], [104, 
101, 108, 108, 111, 104, 101, 108, 108, 111], 171582); +tree.insert([0, 0, 0, 0, 0, 0, 2, 207], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171583); +tree.insert([0, 0, 0, 0, 0, 0, 1, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171584); +tree.insert([0, 0, 0, 0, 0, 0, 1, 231], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171585); +tree.insert([0, 0, 0, 0, 0, 0, 0, 189], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171586); +tree.insert([0, 0, 0, 0, 0, 0, 1, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171589); +tree.insert([0, 0, 0, 0, 0, 0, 2, 92], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171592); +tree.insert([0, 0, 0, 0, 0, 0, 0, 73], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171593); +tree.insert([0, 0, 0, 0, 0, 0, 0, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171595); +tree.insert([0, 0, 0, 0, 0, 0, 0, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171596); +tree.insert([0, 0, 0, 0, 0, 0, 2, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171597); +tree.insert([0, 0, 0, 0, 0, 0, 0, 32], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171599); +tree.flush_active_memtable(171501)?; +tree.insert([0, 0, 0, 0, 0, 0, 1, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171601); +tree.insert([0, 0, 0, 0, 0, 0, 0, 77], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171602); +tree.insert([0, 0, 0, 0, 0, 0, 0, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171604); +tree.insert([0, 0, 0, 0, 0, 0, 1, 94], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171606); +tree.insert([0, 0, 0, 0, 0, 0, 2, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171607); +tree.insert([0, 0, 0, 0, 0, 0, 3, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171608); +tree.insert([0, 0, 0, 0, 0, 0, 2, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171609); +tree.insert([0, 0, 0, 0, 0, 0, 2, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171613); +tree.insert([0, 0, 0, 0, 0, 0, 2, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171614); +tree.insert([0, 0, 0, 0, 0, 0, 1, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171615); +tree.insert([0, 0, 0, 0, 0, 0, 1, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171616); +tree.insert([0, 0, 0, 0, 0, 0, 0, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171618); +tree.insert([0, 0, 0, 0, 0, 0, 0, 249], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171621); +tree.insert([0, 0, 0, 0, 0, 0, 2, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171622); +tree.insert([0, 0, 0, 0, 0, 0, 1, 156], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171623); +tree.insert([0, 0, 0, 0, 0, 0, 2, 73], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171624); +tree.insert([0, 0, 0, 0, 0, 0, 2, 19], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171628); +tree.insert([0, 0, 0, 0, 0, 0, 2, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171629); +tree.insert([0, 0, 0, 0, 0, 0, 2, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171633); +tree.insert([0, 0, 0, 0, 0, 0, 0, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171635); +tree.insert([0, 0, 0, 0, 0, 0, 2, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171636); +tree.insert([0, 0, 0, 0, 0, 0, 3, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171637); +tree.insert([0, 0, 0, 0, 0, 0, 2, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171638); +tree.insert([0, 
0, 0, 0, 0, 0, 0, 206], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171639); +tree.insert([0, 0, 0, 0, 0, 0, 0, 29], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171643); +tree.insert([0, 0, 0, 0, 0, 0, 1, 133], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171644); +tree.insert([0, 0, 0, 0, 0, 0, 0, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171645); +tree.insert([0, 0, 0, 0, 0, 0, 1, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171646); +tree.insert([0, 0, 0, 0, 0, 0, 0, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171647); +tree.insert([0, 0, 0, 0, 0, 0, 0, 166], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171649); +tree.insert([0, 0, 0, 0, 0, 0, 1, 168], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171651); +tree.insert([0, 0, 0, 0, 0, 0, 1, 202], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171653); +tree.insert([0, 0, 0, 0, 0, 0, 1, 229], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171657); +tree.insert([0, 0, 0, 0, 0, 0, 2, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171658); +tree.insert([0, 0, 0, 0, 0, 0, 0, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171659); +tree.insert([0, 0, 0, 0, 0, 0, 1, 6], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171662); +tree.insert([0, 0, 0, 0, 0, 0, 2, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171663); +tree.insert([0, 0, 0, 0, 0, 0, 1, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171666); +tree.insert([0, 0, 0, 0, 0, 0, 2, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171667); +tree.insert([0, 0, 0, 0, 0, 0, 2, 105], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171675); +tree.insert([0, 0, 0, 0, 0, 0, 1, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171678); +tree.insert([0, 0, 0, 0, 0, 0, 0, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171679); +tree.insert([0, 0, 0, 0, 0, 0, 1, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171681); +tree.insert([0, 0, 0, 0, 0, 0, 1, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171686); +tree.insert([0, 0, 0, 0, 0, 0, 0, 81], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171687); +tree.insert([0, 0, 0, 0, 0, 0, 1, 254], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171692); +tree.insert([0, 0, 0, 0, 0, 0, 0, 35], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171695); +tree.insert([0, 0, 0, 0, 0, 0, 1, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171699); +tree.insert([0, 0, 0, 0, 0, 0, 1, 247], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171700); +tree.insert([0, 0, 0, 0, 0, 0, 1, 174], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171703); +tree.insert([0, 0, 0, 0, 0, 0, 0, 215], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171705); +tree.insert([0, 0, 0, 0, 0, 0, 2, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171707); +tree.insert([0, 0, 0, 0, 0, 0, 0, 253], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171717); +tree.insert([0, 0, 0, 0, 0, 0, 1, 70], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171723); +tree.insert([0, 0, 0, 0, 0, 0, 2, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171724); +tree.insert([0, 0, 0, 0, 0, 0, 0, 156], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171725); +tree.insert([0, 0, 0, 0, 0, 0, 1, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171728); +tree.insert([0, 0, 0, 0, 0, 0, 1, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171733); +tree.insert([0, 
0, 0, 0, 0, 0, 0, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171735); +tree.insert([0, 0, 0, 0, 0, 0, 0, 72], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171740); +tree.insert([0, 0, 0, 0, 0, 0, 2, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171742); +tree.insert([0, 0, 0, 0, 0, 0, 1, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171746); +tree.insert([0, 0, 0, 0, 0, 0, 0, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171747); +tree.insert([0, 0, 0, 0, 0, 0, 2, 166], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171751); +tree.insert([0, 0, 0, 0, 0, 0, 0, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171752); +tree.insert([0, 0, 0, 0, 0, 0, 1, 26], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171754); +tree.insert([0, 0, 0, 0, 0, 0, 2, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171763); +tree.insert([0, 0, 0, 0, 0, 0, 1, 248], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171764); +tree.insert([0, 0, 0, 0, 0, 0, 2, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171765); +tree.insert([0, 0, 0, 0, 0, 0, 0, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171767); +tree.insert([0, 0, 0, 0, 0, 0, 0, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171768); +tree.insert([0, 0, 0, 0, 0, 0, 0, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171770); +tree.insert([0, 0, 0, 0, 0, 0, 1, 180], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171771); +tree.insert([0, 0, 0, 0, 0, 0, 0, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171772); +tree.insert([0, 0, 0, 0, 0, 0, 2, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171777); +tree.insert([0, 0, 0, 0, 0, 0, 1, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171782); +tree.insert([0, 0, 0, 0, 0, 0, 2, 151], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171785); +tree.insert([0, 0, 0, 0, 0, 0, 1, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171786); +tree.insert([0, 0, 0, 0, 0, 0, 0, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171787); +tree.insert([0, 0, 0, 0, 0, 0, 2, 186], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171788); +tree.insert([0, 0, 0, 0, 0, 0, 2, 134], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171792); +tree.insert([0, 0, 0, 0, 0, 0, 2, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171793); +tree.insert([0, 0, 0, 0, 0, 0, 1, 123], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171794); +tree.insert([0, 0, 0, 0, 0, 0, 1, 193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171797); +tree.insert([0, 0, 0, 0, 0, 0, 1, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171799); +tree.flush_active_memtable(171701)?; +tree.insert([0, 0, 0, 0, 0, 0, 1, 8], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171805); +tree.insert([0, 0, 0, 0, 0, 0, 2, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171806); +tree.insert([0, 0, 0, 0, 0, 0, 2, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171809); +tree.insert([0, 0, 0, 0, 0, 0, 2, 102], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171810); +tree.insert([0, 0, 0, 0, 0, 0, 0, 228], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171811); +tree.insert([0, 0, 0, 0, 0, 0, 1, 67], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171812); +tree.insert([0, 0, 0, 0, 0, 0, 1, 189], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171813); +tree.insert([0, 0, 0, 0, 0, 0, 2, 84], [104, 101, 108, 108, 111, 104, 101, 108, 
108, 111], 171814); +tree.insert([0, 0, 0, 0, 0, 0, 1, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171815); +tree.insert([0, 0, 0, 0, 0, 0, 0, 204], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171819); +tree.insert([0, 0, 0, 0, 0, 0, 1, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171821); +tree.insert([0, 0, 0, 0, 0, 0, 1, 212], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171823); +tree.insert([0, 0, 0, 0, 0, 0, 2, 164], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171824); +tree.insert([0, 0, 0, 0, 0, 0, 1, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171825); +tree.insert([0, 0, 0, 0, 0, 0, 1, 104], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171831); +tree.insert([0, 0, 0, 0, 0, 0, 1, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171833); +tree.insert([0, 0, 0, 0, 0, 0, 0, 237], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171834); +tree.insert([0, 0, 0, 0, 0, 0, 2, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171836); +tree.insert([0, 0, 0, 0, 0, 0, 2, 173], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171837); +tree.insert([0, 0, 0, 0, 0, 0, 2, 144], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171838); +tree.insert([0, 0, 0, 0, 0, 0, 2, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171848); +tree.insert([0, 0, 0, 0, 0, 0, 1, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171850); +tree.insert([0, 0, 0, 0, 0, 0, 2, 168], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171856); +tree.insert([0, 0, 0, 0, 0, 0, 1, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171862); +tree.insert([0, 0, 0, 0, 0, 0, 0, 203], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171863); +tree.insert([0, 0, 0, 0, 0, 0, 0, 105], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171865); +tree.insert([0, 0, 0, 0, 0, 0, 2, 116], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171866); +tree.insert([0, 0, 0, 0, 0, 0, 1, 164], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171868); +tree.insert([0, 0, 0, 0, 0, 0, 1, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171873); +tree.insert([0, 0, 0, 0, 0, 0, 0, 247], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171875); +tree.insert([0, 0, 0, 0, 0, 0, 1, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171877); +tree.insert([0, 0, 0, 0, 0, 0, 0, 190], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171882); +tree.insert([0, 0, 0, 0, 0, 0, 0, 90], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171885); +tree.insert([0, 0, 0, 0, 0, 0, 1, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171890); +tree.insert([0, 0, 0, 0, 0, 0, 2, 66], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171891); +tree.insert([0, 0, 0, 0, 0, 0, 1, 132], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171892); +tree.insert([0, 0, 0, 0, 0, 0, 0, 36], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171895); +tree.insert([0, 0, 0, 0, 0, 0, 0, 122], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171898); +tree.insert([0, 0, 0, 0, 0, 0, 0, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171899); +tree.insert([0, 0, 0, 0, 0, 0, 0, 62], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171901); +tree.insert([0, 0, 0, 0, 0, 0, 2, 129], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171902); +tree.insert([0, 0, 0, 0, 0, 0, 0, 139], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171905); +tree.insert([0, 0, 0, 0, 0, 0, 1, 166], [104, 101, 108, 108, 111, 104, 101, 
108, 108, 111], 171907); +tree.insert([0, 0, 0, 0, 0, 0, 0, 145], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171908); +tree.insert([0, 0, 0, 0, 0, 0, 1, 20], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171911); +tree.insert([0, 0, 0, 0, 0, 0, 2, 100], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171917); +tree.insert([0, 0, 0, 0, 0, 0, 0, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171918); +tree.insert([0, 0, 0, 0, 0, 0, 1, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171921); +tree.insert([0, 0, 0, 0, 0, 0, 2, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171922); +tree.insert([0, 0, 0, 0, 0, 0, 2, 149], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171925); +tree.insert([0, 0, 0, 0, 0, 0, 1, 246], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171926); +tree.insert([0, 0, 0, 0, 0, 0, 1, 95], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171928); +tree.insert([0, 0, 0, 0, 0, 0, 0, 76], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171931); +tree.insert([0, 0, 0, 0, 0, 0, 1, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171936); +tree.insert([0, 0, 0, 0, 0, 0, 1, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171937); +tree.insert([0, 0, 0, 0, 0, 0, 1, 158], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171947); +tree.insert([0, 0, 0, 0, 0, 0, 2, 51], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171948); +tree.insert([0, 0, 0, 0, 0, 0, 0, 51], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171950); +tree.insert([0, 0, 0, 0, 0, 0, 0, 144], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171958); +tree.insert([0, 0, 0, 0, 0, 0, 0, 244], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171959); +tree.insert([0, 0, 0, 0, 0, 0, 1, 109], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171963); +tree.insert([0, 0, 0, 0, 0, 0, 0, 59], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171965); +tree.insert([0, 0, 0, 0, 0, 0, 0, 76], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171975); +tree.insert([0, 0, 0, 0, 0, 0, 0, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171976); +tree.insert([0, 0, 0, 0, 0, 0, 1, 163], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171977); +tree.insert([0, 0, 0, 0, 0, 0, 1, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171985); +tree.insert([0, 0, 0, 0, 0, 0, 1, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171987); +tree.insert([0, 0, 0, 0, 0, 0, 1, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171993); +tree.insert([0, 0, 0, 0, 0, 0, 1, 208], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171997); +tree.flush_active_memtable(171901)?; +tree.compact(compaction.clone(), 171901)?; + + +tree.insert([0, 0, 0, 0, 0, 0, 3, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172007); +tree.insert([0, 0, 0, 0, 0, 0, 1, 37], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172008); +tree.insert([0, 0, 0, 0, 0, 0, 3, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172009); +tree.insert([0, 0, 0, 0, 0, 0, 3, 14], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172010); +tree.insert([0, 0, 0, 0, 0, 0, 2, 39], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172011); +tree.insert([0, 0, 0, 0, 0, 0, 2, 147], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172012); +tree.insert([0, 0, 0, 0, 0, 0, 2, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172013); +tree.insert([0, 0, 0, 0, 0, 0, 0, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
172014); +tree.insert([0, 0, 0, 0, 0, 0, 1, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172015); +tree.insert([0, 0, 0, 0, 0, 0, 2, 57], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172016); +tree.insert([0, 0, 0, 0, 0, 0, 0, 127], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172017); +tree.insert([0, 0, 0, 0, 0, 0, 1, 72], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172018); +tree.insert([0, 0, 0, 0, 0, 0, 3, 0], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172020); +tree.insert([0, 0, 0, 0, 0, 0, 1, 213], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172023); +tree.insert([0, 0, 0, 0, 0, 0, 0, 51], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172024); +tree.insert([0, 0, 0, 0, 0, 0, 2, 139], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172025); +tree.insert([0, 0, 0, 0, 0, 0, 1, 235], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172026); +tree.insert([0, 0, 0, 0, 0, 0, 2, 160], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172027); +tree.insert([0, 0, 0, 0, 0, 0, 2, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172028); +tree.insert([0, 0, 0, 0, 0, 0, 1, 194], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172029); +tree.insert([0, 0, 0, 0, 0, 0, 1, 105], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172031); +tree.insert([0, 0, 0, 0, 0, 0, 1, 59], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172033); +tree.insert([0, 0, 0, 0, 0, 0, 2, 85], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172035); +tree.insert([0, 0, 0, 0, 0, 0, 1, 19], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172036); +tree.insert([0, 0, 0, 0, 0, 0, 2, 29], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172037); +tree.insert([0, 0, 0, 0, 0, 0, 2, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172038); +tree.insert([0, 0, 0, 0, 0, 0, 1, 18], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172039); +tree.insert([0, 0, 0, 0, 0, 0, 0, 0], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172043); +tree.insert([0, 0, 0, 0, 0, 0, 0, 144], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172044); +tree.insert([0, 0, 0, 0, 0, 0, 1, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172045); +tree.insert([0, 0, 0, 0, 0, 0, 1, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172046); +tree.insert([0, 0, 0, 0, 0, 0, 3, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172047); +tree.insert([0, 0, 0, 0, 0, 0, 2, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172048); +tree.insert([0, 0, 0, 0, 0, 0, 1, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172049); +tree.insert([0, 0, 0, 0, 0, 0, 2, 103], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172050); +tree.insert([0, 0, 0, 0, 0, 0, 1, 231], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172051); +tree.insert([0, 0, 0, 0, 0, 0, 2, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172052); +tree.insert([0, 0, 0, 0, 0, 0, 3, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172053); +tree.insert([0, 0, 0, 0, 0, 0, 1, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172054); +tree.insert([0, 0, 0, 0, 0, 0, 1, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172056); +tree.insert([0, 0, 0, 0, 0, 0, 1, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172057); +tree.insert([0, 0, 0, 0, 0, 0, 1, 212], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172058); +tree.insert([0, 0, 0, 0, 0, 0, 0, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
172059); +tree.insert([0, 0, 0, 0, 0, 0, 1, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172061); +tree.insert([0, 0, 0, 0, 0, 0, 1, 173], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172062); +tree.insert([0, 0, 0, 0, 0, 0, 1, 237], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172064); +tree.insert([0, 0, 0, 0, 0, 0, 1, 25], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172065); +tree.insert([0, 0, 0, 0, 0, 0, 1, 125], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172066); +tree.insert([0, 0, 0, 0, 0, 0, 2, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172067); +tree.insert([0, 0, 0, 0, 0, 0, 1, 245], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172068); +tree.insert([0, 0, 0, 0, 0, 0, 1, 35], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172069); +tree.insert([0, 0, 0, 0, 0, 0, 3, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172070); +tree.insert([0, 0, 0, 0, 0, 0, 3, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172071); +tree.insert([0, 0, 0, 0, 0, 0, 3, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172072); +tree.insert([0, 0, 0, 0, 0, 0, 2, 74], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172073); +tree.insert([0, 0, 0, 0, 0, 0, 2, 15], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172074); +tree.insert([0, 0, 0, 0, 0, 0, 2, 109], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172075); +tree.insert([0, 0, 0, 0, 0, 0, 1, 39], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172077); +tree.insert([0, 0, 0, 0, 0, 0, 2, 221], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172078); +tree.insert([0, 0, 0, 0, 0, 0, 2, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172081); +tree.insert([0, 0, 0, 0, 0, 0, 0, 180], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172082); +tree.insert([0, 0, 0, 0, 0, 0, 2, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172083); +tree.insert([0, 0, 0, 0, 0, 0, 0, 223], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172085); +tree.insert([0, 0, 0, 0, 0, 0, 3, 109], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172087); +tree.insert([0, 0, 0, 0, 0, 0, 2, 175], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172088); +tree.insert([0, 0, 0, 0, 0, 0, 2, 228], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172089); +tree.insert([0, 0, 0, 0, 0, 0, 0, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172090); +tree.insert([0, 0, 0, 0, 0, 0, 0, 56], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172091); +tree.insert([0, 0, 0, 0, 0, 0, 0, 157], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172092); +tree.insert([0, 0, 0, 0, 0, 0, 0, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172093); +tree.insert([0, 0, 0, 0, 0, 0, 0, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172094); +tree.insert([0, 0, 0, 0, 0, 0, 0, 76], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172095); +tree.insert([0, 0, 0, 0, 0, 0, 2, 253], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172096); +tree.insert([0, 0, 0, 0, 0, 0, 0, 210], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172097); +tree.insert([0, 0, 0, 0, 0, 0, 3, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172098); +tree.insert([0, 0, 0, 0, 0, 0, 2, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172101); +tree.insert([0, 0, 0, 0, 0, 0, 2, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172102); +tree.insert([0, 0, 0, 0, 0, 0, 0, 85], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 
172104); +tree.insert([0, 0, 0, 0, 0, 0, 2, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172105); +tree.insert([0, 0, 0, 0, 0, 0, 2, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172108); +tree.insert([0, 0, 0, 0, 0, 0, 1, 249], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172110); +tree.insert([0, 0, 0, 0, 0, 0, 1, 243], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172111); +tree.insert([0, 0, 0, 0, 0, 0, 1, 48], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172112); +tree.insert([0, 0, 0, 0, 0, 0, 1, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172113); +tree.insert([0, 0, 0, 0, 0, 0, 2, 121], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172114); +tree.insert([0, 0, 0, 0, 0, 0, 1, 2], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172116); +tree.insert([0, 0, 0, 0, 0, 0, 0, 8], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172117); +tree.insert([0, 0, 0, 0, 0, 0, 1, 128], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172118); +tree.insert([0, 0, 0, 0, 0, 0, 1, 104], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172119); +tree.insert([0, 0, 0, 0, 0, 0, 0, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172122); +tree.insert([0, 0, 0, 0, 0, 0, 0, 122], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172123); +tree.insert([0, 0, 0, 0, 0, 0, 0, 123], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172124); +tree.insert([0, 0, 0, 0, 0, 0, 2, 140], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172125); +tree.insert([0, 0, 0, 0, 0, 0, 2, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172126); +tree.insert([0, 0, 0, 0, 0, 0, 0, 235], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172127); +tree.insert([0, 0, 0, 0, 0, 0, 1, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172128); +tree.insert([0, 0, 0, 0, 0, 0, 1, 149], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172149); +tree.insert([0, 0, 0, 0, 0, 0, 0, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172151); +tree.insert([0, 0, 0, 0, 0, 0, 1, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172152); +tree.insert([0, 0, 0, 0, 0, 0, 0, 64], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172153); +tree.insert([0, 0, 0, 0, 0, 0, 0, 161], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172157); +tree.insert([0, 0, 0, 0, 0, 0, 1, 200], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172158); +tree.insert([0, 0, 0, 0, 0, 0, 1, 234], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172163); +tree.insert([0, 0, 0, 0, 0, 0, 1, 196], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172164); +tree.insert([0, 0, 0, 0, 0, 0, 1, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172167); +tree.flush_active_memtable(172101)?; +tree.compact(compaction.clone(), 172101)?; + +tree.insert([0, 0, 0, 0, 0, 0, 0, 219], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172222); +tree.insert([0, 0, 0, 0, 0, 0, 0, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172225); +tree.insert([0, 0, 0, 0, 0, 0, 2, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172226); +tree.insert([0, 0, 0, 0, 0, 0, 0, 108], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172227); +tree.insert([0, 0, 0, 0, 0, 0, 0, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172229); +tree.insert([0, 0, 0, 0, 0, 0, 0, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172231); +tree.insert([0, 0, 0, 0, 0, 0, 1, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172233); 
+tree.insert([0, 0, 0, 0, 0, 0, 2, 100], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172235); +tree.insert([0, 0, 0, 0, 0, 0, 3, 40], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172237); +tree.insert([0, 0, 0, 0, 0, 0, 3, 39], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172238); +tree.insert([0, 0, 0, 0, 0, 0, 2, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172239); +tree.insert([0, 0, 0, 0, 0, 0, 1, 172], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172240); +tree.insert([0, 0, 0, 0, 0, 0, 0, 143], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172242); +tree.insert([0, 0, 0, 0, 0, 0, 1, 101], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172245); +tree.insert([0, 0, 0, 0, 0, 0, 0, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172247); +tree.insert([0, 0, 0, 0, 0, 0, 0, 88], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172251); +tree.insert([0, 0, 0, 0, 0, 0, 0, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172252); +tree.insert([0, 0, 0, 0, 0, 0, 2, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172253); +tree.insert([0, 0, 0, 0, 0, 0, 1, 233], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172257); +tree.insert([0, 0, 0, 0, 0, 0, 0, 229], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172259); +tree.insert([0, 0, 0, 0, 0, 0, 1, 32], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172260); +tree.insert([0, 0, 0, 0, 0, 0, 1, 211], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172261); +tree.insert([0, 0, 0, 0, 0, 0, 2, 69], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172262); +tree.insert([0, 0, 0, 0, 0, 0, 2, 86], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172263); +tree.insert([0, 0, 0, 0, 0, 0, 2, 56], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172264); +tree.insert([0, 0, 0, 0, 0, 0, 2, 190], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172265); +tree.insert([0, 0, 0, 0, 0, 0, 2, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172270); +tree.insert([0, 0, 0, 0, 0, 0, 2, 21], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172271); +tree.insert([0, 0, 0, 0, 0, 0, 1, 118], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172273); +tree.insert([0, 0, 0, 0, 0, 0, 1, 193], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172274); +tree.insert([0, 0, 0, 0, 0, 0, 0, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172275); +tree.insert([0, 0, 0, 0, 0, 0, 2, 200], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172276); +tree.insert([0, 0, 0, 0, 0, 0, 0, 225], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172278); +tree.insert([0, 0, 0, 0, 0, 0, 1, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172279); +tree.insert([0, 0, 0, 0, 0, 0, 0, 129], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172280); +tree.insert([0, 0, 0, 0, 0, 0, 0, 226], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172281); +tree.insert([0, 0, 0, 0, 0, 0, 2, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172286); +tree.insert([0, 0, 0, 0, 0, 0, 0, 50], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172289); +tree.insert([0, 0, 0, 0, 0, 0, 2, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172290); +tree.insert([0, 0, 0, 0, 0, 0, 0, 120], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172291); +tree.insert([0, 0, 0, 0, 0, 0, 1, 239], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172292); +tree.insert([0, 0, 0, 0, 0, 0, 0, 61], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172293); 
+tree.insert([0, 0, 0, 0, 0, 0, 0, 246], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172294); +tree.insert([0, 0, 0, 0, 0, 0, 0, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172296); +tree.insert([0, 0, 0, 0, 0, 0, 0, 68], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172297); +tree.insert([0, 0, 0, 0, 0, 0, 3, 18], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172298); +tree.insert([0, 0, 0, 0, 0, 0, 0, 131], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172299); +tree.insert([0, 0, 0, 0, 0, 0, 2, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172301); +tree.insert([0, 0, 0, 0, 0, 0, 0, 142], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172304); +tree.insert([0, 0, 0, 0, 0, 0, 2, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172305); +tree.insert([0, 0, 0, 0, 0, 0, 1, 42], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172307); +tree.insert([0, 0, 0, 0, 0, 0, 0, 128], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172308); +tree.insert([0, 0, 0, 0, 0, 0, 3, 36], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172310); +tree.insert([0, 0, 0, 0, 0, 0, 0, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172311); +tree.insert([0, 0, 0, 0, 0, 0, 1, 182], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172313); +tree.insert([0, 0, 0, 0, 0, 0, 1, 5], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172315); +tree.insert([0, 0, 0, 0, 0, 0, 2, 198], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172317); +tree.insert([0, 0, 0, 0, 0, 0, 0, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172318); +tree.insert([0, 0, 0, 0, 0, 0, 0, 39], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172321); +tree.insert([0, 0, 0, 0, 0, 0, 2, 36], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172323); +tree.insert([0, 0, 0, 0, 0, 0, 0, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172324); +tree.insert([0, 0, 0, 0, 0, 0, 1, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172327); +tree.insert([0, 0, 0, 0, 0, 0, 1, 181], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172328); +tree.insert([0, 0, 0, 0, 0, 0, 0, 116], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172331); +tree.insert([0, 0, 0, 0, 0, 0, 0, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172337); +tree.insert([0, 0, 0, 0, 0, 0, 1, 30], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172338); +tree.insert([0, 0, 0, 0, 0, 0, 0, 243], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172340); +tree.insert([0, 0, 0, 0, 0, 0, 0, 90], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172342); +tree.insert([0, 0, 0, 0, 0, 0, 0, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172343); +tree.insert([0, 0, 0, 0, 0, 0, 1, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172350); +tree.insert([0, 0, 0, 0, 0, 0, 0, 240], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172352); +tree.insert([0, 0, 0, 0, 0, 0, 2, 8], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172354); +tree.insert([0, 0, 0, 0, 0, 0, 2, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172355); +tree.insert([0, 0, 0, 0, 0, 0, 2, 87], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172357); +tree.insert([0, 0, 0, 0, 0, 0, 1, 127], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172358); +tree.insert([0, 0, 0, 0, 0, 0, 0, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172359); +tree.insert([0, 0, 0, 0, 0, 0, 2, 115], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172360); 
+tree.insert([0, 0, 0, 0, 0, 0, 2, 126], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172362); +tree.insert([0, 0, 0, 0, 0, 0, 2, 105], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172365); +tree.insert([0, 0, 0, 0, 0, 0, 2, 191], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172366); +tree.insert([0, 0, 0, 0, 0, 0, 1, 209], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172367); +tree.insert([0, 0, 0, 0, 0, 0, 0, 156], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172368); +tree.insert([0, 0, 0, 0, 0, 0, 2, 208], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172370); +tree.insert([0, 0, 0, 0, 0, 0, 2, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172371); +tree.insert([0, 0, 0, 0, 0, 0, 1, 87], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172372); +tree.insert([0, 0, 0, 0, 0, 0, 1, 44], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172373); +tree.insert([0, 0, 0, 0, 0, 0, 0, 232], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172374); +tree.insert([0, 0, 0, 0, 0, 0, 2, 47], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172376); +tree.insert([0, 0, 0, 0, 0, 0, 1, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172378); +tree.insert([0, 0, 0, 0, 0, 0, 2, 150], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172380); +tree.insert([0, 0, 0, 0, 0, 0, 2, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172381); +tree.insert([0, 0, 0, 0, 0, 0, 1, 53], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172383); +tree.insert([0, 0, 0, 0, 0, 0, 1, 81], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172387); +tree.insert([0, 0, 0, 0, 0, 0, 1, 227], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172390); +tree.insert([0, 0, 0, 0, 0, 0, 0, 38], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172392); +tree.insert([0, 0, 0, 0, 0, 0, 2, 162], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172393); +tree.insert([0, 0, 0, 0, 0, 0, 1, 65], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172394); +tree.insert([0, 0, 0, 0, 0, 0, 1, 255], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172397); +tree.insert([0, 0, 0, 0, 0, 0, 0, 95], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172398); +tree.insert([0, 0, 0, 0, 0, 0, 3, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172399); +tree.insert([0, 0, 0, 0, 0, 0, 0, 66], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172400); +tree.flush_active_memtable(172301)?; +tree.insert([0, 0, 0, 0, 0, 0, 1, 27], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172402); +tree.insert([0, 0, 0, 0, 0, 0, 0, 187], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172404); +tree.insert([0, 0, 0, 0, 0, 0, 2, 211], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172408); +tree.insert([0, 0, 0, 0, 0, 0, 1, 216], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172409); +tree.insert([0, 0, 0, 0, 0, 0, 0, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172414); +tree.insert([0, 0, 0, 0, 0, 0, 0, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172416); +tree.insert([0, 0, 0, 0, 0, 0, 3, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172417); +tree.insert([0, 0, 0, 0, 0, 0, 1, 176], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172418); +tree.insert([0, 0, 0, 0, 0, 0, 0, 215], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172420); +tree.insert([0, 0, 0, 0, 0, 0, 1, 223], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172423); +tree.insert([0, 0, 0, 0, 0, 0, 0, 221], [104, 101, 108, 108, 
111, 104, 101, 108, 108, 111], 172424); +tree.insert([0, 0, 0, 0, 0, 0, 0, 54], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172425); +tree.insert([0, 0, 0, 0, 0, 0, 0, 166], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172426); +tree.insert([0, 0, 0, 0, 0, 0, 1, 167], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172427); +tree.insert([0, 0, 0, 0, 0, 0, 2, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172428); +tree.insert([0, 0, 0, 0, 0, 0, 2, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172429); +tree.insert([0, 0, 0, 0, 0, 0, 1, 56], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172432); +tree.insert([0, 0, 0, 0, 0, 0, 0, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172433); +tree.insert([0, 0, 0, 0, 0, 0, 2, 89], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172434); +tree.insert([0, 0, 0, 0, 0, 0, 1, 96], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172435); +tree.insert([0, 0, 0, 0, 0, 0, 1, 58], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172436); +tree.insert([0, 0, 0, 0, 0, 0, 2, 88], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172438); +tree.insert([0, 0, 0, 0, 0, 0, 0, 188], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172439); +tree.insert([0, 0, 0, 0, 0, 0, 1, 99], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172440); +tree.insert([0, 0, 0, 0, 0, 0, 2, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172441); +tree.insert([0, 0, 0, 0, 0, 0, 1, 83], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172442); +tree.insert([0, 0, 0, 0, 0, 0, 2, 49], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172443); +tree.insert([0, 0, 0, 0, 0, 0, 1, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172444); +tree.insert([0, 0, 0, 0, 0, 0, 0, 211], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172445); +tree.insert([0, 0, 0, 0, 0, 0, 0, 227], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172449); +tree.insert([0, 0, 0, 0, 0, 0, 3, 22], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172450); +tree.insert([0, 0, 0, 0, 0, 0, 0, 230], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172451); +tree.insert([0, 0, 0, 0, 0, 0, 1, 112], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172455); +tree.insert([0, 0, 0, 0, 0, 0, 0, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172457); +tree.insert([0, 0, 0, 0, 0, 0, 2, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172458); +tree.insert([0, 0, 0, 0, 0, 0, 1, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172461); +tree.insert([0, 0, 0, 0, 0, 0, 1, 111], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172462); +tree.insert([0, 0, 0, 0, 0, 0, 0, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172463); +tree.insert([0, 0, 0, 0, 0, 0, 0, 253], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172465); +tree.insert([0, 0, 0, 0, 0, 0, 1, 8], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172466); +tree.insert([0, 0, 0, 0, 0, 0, 1, 85], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172470); +tree.insert([0, 0, 0, 0, 0, 0, 1, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172471); +tree.insert([0, 0, 0, 0, 0, 0, 1, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172472); +tree.insert([0, 0, 0, 0, 0, 0, 1, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172479); +tree.insert([0, 0, 0, 0, 0, 0, 1, 138], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172486); +tree.insert([0, 0, 0, 0, 0, 0, 0, 46], [104, 101, 108, 108, 
111, 104, 101, 108, 108, 111], 172488); +tree.insert([0, 0, 0, 0, 0, 0, 2, 41], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172489); +tree.insert([0, 0, 0, 0, 0, 0, 1, 224], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172490); +tree.insert([0, 0, 0, 0, 0, 0, 1, 228], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172491); +tree.insert([0, 0, 0, 0, 0, 0, 2, 35], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172493); +tree.insert([0, 0, 0, 0, 0, 0, 0, 75], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172494); +tree.insert([0, 0, 0, 0, 0, 0, 1, 183], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172495); +tree.insert([0, 0, 0, 0, 0, 0, 1, 20], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172496); +tree.insert([0, 0, 0, 0, 0, 0, 2, 243], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172497); +tree.insert([0, 0, 0, 0, 0, 0, 3, 46], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172498); +tree.insert([0, 0, 0, 0, 0, 0, 3, 55], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172508); +tree.insert([0, 0, 0, 0, 0, 0, 1, 195], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172510); +tree.insert([0, 0, 0, 0, 0, 0, 2, 222], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172511); +tree.insert([0, 0, 0, 0, 0, 0, 1, 97], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172512); +tree.insert([0, 0, 0, 0, 0, 0, 0, 134], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172513); +tree.insert([0, 0, 0, 0, 0, 0, 0, 206], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172514); +tree.insert([0, 0, 0, 0, 0, 0, 1, 236], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172516); +tree.insert([0, 0, 0, 0, 0, 0, 3, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172519); +tree.insert([0, 0, 0, 0, 0, 0, 2, 7], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172523); +tree.insert([0, 0, 0, 0, 0, 0, 2, 13], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172525); +tree.insert([0, 0, 0, 0, 0, 0, 1, 201], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172527); +tree.insert([0, 0, 0, 0, 0, 0, 1, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172528); +tree.insert([0, 0, 0, 0, 0, 0, 1, 156], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172529); +tree.insert([0, 0, 0, 0, 0, 0, 1, 246], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172535); +tree.insert([0, 0, 0, 0, 0, 0, 0, 78], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172536); +tree.insert([0, 0, 0, 0, 0, 0, 1, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172537); +tree.insert([0, 0, 0, 0, 0, 0, 1, 62], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172538); +tree.insert([0, 0, 0, 0, 0, 0, 0, 94], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172541); +tree.insert([0, 0, 0, 0, 0, 0, 2, 71], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172542); +tree.insert([0, 0, 0, 0, 0, 0, 1, 145], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172547); +tree.insert([0, 0, 0, 0, 0, 0, 2, 178], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172548); +tree.insert([0, 0, 0, 0, 0, 0, 0, 252], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172549); +tree.insert([0, 0, 0, 0, 0, 0, 2, 154], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172551); +tree.insert([0, 0, 0, 0, 0, 0, 1, 140], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172554); +tree.insert([0, 0, 0, 0, 0, 0, 2, 98], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172555); +tree.insert([0, 0, 0, 0, 0, 0, 0, 24], [104, 101, 108, 108, 
111, 104, 101, 108, 108, 111], 172556); +tree.insert([0, 0, 0, 0, 0, 0, 2, 159], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172558); +tree.insert([0, 0, 0, 0, 0, 0, 2, 220], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172559); +tree.insert([0, 0, 0, 0, 0, 0, 3, 43], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172560); +tree.insert([0, 0, 0, 0, 0, 0, 0, 249], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172562); +tree.insert([0, 0, 0, 0, 0, 0, 0, 11], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172563); +tree.insert([0, 0, 0, 0, 0, 0, 3, 19], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172566); +tree.insert([0, 0, 0, 0, 0, 0, 1, 36], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172567); +tree.insert([0, 0, 0, 0, 0, 0, 0, 19], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172570); +tree.insert([0, 0, 0, 0, 0, 0, 2, 171], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172574); +tree.insert([0, 0, 0, 0, 0, 0, 0, 222], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172579); +tree.insert([0, 0, 0, 0, 0, 0, 0, 201], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172582); +tree.insert([0, 0, 0, 0, 0, 0, 1, 119], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172584); +tree.insert([0, 0, 0, 0, 0, 0, 2, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172586); +tree.insert([0, 0, 0, 0, 0, 0, 1, 130], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172587); +tree.insert([0, 0, 0, 0, 0, 0, 1, 17], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172591); +tree.insert([0, 0, 0, 0, 0, 0, 1, 91], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172595); +tree.insert([0, 0, 0, 0, 0, 0, 0, 33], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172599); +tree.flush_active_memtable(172501)?; +tree.insert([0, 0, 0, 0, 0, 0, 0, 114], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172602); +tree.insert([0, 0, 0, 0, 0, 0, 1, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172605); +tree.insert([0, 0, 0, 0, 0, 0, 0, 100], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172609); +tree.insert([0, 0, 0, 0, 0, 0, 0, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172610); +tree.insert([0, 0, 0, 0, 0, 0, 2, 117], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172614); +tree.insert([0, 0, 0, 0, 0, 0, 2, 52], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172615); +tree.insert([0, 0, 0, 0, 0, 0, 3, 35], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172616); +tree.insert([0, 0, 0, 0, 0, 0, 0, 231], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172619); +tree.insert([0, 0, 0, 0, 0, 0, 0, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172625); +tree.insert([0, 0, 0, 0, 0, 0, 1, 147], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172626); +tree.insert([0, 0, 0, 0, 0, 0, 3, 3], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172627); +tree.insert([0, 0, 0, 0, 0, 0, 1, 247], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172628); +tree.insert([0, 0, 0, 0, 0, 0, 1, 190], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172631); +tree.insert([0, 0, 0, 0, 0, 0, 1, 164], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172632); +tree.insert([0, 0, 0, 0, 0, 0, 1, 137], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172633); +tree.insert([0, 0, 0, 0, 0, 0, 1, 168], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172634); +tree.insert([0, 0, 0, 0, 0, 0, 3, 12], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172637); +tree.insert([0, 0, 0, 
0, 0, 0, 2, 183], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172638); +tree.insert([0, 0, 0, 0, 0, 0, 0, 113], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172645); +tree.insert([0, 0, 0, 0, 0, 0, 0, 233], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172649); +tree.insert([0, 0, 0, 0, 0, 0, 2, 67], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172650); +tree.insert([0, 0, 0, 0, 0, 0, 1, 203], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172655); +tree.insert([0, 0, 0, 0, 0, 0, 2, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172656); +tree.insert([0, 0, 0, 0, 0, 0, 0, 179], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172658); +tree.insert([0, 0, 0, 0, 0, 0, 1, 45], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172660); +tree.insert([0, 0, 0, 0, 0, 0, 0, 2], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172661); +tree.insert([0, 0, 0, 0, 0, 0, 1, 93], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172663); +tree.insert([0, 0, 0, 0, 0, 0, 1, 28], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172665); +tree.insert([0, 0, 0, 0, 0, 0, 0, 164], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172672); +tree.insert([0, 0, 0, 0, 0, 0, 0, 60], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172675); +tree.insert([0, 0, 0, 0, 0, 0, 0, 170], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172679); +tree.insert([0, 0, 0, 0, 0, 0, 0, 92], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172684); +tree.insert([0, 0, 0, 0, 0, 0, 1, 197], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172688); +tree.insert([0, 0, 0, 0, 0, 0, 0, 169], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172693); +tree.insert([0, 0, 0, 0, 0, 0, 3, 23], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172700); +tree.insert([0, 0, 0, 0, 0, 0, 0, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172702); +tree.insert([0, 0, 0, 0, 0, 0, 2, 4], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172704); +tree.insert([0, 0, 0, 0, 0, 0, 0, 84], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172705); +tree.insert([0, 0, 0, 0, 0, 0, 0, 250], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172706); +tree.insert([0, 0, 0, 0, 0, 0, 1, 218], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172708); +tree.insert([0, 0, 0, 0, 0, 0, 0, 132], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172728); +tree.insert([0, 0, 0, 0, 0, 0, 1, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172733); +tree.insert([0, 0, 0, 0, 0, 0, 0, 63], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172735); +tree.insert([0, 0, 0, 0, 0, 0, 0, 77], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172737); +tree.insert([0, 0, 0, 0, 0, 0, 0, 35], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172740); +tree.insert([0, 0, 0, 0, 0, 0, 0, 103], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172742); +tree.insert([0, 0, 0, 0, 0, 0, 0, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172749); +tree.insert([0, 0, 0, 0, 0, 0, 1, 38], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172750); +tree.insert([0, 0, 0, 0, 0, 0, 0, 155], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172755); +tree.insert([0, 0, 0, 0, 0, 0, 0, 255], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172756); +tree.insert([0, 0, 0, 0, 0, 0, 1, 46], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172761); +tree.insert([0, 0, 0, 0, 0, 0, 0, 10], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172765); +tree.insert([0, 0, 0, 0, 0, 
0, 0, 82], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172766);
+tree.insert([0, 0, 0, 0, 0, 0, 0, 177], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172767);
+tree.insert([0, 0, 0, 0, 0, 0, 0, 199], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172774);
+tree.insert([0, 0, 0, 0, 0, 0, 0, 106], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172786);
+tree.insert([0, 0, 0, 0, 0, 0, 1, 34], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172788);
+tree.flush_active_memtable(172701)?;
+tree.compact(compaction.clone(), 172701)?;
+
+tree.compact(compaction.clone(), 172901)?;
+
+tree.compact(compaction.clone(), 173101)?;
+
+    tree.drop_range::<&[u8], _>(..)?;
+
+
+    assert_eq!(0, tree.segment_count());
+    assert_eq!(0, tree.blob_file_count());
+
+    Ok(())
+}

From 85bbe9cfbd1177eb50cd3ac2ab9d0135f6738ba6 Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Wed, 15 Oct 2025 21:55:55 +0200
Subject: [PATCH 598/613] fix: model 6

---
 src/compaction/flavour.rs |  3 ++-
 src/compaction/worker.rs  | 13 ++++++-----
 src/segment/writer/mod.rs |  3 ++-
 src/version/mod.rs        | 47 +++++++++++++++++++--------------------
 tests/model_6.rs          |  3 ---
 5 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs
index 2ca07c86..42097058 100644
--- a/src/compaction/flavour.rs
+++ b/src/compaction/flavour.rs
@@ -211,9 +211,10 @@ impl CompactionFlavour for RelocatingCompaction {
             // This blob is not part of the rewritten blob files
             // So just pass it through
             log::trace!("Pass through {indirection:?} because it is not being relocated");
-            self.inner.table_writer.register_blob(indirection);
             self.inner.table_writer.write(item)?;
         }
+
+        self.inner.table_writer.register_blob(indirection);
     } else {
         self.inner.table_writer.write(item)?;
     }
diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs
index f595f1a8..6b507508 100644
--- a/src/compaction/worker.rs
+++ b/src/compaction/worker.rs
@@ -348,12 +348,13 @@ fn merge_segments(
             .expect("should not fail")
             .unwrap_or_default();

-        let other_ref = other_ref
-            .iter()
-            .find(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id));
+        let other_refs = other_ref
+            .into_iter()
+            .filter(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id))
+            .collect::<Vec<_>>();

-        if let Some(other_ref) = other_ref {
-            linked_blob_files.retain(|x| x.id() != other_ref.blob_file_id);
+        for additional_ref in other_refs {
+            linked_blob_files.retain(|x| x.id() != additional_ref.blob_file_id);
         }
     }

@@ -364,7 +365,7 @@ fn merge_segments(
         Box::new(StandardCompaction::new(table_writer, segments)) as Box<dyn CompactionFlavour>
     } else {
-        log::warn!(
+        log::debug!(
             "Relocate blob files: {:?}",
             blob_files_to_rewrite
                 .iter()
diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs
index adc51c3c..acac569b 100644
--- a/src/segment/writer/mod.rs
+++ b/src/segment/writer/mod.rs
@@ -468,9 +468,10 @@ impl Writer {
         fsync_directory(self.path.parent().expect("should have folder"))?;

         log::debug!(
-            "Written {} items in {} blocks into new segment file, written {} MiB",
+            "Written {} items in {} blocks into new table file #{}, written {} MiB",
             self.meta.item_count,
             self.meta.data_block_count,
+            self.segment_id,
             *self.meta.file_pos / 1_024 / 1_024,
         );

diff --git a/src/version/mod.rs b/src/version/mod.rs
index fdf9c79b..25bbb407 100644
--- a/src/version/mod.rs
+++ b/src/version/mod.rs
@@ -365,10 +365,10 @@ impl Version {
             self.value_log.clone()
         };

-        let gc_map = if let Some(diff) = diff {
+        let gc_stats = if let Some(diff) = diff {
             let mut copy = 
self.gc_stats.deref().clone(); diff.merge_into(&mut copy); - copy.prune(&self.value_log); + copy.prune(&value_log); Arc::new(copy) } else { self.gc_stats.clone() @@ -379,7 +379,7 @@ impl Version { id, levels, value_log, - gc_stats: gc_map, + gc_stats, }), seqno_watermark: 0, } @@ -514,25 +514,6 @@ impl Version { let has_diff = diff.is_some(); - let gc_stats = - if has_diff || !blob_files_to_drop.is_empty() || !blob_files_to_drop.is_empty() { - let mut copy = self.gc_stats.deref().clone(); - - if let Some(diff) = diff { - diff.merge_into(&mut copy); - } - - for id in &blob_files_to_drop { - copy.remove(id); - } - - copy.prune(&self.value_log); - - Arc::new(copy) - } else { - self.gc_stats.clone() - }; - let value_log = if has_diff || !new_blob_files.is_empty() || !blob_files_to_drop.is_empty() { let mut copy = self.value_log.deref().clone(); @@ -541,8 +522,8 @@ impl Version { copy.insert(blob_file.id(), blob_file); } - for id in blob_files_to_drop { - copy.remove(&id); + for id in &blob_files_to_drop { + copy.remove(id); } Arc::new(copy) @@ -550,6 +531,24 @@ impl Version { self.value_log.clone() }; + let gc_stats = if has_diff || !blob_files_to_drop.is_empty() { + let mut copy = self.gc_stats.deref().clone(); + + if let Some(diff) = diff { + diff.merge_into(&mut copy); + } + + for id in &blob_files_to_drop { + copy.remove(id); + } + + copy.prune(&value_log); + + Arc::new(copy) + } else { + self.gc_stats.clone() + }; + Self { inner: Arc::new(VersionInner { id, diff --git a/tests/model_6.rs b/tests/model_6.rs index f365adb9..8aade921 100644 --- a/tests/model_6.rs +++ b/tests/model_6.rs @@ -401,7 +401,6 @@ tree.insert([0, 0, 0, 0, 0, 0, 1, 152], [104, 101, 108, 108, 111, 104, 101, 108, tree.insert([0, 0, 0, 0, 0, 0, 2, 16], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171180); tree.flush_active_memtable(171101)?; tree.compact(compaction.clone(), 171101)?; - tree.insert([0, 0, 0, 0, 0, 0, 3, 6], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171201); tree.insert([0, 0, 0, 0, 0, 0, 1, 217], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171202); tree.insert([0, 0, 0, 0, 0, 0, 3, 80], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 171203); @@ -1015,7 +1014,6 @@ tree.insert([0, 0, 0, 0, 0, 0, 1, 196], [104, 101, 108, 108, 111, 104, 101, 108, tree.insert([0, 0, 0, 0, 0, 0, 1, 153], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172167); tree.flush_active_memtable(172101)?; tree.compact(compaction.clone(), 172101)?; - tree.insert([0, 0, 0, 0, 0, 0, 0, 219], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172222); tree.insert([0, 0, 0, 0, 0, 0, 0, 124], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172225); tree.insert([0, 0, 0, 0, 0, 0, 2, 192], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 172226); @@ -1282,7 +1280,6 @@ tree.compact(compaction.clone(), 172901)?; tree.compact(compaction.clone(), 173101)?; tree.drop_range::<&[u8], _>(..)?; - assert_eq!(0, tree.segment_count()); assert_eq!(0, tree.blob_file_count()); From cf72a8275fbe4f30279fa3eed3b4fb0c2ac3346f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 15 Oct 2025 21:56:21 +0200 Subject: [PATCH 599/613] fix: another model testing finding --- src/version/mod.rs | 2 +- tests/model_5.rs | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 tests/model_5.rs diff --git a/src/version/mod.rs b/src/version/mod.rs index 25bbb407..6e66c06a 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -600,7 +600,7 @@ impl Version { id, levels, 
value_log: self.value_log.clone(), - gc_stats: Arc::default(), + gc_stats: self.gc_stats.clone(), }), seqno_watermark: 0, } diff --git a/tests/model_5.rs b/tests/model_5.rs new file mode 100644 index 00000000..4cc44067 --- /dev/null +++ b/tests/model_5.rs @@ -0,0 +1,82 @@ +// Found by model testing + +use lsm_tree::{config::BlockSizePolicy, AbstractTree, KvSeparationOptions, Result}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_5() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(5))) + .data_block_size_policy(BlockSizePolicy::all(100)) + .open()?; + + let compaction = Arc::new(lsm_tree::compaction::Leveled { + target_size: 150, + ..Default::default() + }); + + let value = b"hellohello"; + + tree.insert([0], value, 753); + tree.insert([0, 0, 0, 0, 0, 0, 1, 0], value, 754); + tree.insert([0, 0, 0, 0, 0, 0, 0, 140], value, 756); + tree.insert([0, 0, 0, 0, 0, 0, 0, 127], value, 762); + tree.insert([0, 0, 0, 0, 0, 0, 0, 98], value, 763); + tree.insert([0, 0, 0, 0, 0, 0, 2, 138], value, 764); + tree.insert([0, 0, 0, 0, 0, 0, 3, 150], value, 765); + tree.insert([0, 0, 0, 0, 0, 0, 0, 23], value, 766); + tree.insert([0, 0, 0, 0, 0, 0, 3, 121], value, 767); + tree.insert([0, 0, 0, 0, 0, 0, 1, 212], value, 842); + tree.insert([0, 0, 0, 0, 0, 0, 2, 152], value, 843); + tree.insert([0, 0, 0, 0, 0, 0, 2, 241], value, 844); + tree.flush_active_memtable(798)?; + + tree.insert([0, 0, 0, 0, 0, 0, 3, 120], value, 898); + tree.flush_active_memtable(799)?; + + tree.insert([0, 0, 0, 0, 0, 0, 3, 89], value, 899); + tree.flush_active_memtable(800)?; + + tree.insert([0, 0, 0, 0, 0, 0, 1, 52], value, 901); + tree.insert([0, 0, 0, 0, 0, 0, 0, 177], value, 902); + tree.insert([0, 0, 0, 0, 0, 0, 3, 43], value, 903); + tree.insert([0, 0, 0, 0, 0, 0, 3, 41], value, 904); + tree.insert([0, 0, 0, 0, 0, 0, 3, 160], value, 905); + tree.insert([0, 0, 0, 0, 0, 0, 1, 182], value, 906); + tree.insert([0, 0, 0, 0, 0, 0, 0, 73], value, 907); + tree.insert([0, 0, 0, 0, 0, 0, 0, 78], value, 912); + tree.insert([0, 0, 0, 0, 0, 0, 2, 103], value, 913); + tree.insert([0, 0, 0, 0, 0, 0, 1, 39], value, 914); + tree.insert([0, 0, 0, 0, 0, 0, 1, 78], value, 927); + tree.insert([0, 0, 0, 0, 0, 0, 0, 244], value, 928); + tree.insert([0, 0, 0, 0, 0, 0, 2, 76], value, 929); + tree.insert([0, 0, 0, 0, 0, 0, 1, 202], value, 934); + tree.insert([0, 0, 0, 0, 0, 0, 2, 140], value, 936); + tree.insert([0, 0, 0, 0, 0, 0, 2, 152], value, 937); + tree.flush_active_memtable(886)?; + tree.compact(compaction.clone(), 886)?; + + tree.insert([0, 0, 0, 0, 0, 0, 3, 145], value, 989); + tree.flush_active_memtable(890)?; + + tree.insert([0, 0, 0, 0, 0, 0, 3, 99], value, 993); + tree.flush_active_memtable(894)?; + + tree.insert([0, 0, 0, 0, 0, 0, 1, 106], value, 997); + tree.flush_active_memtable(898)?; + + tree.insert([0, 0, 0, 0, 0, 0, 2, 99], value, 1001); + tree.flush_active_memtable(902)?; + tree.compact(compaction.clone(), 902)?; + + tree.drop_range::<&[u8], _>(..)?; + + assert_eq!(0, tree.segment_count()); + assert_eq!(0, tree.blob_file_count()); + + Ok(()) +} From 0d2d7b2071c65f2538bb01e4512907892991dcbe Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 15 Oct 2025 21:56:53 +0200 Subject: [PATCH 600/613] fix: register blobs after writing to table writer --- src/compaction/flavour.rs | 21 +++++--- tests/model_4.rs | 106 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 
insertions(+), 6 deletions(-) create mode 100644 tests/model_4.rs diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index 42097058..bf15ae86 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -197,8 +197,6 @@ impl CompactionFlavour for RelocatingCompaction { blob_entry.uncompressed_len, )?; - self.inner.table_writer.register_blob(indirection); - self.inner .table_writer .write(InternalValue::from_components( @@ -207,6 +205,8 @@ impl CompactionFlavour for RelocatingCompaction { item.key.seqno, crate::ValueType::Indirection, ))?; + + self.inner.table_writer.register_blob(indirection); } else { // This blob is not part of the rewritten blob files // So just pass it through @@ -330,13 +330,22 @@ impl StandardCompaction { impl CompactionFlavour for StandardCompaction { fn write(&mut self, item: InternalValue) -> crate::Result<()> { - if item.key.value_type.is_indirection() { - let mut reader = &item.value[..]; - let indirection = BlobIndirection::decode_from(&mut reader)?; + let indirection = if item.key.value_type.is_indirection() { + Some({ + let mut reader = &item.value[..]; + BlobIndirection::decode_from(&mut reader)? + }) + } else { + None + }; + + self.table_writer.write(item)?; + + if let Some(indirection) = indirection { self.table_writer.register_blob(indirection); } - self.table_writer.write(item) + Ok(()) } fn finish( diff --git a/tests/model_4.rs b/tests/model_4.rs new file mode 100644 index 00000000..0a6a4eaa --- /dev/null +++ b/tests/model_4.rs @@ -0,0 +1,106 @@ +// Found by model testing + +use lsm_tree::{config::BlockSizePolicy, AbstractTree, KvSeparationOptions, Result}; +use std::sync::Arc; +use test_log::test; + +#[test] +fn model_4() -> Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(5))) + .data_block_size_policy(BlockSizePolicy::all(100)) + .open()?; + + let compaction = Arc::new(lsm_tree::compaction::Leveled { + target_size: 1_000, + ..Default::default() + }); + + let value = b"hellohello"; + + tree.insert([0], value, 132); + tree.insert([0], value, 133); + tree.insert([1], value, 134); + tree.insert([2], value, 135); + tree.insert([3], value, 136); + tree.insert([4], value, 137); + tree.insert([5], value, 138); + tree.insert([6], value, 139); + tree.insert([7], value, 140); + tree.insert([0, 0, 0, 0, 0, 0, 9, 217], value, 141); + tree.insert([0, 0, 0, 0, 0, 0, 2, 77], value, 142); + tree.insert([0, 0, 0, 0, 0, 0, 33, 92], value, 143); + tree.insert([0, 0, 0, 0, 0, 0, 38, 41], value, 144); + tree.insert([0, 0, 0, 0, 0, 0, 22, 143], value, 145); + tree.insert([0, 0, 0, 0, 0, 0, 22, 161], value, 146); + tree.insert([0, 0, 0, 0, 0, 0, 9, 143], value, 148); + tree.insert([0, 0, 0, 0, 0, 0, 25, 222], value, 149); + tree.insert([0, 0, 0, 0, 0, 0, 11, 144], value, 150); + tree.insert([0, 0, 0, 0, 0, 0, 8, 208], value, 151); + tree.insert([0, 0, 0, 0, 0, 0, 31, 195], value, 152); + tree.insert([0, 0, 0, 0, 0, 0, 27, 47], value, 153); + tree.insert([0, 0, 0, 0, 0, 0, 31, 104], value, 154); + tree.insert([0, 0, 0, 0, 0, 0, 14, 219], value, 155); + tree.insert([0, 0, 0, 0, 0, 0, 17, 125], value, 156); + tree.insert([0, 0, 0, 0, 0, 0, 15, 52], value, 157); + tree.insert([0, 0, 0, 0, 0, 0, 20, 230], value, 158); + tree.insert([0, 0, 0, 0, 0, 0, 16, 88], value, 159); + tree.insert([0, 0, 0, 0, 0, 0, 9, 26], value, 160); + tree.insert([0, 0, 0, 0, 0, 0, 20, 21], value, 161); + tree.insert([0, 0, 0, 
0, 0, 0, 27, 86], value, 162); + tree.insert([0, 0, 0, 0, 0, 0, 4, 112], value, 163); + tree.insert([0, 0, 0, 0, 0, 0, 12, 60], value, 164); + tree.insert([0, 0, 0, 0, 0, 0, 8, 186], value, 165); + tree.insert([0, 0, 0, 0, 0, 0, 34, 18], value, 166); + tree.insert([0, 0, 0, 0, 0, 0, 15, 156], value, 167); + tree.insert([0, 0, 0, 0, 0, 0, 5, 91], value, 168); + tree.insert([0, 0, 0, 0, 0, 0, 36, 0], value, 169); + tree.insert([0, 0, 0, 0, 0, 0, 38, 249], value, 170); + tree.insert([0, 0, 0, 0, 0, 0, 23, 42], value, 171); + tree.insert([0, 0, 0, 0, 0, 0, 23, 14], value, 172); + tree.insert([0, 0, 0, 0, 0, 0, 32, 119], value, 173); + tree.insert([0, 0, 0, 0, 0, 0, 31, 9], value, 174); + tree.insert([0, 0, 0, 0, 0, 0, 4, 170], value, 175); + tree.insert([0, 0, 0, 0, 0, 0, 18, 119], value, 176); + tree.insert([0, 0, 0, 0, 0, 0, 4, 178], value, 177); + tree.insert([0, 0, 0, 0, 0, 0, 4, 36], value, 178); + tree.insert([0, 0, 0, 0, 0, 0, 36, 53], value, 179); + tree.insert([0, 0, 0, 0, 0, 0, 35, 157], value, 181); + tree.insert([0, 0, 0, 0, 0, 0, 22, 24], value, 182); + tree.insert([0, 0, 0, 0, 0, 0, 33, 247], value, 183); + tree.insert([0, 0, 0, 0, 0, 0, 26, 236], value, 185); + tree.flush_active_memtable(86)?; + + tree.insert([0], value, 186); + tree.flush_active_memtable(87)?; + + tree.insert([0, 0, 0, 0, 0, 0, 7, 49], value, 187); + tree.flush_active_memtable(88)?; + + tree.insert([0, 0, 0, 0, 0, 0, 18, 134], value, 188); + tree.flush_active_memtable(89)?; + tree.compact(compaction.clone(), 89)?; + + tree.insert([0], value, 189); + tree.flush_active_memtable(90)?; + + tree.insert([0], value, 190); + tree.flush_active_memtable(91)?; + + tree.insert([0], value, 191); + tree.flush_active_memtable(92)?; + + tree.insert([0], value, 192); + tree.flush_active_memtable(93)?; + tree.compact(compaction.clone(), 93)?; + + tree.drop_range::<&[u8], _>(..)?; + + assert_eq!(0, tree.segment_count()); + assert_eq!(0, tree.blob_file_count()); + + Ok(()) +} From af936ed6f50df504d239988ed7ca46675b80e92d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 15 Oct 2025 22:00:08 +0200 Subject: [PATCH 601/613] losing sanity --- src/compaction/flavour.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index bf15ae86..b8546160 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -205,8 +205,6 @@ impl CompactionFlavour for RelocatingCompaction { item.key.seqno, crate::ValueType::Indirection, ))?; - - self.inner.table_writer.register_blob(indirection); } else { // This blob is not part of the rewritten blob files // So just pass it through From 6279b4ca0eef0b86d32bcac2fc8ad54f10cdbfef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 15 Oct 2025 22:07:18 +0200 Subject: [PATCH 602/613] wip --- tests/model_6.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/model_6.rs b/tests/model_6.rs index 8aade921..4c43a246 100644 --- a/tests/model_6.rs +++ b/tests/model_6.rs @@ -1,10 +1,14 @@ // Found by model testing -use lsm_tree::{config::BlockSizePolicy, AbstractTree, KvSeparationOptions, Result, config::CompressionPolicy}; +use lsm_tree::{ + config::BlockSizePolicy, config::CompressionPolicy, AbstractTree, KvSeparationOptions, Result, +}; use std::sync::Arc; use test_log::test; +// Yes this file is very large, it's hard to condense it to a more minimal repro #[test] +#[rustfmt::skip] fn model_6() -> Result<()> { let folder = tempfile::tempdir()?; let path = folder.path(); @@ -21,8 +25,6 @@ fn model_6() -> Result<()> { 
..Default::default()
     });
 
-    let value = b"hellohello";
-
 tree.insert([0, 0, 0, 0, 0, 0, 3, 152], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170598);
 tree.insert([0, 0, 0, 0, 0, 0, 0, 9], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170599);
 tree.insert([0, 0, 0, 0, 0, 0, 1, 184], [104, 101, 108, 108, 111, 104, 101, 108, 108, 111], 170600);

From 0197d7989a681c06a50eb742357cd58cf8e7216c Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 16 Oct 2025 15:59:21 +0200
Subject: [PATCH 603/613] rename

---
 src/blob_tree/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index 2222815d..c3c12624 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -406,7 +406,7 @@ impl AbstractTree for BlobTree {
         );

         let iter = memtable.iter().map(Ok);
-        let compaction_filter = CompactionStream::new(iter, eviction_seqno);
+        let compaction_stream = CompactionStream::new(iter, eviction_seqno);

         let mut blob_bytes_referenced = 0;
         let mut blobs_referenced_count = 0;
@@ -419,7 +419,7 @@ impl AbstractTree for BlobTree {
             .expect("kv separation options should exist")
             .separation_threshold;

-        for item in compaction_filter {
+        for item in compaction_stream {
             let item = item?;

             if item.is_tombstone() {

From 2dfed92c5e37e78c038d5cd279e048d7c559addf Mon Sep 17 00:00:00 2001
From: marvin-j97
Date: Thu, 16 Oct 2025 16:10:16 +0200
Subject: [PATCH 604/613] test: index block iter fuzz

---
 fuzz/index_block/src/main.rs   | 145 +++++++++++++++++++++++--------
 src/segment/index_block/mod.rs |   6 ++
 2 files changed, 117 insertions(+), 34 deletions(-)

diff --git a/fuzz/index_block/src/main.rs b/fuzz/index_block/src/main.rs
index 4f6ba9ca..3bb89177 100644
--- a/fuzz/index_block/src/main.rs
+++ b/fuzz/index_block/src/main.rs
@@ -2,7 +2,9 @@
 extern crate afl;
 
 use arbitrary::{Arbitrary, Result, Unstructured};
-use lsm_tree::segment::{block::BlockOffset, Block, IndexBlock, KeyedBlockHandle};
+use lsm_tree::segment::{
+    Block, IndexBlock, KeyedBlockHandle, block::BlockOffset, block::decoder::ParsedItem,
+};
 
 #[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
 struct FuzzyValue(KeyedBlockHandle);
@@ -22,8 +24,8 @@ impl<'a> Arbitrary<'a> for FuzzyValue {
     }
 }
 
 fn generate_ping_pong_code(seed: u64, len: usize) -> Vec<u8> {
-    use rand::prelude::*;
     use rand::SeedableRng;
+    use rand::prelude::*;
     use rand_chacha::ChaCha8Rng;
 
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
@@ -34,7 +36,7 @@ fn main() {
     fuzz!(|data: &[u8]| {
         let mut unstructured = Unstructured::new(data);
 
-        // eprintln!("restart_interval={restart_interval}");
+        let seed = u64::arbitrary(&mut unstructured).unwrap();
 
         if let Ok(mut items) = <Vec<FuzzyValue> as Arbitrary>::arbitrary(&mut unstructured) {
             // let mut items = items.to_vec();
@@ -57,12 +59,7 @@ fn main() {
 
                 let items = items.into_iter().map(|value| value.0).collect::<Vec<_>>();
 
-                // for restart_interval in 1..=u8::MAX {
-                let bytes = IndexBlock::encode_into_vec(
-                    &items,
-                    // restart_interval.into(),
-                )
-                .unwrap();
+                let bytes = IndexBlock::encode_into_vec(&items).unwrap();
 
                 let index_block = IndexBlock::new(Block {
                     data: bytes.into(),
@@ -76,41 +73,121 @@ fn main() {
 
                 assert_eq!(index_block.len(), items.len());
 
-                /* if data_block.binary_index_len() > 254 {
-                    assert!(data_block.hash_bucket_count().is_none());
-                } else if hash_ratio > 0.0 {
-                    assert!(data_block.hash_bucket_count().unwrap() > 0);
-                } */
-
-                // eprintln!("{items:?}");
-
-                /* for handle in &items {
-                    // eprintln!("needle: {:?}", needle.key);
-
-                    assert_eq!(
-                        Some(needle.clone()),
-                        
data_block.point_read(&handle.end_key).unwrap(),
-                    );
-                } */
-
-                /* assert_eq!(
+                assert_eq!(
                     items,
-                    data_block.iter().map(|x| x.unwrap()).collect::<Vec<_>>(),
+                    index_block
+                        .iter()
+                        .map(|x| x.materialize(index_block.as_slice()))
+                        .collect::<Vec<_>>()
                 );
 
                 assert_eq!(
                     items.iter().rev().cloned().collect::<Vec<_>>(),
-                    data_block
+                    index_block
                         .iter()
+                        .map(|x| x.materialize(index_block.as_slice()))
                         .rev()
-                        .map(|x| x.unwrap())
                         .collect::<Vec<_>>(),
-                ); */
+                );
 
-                // TODO: add ping-pong iters
+                {
+                    let ping_pongs = generate_ping_pong_code(seed, items.len());
+
+                    let expected_ping_ponged_items = {
+                        let mut iter = items.iter().rev();
+                        let mut v = vec![];
+
+                        for &x in &ping_pongs {
+                            if x == 0 {
+                                v.push(iter.next().cloned().unwrap());
+                            } else {
+                                v.push(iter.next_back().cloned().unwrap());
+                            }
+                        }
+
+                        v
+                    };
+
+                    let real_ping_ponged_items = {
+                        let mut iter = index_block
+                            .iter()
+                            .rev()
+                            .map(|x| x.materialize(index_block.as_slice()));
+
+                        let mut v = vec![];
+
+                        for &x in &ping_pongs {
+                            if x == 0 {
+                                v.push(iter.next().unwrap());
+                            } else {
+                                v.push(iter.next_back().unwrap());
+                            }
+                        }
+
+                        v
+                    };
+
+                    assert_eq!(expected_ping_ponged_items, real_ping_ponged_items);
+                }
+
+                {
+                    use rand::SeedableRng;
+                    use rand::prelude::*;
+                    use rand_chacha::ChaCha8Rng;
+
+                    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+                    let mut lo = rng.random_range(0..items.len());
+                    let mut hi = rng.random_range(0..items.len());
+
+                    if lo > hi {
+                        std::mem::swap(&mut lo, &mut hi);
+                    }
+
+                    // NOTE: If there is A:1, A:2, B:1
+                    // And we select lo as A:2
+                    // Our data block will seek to A:1 (correct)
+                    // But our model won't...
+                    // So seek to the first occurrence of a key
+                    loop {
+                        if lo == 0 {
+                            break;
+                        }
+
+                        if items[lo - 1].end_key() == items[lo].end_key() {
+                            lo -= 1;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    // NOTE: Similar to lo
+                    loop {
+                        if hi == items.len() - 1 {
+                            break;
+                        }
+
+                        if items[hi + 1].end_key() == items[hi].end_key() {
+                            hi += 1;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    let lo_key = &items[lo].end_key();
+                    let hi_key = &items[hi].end_key();
+
+                    let expected_range: Vec<_> = items[lo..=hi].iter().cloned().collect();
+
+                    let mut iter = index_block.iter();
+                    assert!(iter.seek(&lo_key), "should seek");
+                    assert!(iter.seek_upper(hi_key), "should seek");
 
-                // TODO: add range iter too
-                // }
+                    assert_eq!(
+                        expected_range,
+                        iter.map(|x| x.materialize(index_block.as_slice()))
+                            .collect::<Vec<_>>(),
+                    );
+                }
             }
         }
     });
diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs
index 380a05ac..ca9ca647 100644
--- a/src/segment/index_block/mod.rs
+++ b/src/segment/index_block/mod.rs
@@ -69,6 +69,12 @@ impl IndexBlock {
         Self { inner }
     }
 
+    /// Accesses the inner raw bytes
+    #[must_use]
+    pub fn as_slice(&self) -> &Slice {
+        &self.inner.data
+    }
+
     /// Returns the number of items in the block.
#[must_use] #[allow(clippy::len_without_is_empty)] From 1edc0fcf06277def000595c958c71a3f4fbae124 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 16:10:16 +0200 Subject: [PATCH 605/613] test: index block iter fuzz --- fuzz/index_block/src/main.rs | 157 +++++++++++++++++++++++---------- src/segment/index_block/mod.rs | 6 ++ 2 files changed, 116 insertions(+), 47 deletions(-) diff --git a/fuzz/index_block/src/main.rs b/fuzz/index_block/src/main.rs index 4f6ba9ca..158fa6fa 100644 --- a/fuzz/index_block/src/main.rs +++ b/fuzz/index_block/src/main.rs @@ -2,7 +2,9 @@ extern crate afl; use arbitrary::{Arbitrary, Result, Unstructured}; -use lsm_tree::segment::{block::BlockOffset, Block, IndexBlock, KeyedBlockHandle}; +use lsm_tree::segment::{ + block::decoder::ParsedItem, block::BlockOffset, Block, IndexBlock, KeyedBlockHandle, +}; #[derive(Clone, Debug, PartialEq, Eq, Ord, PartialOrd)] struct FuzzyValue(KeyedBlockHandle); @@ -34,35 +36,16 @@ fn main() { fuzz!(|data: &[u8]| { let mut unstructured = Unstructured::new(data); - // eprintln!("restart_interval={restart_interval}"); + let seed = u64::arbitrary(&mut unstructured).unwrap(); if let Ok(mut items) = as Arbitrary>::arbitrary(&mut unstructured) { - // let mut items = items.to_vec(); - if !items.is_empty() { items.sort(); items.dedup(); - /* eprintln!("-- items --"); - for item in items.iter().map(|value| &value.0) { - eprintln!( - r#"InternalValue::from_components({:?}, {:?}, {}, {:?}),"#, - item.key.user_key, item.value, item.key.seqno, item.key.value_type, - ); - } */ - - /* if items.len() > 100 { - eprintln!("================== {}. ", items.len()); - } */ - let items = items.into_iter().map(|value| value.0).collect::>(); - // for restart_interval in 1..=u8::MAX { - let bytes = IndexBlock::encode_into_vec( - &items, - // restart_interval.into(), - ) - .unwrap(); + let bytes = IndexBlock::encode_into_vec(&items).unwrap(); let index_block = IndexBlock::new(Block { data: bytes.into(), @@ -76,41 +59,121 @@ fn main() { assert_eq!(index_block.len(), items.len()); - /* if data_block.binary_index_len() > 254 { - assert!(data_block.hash_bucket_count().is_none()); - } else if hash_ratio > 0.0 { - assert!(data_block.hash_bucket_count().unwrap() > 0); - } */ - - // eprintln!("{items:?}"); - - /* for handle in &items { - // eprintln!("needle: {:?}", needle.key); - - assert_eq!( - Some(needle.clone()), - data_block.point_read(&handle.end_key).unwrap(), - ); - } */ - - /* assert_eq!( + assert_eq!( items, - data_block.iter().map(|x| x.unwrap()).collect::>(), + index_block + .iter() + .map(|x| x.materialize(index_block.as_slice())) + .collect::>() ); assert_eq!( items.iter().rev().cloned().collect::>(), - data_block + index_block .iter() + .map(|x| x.materialize(index_block.as_slice())) .rev() - .map(|x| x.unwrap()) .collect::>(), - ); */ + ); - // TODO: add ping-pong iters + { + let ping_pongs = generate_ping_pong_code(seed, items.len()); + + let expected_ping_ponged_items = { + let mut iter = items.iter().rev(); + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().cloned().unwrap()); + } else { + v.push(iter.next_back().cloned().unwrap()); + } + } + + v + }; + + let real_ping_ponged_items = { + let mut iter = index_block + .iter() + .rev() + .map(|x| x.materialize(index_block.as_slice())); + + let mut v = vec![]; + + for &x in &ping_pongs { + if x == 0 { + v.push(iter.next().unwrap()); + } else { + v.push(iter.next_back().unwrap()); + } + } + + v + }; + + assert_eq!(expected_ping_ponged_items, 
real_ping_ponged_items); + } + + { + use rand::prelude::*; + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut lo = rng.random_range(0..items.len()); + let mut hi = rng.random_range(0..items.len()); + + if lo > hi { + std::mem::swap(&mut lo, &mut hi); + } + + // NOTE: If there is A:1, A:2, B:1 + // And we select lo as A:2 + // Our data block will seek to A:1 (correct) + // But our model won't... + // So seek to the first occurence of a key + loop { + if lo == 0 { + break; + } + + if items[lo - 1].end_key() == items[lo].end_key() { + lo -= 1; + } else { + break; + } + } + + // NOTE: Similar to lo + loop { + if hi == items.len() - 1 { + break; + } + + if items[hi + 1].end_key() == items[hi].end_key() { + hi += 1; + } else { + break; + } + } + + let lo_key = &items[lo].end_key(); + let hi_key = &items[hi].end_key(); + + let expected_range: Vec<_> = items[lo..=hi].iter().cloned().collect(); + + let mut iter = index_block.iter(); + assert!(iter.seek(&lo_key), "should seek"); + assert!(iter.seek_upper(hi_key), "should seek"); - // TODO: add range iter too - // } + assert_eq!( + expected_range, + iter.map(|x| x.materialize(index_block.as_slice())) + .collect::>(), + ); + } } } }); diff --git a/src/segment/index_block/mod.rs b/src/segment/index_block/mod.rs index 380a05ac..ca9ca647 100644 --- a/src/segment/index_block/mod.rs +++ b/src/segment/index_block/mod.rs @@ -69,6 +69,12 @@ impl IndexBlock { Self { inner } } + /// Accesses the inner raw bytes + #[must_use] + pub fn as_slice(&self) -> &Slice { + &self.inner.data + } + /// Returns the number of items in the block. #[must_use] #[allow(clippy::len_without_is_empty)] From eb3acc531bc0382379e146ae2a8ab25b930633f4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 16:21:20 +0200 Subject: [PATCH 606/613] fix: trailer size --- src/segment/block/trailer.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/segment/block/trailer.rs b/src/segment/block/trailer.rs index f308ad3a..66f0e6e8 100644 --- a/src/segment/block/trailer.rs +++ b/src/segment/block/trailer.rs @@ -14,8 +14,10 @@ pub const TRAILER_START_MARKER: u8 = 255; const TRAILER_SIZE: usize = 5 * std::mem::size_of::() + (2 * std::mem::size_of::()) // Fixed key size (unused) + + std::mem::size_of::() + std::mem::size_of::() // Fixed value size (unused) + + std::mem::size_of::() + std::mem::size_of::(); /// Block trailer @@ -135,10 +137,11 @@ impl<'a> Trailer<'a> { .write_u32::(hash_index_offset)?; // Fixed key size (unused) + encoder.writer.write_u8(0)?; encoder.writer.write_u16::(0)?; - // TODO: 3.0.0 what if value is actually 0...? 
we need another byte prob // Fixed value size (unused) + encoder.writer.write_u8(0)?; encoder.writer.write_u32::(0)?; // NOTE: We know that data blocks will never even approach 4 GB in size, so there can't be that many items either From a98328dfdb73e8e019add6b6697567e309f6717d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 20:34:06 +0200 Subject: [PATCH 607/613] adjust trace log --- src/compaction/state/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/compaction/state/mod.rs b/src/compaction/state/mod.rs index 9bc304c1..0bde4896 100644 --- a/src/compaction/state/mod.rs +++ b/src/compaction/state/mod.rs @@ -122,7 +122,7 @@ impl CompactionState { } pub(crate) fn maintenance(&mut self, gc_watermark: SeqNo) -> crate::Result<()> { - log::debug!("Running manifest GC"); + log::trace!("Running manifest GC with watermark={gc_watermark}"); loop { let Some(head) = self.version_free_list.front() else { @@ -138,7 +138,10 @@ impl CompactionState { } } - log::debug!("Manifest GC done"); + log::trace!( + "Manifest GC done, manifest length now {}", + self.version_free_list_len(), + ); Ok(()) } From 8ace9968a58ce8684b55755e899ebf78a22a9968 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 20:42:09 +0200 Subject: [PATCH 608/613] closes #130 --- src/binary_search.rs | 91 -------------------------------------------- src/lib.rs | 3 -- src/version/run.rs | 12 +++--- 3 files changed, 6 insertions(+), 100 deletions(-) delete mode 100644 src/binary_search.rs diff --git a/src/binary_search.rs b/src/binary_search.rs deleted file mode 100644 index a30a0fed..00000000 --- a/src/binary_search.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -/// Returns the index of the partition point according to the given predicate -/// (the index of the first element of the second partition). 
-/// -/// This seems to be faster than std's `partition_point`: -pub fn partition_point(slice: &[T], pred: F) -> usize -where - F: Fn(&T) -> bool, -{ - let mut left = 0; - let mut right = slice.len(); - - if right == 0 { - return 0; - } - - while left < right { - let mid = (left + right) / 2; - - // SAFETY: See https://github.com/rust-lang/rust/blob/ebf0cf75d368c035f4c7e7246d203bd469ee4a51/library/core/src/slice/mod.rs#L2834-L2836 - #[allow(unsafe_code)] - let item = unsafe { slice.get_unchecked(mid) }; - - if pred(item) { - left = mid + 1; - } else { - right = mid; - } - } - - left -} - -#[cfg(test)] -mod tests { - use super::partition_point; - use test_log::test; - - #[test] - fn binary_search_first() { - let items = [1, 2, 3, 4, 5]; - let idx = partition_point(&items, |&x| x < 1); - assert_eq!(0, idx); - - let std_pp_idx = items.partition_point(|&x| x < 1); - assert_eq!(std_pp_idx, idx); - } - - #[test] - fn binary_search_last() { - let items = [1, 2, 3, 4, 5]; - let idx = partition_point(&items, |&x| x < 5); - assert_eq!(4, idx); - - let std_pp_idx = items.partition_point(|&x| x < 5); - assert_eq!(std_pp_idx, idx); - } - - #[test] - fn binary_search_middle() { - let items = [1, 2, 3, 4, 5]; - let idx = partition_point(&items, |&x| x < 3); - assert_eq!(2, idx); - - let std_pp_idx = items.partition_point(|&x| x < 3); - assert_eq!(std_pp_idx, idx); - } - - #[test] - fn binary_search_none() { - let items = [1, 2, 3, 4, 5]; - let idx = partition_point(&items, |&x| x < 10); - assert_eq!(5, idx); - - let std_pp_idx = items.partition_point(|&x| x < 10); - assert_eq!(std_pp_idx, idx); - } - - #[test] - fn binary_search_empty() { - let items: [i32; 0] = []; - let idx = partition_point(&items, |&x| x < 10); - assert_eq!(0, idx); - - let std_pp_idx = items.partition_point(|&x| x < 10); - assert_eq!(std_pp_idx, idx); - } -} diff --git a/src/lib.rs b/src/lib.rs index 788693e3..e2ca4adc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,9 +144,6 @@ mod any_tree; mod r#abstract; -#[doc(hidden)] -pub mod binary_search; - #[doc(hidden)] pub mod blob_tree; diff --git a/src/version/run.rs b/src/version/run.rs index 039e17e1..2fa2a6e8 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{binary_search::partition_point, KeyRange}; +use crate::KeyRange; use std::ops::{Bound, RangeBounds}; pub trait Ranged { @@ -93,7 +93,7 @@ impl Run { /// Returns the segment that possibly contains the key.
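+    ///
+    /// A hedged sketch of the `partition_point` semantics used below (values
+    /// taken from the deleted helper's own tests): it returns the index of the
+    /// first element of the second partition, e.g.
+    /// `[1, 2, 3, 4, 5].partition_point(|&x| x < 3) == 2`, so the lookup lands
+    /// on the first segment whose max key is not below `key`.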
pub fn get_for_key(&self, key: &[u8]) -> Option<&T> { - let idx = partition_point(self, |x| x.key_range().max() < &key); + let idx = self.partition_point(|x| x.key_range().max() < &key); self.0.get(idx).filter(|x| x.key_range().min() <= &key) } @@ -160,10 +160,10 @@ impl Run { let lo = match key_range.start_bound() { Bound::Unbounded => 0, Bound::Included(start_key) => { - partition_point(level, |x| x.key_range().max() < start_key) + level.partition_point(|x| x.key_range().max() < start_key) } Bound::Excluded(start_key) => { - partition_point(level, |x| x.key_range().max() <= start_key) + level.partition_point(|x| x.key_range().max() <= start_key) } }; @@ -179,7 +179,7 @@ impl Run { Bound::Unbounded => level.len() - 1, Bound::Included(end_key) => { // IMPORTANT: We need to add back `lo` because we sliced it off - let idx = lo + partition_point(truncated_level, |x| x.key_range().min() <= end_key); + let idx = lo + truncated_level.partition_point(|x| x.key_range().min() <= end_key); if idx == 0 { return None; @@ -189,7 +189,7 @@ impl Run { } Bound::Excluded(end_key) => { // IMPORTANT: We need to add back `lo` because we sliced it off - let idx = lo + partition_point(truncated_level, |x| x.key_range().min() < end_key); + let idx = lo + truncated_level.partition_point(|x| x.key_range().min() < end_key); if idx == 0 { return None; From 8615280b20ef92b79097ffc6e04cfb0e5068df04 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 20:43:20 +0200 Subject: [PATCH 609/613] refactor: remove v3_ prefix from test names --- src/clipping_iter.rs | 6 ++--- src/fallible_clipping_iter.rs | 6 ++--- src/run_reader.rs | 4 +-- src/segment/block/hash_index/mod.rs | 10 +++---- src/segment/block/header.rs | 2 +- src/segment/data_block/iter.rs | 42 ++++++++++++++--------------- src/segment/data_block/mod.rs | 32 +++++++++++----------- src/segment/index_block/iter.rs | 22 +++++++-------- src/segment/mod.rs | 14 +++++----- src/segment/util.rs | 2 +- 10 files changed, 70 insertions(+), 70 deletions(-) diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs index 7fc769fd..5675f083 100644 --- a/src/clipping_iter.rs +++ b/src/clipping_iter.rs @@ -161,7 +161,7 @@ mod tests { use test_log::test; #[test] - fn v3_clipping_iter_forwards() { + fn clipping_iter_forwards() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), @@ -184,7 +184,7 @@ mod tests { } #[test] - fn v3_clipping_iter_rev() { + fn clipping_iter_rev() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), @@ -207,7 +207,7 @@ mod tests { } #[test] - fn v3_clipping_iter_ping_pong() { + fn clipping_iter_ping_pong() { let items = [ InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), diff --git a/src/fallible_clipping_iter.rs b/src/fallible_clipping_iter.rs index 6c320d3a..0bef7d81 100644 --- a/src/fallible_clipping_iter.rs +++ b/src/fallible_clipping_iter.rs @@ -161,7 +161,7 @@ mod tests { use test_log::test; #[test] - fn v3_clipping_iter_forwards() -> crate::Result<()> { + fn clipping_iter_forwards() -> crate::Result<()> { let items = [ Ok(InternalValue::from_components( b"a", @@ -211,7 +211,7 @@ mod tests { } #[test] - fn v3_clipping_iter_rev() -> crate::Result<()> { + fn clipping_iter_rev() -> crate::Result<()> { let items = [ 
Ok(InternalValue::from_components( b"a", @@ -267,7 +267,7 @@ mod tests { } #[test] - fn v3_clipping_iter_ping_pong() -> crate::Result<()> { + fn clipping_iter_ping_pong() -> crate::Result<()> { let items = [ Ok(InternalValue::from_components( b"a", diff --git a/src/run_reader.rs b/src/run_reader.rs index a6a292e9..d84ea329 100644 --- a/src/run_reader.rs +++ b/src/run_reader.rs @@ -135,7 +135,7 @@ mod tests { use test_log::test; #[test] - fn v3_run_reader_skip() -> crate::Result<()> { + fn run_reader_skip() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; @@ -175,7 +175,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_run_reader_basic() -> crate::Result<()> { + fn run_reader_basic() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; diff --git a/src/segment/block/hash_index/mod.rs b/src/segment/block/hash_index/mod.rs index 5e0aa26d..cc78066c 100644 --- a/src/segment/block/hash_index/mod.rs +++ b/src/segment/block/hash_index/mod.rs @@ -43,7 +43,7 @@ mod tests { use test_log::test; #[test] - fn v3_hash_index_build_simple() { + fn hash_index_build_simple() { let mut hash_index = Builder::with_bucket_count(100); hash_index.set(b"a", 5); @@ -76,7 +76,7 @@ mod tests { } #[test] - fn v3_hash_index_build_conflict() { + fn hash_index_build_conflict() { let mut hash_index = Builder::with_bucket_count(1); hash_index.set(b"a", 5); @@ -90,7 +90,7 @@ mod tests { } #[test] - fn v3_hash_index_build_same_offset() { + fn hash_index_build_same_offset() { let mut hash_index = Builder::with_bucket_count(1); hash_index.set(b"a", 5); @@ -107,7 +107,7 @@ mod tests { } #[test] - fn v3_hash_index_build_mix() { + fn hash_index_build_mix() { let mut hash_index = Builder::with_bucket_count(1); hash_index.set(b"a", 5); @@ -122,7 +122,7 @@ mod tests { } #[test] - fn v3_hash_index_read_conflict() { + fn hash_index_read_conflict() { let mut hash_index = Builder::with_bucket_count(1); hash_index.set(b"a", 5); diff --git a/src/segment/block/header.rs b/src/segment/block/header.rs index 0d056d51..8fc1e9ba 100644 --- a/src/segment/block/header.rs +++ b/src/segment/block/header.rs @@ -134,7 +134,7 @@ mod tests { use test_log::test; #[test] - fn v3_block_header_serde_roundtrip() -> crate::Result<()> { + fn block_header_serde_roundtrip() -> crate::Result<()> { let header = Header { block_type: BlockType::Data, checksum: Checksum::from_raw(5), diff --git a/src/segment/data_block/iter.rs b/src/segment/data_block/iter.rs index 19232ec9..e1230fb8 100644 --- a/src/segment/data_block/iter.rs +++ b/src/segment/data_block/iter.rs @@ -132,7 +132,7 @@ mod tests { use test_log::test; #[test] - fn v3_data_block_wtf() -> crate::Result<()> { + fn data_block_wtf() -> crate::Result<()> { let keys = [ [0, 0, 0, 0, 0, 0, 0, 108], [0, 0, 0, 0, 0, 0, 0, 109], @@ -246,7 +246,7 @@ mod tests { } #[test] - fn v3_data_block_range() -> crate::Result<()> { + fn data_block_range() -> crate::Result<()> { let items = (100u64..110) .map(|i| InternalValue::from_components(i.to_be_bytes(), "", 0, Value)) .collect::>(); @@ -320,7 +320,7 @@ mod tests { } #[test] - fn v3_data_block_range_ping_pong() -> crate::Result<()> { + fn data_block_range_ping_pong() -> crate::Result<()> { let items = (0u64..100) .map(|i| InternalValue::from_components(i.to_be_bytes(), "", 0, Value)) .collect::>(); @@ -368,7 +368,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward() -> crate::Result<()> { + fn data_block_iter_forward() -> crate::Result<()> { 
let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -403,7 +403,7 @@ mod tests { } #[test] - fn v3_data_block_iter_rev() -> crate::Result<()> { + fn data_block_iter_rev() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -442,7 +442,7 @@ mod tests { } #[test] - fn v3_data_block_iter_rev_seek_back() -> crate::Result<()> { + fn data_block_iter_rev_seek_back() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -482,7 +482,7 @@ mod tests { } #[test] - fn v3_data_block_iter_range_edges() -> crate::Result<()> { + fn data_block_iter_range_edges() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -563,7 +563,7 @@ mod tests { } #[test] - fn v3_data_block_iter_range() -> crate::Result<()> { + fn data_block_iter_range() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -604,7 +604,7 @@ mod tests { } #[test] - fn v3_data_block_iter_only_first() -> crate::Result<()> { + fn data_block_iter_only_first() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -644,7 +644,7 @@ mod tests { } #[test] - fn v3_data_block_iter_range_same_key() -> crate::Result<()> { + fn data_block_iter_range_same_key() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -747,7 +747,7 @@ mod tests { } #[test] - fn v3_data_block_iter_range_empty() -> crate::Result<()> { + fn data_block_iter_range_empty() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -818,7 +818,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward_seek_restart_head() -> crate::Result<()> { + fn data_block_iter_forward_seek_restart_head() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -855,7 +855,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward_seek_in_interval() -> crate::Result<()> { + fn data_block_iter_forward_seek_in_interval() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -895,7 +895,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward_seek_last() -> crate::Result<()> { + fn data_block_iter_forward_seek_last() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -935,7 +935,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward_seek_before_first() -> crate::Result<()> { + fn data_block_iter_forward_seek_before_first() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -972,7 +972,7 @@ mod tests { } #[test] - fn v3_data_block_iter_forward_seek_after_last() -> crate::Result<()> { + fn data_block_iter_forward_seek_after_last() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 
0, Value), @@ -1005,7 +1005,7 @@ mod tests { } #[test] - fn v3_data_block_iter_consume_last_back() -> crate::Result<()> { + fn data_block_iter_consume_last_back() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), @@ -1093,7 +1093,7 @@ mod tests { } #[test] - fn v3_data_block_iter_consume_last_forwards() -> crate::Result<()> { + fn data_block_iter_consume_last_forwards() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), @@ -1183,7 +1183,7 @@ mod tests { } #[test] - fn v3_data_block_iter_ping_pong_exhaust() -> crate::Result<()> { + fn data_block_iter_ping_pong_exhaust() -> crate::Result<()> { let items = [ InternalValue::from_components("a", "a", 0, Value), InternalValue::from_components("b", "b", 0, Value), @@ -1273,7 +1273,7 @@ mod tests { } #[test] - fn v3_data_block_iter_fuzz_3() -> crate::Result<()> { + fn data_block_iter_fuzz_3() -> crate::Result<()> { let items = [ InternalValue::from_components( Slice::from([ @@ -1324,7 +1324,7 @@ mod tests { } #[test] - fn v3_data_block_iter_fuzz_4() -> crate::Result<()> { + fn data_block_iter_fuzz_4() -> crate::Result<()> { let items = [ InternalValue::from_components( Slice::new(&[0]), diff --git a/src/segment/data_block/mod.rs b/src/segment/data_block/mod.rs index 218aacda..a73f9e4b 100644 --- a/src/segment/data_block/mod.rs +++ b/src/segment/data_block/mod.rs @@ -517,7 +517,7 @@ mod tests { use test_log::test; #[test] - fn v3_data_block_ping_pong_fuzz_1() -> crate::Result<()> { + fn data_block_ping_pong_fuzz_1() -> crate::Result<()> { let items = [ InternalValue::from_components( Slice::from([111]), @@ -586,7 +586,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_simple() -> crate::Result<()> { + fn data_block_point_read_simple() -> crate::Result<()> { let items = [ InternalValue::from_components("b", "b", 0, Value), InternalValue::from_components("c", "c", 0, Value), @@ -628,7 +628,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_one() -> crate::Result<()> { + fn data_block_point_read_one() -> crate::Result<()> { let items = [InternalValue::from_components( "pla:earth:fact", "eaaaaaaaaarth", @@ -666,7 +666,7 @@ mod tests { } #[test] - fn v3_data_block_vhandle() -> crate::Result<()> { + fn data_block_vhandle() -> crate::Result<()> { let items = [InternalValue::from_components( "abc", "world", @@ -699,7 +699,7 @@ mod tests { } #[test] - fn v3_data_block_mvcc_read_first() -> crate::Result<()> { + fn data_block_mvcc_read_first() -> crate::Result<()> { let items = [InternalValue::from_components( "hello", "world", @@ -731,7 +731,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_fuzz_1() -> crate::Result<()> { + fn data_block_point_read_fuzz_1() -> crate::Result<()> { let items = [ InternalValue::from_components([0], b"", 23_523_531_241_241_242, Value), InternalValue::from_components([0], b"", 0, Value), @@ -770,7 +770,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_fuzz_2() -> crate::Result<()> { + fn data_block_point_read_fuzz_2() -> crate::Result<()> { let items = [ InternalValue::from_components([0], [], 5, Value), InternalValue::from_components([0], [], 4, Tombstone), @@ -806,7 +806,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_dense() -> crate::Result<()> { + fn data_block_point_read_dense() -> crate::Result<()> { let 
items = [ InternalValue::from_components(b"a", b"a", 3, Value), InternalValue::from_components(b"b", b"b", 2, Value), @@ -841,7 +841,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_dense_mvcc_with_hash() -> crate::Result<()> { + fn data_block_point_read_dense_mvcc_with_hash() -> crate::Result<()> { let items = [ InternalValue::from_components(b"a", b"a", 3, Value), InternalValue::from_components(b"a", b"a", 2, Value), @@ -883,7 +883,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_data_block_point_read_mvcc_latest_fuzz_1() -> crate::Result<()> { + fn data_block_point_read_mvcc_latest_fuzz_1() -> crate::Result<()> { let items = [ InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 0, Value), @@ -920,7 +920,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_data_block_point_read_mvcc_latest_fuzz_2() -> crate::Result<()> { + fn data_block_point_read_mvcc_latest_fuzz_2() -> crate::Result<()> { let items = [ InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), @@ -969,7 +969,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_data_block_point_read_mvcc_latest_fuzz_3() -> crate::Result<()> { + fn data_block_point_read_mvcc_latest_fuzz_3() -> crate::Result<()> { let items = [ InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), @@ -1018,7 +1018,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_data_block_point_read_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { + fn data_block_point_read_mvcc_latest_fuzz_3_dense() -> crate::Result<()> { let items = [ InternalValue::from_components(Slice::from([0]), Slice::from([]), 0, Value), InternalValue::from_components(Slice::from([233, 233]), Slice::from([]), 8, Value), @@ -1066,7 +1066,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_dense_mvcc_no_hash() -> crate::Result<()> { + fn data_block_point_read_dense_mvcc_no_hash() -> crate::Result<()> { let items = [ InternalValue::from_components(b"a", b"a", 3, Value), InternalValue::from_components(b"a", b"a", 2, Value), @@ -1102,7 +1102,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_shadowing() -> crate::Result<()> { + fn data_block_point_read_shadowing() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:saturn:fact", "Saturn is pretty big", 0, Value), InternalValue::from_components("pla:saturn:name", "Saturn", 0, Value), @@ -1140,7 +1140,7 @@ mod tests { } #[test] - fn v3_data_block_point_read_dense_2() -> crate::Result<()> { + fn data_block_point_read_dense_2() -> crate::Result<()> { let items = [ InternalValue::from_components("pla:earth:fact", "eaaaaaaaaarth", 0, Value), InternalValue::from_components("pla:jupiter:fact", "Jupiter is big", 0, Value), diff --git a/src/segment/index_block/iter.rs b/src/segment/index_block/iter.rs index e12680ed..219abf4c 100644 --- a/src/segment/index_block/iter.rs +++ b/src/segment/index_block/iter.rs @@ -60,7 +60,7 @@ mod tests { use test_log::test; #[test] - fn v3_index_block_iter_seek_before_start() -> crate::Result<()> { + fn index_block_iter_seek_before_start() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -96,7 +96,7 @@ mod tests { } #[test] - 
fn v3_index_block_iter_seek_start() -> crate::Result<()> { + fn index_block_iter_seek_start() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -130,7 +130,7 @@ mod tests { } #[test] - fn v3_index_block_iter_seek_middle() -> crate::Result<()> { + fn index_block_iter_seek_middle() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -167,7 +167,7 @@ mod tests { } #[test] - fn v3_index_block_iter_rev_seek() -> crate::Result<()> { + fn index_block_iter_rev_seek() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -201,7 +201,7 @@ mod tests { } #[test] - fn v3_index_block_iter_rev_seek_2() -> crate::Result<()> { + fn index_block_iter_rev_seek_2() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -235,7 +235,7 @@ mod tests { } #[test] - fn v3_index_block_iter_rev_seek_3() -> crate::Result<()> { + fn index_block_iter_rev_seek_3() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -272,7 +272,7 @@ mod tests { } #[test] - fn v3_index_block_iter_too_far() -> crate::Result<()> { + fn index_block_iter_too_far() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -306,7 +306,7 @@ mod tests { } #[test] - fn v3_index_block_iter_too_far_next_back() -> crate::Result<()> { + fn index_block_iter_too_far_next_back() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"b".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"bcdef".into(), BlockOffset(6_000), 7_000), @@ -338,7 +338,7 @@ mod tests { } #[test] - fn v3_index_block_iter_span() -> crate::Result<()> { + fn index_block_iter_span() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), @@ -388,7 +388,7 @@ mod tests { } #[test] - fn v3_index_block_iter_rev_span() -> crate::Result<()> { + fn index_block_iter_rev_span() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"a".into(), BlockOffset(6_000), 7_000), @@ -435,7 +435,7 @@ mod tests { } #[test] - fn v3_index_block_iter_range_1() -> crate::Result<()> { + fn index_block_iter_range_1() -> crate::Result<()> { let items = [ KeyedBlockHandle::new(b"a".into(), BlockOffset(0), 6_000), KeyedBlockHandle::new(b"b".into(), BlockOffset(13_000), 5_000), diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 1e68e96c..58fc279d 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -573,7 +573,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_recover() -> crate::Result<()> { + fn segment_recover() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -664,7 +664,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_scan() -> crate::Result<()> { + fn segment_scan() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -721,7 +721,7 @@ mod tests { 
#[test] #[allow(clippy::unwrap_used)] - fn v3_segment_iter_simple() -> crate::Result<()> { + fn segment_iter_simple() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -777,7 +777,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_range_simple() -> crate::Result<()> { + fn segment_range_simple() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -844,7 +844,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_range_ping_pong() -> crate::Result<()> { + fn segment_range_ping_pong() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -917,7 +917,7 @@ mod tests { #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_range_multiple_data_blocks() -> crate::Result<()> { + fn segment_range_multiple_data_blocks() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); @@ -993,7 +993,7 @@ mod tests { // TODO: when using stats cfg feature: check filter hits += 1 #[test] #[allow(clippy::unwrap_used)] - fn v3_segment_unpinned_filter() -> crate::Result<()> { + fn segment_unpinned_filter() -> crate::Result<()> { let dir = tempdir()?; let file = dir.path().join("segment"); diff --git a/src/segment/util.rs b/src/segment/util.rs index 70e11c39..f349cd5c 100644 --- a/src/segment/util.rs +++ b/src/segment/util.rs @@ -158,7 +158,7 @@ mod tests { use test_log::test; #[test] - fn v3_compare_prefixed_slice() { + fn test_compare_prefixed_slice() { use std::cmp::Ordering::{Equal, Greater, Less}; assert_eq!(Equal, compare_prefixed_slice(b"", b"", b"")); From 29a75d2cc0557e9ee9c47126096484e53bbb4ce7 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 16 Oct 2025 23:10:13 +0200 Subject: [PATCH 610/613] refactor --- src/clipping_iter.rs | 236 -------------------------------------- src/compaction/flavour.rs | 33 ++++-- 2 files changed, 23 insertions(+), 246 deletions(-) delete mode 100644 src/clipping_iter.rs diff --git a/src/clipping_iter.rs b/src/clipping_iter.rs deleted file mode 100644 index 5675f083..00000000 --- a/src/clipping_iter.rs +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use crate::InternalValue; -use std::{ - marker::PhantomData, - ops::{Bound, RangeBounds}, -}; - -type Item = InternalValue; - -/// Clips an iterator to a key range -pub struct ClippingIter<'a, K, R, I> -where - K: AsRef<[u8]>, - R: RangeBounds, - I: DoubleEndedIterator, -{ - _phantom: std::marker::PhantomData, - - inner: I, - range: &'a R, - - has_entered_lo: bool, - has_entered_hi: bool, -} - -impl<'a, K, R, I> ClippingIter<'a, K, R, I> -where - K: AsRef<[u8]>, - R: RangeBounds, - I: DoubleEndedIterator, -{ - pub fn new(iter: I, range: &'a R) -> Self { - Self { - _phantom: PhantomData, - - inner: iter, - range, - - has_entered_lo: false, - has_entered_hi: false, - } - } -} - -impl Iterator for ClippingIter<'_, K, R, I> -where - K: AsRef<[u8]>, - R: RangeBounds, - I: DoubleEndedIterator, -{ - type Item = Item; - - fn next(&mut self) -> Option { - loop { - let item = self.inner.next()?; - - // NOTE: PERF: As soon as we enter ->[lo..] 
- // we don't need to do key comparisons anymore which are - // more expensive than a simple flag check, especially for long keys - if !self.has_entered_lo { - match self.range.start_bound() { - Bound::Included(start) => { - if item.key.user_key < start.as_ref() { - // Before min key - continue; - } - self.has_entered_lo = true; - } - Bound::Excluded(start) => { - if item.key.user_key <= start.as_ref() { - // Before or equal min key - continue; - } - self.has_entered_lo = true; - } - Bound::Unbounded => {} - } - } - - match self.range.end_bound() { - Bound::Included(start) => { - if item.key.user_key > start.as_ref() { - // After max key - return None; - } - } - Bound::Excluded(start) => { - if item.key.user_key >= start.as_ref() { - // Reached max key - return None; - } - } - Bound::Unbounded => {} - } - - return Some(item); - } - } -} - -impl DoubleEndedIterator for ClippingIter<'_, K, R, I> -where - K: AsRef<[u8]>, - R: RangeBounds, - I: DoubleEndedIterator, -{ - fn next_back(&mut self) -> Option { - loop { - let item = self.inner.next_back()?; - - match self.range.start_bound() { - Bound::Included(start) => { - if item.key.user_key < start.as_ref() { - // Reached min key - return None; - } - } - Bound::Excluded(start) => { - if item.key.user_key <= start.as_ref() { - // Before min key - return None; - } - } - Bound::Unbounded => {} - } - - // NOTE: PERF: As soon as we enter [..hi]<- - // we don't need to do key comparisons anymore which are - // more expensive than a simple flag check, especially for long keys - if !self.has_entered_hi { - match self.range.end_bound() { - Bound::Included(end) => { - if item.key.user_key > end.as_ref() { - // After max key - continue; - } - self.has_entered_hi = true; - } - Bound::Excluded(end) => { - if item.key.user_key >= end.as_ref() { - // After or equal max key - continue; - } - self.has_entered_hi = true; - } - Bound::Unbounded => {} - } - } - - return Some(item); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use test_log::test; - - #[test] - fn clipping_iter_forwards() { - let items = [ - InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), - ]; - let range = "c"..="d"; - - let mut iter = ClippingIter::new(items.into_iter(), &range); - assert_eq!( - Some(b"c" as &[u8]), - iter.next().map(|x| x.key.user_key).as_deref(), - ); - assert_eq!( - Some(b"d" as &[u8]), - iter.next().map(|x| x.key.user_key).as_deref(), - ); - assert!(iter.next().is_none()); - } - - #[test] - fn clipping_iter_rev() { - let items = [ - InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), - ]; - let range = "c"..="d"; - - let mut iter = ClippingIter::new(items.into_iter(), &range); - assert_eq!( - Some(b"d" as &[u8]), - iter.next_back().map(|x| x.key.user_key).as_deref(), - ); - assert_eq!( - Some(b"c" as &[u8]), - iter.next_back().map(|x| x.key.user_key).as_deref(), - ); - assert!(iter.next_back().is_none()); - } - - #[test] - fn 
clipping_iter_ping_pong() { - let items = [ - InternalValue::from_components(b"a", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"b", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"c", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"d", b"", 0, crate::ValueType::Value), - InternalValue::from_components(b"e", b"", 0, crate::ValueType::Value), - ]; - let range = "b"..="d"; - - let mut iter = ClippingIter::new(items.into_iter(), &range); - assert_eq!( - Some(b"b" as &[u8]), - iter.next().map(|x| x.key.user_key).as_deref(), - ); - assert_eq!( - Some(b"d" as &[u8]), - iter.next_back().map(|x| x.key.user_key).as_deref(), - ); - assert_eq!( - Some(b"c" as &[u8]), - iter.next().map(|x| x.key.user_key).as_deref(), - ); - assert!(iter.next_back().is_none()); - assert!(iter.next().is_none()); - } -} diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index b8546160..e240a181 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -108,7 +108,7 @@ impl RelocatingCompaction { inner: StandardCompaction, blob_scanner: Peekable, blob_writer: BlobFileWriter, - rewriting_blob_file_ids: HashSet, + rewriting_blob_file_ids: HashSet, // TODO: <- remove rewriting_blob_files: Vec, ) -> Self { Self { @@ -120,6 +120,8 @@ impl RelocatingCompaction { } } + // TODO: vvv validate/unit test this vvv + /// Drains all blobs that come "before" the given vptr. fn drain_blobs(&mut self, key: &[u8], vptr: &BlobIndirection) -> crate::Result<()> { loop { @@ -136,7 +138,7 @@ impl RelocatingCompaction { match blob { Ok((entry, _)) => { - assert!((entry.key <= key), "vptr was not matched with blob"); + assert!(entry.key <= key, "vptr was not matched with blob"); } Err(e) => return Err(e), } @@ -173,9 +175,18 @@ impl CompactionFlavour for RelocatingCompaction { .next() .expect("vptr was not matched with blob (scanner is unexpectedly exhausted)")?; - debug_assert_eq!(blob_file_id, indirection.vhandle.blob_file_id); - debug_assert_eq!(blob_entry.key, item.key.user_key); - debug_assert_eq!(blob_entry.offset, indirection.vhandle.offset); + assert_eq!( + blob_file_id, indirection.vhandle.blob_file_id, + "matched blob has different blob file ID than vptr", + ); + assert_eq!( + blob_entry.key, item.key.user_key, + "matched blob has different key than vptr", + ); + assert_eq!( + blob_entry.offset, indirection.vhandle.offset, + "matched blob has different offset than vptr", + ); log::trace!( "=> use blob: {:?}:{} offset: {} from BF {}", @@ -239,12 +250,11 @@ impl CompactionFlavour for RelocatingCompaction { let created_tables = self.inner.consume_writer(opts, dst_lvl)?; let created_blob_files = self.blob_writer.finish()?; - let mut blob_file_ids_to_drop = self.rewriting_blob_file_ids; + let mut blob_files_to_drop = self.rewriting_blob_files; for blob_file in super_version.version.value_log.values() { if blob_file.is_dead(super_version.version.gc_stats()) { - blob_file_ids_to_drop.insert(blob_file.id()); - self.rewriting_blob_files.push(blob_file.clone()); + blob_files_to_drop.push(blob_file.clone()); } } @@ -261,7 +271,10 @@ impl CompactionFlavour for RelocatingCompaction { Some(blob_frag_map_diff) }, created_blob_files, - blob_file_ids_to_drop, + blob_files_to_drop + .iter() + .map(BlobFile::id) + .collect::>(), )) }, opts.eviction_seqno, @@ -274,7 +287,7 @@ impl CompactionFlavour for RelocatingCompaction { table.mark_as_deleted(); } - for blob_file in self.rewriting_blob_files { + for blob_file in blob_files_to_drop { 
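+            // `blob_files_to_drop` was seeded with the rewritten source files
+            // and then extended with any blob files found fully dead above, so
+            // this single pass marks both groups for deletion.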
blob_file.mark_as_deleted(); } From c825c648b69fe7ea3f0413314498dbc400cfcccc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 17 Oct 2025 19:42:54 +0200 Subject: [PATCH 611/613] add test --- tests/blob_register_table_rotation.rs | 124 ++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 tests/blob_register_table_rotation.rs diff --git a/tests/blob_register_table_rotation.rs b/tests/blob_register_table_rotation.rs new file mode 100644 index 00000000..086eaa21 --- /dev/null +++ b/tests/blob_register_table_rotation.rs @@ -0,0 +1,124 @@ +use lsm_tree::{config::BlockSizePolicy, AbstractTree, KvSeparationOptions}; +use test_log::test; + +// Force one block per table and one blob per block +// +// Then check if item_count in a table matches the number of referenced blobs (so 1). +// +// See https://github.com/fjall-rs/lsm-tree/commit/0d2d7b2071c65f2538bb01e4512907892991dcbe +#[test] +fn blob_register_table_rotation() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .data_block_size_policy(BlockSizePolicy::all(1)) + .with_kv_separation(Some( + KvSeparationOptions::default() + .separation_threshold(0) + .age_cutoff(1.0) + .staleness_threshold(0.0), + )) + .open()?; + + tree.insert("a", "a", 0); + tree.insert("b", "b", 0); + tree.insert("c", "c", 0); + tree.insert("d", "d", 0); + tree.insert("e", "e", 0); + + tree.flush_active_memtable(0)?; + tree.major_compact(1, 0)?; + + assert_eq!(5, tree.segment_count()); + assert_eq!(1, tree.blob_file_count()); + + for table in tree.current_version().iter_segments() { + assert_eq!( + 1, + table + .list_blob_file_references()? + .unwrap() + .iter() + .map(|x| x.len) + .sum::(), + ); + assert_eq!( + 1, + table + .list_blob_file_references()? + .unwrap() + .iter() + .map(|x| x.bytes) + .sum::(), + ); + } + + Ok(()) +} + +#[test] +fn blob_register_table_rotation_relocation() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + + let tree = lsm_tree::Config::new(path) + .data_block_size_policy(BlockSizePolicy::all(1)) + .with_kv_separation(Some( + KvSeparationOptions::default() + .separation_threshold(0) + .age_cutoff(1.0) + .staleness_threshold(0.0), + )) + .open()?; + + tree.insert("a", "a", 0); + tree.insert("b", "b", 0); + tree.insert("c", "c", 0); + tree.insert("d", "d", 0); + tree.insert("e", "e", 0); + tree.insert("f", "f", 0); // f is not overwritten + + tree.flush_active_memtable(0)?; + tree.major_compact(1, 0)?; + + tree.insert("a", "a", 1); + tree.insert("b", "b", 1); + tree.insert("c", "c", 1); + tree.insert("d", "d", 1); + tree.insert("e", "e", 1); + + tree.flush_active_memtable(0)?; + tree.major_compact(1, 10)?; + + assert_eq!(6, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + tree.major_compact(1, 11)?; + + assert_eq!(6, tree.segment_count()); + assert_eq!(2, tree.blob_file_count()); + + for table in tree.current_version().iter_segments() { + assert_eq!( + 1, + table + .list_blob_file_references()? + .unwrap() + .iter() + .map(|x| x.len) + .sum::(), + ); + assert_eq!( + 1, + table + .list_blob_file_references()? 
+ .unwrap() + .iter() + .map(|x| x.bytes) + .sum::(), + ); + } + + Ok(()) +} From 8f5e4cd5b7d672b5be8747ecbf2990ef5741d2a0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 17 Oct 2025 19:43:53 +0200 Subject: [PATCH 612/613] wip --- src/blob_tree/gc.rs | 4 ++-- src/compaction/flavour.rs | 16 ++++++++-------- src/compaction/major.rs | 5 ----- src/compaction/worker.rs | 1 - src/lib.rs | 1 + src/segment/meta.rs | 2 +- src/segment/mod.rs | 2 +- src/segment/multi_writer.rs | 2 +- src/segment/writer/mod.rs | 2 +- src/version/mod.rs | 2 +- tests/blob_major_compact_gc_stats.rs | 4 ++-- tests/blob_major_compact_relink.rs | 4 ++-- tests/blob_simple.rs | 3 --- 13 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/blob_tree/gc.rs b/src/blob_tree/gc.rs index 95d2098a..0e7d7d57 100644 --- a/src/blob_tree/gc.rs +++ b/src/blob_tree/gc.rs @@ -47,14 +47,14 @@ impl FragmentationMap { self.0.values().map(|x| x.bytes).sum() } - // TODO: unit test + // TODO: 3.0.0 unit test /// Removes blob file entries that are not part of the value log (anymore) /// to reduce linear memory growth. pub fn prune(&mut self, value_log: &BTreeMap) { self.0.retain(|k, _| value_log.contains_key(k)); } - // TODO: unit test + // TODO: 3.0.0 unit test pub fn merge_into(self, other: &mut Self) { for (blob_file_id, diff) in self.0 { other diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index e240a181..d14abb5f 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -44,12 +44,13 @@ pub(super) fn prepare_table_writer( let is_last_level = payload.dest_level == last_level; log::debug!( - "Compacting tables {:?} into L{} (canonical L{}), data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}", - payload.segment_ids, - payload.dest_level, - payload.canonical_level, - opts.eviction_seqno, - ); + "Compacting tables {:?} into L{} (canonical L{}), target_size={}, data_block_restart_interval={data_block_restart_interval}, index_block_restart_interval={index_block_restart_interval}, data_block_size={data_block_size}, index_block_size={index_block_size}, data_block_compression={data_block_compression}, index_block_compression={index_block_compression}, mvcc_gc_watermark={}", + payload.segment_ids, + payload.dest_level, + payload.canonical_level, + payload.target_size, + opts.eviction_seqno, + ); Ok(table_writer .use_data_block_restart_interval(data_block_restart_interval) @@ -108,14 +109,13 @@ impl RelocatingCompaction { inner: StandardCompaction, blob_scanner: Peekable, blob_writer: BlobFileWriter, - rewriting_blob_file_ids: HashSet, // TODO: <- remove rewriting_blob_files: Vec, ) -> Self { Self { inner, blob_scanner, blob_writer, - rewriting_blob_file_ids, + rewriting_blob_file_ids: rewriting_blob_files.iter().map(BlobFile::id).collect(), rewriting_blob_files, } } diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 580900cc..5641f301 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -14,14 +14,9 @@ pub struct Strategy { impl Strategy { /// Configures a new `Major` compaction strategy. - /// - /// # Panics - /// - /// Panics, if `target_size` is below 1024 bytes. 
#[must_use] #[allow(dead_code)] pub fn new(target_size: u64) -> Self { - assert!(target_size >= 1_024); Self { target_size } } } diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 6b507508..2aa768a0 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -393,7 +393,6 @@ fn merge_segments( inner, scanner.peekable(), writer, - blob_files_to_rewrite.iter().map(BlobFile::id).collect(), blob_files_to_rewrite, )) } diff --git a/src/lib.rs b/src/lib.rs index e2ca4adc..0c0b2b82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -233,6 +233,7 @@ pub type KvPair = (UserKey, UserValue); #[doc(hidden)] pub use { + blob_tree::handle::BlobIndirection, key_range::KeyRange, merge::BoxedIterator, segment::{block::Checksum, GlobalSegmentId, Segment, SegmentId}, diff --git a/src/segment/meta.rs b/src/segment/meta.rs index 18427efc..288703c6 100644 --- a/src/segment/meta.rs +++ b/src/segment/meta.rs @@ -143,7 +143,7 @@ impl ParsedMeta { let tombstone_count = read_u64!(block, b"#tombstone_count"); let data_block_count = read_u64!(block, b"#data_block_count"); let index_block_count = read_u64!(block, b"#index_block_count"); - let file_size = read_u64!(block, b"#size"); // TODO: 3.0.0 rename file_size + let file_size = read_u64!(block, b"#file_size"); let weak_tombstone_count = read_u64!(block, b"#weak_tombstone_count"); let weak_tombstone_reclaimable = read_u64!(block, b"#weak_tombstone_reclaimable"); diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 58fc279d..9659aae3 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -101,7 +101,7 @@ impl std::fmt::Debug for Segment { } impl Segment { - pub fn get_linked_blob_files(&self) -> crate::Result>> { + pub fn list_blob_file_references(&self) -> crate::Result>> { use byteorder::{ReadBytesExt, LE}; Ok(if let Some(handle) = &self.regions.linked_blob_files { diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index 686a78fc..23621dc7 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -87,7 +87,7 @@ impl MultiWriter { current_key: None, - linked_blobs: HashMap::default(), // TODO: consume on finish or rotate + linked_blobs: HashMap::default(), }) } diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index a7ec1975..ea9c4ae2 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -412,6 +412,7 @@ impl Writer { "#data_block_count", &(self.meta.data_block_count as u64).to_le_bytes(), ), + meta("#file_size", &self.meta.file_pos.to_le_bytes()), meta("#filter_hash_type", b"xxh3"), meta("#id", &self.segment_id.to_le_bytes()), meta( @@ -444,7 +445,6 @@ impl Writer { ), meta("#seqno#max", &self.meta.highest_seqno.to_le_bytes()), meta("#seqno#min", &self.meta.lowest_seqno.to_le_bytes()), - meta("#size", &self.meta.file_pos.to_le_bytes()), meta( "#tombstone_count", &(self.meta.tombstone_count as u64).to_le_bytes(), diff --git a/src/version/mod.rs b/src/version/mod.rs index 6e66c06a..c71198ad 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -429,7 +429,7 @@ impl Version { let mut copy = self.gc_stats.deref().clone(); for segment in &dropped_segments { - let linked_blob_files = segment.get_linked_blob_files()?.unwrap_or_default(); + let linked_blob_files = segment.list_blob_file_references()?.unwrap_or_default(); for blob_file in linked_blob_files { copy.entry(blob_file.blob_file_id) diff --git a/tests/blob_major_compact_gc_stats.rs b/tests/blob_major_compact_gc_stats.rs index c9d2a16d..742751d1 100644 --- a/tests/blob_major_compact_gc_stats.rs +++ 
b/tests/blob_major_compact_gc_stats.rs @@ -93,7 +93,7 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { .iter_segments() .nth(1) .unwrap() - .get_linked_blob_files()?, + .list_blob_file_references()?, ); // Blob file has no fragmentation before compaction (in stats) @@ -124,7 +124,7 @@ fn blob_tree_major_compact_gc_stats_tombstone() -> lsm_tree::Result<()> { .iter_segments() .next() .unwrap() - .get_linked_blob_files()?, + .list_blob_file_references()?, ); } diff --git a/tests/blob_major_compact_relink.rs b/tests/blob_major_compact_relink.rs index d4b3d9d9..7cdbe73b 100644 --- a/tests/blob_major_compact_relink.rs +++ b/tests/blob_major_compact_relink.rs @@ -34,7 +34,7 @@ fn blob_tree_major_compact_relink() -> lsm_tree::Result<()> { .iter_segments() .next() .unwrap() - .get_linked_blob_files()?, + .list_blob_file_references()?, ); tree.flush_active_memtable(1)?; @@ -53,7 +53,7 @@ fn blob_tree_major_compact_relink() -> lsm_tree::Result<()> { .iter_segments() .next() .unwrap() - .get_linked_blob_files()?, + .list_blob_file_references()?, ); } diff --git a/tests/blob_simple.rs b/tests/blob_simple.rs index d51eb03f..22444b05 100644 --- a/tests/blob_simple.rs +++ b/tests/blob_simple.rs @@ -10,9 +10,6 @@ fn blob_tree_simple_flush_read() -> lsm_tree::Result<()> { let new_big_value = b"winter!".repeat(128_000); { - // TODO: 3.0.0 just do Config.with_kv_separation().open() - // on recover, check manifest for type - // just return AnyTree let tree = lsm_tree::Config::new(path) .with_kv_separation(Some(Default::default())) .open()?; From c74abfdc5efaef2236896623230f0b15df23120c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 17 Oct 2025 19:44:02 +0200 Subject: [PATCH 613/613] refactor: blob file picking --- src/compaction/worker.rs | 145 +++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 60 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 2aa768a0..87845226 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -213,6 +213,85 @@ fn move_segments( Ok(()) } +// TODO: 3.0.0 unit test +/// Picks blob files to rewrite (defragment) +fn pick_blob_files_to_rewrite( + picked_tables: &HashSet, + current_version: &Version, + blob_opts: &crate::KvSeparationOptions, +) -> crate::Result> { + use crate::Segment; + + // We start off by getting all the blob files that are referenced by the tables + // that we want to compact. + let linked_blob_files = picked_tables + .iter() + .map(|&id| { + current_version.get_segment(id).unwrap_or_else(|| { + panic!("table {id} should exist"); + }) + }) + .map(Segment::list_blob_file_references) + .collect::, _>>()?; + + // Then we filter all blob files that are not fragmented or old enough. 
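+    // (A hedged example, assuming `is_stale` compares the stale-byte fraction
+    // against the threshold: with `staleness_threshold = 0.33`, a blob file
+    // whose tracked dead values make up half of its payload is picked here,
+    // while a fully dead file is skipped because the version change commit
+    // drops it anyway.)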
+ let mut linked_blob_files = linked_blob_files + .into_iter() + .flatten() + .flatten() + .map(|blob_file_ref| { + current_version + .value_log + .get(&blob_file_ref.blob_file_id) + .unwrap_or_else(|| { + panic!("blob file {} should exist", blob_file_ref.blob_file_id); + }) + }) + .filter(|blob_file| { + blob_file.is_stale(current_version.gc_stats(), blob_opts.staleness_threshold) + }) + .filter(|blob_file| { + // NOTE: Dead blob files are dropped anyway during current_version change commit + !blob_file.is_dead(current_version.gc_stats()) + }) + .collect::>() + .into_iter() + .collect::>(); + + linked_blob_files.sort_by_key(|a| a.id()); + + let cutoff_point = { + let len = linked_blob_files.len() as f32; + (len * blob_opts.age_cutoff) as usize + }; + linked_blob_files.drain(cutoff_point..); + + // IMPORTANT: Additionally, we also have to check if any other tables reference any of our candidate blob files. + // We have to *not* include blob files that are referenced by other tables, because otherwise those + // blob references would point into nothing (becoming dangling). + for table in current_version.iter_segments() { + if picked_tables.contains(&table.id()) { + continue; + } + + let other_ref = table + .list_blob_file_references() + .expect("should not fail") + .unwrap_or_default(); + + let other_refs = other_ref + .into_iter() + .filter(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id)) + .collect::>(); + + for additional_ref in other_refs { + linked_blob_files.retain(|x| x.id() != additional_ref.blob_file_id); + } + } + + Ok(linked_blob_files.into_iter().cloned().collect::>()) +} + fn hidden_guard( payload: &CompactionPayload, opts: &Options, @@ -300,68 +379,12 @@ fn merge_segments( Some(blob_opts) => { merge_iter = merge_iter.with_expiration_callback(&mut blob_frag_map); - let blob_files_to_rewrite = { - // TODO: 3.0.0 vvv if blob gc is disabled, skip this part vvv - - // TODO: 3.0.0 unit test and optimize... 
somehow - let mut linked_blob_files = payload - .segment_ids - .iter() - .map(|&id| current_version.get_segment(id).expect("table should exist")) - .filter_map(|x| x.get_linked_blob_files().expect("handle error")) - .flatten() - .map(|blob_file_ref| { - current_version - .value_log - .get(&blob_file_ref.blob_file_id) - .expect("blob file should exist") - }) - .filter(|blob_file| { - blob_file - .is_stale(current_version.gc_stats(), blob_opts.staleness_threshold) - }) - .filter(|blob_file| { - // NOTE: Dead blob files are dropped anyway during current_version change commit - !blob_file.is_dead(current_version.gc_stats()) - }) - .collect::>() - .into_iter() - .collect::>(); - - linked_blob_files.sort_by_key(|a| a.id()); - - let cutoff_point = { - let len = linked_blob_files.len() as f32; - (len * blob_opts.age_cutoff) as usize - }; - linked_blob_files.drain(cutoff_point..); - - // NOTE: If there is any table not part of our compaction input - // that also points to the blob file, we cannot rewrite the blob file - for table in current_version.iter_segments() { - if payload.segment_ids.contains(&table.id()) { - continue; - } - - let other_ref = table - .get_linked_blob_files() - .expect("should not fail") - .unwrap_or_default(); - - let other_refs = other_ref - .into_iter() - .filter(|x| linked_blob_files.iter().any(|bf| bf.id() == x.blob_file_id)) - .collect::>(); - - for additional_ref in other_refs { - linked_blob_files.retain(|x| x.id() != additional_ref.blob_file_id); - } - } - - linked_blob_files.into_iter().cloned().collect::>() - }; + let blob_files_to_rewrite = + pick_blob_files_to_rewrite(&payload.segment_ids, current_version, blob_opts)?; if blob_files_to_rewrite.is_empty() { + log::debug!("No blob relocation needed"); + Box::new(StandardCompaction::new(table_writer, segments)) as Box } else { @@ -434,6 +457,8 @@ fn merge_segments( let mut super_version = opts.super_version.write().expect("lock is poisoned"); log::trace!("Acquired super version write lock"); + log::trace!("Blob fragmentation diff: {blob_frag_map:#?}"); + compactor .finish( &mut super_version,