92 changes: 74 additions & 18 deletions analyzeme/src/stringtable.rs
@@ -12,13 +12,9 @@ use std::borrow::Cow;
use std::error::Error;
use memchr::memchr;

// See module-level documentation for more information on the encoding.
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;

fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
(
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
StringId::new(LittleEndian::read_u32(&bytes[0..4])),
Addr(LittleEndian::read_u32(&bytes[4..8])),
)
}
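
The deserializer above fixes the on-disk index entry layout: eight bytes per entry, a little-endian u32 `StringId` followed by a little-endian u32 `Addr`. A minimal sketch of the inverse, for illustration only (the real writer lives on the measureme side and may look different):

```rust
use byteorder::{ByteOrder, LittleEndian};

// Illustrative inverse of `deserialize_index_entry`: write one 8-byte index entry.
fn serialize_index_entry(string_id: u32, addr: u32, out: &mut [u8; 8]) {
    LittleEndian::write_u32(&mut out[0..4], string_id); // StringId, little-endian
    LittleEndian::write_u32(&mut out[4..8], addr);      // Addr, little-endian
}
```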
@@ -29,12 +25,29 @@ pub struct StringRef<'st> {
table: &'st StringTable,
}

// This is the text we emit when encountering a virtual string ID that cannot
// be resolved.
const UNKNOWN_STRING: &str = "<unknown>";

impl<'st> StringRef<'st> {

/// Expands the StringRef into an actual string. This method will
/// avoid allocating a `String` if it can instead return a `&str` pointing
/// into the raw string table data.
pub fn to_string(&self) -> Cow<'st, str> {

// Try to avoid the allocation, which we can do if this is a
// [value, 0xFF] entry.
let addr = self.table.index[&self.id];
let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
return Cow::from(UNKNOWN_STRING)
}
};

// Try to avoid the allocation, which we can do if this is
//
// - a string with a single value component (`[value, 0xFF]`) or
// - a string with a single reference component (`[string_id, 0xFF]`)

let pos = addr.as_usize();
let slice_to_search = &self.table.string_data[pos..];

@@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
// is super fast.
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();

// Check if this is a string containing a single StringId component
let first_byte = self.table.string_data[pos];
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]);
return StringRef {
id,
table: self.table,
}.to_string();
}

// Decode the bytes until the terminator. If there is a string id in
// between somewhere this will fail, and we fall back to the allocating
// path.
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
Cow::from(s)
} else {
// This is the slow path where we actually allocate a `String` on
// the heap and expand into that. If you suspect that there is a
// bug in the fast path above, you can easily check if always taking
// the slow path fixes the issue.
let mut output = String::new();
self.write_to_string(&mut output);
Cow::from(output)
}
}

pub fn write_to_string(&self, output: &mut String) {
let addr = self.table.index[&self.id];

let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
output.push_str(UNKNOWN_STRING);
return
}
};

let mut pos = addr.as_usize();

loop {
let byte = self.table.string_data[pos];

if byte == TERMINATOR {
return;
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
// This is a string-id
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);

// Mask off the `0b10` prefix
let id = id & STRING_ID_MASK;

} else if is_utf8_continuation_byte(byte) {
let string_ref = StringRef {
id: StringId::reserved(id),
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
table: self.table,
};

@@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
}
}
}

fn get_addr(&self) -> Result<Addr, ()> {
if self.id.is_virtual() {
match self.table.index.get(&self.id) {
Some(&addr) => Ok(addr),
None => Err(()),
}
} else {
Ok(self.id.to_addr())
}
}
}

fn is_utf8_continuation_byte(byte: u8) -> bool {
// See module-level documentation for more information on the encoding.
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
(byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
}

// String IDs in the table data are encoded in big endian format, while string
// IDs in the index are encoded in little endian format. Don't mix the two up.
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
let id = BigEndian::read_u32(&bytes[0..4]);
// Mask off the `0b10` prefix
StringId::new(id & STRING_ID_MASK)
}
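
To make the tagging scheme concrete: a string-id component stored in the string table data is a 4-byte big-endian u32 whose top two bits are `0b10`, so its first byte matches the UTF-8 continuation-byte pattern tested above and can never be mistaken for the start of literal UTF-8 text. A small round-trip sketch, assuming `STRING_ID_MASK` clears exactly those two tag bits (`0x3FFF_FFFF`); the `REF_TAG` name is made up for this example:

```rust
use byteorder::{BigEndian, ByteOrder};

const STRING_ID_MASK: u32 = 0x3FFF_FFFF; // assumed value: clears the two tag bits
const REF_TAG: u32 = 0b10 << 30;         // hypothetical name for the `0b10` prefix

// Write a tagged string-id component into the string data stream.
fn encode_string_id_into_data(id: u32, out: &mut [u8]) {
    // Big-endian, so the tagged high byte is the first byte in the stream.
    BigEndian::write_u32(&mut out[0..4], id | REF_TAG);
}

fn main() {
    let mut buf = [0u8; 4];
    encode_string_id_into_data(0x1234, &mut buf);
    // The first byte looks like a UTF-8 continuation byte (0b10xx_xxxx)...
    assert_eq!(buf[0] & 0b1100_0000, 0b1000_0000);
    // ...and masking off the tag recovers the original id.
    assert_eq!(BigEndian::read_u32(&buf) & STRING_ID_MASK, 0x1234);
}
```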

// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
@@ -181,7 +237,7 @@ impl StringTable {
}

pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
let id = StringId::reserved(METADATA_STRING_ID);
let id = StringId::new(METADATA_STRING_ID);
self.get(id)
}
}
9 changes: 6 additions & 3 deletions analyzeme/src/testing_common.rs
@@ -26,14 +26,14 @@ fn generate_profiling_data<S: SerializationSink>(
) -> Vec<Event<'static>> {
let profiler = Arc::new(Profiler::<S>::new(Path::new(filestem)).unwrap());

let event_id_reserved = StringId::reserved(42);
let event_id_virtual = StringId::new_virtual(42);

let event_ids = vec![
(
profiler.alloc_string("Generic"),
profiler.alloc_string("SomeGenericActivity"),
),
(profiler.alloc_string("Query"), event_id_reserved),
(profiler.alloc_string("Query"), event_id_virtual),
];

// This and event_ids have to match!
Expand Down Expand Up @@ -73,7 +73,10 @@ fn generate_profiling_data<S: SerializationSink>(

// An example of allocating the string contents of an event id that has
// already been used
profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery");
profiler.map_virtual_to_concrete_string(
event_id_virtual,
profiler.alloc_string("SomeQuery")
);

expected_events
}
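
Taken together, the updated test shows the virtual-ID workflow end to end: a `StringId` created with `new_virtual` can be used as an event id right away, and its text is supplied later by mapping it to a concrete string. A condensed sketch of just that flow, assuming `FileSerializationSink` as the sink type (the excerpt above does not show which sink the test uses):

```rust
use measureme::{FileSerializationSink, Profiler, StringId};
use std::path::Path;
use std::sync::Arc;

fn virtual_id_flow() {
    let profiler =
        Arc::new(Profiler::<FileSerializationSink>::new(Path::new("my_trace")).unwrap());

    // Reserve a virtual id up front; events may reference it immediately.
    let event_id_virtual = StringId::new_virtual(42);

    // ... record events that use `event_id_virtual` as their event id ...

    // Later, once the real name is known, bind the virtual id to a concrete string.
    profiler.map_virtual_to_concrete_string(
        event_id_virtual,
        profiler.alloc_string("SomeQuery"),
    );
}
```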
2 changes: 1 addition & 1 deletion measureme/src/file_header.rs
@@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
use byteorder::{ByteOrder, LittleEndian};
use std::error::Error;

pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4;
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";
20 changes: 15 additions & 5 deletions measureme/src/profiler.rs
@@ -68,12 +68,21 @@ impl<S: SerializationSink> Profiler<S> {
}

#[inline(always)]
pub fn alloc_string_with_reserved_id<STR: SerializableString + ?Sized>(
pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
self.string_table
.map_virtual_to_concrete_string(virtual_id, concrete_id);
}

#[inline(always)]
pub fn bulk_map_virtual_to_single_concrete_string<I>(
&self,
id: StringId,
s: &STR,
) -> StringId {
self.string_table.alloc_with_reserved_id(id, s)
virtual_ids: I,
concrete_id: StringId,
) where
I: Iterator<Item = StringId> + ExactSizeIterator,
{
self.string_table
.bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id);
}
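
One way the bulk variant could be used, going only by the signature above: point a whole range of virtual ids at a single concrete string in one call; the iterator just has to yield `StringId`s and report an exact length. The helper below is hypothetical:

```rust
use measureme::{Profiler, SerializationSink, StringId};

// Hypothetical helper: map virtual ids 0..n to one placeholder string.
fn map_remaining_ids_to_placeholder<S: SerializationSink>(profiler: &Profiler<S>, n: u32) {
    let placeholder = profiler.alloc_string("<unknown event>");
    // `Range<u32>` (and `map` over it) implements both `Iterator` and `ExactSizeIterator`.
    profiler.bulk_map_virtual_to_single_concrete_string(
        (0..n).map(StringId::new_virtual),
        placeholder,
    );
}
```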

#[inline(always)]
@@ -92,6 +101,7 @@ impl<S: SerializationSink> Profiler<S> {

/// Creates a "start" event and returns a `TimingGuard` that will create
/// the corresponding "end" event when it is dropped.
#[inline]
pub fn start_recording_interval_event<'a>(
&'a self,
event_kind: StringId,