perf: small performance improvements

explodingcamera · explodingcamera · commit f6dba823b785 · 2024-02-01T01:36:12.000+01:00
Signed-off-by: Henry Gressmann <mail@henrygressmann.de>
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
@@ -33,7 +33,7 @@ All runtimes are compiled with the following settings:
 | `fib`        | 6ns    | 44.76µs  | 48.96µs  | 52µs                 |
 | `fib-rec`    | 284ns  | 25.565ms | 5.11ms   | 0.50ms               |
 | `argon2id`   | 0.52ms | 110.08ms | 44.408ms | 4.76ms               |
-| `selfhosted` | 45µs   | 2.18ms   | 4.25ms   | 258.87ms             |
+| `selfhosted` | 45µs   | 2.08ms   | 4.25ms   | 258.87ms             |
 
 ### Fib
 
@@ -49,7 +49,7 @@ TinyWasm is a lot slower here, but that's because there's currently no way to re
 
 This benchmark runs the Argon2id hashing algorithm, with 2 iterations, 1KB of memory, and 1 parallel lane.
 I had to decrease the memory usage from the default to 1KB, because especially the interpreters were struggling to finish in a reasonable amount of time.
-This is where `simd` instructions would be really useful, and it also highlights some of the issues with the current implementation of TinyWasm's Value Stack and Memory Instances.
+This is where `simd` instructions would be really useful, and it also highlights some of the issues with the current implementation of TinyWasm's Value Stack and Memory Instances. These spend a lot of time on `Vec` operations, so they might be a good place to start experimenting with Arena Allocation.
 
 ### Selfhosted
 
@@ -62,6 +62,8 @@ Wasmer also offers a pre-parsed module format, so keep in mind that this number
 
 After profiling and fixing some low-hanging fruit, I found the biggest bottleneck to be Vector operations, especially for the Value Stack, and having shared access to Memory Instances using RefCell. These are the two areas I will be focusing on improving in the future, trying out Arena Allocation and other data structures to improve performance. Additionally, typed FuncHandles have a significant overhead over the untyped ones, so I will be looking into improving that as well. Still, I'm quite happy with the results, especially considering the use of standard Rust data structures.
 
+Something that made a much bigger difference than I expected was to give compiler hints about cold paths, and to force inlining of some functions. This made the benchmarks 30%+ faster in some cases. A lot of places in the codebase have comments about what optimizations have been done.
+
 # Running benchmarks
 
 Benchmarks are run using [Criterion.rs](https://github.com/bheisler/criterion.rs). To run a benchmark, use the following command:
diff --git a/crates/tinywasm/src/instance.rs b/crates/tinywasm/src/instance.rs
@@ -36,15 +36,18 @@ pub(crate) struct ModuleInstanceInner {
 
 impl ModuleInstance {
     // drop the module instance reference and swap it with another one
+    #[inline]
     pub(crate) fn swap(&mut self, other: Self) {
         self.0 = other.0;
     }
 
+    #[inline]
     pub(crate) fn swap_with(&mut self, other_addr: ModuleInstanceAddr, store: &mut Store) {
         self.swap(store.get_module_instance_raw(other_addr))
     }
 
     /// Get the module instance's address
+    #[inline]
     pub fn id(&self) -> ModuleInstanceAddr {
         self.0.idx
     }
@@ -118,44 +121,53 @@ impl ModuleInstance {
         Some(ExternVal::new(kind, *addr))
     }
 
-    pub(crate) fn func_addrs(&self) -> &[FuncAddr] {
-        &self.0.func_addrs
-    }
-
+    #[inline]
     pub(crate) fn new(inner: ModuleInstanceInner) -> Self {
         Self(Rc::new(inner))
     }
 
+    #[inline]
     pub(crate) fn func_ty(&self, addr: FuncAddr) -> &FuncType {
         self.0.types.get(addr as usize).expect("No func type for func, this is a bug")
     }
 
+    #[inline]
+    pub(crate) fn func_addrs(&self) -> &[FuncAddr] {
+        &self.0.func_addrs
+    }
+
     // resolve a function address to the global store address
+    #[inline]
     pub(crate) fn resolve_func_addr(&self, addr: FuncAddr) -> FuncAddr {
         *self.0.func_addrs.get(addr as usize).expect("No func addr for func, this is a bug")
     }
 
     // resolve a table address to the global store address
+    #[inline]
     pub(crate) fn resolve_table_addr(&self, addr: TableAddr) -> TableAddr {
         *self.0.table_addrs.get(addr as usize).expect("No table addr for table, this is a bug")
     }
 
     // resolve a memory address to the global store address
+    #[inline]
     pub(crate) fn resolve_mem_addr(&self, addr: MemAddr) -> MemAddr {
         *self.0.mem_addrs.get(addr as usize).expect("No mem addr for mem, this is a bug")
     }
 
     // resolve a data address to the global store address
+    #[inline]
     pub(crate) fn resolve_data_addr(&self, addr: DataAddr) -> MemAddr {
         *self.0.data_addrs.get(addr as usize).expect("No data addr for data, this is a bug")
     }
 
     // resolve an element address to the global store address
+    #[inline]
     pub(crate) fn resolve_elem_addr(&self, addr: ElemAddr) -> ElemAddr {
         *self.0.elem_addrs.get(addr as usize).expect("No elem addr for elem, this is a bug")
     }
 
     // resolve a global address to the global store address
+    #[inline]
     pub(crate) fn resolve_global_addr(&self, addr: GlobalAddr) -> GlobalAddr {
         self.0.global_addrs[addr as usize]
     }
diff --git a/crates/tinywasm/src/runtime/interpreter/macros.rs b/crates/tinywasm/src/runtime/interpreter/macros.rs
@@ -28,13 +28,13 @@ macro_rules! mem_load {
     }};
 
     ($load_type:ty, $target_type:ty, $arg:ident, $stack:ident, $store:ident, $module:ident) => {{
-        // TODO: there could be a lot of performance improvements here
         let mem_idx = $module.resolve_mem_addr($arg.mem_addr);
         let mem = $store.get_mem(mem_idx as usize)?;
         let mem_ref = mem.borrow_mut();
 
         let addr = $stack.values.pop()?.raw_value();
         let addr = $arg.offset.checked_add(addr).ok_or_else(|| {
+            cold();
             Error::Trap(crate::Trap::MemoryOutOfBounds {
                 offset: $arg.offset as usize,
                 len: core::mem::size_of::<$load_type>(),
@@ -43,6 +43,7 @@ macro_rules! mem_load {
         })?;
 
         let addr: usize = addr.try_into().ok().ok_or_else(|| {
+            cold();
             Error::Trap(crate::Trap::MemoryOutOfBounds {
                 offset: $arg.offset as usize,
                 len: core::mem::size_of::<$load_type>(),
diff --git a/crates/tinywasm/src/runtime/interpreter/mod.rs b/crates/tinywasm/src/runtime/interpreter/mod.rs
@@ -29,28 +29,31 @@ impl InterpreterRuntime {
         let mut current_module = store.get_module_instance_raw(cf.func_instance.1);
 
         loop {
-            match exec_one(&mut cf, stack, store, &current_module)? {
+            match exec_one(&mut cf, stack, store, &current_module) {
                 // Continue execution at the new top of the call stack
-                ExecResult::Call => {
+                Ok(ExecResult::Call) => {
                     cf = stack.call_stack.pop()?;
+
+                    // keeping the pointer separate from the call frame is about 2% faster
+                    // than storing it in the call frame
                     if cf.func_instance.1 != current_module.id() {
                         current_module.swap_with(cf.func_instance.1, store);
                     }
                 }
 
                 // return from the function
-                ExecResult::Return => return Ok(()),
+                Ok(ExecResult::Return) => return Ok(()),
 
                 // continue to the next instruction and increment the instruction pointer
-                ExecResult::Ok => cf.instr_ptr += 1,
+                Ok(ExecResult::Ok) => cf.instr_ptr += 1,
 
                 // trap the program
-                ExecResult::Trap(trap) => {
+                Err(error) => {
                     cf.instr_ptr += 1;
                     // push the call frame back onto the stack so that it can be resumed
                     // if the trap can be handled
                     stack.call_stack.push(cf)?;
-                    return Err(Error::Trap(trap));
+                    return Err(error);
                 }
             }
         }
@@ -61,13 +64,14 @@ enum ExecResult {
     Ok,
     Return,
     Call,
-    Trap(crate::Trap),
 }
 
 /// Run a single step of the interpreter
 /// A separate function is used so that, later, we can more easily implement
 /// a step-by-step debugger (using generators once they're stable?)
-#[inline(always)] // this improves performance by more than 20% in some cases
+// we want this to always be part of the loop; Rust just doesn't inline it on its own as it's too big
+// this can be a 30%+ performance difference in some cases
+#[inline(always)]
 fn exec_one(cf: &mut CallFrame, stack: &mut Stack, store: &mut Store, module: &ModuleInstance) -> Result<ExecResult> {
     let instrs = &cf.func_instance.0.instructions;
     if unlikely(cf.instr_ptr >= instrs.len() || instrs.is_empty()) {
@@ -84,7 +88,7 @@ fn exec_one(cf: &mut CallFrame, stack: &mut Stack, store: &mut Store, module: &M
         Nop => { /* do nothing */ }
         Unreachable => {
             cold();
-            return Ok(ExecResult::Trap(crate::Trap::Unreachable));
+            return Err(crate::Trap::Unreachable.into());
         } // we don't need to include the call frame here because it's already on the stack
         Drop => stack.values.pop().map(|_| ())?,
 
diff --git a/crates/tinywasm/src/store/memory.rs b/crates/tinywasm/src/store/memory.rs
@@ -100,6 +100,7 @@ impl MemoryInstance {
         Ok(val)
     }
 
+    #[inline]
     pub(crate) fn page_count(&self) -> usize {
         self.page_count
     }
@@ -186,10 +187,12 @@ macro_rules! impl_mem_loadable_for_primitive {
         $(
             #[allow(unsafe_code)]
             unsafe impl MemLoadable<$size> for $type {
+                #[inline]
                 fn from_le_bytes(bytes: [u8; $size]) -> Self {
                     <$type>::from_le_bytes(bytes)
                 }
 
+                #[inline]
                 fn from_be_bytes(bytes: [u8; $size]) -> Self {
                     <$type>::from_be_bytes(bytes)
                 }
diff --git a/crates/tinywasm/src/store/mod.rs b/crates/tinywasm/src/store/mod.rs
@@ -116,6 +116,72 @@ impl Store {
         Ok(())
     }
 
+    #[cold]
+    fn not_found_error(name: &str) -> Error {
+        Error::Other(format!("{} not found", name))
+    }
+
+    /// Get the function at the actual index in the store
+    #[inline]
+    pub(crate) fn get_func(&self, addr: usize) -> Result<&FunctionInstance> {
+        self.data.funcs.get(addr).ok_or_else(|| Self::not_found_error("function"))
+    }
+
+    /// Get the memory at the actual index in the store
+    #[inline]
+    pub(crate) fn get_mem(&self, addr: usize) -> Result<&Rc<RefCell<MemoryInstance>>> {
+        self.data.memories.get(addr).ok_or_else(|| Self::not_found_error("memory"))
+    }
+
+    /// Get the table at the actual index in the store
+    #[inline]
+    pub(crate) fn get_table(&self, addr: usize) -> Result<&Rc<RefCell<TableInstance>>> {
+        self.data.tables.get(addr).ok_or_else(|| Self::not_found_error("table"))
+    }
+
+    /// Get the data at the actual index in the store
+    #[inline]
+    pub(crate) fn get_data(&self, addr: usize) -> Result<&DataInstance> {
+        self.data.datas.get(addr).ok_or_else(|| Self::not_found_error("data"))
+    }
+
+    /// Get a mutable reference to the data at the actual index in the store
+    #[inline]
+    pub(crate) fn get_data_mut(&mut self, addr: usize) -> Result<&mut DataInstance> {
+        self.data.datas.get_mut(addr).ok_or_else(|| Self::not_found_error("data"))
+    }
+
+    /// Get the element at the actual index in the store
+    #[inline]
+    pub(crate) fn get_elem(&self, addr: usize) -> Result<&ElementInstance> {
+        self.data.elements.get(addr).ok_or_else(|| Self::not_found_error("element"))
+    }
+
+    /// Get the global at the actual index in the store
+    #[inline]
+    pub(crate) fn get_global(&self, addr: usize) -> Result<&Rc<RefCell<GlobalInstance>>> {
+        self.data.globals.get(addr).ok_or_else(|| Self::not_found_error("global"))
+    }
+
+    /// Get the value of the global at the actual index in the store
+    #[inline]
+    pub fn get_global_val(&self, addr: usize) -> Result<RawWasmValue> {
+        self.data.globals.get(addr).ok_or_else(|| Self::not_found_error("global")).map(|global| global.borrow().value)
+    }
+
+    /// Set the global at the actual index in the store
+    #[inline]
+    pub(crate) fn set_global_val(&mut self, addr: usize, value: RawWasmValue) -> Result<()> {
+        self.data
+            .globals
+            .get(addr)
+            .ok_or_else(|| Self::not_found_error("global"))
+            .map(|global| global.borrow_mut().value = value)
+    }
+}
+
+// Linking related functions
+impl Store {
     /// Add functions to the store, returning their addresses in the store
     pub(crate) fn init_funcs(
         &mut self,
@@ -391,58 +457,4 @@ impl Store {
         };
         Ok(val)
     }
-
-    #[cold]
-    fn not_found_error(name: &str) -> Error {
-        Error::Other(format!("{} not found", name))
-    }
-
-    /// Get the function at the actual index in the store
-    pub(crate) fn get_func(&self, addr: usize) -> Result<&FunctionInstance> {
-        self.data.funcs.get(addr).ok_or_else(|| Self::not_found_error("function"))
-    }
-
-    /// Get the memory at the actual index in the store
-    pub(crate) fn get_mem(&self, addr: usize) -> Result<&Rc<RefCell<MemoryInstance>>> {
-        self.data.memories.get(addr).ok_or_else(|| Self::not_found_error("memory"))
-    }
-
-    /// Get the table at the actual index in the store
-    pub(crate) fn get_table(&self, addr: usize) -> Result<&Rc<RefCell<TableInstance>>> {
-        self.data.tables.get(addr).ok_or_else(|| Self::not_found_error("table"))
-    }
-
-    /// Get the data at the actual index in the store
-    pub(crate) fn get_data(&self, addr: usize) -> Result<&DataInstance> {
-        self.data.datas.get(addr).ok_or_else(|| Self::not_found_error("data"))
-    }
-
-    /// Get the data at the actual index in the store
-    pub(crate) fn get_data_mut(&mut self, addr: usize) -> Result<&mut DataInstance> {
-        self.data.datas.get_mut(addr).ok_or_else(|| Self::not_found_error("data"))
-    }
-
-    /// Get the element at the actual index in the store
-    pub(crate) fn get_elem(&self, addr: usize) -> Result<&ElementInstance> {
-        self.data.elements.get(addr).ok_or_else(|| Self::not_found_error("element"))
-    }
-
-    /// Get the global at the actual index in the store
-    pub(crate) fn get_global(&self, addr: usize) -> Result<&Rc<RefCell<GlobalInstance>>> {
-        self.data.globals.get(addr).ok_or_else(|| Self::not_found_error("global"))
-    }
-
-    /// Get the global at the actual index in the store
-    pub fn get_global_val(&self, addr: usize) -> Result<RawWasmValue> {
-        self.data.globals.get(addr).ok_or_else(|| Self::not_found_error("global")).map(|global| global.borrow().value)
-    }
-
-    /// Set the global at the actual index in the store
-    pub(crate) fn set_global_val(&mut self, addr: usize, value: RawWasmValue) -> Result<()> {
-        self.data
-            .globals
-            .get(addr)
-            .ok_or_else(|| Self::not_found_error("global"))
-            .map(|global| global.borrow_mut().value = value)
-    }
 }

Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,7 @@ impl MemoryInstance {`
`100`	`100`	`Ok(val)`
`101`	`101`	`}`
`102`	`102`
	`103`	`+ #[inline]`
`103`	`104`	`pub(crate) fn page_count(&self) -> usize {`
`104`	`105`	`self.page_count`
`105`	`106`	`}`
`@@ -186,10 +187,12 @@ macro_rules! impl_mem_loadable_for_primitive {`
`186`	`187`	`$(`
`187`	`188`	`#[allow(unsafe_code)]`
`188`	`189`	`unsafe impl MemLoadable<$size> for $type {`
	`190`	`+ #[inline]`
`189`	`191`	`fn from_le_bytes(bytes: [u8; $size]) -> Self {`
`190`	`192`	`<$type>::from_le_bytes(bytes)`
`191`	`193`	`}`
`192`	`194`
	`195`	`+ #[inline]`
`193`	`196`	`fn from_be_bytes(bytes: [u8; $size]) -> Self {`
`194`	`197`	`<$type>::from_be_bytes(bytes)`
`195`	`198`	`}`