4 changes: 4 additions & 0 deletions kvm-bindings/src/lib.rs
@@ -27,3 +27,7 @@ pub use self::arm64::*;
mod riscv64;
#[cfg(target_arch = "riscv64")]
pub use self::riscv64::*;

// Linux defines these with the _BITUL macro, which bindgen fails to
// translate, so we define them manually here.
pub const KVM_DIRTY_GFN_F_DIRTY: u32 = 0b1;
pub const KVM_DIRTY_GFN_F_RESET: u32 = 0b10;
18 changes: 18 additions & 0 deletions kvm-ioctls/CHANGELOG.md
@@ -2,6 +2,24 @@

## Upcoming Release

### Fixed

- Made `VmFd::enable_cap` available on all architectures.

### Added

- Added `KvmDirtyLogRing`, a structure that mmaps the dirty log ring and
  iterates over its dirty guest frame number entries.
- Added the `KVM_DIRTY_GFN_F_DIRTY` and `KVM_DIRTY_GFN_F_RESET` bitflags.
- Added a `dirty_log_ring` field to `VcpuFd` to access the per-vCPU dirty ring.
- Inserted Acquire/Release fences in `KvmDirtyLogRing`'s iterator `next` for
  architectures with weak memory ordering.
- Added the `DirtyLogRingInfo` struct and a `dirty_log_ring_info` field to
  `VmFd` to track the dirty ring configuration.
- Added an `enable_dirty_log_ring` function on `VmFd` to check the
  corresponding capabilities and enable KVM's dirty log ring.
- Added `VcpuFd::dirty_log_ring_iter()` to iterate over dirty guest frame numbers.
- Added `VmFd::reset_dirty_rings()` to reset all dirty rings for the VM.
- Exposed `KVM_CAP_DIRTY_LOG_RING` as `Cap::DirtyLogRing`.
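
A minimal sketch of how these additions compose, based on the doc examples and
tests in this PR (guest memory and register setup elided; a host kernel with
`KVM_CAP_DIRTY_LOG_RING` is assumed):

```rust
use kvm_ioctls::{Cap, Kvm};

fn main() {
    let kvm = Kvm::new().unwrap();
    if kvm.check_extension(Cap::DirtyLogRing) {
        let mut vm = kvm.create_vm().unwrap();
        // `None` mirrors the PR's tests and presumably selects a default ring
        // size; the return value indicates whether the legacy dirty bitmap
        // must still be consulted alongside the ring.
        let _need_bitmap = vm.enable_dirty_log_ring(None).unwrap();
        let mut vcpu = vm.create_vcpu(0).unwrap();
        // ... register memory slots with KVM_MEM_LOG_DIRTY_PAGES, run the guest ...
        if let Some(iter) = vcpu.dirty_log_ring_iter() {
            for (slot, offset) in iter {
                println!("slot {slot} dirtied page at offset {offset}");
            }
        }
        // Let the kernel reclaim the harvested entries.
        vm.reset_dirty_rings().unwrap();
    }
}
```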

## v0.24.0
119 changes: 118 additions & 1 deletion kvm-ioctls/src/ioctls/mod.rs
@@ -8,9 +8,11 @@
use std::mem::size_of;
use std::os::unix::io::AsRawFd;
use std::ptr::{NonNull, null_mut};
use std::sync::atomic::{Ordering, fence};

use kvm_bindings::{
KVM_COALESCED_MMIO_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_run,
KVM_COALESCED_MMIO_PAGE_OFFSET, KVM_DIRTY_GFN_F_DIRTY, KVM_DIRTY_GFN_F_RESET,
KVM_DIRTY_LOG_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_dirty_gfn, kvm_run,
};
use vmm_sys_util::errno;

@@ -29,6 +31,121 @@ pub mod vm;
/// is otherwise a direct mapping to Result.
pub type Result<T> = std::result::Result<T, errno::Error>;

/// A wrapper around the KVM dirty log ring page.
#[derive(Debug)]
pub(crate) struct KvmDirtyLogRing {
/// Next potentially dirty guest frame number slot index
next_dirty: u64,
/// Memory-mapped array of dirty guest frame number entries
gfns: NonNull<kvm_dirty_gfn>,
/// Ring size mask (size-1) for efficient modulo operations
mask: u64,
/// `true` if we need to use Acquire/Release memory ordering
use_acq_rel: bool,
}

impl KvmDirtyLogRing {
/// Maps the KVM dirty log ring from the vCPU file descriptor.
///
/// # Arguments
/// * `fd` - vCPU file descriptor to mmap from.
/// * `bytes` - Size of the mapped region in bytes.
/// * `use_acq_rel` - Whether Acquire/Release fences are needed when accessing the ring.
pub(crate) fn mmap_from_fd<F: AsRawFd>(
fd: &F,
bytes: usize,
use_acq_rel: bool,
) -> Result<Self> {
// SAFETY: We trust the sysconf libc function and we're calling it
// with a correct parameter.
let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } {
-1 => return Err(errno::Error::last()),
ps => ps as usize,
};

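// KVM exposes the dirty ring at a fixed page offset
// (KVM_DIRTY_LOG_PAGE_OFFSET) within the vCPU fd's mmap space.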
let offset = page_size * KVM_DIRTY_LOG_PAGE_OFFSET as usize;

if bytes % std::mem::size_of::<kvm_dirty_gfn>() != 0 {
// The size of the dirty ring in bytes must be a multiple of the slot size.
return Err(errno::Error::new(libc::EINVAL));
}
let slots = bytes / std::mem::size_of::<kvm_dirty_gfn>();
if !slots.is_power_of_two() {
// The number of slots must be a power of two.
return Err(errno::Error::new(libc::EINVAL));
}

// SAFETY: KVM guarantees that there is a page at offset
// KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE if the appropriate
// capability is available. If it is not, the call will simply
// fail.
let gfns = unsafe {
NonNull::<kvm_dirty_gfn>::new(libc::mmap(
null_mut(),
bytes,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_SHARED,
fd.as_raw_fd(),
offset as i64,
) as *mut kvm_dirty_gfn)
.filter(|addr| addr.as_ptr() != libc::MAP_FAILED as *mut kvm_dirty_gfn)
.ok_or_else(errno::Error::last)?
};
Ok(Self {
next_dirty: 0,
gfns,
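// `slots` is a power of two, so `index & mask` equals `index % slots`.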
mask: (slots - 1) as u64,
use_acq_rel,
})
}
}

impl Drop for KvmDirtyLogRing {
fn drop(&mut self) {
// SAFETY: This is safe because we mmap the page ourselves, and nobody
// else is holding a reference to it.
unsafe {
libc::munmap(
self.gfns.as_ptr().cast(),
(self.mask + 1) as usize * std::mem::size_of::<kvm_dirty_gfn>(),
);
}
}
}

impl Iterator for KvmDirtyLogRing {
type Item = (u32, u64);
fn next(&mut self) -> Option<Self::Item> {
let i = self.next_dirty & self.mask;
// SAFETY: i is not larger than mask, thus is a valid offset into self.gfns,
// therefore this operation produces a valid pointer to a kvm_dirty_gfn
let gfn_ptr = unsafe { self.gfns.add(i as usize).as_ptr() };

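// KVM publishes ring entries with Release semantics; on weakly
// ordered architectures (e.g. aarch64), fence with Acquire so the
// entry contents read below are not stale.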
if self.use_acq_rel {
fence(Ordering::Acquire);
}

// SAFETY: Can read a valid pointer to a kvm_dirty_gfn
let gfn = unsafe { gfn_ptr.read_volatile() };

if gfn.flags & KVM_DIRTY_GFN_F_DIRTY == 0 {
// Not dirty: leave `next_dirty` unchanged; this slot is where the next dirty entry will appear.
None
} else {
self.next_dirty += 1;
let mut updated_gfn = gfn;
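// Flip on the RESET flag to mark this entry as harvested; the kernel
// reclaims such entries on the next KVM_RESET_DIRTY_RINGS ioctl.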
updated_gfn.flags ^= KVM_DIRTY_GFN_F_RESET;
// SAFETY: Can write to a valid pointer to a kvm_dirty_gfn
unsafe {
gfn_ptr.write_volatile(updated_gfn);
};
if self.use_acq_rel {
fence(Ordering::Release);
}
Some((gfn.slot, gfn.offset))
}
}
}

/// A wrapper around the coalesced MMIO ring page.
#[derive(Debug)]
pub(crate) struct KvmCoalescedIoRing {
180 changes: 178 additions & 2 deletions kvm-ioctls/src/ioctls/vcpu.rs
@@ -16,7 +16,7 @@ use libc::EINVAL;
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};

use crate::ioctls::{KvmCoalescedIoRing, KvmRunWrapper, Result};
use crate::ioctls::{KvmCoalescedIoRing, KvmDirtyLogRing, KvmRunWrapper, Result};
use crate::kvm_ioctls::*;
use vmm_sys_util::errno;
use vmm_sys_util::ioctl::{ioctl, ioctl_with_mut_ref, ioctl_with_ref};
@@ -197,6 +197,9 @@ pub struct VcpuFd {
kvm_run_ptr: KvmRunWrapper,
/// A pointer to the coalesced MMIO page
coalesced_mmio_ring: Option<KvmCoalescedIoRing>,
/// A pointer to the dirty log ring
#[allow(unused)]
dirty_log_ring: Option<KvmDirtyLogRing>,
}

/// KVM Sync Registers used to tell KVM which registers to sync
@@ -2047,6 +2050,36 @@ impl VcpuFd {
}
}

/// Gets the dirty log ring iterator if one is mapped.
///
/// Returns an iterator over dirty guest frame numbers as (slot, offset) tuples.
/// Returns `None` if no dirty log ring has been mapped.
///
/// # Returns
///
/// An optional iterator over the dirty log ring entries.
///
/// # Example
///
/// ```no_run
/// # use kvm_ioctls::Kvm;
/// # use kvm_ioctls::Cap;
/// let kvm = Kvm::new().unwrap();
/// if kvm.check_extension(Cap::DirtyLogRing) {
///     let mut vm = kvm.create_vm().unwrap();
///     vm.enable_dirty_log_ring(None).unwrap();
///     let mut vcpu = vm.create_vcpu(0).unwrap();
///     if let Some(iter) = vcpu.dirty_log_ring_iter() {
///         for (slot, offset) in iter {
///             println!("Dirty page in slot {} at offset {}", slot, offset);
///         }
///     }
/// }
/// ```
pub fn dirty_log_ring_iter(&mut self) -> Option<impl Iterator<Item = (u32, u64)>> {
self.dirty_log_ring.as_mut()
}

/// Maps the coalesced MMIO ring page. This allows reading entries from
/// the ring via [`coalesced_mmio_read()`](VcpuFd::coalesced_mmio_read).
///
@@ -2102,11 +2135,16 @@ impl VcpuFd {
/// This should not be exported as a public function because the preferred way is to use
/// `create_vcpu` from `VmFd`. The function cannot be part of the `VcpuFd` implementation because
/// then it would be exported with the public `VcpuFd` interface.
pub fn new_vcpu(vcpu: File, kvm_run_ptr: KvmRunWrapper) -> VcpuFd {
pub fn new_vcpu(
vcpu: File,
kvm_run_ptr: KvmRunWrapper,
dirty_log_ring: Option<KvmDirtyLogRing>,
) -> VcpuFd {
VcpuFd {
vcpu,
kvm_run_ptr,
coalesced_mmio_ring: None,
dirty_log_ring,
}
}

@@ -2777,6 +2815,144 @@ mod tests {
}
}

#[cfg(target_arch = "x86_64")]
#[test]
fn test_run_code_dirty_log_ring() {
use std::io::Write;

let kvm = Kvm::new().unwrap();
let mut vm = kvm.create_vm().unwrap();

// Enable dirty log ring
let need_bitmap = vm.enable_dirty_log_ring(None).unwrap();

// This example is based on https://lwn.net/Articles/658511/
#[rustfmt::skip]
let code = [
0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
0x00, 0xd8, /* add %bl, %al */
0x04, b'0', /* add $'0', %al */
0xee, /* out %al, %dx */
0xec, /* in %dx, %al */
0xc6, 0x06, 0x00, 0x80, 0x00, /* movb $0, (0x8000); This generates a MMIO write. */
0x8a, 0x16, 0x00, 0x80, /* movb (0x8000), %dl; This generates a MMIO read. */
0xc6, 0x06, 0x00, 0x20, 0x00, /* movb $0, (0x2000); Dirty one page in guest mem. */
0xf4, /* hlt */
];
let expected_rips: [u64; 3] = [0x1003, 0x1005, 0x1007];

let mem_size = 0x4000;
let load_addr = mmap_anonymous(mem_size).as_ptr();
let guest_addr: u64 = 0x1000;
let slot: u32 = 0;
let mem_region = kvm_userspace_memory_region {
slot,
guest_phys_addr: guest_addr,
memory_size: mem_size as u64,
userspace_addr: load_addr as u64,
flags: KVM_MEM_LOG_DIRTY_PAGES,
};
unsafe {
vm.set_user_memory_region(mem_region).unwrap();
}

unsafe {
// Get a mutable slice of `mem_size` from `load_addr`.
// This is safe because we mapped it before.
let mut slice = std::slice::from_raw_parts_mut(load_addr, mem_size);
slice.write_all(&code).unwrap();
}

let mut vcpu_fd = vm.create_vcpu(0).unwrap();

let mut vcpu_sregs = vcpu_fd.get_sregs().unwrap();
assert_ne!(vcpu_sregs.cs.base, 0);
assert_ne!(vcpu_sregs.cs.selector, 0);
vcpu_sregs.cs.base = 0;
vcpu_sregs.cs.selector = 0;
vcpu_fd.set_sregs(&vcpu_sregs).unwrap();

let mut vcpu_regs = vcpu_fd.get_regs().unwrap();
// Set the Instruction Pointer to the guest address where we loaded the code.
vcpu_regs.rip = guest_addr;
vcpu_regs.rax = 2;
vcpu_regs.rbx = 3;
vcpu_regs.rflags = 2;
vcpu_fd.set_regs(&vcpu_regs).unwrap();

let mut debug_struct = kvm_guest_debug {
control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
pad: 0,
arch: kvm_guest_debug_arch {
debugreg: [0, 0, 0, 0, 0, 0, 0, 0],
},
};
vcpu_fd.set_guest_debug(&debug_struct).unwrap();

let mut instr_idx = 0;
loop {
match vcpu_fd.run().expect("run failed") {
VcpuExit::IoIn(addr, data) => {
assert_eq!(addr, 0x3f8);
assert_eq!(data.len(), 1);
}
VcpuExit::IoOut(addr, data) => {
assert_eq!(addr, 0x3f8);
assert_eq!(data.len(), 1);
assert_eq!(data[0], b'5');
}
VcpuExit::MmioRead(addr, data) => {
assert_eq!(addr, 0x8000);
assert_eq!(data.len(), 1);
}
VcpuExit::MmioWrite(addr, data) => {
assert_eq!(addr, 0x8000);
assert_eq!(data.len(), 1);
assert_eq!(data[0], 0);
}
VcpuExit::Debug(debug) => {
if instr_idx == expected_rips.len() - 1 {
// Disabling debugging/single-stepping
debug_struct.control = 0;
vcpu_fd.set_guest_debug(&debug_struct).unwrap();
} else if instr_idx >= expected_rips.len() {
unreachable!();
}
let vcpu_regs = vcpu_fd.get_regs().unwrap();
assert_eq!(vcpu_regs.rip, expected_rips[instr_idx]);
assert_eq!(debug.exception, 1);
assert_eq!(debug.pc, expected_rips[instr_idx]);
// Check the low 16 bits of DR6
let mask = (1 << 16) - 1;
assert_eq!(debug.dr6 & mask, 0b100111111110000);
// Bit 10 in DR7 is always 1
assert_eq!(debug.dr7, 1 << 10);
instr_idx += 1;
}
VcpuExit::Hlt => {
// The code snippet dirties 2 pages:
// * one containing the code itself, written when we loaded it into
//   guest memory;
// * and one more from the `movb` that writes to address 0x2000.

let dirty_pages: u32 =
u32::try_from(vcpu_fd.dirty_log_ring_iter().unwrap().count()).unwrap()
+ if need_bitmap {
let dirty_pages_bitmap = vm.get_dirty_log(slot, mem_size).unwrap();
dirty_pages_bitmap
.into_iter()
.map(|page| page.count_ones())
.sum()
} else {
0
};
assert_eq!(dirty_pages, 2);
break;
}
r => panic!("unexpected exit reason: {:?}", r),
}
}
}

#[test]
#[cfg(target_arch = "aarch64")]
fn test_get_preferred_target() {