Cargo.lock (2 additions, 0 deletions)
@@ -4819,6 +4819,7 @@ dependencies = [
"inspect",
"mesh",
"open_enum",
+ "static_assertions",
"storage_string",
"zerocopy 0.8.25",
]
@@ -4998,6 +4999,7 @@ dependencies = [
"memory_range",
"mesh",
"page_pool_alloc",
+ "tracing",
"user_driver",
"virt",
"vmcore",
openhcl/openhcl_dma_manager/Cargo.toml (1 addition, 0 deletions)
@@ -21,6 +21,7 @@ virt.workspace = true
vmcore.workspace = true

anyhow.workspace = true
+ tracing.workspace = true

[lints]
workspace = true
openhcl/openhcl_dma_manager/src/lib.rs (31 additions, 3 deletions)
@@ -286,12 +286,20 @@ impl DmaManagerInner {
allocation_visibility: AllocationVisibility::Private,
persistent_allocations: false,
shared_spawner: _,
- private_spawner: _,
+ private_spawner,
} => match lower_vtl_policy {
LowerVtlPermissionPolicy::Any => {
// No persistence needed means the `LockedMemorySpawner`
// using normal VTL2 ram is fine.
- DmaClientBacking::LockedMemory(LockedMemorySpawner)
+ match private_spawner {
+ Some(private) => DmaClientBacking::PrivatePoolWithFallback((
+ private
+ .allocator(device_name.into())
+ .context("failed to create private allocator")?,
+ LockedMemorySpawner,
+ )),
+ None => DmaClientBacking::LockedMemory(LockedMemorySpawner),
+ }
}
LowerVtlPermissionPolicy::Vtl0 => {
// `LockedMemorySpawner` uses private VTL2 ram, so
@@ -416,6 +424,7 @@ enum DmaClientBacking {
SharedPool(#[inspect(skip)] PagePoolAllocator),
PrivatePool(#[inspect(skip)] PagePoolAllocator),
LockedMemory(#[inspect(skip)] LockedMemorySpawner),
+ PrivatePoolWithFallback(#[inspect(skip)] (PagePoolAllocator, LockedMemorySpawner)),
PrivatePoolLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<PagePoolAllocator>),
LockedMemoryLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<LockedMemorySpawner>),
}
@@ -429,6 +438,16 @@ impl DmaClientBacking {
DmaClientBacking::SharedPool(allocator) => allocator.allocate_dma_buffer(total_size),
DmaClientBacking::PrivatePool(allocator) => allocator.allocate_dma_buffer(total_size),
DmaClientBacking::LockedMemory(spawner) => spawner.allocate_dma_buffer(total_size),
+ DmaClientBacking::PrivatePoolWithFallback((allocator, spawner)) => {
+ allocator.allocate_dma_buffer(total_size).or_else(|err| {
+ tracing::warn!(
+ size = total_size,
+ error = ?err,
+ "falling back to locked memory for dma allocation"
+ );
+ spawner.allocate_dma_buffer(total_size)
+ })
+ }
DmaClientBacking::PrivatePoolLowerVtl(spawner) => {
spawner.allocate_dma_buffer(total_size)
}
@@ -442,7 +461,16 @@
match self {
DmaClientBacking::SharedPool(allocator) => allocator.attach_pending_buffers(),
DmaClientBacking::PrivatePool(allocator) => allocator.attach_pending_buffers(),
- DmaClientBacking::LockedMemory(spawner) => spawner.attach_pending_buffers(),
+ DmaClientBacking::PrivatePoolWithFallback(_) => {
+ anyhow::bail!("cannot attach pending buffers with fallback allocator")
+ }
+ DmaClientBacking::LockedMemory(_) => {
+ anyhow::bail!(
+ "attaching pending buffers is not supported with locked memory; \
+ this client type does not maintain a pool of pending allocations. \
+ To use attach_pending_buffers, create a client backed by a shared or private pool."
+ )
+ }
DmaClientBacking::PrivatePoolLowerVtl(spawner) => spawner.attach_pending_buffers(),
DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(),
}
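
The PrivatePoolWithFallback arm above follows a plain try-primary-then-fall-back shape: prefer the private pool, warn, and retry against locked memory. A minimal standalone sketch of the same pattern, with an illustrative Alloc trait standing in for the crate's real allocator types (none of these names are from the crate):

use anyhow::Result;

// Stand-in trait for illustration only; the real code composes
// PagePoolAllocator and LockedMemorySpawner behind DmaClientBacking.
trait Alloc {
    fn alloc(&self, size: usize) -> Result<Vec<u8>>;
}

struct WithFallback<P, F>(P, F);

impl<P: Alloc, F: Alloc> Alloc for WithFallback<P, F> {
    fn alloc(&self, size: usize) -> Result<Vec<u8>> {
        // Prefer the primary allocator; warn and fall back on failure,
        // mirroring DmaClientBacking::PrivatePoolWithFallback above.
        self.0.alloc(size).or_else(|err| {
            eprintln!("primary allocation failed ({err:#}), falling back");
            self.1.alloc(size)
        })
    }
}

Note that attach_pending_buffers deliberately bails for this variant; presumably a restored buffer cannot be attributed to one backing or the other after servicing, so persistence is only offered by the pure pool-backed clients.
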
vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs (14 additions, 1 deletion)
@@ -325,6 +325,9 @@ impl<T: DeviceBacking> NvmeDriver<T> {
)
.context("failed to create admin queue pair")?;

+ let admin_sqes = admin.sq_entries();
+ let admin_cqes = admin.cq_entries();
+
let admin = worker.admin.insert(admin);

// Register the admin queue with the controller.
@@ -452,6 +455,13 @@ impl<T: DeviceBacking> NvmeDriver<T> {
let io_cqsize = (MAX_CQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
let io_sqsize = (MAX_SQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;

+ tracing::debug!(
+ io_cqsize,
+ io_sqsize,
+ hw_size = worker.registers.cap.mqes_z(),
+ "io queue sizes"
+ );
+
// Some hardware (such as ASAP) requires that the sq and cq have the same size.
io_cqsize.min(io_sqsize)
};
@@ -630,7 +640,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
admin: None, // Updated below.
identify: Some(Arc::new(
spec::IdentifyController::read_from_bytes(saved_state.identify_ctrl.as_bytes())
- .map_err(|_| RestoreError::InvalidData)?, // TODO: zerocopy: map_err (https://github.com/microsoft/openvmm/issues/759)
+ .map_err(|_| RestoreError::InvalidData)?,
)),
driver: driver.clone(),
io_issuers,
@@ -955,6 +965,9 @@ impl<T: DeviceBacking> DriverWorkerTask<T> {
)
.map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?;

+ assert_eq!(queue.sq_entries(), queue.cq_entries());
+ state.qsize = queue.sq_entries();
+
let io_sq_addr = queue.sq_addr();
let io_cq_addr = queue.cq_addr();

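
As a sanity check of the clamping math above: CAP.MQES is zero-based (a raw value of N means the controller supports N + 1 entries per queue), and the driver caps each queue at its own maximum before re-adding the 1. A small sketch with illustrative register values:

// mqes_z is the zero-based CAP.MQES field; max_entries is the driver cap
// (with the new constants below, MAX_SQ_ENTRIES == MAX_CQ_ENTRIES == 256).
fn io_queue_size(max_entries: u16, mqes_z: u16) -> u16 {
    (max_entries - 1).min(mqes_z) + 1
}

fn main() {
    assert_eq!(io_queue_size(256, 1023), 256); // generous device: driver cap wins
    assert_eq!(io_queue_size(256, 63), 64); // small device: hardware cap wins
}

The final io_cqsize.min(io_sqsize) then equalizes the two sizes, per the comment about hardware that requires matching SQ and CQ sizes.
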
vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs (91 additions, 12 deletions)
@@ -46,18 +46,23 @@ use zerocopy::FromZeros;

/// Value for unused PRP entries, to catch/mitigate buffer size mismatches.
const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1);
- /// Maximum SQ size in entries.
- pub const MAX_SQ_ENTRIES: u16 = (PAGE_SIZE / 64) as u16;
- /// Maximum CQ size in entries.
- pub const MAX_CQ_ENTRIES: u16 = (PAGE_SIZE / 16) as u16;
+
+ const SQ_ENTRY_SIZE: usize = size_of::<spec::Command>();
+ const CQ_ENTRY_SIZE: usize = size_of::<spec::Completion>();
/// Submission Queue size in bytes.
- const SQ_SIZE: usize = PAGE_SIZE;
+ const SQ_SIZE: usize = PAGE_SIZE * 4;
/// Completion Queue size in bytes.
const CQ_SIZE: usize = PAGE_SIZE;
+ /// Maximum SQ size in entries.
+ pub const MAX_SQ_ENTRIES: u16 = (SQ_SIZE / SQ_ENTRY_SIZE) as u16;
+ /// Maximum CQ size in entries.
+ pub const MAX_CQ_ENTRIES: u16 = (CQ_SIZE / CQ_ENTRY_SIZE) as u16;
/// Number of pages per queue if bounce buffering.
const PER_QUEUE_PAGES_BOUNCE_BUFFER: usize = 128;
/// Number of pages per queue if not bounce buffering.
const PER_QUEUE_PAGES_NO_BOUNCE_BUFFER: usize = 64;
+ /// Number of SQ entries per page (64).
+ const SQ_ENTRIES_PER_PAGE: usize = PAGE_SIZE / SQ_ENTRY_SIZE;

#[derive(Inspect)]
pub(crate) struct QueuePair<T: AerHandler> {
@@ -75,6 +80,8 @@ pub(crate) struct QueuePair<T: AerHandler> {
sq_entries: u16,
#[inspect(skip)]
cq_entries: u16,
+ sq_addr: u64,
+ cq_addr: u64,
}

impl PendingCommands {
@@ -177,17 +184,31 @@
}

impl<T: AerHandler> QueuePair<T> {
+ /// Create a new queue pair.
+ ///
+ /// `sq_entries` and `cq_entries` are the requested sizes in entries.
+ /// Calling code should request the largest size it thinks the device
+ /// will support (see `CAP.MQES`). These may be clamped down to what will
+ /// fit in one page should this routine fail to allocate physically
+ /// contiguous memory to back the queues.
+ /// IMPORTANT: Calling code should check the actual sizes via corresponding
+ /// calls to [`QueuePair::sq_entries`] and [`QueuePair::cq_entries`] AFTER calling this routine.
pub fn new(
spawner: impl SpawnDriver,
device: &impl DeviceBacking,
qid: u16,
- sq_entries: u16, // Requested SQ size in entries.
- cq_entries: u16, // Requested CQ size in entries.
+ sq_entries: u16,
+ cq_entries: u16,
interrupt: DeviceInterrupt,
registers: Arc<DeviceRegisters<impl DeviceBacking>>,
bounce_buffer: bool,
aer_handler: T,
) -> anyhow::Result<Self> {
+ // FUTURE: Consider splitting this into several allocations, rather than
+ // allocating the sum total together. This can increase the likelihood
+ // of getting contiguous memory when falling back to the LockedMem
+ // allocator, but this is not the expected path. Be careful that any
+ // changes you make here work with already established save state.
let total_size = SQ_SIZE
+ CQ_SIZE
+ if bounce_buffer {
@@ -196,6 +217,10 @@ impl<T: AerHandler> QueuePair<T> {
PER_QUEUE_PAGES_NO_BOUNCE_BUFFER * PAGE_SIZE
};
let dma_client = device.dma_client();
+
+ // TODO: Keepalive: Detect when the allocation came from outside
+ // the private pool and put the device in a degraded state, so it
+ // is possible to inspect that a servicing with keepalive will fail.
let mem = dma_client
.allocate_dma_buffer(total_size)
.context("failed to allocate memory for queues")?;
@@ -217,12 +242,11 @@
)
}

- /// Create new object or restore from saved state.
fn new_or_restore(
spawner: impl SpawnDriver,
qid: u16,
- sq_entries: u16, // Submission queue entries.
- cq_entries: u16, // Completion queue entries.
+ sq_entries: u16,
+ cq_entries: u16,
mut interrupt: DeviceInterrupt,
registers: Arc<DeviceRegisters<impl DeviceBacking>>,
mem: MemoryBlock,
@@ -235,6 +259,49 @@
let cq_mem_block = mem.subblock(SQ_SIZE, CQ_SIZE);
let data_offset = SQ_SIZE + CQ_SIZE;

+ // Make sure that the queue memory is physically contiguous. While the
+ // NVMe spec has provisions for non-contiguous queue memory, support
+ // depends on the device. At least one device that we must support
+ // requires that the memory is contiguous (via the CAP.CQR bit).
+ // Because of that, just simplify the code paths to use contiguous
+ // memory.
+ //
+ // We could also seek through the memory block to find contiguous pages
+ // (for example, if the first 4 pages are not contiguous, but pages 5-8
+ // are, use those), but other parts of this driver already rely on
+ // fixed offset math.
+ //
+ // N.B. It is expected that allocations from the private pool will
+ // always be contiguous, and that is the normal path. That can fail in
+ // some cases (e.g. if we got some guesses about memory size wrong), and
+ // we prefer to operate in a perf-degraded state rather than fail
+ // completely.
+
+ let (sq_is_contiguous, cq_is_contiguous) = (
+ sq_mem_block.contiguous_pfns(),
+ cq_mem_block.contiguous_pfns(),
+ );
+
+ let (sq_entries, cq_entries) = if !sq_is_contiguous || !cq_is_contiguous {
+ tracing::warn!(
+ qid,
+ sq_is_contiguous,
+ sq_mem_block.pfns = ?sq_mem_block.pfns(),
+ cq_is_contiguous,
+ cq_mem_block.pfns = ?cq_mem_block.pfns(),
+ "non-contiguous queue memory detected, falling back to single page queues"
+ );
+ // Clamp both queues to the number of entries that will fit in a
+ // single SQ page (since this will be the smaller of the SQ and
+ // CQ capacity).
+ (SQ_ENTRIES_PER_PAGE as u16, SQ_ENTRIES_PER_PAGE as u16)
+ } else {
+ (sq_entries, cq_entries)
+ };
+
+ let sq_addr = sq_mem_block.pfns()[0] * PAGE_SIZE64;
+ let cq_addr = cq_mem_block.pfns()[0] * PAGE_SIZE64;
+
let mut queue_handler = match saved_state {
Some(s) => QueueHandler::restore(sq_mem_block, cq_mem_block, s, aer_handler)?,
None => {
@@ -296,15 +363,27 @@
qid,
sq_entries,
cq_entries,
+ sq_addr,
+ cq_addr,
})
}

+ /// Returns the actual number of SQ entries supported by this queue pair.
+ pub fn sq_entries(&self) -> u16 {
+ self.sq_entries
+ }
+
+ /// Returns the actual number of CQ entries supported by this queue pair.
+ pub fn cq_entries(&self) -> u16 {
+ self.cq_entries
+ }
+
pub fn sq_addr(&self) -> u64 {
- self.mem.pfns()[0] * PAGE_SIZE64
+ self.sq_addr
}

pub fn cq_addr(&self) -> u64 {
- self.mem.pfns()[1] * PAGE_SIZE64
+ self.cq_addr
}

pub fn issuer(&self) -> &Arc<Issuer> {
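
To make the new constants concrete, assuming the usual 4 KiB PAGE_SIZE (spec::Command is 64 bytes and spec::Completion 16 bytes, as asserted in nvme_spec below):

const PAGE_SIZE: usize = 4096; // assumed page size

const SQ_ENTRY_SIZE: usize = 64; // size_of::<spec::Command>()
const CQ_ENTRY_SIZE: usize = 16; // size_of::<spec::Completion>()

const SQ_SIZE: usize = PAGE_SIZE * 4; // 16 KiB, up from one page
const CQ_SIZE: usize = PAGE_SIZE; // 4 KiB, unchanged

fn main() {
    // Both maxima now come out to 256 entries, which also lines up with
    // the matching-size requirement noted in driver.rs.
    assert_eq!(SQ_SIZE / SQ_ENTRY_SIZE, 256); // MAX_SQ_ENTRIES
    assert_eq!(CQ_SIZE / CQ_ENTRY_SIZE, 256); // MAX_CQ_ENTRIES
    // The non-contiguous fallback clamps both queues to one SQ page:
    assert_eq!(PAGE_SIZE / SQ_ENTRY_SIZE, 64); // SQ_ENTRIES_PER_PAGE
}

Previously MAX_SQ_ENTRIES was 64 (one page of 64-byte commands) while MAX_CQ_ENTRIES was 256, so the SQ was the limiting factor; quadrupling SQ_SIZE lets both queues reach 256 entries when the backing memory is contiguous.
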
vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs (4 additions, 2 deletions)
@@ -31,6 +31,8 @@ pub(crate) struct QueueFull;

impl SubmissionQueue {
pub fn new(sqid: u16, len: u16, mem: MemoryBlock) -> Self {
+ tracing::debug!(sqid, len, pfns = ?mem.pfns(), "new submission queue");
+
Self {
sqid,
head: 0,
@@ -122,6 +124,7 @@ pub(crate) struct CompletionQueue {

impl CompletionQueue {
pub fn new(cqid: u16, len: u16, mem: MemoryBlock) -> CompletionQueue {
+ tracing::debug!(cqid, len, pfns = ?mem.pfns(), "new completion queue");
Self {
cqid,
head: 0,
@@ -138,8 +141,7 @@

pub fn read(&mut self) -> Option<spec::Completion> {
let completion_mem = self.mem.as_slice()
- [self.head as usize * size_of::<spec::Completion>()..]
- [..size_of::<spec::Completion>() * 2]
+ [self.head as usize * size_of::<spec::Completion>()..][..size_of::<spec::Completion>()]
.as_atomic_slice::<AtomicU64>()
.unwrap();

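
The read fix above narrows the completion slice from two entries' worth of bytes to exactly one: the first index drops everything before the head entry, the second takes a single 16-byte entry, which the code then views as two AtomicU64 words. The offset arithmetic, with an illustrative head position:

fn main() {
    let entry = 16; // size_of::<spec::Completion>()
    let head = 3usize; // example head index
    let start = head * entry; // byte offset of the head entry
    let len = entry; // one entry; the old code took entry * 2
    assert_eq!((start, len), (48, 16)); // 16 bytes == two AtomicU64 words
}
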
vm/devices/storage/nvme_spec/Cargo.toml (1 addition, 0 deletions)
@@ -15,6 +15,7 @@ mesh.workspace = true
bitfield-struct.workspace = true
open_enum.workspace = true
zerocopy.workspace = true
+ static_assertions.workspace = true

[lints]
workspace = true
vm/devices/storage/nvme_spec/src/lib.rs (2 additions, 0 deletions)
@@ -157,6 +157,7 @@ pub struct Command {
pub cdw14: u32,
pub cdw15: u32,
}
+ static_assertions::assert_eq_size!(Command, [u8; 64]);

#[derive(Inspect)]
#[bitfield(u32)]
@@ -230,6 +231,7 @@ pub struct Completion {
pub cid: u16,
pub status: CompletionStatus,
}
+ static_assertions::assert_eq_size!(Completion, [u8; 16]);

#[bitfield(u16)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, MeshPayload)]
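
These compile-time assertions pin the wire-format sizes that queue_pair.rs now derives its entry counts from: 64-byte submission entries and 16-byte completion entries. assert_eq_size expands at item scope and fails the build on a mismatch; a minimal illustration with a hypothetical struct:

use static_assertions::assert_eq_size;

#[repr(C)]
struct Example {
    a: u64,
    b: u64,
}

assert_eq_size!(Example, [u8; 16]); // compiles: 16 bytes
// assert_eq_size!(Example, [u8; 24]); // would fail to compile
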
vm/devices/user_driver/src/lib.rs (2 additions, 0 deletions)
@@ -72,6 +72,8 @@ pub trait DmaClient: Send + Sync + Inspect {
/// Allocate a new DMA buffer. This buffer must be zero initialized.
///
/// TODO: string tag for allocation?
+ /// TODO: contiguous vs non-contiguous? (both on the request side, and if
+ /// a requested contiguous allocation cannot be fulfilled)
fn allocate_dma_buffer(&self, total_size: usize) -> anyhow::Result<MemoryBlock>;

/// Attach all previously allocated memory blocks.
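
One hypothetical shape the contiguity TODO could take, purely as a sketch (none of these names exist in the crate or in this PR):

/// Illustrative only; not part of the DmaClient trait.
pub enum Contiguity {
    /// Fail the allocation if physically contiguous pages are unavailable.
    Required,
    /// Accept scattered pages; the caller must tolerate non-contiguous PFNs.
    BestEffort,
}

// fn allocate_dma_buffer_with(&self, total_size: usize, contiguity: Contiguity)
//     -> anyhow::Result<MemoryBlock>;

Something along these lines would let QueuePair::new demand contiguous memory up front instead of detecting the miss after the fact.
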
vm/devices/user_driver/src/memory.rs (14 additions, 0 deletions)
@@ -156,6 +156,20 @@ impl MemoryBlock {
self.mem.pfn_bias()
}

+ /// Returns true if the PFNs are contiguous.
+ ///
+ /// TODO: Falling back to non-contiguous allocations is tolerated for now,
+ /// but we should eventually allow the caller to require contiguity. See
+ /// `DmaClient::allocate_dma_buffer`.
+ pub fn contiguous_pfns(&self) -> bool {
+ for (curr, next) in self.pfns().iter().zip(self.pfns().iter().skip(1)) {
+ if *curr + 1 != *next {
+ return false;
+ }
+ }
+
+ true
+ }
+
/// Gets the buffer as an atomic slice.
pub fn as_slice(&self) -> &[AtomicU8] {
// SAFETY: the underlying memory is valid for the lifetime of `mem`.
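
contiguous_pfns compares each PFN with its successor. An equivalent, slightly more idiomatic formulation using slice::windows (behaviorally identical sketch, standalone for illustration):

// pfns holds page frame numbers; the block is contiguous when each frame
// immediately follows its predecessor.
fn contiguous(pfns: &[u64]) -> bool {
    pfns.windows(2).all(|w| w[0] + 1 == w[1])
}

fn main() {
    assert!(contiguous(&[10, 11, 12]));
    assert!(!contiguous(&[10, 12, 13])); // gap after 10
    assert!(contiguous(&[7])); // a single page is trivially contiguous
}
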