Cargo.lock (2 additions, 0 deletions)
@@ -4819,6 +4819,7 @@ dependencies = [
"inspect",
"mesh",
"open_enum",
+ "static_assertions",
"storage_string",
"zerocopy 0.8.25",
]
@@ -4998,6 +4999,7 @@ dependencies = [
"memory_range",
"mesh",
"page_pool_alloc",
+ "tracing",
"user_driver",
"virt",
"vmcore",
openhcl/openhcl_dma_manager/Cargo.toml (1 addition, 0 deletions)
@@ -21,6 +21,7 @@ virt.workspace = true
vmcore.workspace = true

anyhow.workspace = true
+ tracing.workspace = true

[lints]
workspace = true
openhcl/openhcl_dma_manager/src/lib.rs (31 additions, 3 deletions)
@@ -286,12 +286,20 @@ impl DmaManagerInner {
allocation_visibility: AllocationVisibility::Private,
persistent_allocations: false,
shared_spawner: _,
- private_spawner: _,
+ private_spawner,
} => match lower_vtl_policy {
LowerVtlPermissionPolicy::Any => {
// No persistence needed means the `LockedMemorySpawner`
// using normal VTL2 ram is fine.
- DmaClientBacking::LockedMemory(LockedMemorySpawner)
+ match private_spawner {
+ Some(private) => DmaClientBacking::PrivatePoolWithFallback((
+ private
+ .allocator(device_name.into())
+ .context("failed to create private allocator")?,
+ LockedMemorySpawner,
+ )),
+ None => DmaClientBacking::LockedMemory(LockedMemorySpawner),
+ }
}
LowerVtlPermissionPolicy::Vtl0 => {
// `LockedMemorySpawner` uses private VTL2 ram, so
@@ -416,6 +424,7 @@ enum DmaClientBacking {
SharedPool(#[inspect(skip)] PagePoolAllocator),
PrivatePool(#[inspect(skip)] PagePoolAllocator),
LockedMemory(#[inspect(skip)] LockedMemorySpawner),
+ PrivatePoolWithFallback(#[inspect(skip)] (PagePoolAllocator, LockedMemorySpawner)),
PrivatePoolLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<PagePoolAllocator>),
LockedMemoryLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<LockedMemorySpawner>),
}
@@ -429,6 +438,16 @@ impl DmaClientBacking {
DmaClientBacking::SharedPool(allocator) => allocator.allocate_dma_buffer(total_size),
DmaClientBacking::PrivatePool(allocator) => allocator.allocate_dma_buffer(total_size),
DmaClientBacking::LockedMemory(spawner) => spawner.allocate_dma_buffer(total_size),
+ DmaClientBacking::PrivatePoolWithFallback((allocator, spawner)) => {
+ allocator.allocate_dma_buffer(total_size).or_else(|err| {
+ tracing::warn!(
+ size = total_size,
+ error = ?err,
+ "falling back to locked memory for dma allocation"
+ );
+ spawner.allocate_dma_buffer(total_size)
+ })
+ }
DmaClientBacking::PrivatePoolLowerVtl(spawner) => {
spawner.allocate_dma_buffer(total_size)
}
@@ -442,7 +461,16 @@
match self {
DmaClientBacking::SharedPool(allocator) => allocator.attach_pending_buffers(),
DmaClientBacking::PrivatePool(allocator) => allocator.attach_pending_buffers(),
- DmaClientBacking::LockedMemory(spawner) => spawner.attach_pending_buffers(),
+ DmaClientBacking::PrivatePoolWithFallback(_) => {
+ anyhow::bail!("cannot attach pending buffers with fallback allocator")
+ }
+ DmaClientBacking::LockedMemory(_) => {
+ anyhow::bail!(
+ "attaching pending buffers is not supported with locked memory; \
+ this client type does not maintain a pool of pending allocations. \
+ To use attach_pending_buffers, create a client backed by a shared or private pool."
+ )
+ }
DmaClientBacking::PrivatePoolLowerVtl(spawner) => spawner.attach_pending_buffers(),
DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(),
}
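
The PrivatePoolWithFallback arm above follows a plain try-primary-then-fall-back shape: prefer the private pool, warn, and retry against locked memory. A minimal standalone sketch of the same pattern, with an illustrative Alloc trait standing in for the crate's real allocator types (none of these names are from the crate):

use anyhow::Result;

// Stand-in trait for illustration only; the real code composes
// PagePoolAllocator and LockedMemorySpawner behind DmaClientBacking.
trait Alloc {
    fn alloc(&self, size: usize) -> Result<Vec<u8>>;
}

struct WithFallback<P, F>(P, F);

impl<P: Alloc, F: Alloc> Alloc for WithFallback<P, F> {
    fn alloc(&self, size: usize) -> Result<Vec<u8>> {
        // Prefer the primary allocator; warn and fall back on failure,
        // mirroring DmaClientBacking::PrivatePoolWithFallback above.
        self.0.alloc(size).or_else(|err| {
            eprintln!("primary allocation failed ({err:#}), falling back");
            self.1.alloc(size)
        })
    }
}

Note that attach_pending_buffers deliberately bails for this variant; presumably a restored buffer cannot be attributed to one backing or the other after servicing, so persistence is only offered by the pure pool-backed clients.
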
vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs (14 additions, 1 deletion)
@@ -325,6 +325,9 @@ impl<T: DeviceBacking> NvmeDriver<T> {
)
.context("failed to create admin queue pair")?;

+ let admin_sqes = admin.sq_entries();
+ let admin_cqes = admin.cq_entries();
+
let admin = worker.admin.insert(admin);

// Register the admin queue with the controller.
@@ -452,6 +455,13 @@ impl<T: DeviceBacking> NvmeDriver<T> {
let io_cqsize = (MAX_CQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
let io_sqsize = (MAX_SQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;

+ tracing::debug!(
+ io_cqsize,
+ io_sqsize,
+ hw_size = worker.registers.cap.mqes_z(),
+ "io queue sizes"
+ );
+
// Some hardware (such as ASAP) requires that the sq and cq have the same size.
io_cqsize.min(io_sqsize)
};
@@ -630,7 +640,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
admin: None, // Updated below.
identify: Some(Arc::new(
spec::IdentifyController::read_from_bytes(saved_state.identify_ctrl.as_bytes())
- .map_err(|_| RestoreError::InvalidData)?, // TODO: zerocopy: map_err (https://github.com/microsoft/openvmm/issues/759)
+ .map_err(|_| RestoreError::InvalidData)?,
)),
driver: driver.clone(),
io_issuers,
@@ -955,6 +965,9 @@ impl<T: DeviceBacking> DriverWorkerTask<T> {
)
.map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?;

+ assert_eq!(queue.sq_entries(), queue.cq_entries());
+ state.qsize = queue.sq_entries();
+
let io_sq_addr = queue.sq_addr();
let io_cq_addr = queue.cq_addr();

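
As a sanity check of the clamping math above: CAP.MQES is zero-based (a raw value of N means the controller supports N + 1 entries per queue), and the driver caps each queue at its own maximum before re-adding the 1. A small sketch with illustrative register values:

// mqes_z is the zero-based CAP.MQES field; max_entries is the driver cap
// (with the new constants below, MAX_SQ_ENTRIES == MAX_CQ_ENTRIES == 256).
fn io_queue_size(max_entries: u16, mqes_z: u16) -> u16 {
    (max_entries - 1).min(mqes_z) + 1
}

fn main() {
    assert_eq!(io_queue_size(256, 1023), 256); // generous device: driver cap wins
    assert_eq!(io_queue_size(256, 63), 64); // small device: hardware cap wins
}

The final io_cqsize.min(io_sqsize) then equalizes the two sizes, per the comment about hardware that requires matching SQ and CQ sizes.
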
vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs (91 additions, 12 deletions)
@@ -46,18 +46,23 @@ use zerocopy::FromZeros;

/// Value for unused PRP entries, to catch/mitigate buffer size mismatches.
const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1);
- /// Maximum SQ size in entries.
- pub const MAX_SQ_ENTRIES: u16 = (PAGE_SIZE / 64) as u16;
- /// Maximum CQ size in entries.
- pub const MAX_CQ_ENTRIES: u16 = (PAGE_SIZE / 16) as u16;
+
+ const SQ_ENTRY_SIZE: usize = size_of::<spec::Command>();
+ const CQ_ENTRY_SIZE: usize = size_of::<spec::Completion>();
/// Submission Queue size in bytes.
- const SQ_SIZE: usize = PAGE_SIZE;
+ const SQ_SIZE: usize = PAGE_SIZE * 4;
/// Completion Queue size in bytes.
const CQ_SIZE: usize = PAGE_SIZE;
+ /// Maximum SQ size in entries.
+ pub const MAX_SQ_ENTRIES: u16 = (SQ_SIZE / SQ_ENTRY_SIZE) as u16;
+ /// Maximum CQ size in entries.
+ pub const MAX_CQ_ENTRIES: u16 = (CQ_SIZE / CQ_ENTRY_SIZE) as u16;
/// Number of pages per queue if bounce buffering.
const PER_QUEUE_PAGES_BOUNCE_BUFFER: usize = 128;
/// Number of pages per queue if not bounce buffering.
const PER_QUEUE_PAGES_NO_BOUNCE_BUFFER: usize = 64;
+ /// Number of SQ entries per page (64).
+ const SQ_ENTRIES_PER_PAGE: usize = PAGE_SIZE / SQ_ENTRY_SIZE;

#[derive(Inspect)]
pub(crate) struct QueuePair<T: AerHandler> {
@@ -75,6 +80,8 @@ pub(crate) struct QueuePair<T: AerHandler> {
sq_entries: u16,
#[inspect(skip)]
cq_entries: u16,
+ sq_addr: u64,
+ cq_addr: u64,
}

impl PendingCommands {
@@ -177,17 +184,31 @@
}

impl<T: AerHandler> QueuePair<T> {
+ /// Create a new queue pair.
+ ///
+ /// `sq_entries` and `cq_entries` are the requested sizes in entries.
+ /// Calling code should request the largest size it thinks the device
+ /// will support (see `CAP.MQES`). These may be clamped down to what will
+ /// fit in one page should this routine fail to allocate physically
+ /// contiguous memory to back the queues.
+ /// IMPORTANT: Calling code should check the actual sizes via corresponding
+ /// calls to [`QueuePair::sq_entries`] and [`QueuePair::cq_entries`] AFTER calling this routine.
pub fn new(
spawner: impl SpawnDriver,
device: &impl DeviceBacking,
qid: u16,
- sq_entries: u16, // Requested SQ size in entries.
- cq_entries: u16, // Requested CQ size in entries.
+ sq_entries: u16,
+ cq_entries: u16,
interrupt: DeviceInterrupt,
registers: Arc<DeviceRegisters<impl DeviceBacking>>,
bounce_buffer: bool,
aer_handler: T,
) -> anyhow::Result<Self> {
+ // FUTURE: Consider splitting this into several allocations, rather than
+ // allocating the sum total together. This can increase the likelihood
+ // of getting contiguous memory when falling back to the LockedMem
+ // allocator, but this is not the expected path. Be careful that any
+ // changes you make here work with already established save state.
let total_size = SQ_SIZE
+ CQ_SIZE
+ if bounce_buffer {
@@ -196,6 +217,10 @@ impl<T: AerHandler> QueuePair<T> {
PER_QUEUE_PAGES_NO_BOUNCE_BUFFER * PAGE_SIZE
};
let dma_client = device.dma_client();
+
+ // TODO: Keepalive: Detect when the allocation came from outside
+ // the private pool and put the device in a degraded state, so it
+ // is possible to inspect that a servicing with keepalive will fail.
let mem = dma_client
.allocate_dma_buffer(total_size)
.context("failed to allocate memory for queues")?;
@@ -217,12 +242,11 @@
)
}

- /// Create new object or restore from saved state.
fn new_or_restore(
spawner: impl SpawnDriver,
qid: u16,
- sq_entries: u16, // Submission queue entries.
- cq_entries: u16, // Completion queue entries.
+ sq_entries: u16,
+ cq_entries: u16,
mut interrupt: DeviceInterrupt,
registers: Arc<DeviceRegisters<impl DeviceBacking>>,
mem: MemoryBlock,
@@ -235,6 +259,49 @@
let cq_mem_block = mem.subblock(SQ_SIZE, CQ_SIZE);
let data_offset = SQ_SIZE + CQ_SIZE;

+ // Make sure that the queue memory is physically contiguous. While the
+ // NVMe spec has provisions for non-contiguous queue memory, support
+ // depends on the device. At least one device that we must support
+ // requires that the memory is contiguous (via the CAP.CQR bit).
+ // Because of that, just simplify the code paths to use contiguous
+ // memory.
+ //
+ // We could also seek through the memory block to find contiguous pages
+ // (for example, if the first 4 pages are not contiguous, but pages 5-8
+ // are, use those), but other parts of this driver already rely on
+ // fixed offset math.
+ //
+ // N.B. It is expected that allocations from the private pool will
+ // always be contiguous, and that is the normal path. That can fail in
+ // some cases (e.g. if we got some guesses about memory size wrong), and
+ // we prefer to operate in a perf-degraded state rather than fail
+ // completely.
+
+ let (sq_is_contiguous, cq_is_contiguous) = (
+ sq_mem_block.contiguous_pfns(),
+ cq_mem_block.contiguous_pfns(),
+ );
+
+ let (sq_entries, cq_entries) = if !sq_is_contiguous || !cq_is_contiguous {
+ tracing::warn!(
+ qid,
+ sq_is_contiguous,
+ sq_mem_block.pfns = ?sq_mem_block.pfns(),
+ cq_is_contiguous,
+ cq_mem_block.pfns = ?cq_mem_block.pfns(),
+ "non-contiguous queue memory detected, falling back to single page queues"
+ );
+ // Clamp both queues to the number of entries that will fit in a
+ // single SQ page (since this will be the smaller of the SQ and
+ // CQ capacity).
+ (SQ_ENTRIES_PER_PAGE as u16, SQ_ENTRIES_PER_PAGE as u16)
+ } else {
+ (sq_entries, cq_entries)
+ };
+
+ let sq_addr = sq_mem_block.pfns()[0] * PAGE_SIZE64;
+ let cq_addr = cq_mem_block.pfns()[0] * PAGE_SIZE64;
+
let mut queue_handler = match saved_state {
Some(s) => QueueHandler::restore(sq_mem_block, cq_mem_block, s, aer_handler)?,
None => {
@@ -296,15 +363,27 @@
qid,
sq_entries,
cq_entries,
+ sq_addr,
+ cq_addr,
})
}

+ /// Returns the actual number of SQ entries supported by this queue pair.
+ pub fn sq_entries(&self) -> u16 {
+ self.sq_entries
+ }
+
+ /// Returns the actual number of CQ entries supported by this queue pair.
+ pub fn cq_entries(&self) -> u16 {
+ self.cq_entries
+ }
+
pub fn sq_addr(&self) -> u64 {
- self.mem.pfns()[0] * PAGE_SIZE64
+ self.sq_addr
}

pub fn cq_addr(&self) -> u64 {
- self.mem.pfns()[1] * PAGE_SIZE64
+ self.cq_addr
}

pub fn issuer(&self) -> &Arc<Issuer> {
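
To make the new constants concrete, assuming the usual 4 KiB PAGE_SIZE (spec::Command is 64 bytes and spec::Completion 16 bytes, as asserted in nvme_spec below):

const PAGE_SIZE: usize = 4096; // assumed page size

const SQ_ENTRY_SIZE: usize = 64; // size_of::<spec::Command>()
const CQ_ENTRY_SIZE: usize = 16; // size_of::<spec::Completion>()

const SQ_SIZE: usize = PAGE_SIZE * 4; // 16 KiB, up from one page
const CQ_SIZE: usize = PAGE_SIZE; // 4 KiB, unchanged

fn main() {
    // Both maxima now come out to 256 entries, which also lines up with
    // the matching-size requirement noted in driver.rs.
    assert_eq!(SQ_SIZE / SQ_ENTRY_SIZE, 256); // MAX_SQ_ENTRIES
    assert_eq!(CQ_SIZE / CQ_ENTRY_SIZE, 256); // MAX_CQ_ENTRIES
    // The non-contiguous fallback clamps both queues to one SQ page:
    assert_eq!(PAGE_SIZE / SQ_ENTRY_SIZE, 64); // SQ_ENTRIES_PER_PAGE
}

Previously MAX_SQ_ENTRIES was 64 (one page of 64-byte commands) while MAX_CQ_ENTRIES was 256, so the SQ was the limiting factor; quadrupling SQ_SIZE lets both queues reach 256 entries when the backing memory is contiguous.
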
vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs (4 additions, 2 deletions)
@@ -31,6 +31,8 @@ pub(crate) struct QueueFull;

impl SubmissionQueue {
pub fn new(sqid: u16, len: u16, mem: MemoryBlock) -> Self {
+ tracing::debug!(sqid, len, pfns = ?mem.pfns(), "new submission queue");
+
Self {
sqid,
head: 0,
@@ -122,6 +124,7 @@ pub(crate) struct CompletionQueue {

impl CompletionQueue {
pub fn new(cqid: u16, len: u16, mem: MemoryBlock) -> CompletionQueue {
+ tracing::debug!(cqid, len, pfns = ?mem.pfns(), "new completion queue");
Self {
cqid,
head: 0,
@@ -138,8 +141,7 @@

pub fn read(&mut self) -> Option<spec::Completion> {
let completion_mem = self.mem.as_slice()
- [self.head as usize * size_of::<spec::Completion>()..]
- [..size_of::<spec::Completion>() * 2]
+ [self.head as usize * size_of::<spec::Completion>()..][..size_of::<spec::Completion>()]
.as_atomic_slice::<AtomicU64>()
.unwrap();

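
The read fix above narrows the completion slice from two entries' worth of bytes to exactly one: the first index drops everything before the head entry, the second takes a single 16-byte entry, which the code then views as two AtomicU64 words. The offset arithmetic, with an illustrative head position:

fn main() {
    let entry = 16; // size_of::<spec::Completion>()
    let head = 3usize; // example head index
    let start = head * entry; // byte offset of the head entry
    let len = entry; // one entry; the old code took entry * 2
    assert_eq!((start, len), (48, 16)); // 16 bytes == two AtomicU64 words
}
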
vm/devices/storage/nvme_spec/Cargo.toml (1 addition, 0 deletions)
@@ -15,6 +15,7 @@ mesh.workspace = true
bitfield-struct.workspace = true
open_enum.workspace = true
zerocopy.workspace = true
+ static_assertions.workspace = true

[lints]
workspace = true
vm/devices/storage/nvme_spec/src/lib.rs (2 additions, 0 deletions)
@@ -157,6 +157,7 @@ pub struct Command {
pub cdw14: u32,
pub cdw15: u32,
}
+ static_assertions::assert_eq_size!(Command, [u8; 64]);

#[derive(Inspect)]
#[bitfield(u32)]
@@ -230,6 +231,7 @@ pub struct Completion {
pub cid: u16,
pub status: CompletionStatus,
}
+ static_assertions::assert_eq_size!(Completion, [u8; 16]);

#[bitfield(u16)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, MeshPayload)]
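
These compile-time assertions pin the wire-format sizes that queue_pair.rs now derives its entry counts from: 64-byte submission entries and 16-byte completion entries. assert_eq_size expands at item scope and fails the build on a mismatch; a minimal illustration with a hypothetical struct:

use static_assertions::assert_eq_size;

#[repr(C)]
struct Example {
    a: u64,
    b: u64,
}

assert_eq_size!(Example, [u8; 16]); // compiles: 16 bytes
// assert_eq_size!(Example, [u8; 24]); // would fail to compile
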
vm/devices/user_driver/src/lib.rs (2 additions, 0 deletions)
@@ -72,6 +72,8 @@ pub trait DmaClient: Send + Sync + Inspect {
/// Allocate a new DMA buffer. This buffer must be zero initialized.
///
/// TODO: string tag for allocation?
+ /// TODO: contiguous vs non-contiguous? (both on the request side, and if
+ /// a requested contiguous allocation cannot be fulfilled)
fn allocate_dma_buffer(&self, total_size: usize) -> anyhow::Result<MemoryBlock>;

/// Attach all previously allocated memory blocks.
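
One hypothetical shape the contiguity TODO could take, purely as a sketch (none of these names exist in the crate or in this PR):

/// Illustrative only; not part of the DmaClient trait.
pub enum Contiguity {
    /// Fail the allocation if physically contiguous pages are unavailable.
    Required,
    /// Accept scattered pages; the caller must tolerate non-contiguous PFNs.
    BestEffort,
}

// fn allocate_dma_buffer_with(&self, total_size: usize, contiguity: Contiguity)
//     -> anyhow::Result<MemoryBlock>;

Something along these lines would let QueuePair::new demand contiguous memory up front instead of detecting the miss after the fact.
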
vm/devices/user_driver/src/memory.rs (14 additions, 0 deletions)
@@ -156,6 +156,20 @@ impl MemoryBlock {
self.mem.pfn_bias()
}

+ /// Returns true if the PFNs are contiguous.
+ ///
+ /// TODO: Falling back to non-contiguous allocations is tolerated for now,
+ /// but we should eventually allow the caller to require contiguity. See
+ /// `DmaClient::allocate_dma_buffer`.
+ pub fn contiguous_pfns(&self) -> bool {
+ for (curr, next) in self.pfns().iter().zip(self.pfns().iter().skip(1)) {
+ if *curr + 1 != *next {
+ return false;
+ }
+ }
+
+ true
+ }
+
/// Gets the buffer as an atomic slice.
pub fn as_slice(&self) -> &[AtomicU8] {
// SAFETY: the underlying memory is valid for the lifetime of `mem`.
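
contiguous_pfns compares each PFN with its successor. An equivalent, slightly more idiomatic formulation using slice::windows (behaviorally identical sketch, standalone for illustration):

// pfns holds page frame numbers; the block is contiguous when each frame
// immediately follows its predecessor.
fn contiguous(pfns: &[u64]) -> bool {
    pfns.windows(2).all(|w| w[0] + 1 == w[1])
}

fn main() {
    assert!(contiguous(&[10, 11, 12]));
    assert!(!contiguous(&[10, 12, 13])); // gap after 10
    assert!(contiguous(&[7])); // a single page is trivially contiguous
}
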