Skip to content

Commit e745258

Browse files
bchaliosShadowCurse
authored andcommitted
net: use readv for reading frames from TAP device
Right now, we are performing two copies for writing a frame from the TAP device into guest memory. We first read the frame in an array held by the Net device and then copy that array in a DescriptorChain. In order to avoid the double copy use the readv system call to read directly from the TAP device into the buffers described by DescriptorChain. The main challenge with this is that DescriptorChain objects describe memory that is at least 65562 bytes long when guest TSO4, TSO6 or UFO are enabled or 1526 otherwise and parsing the chain includes overhead which we pay even if the frame we are receiving is much smaller than these sizes. PR firecracker-microvm#4748 reduced the overheads involved with parsing DescriptorChain objects. To further avoid this overhead, move the parsing of DescriptorChain objects out of the hot path of process_rx() where we are actually receiving a frame into process_rx_queue_event() where we get the notification that the guest added new buffers for network RX. Signed-off-by: Babis Chalios <[email protected]>
1 parent 7762425 commit e745258

File tree

8 files changed

+584
-304
lines changed

8 files changed

+584
-304
lines changed

resources/seccomp/aarch64-unknown-linux-musl.json

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,108 @@
3232
"syscall": "writev",
3333
"comment": "Used by the VirtIO net device to write to tap"
3434
},
35+
{
36+
"syscall": "readv",
37+
"comment": "Used by the VirtIO net device to read from tap"
38+
},
39+
{
40+
"syscall": "memfd_create",
41+
"comment": "Used by the IovDeque implementation"
42+
},
43+
{
44+
"syscall": "fcntl",
45+
"comment": "Used by the IovDeque implementation",
46+
"args": [
47+
{
48+
"index": 1,
49+
"type": "dword",
50+
"op": "eq",
51+
"val": 1033,
52+
"comment": "FCNTL_F_SETFD"
53+
},
54+
{
55+
"index": 2,
56+
"type": "dword",
57+
"op": "eq",
58+
"val": 6,
59+
"comment": "F_SEAL_SHRINK|F_SEAL_GROW"
60+
}
61+
]
62+
},
63+
{
64+
"syscall": "fcntl",
65+
"comment": "Used by the IovDeque implementation",
66+
"args": [
67+
{
68+
"index": 1,
69+
"type": "dword",
70+
"op": "eq",
71+
"val": 1033,
72+
"comment": "FCNTL_F_SETFD"
73+
},
74+
{
75+
"index": 2,
76+
"type": "dword",
77+
"op": "eq",
78+
"val": 1,
79+
"comment": "F_SEAL_SEAL"
80+
}
81+
]
82+
},
83+
{
84+
"syscall": "mmap",
85+
"comment": "Used by the IovDeque implementation",
86+
"args": [
87+
{
88+
"index": 1,
89+
"type": "dword",
90+
"op": "eq",
91+
"val": 4096,
92+
"comment": "Page size allocation"
93+
},
94+
{
95+
"index": 2,
96+
"type": "dword",
97+
"op": "eq",
98+
"val": 3,
99+
"comment": "PROT_READ|PROT_WRITE"
100+
},
101+
{
102+
"index": 3,
103+
"type": "dword",
104+
"op": "eq",
105+
"val": 17,
106+
"comment": "MAP_SHARED|MAP_FIXED"
107+
}
108+
]
109+
},
110+
{
111+
"syscall": "mmap",
112+
"comment": "Used by the IovDeque implementation",
113+
"args": [
114+
{
115+
"index": 1,
116+
"type": "dword",
117+
"op": "eq",
118+
"val": 8192,
119+
"comment": "2 pages allocation"
120+
},
121+
{
122+
"index": 2,
123+
"type": "dword",
124+
"op": "eq",
125+
"val": 0,
126+
"comment": "PROT_NONE"
127+
},
128+
{
129+
"index": 3,
130+
"type": "dword",
131+
"op": "eq",
132+
"val": 34,
133+
"comment": "MAP_PRIVATE|MAP_ANONYMOUS"
134+
}
135+
]
136+
},
35137
{
36138
"syscall": "fsync"
37139
},

resources/seccomp/x86_64-unknown-linux-musl.json

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,108 @@
3232
"syscall": "writev",
3333
"comment": "Used by the VirtIO net device to write to tap"
3434
},
35+
{
36+
"syscall": "readv",
37+
"comment": "Used by the VirtIO net device to read from tap"
38+
},
39+
{
40+
"syscall": "memfd_create",
41+
"comment": "Used by the IovDeque implementation"
42+
},
43+
{
44+
"syscall": "fcntl",
45+
"comment": "Used by the IovDeque implementation",
46+
"args": [
47+
{
48+
"index": 1,
49+
"type": "dword",
50+
"op": "eq",
51+
"val": 1033,
52+
"comment": "FCNTL_F_SETFD"
53+
},
54+
{
55+
"index": 2,
56+
"type": "dword",
57+
"op": "eq",
58+
"val": 6,
59+
"comment": "F_SEAL_SHRINK|F_SEAL_GROW"
60+
}
61+
]
62+
},
63+
{
64+
"syscall": "fcntl",
65+
"comment": "Used by the IovDeque implementation",
66+
"args": [
67+
{
68+
"index": 1,
69+
"type": "dword",
70+
"op": "eq",
71+
"val": 1033,
72+
"comment": "FCNTL_F_SETFD"
73+
},
74+
{
75+
"index": 2,
76+
"type": "dword",
77+
"op": "eq",
78+
"val": 1,
79+
"comment": "F_SEAL_SEAL"
80+
}
81+
]
82+
},
83+
{
84+
"syscall": "mmap",
85+
"comment": "Used by the IovDeque implementation",
86+
"args": [
87+
{
88+
"index": 1,
89+
"type": "dword",
90+
"op": "eq",
91+
"val": 4096,
92+
"comment": "Page size allocation"
93+
},
94+
{
95+
"index": 2,
96+
"type": "dword",
97+
"op": "eq",
98+
"val": 3,
99+
"comment": "PROT_READ|PROT_WRITE"
100+
},
101+
{
102+
"index": 3,
103+
"type": "dword",
104+
"op": "eq",
105+
"val": 17,
106+
"comment": "MAP_SHARED|MAP_FIXED"
107+
}
108+
]
109+
},
110+
{
111+
"syscall": "mmap",
112+
"comment": "Used by the IovDeque implementation",
113+
"args": [
114+
{
115+
"index": 1,
116+
"type": "dword",
117+
"op": "eq",
118+
"val": 8192,
119+
"comment": "2 pages allocation"
120+
},
121+
{
122+
"index": 2,
123+
"type": "dword",
124+
"op": "eq",
125+
"val": 0,
126+
"comment": "PROT_NONE"
127+
},
128+
{
129+
"index": 3,
130+
"type": "dword",
131+
"op": "eq",
132+
"val": 34,
133+
"comment": "MAP_PRIVATE|MAP_ANONYMOUS"
134+
}
135+
]
136+
},
35137
{
36138
"syscall": "fsync"
37139
},

src/vmm/src/devices/virtio/iovec.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ pub enum IoVecError {
2121
ReadOnlyDescriptor,
2222
/// Tried to create an `IoVec` or `IoVecMut` from a descriptor chain that was too large
2323
OverflowedDescriptor,
24+
/// Tried to push to full IovDeque.
25+
IovDequeOverflow,
2426
/// Guest memory error: {0}
2527
GuestMemory(#[from] GuestMemoryError),
2628
/// Error with underlying `IovDeque`: {0}
@@ -250,42 +252,47 @@ impl IoVecBufferMut {
250252
let mut nr_iovecs = 0u16;
251253
while let Some(desc) = next_descriptor {
252254
if !desc.is_write_only() {
253-
self.vecs.pop_front(nr_iovecs);
255+
self.vecs.pop_back(nr_iovecs);
254256
return Err(IoVecError::ReadOnlyDescriptor);
255257
}
256258

257259
// We use get_slice instead of `get_host_address` here in order to have the whole
258260
// range of the descriptor chain checked, i.e. [addr, addr + len) is a valid memory
259261
// region in the GuestMemoryMmap.
260262
let slice = mem.get_slice(desc.addr, desc.len as usize).map_err(|err| {
261-
self.vecs.pop_front(nr_iovecs);
263+
self.vecs.pop_back(nr_iovecs);
262264
err
263265
})?;
264-
265266
// We need to mark the area of guest memory that will be mutated through this
266267
// IoVecBufferMut as dirty ahead of time, as we loose access to all
267268
// vm-memory related information after converting down to iovecs.
268269
slice.bitmap().mark_dirty(0, desc.len as usize);
269-
270270
let iov_base = slice.ptr_guard_mut().as_ptr().cast::<c_void>();
271+
272+
if self.vecs.is_full() {
273+
self.vecs.pop_back(nr_iovecs);
274+
return Err(IoVecError::IovDequeOverflow);
275+
}
276+
271277
self.vecs.push_back(iovec {
272278
iov_base,
273279
iov_len: desc.len as size_t,
274280
});
281+
275282
nr_iovecs += 1;
276283
length = length
277284
.checked_add(desc.len)
278285
.ok_or(IoVecError::OverflowedDescriptor)
279286
.map_err(|err| {
280-
self.vecs.pop_front(nr_iovecs);
287+
self.vecs.pop_back(nr_iovecs);
281288
err
282289
})?;
283290

284291
next_descriptor = desc.next_descriptor();
285292
}
286293

287294
self.len = self.len.checked_add(length).ok_or_else(|| {
288-
self.vecs.pop_front(nr_iovecs);
295+
self.vecs.pop_back(nr_iovecs);
289296
IoVecError::OverflowedDescriptor
290297
})?;
291298

@@ -333,14 +340,22 @@ impl IoVecBufferMut {
333340
self.parse_descriptor(mem, head)
334341
}
335342

336-
/// Drop memory from the `IoVecBufferMut`
343+
/// Drop descriptor chain from the `IoVecBufferMut` front
337344
///
338345
/// This will drop memory described by the `IoVecBufferMut` from the beginning.
339-
pub fn drop_descriptor_chain(&mut self, parse_descriptor: &ParsedDescriptorChain) {
346+
pub fn drop_chain_front(&mut self, parse_descriptor: &ParsedDescriptorChain) {
340347
self.vecs.pop_front(parse_descriptor.nr_iovecs);
341348
self.len -= parse_descriptor.length;
342349
}
343350

351+
/// Drop descriptor chain from the `IoVecBufferMut` back
352+
///
353+
/// This will drop memory described by the `IoVecBufferMut` from the beginning.
354+
pub fn drop_chain_back(&mut self, parse_descriptor: &ParsedDescriptorChain) {
355+
self.vecs.pop_back(parse_descriptor.nr_iovecs);
356+
self.len -= parse_descriptor.length;
357+
}
358+
344359
/// Create an `IoVecBuffer` from a `DescriptorChain`
345360
///
346361
/// # Safety

0 commit comments

Comments
 (0)