Commit 069d114

Author:    Tariq Toukan
Committer: Saeed Mahameed
net/mlx5e: RX, Enhance legacy Receive Queue memory scheme
Enhance the memory scheme of the legacy RQ so that only order-0 pages are used.

Whenever possible, prefer a linear SKB and build it wrapping the WQE buffer. Otherwise (for example, jumbo frames on x86), use a non-linear SKB with as many frags as needed. In this case, multiple WQE scatter entries are used, up to a maximum of 4 frags and 10KB of MTU.

This implied removing support for HW LRO in the legacy RQ, as it would require a large number of page allocations and scatter entries per WQE on archs with PAGE_SIZE = 4KB, yielding bad performance.

In earlier patches, we guaranteed that all completions are in-order and that we use a cyclic WQ. This creates an opportunity for a performance optimization: the mapping between a struct mlx5e_dma_info and the WQEs (struct mlx5e_wqe_frag_info) pointing to it is constant across different cycles of a WQ. This allows initializing the mapping at RQ creation time, instead of handling it in the datapath.

A struct mlx5e_dma_info that is shared between different WQEs is allocated by the first WQE and freed by the last one. This implies an important requirement: WQEs that share the same struct mlx5e_dma_info must be posted within the same NAPI cycle. Otherwise, upon completion, struct mlx5e_wqe_frag_info would mistakenly point to the new struct mlx5e_dma_info, not the one that was posted (and that the HW wrote to). This bulking requirement is also good for performance, hence we extend the bulk beyond the minimal requirement above.

With this memory scheme, the RQ's memory footprint is reduced by a factor of 2 on x86, and by a factor of 32 on PowerPC. The same factors apply to the number of pages in a GRO session.

Performance tests:
ConnectX-4, single core, single RX ring, default MTU.

x86:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Packet rate (early drop in TC): no degradation
TCP streams: ~5% improvement

PowerPC:
CPU: POWER8 (raw), altivec supported
Packet rate (early drop in TC): 20% gain
TCP streams: 25% gain

Signed-off-by: Tariq Toukan <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
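The constant frag-to-page mapping described above can be illustrated with a small standalone C sketch: it is not the driver's code, and the type names, helper name, and 4KB page size are assumptions for illustration only. Each frag gets a fixed page (di) pointer, an offset inside that page, and a last_in_page flag, so the first frag using a page allocates it and the last one releases it.

/*
 * Standalone sketch (not the driver's code) of the fixed partition of
 * order-0 pages among WQE frags, computed once at RQ creation time.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SZ 4096	/* assumed page size */

struct dma_info_sketch { void *page; };

struct frag_sketch {
	struct dma_info_sketch *di;
	unsigned int offset;
	bool last_in_page;
};

static void init_frag_partition(struct frag_sketch *frags, int wq_sz,
				int num_frags, int frag_stride,
				struct dma_info_sketch *di_arr)
{
	struct dma_info_sketch *di = di_arr;
	struct frag_sketch *prev = NULL;
	unsigned int offset = 0;
	int i, f;

	for (i = 0; i < wq_sz; i++) {
		for (f = 0; f < num_frags; f++) {
			struct frag_sketch *frag = &frags[i * num_frags + f];

			if (offset + frag_stride > PAGE_SZ) {
				/* page exhausted: previous frag is its last user */
				if (prev)
					prev->last_in_page = true;
				di++;
				offset = 0;
			}
			frag->di = di;
			frag->offset = offset;
			frag->last_in_page = false;

			offset += frag_stride;
			prev = frag;
		}
	}
	if (prev)
		prev->last_in_page = true;	/* close the final page */
}

int main(void)
{
	struct dma_info_sketch pages[4] = {{0}};
	struct frag_sketch frags[8];	/* 8 WQEs, 1 frag each, 2KB stride */
	int i;

	init_frag_partition(frags, 8, 1, 2048, pages);
	for (i = 0; i < 8; i++)
		printf("frag %d -> page %ld, offset %u, last=%d\n", i,
		       (long)(frags[i].di - pages), frags[i].offset,
		       frags[i].last_in_page);
	return 0;
}

With a 2KB stride, two frags share each 4KB page; only the second user of each page carries last_in_page, which is what lets the first WQE allocate the page and the last one free it.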
Parent: 99cbfa9

3 files changed: 362 additions, 127 deletions

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 35 additions & 9 deletions
@@ -101,11 +101,15 @@ struct page_pool;
 (MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
 (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))

+#define MLX5E_MIN_SKB_FRAG_SZ (MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM))
+#define MLX5E_LOG_MAX_RX_WQE_BULK \
+        (ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ)))
+
 #define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x6
 #define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xd

-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE 0x1
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK)
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd, \
         MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
@@ -462,8 +466,9 @@ struct mlx5e_dma_info {
 };

 struct mlx5e_wqe_frag_info {
-        struct mlx5e_dma_info di;
+        struct mlx5e_dma_info *di;
         u32 offset;
+        bool last_in_page;
 };

 struct mlx5e_umr_dma_info {
@@ -476,6 +481,8 @@ struct mlx5e_mpw_info {
         DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };

+#define MLX5E_MAX_RX_FRAGS 4
+
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
  * or a MPWQE (for striding rq).
  */
@@ -493,23 +500,37 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
 typedef struct sk_buff *
 (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
                                u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+                         struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);

 enum mlx5e_rq_flag {
         MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
 };

+struct mlx5e_rq_frag_info {
+        int frag_size;
+        int frag_stride;
+};
+
+struct mlx5e_rq_frags_info {
+        struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
+        u8 num_frags;
+        u8 log_num_frags;
+        u8 wqe_bulk;
+};
+
 struct mlx5e_rq {
         /* data path */
         union {
                 struct {
-                        struct mlx5_wq_cyc wq;
-                        struct mlx5e_wqe_frag_info *frag_info;
-                        u32 frag_sz; /* max possible skb frag_sz */
-                        union {
-                                bool page_reuse;
-                        };
+                        struct mlx5_wq_cyc wq;
+                        struct mlx5e_wqe_frag_info *frags;
+                        struct mlx5e_dma_info *di;
+                        struct mlx5e_rq_frags_info info;
+                        mlx5e_fp_skb_from_cqe skb_from_cqe;
                 } wqe;
                 struct {
                         struct mlx5_wq_ll wq;
@@ -523,7 +544,6 @@ struct mlx5e_rq {
         };
         struct {
                 u16 headroom;
-                u8 page_order;
                 u8 map_dir; /* dma map direction */
         } buff;

@@ -879,6 +899,12 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
                                    u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+                          struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+                             struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);

 void mlx5e_update_stats(struct mlx5e_priv *priv);
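The new struct mlx5e_rq_frags_info above describes the per-RQ frag layout (up to MLX5E_MAX_RX_FRAGS entries). Below is a standalone C sketch of the underlying idea, not the driver's parameter-calculation code; the page size, headroom value, and function name are assumptions. It prefers a single linear frag when the packet plus headroom fits in one order-0 page, and otherwise splits the byte count across up to four page-sized frags.

/*
 * Hypothetical sketch: choose a linear layout when possible, otherwise
 * split the byte count over up to 4 order-0 page frags.
 */
#include <stdio.h>

#define PAGE_SZ      4096	/* assumed page size */
#define HEADROOM      256	/* assumed RX headroom */
#define MAX_RX_FRAGS    4

struct frag_info { int frag_size; int frag_stride; };

static int build_frags(int byte_count, struct frag_info *arr)
{
	int left, i = 0;

	if (HEADROOM + byte_count <= PAGE_SZ) {
		/* linear: one frag, SKB built wrapping the WQE buffer */
		arr[0].frag_size = byte_count;
		arr[0].frag_stride = PAGE_SZ;
		return 1;
	}

	for (left = byte_count; left > 0 && i < MAX_RX_FRAGS; i++) {
		arr[i].frag_size = left > PAGE_SZ ? PAGE_SZ : left;
		arr[i].frag_stride = PAGE_SZ;	/* one full page per frag */
		left -= arr[i].frag_size;
	}
	return left > 0 ? -1 : i;	/* -1: byte count too large for 4 frags */
}

int main(void)
{
	struct frag_info arr[MAX_RX_FRAGS];
	int n = build_frags(9000, arr);	/* jumbo frame on a 4KB-page arch */

	printf("num_frags = %d\n", n);	/* 3 frags: 4096, 4096, 808 bytes */
	return 0;
}

A 9000-byte frame on a 4KB-page arch lands in the non-linear path with three frags, matching the commit message's point that jumbo frames on x86 need multiple WQE scatter entries while the default MTU stays linear.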