Skip to content

Commit 7cf497e

Browse files
committed
Merge branch 'nexthop-group-stats'
Petr Machata says: ==================== Support for nexthop group statistics ECMP is a fundamental component in L3 designs. However, it's fragile. Many factors influence whether an ECMP group will operate as intended: hash policy (i.e. the set of fields that contribute to ECMP hash calculation), neighbor validity, hash seed (which might lead to polarization) or the type of ECMP group used (hash-threshold or resilient). At the same time, collection of statistics that would help an operator determine that the group performs as desired, is difficult. A solution that we present in this patchset is to add counters to next hop group entries. For SW-datapath deployments, this will on its own allow collection and evaluation of relevant statistics. For HW-datapath deployments, we further add a way to request that HW counters be installed for a given group, in-kernel interfaces to collect the HW statistics, and netlink interfaces to query them. For example: # ip nexthop replace id 4000 group 4001/4002 hw_stats on # ip -s -d nexthop show id 4000 id 4000 group 4001/4002 scope global proto unspec offload hw_stats on used on stats: id 4001 packets 5002 packets_hw 5000 id 4002 packets 4999 packets_hw 4999 The point of the patchset is visibility of ECMP balance, and that is influenced by packet headers, not their payload. Correspondingly, we only include packet counters in the statistics, not byte counters. We also decided to model HW statistics as a nexthop group attribute, not an arbitrary nexthop one. The latter would count any traffic going through a given nexthop, regardless of which ECMP group it is in, or any at all. The reason is again that the point of the patchset is ECMP balance visibility, not arbitrary inspection of how busy a particular nexthop is. Implementation of individual-nexthop statistics is certainly possible, and could well follow the general approach we are taking in this patchset. 
For resilient groups, per-bucket statistics could be done in a similar manner as well. This patchset contains the core code. mlxsw support will be sent in a follow-up patch set. This patchset progresses as follows: - Patches #1 and #2 add support for a new next-hop object attribute, NHA_OP_FLAGS. That is meant to carry various op-specific signaling, in particular whether SW- and HW-collected nexthop stats should be part of the get or dump response. The idea is to avoid wasting message space, and time for collection of HW statistics, when the values are not needed. - Patches #3 and #4 add SW-datapath stats and corresponding UAPI. - Patches #5, #6 and #7 add support for HW-datapath stats and UAPI. Individual drivers still need to contribute the appropriate HW-specific support code. v4: - Patch #2: - s/nla_get_bitfield32/nla_get_u32/ in __nh_valid_dump_req(). v3: - Patch #3: - Convert to u64_stats_t - Patch #4: - Give a symbolic name to the set of all valid dump flags for the NHA_OP_FLAGS attribute. - Convert to u64_stats_t - Patch #6: - Use a named constant for the NHA_HW_STATS_ENABLE policy. v2: - Patch #2: - Change OP_FLAGS to u32, enforce through NLA_POLICY_MASK - Patch #3: - Set err on nexthop_create_group() error path - Patch #4: - Use uint to encode NHA_GROUP_STATS_ENTRY_PACKETS - Rename jump target in nla_put_nh_group_stats() to avoid having to rename further in the patchset. - Patch #7: - Use uint to encode NHA_GROUP_STATS_ENTRY_PACKETS_HW - Do not cancel outside of nesting in nla_put_nh_group_stats() ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 3b43f19 + 5072ae0 commit 7cf497e

File tree

3 files changed

+363
-40
lines changed

3 files changed

+363
-40
lines changed

include/net/nexthop.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ struct nh_config {
4747
bool nh_grp_res_has_idle_timer;
4848
bool nh_grp_res_has_unbalanced_timer;
4949

50+
bool nh_hw_stats;
51+
5052
struct nlattr *nh_encap;
5153
u16 nh_encap_type;
5254

@@ -95,8 +97,14 @@ struct nh_res_table {
9597
struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets);
9698
};
9799

100+
/* Packet counter block for a single nexthop group entry. struct nh_grp_entry
 * references it through a __percpu pointer (its "stats" member). Only a
 * packet count is kept; there is no byte counter. "syncp" is the
 * u64_stats_sync object that accompanies the u64_stats_t counter.
 */
struct nh_grp_entry_stats {
101+
u64_stats_t packets;
102+
struct u64_stats_sync syncp;
103+
};
104+
98105
struct nh_grp_entry {
99106
struct nexthop *nh;
107+
struct nh_grp_entry_stats __percpu *stats;
100108
u8 weight;
101109

102110
union {
@@ -114,6 +122,7 @@ struct nh_grp_entry {
114122

115123
struct list_head nh_list;
116124
struct nexthop *nh_parent; /* nexthop of group with this entry */
125+
u64 packets_hw;
117126
};
118127

119128
struct nh_group {
@@ -124,6 +133,7 @@ struct nh_group {
124133
bool resilient;
125134
bool fdb_nh;
126135
bool has_v4;
136+
bool hw_stats;
127137

128138
struct nh_res_table __rcu *res_table;
129139
struct nh_grp_entry nh_entries[] __counted_by(num_nh);
@@ -157,13 +167,15 @@ enum nexthop_event_type {
157167
NEXTHOP_EVENT_REPLACE,
158168
NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
159169
NEXTHOP_EVENT_BUCKET_REPLACE,
170+
NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
160171
};
161172

162173
/* Kinds of payload a nexthop notification can describe.
 * NOTE(review): presumably each value selects the matching pointer in the
 * union inside struct nh_notifier_info (e.g. ..._GRP_HW_STATS for
 * nh_grp_hw_stats) - confirm against the notifier code.
 */
enum nh_notifier_info_type {
163174
NH_NOTIFIER_INFO_TYPE_SINGLE,
164175
NH_NOTIFIER_INFO_TYPE_GRP,
165176
NH_NOTIFIER_INFO_TYPE_RES_TABLE,
166177
NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
178+
NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS,
167179
};
168180

169181
struct nh_notifier_single_info {
@@ -187,6 +199,7 @@ struct nh_notifier_grp_entry_info {
187199
/* Notifier payload describing a nexthop group. "num_nh" is the element count
 * of the trailing "nh_entries" array (see __counted_by).
 * NOTE(review): "hw_stats" presumably tells listeners that HW counters were
 * requested for this group (cf. the hw_stats member of struct nh_group) -
 * confirm where it is filled in.
 */
struct nh_notifier_grp_info {
188200
u16 num_nh;
189201
bool is_fdb;
202+
bool hw_stats;
190203
struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh);
191204
};
192205

@@ -200,9 +213,21 @@ struct nh_notifier_res_bucket_info {
200213

201214
/* Notifier payload describing a resilient group's bucket table.
 * "num_nh_buckets" is the element count of the trailing "nhs" array (see
 * __counted_by).
 * NOTE(review): "hw_stats" presumably carries the same "HW counters
 * requested" signal as in struct nh_notifier_grp_info - confirm.
 */
struct nh_notifier_res_table_info {
202215
u16 num_nh_buckets;
216+
bool hw_stats;
203217
struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
204218
};
205219

220+
/* One element of the "stats" array in struct nh_notifier_grp_hw_stats_info:
 * an id paired with a packet count.
 * NOTE(review): "id" is presumably the nexthop id of the group entry, and
 * "packets" presumably accumulates deltas reported through
 * nh_grp_hw_stats_report_delta() - confirm against the implementation.
 */
struct nh_notifier_grp_hw_stats_entry_info {
221+
u32 id;
222+
u64 packets;
223+
};
224+
225+
/* Notifier payload for group HW statistics. "num_nh" is the element count of
 * the trailing "stats" array (see __counted_by).
 * NOTE(review): "hw_stats_used" presumably records whether any driver
 * actually collects HW stats for the group (cf. the NHA_HW_STATS_USED
 * attribute) - confirm who sets it.
 */
struct nh_notifier_grp_hw_stats_info {
226+
u16 num_nh;
227+
bool hw_stats_used;
228+
struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh);
229+
};
230+
206231
struct nh_notifier_info {
207232
struct net *net;
208233
struct netlink_ext_ack *extack;
@@ -213,6 +238,7 @@ struct nh_notifier_info {
213238
struct nh_notifier_grp_info *nh_grp;
214239
struct nh_notifier_res_table_info *nh_res_table;
215240
struct nh_notifier_res_bucket_info *nh_res_bucket;
241+
struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats;
216242
};
217243
};
218244

@@ -225,6 +251,9 @@ void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
225251
bool offload, bool trap);
226252
/* NOTE(review): presumably lets a caller report per-bucket activity for the
 * resilient group "id", with "activity" a bitmap covering "num_buckets"
 * buckets - the definition is not visible here; confirm before relying on it.
 */
void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
227253
unsigned long *activity);
254+
/* Report a HW packet-count delta into a group HW stats notifier payload.
 * NOTE(review): "nh_idx" presumably indexes info->stats[] and
 * "delta_packets" is presumably added to that element's packet count; this
 * is likely called while handling NEXTHOP_EVENT_HW_STATS_REPORT_DELTA. The
 * definition is not visible here - confirm.
 */
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
255+
unsigned int nh_idx,
256+
u64 delta_packets);
228257

229258
/* caller is holding rcu or rtnl; no reference taken to nexthop */
230259
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);

include/uapi/linux/nexthop.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ enum {
3030

3131
#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
3232

33+
/* Bits for the u32 NHA_OP_FLAGS attribute: ask for SW-collected
 * (DUMP_STATS) and HW-collected (DUMP_HW_STATS) nexthop group statistics to
 * be included in a get or dump response.
 */
#define NHA_OP_FLAG_DUMP_STATS BIT(0)
34+
#define NHA_OP_FLAG_DUMP_HW_STATS BIT(1)
35+
3336
enum {
3437
NHA_UNSPEC,
3538
NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */
@@ -60,6 +63,18 @@ enum {
6063
/* nested; nexthop bucket attributes */
6164
NHA_RES_BUCKET,
6265

66+
/* u32; operation-specific flags */
67+
NHA_OP_FLAGS,
68+
69+
/* nested; nexthop group stats */
70+
NHA_GROUP_STATS,
71+
72+
/* u32; nexthop hardware stats enable */
73+
NHA_HW_STATS_ENABLE,
74+
75+
/* u32; read-only; whether any driver collects HW stats */
76+
NHA_HW_STATS_USED,
77+
6378
__NHA_MAX,
6479
};
6580

@@ -101,4 +116,34 @@ enum {
101116

102117
#define NHA_RES_BUCKET_MAX (__NHA_RES_BUCKET_MAX - 1)
103118

119+
/* Attribute types of the NHA_GROUP_STATS_* family.
 * NOTE(review): presumably these are the attributes carried inside the
 * nested NHA_GROUP_STATS attribute - confirm against the netlink fill code.
 */
enum {
120+
NHA_GROUP_STATS_UNSPEC,
121+
122+
/* nested; nexthop group entry stats */
123+
NHA_GROUP_STATS_ENTRY,
124+
125+
__NHA_GROUP_STATS_MAX,
126+
};
127+
128+
#define NHA_GROUP_STATS_MAX (__NHA_GROUP_STATS_MAX - 1)
129+
130+
/* Attribute types of the NHA_GROUP_STATS_ENTRY_* family: the nexthop id of a
 * group entry plus its packet counters. Only packet counts are exposed; there
 * is no byte counter.
 * NOTE(review): presumably these are the attributes carried inside each
 * nested NHA_GROUP_STATS_ENTRY attribute - confirm against the netlink fill
 * code.
 */
enum {
131+
NHA_GROUP_STATS_ENTRY_UNSPEC,
132+
133+
/* u32; nexthop id of the nexthop group entry */
134+
NHA_GROUP_STATS_ENTRY_ID,
135+
136+
/* uint; number of packets forwarded via the nexthop group entry */
137+
NHA_GROUP_STATS_ENTRY_PACKETS,
138+
139+
/* uint; number of packets forwarded via the nexthop group entry in
140+
* hardware
141+
*/
142+
NHA_GROUP_STATS_ENTRY_PACKETS_HW,
143+
144+
__NHA_GROUP_STATS_ENTRY_MAX,
145+
};
146+
147+
#define NHA_GROUP_STATS_ENTRY_MAX (__NHA_GROUP_STATS_ENTRY_MAX - 1)
148+
104149
#endif

0 commit comments

Comments
 (0)