Skip to content

Commit 0daf07e

Browse files
edumazet and davem330
authored and committed
raw: convert raw sockets to RCU
Using rwlock in networking code is extremely risky. Writers can starve if enough readers are constantly grabbing the rwlock.

I thought rwlocks were at fault and sent this patch: https://lkml.org/lkml/2022/6/17/272

But Peter and Linus essentially told me rwlock had to be unfair.

We need to get rid of rwlock in networking code.

Without this fix, the following script triggers soft lockups:

for i in {1..48}
do
  ping -f -n -q 127.0.0.1 &
  sleep 0.1
done

Fixes: 1da177e ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent ba44f81 commit 0daf07e

File tree

7 files changed

+80
-70
lines changed

7 files changed

+80
-70
lines changed

include/net/raw.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,18 @@ int raw_rcv(struct sock *, struct sk_buff *);
3333

3434
struct raw_hashinfo {
3535
rwlock_t lock;
36-
struct hlist_head ht[RAW_HTABLE_SIZE];
36+
struct hlist_nulls_head ht[RAW_HTABLE_SIZE];
3737
};
3838

39+
static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo)
40+
{
41+
int i;
42+
43+
rwlock_init(&hashinfo->lock);
44+
for (i = 0; i < RAW_HTABLE_SIZE; i++)
45+
INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i);
46+
}
47+
3948
#ifdef CONFIG_PROC_FS
4049
int raw_proc_init(void);
4150
void raw_proc_exit(void);

include/net/rawv6.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
#define _NET_RAWV6_H
44

55
#include <net/protocol.h>
6+
#include <net/raw.h>
67

78
extern struct raw_hashinfo raw_v6_hashinfo;
89
bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,

net/ipv4/af_inet.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1929,6 +1929,8 @@ static int __init inet_init(void)
19291929

19301930
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
19311931

1932+
raw_hashinfo_init(&raw_v4_hashinfo);
1933+
19321934
rc = proto_register(&tcp_prot, 1);
19331935
if (rc)
19341936
goto out;

net/ipv4/raw.c

Lines changed: 38 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -85,20 +85,19 @@ struct raw_frag_vec {
8585
int hlen;
8686
};
8787

88-
struct raw_hashinfo raw_v4_hashinfo = {
89-
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
90-
};
88+
struct raw_hashinfo raw_v4_hashinfo;
9189
EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
9290

9391
int raw_hash_sk(struct sock *sk)
9492
{
9593
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
96-
struct hlist_head *head;
94+
struct hlist_nulls_head *hlist;
9795

98-
head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
96+
hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
9997

10098
write_lock_bh(&h->lock);
101-
sk_add_node(sk, head);
99+
hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist);
100+
sock_set_flag(sk, SOCK_RCU_FREE);
102101
write_unlock_bh(&h->lock);
103102
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
104103

@@ -111,7 +110,7 @@ void raw_unhash_sk(struct sock *sk)
111110
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
112111

113112
write_lock_bh(&h->lock);
114-
if (sk_del_node_init(sk))
113+
if (__sk_nulls_del_node_init_rcu(sk))
115114
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
116115
write_unlock_bh(&h->lock);
117116
}
@@ -164,17 +163,16 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
164163
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
165164
{
166165
struct net *net = dev_net(skb->dev);
166+
struct hlist_nulls_head *hlist;
167+
struct hlist_nulls_node *hnode;
167168
int sdif = inet_sdif(skb);
168169
int dif = inet_iif(skb);
169-
struct hlist_head *head;
170170
int delivered = 0;
171171
struct sock *sk;
172172

173-
head = &raw_v4_hashinfo.ht[hash];
174-
if (hlist_empty(head))
175-
return 0;
176-
read_lock(&raw_v4_hashinfo.lock);
177-
sk_for_each(sk, head) {
173+
hlist = &raw_v4_hashinfo.ht[hash];
174+
rcu_read_lock();
175+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
178176
if (!raw_v4_match(net, sk, iph->protocol,
179177
iph->saddr, iph->daddr, dif, sdif))
180178
continue;
@@ -189,7 +187,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
189187
raw_rcv(sk, clone);
190188
}
191189
}
192-
read_unlock(&raw_v4_hashinfo.lock);
190+
rcu_read_unlock();
193191
return delivered;
194192
}
195193

@@ -265,25 +263,26 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
265263
void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
266264
{
267265
struct net *net = dev_net(skb->dev);;
266+
struct hlist_nulls_head *hlist;
267+
struct hlist_nulls_node *hnode;
268268
int dif = skb->dev->ifindex;
269269
int sdif = inet_sdif(skb);
270-
struct hlist_head *head;
271270
const struct iphdr *iph;
272271
struct sock *sk;
273272
int hash;
274273

275274
hash = protocol & (RAW_HTABLE_SIZE - 1);
276-
head = &raw_v4_hashinfo.ht[hash];
275+
hlist = &raw_v4_hashinfo.ht[hash];
277276

278-
read_lock(&raw_v4_hashinfo.lock);
279-
sk_for_each(sk, head) {
277+
rcu_read_lock();
278+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
280279
iph = (const struct iphdr *)skb->data;
281280
if (!raw_v4_match(net, sk, iph->protocol,
282281
iph->saddr, iph->daddr, dif, sdif))
283282
continue;
284283
raw_err(sk, skb, info);
285284
}
286-
read_unlock(&raw_v4_hashinfo.lock);
285+
rcu_read_unlock();
287286
}
288287

289288
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -944,44 +943,41 @@ struct proto raw_prot = {
944943
};
945944

946945
#ifdef CONFIG_PROC_FS
947-
static struct sock *raw_get_first(struct seq_file *seq)
946+
static struct sock *raw_get_first(struct seq_file *seq, int bucket)
948947
{
949-
struct sock *sk;
950948
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
951949
struct raw_iter_state *state = raw_seq_private(seq);
950+
struct hlist_nulls_head *hlist;
951+
struct hlist_nulls_node *hnode;
952+
struct sock *sk;
952953

953-
for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
954+
for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
954955
++state->bucket) {
955-
sk_for_each(sk, &h->ht[state->bucket])
956+
hlist = &h->ht[state->bucket];
957+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
956958
if (sock_net(sk) == seq_file_net(seq))
957-
goto found;
959+
return sk;
960+
}
958961
}
959-
sk = NULL;
960-
found:
961-
return sk;
962+
return NULL;
962963
}
963964

964965
static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
965966
{
966-
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
967967
struct raw_iter_state *state = raw_seq_private(seq);
968968

969969
do {
970-
sk = sk_next(sk);
971-
try_again:
972-
;
970+
sk = sk_nulls_next(sk);
973971
} while (sk && sock_net(sk) != seq_file_net(seq));
974972

975-
if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
976-
sk = sk_head(&h->ht[state->bucket]);
977-
goto try_again;
978-
}
973+
if (!sk)
974+
return raw_get_first(seq, state->bucket + 1);
979975
return sk;
980976
}
981977

982978
static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
983979
{
984-
struct sock *sk = raw_get_first(seq);
980+
struct sock *sk = raw_get_first(seq, 0);
985981

986982
if (sk)
987983
while (pos && (sk = raw_get_next(seq, sk)) != NULL)
@@ -990,11 +986,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
990986
}
991987

992988
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
993-
__acquires(&h->lock)
989+
__acquires(RCU)
994990
{
995-
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
996-
997-
read_lock(&h->lock);
991+
rcu_read_lock();
998992
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
999993
}
1000994
EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1004,7 +998,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1004998
struct sock *sk;
1005999

10061000
if (v == SEQ_START_TOKEN)
1007-
sk = raw_get_first(seq);
1001+
sk = raw_get_first(seq, 0);
10081002
else
10091003
sk = raw_get_next(seq, v);
10101004
++*pos;
@@ -1013,11 +1007,9 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
10131007
EXPORT_SYMBOL_GPL(raw_seq_next);
10141008

10151009
void raw_seq_stop(struct seq_file *seq, void *v)
1016-
__releases(&h->lock)
1010+
__releases(RCU)
10171011
{
1018-
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
1019-
1020-
read_unlock(&h->lock);
1012+
rcu_read_unlock();
10211013
}
10221014
EXPORT_SYMBOL_GPL(raw_seq_stop);
10231015

@@ -1079,6 +1071,7 @@ static __net_initdata struct pernet_operations raw_net_ops = {
10791071

10801072
int __init raw_proc_init(void)
10811073
{
1074+
10821075
return register_pernet_subsys(&raw_net_ops);
10831076
}
10841077

net/ipv4/raw_diag.c

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,31 +57,32 @@ static bool raw_lookup(struct net *net, struct sock *sk,
5757
static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
5858
{
5959
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
60+
struct hlist_nulls_head *hlist;
61+
struct hlist_nulls_node *hnode;
6062
struct sock *sk;
6163
int slot;
6264

6365
if (IS_ERR(hashinfo))
6466
return ERR_CAST(hashinfo);
6567

66-
read_lock(&hashinfo->lock);
68+
rcu_read_lock();
6769
for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
68-
sk_for_each(sk, &hashinfo->ht[slot]) {
70+
hlist = &hashinfo->ht[slot];
71+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
6972
if (raw_lookup(net, sk, r)) {
7073
/*
7174
* Grab it and keep until we fill
72-
* diag meaage to be reported, so
75+
* diag message to be reported, so
7376
* caller should call sock_put then.
74-
* We can do that because we're keeping
75-
* hashinfo->lock here.
7677
*/
77-
sock_hold(sk);
78-
goto out_unlock;
78+
if (refcount_inc_not_zero(&sk->sk_refcnt))
79+
goto out_unlock;
7980
}
8081
}
8182
}
8283
sk = ERR_PTR(-ENOENT);
8384
out_unlock:
84-
read_unlock(&hashinfo->lock);
85+
rcu_read_unlock();
8586

8687
return sk;
8788
}
@@ -141,6 +142,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
141142
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
142143
struct net *net = sock_net(skb->sk);
143144
struct inet_diag_dump_data *cb_data;
145+
struct hlist_nulls_head *hlist;
146+
struct hlist_nulls_node *hnode;
144147
int num, s_num, slot, s_slot;
145148
struct sock *sk = NULL;
146149
struct nlattr *bc;
@@ -157,7 +160,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
157160
for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
158161
num = 0;
159162

160-
sk_for_each(sk, &hashinfo->ht[slot]) {
163+
hlist = &hashinfo->ht[slot];
164+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
161165
struct inet_sock *inet = inet_sk(sk);
162166

163167
if (!net_eq(sock_net(sk), net))

net/ipv6/af_inet6.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@
6363
#include <net/compat.h>
6464
#include <net/xfrm.h>
6565
#include <net/ioam6.h>
66+
#include <net/rawv6.h>
6667

6768
#include <linux/uaccess.h>
6869
#include <linux/mroute6.h>
@@ -1073,6 +1074,8 @@ static int __init inet6_init(void)
10731074
goto out;
10741075
}
10751076

1077+
raw_hashinfo_init(&raw_v6_hashinfo);
1078+
10761079
err = proto_register(&tcpv6_prot, 1);
10771080
if (err)
10781081
goto out;

net/ipv6/raw.c

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -61,9 +61,7 @@
6161

6262
#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
6363

64-
struct raw_hashinfo raw_v6_hashinfo = {
65-
.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
66-
};
64+
struct raw_hashinfo raw_v6_hashinfo;
6765
EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
6866

6967
bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
@@ -143,9 +141,10 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
143141
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
144142
{
145143
struct net *net = dev_net(skb->dev);
144+
struct hlist_nulls_head *hlist;
145+
struct hlist_nulls_node *hnode;
146146
const struct in6_addr *saddr;
147147
const struct in6_addr *daddr;
148-
struct hlist_head *head;
149148
struct sock *sk;
150149
bool delivered = false;
151150
__u8 hash;
@@ -154,11 +153,9 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
154153
daddr = saddr + 1;
155154

156155
hash = nexthdr & (RAW_HTABLE_SIZE - 1);
157-
head = &raw_v6_hashinfo.ht[hash];
158-
if (hlist_empty(head))
159-
return false;
160-
read_lock(&raw_v6_hashinfo.lock);
161-
sk_for_each(sk, head) {
156+
hlist = &raw_v6_hashinfo.ht[hash];
157+
rcu_read_lock();
158+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
162159
int filtered;
163160

164161
if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
@@ -203,7 +200,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
203200
}
204201
}
205202
}
206-
read_unlock(&raw_v6_hashinfo.lock);
203+
rcu_read_unlock();
207204
return delivered;
208205
}
209206

@@ -337,14 +334,15 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
337334
{
338335
const struct in6_addr *saddr, *daddr;
339336
struct net *net = dev_net(skb->dev);
340-
struct hlist_head *head;
337+
struct hlist_nulls_head *hlist;
338+
struct hlist_nulls_node *hnode;
341339
struct sock *sk;
342340
int hash;
343341

344342
hash = nexthdr & (RAW_HTABLE_SIZE - 1);
345-
head = &raw_v6_hashinfo.ht[hash];
346-
read_lock(&raw_v6_hashinfo.lock);
347-
sk_for_each(sk, head) {
343+
hlist = &raw_v6_hashinfo.ht[hash];
344+
rcu_read_lock();
345+
hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) {
348346
/* Note: ipv6_hdr(skb) != skb->data */
349347
const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
350348
saddr = &ip6h->saddr;
@@ -355,7 +353,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
355353
continue;
356354
rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
357355
}
358-
read_unlock(&raw_v6_hashinfo.lock);
356+
rcu_read_unlock();
359357
}
360358

361359
static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)

0 commit comments

Comments
 (0)