Skip to content

Commit 4fbac77

Browse files
rdna authored and borkmann committed
bpf: Hooks for sys_bind
== The problem == There is a use-case where all processes inside a cgroup should use one single IP address on a host that has multiple IPs configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since that is often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever a TCP/UDP server calls `bind(2)`, the library replaces the IP in sockaddr before passing arguments to the syscall. When an application calls `connect(2)`, the library transparently binds the local end of the connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid a performance penalty). The shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides a much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers to the desired IP. It does not depend on the application environment or implementation details (whether glibc is used or not). It adds a new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to the already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and a user-provided `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of the newly added type. 
The new attach types provide hooks in the `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override the IP addresses and ports a user program tries to bind to and apply such a program to a whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for the corresponding socket family. E.g. if the user passes `sockaddr_in` it doesn't make sense to read from / write to the `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using a special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with the value to write and `dst` with the pointer to the context, which can't be changed so as not to break later instructions. But the fields that are allowed to be written to are not available directly, and to access them the address of the corresponding pointer has to be loaded first. To get an additional register, the 1st one not used by `src` and `dst` is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load the address of the pointer field, and finally the register's content is restored from the temporary field after writing the `src` value. Signed-off-by: Andrey Ignatov <[email protected]> Signed-off-by: Alexei Starovoitov <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]>
1 parent d7be143 commit 4fbac77

File tree

10 files changed

+366
-8
lines changed

10 files changed

+366
-8
lines changed

include/linux/bpf-cgroup.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <uapi/linux/bpf.h>
77

88
struct sock;
9+
struct sockaddr;
910
struct cgroup;
1011
struct sk_buff;
1112
struct bpf_sock_ops_kern;
@@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
6364
int __cgroup_bpf_run_filter_sk(struct sock *sk,
6465
enum bpf_attach_type type);
6566

67+
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
68+
struct sockaddr *uaddr,
69+
enum bpf_attach_type type);
70+
6671
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
6772
struct bpf_sock_ops_kern *sock_ops,
6873
enum bpf_attach_type type);
@@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
103108
__ret; \
104109
})
105110

111+
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \
112+
({ \
113+
int __ret = 0; \
114+
if (cgroup_bpf_enabled) \
115+
__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
116+
__ret; \
117+
})
118+
119+
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
120+
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
121+
122+
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
123+
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
124+
106125
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
107126
({ \
108127
int __ret = 0; \
@@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
135154
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
136155
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
137156
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
157+
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
158+
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
138159
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
139160
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
140161

include/linux/bpf_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
88
BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
99
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
1010
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
11+
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
1112
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
1213
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
1314
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)

include/linux/filter.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void)
10211021
return SKF_AD_MAX;
10221022
}
10231023

1024+
struct bpf_sock_addr_kern {
1025+
struct sock *sk;
1026+
struct sockaddr *uaddr;
1027+
/* Temporary "register" to make indirect stores to nested structures
1028+
* defined above. We need three registers to make such a store, but
1029+
* only two (src and dst) are available at convert_ctx_access time
1030+
*/
1031+
u64 tmp_reg;
1032+
};
1033+
10241034
struct bpf_sock_ops_kern {
10251035
struct sock *sk;
10261036
u32 op;

include/uapi/linux/bpf.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ enum bpf_prog_type {
136136
BPF_PROG_TYPE_CGROUP_DEVICE,
137137
BPF_PROG_TYPE_SK_MSG,
138138
BPF_PROG_TYPE_RAW_TRACEPOINT,
139+
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
139140
};
140141

141142
enum bpf_attach_type {
@@ -147,6 +148,8 @@ enum bpf_attach_type {
147148
BPF_SK_SKB_STREAM_VERDICT,
148149
BPF_CGROUP_DEVICE,
149150
BPF_SK_MSG_VERDICT,
151+
BPF_CGROUP_INET4_BIND,
152+
BPF_CGROUP_INET6_BIND,
150153
__MAX_BPF_ATTACH_TYPE
151154
};
152155

@@ -1010,6 +1013,26 @@ struct bpf_map_info {
10101013
__u64 netns_ino;
10111014
} __attribute__((aligned(8)));
10121015

1016+
/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
1017+
* by user and intended to be used by socket (e.g. to bind to, depends on
1018+
* attach type).
1019+
*/
1020+
struct bpf_sock_addr {
1021+
__u32 user_family; /* Allows 4-byte read, but no write. */
1022+
__u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
1023+
* Stored in network byte order.
1024+
*/
1025+
__u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4-byte write.
1026+
* Stored in network byte order.
1027+
*/
1028+
__u32 user_port; /* Allows 4-byte read and write.
1029+
* Stored in network byte order
1030+
*/
1031+
__u32 family; /* Allows 4-byte read, but no write */
1032+
__u32 type; /* Allows 4-byte read, but no write */
1033+
__u32 protocol; /* Allows 4-byte read, but no write */
1034+
};
1035+
10131036
/* User bpf_sock_ops struct to access socket values and specify request ops
10141037
* and their replies.
10151038
* Some of these fields are in network (bigendian) byte order and may need

kernel/bpf/cgroup.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
494494
}
495495
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
496496

497+
/**
498+
* __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
499+
* a user-provided sockaddr
500+
* @sk: sock struct that will use sockaddr
501+
* @uaddr: sockaddr struct provided by user
502+
* @type: The type of program to be exectuted
503+
*
504+
* socket is expected to be of type INET or INET6.
505+
*
506+
* This function will return %-EPERM if an attached program is found and
507+
* returned value != 1 during execution. In all other cases, 0 is returned.
508+
*/
509+
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
510+
struct sockaddr *uaddr,
511+
enum bpf_attach_type type)
512+
{
513+
struct bpf_sock_addr_kern ctx = {
514+
.sk = sk,
515+
.uaddr = uaddr,
516+
};
517+
struct cgroup *cgrp;
518+
int ret;
519+
520+
/* Check socket family since not all sockets represent network
521+
* endpoint (e.g. AF_UNIX).
522+
*/
523+
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
524+
return 0;
525+
526+
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
527+
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
528+
529+
return ret == 1 ? 0 : -EPERM;
530+
}
531+
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
532+
497533
/**
498534
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
499535
* @sk: socket to get cgroup from

kernel/bpf/syscall.c

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1175,19 +1175,29 @@ static int
11751175
bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
11761176
enum bpf_attach_type expected_attach_type)
11771177
{
1178-
/* There are currently no prog types that require specifying
1179-
* attach_type at load time.
1180-
*/
1181-
return 0;
1178+
switch (prog_type) {
1179+
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1180+
switch (expected_attach_type) {
1181+
case BPF_CGROUP_INET4_BIND:
1182+
case BPF_CGROUP_INET6_BIND:
1183+
return 0;
1184+
default:
1185+
return -EINVAL;
1186+
}
1187+
default:
1188+
return 0;
1189+
}
11821190
}
11831191

11841192
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
11851193
enum bpf_attach_type attach_type)
11861194
{
1187-
/* There are currently no prog types that require specifying
1188-
* attach_type at load time.
1189-
*/
1190-
return 0;
1195+
switch (prog->type) {
1196+
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1197+
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
1198+
default:
1199+
return 0;
1200+
}
11911201
}
11921202

11931203
/* last field in 'union bpf_attr' used by this command */
@@ -1479,6 +1489,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
14791489
case BPF_CGROUP_INET_SOCK_CREATE:
14801490
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
14811491
break;
1492+
case BPF_CGROUP_INET4_BIND:
1493+
case BPF_CGROUP_INET6_BIND:
1494+
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1495+
break;
14821496
case BPF_CGROUP_SOCK_OPS:
14831497
ptype = BPF_PROG_TYPE_SOCK_OPS;
14841498
break;
@@ -1541,6 +1555,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
15411555
case BPF_CGROUP_INET_SOCK_CREATE:
15421556
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
15431557
break;
1558+
case BPF_CGROUP_INET4_BIND:
1559+
case BPF_CGROUP_INET6_BIND:
1560+
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1561+
break;
15441562
case BPF_CGROUP_SOCK_OPS:
15451563
ptype = BPF_PROG_TYPE_SOCK_OPS;
15461564
break;
@@ -1590,6 +1608,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
15901608
case BPF_CGROUP_INET_INGRESS:
15911609
case BPF_CGROUP_INET_EGRESS:
15921610
case BPF_CGROUP_INET_SOCK_CREATE:
1611+
case BPF_CGROUP_INET4_BIND:
1612+
case BPF_CGROUP_INET6_BIND:
15931613
case BPF_CGROUP_SOCK_OPS:
15941614
case BPF_CGROUP_DEVICE:
15951615
break;

kernel/bpf/verifier.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3887,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env)
38873887
switch (env->prog->type) {
38883888
case BPF_PROG_TYPE_CGROUP_SKB:
38893889
case BPF_PROG_TYPE_CGROUP_SOCK:
3890+
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
38903891
case BPF_PROG_TYPE_SOCK_OPS:
38913892
case BPF_PROG_TYPE_CGROUP_DEVICE:
38923893
break;

0 commit comments

Comments
 (0)