[Original] [eBPF Source Code Analysis] Hook-Point Analysis of the SOCKET_FILTER Call Chain

Posted: 2024-05-11 18:51
Overview
A socket filter, program type BPF_PROG_TYPE_SOCKET_FILTER in BPF, is, as the name suggests, a filter attached to a socket.
This article analyzes how BPF_PROG_TYPE_SOCKET_FILTER programs are implemented, all the way down to the kernel hook functions.
The kernel ships sample code, e.g. samples/bpf/sock_example.c and samples/bpf/sockex1_kern.c.
A socket filter program is usually given a section name of the form SEC("socketXXX").
The code analysis below is based on kernel version 5.15.99.
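To make the later analysis concrete, here is a minimal C-side sketch of such a program, in the spirit of samples/bpf/sockex1_kern.c. It is illustrative rather than verbatim: the map name, function name, and the use of a libbpf-based loader are my assumptions.

```c
// Illustrative sketch of a socket filter program (not taken verbatim from
// the kernel samples). Assumes a libbpf-based loader; map/function names
// are made up.
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 256);
    __type(key, __u32);
    __type(value, __u64);
} proto_count SEC(".maps");

SEC("socket")                     /* loaded as BPF_PROG_TYPE_SOCKET_FILTER */
int count_packets(struct __sk_buff *skb)
{
    __u32 key = 0;
    __u64 *val = bpf_map_lookup_elem(&proto_count, &key);

    if (val)
        __sync_fetch_and_add(val, 1);   /* count every packet seen */

    return skb->len;                    /* keep the whole packet */
}

char _license[] SEC("license") = "GPL";
```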
prog loading
We walk through samples/bpf/sock_example.c here.
Leading comments and the file header:
```c
/* eBPF example program:
 * - creates arraymap in kernel with key 4 bytes and value 8 bytes
 *
 * - loads eBPF program:
 *   r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
 *   *(u32*)(fp - 4) = r0;
 *   // assuming packet is IPv4, lookup ip->proto in a map
 *   value = bpf_map_lookup_elem(map_fd, fp - 4);
 *   if (value)
 *        (*(u64*)value) += 1;
 *
 * - attaches this program to loopback interface "lo" raw socket
 *
 * - every second user space reads map[tcp], map[udp], map[icmp] to see
 *   how many packets of given protocol were seen on "lo"
 */
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <linux/bpf.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <stddef.h>
#include <bpf/bpf.h>
#include "bpf_insn.h"
#include "sock_example.h"
```
The map is created with the bpf_create_map wrapper (a libbpf helper around the BPF_MAP_CREATE syscall command):
```c
int sock = -1, map_fd, prog_fd, i, key;
long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;

map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
                        256, 0);
if (map_fd < 0) {
    printf("failed to create map '%s'\n", strerror(errno));
    goto cleanup;
}
```
The BPF program body itself, defined directly as bytecode:
```c
struct bpf_insn prog[] = {
    BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
    BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
    BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
    BPF_LD_MAP_FD(BPF_REG_1, map_fd),
    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
    BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
    BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
    BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
    BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
    BPF_EXIT_INSN(),
};
size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
```
struct bpf_insn is the low-level bytecode of a BPF program, essentially an abstracted assembly instruction. Higher-level libraries all end up emitting this form, which the kernel then interprets or JIT-compiles into native machine code.
```c
struct bpf_insn {
    __u8    code;        /* opcode */
    __u8    dst_reg:4;   /* dest register */
    __u8    src_reg:4;   /* source register */
    __s16   off;         /* signed offset */
    __s32   imm;         /* signed immediate constant */
};
```
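Each of the BPF_*() macros used in the sample simply fills in these five fields. For instance, BPF_MOV64_REG is defined in samples/bpf/bpf_insn.h roughly as:

```c
/* r<DST> = r<SRC>, encoded as a 64-bit ALU move with register source */
#define BPF_MOV64_REG(DST, SRC)                     \
    ((struct bpf_insn) {                            \
        .code  = BPF_ALU64 | BPF_MOV | BPF_X,       \
        .dst_reg = DST,                             \
        .src_reg = SRC,                             \
        .off   = 0,                                 \
        .imm   = 0 })
```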
Overall, the bytecode above implements the following pseudocode:
```c
r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
*(u32*)(fp - 4) = r0;
// assuming packet is IPv4, lookup ip->proto in a map
value = bpf_map_lookup_elem(map_fd, fp - 4);
if (value)
    (*(u64*)value) += 1;
```
The program is loaded with bpf_load_program, passing BPF_PROG_TYPE_SOCKET_FILTER as the program type:
```c
prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt,
                           "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
if (prog_fd < 0) {
    printf("failed to load prog '%s'\n", strerror(errno));
    goto cleanup;
}
```
open_raw_sock creates a raw socket, and setsockopt with the SO_ATTACH_BPF option attaches the BPF program to that socket:
```c
sock = open_raw_sock("lo");

if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
               sizeof(prog_fd)) < 0) {
    printf("setsockopt %s\n", strerror(errno));
    goto cleanup;
}
```
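open_raw_sock is not a kernel function but a small helper from the sample headers. A simplified sketch of what it does (modeled on samples/bpf/sock_example.h, error handling trimmed) is below; note that because it opens an AF_PACKET socket, the filter attached here is actually exercised through the packet_rcv/run_filter path described later in the SOCKET_PACKET section.

```c
// Sketch of open_raw_sock: an AF_PACKET raw socket bound to one interface.
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int open_raw_sock(const char *name)
{
    struct sockaddr_ll sll;
    int sock;

    sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC,
                  htons(ETH_P_ALL));
    if (sock < 0)
        return -1;

    memset(&sll, 0, sizeof(sll));
    sll.sll_family = AF_PACKET;
    sll.sll_ifindex = if_nametoindex(name);   /* e.g. "lo" */
    sll.sll_protocol = htons(ETH_P_ALL);
    if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
        close(sock);
        return -1;
    }

    return sock;
}
```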
Hook point tracing
Next we focus on how setsockopt reaches the socket filter hook point.
Searching the setsockopt implementation for the SO_ATTACH_BPF option, the handling logic is found at net/core/sock.c:1169 in 5.15.99:
```c
case SO_ATTACH_BPF:
    ret = -EINVAL;
    if (optlen == sizeof(u32)) {
        u32 ufd;

        ret = -EFAULT;
        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
            break;

        ret = sk_attach_bpf(ufd, sk);
    }
    break;
```
Jump to the sk_attach_bpf function (net/core/filter.c:1571):
```c
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
    struct bpf_prog *prog = __get_bpf(ufd, sk);
    int err;

    if (IS_ERR(prog))
        return PTR_ERR(prog);

    err = __sk_attach_prog(prog, sk);
    if (err < 0) {
        bpf_prog_put(prog);
        return err;
    }

    return 0;
}
```
Setting aside the BPF bookkeeping (operations on the prog object itself), follow __sk_attach_prog:
```c
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
    // Allocate the socket filter object:
    // struct sk_filter {
    //     refcount_t      refcnt;
    //     struct rcu_head rcu;
    //     struct bpf_prog *prog;
    // };
    struct sk_filter *fp, *old_fp;

    fp = kmalloc(sizeof(*fp), GFP_KERNEL);
    if (!fp)
        return -ENOMEM;

    fp->prog = prog;

    // Charge the new sk_filter against the socket; on failure free fp
    if (!__sk_filter_charge(sk, fp)) {
        kfree(fp);
        return -ENOMEM;
    }
    refcount_set(&fp->refcnt, 1);

    // Fetch the previously attached socket filter (if any)
    old_fp = rcu_dereference_protected(sk->sk_filter,
                                       lockdep_sock_is_held(sk));
    // Point sk->sk_filter at the newly allocated fp
    rcu_assign_pointer(sk->sk_filter, fp);

    // If an old prog was displaced, release its memory/charge
    if (old_fp)
        sk_filter_uncharge(sk, old_fp);

    return 0;
}
```
After the steps annotated in the code above, the prog object ends up reachable as sk->sk_filter->prog.
sk_filter
Search for sk_filter to find its callers. It shows up in many places; the prototype lives in include/linux/filter.h:
```c
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);

static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
    return sk_filter_trim_cap(sk, skb, 1);
}
```
sk_filter is the wrapped entry point for running sk->sk_filter; other code either dereferences sk->sk_filter directly or calls sk_filter_trim_cap itself to run SOCKET_FILTER programs.
sk_filter_trim_cap
Follow sk_filter_trim_cap in net/core/filter.c:
```c
/**
 *    sk_filter_trim_cap - run a packet through a socket filter
 *    @sk: sock associated with &sk_buff
 *    @skb: buffer to filter
 *    @cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to correct size returned by
 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
 * than pkt_len we keep whole skb->data. This is the socket level
 * wrapper to bpf_prog_run. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
    int err;
    struct sk_filter *filter;

    /*
     * If the skb was allocated from pfmemalloc reserves, only
     * allow SOCK_MEMALLOC sockets to use it as this socket is
     * helping free memory
     */
    // Check whether the skb was allocated with the PF_MEMALLOC flag
    if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
        NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
        return -ENOMEM;
    }
    err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
    if (err)
        return err;

    // LSM framework hook point
    err = security_sock_rcv_skb(sk, skb);
    if (err)
        return err;

    rcu_read_lock();
    filter = rcu_dereference(sk->sk_filter);
    if (filter) {
        struct sock *save_sk = skb->sk;
        unsigned int pkt_len;

        skb->sk = sk;
        pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
        skb->sk = save_sk;
        err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
    }
    rcu_read_unlock();

    return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);
```
Meaning of PF_MEMALLOC: the current task has plenty of memory it can free, so granting it a little emergency memory lets it return even more memory to the system. Subsystems other than memory management should not use this flag unless the allocation is guaranteed to free more memory than it consumes; if every subsystem abused it, the memory-management reserves would be exhausted.
The function first checks whether the skb was allocated from pfmemalloc reserves. If so, only sockets with the SOCK_MEMALLOC flag may consume it; otherwise it returns -ENOMEM and bumps the LINUX_MIB_PFMEMALLOCDROP counter. This prevents non-critical sockets from consuming the limited emergency reserves under memory pressure.
Next, BPF_CGROUP_RUN_PROG_INET_INGRESS() runs the eBPF programs attached to the cgroup ingress hook; any error is returned as-is. This is how cgroup-level network isolation and policing is implemented.
If CGROUP_BPF is enabled and a program is attached at CGROUP_INET_INGRESS, __cgroup_bpf_run_filter_skb is called to run the cgroup filter program; otherwise the macro evaluates to 0 and execution continues.
The cgroup details are left for a later discussion.
```c
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                \
({                                                                \
    int __ret = 0;                                                \
    if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))                  \
        __ret = __cgroup_bpf_run_filter_skb(sk, skb,              \
                                            CGROUP_INET_INGRESS); \
                                                                  \
    __ret;                                                        \
})
```
security_sock_rcv_skb is the LSM hook that checks whether the socket is allowed to receive this skb.
After that, an RCU read lock is taken to protect against concurrent replacement of sk_filter, and the sk_filter pointer is fetched from the sock. With the filter in hand, skb->sk is temporarily pointed at the current socket, bpf_prog_run_save_cb runs the BPF program, and skb->sk is restored afterwards:
```c
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
    struct sock *save_sk = skb->sk;
    unsigned int pkt_len;

    skb->sk = sk;
    pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
    skb->sk = save_sk;
    err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
```
If the returned length is non-zero, pskb_trim trims skb's data to the larger of cap and the returned length (propagating any trim error); if the returned length is 0, err is set to -EPERM, meaning the packet is dropped.
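In other words, the program's return value decides the packet's fate. A hypothetical filter illustrating the drop/trim/keep outcomes might look like the sketch below, assuming IPv4 over Ethernet and a libbpf-style loader (program and constants are illustrative, not from the kernel samples):

```c
// Illustrative socket filter: drop non-UDP traffic, truncate UDP packets.
#include <stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int trim_non_udp(struct __sk_buff *skb)
{
    __u8 proto;

    /* read ip->protocol from the packet data */
    if (bpf_skb_load_bytes(skb, ETH_HLEN + offsetof(struct iphdr, protocol),
                           &proto, sizeof(proto)) < 0)
        return 0;               /* unreadable: drop (-EPERM to the caller) */

    if (proto != IPPROTO_UDP)
        return 0;               /* drop everything that is not UDP */

    /* keep only the first 64 bytes of each UDP packet; the kernel side
     * does pskb_trim(skb, max(cap, 64)) with this return value */
    return 64;
}

char _license[] SEC("license") = "GPL";
```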
bpf_prog_run_save_cb
A quick look at how the BPF program itself is executed; the deeper details belong in a separate analysis of the BPF core.
```c
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
    u32 res;

    migrate_disable();
    res = __bpf_prog_run_save_cb(prog, skb);
    migrate_enable();
    return res;
}

/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         const void *ctx)
{
    const struct sk_buff *skb = ctx;
    u8 *cb_data = bpf_skb_cb(skb);
    u8 cb_saved[BPF_SKB_CB_LEN];
    u32 res;

    if (unlikely(prog->cb_access)) {
        memcpy(cb_saved, cb_data, sizeof(cb_saved));
        memset(cb_data, 0, sizeof(cb_saved));
    }

    res = bpf_prog_run(prog, skb);

    if (unlikely(prog->cb_access))
        memcpy(cb_data, cb_saved, sizeof(cb_saved));

    return res;
}

static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
{
    /* eBPF programs may read/write skb->cb[] area to transfer meta
     * data between tail calls. Since this also needs to work with
     * tc, that scratch memory is mapped to qdisc_skb_cb's data area.
     *
     * In some socket filter cases, the cb unfortunately needs to be
     * saved/restored so that protocol specific skb->cb[] data won't
     * be lost. In any case, due to unpriviledged eBPF programs
     * attached to sockets, we need to clear the bpf_skb_cb() area
     * to not leak previous contents to user space.
     */
    BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
    BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                 sizeof_field(struct qdisc_skb_cb, data));

    return qdisc_skb_cb(skb)->data;
}

static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog,
                                        const void *ctx)
{
    return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
}

static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
                                          const void *ctx,
                                          bpf_dispatcher_fn dfunc)
{
    u32 ret;

    cant_migrate();
    if (static_branch_unlikely(&bpf_stats_enabled_key)) {
        struct bpf_prog_stats *stats;
        u64 start = sched_clock();
        unsigned long flags;

        ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
        stats = this_cpu_ptr(prog->stats);
        flags = u64_stats_update_begin_irqsave(&stats->syncp);
        u64_stats_inc(&stats->cnt);
        u64_stats_add(&stats->nsecs, sched_clock() - start);
        u64_stats_update_end_irqrestore(&stats->syncp, flags);
    } else {
        ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
    }
    return ret;
}
```
Call chain analysis
By searching for sk->sk_filter, sk_filter, and sk_filter_trim_cap we can map out where filter programs are invoked.
SOCKET_RAW
Looking up references and walking the call chain backwards, one caller of sk_filter is sock_queue_rcv_skb (net/core/sock.c):
```c
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    int err;

    err = sk_filter(sk, skb);
    if (err)
        return err;

    return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
```
The sk_filter call here is the hook point.
sock_queue_rcv_skb has a great many callers, most of them per-protocol glue, e.g. J1939, which turns out to be the automotive CAN bus protocol. Here we look at net/ieee802154/socket.c, where both dgram_rcv_skb (datagram sockets) and raw_rcv_skb call sock_queue_rcv_skb:
```c
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (!skb)
        return NET_RX_DROP;

    if (sock_queue_rcv_skb(sk, skb) < 0) {
        kfree_skb(skb);
        return NET_RX_DROP;
    }

    return NET_RX_SUCCESS;
}
```
The IPv4 raw_rcv_skb is much the same; it is entered from raw_rcv, which we look at next.
```c
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    /* Charge it to the socket. */

    ipv4_pktinfo_prepare(sk, skb);
    if (sock_queue_rcv_skb(sk, skb) < 0) {
        kfree_skb(skb);
        return NET_RX_DROP;
    }

    return NET_RX_SUCCESS;
}
```
Move up to raw_rcv in net/ipv4/raw.c:
```c
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
    // Security policy check
    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
        atomic_inc(&sk->sk_drops);
        kfree_skb(skb);
        return NET_RX_DROP;
    }
    // Netfilter-related step: reset conntrack info
    nf_reset_ct(skb);

    skb_push(skb, skb->data - skb_network_header(skb));

    raw_rcv_skb(sk, skb);
    return 0;
}
```
One level up is raw_v4_input, which handles RX-side socket dispatch for raw sockets. SOCKET_RAW allows several sockets to receive the same packet simultaneously:
```c
/* IP input processing comes here for RAW socket delivery.
 * Caller owns SKB, so we must make clones.
 *
 * RFC 1122: SHOULD pass TOS value up to the transport layer.
 * -> It does. And not only TOS, but all IP header.
 */
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph,
                        int hash)
{
    ......
    // Find a matching socket for the receiving network device
    net = dev_net(skb->dev);
    sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
                         iph->saddr, iph->daddr, dif, sdif);

    while (sk) {
        delivered = 1;
        if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
            ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
                           skb->dev->ifindex, sdif)) {
            // Clone so each socket gets its own copy of the packet
            struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

            /* Not releasing hash table! */
            if (clone)
                raw_rcv(sk, clone);
        }
        sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
                             iph->saddr, iph->daddr,
                             dif, sdif);
    }
out:
    read_unlock(&raw_v4_hashinfo.lock);
    return delivered;
}
```
Further up is raw_local_deliver:
```c
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
    int hash;
    struct sock *raw_sk;

    // Hash by protocol and look up the socket list in raw_v4_hashinfo
    hash = protocol & (RAW_HTABLE_SIZE - 1);
    raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);

    /* If there maybe a raw socket we must check - if not we
     * don't care less
     */
    if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
        raw_sk = NULL;

    return raw_sk != NULL;
}
```
Continuing upwards: ip_local_deliver -> ip_local_deliver_finish -> ip_protocol_deliver_rcu -> raw_local_deliver. This is the entry point where the network layer hands packets up to the transport layer: ip_local_deliver is responsible for delivery to upper-layer protocols, and because SOCKET_RAW bypasses the transport layer, the filter check sits here. For details see the networking article:
[Linux Kernel Source Analysis] The Network Subsystem
SOCKET_STREAM
tcp_filter in net/ipv4/tcp_ipv4.c calls sk_filter_trim_cap, passing th->doff * 4 (the TCP header length) as the trim cap, so the filter can never trim away the TCP header itself:
```c
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = (struct tcphdr *)skb->data;

    return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);
```
Move up to tcp_v4_rcv, the AF_INET TCP receive handler. tcp_filter is called both in the TCP_NEW_SYN_RECV handling and in the main path of the function.
tcp_v4_rcv handles sockets in the TCP_NEW_SYN_RECV state; if the connection checks succeed, a new control block is created to handle the connection, and that new control block starts out in the TCP_SYN_RECV state.
```c
/*
 *    From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);
    struct sk_buff *skb_to_free;
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;
    ......
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);
    // Look up the socket that is waiting for this TCP packet
lookup:
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                           th->dest, sdif, &refcounted);
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    // TCP_NEW_SYN_RECV handling
    if (sk->sk_state == TCP_NEW_SYN_RECV) {
        .........
        if (!tcp_filter(sk, skb)) {
            th = (const struct tcphdr *)skb->data;
            iph = ip_hdr(skb);
            tcp_v4_fill_cb(skb, iph, th);
            nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
        }
        .........
    }
    .........
    if (tcp_filter(sk, skb)) {
        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
        goto discard_and_relse;
    }
    ......
    if (!sock_owned_by_user(sk)) {
        skb_to_free = sk->sk_rx_skb_cache;
        sk->sk_rx_skb_cache = NULL;
        ret = tcp_v4_do_rcv(sk, skb);
    } else {
        if (tcp_add_backlog(sk, skb))
            goto discard_and_relse;
        skb_to_free = NULL;
    }
    .........
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                &tcp_hashinfo, skb,
                                                __tcp_hdrlen(th),
                                                iph->saddr, th->source,
                                                iph->daddr, th->dest,
                                                inet_iif(skb),
                                                sdif);
        if (sk2) {
            inet_twsk_deschedule_put(inet_twsk(sk));
            sk = sk2;
            tcp_v4_restore_cb(skb);
            refcounted = false;
            goto process;
        }
    }
        /* to ACK */
        fallthrough;
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        tcp_v4_send_reset(sk, skb);
        inet_twsk_deschedule_put(inet_twsk(sk));
        goto discard_it;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
```
SOCKET_DGRAM
Searching for the sk_filter helpers turns up udp_queue_rcv_one_skb, which is called from inside udp_queue_rcv_skb:
```c
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    struct sk_buff *next, *segs;
    int ret;

    if (likely(!udp_unexpected_gso(sk, skb)))
        return udp_queue_rcv_one_skb(sk, skb);
    ......
}
```
Walking the callers back up to udp_rcv shows that the check lives inside the UDP receive path. Going forward from udp_rcv:
```c
int udp_rcv(struct sk_buff *skb)
{
    return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
```
Into __udp4_lib_rcv:
```c
/*
 *    All we need to do is get the socket, and then do a checksum.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
{
    struct sock *sk;
    struct udphdr *uh;
    struct rtable *rt = skb_rtable(skb);
    __be32 saddr, daddr;
    struct net *net = dev_net(skb->dev);
    ......
    sk = skb_steal_sock(skb, &refcounted);
    if (sk) {
        struct dst_entry *dst = skb_dst(skb);
        int ret;

        if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
            udp_sk_rx_dst_set(sk, dst);

        ret = udp_unicast_rcv_skb(sk, skb, uh);
        if (refcounted)
            sock_put(sk);
        return ret;
    }

    if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
        return __udp4_lib_mcast_deliver(net, skb, uh,
                                        saddr, daddr, udptable, proto);

    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
    if (sk)
        return udp_unicast_rcv_skb(sk, skb, uh);
    ......
}
```
Both udp_unicast_rcv_skb and __udp4_lib_mcast_deliver call udp_queue_rcv_skb, which in turn calls udp_queue_rcv_one_skb, and it is udp_queue_rcv_one_skb that finally calls sk_filter_trim_cap:
```c
/* returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
    struct udp_sock *up = udp_sk(sk);
    int is_udplite = IS_UDPLITE(sk);

    /*
     *    Charge it to the socket, dropping if the queue is full.
     */
    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
        goto drop;
    nf_reset_ct(skb);
    ......
    if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
        goto drop;
    ......
}
```
Call chain: udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
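Since this chain is for ordinary AF_INET UDP sockets, the prog_fd from the earlier sample could just as well be attached to a bound UDP socket instead of a packet socket; a hypothetical helper (names and structure invented for illustration) might look like this:

```c
// Hypothetical sketch: attach a loaded socket filter to a plain UDP socket,
// so it runs on the SOCKET_DGRAM path (udp_queue_rcv_one_skb ->
// sk_filter_trim_cap) rather than the AF_PACKET path.
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int attach_to_udp_socket(int prog_fd, unsigned short port)
{
    struct sockaddr_in addr;
    int sock = socket(AF_INET, SOCK_DGRAM, 0);

    if (sock < 0)
        return -1;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);
    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
                   sizeof(prog_fd)) < 0) {
        close(sock);
        return -1;
    }

    return sock;
}
```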
Other protocols
A few more kernel functions call the sk_filter helpers, but they belong to generic sock handling (__sk_receive_skb) used by other protocols such as DCCP, PPPoE, and L2TP; they are not analyzed here.
Socket destruction logic
The point of interest is __sk_destruct (net/core/sock.c), which checks whether an sk_filter is still attached and, if so, calls sk_filter_uncharge to release the allocated memory:
```c
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
    struct sock *sk = container_of(head, struct sock, sk_rcu);
    struct sk_filter *filter;

    if (sk->sk_destruct)
        sk->sk_destruct(sk);

    filter = rcu_dereference_check(sk->sk_filter,
                                   refcount_read(&sk->sk_wmem_alloc) == 0);
    if (filter) {
        sk_filter_uncharge(sk, filter);
        RCU_INIT_POINTER(sk->sk_filter, NULL);
    }
    ......
#ifdef CONFIG_BPF_SYSCALL
    bpf_sk_storage_free(sk);
#endif
    ......
    sk_prot_free(sk->sk_prot_creator, sk);
}
```
It is reached from sk_destruct:
```c
void sk_destruct(struct sock *sk)
{
    bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

    if (rcu_access_pointer(sk->sk_reuseport_cb)) {
        reuseport_detach_sock(sk);
        use_call_rcu = true;
    }

    if (use_call_rcu)
        call_rcu(&sk->sk_rcu, __sk_destruct);
    else
        __sk_destruct(&sk->sk_rcu);
}
```
which is called from __sk_free:
```c
static void __sk_free(struct sock *sk)
{
    if (likely(sk->sk_net_refcnt))
        sock_inuse_add(sock_net(sk), -1);

    if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
        sock_diag_broadcast_destroy(sk);
    else
        sk_destruct(sk);
}
```
which in turn is called from sk_free:
```c
void sk_free(struct sock *sk)
{
    /*
     * We subtract one from sk_wmem_alloc and can know if
     * some packets are still in some tx queue.
     * If not null, sock_wfree() will call __sk_free(sk) later
     */
    if (refcount_dec_and_test(&sk->sk_wmem_alloc))
        __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
```
sk_free is the kernel's function for releasing socket objects; socket objects are allocated with sk_alloc. Below is an excerpt from tipc_sk_create (net/tipc/socket.c): the socket object is created with sk_alloc, and if later initialization fails, sk_free releases it.
```c
/* Allocate socket's protocol area */
// sk_alloc - all socket objects are allocated here
sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
if (sk == NULL)
    return -ENOMEM;

tsk = tipc_sk(sk);
tsk->max_pkt = MAX_PKT_DEFAULT;
tsk->maxnagle = 0;
tsk->nagle_start = NAGLE_START_INIT;
INIT_LIST_HEAD(&tsk->publications);
INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;

/* Finish initializing socket data structures */
sock->ops = ops;
sock_init_data(sock, sk);
tipc_set_sk_state(sk, TIPC_OPEN);
if (tipc_sk_insert(tsk)) {
    sk_free(sk);
    pr_warn("Socket create failed; port number exhausted\n");
    return -EINVAL;
}
```
Destruction call chain: sk_free->__sk_free->sk_destruct->__sk_destruct
SOCKET_PACKET
run_filter in net/packet/af_packet.c fetches sk->sk_filter->prog and runs the BPF program:
```c
static unsigned int run_filter(struct sk_buff *skb,
                               const struct sock *sk,
                               unsigned int res)
{
    struct sk_filter *filter;

    rcu_read_lock();
    filter = rcu_dereference(sk->sk_filter);
    if (filter != NULL)
        res = bpf_prog_run_clear_cb(filter->prog, skb);
    rcu_read_unlock();

    return res;
}
```
It is called from packet_rcv:
```c
/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
    struct sock *sk;
    struct sockaddr_ll *sll;
    struct packet_sock *po;
    u8 *skb_head = skb->data;
    int skb_len = skb->len;
    unsigned int snaplen, res;
    bool is_drop_n_account = false;

    if (skb->pkt_type == PACKET_LOOPBACK)
        goto drop;

    sk = pt->af_packet_priv;
    po = pkt_sk(sk);

    if (!net_eq(dev_net(dev), sock_net(sk)))
        goto drop;

    skb->dev = dev;
    ......
    res = run_filter(skb, sk, snaplen);
    ......
}
```
Next, packet_create: this is the PF_PACKET family's socket creation function. It allocates the packet_sock and installs the packet_rcv pointer as the protocol handler. When a socket is created, the kernel dispatches to the create function registered for the address family (for AF_INET that is inet_create, which consults the inetsw array); for PF_PACKET the registered create function is this packet_create.
```c
/*
 *    Create a packet of type SOCK_PACKET.
 */
static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
    struct sock *sk;
    struct packet_sock *po;
    __be16 proto = (__force __be16)protocol; /* weird, but documented */
    int err;
    ......
    po->prot_hook.func = packet_rcv;

    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;

    po->prot_hook.af_packet_priv = sk;
    po->prot_hook.af_packet_net = sock_net(sk);

    if (proto) {
        po->prot_hook.type = proto;
        __register_prot_hook(sk);
    }
    ......
}
```
The packet_sock structure:
```c
struct packet_sock {
    /* struct sock has to be the first member of packet_sock */
    struct sock                 sk;
    struct packet_fanout        *fanout;
    union  tpacket_stats_u      stats;
    struct packet_ring_buffer   rx_ring;
    struct packet_ring_buffer   tx_ring;
    int                         copy_thresh;
    spinlock_t                  bind_lock;
    struct mutex                pg_vec_lock;
    unsigned int                running;     /* bind_lock must be held */
    unsigned int                auxdata:1,   /* writer must hold sock lock */
                                origdev:1,
                                has_vnet_hdr:1,
                                tp_loss:1,
                                tp_tx_has_off:1;
    int                         pressure;
    int                         ifindex;     /* bound device */
    __be16                      num;
    struct packet_rollover      *rollover;
    struct packet_mclist        *mclist;
    atomic_t                    mapped;
    enum tpacket_versions       tp_version;
    unsigned int                tp_hdrlen;
    unsigned int                tp_reserve;
    unsigned int                tp_tstamp;
    struct completion           skb_completion;
    struct net_device __rcu     *cached_dev;
    int                         (*xmit)(struct sk_buff *skb);
    struct packet_type          prot_hook ____cacheline_aligned_in_smp;
    atomic_t                    tp_drops ____cacheline_aligned_in_smp;
};
```
Summary
A BPF program of type BPF_PROG_TYPE_SOCKET_FILTER is attached with setsockopt, and the hook function is net/core/filter.c:sk_filter_trim_cap.
Call chain summary
- SOCKET_RAW: ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->raw_local_deliver->raw_v4_input->raw_rcv->raw_rcv_skb
- SOCKET_STREAM: tcp_v4_rcv->tcp_filter
- SOCKET_DGRAM: udp_rcv->__udp4_lib_rcv->udp_unicast_rcv_skb/__udp4_lib_mcast_deliver->udp_queue_rcv_skb->udp_queue_rcv_one_skb->sk_filter_trim_cap
- Socket destruction: sk_free->__sk_free->sk_destruct->__sk_destruct
- SOCKET_PACKET: packet_create->packet_rcv->run_filter