sockfilter 是一个用于监控数据包并处理 __sk_buff 结构体的示例。
它会将套接字类型的 BPF 程序附加到 sock_queue_rcv_skb() 函数上,从 BPF_MAP_TYPE_RINGBUF 类型的映射中获取信息,然后在标准输出中打印协议类型、源 IP 地址、源端口、目的 IP 地址和目的端口。
目前,该示例已涵盖了 uapi/linux/in.h 头文件中定义的大部分 IPv4 协议,若需查看支持的协议列表,可参考示例代码 examples/c/sockfilter.c 中的 ipproto_mapping 映射表。
前置知识
运行
$ ./sockfilter
interface: lo protocol: TCP 127.0.0.1:60196(src) -> 127.0.0.1:40705(dst)
interface: lo protocol: TCP 127.0.0.1:40705(src) -> 127.0.0.1:60196(dst)
或者:
$ ./sockfilter -i lo
interface: lo protocol: TCP 127.0.0.1:60196(src) -> 127.0.0.1:40705(dst)
interface: lo protocol: TCP 127.0.0.1:40705(src) -> 127.0.0.1:60196(dst)
struct so_event 该结构体在 BPF 程序和用户态程序中都可以使用。
/*
 * Event record shared by the BPF program and the user-space program.
 * The BPF side fills one record per packet and submits it to the ring
 * buffer; user space reads it back in handle_event().
 */
struct so_event {
	__be32 src_addr;		/* IPv4 source address, big-endian */
	__be32 dst_addr;		/* IPv4 destination address, big-endian */
	union {
		__be32 ports;		/* both ports, filled by a single 4-byte load */
		__be16 port16[2];	/* [0] = source port, [1] = destination port */
	};
	__u32 ip_proto;			/* protocol field of the IPv4 header */
	__u32 pkt_type;			/* copied from skb->pkt_type */
	__u32 ifindex;			/* copied from skb->ifindex */
};
__be32
__be32 表示大端序(即网络字节序)的 32 位整数;相对应地,__le32 表示小端序 32 位整数。
The BPF side 头文件 1 2 3 4 #include "sockfilter.h" #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF
BPF_MAP_TYPE_RINGBUF 定义 BPF_MAP_TYPE_RINGBUF 类型的 BPF map,用于将内核收集到的事件信息传送给用户态程序
1 2 3 4 struct { __uint(type, BPF_MAP_TYPE_RINGBUF); __uint(max_entries, 256 * 1024 ); } rb SEC (".maps" ) ;
ip_is_fragment
1 2 3 4 5 6 7 8 static inline int ip_is_fragment (struct __sk_buff *skb, __u32 nhoff) { __u16 frag_off; bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2 ); frag_off = __bpf_ntohs(frag_off); return frag_off & (IP_MF | IP_OFFSET); }
bpf_skb_load_bytes
1 bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2 );
该辅助函数提供了一种简便的方式从数据报中加载数据。它可用于从 skb 关联的数据包中,将偏移量 nhoff + offsetof(struct iphdr, frag_off) 处开始的 2 字节数据,加载到 &frag_off 所指向的缓冲区中。
自 Linux 4.7 版本开始,该辅助函数的使用场景已基本被 direct packet access 替代,通过 skb->data 和 skb->data_end 即可操作数据包数据。不过,若需要一次性从数据包中读取大量数据到 eBPF 栈中,该辅助函数仍有实用价值。
__bpf_ntohs __bpf_ntohs 定义在 libbpf 的 bpf_endian.h 头文件中,是 BPF 字节序转换的底层实现宏,用于转换运行时才能确定的值。bpf_ntohs 是在其之上封装的公开接口:当参数是编译期常量时使用 __bpf_constant_ntohs,否则使用 __bpf_ntohs,因此更推荐直接使用 bpf_ntohs。
SEC("socket")
将该 eBPF 程序附加到 socket 上,使其在数据包被放入该 socket 的接收队列之前运行。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 SEC("socket" ) int socket_handler (struct __sk_buff *skb) { struct so_event *e ; __u8 verlen; __u16 proto; __u32 nhoff = ETH_HLEN; bpf_skb_load_bytes(skb, 12 , &proto, 2 ); proto = bpf_ntohs(proto); if (proto != ETH_P_IP) return 0 ; if (ip_is_fragment(skb, nhoff)) return 0 ;
bpf_ringbuf_reserve 1 2 3 4 e = bpf_ringbuf_reserve(&rb, sizeof (*e), 0 ); if (!e) return 0 ;
bpf_ringbuf_submit 1 2 3 4 5 6 7 8 9 10 11 12 bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &e->ip_proto, 1 ); if (e->ip_proto != IPPROTO_GRE) { bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4 ); bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4 ); } bpf_skb_load_bytes(skb, nhoff + 0 , &verlen, 1 ); bpf_skb_load_bytes(skb, nhoff + ((verlen & 0xF ) << 2 ), &(e->ports), 4 ); e->pkt_type = skb->pkt_type; e->ifindex = skb->ifindex; bpf_ringbuf_submit(e, 0 );
return
允许该数据包继续传递到 socket。返回值表示保留数据包的前多少字节并传递给 socket:例如返回 skb->len 表示完整保留整个数据包,返回 0 则表示丢弃该数据包。
The user-space side env
/* Run-time settings of the user-space program. */
static struct env {
	const char *interface;	/* name of the network interface to capture on */
} env;
ipproto_mapping
/*
 * Printable names for IP protocol numbers, indexed by IPPROTO_* value.
 * Protocol numbers without an entry are left as NULL.
 */
static const char *ipproto_mapping[IPPROTO_MAX] = {
	[IPPROTO_IP] = "IP",
	[IPPROTO_ICMP] = "ICMP",
	[IPPROTO_IGMP] = "IGMP",
	[IPPROTO_IPIP] = "IPIP",
	[IPPROTO_TCP] = "TCP",
	[IPPROTO_EGP] = "EGP",
	[IPPROTO_PUP] = "PUP",
	[IPPROTO_UDP] = "UDP",
	[IPPROTO_IDP] = "IDP",
	[IPPROTO_TP] = "TP",
	[IPPROTO_DCCP] = "DCCP",
	[IPPROTO_IPV6] = "IPV6",
	[IPPROTO_RSVP] = "RSVP",
	[IPPROTO_GRE] = "GRE",
	[IPPROTO_ESP] = "ESP",
	[IPPROTO_AH] = "AH",
	[IPPROTO_MTP] = "MTP",
	[IPPROTO_BEETPH] = "BEETPH",
	[IPPROTO_ENCAP] = "ENCAP",
	[IPPROTO_PIM] = "PIM",
	[IPPROTO_COMP] = "COMP",
	[IPPROTO_SCTP] = "SCTP",
	[IPPROTO_UDPLITE] = "UDPLITE",
	[IPPROTO_MPLS] = "MPLS",
	[IPPROTO_RAW] = "RAW",
};
/*
 * open_raw_sock - open a raw AF_PACKET socket bound to one network interface.
 * @name: interface name, e.g. "lo"
 *
 * The socket is created non-blocking and close-on-exec, and is bound for
 * every protocol (ETH_P_ALL).
 *
 * Returns the socket file descriptor, or -1 after printing a message to
 * stderr when the interface cannot be resolved or a system call fails.
 */
static int open_raw_sock(const char *name)
{
	struct sockaddr_ll sll;
	unsigned int ifindex;
	int sock;

	/*
	 * Resolve the interface first: if_nametoindex() returns 0 for an
	 * unknown name, and binding with sll_ifindex == 0 would match every
	 * interface instead of failing.
	 */
	ifindex = if_nametoindex(name);
	if (ifindex == 0) {
		fprintf(stderr, "Failed to resolve interface %s: %s\n", name, strerror(errno));
		return -1;
	}

	sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
	if (sock < 0) {
		fprintf(stderr, "Failed to create raw socket\n");
		return -1;
	}

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_ifindex = ifindex;
	sll.sll_protocol = htons(ETH_P_ALL);
	if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		fprintf(stderr, "Failed to bind to %s: %s\n", name, strerror(errno));
		close(sock);
		return -1;
	}

	return sock;
}
sockaddr_ll sockaddr_ll 是 Linux 系统特有的套接字地址结构体,隶属于 AF_PACKET 地址族,专门用于描述数据链路层(二层) 的地址和帧属性,是用户空间程序直接操作链路层数据包(如以太网帧)的核心接口。
handle_event 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 static inline void ltoa (uint32_t addr, char *dst) { snprintf (dst, 16 , "%u.%u.%u.%u" , (addr >> 24 ) & 0xFF , (addr >> 16 ) & 0xFF , (addr >> 8 ) & 0xFF , (addr & 0xFF )); } static int handle_event (void *ctx, void *data, size_t data_sz) { const struct so_event *e = data; char ifname[IF_NAMESIZE]; char sstr[16 ] = {}, dstr[16 ] = {}; if (e->pkt_type != PACKET_HOST) return 0 ; if (e->ip_proto < 0 || e->ip_proto >= IPPROTO_MAX) return 0 ; if (!if_indextoname(e->ifindex, ifname)) return 0 ; ltoa(ntohl(e->src_addr), sstr); ltoa(ntohl(e->dst_addr), dstr); printf ("interface: %s\tprotocol: %s\t%s:%d(src) -> %s:%d(dst)\n" , ifname, ipproto_mapping[e->ip_proto], sstr, ntohs(e->port16[0 ]), dstr, ntohs(e->port16[1 ])); return 0 ; }
open_and_load 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 struct ring_buffer *rb = NULL ;struct sockfilter_bpf *skel ;int err, prog_fd, sock;env.interface = "lo" ; signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); skel = sockfilter_bpf__open_and_load(); if (!skel) { fprintf (stderr , "Failed to open and load BPF skeleton\n" ); return 1 ; }
ring_buffer__new 1 2 3 4 5 6 7 rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL , NULL ); if (!rb) { err = -1 ; fprintf (stderr , "Failed to create ring buffer\n" ); goto cleanup; }
SO_ATTACH_BPF 📌 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 sock = open_raw_sock(env.interface); if (sock < 0 ) { err = -2 ; fprintf (stderr , "Failed to open raw socket\n" ); goto cleanup; } prog_fd = bpf_program__fd(skel->progs.socket_handler); if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof (prog_fd))) { err = -3 ; fprintf (stderr , "Failed to attach to raw socket\n" ); goto cleanup; }
ring_buffer__poll 1 2 3 4 5 6 7 8 9 10 11 12 13 14 while (!exiting) { err = ring_buffer__poll(rb, 100 ); if (err == -EINTR) { err = 0 ; break ; } if (err < 0 ) { fprintf (stderr , "Error polling perf buffer: %d\n" , err); break ; } sleep(1 ); }
cleanup 1 2 3 cleanup: ring_buffer__free(rb); sockfilter_bpf__destroy(skel);