sockfilter 是一个用于监控数据包并处理 __sk_buff 结构体的示例。

它会将套接字类型的 BPF 程序附加到 sock_queue_rcv_skb() 函数上,从 BPF_MAP_TYPE_RINGBUF 类型的映射中获取信息,然后在标准输出中打印协议类型、源 IP 地址、源端口、目的 IP 地址和目的端口。

目前,该示例已涵盖了 uapi/linux/in.h 头文件中定义的大部分 IPv4 协议,若需查看支持的协议列表,可参考示例代码 examples/c/sockfilter.c 中的 ipproto_mapping 映射表。

前置知识

  • socket
  • ringbuf

运行

1
2
3
$ ./sockfilter
interface: lo protocol: TCP 127.0.0.1:60196(src) -> 127.0.0.1:40705(dst)
interface: lo protocol: TCP 127.0.0.1:40705(src) -> 127.0.0.1:60196(dst)

或者:

1
2
3
$ ./sockfilter -i lo
interface: lo protocol: TCP 127.0.0.1:60196(src) -> 127.0.0.1:40705(dst)
interface: lo protocol: TCP 127.0.0.1:40705(src) -> 127.0.0.1:60196(dst)

struct so_event

该结构体在 BPF 程序和用户态程序中都可以使用。

1
2
3
4
5
6
7
8
9
10
11
/*
 * Event record describing one observed IPv4 packet. It is filled in by the
 * BPF program and read back by the user-space program.
 */
struct so_event {
/* IPv4 source address, big-endian as declared by __be32 */
__be32 src_addr;
/* IPv4 destination address, big-endian */
__be32 dst_addr;
/* The same 4 bytes viewed either as one 32-bit word or as two 16-bit values */
union {
__be32 ports;
/* port16[0] is printed as the source port, port16[1] as the destination port */
__be16 port16[2];
};
/* IP protocol number; user space uses it as an index into ipproto_mapping */
__u32 ip_proto;
/* Copied from skb->pkt_type; user space only reports PACKET_HOST events */
__u32 pkt_type;
/* Copied from skb->ifindex; user space resolves it with if_indextoname() */
__u32 ifindex;
};

__be32

  • be: 大端序

  • 32:32位

相对应的,__le32 表示小端序 32 位整数。

The BPF side

头文件

1
2
3
4
#include "sockfilter.h"

#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF

BPF_MAP_TYPE_RINGBUF

定义 BPF_MAP_TYPE_RINGBUF 类型的 BPF map,用于将内核收集到的事件信息传送给用户态程序

1
2
3
4
/*
 * Ring buffer map through which the BPF program hands so_event records to
 * user space. max_entries is 256 * 1024 (for a ring buffer map this is
 * presumably the buffer size in bytes — confirm against the kernel docs).
 */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024);
} rb SEC(".maps");

ip_is_fragment

  • inline
  • 判断是否为 IP 分片包
1
2
3
4
5
6
7
8
/*
 * Tell whether the IPv4 header located at offset nhoff inside skb belongs to
 * a fragmented datagram.
 *
 * Returns non-zero when the "more fragments" flag is set or the fragment
 * offset is non-zero in the header's frag_off field, and 0 otherwise.
 */
static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff)
{
	const __u32 field_off = nhoff + offsetof(struct iphdr, frag_off);
	__u16 raw;

	/* frag_off is read straight from the packet, i.e. in network byte order */
	bpf_skb_load_bytes(skb, field_off, &raw, sizeof(raw));

	return __bpf_ntohs(raw) & (IP_MF | IP_OFFSET);
}

bpf_skb_load_bytes

  • Helper function
1
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);

该辅助函数提供了一种简便的方式从数据报中加载数据。它可用于从 skb 关联的数据包中,将偏移量 nhoff + offsetof(struct iphdr, frag_off) 处开始的 2 字节数据,加载到 &frag_off 所指向的缓冲区中。

自 Linux 4.7 版本开始,该辅助函数的使用场景已基本被 direct packet access 替代,即通过 skb->data 和 skb->data_end 这两个指针直接操作数据包数据。不过,若需要一次性从数据包中读取大量数据到 eBPF 栈中,该辅助函数仍有实用价值。

__bpf_ntohs

__bpf_ntohs 是 libbpf 头文件 bpf_endian.h 中定义的底层字节序转换宏,并非只存在于内核 BPF 子系统内部;bpf_ntohs 是在其之上封装的公开接口,会根据参数是否为编译期常量自动选择合适的实现,因此更推荐使用 bpf_ntohs。

SEC("socket")

  • 将该 eBPF 程序附加到 socket 上,在 socket 收到数据包之前执行
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Entry point of the socket-filter BPF program; skb is the packet being examined. */
SEC("socket")
int socket_handler(struct __sk_buff *skb)
{
struct so_event *e;
__u8 verlen;
__u16 proto;
/* Offset of the network-layer header: immediately after the Ethernet header */
__u32 nhoff = ETH_HLEN;

/* Bytes 12-13 of the frame are read as the EtherType, stored in network byte order */
bpf_skb_load_bytes(skb, 12, &proto, 2);
proto = bpf_ntohs(proto);
/* Only IPv4 frames are reported; everything else ends here with 0 */
if (proto != ETH_P_IP)
return 0;

/* IP fragments are not reported either */
if (ip_is_fragment(skb, nhoff))
return 0;

bpf_ringbuf_reserve

1
2
3
4
/* reserve sample from BPF ringbuf */
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
/* Reservation failed: give up on this packet without reporting it */
if (!e)
return 0;

bpf_ringbuf_submit

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Copy the 1-byte IPv4 protocol number into the event.
 * NOTE(review): only 1 byte of the 4-byte ip_proto field is written here; the
 * other 3 bytes keep whatever the reserved sample already contained — confirm
 * whether the field should be zeroed first.
 */
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &e->ip_proto, 1);

/* NOTE(review): for GRE the two address fields are left unwritten before submit — confirm intent */
if (e->ip_proto != IPPROTO_GRE) {
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
}

/* First byte of the IPv4 header: its low nibble is the header length in 32-bit words */
bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
/* The 4 bytes right after the IP header are stored as ports (port16[0] and port16[1]) */
bpf_skb_load_bytes(skb, nhoff + ((verlen & 0xF) << 2), &(e->ports), 4);
e->pkt_type = skb->pkt_type;
e->ifindex = skb->ifindex;
/* Publish the filled sample so that user space can consume it */
bpf_ringbuf_submit(e, 0);

return

  • 允许该数据包继续传递到 socket。返回值表示保留数据包的前多少字节并传递给 socket(返回 0 则不向该 socket 传递此数据包)
1
/* Returning the full packet length passes the whole packet on to the socket */
return skb->len;

The user-space side

env

  • 要监听的网络接口名称(默认为 lo)
1
2
3
static struct env {
const char *interface;
} env;

ipproto_mapping

  • 协议号与协议名的映射表,来源可参考 uapi/linux/in.h
1
2
3
4
5
6
7
8
9
10
11
/*
 * Lookup table from IP protocol number to a printable protocol name.
 * Protocol numbers that are not listed here map to NULL.
 */
static const char *ipproto_mapping[IPPROTO_MAX] = {
	[IPPROTO_IP]      = "IP",
	[IPPROTO_ICMP]    = "ICMP",
	[IPPROTO_IGMP]    = "IGMP",
	[IPPROTO_IPIP]    = "IPIP",
	[IPPROTO_TCP]     = "TCP",
	[IPPROTO_EGP]     = "EGP",
	[IPPROTO_PUP]     = "PUP",
	[IPPROTO_UDP]     = "UDP",
	[IPPROTO_IDP]     = "IDP",
	[IPPROTO_TP]      = "TP",
	[IPPROTO_DCCP]    = "DCCP",
	[IPPROTO_IPV6]    = "IPV6",
	[IPPROTO_RSVP]    = "RSVP",
	[IPPROTO_GRE]     = "GRE",
	[IPPROTO_ESP]     = "ESP",
	[IPPROTO_AH]      = "AH",
	[IPPROTO_MTP]     = "MTP",
	[IPPROTO_BEETPH]  = "BEETPH",
	[IPPROTO_ENCAP]   = "ENCAP",
	[IPPROTO_PIM]     = "PIM",
	[IPPROTO_COMP]    = "COMP",
	[IPPROTO_SCTP]    = "SCTP",
	[IPPROTO_UDPLITE] = "UDPLITE",
	[IPPROTO_MPLS]    = "MPLS",
	[IPPROTO_RAW]     = "RAW",
};

open_raw_sock

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Open a non-blocking, close-on-exec raw AF_PACKET socket receiving all
 * protocols and bind it to a single network interface.
 *
 * name: name of the interface to bind to, e.g. "lo".
 *
 * Returns the socket file descriptor on success, or -1 on failure after
 * printing a message to stderr. The caller owns the returned descriptor.
 */
static int open_raw_sock(const char *name)
{
	struct sockaddr_ll sll;
	unsigned int ifindex;
	int sock;

	/*
	 * if_nametoindex() returns 0 when the interface does not exist. Binding
	 * with sll_ifindex == 0 would not fail but match every interface, so an
	 * unknown name is rejected before any socket is created.
	 */
	ifindex = if_nametoindex(name);
	if (ifindex == 0) {
		fprintf(stderr, "Failed to resolve interface %s: %s\n", name, strerror(errno));
		return -1;
	}

	sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
	if (sock < 0) {
		fprintf(stderr, "Failed to create raw socket\n");
		return -1;
	}

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_ifindex = (int)ifindex;
	sll.sll_protocol = htons(ETH_P_ALL);
	if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		fprintf(stderr, "Failed to bind to %s: %s\n", name, strerror(errno));
		/* Do not leak the descriptor when binding fails */
		close(sock);
		return -1;
	}

	return sock;
}

sockaddr_ll

sockaddr_ll 是 Linux 系统特有的套接字地址结构体,隶属于 AF_PACKET 地址族,专门用于描述数据链路层(二层) 的地址和帧属性,是用户空间程序直接操作链路层数据包(如以太网帧)的核心接口。

handle_event

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Format an IPv4 address as dotted-decimal text.
 *
 * addr: address whose first octet sits in the most significant byte.
 * dst:  output buffer; at most 16 bytes are written, terminator included.
 */
static inline void ltoa(uint32_t addr, char *dst)
{
	const unsigned int first = (addr >> 24) & 0xFF;
	const unsigned int second = (addr >> 16) & 0xFF;
	const unsigned int third = (addr >> 8) & 0xFF;
	const unsigned int fourth = addr & 0xFF;

	snprintf(dst, 16, "%u.%u.%u.%u", first, second, third, fourth);
}

static int handle_event(void *ctx, void *data, size_t data_sz)
{
const struct so_event *e = data;
char ifname[IF_NAMESIZE];
char sstr[16] = {}, dstr[16] = {};

if (e->pkt_type != PACKET_HOST)
return 0;

if (e->ip_proto < 0 || e->ip_proto >= IPPROTO_MAX)
return 0;

if (!if_indextoname(e->ifindex, ifname))
return 0;

ltoa(ntohl(e->src_addr), sstr);
ltoa(ntohl(e->dst_addr), dstr);

printf("interface: %s\tprotocol: %s\t%s:%d(src) -> %s:%d(dst)\n", ifname,
ipproto_mapping[e->ip_proto], sstr, ntohs(e->port16[0]), dstr, ntohs(e->port16[1]));

return 0;
}

open_and_load

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* rb starts as NULL so that the cleanup path can run before it is created */
struct ring_buffer *rb = NULL;
struct sockfilter_bpf *skel;
int err, prog_fd, sock;

/* Default interface to listen on */
env.interface = "lo";

/* Cleaner handling of Ctrl-C */
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);

/* Load and verify BPF programs */
skel = sockfilter_bpf__open_and_load();
/* Nothing has been acquired yet, so return directly instead of jumping to cleanup */
if (!skel) {
fprintf(stderr, "Failed to open and load BPF skeleton\n");
return 1;
}

ring_buffer__new

1
2
3
4
5
6
7
/* Set up ring buffer polling: handle_event is registered as the callback for the rb map */
rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
if (!rb) {
err = -1;
fprintf(stderr, "Failed to create ring buffer\n");
goto cleanup;
}

SO_ATTACH_BPF 📌

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Create raw socket bound to the configured interface ("lo" unless changed) */
sock = open_raw_sock(env.interface);
if (sock < 0) {
err = -2;
fprintf(stderr, "Failed to open raw socket\n");
goto cleanup;
}

/* Attach BPF program to raw socket: SO_ATTACH_BPF takes the program's file descriptor */
prog_fd = bpf_program__fd(skel->progs.socket_handler);
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) {
err = -3;
fprintf(stderr, "Failed to attach to raw socket\n");
/* NOTE(review): sock is open at this point; confirm that the cleanup path closes it */
goto cleanup;
}

ring_buffer__poll

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* Process events until `exiting` becomes true (presumably set by sig_handler) */
while (!exiting) {
	err = ring_buffer__poll(rb, 100 /* timeout, ms */);
	/* Ctrl-C will cause -EINTR */
	if (err == -EINTR) {
		err = 0;
		break;
	}
	if (err < 0) {
		/* The object being polled is a ring buffer, not a perf buffer */
		fprintf(stderr, "Error polling ring buffer: %d\n", err);
		break;
	}
	/*
	 * NOTE(review): this pause comes on top of the 100 ms poll timeout, so
	 * events are printed at most about once per second — confirm intent.
	 */
	sleep(1);
}

cleanup

1
2
3
cleanup:
/* Release the ring buffer (still NULL if it was never created) and the BPF skeleton */
/* NOTE(review): the raw socket is not closed in the lines shown here — confirm whether it is closed elsewhere */
ring_buffer__free(rb);
sockfilter_bpf__destroy(skel);