tc(Traffic Control,流量控制)是一个处理入站网络流量的示例。它会在 lo 接口(回环接口)上创建一个队列规则(qdisc),并将名为 tc_ingress 的 tc 类型 BPF 程序(即 SEC("tc") 标注的程序)附加到该 qdisc 的入口(ingress)方向上,最终通过 bpf_printk 输出进入 lo 接口的 IP 数据包的元数据(总长度 tot_len 和 ttl)。

前置知识

运行

1
2
3
$ ./tc
libbpf: loading object 'tc_bpf' from buffer
...
1
2
3
4
$ cat /sys/kernel/debug/tracing/trace_pipe
node-31445 [006] ..s2. 264002.057570: bpf_trace_printk: Got IP packet: tot_len: 461, ttl: 64
node-31425 [002] ..s2. 264002.059394: bpf_trace_printk: Got IP packet: tot_len: 143, ttl: 64
node-31425 [002] ..s2. 264002.059497: bpf_trace_printk: Got IP packet: tot_len: 52, ttl: 64

The BPF side

SEC("tc")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#include <vmlinux.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define TC_ACT_OK 0
#define ETH_P_IP 0x0800 /* Internet Protocol packet */

/*
 * tc_ingress - TC program that logs basic metadata of IPv4 packets.
 *
 * @ctx: socket-buffer context passed to the program; this code reads its
 *       data, data_end and protocol fields.
 *
 * For every frame whose protocol equals ETH_P_IP, the IP header's total
 * length and TTL are emitted with bpf_printk(). Every path returns
 * TC_ACT_OK, so the return value never depends on packet contents.
 */
SEC("tc")
int tc_ingress(struct __sk_buff *ctx)
{
/* Turn ctx->data_end / ctx->data into pointers that delimit the packet bytes. */
void *data_end = (void *)(__u64)ctx->data_end;
void *data = (void *)(__u64)ctx->data;
struct ethhdr *l2;
struct iphdr *l3;

/* ctx->protocol is compared in network byte order; anything but IPv4 is ignored. */
if (ctx->protocol != bpf_htons(ETH_P_IP))
return TC_ACT_OK;

/*
 * l2 + 1 points one full struct ethhdr past the start of the packet.
 * Bail out if the Ethernet header would extend beyond data_end.
 */
l2 = data;
if ((void *)(l2 + 1) > data_end)
return TC_ACT_OK;

/*
 * The IP header is taken to start right after the Ethernet header.
 * Same bounds check again: the whole struct iphdr must lie before data_end
 * before any of its fields is read.
 */
l3 = (struct iphdr *)(l2 + 1);
if ((void *)(l3 + 1) > data_end)
return TC_ACT_OK;

/* tot_len is converted from network to host byte order before printing. */
bpf_printk("Got IP packet: tot_len: %d, ttl: %d", bpf_ntohs(l3->tot_len), l3->ttl);
return TC_ACT_OK;
}

char __license[] SEC("license") = "GPL";

l2 + 1

l2 是 struct ethhdr * 类型的指针(指向以太网头起始地址)。在 C 语言中,结构体指针 +1 时,偏移的字节数等于该结构体的总大小(而非 1 个字节)。

l2 + 1 等价于 data + sizeof(struct ethhdr),即指向以太网头末尾的下一个字节(也就是 IP 头的起始位置)。l2 + 1 更直观、无需手动计算 sizeof,不易出错。

The user-space side

LO_IFINDEX

  • 环回设备的接口索引,可通过 ip link show 命令查看
1
/*
 * Interface index used for the loopback device in this example.
 * NOTE(review): hard-coded to 1; verify with `ip link show` on the target host.
 */
#define LO_IFINDEX 1

bpf_tc_hook

  • 定义类型为 bpf_tc_hook ,变量名为 tc_hook 的结构体变量。
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Describes the TC hook a BPF program is attached to. In this example a
 * variable of this type is declared with DECLARE_LIBBPF_OPTS and passed to
 * bpf_tc_hook_create(), bpf_tc_attach(), bpf_tc_detach() and
 * bpf_tc_hook_destroy().
 */
struct bpf_tc_hook {
/* Size of this struct; presumably filled in by DECLARE_LIBBPF_OPTS - confirm against libbpf. */
size_t sz;
/* Index of the network interface the hook belongs to (LO_IFINDEX in this example). */
int ifindex;
/* Attach point, expressed with the values of enum bpf_tc_attach_point. */
enum bpf_tc_attach_point attach_point;
/* NOTE(review): looks like the parent handle used with BPF_TC_CUSTOM - confirm in the libbpf docs. */
__u32 parent;
/* Unnamed zero-width bit-field; declares no accessible member. */
size_t :0;
};

/*
 * Attach points for a TC hook. Each enumerator occupies its own bit
 * (1 << 0, 1 << 1, 1 << 2).
 */
enum bpf_tc_attach_point {
/* Ingress side; this is the attach point used by the example. */
BPF_TC_INGRESS = 1 << 0,
/* Egress side. */
BPF_TC_EGRESS = 1 << 1,
/* NOTE(review): presumably a caller-specified attach point (see bpf_tc_hook.parent) - confirm. */
BPF_TC_CUSTOM = 1 << 2,
};
1
2
/*
 * Declare tc_hook, a struct bpf_tc_hook that selects the ingress attach
 * point on the interface with index LO_IFINDEX.
 */
DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, .ifindex = LO_IFINDEX,
.attach_point = BPF_TC_INGRESS);

bpf_tc_opts

  • 定义类型为 bpf_tc_opts ,变量名为 tc_opts 的结构体变量。
1
2
3
4
5
6
7
8
9
/*
 * Options passed together with a bpf_tc_hook to bpf_tc_attach() and
 * bpf_tc_detach() in this example.
 */
struct bpf_tc_opts {
/* Size of this struct; presumably filled in by DECLARE_LIBBPF_OPTS - confirm against libbpf. */
size_t sz;
/* File descriptor of the BPF program to attach. */
int prog_fd;
/* NOTE(review): attach flags; the example only ever sets this to 0 - see libbpf docs for valid values. */
__u32 flags;
/* NOTE(review): presumably the kernel-assigned program ID; the example only ever sets this to 0 - confirm. */
__u32 prog_id;
/* Handle of the TC classifier. */
__u32 handle;
/* Priority of the TC classifier. */
__u32 priority;
/* Unnamed zero-width bit-field; declares no accessible member. */
size_t :0;
};
1
/*
 * Declare tc_opts, a struct bpf_tc_opts with handle 1 and priority 1.
 * prog_fd is not set here; it is assigned later from the loaded program.
 */
DECLARE_LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1);
  • prog_fd: 要附加的 BPF 程序的文件描述符。
  • handle: TC 分类器的句柄。
  • priority: TC 分类器的优先级。取值范围 [0 - 65535]。

open_and_load

1
2
3
4
5
6
7
8
9
/* Set to true only when this process creates the TC hook itself; consulted during cleanup. */
bool hook_created = false;
/* Skeleton object returned by tc_bpf__open_and_load(). */
struct tc_bpf *skel;
/* Holds the return code of the most recent libbpf call. */
int err;

/* Open and load the BPF skeleton in a single call; NULL is treated as failure. */
skel = tc_bpf__open_and_load();
if (!skel) {
fprintf(stderr, "Failed to open BPF skeleton\n");
/* No hook or skeleton exists yet, so return directly rather than jumping to cleanup. */
return 1;
}

bpf_tc_hook_create 📌

  • 创建TC钩子,即创建 qdisc
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * The hook (i.e. the qdisc) may already exist because:
 *   1. it was created by another process or user, or
 *   2. since we attach to TC ingress ONLY, bpf_tc_hook_destroy does NOT
 *      really remove the qdisc; there may be an egress filter on it.
 */
err = bpf_tc_hook_create(&tc_hook);
/* Remember that this process created the hook, so that cleanup destroys it. */
if (!err)
hook_created = true;
/* -EEXIST is tolerated (the hook is already there); any other error aborts. */
if (err && err != -EEXIST) {
fprintf(stderr, "Failed to create TC hook: %d\n", err);
goto cleanup;
}

bpf_program__fd

1
/* Store the file descriptor of the loaded tc_ingress program in the attach options. */
tc_opts.prog_fd = bpf_program__fd(skel->progs.tc_ingress);

bpf_tc_attach 📌

1
2
3
4
5
/* Attach the program described by tc_opts to the hook described by tc_hook. */
err = bpf_tc_attach(&tc_hook, &tc_opts);
if (err) {
fprintf(stderr, "Failed to attach TC: %d\n", err);
/* On failure, skip the main loop and go straight to cleanup. */
goto cleanup;
}

while

1
2
3
4
/*
 * Idle until exiting becomes true, printing one dot to stderr per second.
 * NOTE(review): exiting is defined outside this excerpt; presumably it is
 * set by a signal handler - confirm.
 */
while (!exiting) {
fprintf(stderr, ".");
sleep(1);
}

bpf_tc_detach 📌

  • 将 eBPF 程序从 TC 钩子上卸载
1
2
3
4
5
6
/*
 * Clear flags, prog_fd and prog_id before detaching; handle and priority
 * keep the values set earlier.
 * NOTE(review): libbpf's bpf_tc_detach() is understood to reject non-zero
 * values in these three fields - confirm against the libbpf API docs.
 */
tc_opts.flags = tc_opts.prog_fd = tc_opts.prog_id = 0;
err = bpf_tc_detach(&tc_hook, &tc_opts);
if (err) {
fprintf(stderr, "Failed to detach TC: %d\n", err);
goto cleanup;
}

cleanup

  • 清除之前创建的 TC 钩子
1
2
3
4
/* Common exit path: release whatever was acquired above. */
cleanup:
/* Destroy the hook only if this process created it (hook_created is set after a successful create). */
if (hook_created)
bpf_tc_hook_destroy(&tc_hook);
/* Release the skeleton obtained from tc_bpf__open_and_load(). */
tc_bpf__destroy(skel);