mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 deb5940046
			
		
	
	
		deb5940046
		
	
	
	
	
		
			
			The check for 4-byte load from dst_port offset into bpf_sock is failing on
big-endian architecture - s390. The bpf access converter rewrites the
4-byte load to a 2-byte load from sock_common at skc_dport offset, as shown
below.
  * s390 / llvm-objdump -S --no-show-raw-insn
  00000000000002a0 <sk_dst_port__load_word>:
        84:       r1 = *(u32 *)(r1 + 48)
        85:       w0 = 1
        86:       if w1 == 51966 goto +1 <LBB5_2>
        87:       w0 = 0
  00000000000002c0 <LBB5_2>:
        88:       exit
  * s390 / bpftool prog dump xlated
  _Bool sk_dst_port__load_word(struct bpf_sock * sk):
    35: (69) r1 = *(u16 *)(r1 +12)
    36: (bc) w1 = w1
    37: (b4) w0 = 1
    38: (16) if w1 == 0xcafe goto pc+1
    39: (b4) w0 = 0
    40: (95) exit
  * x86_64 / llvm-objdump -S --no-show-raw-insn
  00000000000002a0 <sk_dst_port__load_word>:
        84:       r1 = *(u32 *)(r1 + 48)
        85:       w0 = 1
        86:       if w1 == 65226 goto +1 <LBB5_2>
        87:       w0 = 0
  00000000000002c0 <LBB5_2>:
        88:       exit
  * x86_64 / bpftool prog dump xlated
  _Bool sk_dst_port__load_word(struct bpf_sock * sk):
    33: (69) r1 = *(u16 *)(r1 +12)
    34: (b4) w0 = 1
    35: (16) if w1 == 0xfeca goto pc+1
    36: (b4) w0 = 0
    37: (95) exit
This leads to surprises if we treat the destination register contents as a
32-bit value, ignoring the fact that in reality it contains a 16-bit value.
On little-endian the register contents reflect the bpf_sock struct
definition, where the lower 16-bits contain the port number:
	struct bpf_sock {
		...
		__be16 dst_port;	/* offset 48 */
		__u16 :16;
		...
	};
However, on big-endian the register contents suggest that field the layout
of bpf_sock struct is as so:
	struct bpf_sock {
		...
		__u16 :16;		/* offset 48 */
		__be16 dst_port;
		...
	};
Account for this quirky access conversion in the test case exercising the
4-byte load by treating the result as 16-bit wide.
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220317113920.1068535-5-jakub@cloudflare.com
		
	
			
		
			
				
	
	
		
			305 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			305 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /* Copyright (c) 2019 Facebook */
 | |
| 
 | |
| #include <linux/bpf.h>
 | |
| #include <netinet/in.h>
 | |
| #include <stdbool.h>
 | |
| 
 | |
| #include <bpf/bpf_helpers.h>
 | |
| #include <bpf/bpf_endian.h>
 | |
| #include "bpf_tcp_helpers.h"
 | |
| 
 | |
| enum bpf_linum_array_idx {
 | |
| 	EGRESS_LINUM_IDX,
 | |
| 	INGRESS_LINUM_IDX,
 | |
| 	READ_SK_DST_PORT_LINUM_IDX,
 | |
| 	__NR_BPF_LINUM_ARRAY_IDX,
 | |
| };
 | |
| 
 | |
| struct {
 | |
| 	__uint(type, BPF_MAP_TYPE_ARRAY);
 | |
| 	__uint(max_entries, __NR_BPF_LINUM_ARRAY_IDX);
 | |
| 	__type(key, __u32);
 | |
| 	__type(value, __u32);
 | |
| } linum_map SEC(".maps");
 | |
| 
 | |
| struct bpf_spinlock_cnt {
 | |
| 	struct bpf_spin_lock lock;
 | |
| 	__u32 cnt;
 | |
| };
 | |
| 
 | |
| struct {
 | |
| 	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
 | |
| 	__uint(map_flags, BPF_F_NO_PREALLOC);
 | |
| 	__type(key, int);
 | |
| 	__type(value, struct bpf_spinlock_cnt);
 | |
| } sk_pkt_out_cnt SEC(".maps");
 | |
| 
 | |
| struct {
 | |
| 	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
 | |
| 	__uint(map_flags, BPF_F_NO_PREALLOC);
 | |
| 	__type(key, int);
 | |
| 	__type(value, struct bpf_spinlock_cnt);
 | |
| } sk_pkt_out_cnt10 SEC(".maps");
 | |
| 
 | |
| struct bpf_tcp_sock listen_tp = {};
 | |
| struct sockaddr_in6 srv_sa6 = {};
 | |
| struct bpf_tcp_sock cli_tp = {};
 | |
| struct bpf_tcp_sock srv_tp = {};
 | |
| struct bpf_sock listen_sk = {};
 | |
| struct bpf_sock srv_sk = {};
 | |
| struct bpf_sock cli_sk = {};
 | |
| __u64 parent_cg_id = 0;
 | |
| __u64 child_cg_id = 0;
 | |
| __u64 lsndtime = 0;
 | |
| 
 | |
| static bool is_loopback6(__u32 *a6)
 | |
| {
 | |
| 	return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1);
 | |
| }
 | |
| 
 | |
| static void skcpy(struct bpf_sock *dst,
 | |
| 		  const struct bpf_sock *src)
 | |
| {
 | |
| 	dst->bound_dev_if = src->bound_dev_if;
 | |
| 	dst->family = src->family;
 | |
| 	dst->type = src->type;
 | |
| 	dst->protocol = src->protocol;
 | |
| 	dst->mark = src->mark;
 | |
| 	dst->priority = src->priority;
 | |
| 	dst->src_ip4 = src->src_ip4;
 | |
| 	dst->src_ip6[0] = src->src_ip6[0];
 | |
| 	dst->src_ip6[1] = src->src_ip6[1];
 | |
| 	dst->src_ip6[2] = src->src_ip6[2];
 | |
| 	dst->src_ip6[3] = src->src_ip6[3];
 | |
| 	dst->src_port = src->src_port;
 | |
| 	dst->dst_ip4 = src->dst_ip4;
 | |
| 	dst->dst_ip6[0] = src->dst_ip6[0];
 | |
| 	dst->dst_ip6[1] = src->dst_ip6[1];
 | |
| 	dst->dst_ip6[2] = src->dst_ip6[2];
 | |
| 	dst->dst_ip6[3] = src->dst_ip6[3];
 | |
| 	dst->dst_port = src->dst_port;
 | |
| 	dst->state = src->state;
 | |
| }
 | |
| 
 | |
| static void tpcpy(struct bpf_tcp_sock *dst,
 | |
| 		  const struct bpf_tcp_sock *src)
 | |
| {
 | |
| 	dst->snd_cwnd = src->snd_cwnd;
 | |
| 	dst->srtt_us = src->srtt_us;
 | |
| 	dst->rtt_min = src->rtt_min;
 | |
| 	dst->snd_ssthresh = src->snd_ssthresh;
 | |
| 	dst->rcv_nxt = src->rcv_nxt;
 | |
| 	dst->snd_nxt = src->snd_nxt;
 | |
| 	dst->snd_una = src->snd_una;
 | |
| 	dst->mss_cache = src->mss_cache;
 | |
| 	dst->ecn_flags = src->ecn_flags;
 | |
| 	dst->rate_delivered = src->rate_delivered;
 | |
| 	dst->rate_interval_us = src->rate_interval_us;
 | |
| 	dst->packets_out = src->packets_out;
 | |
| 	dst->retrans_out = src->retrans_out;
 | |
| 	dst->total_retrans = src->total_retrans;
 | |
| 	dst->segs_in = src->segs_in;
 | |
| 	dst->data_segs_in = src->data_segs_in;
 | |
| 	dst->segs_out = src->segs_out;
 | |
| 	dst->data_segs_out = src->data_segs_out;
 | |
| 	dst->lost_out = src->lost_out;
 | |
| 	dst->sacked_out = src->sacked_out;
 | |
| 	dst->bytes_received = src->bytes_received;
 | |
| 	dst->bytes_acked = src->bytes_acked;
 | |
| }
 | |
| 
 | |
| /* Always return CG_OK so that no pkt will be filtered out */
 | |
| #define CG_OK 1
 | |
| 
 | |
| #define RET_LOG() ({						\
 | |
| 	linum = __LINE__;					\
 | |
| 	bpf_map_update_elem(&linum_map, &linum_idx, &linum, BPF_ANY);	\
 | |
| 	return CG_OK;						\
 | |
| })
 | |
| 
 | |
| SEC("cgroup_skb/egress")
 | |
| int egress_read_sock_fields(struct __sk_buff *skb)
 | |
| {
 | |
| 	struct bpf_spinlock_cnt cli_cnt_init = { .lock = 0, .cnt = 0xeB9F };
 | |
| 	struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10;
 | |
| 	struct bpf_tcp_sock *tp, *tp_ret;
 | |
| 	struct bpf_sock *sk, *sk_ret;
 | |
| 	__u32 linum, linum_idx;
 | |
| 	struct tcp_sock *ktp;
 | |
| 
 | |
| 	linum_idx = EGRESS_LINUM_IDX;
 | |
| 
 | |
| 	sk = skb->sk;
 | |
| 	if (!sk)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	/* Not testing the egress traffic or the listening socket,
 | |
| 	 * which are covered by the cgroup_skb/ingress test program.
 | |
| 	 */
 | |
| 	if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
 | |
| 	    sk->state == BPF_TCP_LISTEN)
 | |
| 		return CG_OK;
 | |
| 
 | |
| 	if (sk->src_port == bpf_ntohs(srv_sa6.sin6_port)) {
 | |
| 		/* Server socket */
 | |
| 		sk_ret = &srv_sk;
 | |
| 		tp_ret = &srv_tp;
 | |
| 	} else if (sk->dst_port == srv_sa6.sin6_port) {
 | |
| 		/* Client socket */
 | |
| 		sk_ret = &cli_sk;
 | |
| 		tp_ret = &cli_tp;
 | |
| 	} else {
 | |
| 		/* Not the testing egress traffic */
 | |
| 		return CG_OK;
 | |
| 	}
 | |
| 
 | |
| 	/* It must be a fullsock for cgroup_skb/egress prog */
 | |
| 	sk = bpf_sk_fullsock(sk);
 | |
| 	if (!sk)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	/* Not the testing egress traffic */
 | |
| 	if (sk->protocol != IPPROTO_TCP)
 | |
| 		return CG_OK;
 | |
| 
 | |
| 	tp = bpf_tcp_sock(sk);
 | |
| 	if (!tp)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	skcpy(sk_ret, sk);
 | |
| 	tpcpy(tp_ret, tp);
 | |
| 
 | |
| 	if (sk_ret == &srv_sk) {
 | |
| 		ktp = bpf_skc_to_tcp_sock(sk);
 | |
| 
 | |
| 		if (!ktp)
 | |
| 			RET_LOG();
 | |
| 
 | |
| 		lsndtime = ktp->lsndtime;
 | |
| 
 | |
| 		child_cg_id = bpf_sk_cgroup_id(ktp);
 | |
| 		if (!child_cg_id)
 | |
| 			RET_LOG();
 | |
| 
 | |
| 		parent_cg_id = bpf_sk_ancestor_cgroup_id(ktp, 2);
 | |
| 		if (!parent_cg_id)
 | |
| 			RET_LOG();
 | |
| 
 | |
| 		/* The userspace has created it for srv sk */
 | |
| 		pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, ktp, 0, 0);
 | |
| 		pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, ktp,
 | |
| 						   0, 0);
 | |
| 	} else {
 | |
| 		pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk,
 | |
| 						 &cli_cnt_init,
 | |
| 						 BPF_SK_STORAGE_GET_F_CREATE);
 | |
| 		pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10,
 | |
| 						   sk, &cli_cnt_init,
 | |
| 						   BPF_SK_STORAGE_GET_F_CREATE);
 | |
| 	}
 | |
| 
 | |
| 	if (!pkt_out_cnt || !pkt_out_cnt10)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	/* Even both cnt and cnt10 have lock defined in their BTF,
 | |
| 	 * intentionally one cnt takes lock while one does not
 | |
| 	 * as a test for the spinlock support in BPF_MAP_TYPE_SK_STORAGE.
 | |
| 	 */
 | |
| 	pkt_out_cnt->cnt += 1;
 | |
| 	bpf_spin_lock(&pkt_out_cnt10->lock);
 | |
| 	pkt_out_cnt10->cnt += 10;
 | |
| 	bpf_spin_unlock(&pkt_out_cnt10->lock);
 | |
| 
 | |
| 	return CG_OK;
 | |
| }
 | |
| 
 | |
| SEC("cgroup_skb/ingress")
 | |
| int ingress_read_sock_fields(struct __sk_buff *skb)
 | |
| {
 | |
| 	struct bpf_tcp_sock *tp;
 | |
| 	__u32 linum, linum_idx;
 | |
| 	struct bpf_sock *sk;
 | |
| 
 | |
| 	linum_idx = INGRESS_LINUM_IDX;
 | |
| 
 | |
| 	sk = skb->sk;
 | |
| 	if (!sk)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	/* Not the testing ingress traffic to the server */
 | |
| 	if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
 | |
| 	    sk->src_port != bpf_ntohs(srv_sa6.sin6_port))
 | |
| 		return CG_OK;
 | |
| 
 | |
| 	/* Only interested in the listening socket */
 | |
| 	if (sk->state != BPF_TCP_LISTEN)
 | |
| 		return CG_OK;
 | |
| 
 | |
| 	/* It must be a fullsock for cgroup_skb/ingress prog */
 | |
| 	sk = bpf_sk_fullsock(sk);
 | |
| 	if (!sk)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	tp = bpf_tcp_sock(sk);
 | |
| 	if (!tp)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	skcpy(&listen_sk, sk);
 | |
| 	tpcpy(&listen_tp, tp);
 | |
| 
 | |
| 	return CG_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * NOTE: 4-byte load from bpf_sock at dst_port offset is quirky. It
 | |
|  * gets rewritten by the access converter to a 2-byte load for
 | |
|  * backward compatibility. Treating the load result as a be16 value
 | |
|  * makes the code portable across little- and big-endian platforms.
 | |
|  */
 | |
| static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
 | |
| {
 | |
| 	__u32 *word = (__u32 *)&sk->dst_port;
 | |
| 	return word[0] == bpf_htons(0xcafe);
 | |
| }
 | |
| 
 | |
| static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
 | |
| {
 | |
| 	__u16 *half = (__u16 *)&sk->dst_port;
 | |
| 	return half[0] == bpf_htons(0xcafe);
 | |
| }
 | |
| 
 | |
| static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
 | |
| {
 | |
| 	__u8 *byte = (__u8 *)&sk->dst_port;
 | |
| 	return byte[0] == 0xca && byte[1] == 0xfe;
 | |
| }
 | |
| 
 | |
| SEC("cgroup_skb/egress")
 | |
| int read_sk_dst_port(struct __sk_buff *skb)
 | |
| {
 | |
| 	__u32 linum, linum_idx;
 | |
| 	struct bpf_sock *sk;
 | |
| 
 | |
| 	linum_idx = READ_SK_DST_PORT_LINUM_IDX;
 | |
| 
 | |
| 	sk = skb->sk;
 | |
| 	if (!sk)
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	/* Ignore everything but the SYN from the client socket */
 | |
| 	if (sk->state != BPF_TCP_SYN_SENT)
 | |
| 		return CG_OK;
 | |
| 
 | |
| 	if (!sk_dst_port__load_word(sk))
 | |
| 		RET_LOG();
 | |
| 	if (!sk_dst_port__load_half(sk))
 | |
| 		RET_LOG();
 | |
| 	if (!sk_dst_port__load_byte(sk))
 | |
| 		RET_LOG();
 | |
| 
 | |
| 	return CG_OK;
 | |
| }
 | |
| 
 | |
| char _license[] SEC("license") = "GPL";
 |