mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 ee9fd0ac30
			
		
	
	
		ee9fd0ac30
		
	
	
	
	
		
			
			KCSAN reported a data-race when accessing node->ref. Although node->ref does not have to be accurate, take this chance to use a more common READ_ONCE() and WRITE_ONCE() pattern instead of data_race(). There is an existing bpf_lru_node_is_ref() and bpf_lru_node_set_ref(). This patch also adds bpf_lru_node_clear_ref() to do the WRITE_ONCE(node->ref, 0) also. ================================================================== BUG: KCSAN: data-race in __bpf_lru_list_rotate / __htab_lru_percpu_map_update_elem write to 0xffff888137038deb of 1 bytes by task 11240 on cpu 1: __bpf_lru_node_move kernel/bpf/bpf_lru_list.c:113 [inline] __bpf_lru_list_rotate_active kernel/bpf/bpf_lru_list.c:149 [inline] __bpf_lru_list_rotate+0x1bf/0x750 kernel/bpf/bpf_lru_list.c:240 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:329 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x638/0xe20 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop kernel/bpf/hashtab.c:290 [inline] __htab_lru_percpu_map_update_elem+0xe7/0x820 kernel/bpf/hashtab.c:1316 bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313 bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687 bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534 __sys_bpf+0x338/0x810 __do_sys_bpf kernel/bpf/syscall.c:5096 [inline] __se_sys_bpf kernel/bpf/syscall.c:5094 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888137038deb of 1 bytes by task 11241 on cpu 0: bpf_lru_node_set_ref kernel/bpf/bpf_lru_list.h:70 [inline] __htab_lru_percpu_map_update_elem+0x2f1/0x820 kernel/bpf/hashtab.c:1332 bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313 bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200 generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687 bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534 __sys_bpf+0x338/0x810 __do_sys_bpf kernel/bpf/syscall.c:5096 [inline] __se_sys_bpf kernel/bpf/syscall.c:5094 [inline] __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11241 Comm: syz-executor.3 Not tainted 6.3.0-rc7-syzkaller-00136-g6a66fdd29ea1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/30/2023 ================================================================== Reported-by: syzbot+ebe648a84e8784763f82@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Acked-by: Yonghong Song <yhs@fb.com> Link: https://lore.kernel.org/r/20230511043748.1384166-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
		
			
				
	
	
		
			701 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			701 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0-only
 | |
| /* Copyright (c) 2016 Facebook
 | |
|  */
 | |
| #include <linux/cpumask.h>
 | |
| #include <linux/spinlock.h>
 | |
| #include <linux/percpu.h>
 | |
| 
 | |
| #include "bpf_lru_list.h"
 | |
| 
 | |
| #define LOCAL_FREE_TARGET		(128)
 | |
| #define LOCAL_NR_SCANS			LOCAL_FREE_TARGET
 | |
| 
 | |
| #define PERCPU_FREE_TARGET		(4)
 | |
| #define PERCPU_NR_SCANS			PERCPU_FREE_TARGET
 | |
| 
 | |
| /* Helpers to get the local list index */
 | |
| #define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
 | |
| #define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
 | |
| #define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
 | |
| #define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)
 | |
| 
 | |
| static int get_next_cpu(int cpu)
 | |
| {
 | |
| 	cpu = cpumask_next(cpu, cpu_possible_mask);
 | |
| 	if (cpu >= nr_cpu_ids)
 | |
| 		cpu = cpumask_first(cpu_possible_mask);
 | |
| 	return cpu;
 | |
| }
 | |
| 
 | |
| /* Local list helpers */
 | |
| static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	return &loc_l->lists[LOCAL_FREE_LIST_IDX];
 | |
| }
 | |
| 
 | |
| static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
 | |
| }
 | |
| 
 | |
| /* bpf_lru_node helpers */
 | |
| static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
 | |
| {
 | |
| 	return READ_ONCE(node->ref);
 | |
| }
 | |
| 
 | |
| static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
 | |
| {
 | |
| 	WRITE_ONCE(node->ref, 0);
 | |
| }
 | |
| 
 | |
| static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
 | |
| 				   enum bpf_lru_list_type type)
 | |
| {
 | |
| 	if (type < NR_BPF_LRU_LIST_COUNT)
 | |
| 		l->counts[type]++;
 | |
| }
 | |
| 
 | |
| static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
 | |
| 				   enum bpf_lru_list_type type)
 | |
| {
 | |
| 	if (type < NR_BPF_LRU_LIST_COUNT)
 | |
| 		l->counts[type]--;
 | |
| }
 | |
| 
 | |
| static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
 | |
| 					struct bpf_lru_node *node,
 | |
| 					struct list_head *free_list,
 | |
| 					enum bpf_lru_list_type tgt_free_type)
 | |
| {
 | |
| 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 | |
| 		return;
 | |
| 
 | |
| 	/* If the removing node is the next_inactive_rotation candidate,
 | |
| 	 * move the next_inactive_rotation pointer also.
 | |
| 	 */
 | |
| 	if (&node->list == l->next_inactive_rotation)
 | |
| 		l->next_inactive_rotation = l->next_inactive_rotation->prev;
 | |
| 
 | |
| 	bpf_lru_list_count_dec(l, node->type);
 | |
| 
 | |
| 	node->type = tgt_free_type;
 | |
| 	list_move(&node->list, free_list);
 | |
| }
 | |
| 
 | |
| /* Move nodes from local list to the LRU list */
 | |
| static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
 | |
| 				   struct bpf_lru_node *node,
 | |
| 				   enum bpf_lru_list_type tgt_type)
 | |
| {
 | |
| 	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
 | |
| 	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
 | |
| 		return;
 | |
| 
 | |
| 	bpf_lru_list_count_inc(l, tgt_type);
 | |
| 	node->type = tgt_type;
 | |
| 	bpf_lru_node_clear_ref(node);
 | |
| 	list_move(&node->list, &l->lists[tgt_type]);
 | |
| }
 | |
| 
 | |
| /* Move nodes between or within active and inactive list (like
 | |
|  * active to inactive, inactive to active or tail of active back to
 | |
|  * the head of active).
 | |
|  */
 | |
| static void __bpf_lru_node_move(struct bpf_lru_list *l,
 | |
| 				struct bpf_lru_node *node,
 | |
| 				enum bpf_lru_list_type tgt_type)
 | |
| {
 | |
| 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
 | |
| 	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
 | |
| 		return;
 | |
| 
 | |
| 	if (node->type != tgt_type) {
 | |
| 		bpf_lru_list_count_dec(l, node->type);
 | |
| 		bpf_lru_list_count_inc(l, tgt_type);
 | |
| 		node->type = tgt_type;
 | |
| 	}
 | |
| 	bpf_lru_node_clear_ref(node);
 | |
| 
 | |
| 	/* If the moving node is the next_inactive_rotation candidate,
 | |
| 	 * move the next_inactive_rotation pointer also.
 | |
| 	 */
 | |
| 	if (&node->list == l->next_inactive_rotation)
 | |
| 		l->next_inactive_rotation = l->next_inactive_rotation->prev;
 | |
| 
 | |
| 	list_move(&node->list, &l->lists[tgt_type]);
 | |
| }
 | |
| 
 | |
| static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
 | |
| {
 | |
| 	return l->counts[BPF_LRU_LIST_T_INACTIVE] <
 | |
| 		l->counts[BPF_LRU_LIST_T_ACTIVE];
 | |
| }
 | |
| 
 | |
| /* Rotate the active list:
 | |
|  * 1. Start from tail
 | |
|  * 2. If the node has the ref bit set, it will be rotated
 | |
|  *    back to the head of active list with the ref bit cleared.
 | |
|  *    Give this node one more chance to survive in the active list.
 | |
|  * 3. If the ref bit is not set, move it to the head of the
 | |
|  *    inactive list.
 | |
|  * 4. It will at most scan nr_scans nodes
 | |
|  */
 | |
| static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
 | |
| 					 struct bpf_lru_list *l)
 | |
| {
 | |
| 	struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
 | |
| 	struct bpf_lru_node *node, *tmp_node, *first_node;
 | |
| 	unsigned int i = 0;
 | |
| 
 | |
| 	first_node = list_first_entry(active, struct bpf_lru_node, list);
 | |
| 	list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
 | |
| 		if (bpf_lru_node_is_ref(node))
 | |
| 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
 | |
| 		else
 | |
| 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 | |
| 
 | |
| 		if (++i == lru->nr_scans || node == first_node)
 | |
| 			break;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| /* Rotate the inactive list.  It starts from the next_inactive_rotation
 | |
|  * 1. If the node has ref bit set, it will be moved to the head
 | |
|  *    of active list with the ref bit cleared.
 | |
|  * 2. If the node does not have ref bit set, it will leave it
 | |
|  *    at its current location (i.e. do nothing) so that it can
 | |
|  *    be considered during the next inactive_shrink.
 | |
|  * 3. It will at most scan nr_scans nodes
 | |
|  */
 | |
| static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
 | |
| 					   struct bpf_lru_list *l)
 | |
| {
 | |
| 	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 | |
| 	struct list_head *cur, *last, *next = inactive;
 | |
| 	struct bpf_lru_node *node;
 | |
| 	unsigned int i = 0;
 | |
| 
 | |
| 	if (list_empty(inactive))
 | |
| 		return;
 | |
| 
 | |
| 	last = l->next_inactive_rotation->next;
 | |
| 	if (last == inactive)
 | |
| 		last = last->next;
 | |
| 
 | |
| 	cur = l->next_inactive_rotation;
 | |
| 	while (i < lru->nr_scans) {
 | |
| 		if (cur == inactive) {
 | |
| 			cur = cur->prev;
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		node = list_entry(cur, struct bpf_lru_node, list);
 | |
| 		next = cur->prev;
 | |
| 		if (bpf_lru_node_is_ref(node))
 | |
| 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
 | |
| 		if (cur == last)
 | |
| 			break;
 | |
| 		cur = next;
 | |
| 		i++;
 | |
| 	}
 | |
| 
 | |
| 	l->next_inactive_rotation = next;
 | |
| }
 | |
| 
 | |
| /* Shrink the inactive list.  It starts from the tail of the
 | |
|  * inactive list and only move the nodes without the ref bit
 | |
|  * set to the designated free list.
 | |
|  */
 | |
| static unsigned int
 | |
| __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
 | |
| 			       struct bpf_lru_list *l,
 | |
| 			       unsigned int tgt_nshrink,
 | |
| 			       struct list_head *free_list,
 | |
| 			       enum bpf_lru_list_type tgt_free_type)
 | |
| {
 | |
| 	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 | |
| 	struct bpf_lru_node *node, *tmp_node;
 | |
| 	unsigned int nshrinked = 0;
 | |
| 	unsigned int i = 0;
 | |
| 
 | |
| 	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
 | |
| 		if (bpf_lru_node_is_ref(node)) {
 | |
| 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
 | |
| 		} else if (lru->del_from_htab(lru->del_arg, node)) {
 | |
| 			__bpf_lru_node_move_to_free(l, node, free_list,
 | |
| 						    tgt_free_type);
 | |
| 			if (++nshrinked == tgt_nshrink)
 | |
| 				break;
 | |
| 		}
 | |
| 
 | |
| 		if (++i == lru->nr_scans)
 | |
| 			break;
 | |
| 	}
 | |
| 
 | |
| 	return nshrinked;
 | |
| }
 | |
| 
 | |
| /* 1. Rotate the active list (if needed)
 | |
|  * 2. Always rotate the inactive list
 | |
|  */
 | |
| static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
 | |
| {
 | |
| 	if (bpf_lru_list_inactive_low(l))
 | |
| 		__bpf_lru_list_rotate_active(lru, l);
 | |
| 
 | |
| 	__bpf_lru_list_rotate_inactive(lru, l);
 | |
| }
 | |
| 
 | |
| /* Calls __bpf_lru_list_shrink_inactive() to shrink some
 | |
|  * ref-bit-cleared nodes and move them to the designated
 | |
|  * free list.
 | |
|  *
 | |
|  * If it cannot get a free node after calling
 | |
|  * __bpf_lru_list_shrink_inactive().  It will just remove
 | |
|  * one node from either inactive or active list without
 | |
|  * honoring the ref-bit.  It prefers inactive list to active
 | |
|  * list in this situation.
 | |
|  */
 | |
| static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
 | |
| 					  struct bpf_lru_list *l,
 | |
| 					  unsigned int tgt_nshrink,
 | |
| 					  struct list_head *free_list,
 | |
| 					  enum bpf_lru_list_type tgt_free_type)
 | |
| 
 | |
| {
 | |
| 	struct bpf_lru_node *node, *tmp_node;
 | |
| 	struct list_head *force_shrink_list;
 | |
| 	unsigned int nshrinked;
 | |
| 
 | |
| 	nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
 | |
| 						   free_list, tgt_free_type);
 | |
| 	if (nshrinked)
 | |
| 		return nshrinked;
 | |
| 
 | |
| 	/* Do a force shrink by ignoring the reference bit */
 | |
| 	if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
 | |
| 		force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 | |
| 	else
 | |
| 		force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];
 | |
| 
 | |
| 	list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
 | |
| 					 list) {
 | |
| 		if (lru->del_from_htab(lru->del_arg, node)) {
 | |
| 			__bpf_lru_node_move_to_free(l, node, free_list,
 | |
| 						    tgt_free_type);
 | |
| 			return 1;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| /* Flush the nodes from the local pending list to the LRU list */
 | |
| static void __local_list_flush(struct bpf_lru_list *l,
 | |
| 			       struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	struct bpf_lru_node *node, *tmp_node;
 | |
| 
 | |
| 	list_for_each_entry_safe_reverse(node, tmp_node,
 | |
| 					 local_pending_list(loc_l), list) {
 | |
| 		if (bpf_lru_node_is_ref(node))
 | |
| 			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
 | |
| 		else
 | |
| 			__bpf_lru_node_move_in(l, node,
 | |
| 					       BPF_LRU_LIST_T_INACTIVE);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 | |
| 				   struct bpf_lru_node *node)
 | |
| {
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 | |
| 		return;
 | |
| 
 | |
| 	raw_spin_lock_irqsave(&l->lock, flags);
 | |
| 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 | |
| 	raw_spin_unlock_irqrestore(&l->lock, flags);
 | |
| }
 | |
| 
 | |
| static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 | |
| 					   struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
 | |
| 	struct bpf_lru_node *node, *tmp_node;
 | |
| 	unsigned int nfree = 0;
 | |
| 
 | |
| 	raw_spin_lock(&l->lock);
 | |
| 
 | |
| 	__local_list_flush(l, loc_l);
 | |
| 
 | |
| 	__bpf_lru_list_rotate(lru, l);
 | |
| 
 | |
| 	list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
 | |
| 				 list) {
 | |
| 		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
 | |
| 					    BPF_LRU_LOCAL_LIST_T_FREE);
 | |
| 		if (++nfree == LOCAL_FREE_TARGET)
 | |
| 			break;
 | |
| 	}
 | |
| 
 | |
| 	if (nfree < LOCAL_FREE_TARGET)
 | |
| 		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
 | |
| 				      local_free_list(loc_l),
 | |
| 				      BPF_LRU_LOCAL_LIST_T_FREE);
 | |
| 
 | |
| 	raw_spin_unlock(&l->lock);
 | |
| }
 | |
| 
 | |
| static void __local_list_add_pending(struct bpf_lru *lru,
 | |
| 				     struct bpf_lru_locallist *loc_l,
 | |
| 				     int cpu,
 | |
| 				     struct bpf_lru_node *node,
 | |
| 				     u32 hash)
 | |
| {
 | |
| 	*(u32 *)((void *)node + lru->hash_offset) = hash;
 | |
| 	node->cpu = cpu;
 | |
| 	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
 | |
| 	bpf_lru_node_clear_ref(node);
 | |
| 	list_add(&node->list, local_pending_list(loc_l));
 | |
| }
 | |
| 
 | |
| static struct bpf_lru_node *
 | |
| __local_list_pop_free(struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	struct bpf_lru_node *node;
 | |
| 
 | |
| 	node = list_first_entry_or_null(local_free_list(loc_l),
 | |
| 					struct bpf_lru_node,
 | |
| 					list);
 | |
| 	if (node)
 | |
| 		list_del(&node->list);
 | |
| 
 | |
| 	return node;
 | |
| }
 | |
| 
 | |
| static struct bpf_lru_node *
 | |
| __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
 | |
| {
 | |
| 	struct bpf_lru_node *node;
 | |
| 	bool force = false;
 | |
| 
 | |
| ignore_ref:
 | |
| 	/* Get from the tail (i.e. older element) of the pending list. */
 | |
| 	list_for_each_entry_reverse(node, local_pending_list(loc_l),
 | |
| 				    list) {
 | |
| 		if ((!bpf_lru_node_is_ref(node) || force) &&
 | |
| 		    lru->del_from_htab(lru->del_arg, node)) {
 | |
| 			list_del(&node->list);
 | |
| 			return node;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (!force) {
 | |
| 		force = true;
 | |
| 		goto ignore_ref;
 | |
| 	}
 | |
| 
 | |
| 	return NULL;
 | |
| }
 | |
| 
 | |
| static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 | |
| 						    u32 hash)
 | |
| {
 | |
| 	struct list_head *free_list;
 | |
| 	struct bpf_lru_node *node = NULL;
 | |
| 	struct bpf_lru_list *l;
 | |
| 	unsigned long flags;
 | |
| 	int cpu = raw_smp_processor_id();
 | |
| 
 | |
| 	l = per_cpu_ptr(lru->percpu_lru, cpu);
 | |
| 
 | |
| 	raw_spin_lock_irqsave(&l->lock, flags);
 | |
| 
 | |
| 	__bpf_lru_list_rotate(lru, l);
 | |
| 
 | |
| 	free_list = &l->lists[BPF_LRU_LIST_T_FREE];
 | |
| 	if (list_empty(free_list))
 | |
| 		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
 | |
| 				      BPF_LRU_LIST_T_FREE);
 | |
| 
 | |
| 	if (!list_empty(free_list)) {
 | |
| 		node = list_first_entry(free_list, struct bpf_lru_node, list);
 | |
| 		*(u32 *)((void *)node + lru->hash_offset) = hash;
 | |
| 		bpf_lru_node_clear_ref(node);
 | |
| 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 | |
| 	}
 | |
| 
 | |
| 	raw_spin_unlock_irqrestore(&l->lock, flags);
 | |
| 
 | |
| 	return node;
 | |
| }
 | |
| 
 | |
| static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 | |
| 						    u32 hash)
 | |
| {
 | |
| 	struct bpf_lru_locallist *loc_l, *steal_loc_l;
 | |
| 	struct bpf_common_lru *clru = &lru->common_lru;
 | |
| 	struct bpf_lru_node *node;
 | |
| 	int steal, first_steal;
 | |
| 	unsigned long flags;
 | |
| 	int cpu = raw_smp_processor_id();
 | |
| 
 | |
| 	loc_l = per_cpu_ptr(clru->local_list, cpu);
 | |
| 
 | |
| 	raw_spin_lock_irqsave(&loc_l->lock, flags);
 | |
| 
 | |
| 	node = __local_list_pop_free(loc_l);
 | |
| 	if (!node) {
 | |
| 		bpf_lru_list_pop_free_to_local(lru, loc_l);
 | |
| 		node = __local_list_pop_free(loc_l);
 | |
| 	}
 | |
| 
 | |
| 	if (node)
 | |
| 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 | |
| 
 | |
| 	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 | |
| 
 | |
| 	if (node)
 | |
| 		return node;
 | |
| 
 | |
| 	/* No free nodes found from the local free list and
 | |
| 	 * the global LRU list.
 | |
| 	 *
 | |
| 	 * Steal from the local free/pending list of the
 | |
| 	 * current CPU and remote CPU in RR.  It starts
 | |
| 	 * with the loc_l->next_steal CPU.
 | |
| 	 */
 | |
| 
 | |
| 	first_steal = loc_l->next_steal;
 | |
| 	steal = first_steal;
 | |
| 	do {
 | |
| 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 | |
| 
 | |
| 		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
 | |
| 
 | |
| 		node = __local_list_pop_free(steal_loc_l);
 | |
| 		if (!node)
 | |
| 			node = __local_list_pop_pending(lru, steal_loc_l);
 | |
| 
 | |
| 		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
 | |
| 
 | |
| 		steal = get_next_cpu(steal);
 | |
| 	} while (!node && steal != first_steal);
 | |
| 
 | |
| 	loc_l->next_steal = steal;
 | |
| 
 | |
| 	if (node) {
 | |
| 		raw_spin_lock_irqsave(&loc_l->lock, flags);
 | |
| 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 | |
| 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 | |
| 	}
 | |
| 
 | |
| 	return node;
 | |
| }
 | |
| 
 | |
| struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
 | |
| {
 | |
| 	if (lru->percpu)
 | |
| 		return bpf_percpu_lru_pop_free(lru, hash);
 | |
| 	else
 | |
| 		return bpf_common_lru_pop_free(lru, hash);
 | |
| }
 | |
| 
 | |
| static void bpf_common_lru_push_free(struct bpf_lru *lru,
 | |
| 				     struct bpf_lru_node *node)
 | |
| {
 | |
| 	u8 node_type = READ_ONCE(node->type);
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
 | |
| 	    WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
 | |
| 		return;
 | |
| 
 | |
| 	if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
 | |
| 		struct bpf_lru_locallist *loc_l;
 | |
| 
 | |
| 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
 | |
| 
 | |
| 		raw_spin_lock_irqsave(&loc_l->lock, flags);
 | |
| 
 | |
| 		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
 | |
| 			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 | |
| 			goto check_lru_list;
 | |
| 		}
 | |
| 
 | |
| 		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
 | |
| 		bpf_lru_node_clear_ref(node);
 | |
| 		list_move(&node->list, local_free_list(loc_l));
 | |
| 
 | |
| 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| check_lru_list:
 | |
| 	bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
 | |
| }
 | |
| 
 | |
| static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 | |
| 				     struct bpf_lru_node *node)
 | |
| {
 | |
| 	struct bpf_lru_list *l;
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	l = per_cpu_ptr(lru->percpu_lru, node->cpu);
 | |
| 
 | |
| 	raw_spin_lock_irqsave(&l->lock, flags);
 | |
| 
 | |
| 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 | |
| 
 | |
| 	raw_spin_unlock_irqrestore(&l->lock, flags);
 | |
| }
 | |
| 
 | |
| void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
 | |
| {
 | |
| 	if (lru->percpu)
 | |
| 		bpf_percpu_lru_push_free(lru, node);
 | |
| 	else
 | |
| 		bpf_common_lru_push_free(lru, node);
 | |
| }
 | |
| 
 | |
| static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 | |
| 				    u32 node_offset, u32 elem_size,
 | |
| 				    u32 nr_elems)
 | |
| {
 | |
| 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
 | |
| 	u32 i;
 | |
| 
 | |
| 	for (i = 0; i < nr_elems; i++) {
 | |
| 		struct bpf_lru_node *node;
 | |
| 
 | |
| 		node = (struct bpf_lru_node *)(buf + node_offset);
 | |
| 		node->type = BPF_LRU_LIST_T_FREE;
 | |
| 		bpf_lru_node_clear_ref(node);
 | |
| 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 | |
| 		buf += elem_size;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
 | |
| 				    u32 node_offset, u32 elem_size,
 | |
| 				    u32 nr_elems)
 | |
| {
 | |
| 	u32 i, pcpu_entries;
 | |
| 	int cpu;
 | |
| 	struct bpf_lru_list *l;
 | |
| 
 | |
| 	pcpu_entries = nr_elems / num_possible_cpus();
 | |
| 
 | |
| 	i = 0;
 | |
| 
 | |
| 	for_each_possible_cpu(cpu) {
 | |
| 		struct bpf_lru_node *node;
 | |
| 
 | |
| 		l = per_cpu_ptr(lru->percpu_lru, cpu);
 | |
| again:
 | |
| 		node = (struct bpf_lru_node *)(buf + node_offset);
 | |
| 		node->cpu = cpu;
 | |
| 		node->type = BPF_LRU_LIST_T_FREE;
 | |
| 		bpf_lru_node_clear_ref(node);
 | |
| 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 | |
| 		i++;
 | |
| 		buf += elem_size;
 | |
| 		if (i == nr_elems)
 | |
| 			break;
 | |
| 		if (i % pcpu_entries)
 | |
| 			goto again;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 | |
| 		      u32 elem_size, u32 nr_elems)
 | |
| {
 | |
| 	if (lru->percpu)
 | |
| 		bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
 | |
| 					nr_elems);
 | |
| 	else
 | |
| 		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
 | |
| 					nr_elems);
 | |
| }
 | |
| 
 | |
| static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 | |
| {
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
 | |
| 		INIT_LIST_HEAD(&loc_l->lists[i]);
 | |
| 
 | |
| 	loc_l->next_steal = cpu;
 | |
| 
 | |
| 	raw_spin_lock_init(&loc_l->lock);
 | |
| }
 | |
| 
 | |
| static void bpf_lru_list_init(struct bpf_lru_list *l)
 | |
| {
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
 | |
| 		INIT_LIST_HEAD(&l->lists[i]);
 | |
| 
 | |
| 	for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
 | |
| 		l->counts[i] = 0;
 | |
| 
 | |
| 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 | |
| 
 | |
| 	raw_spin_lock_init(&l->lock);
 | |
| }
 | |
| 
 | |
| int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
 | |
| 		 del_from_htab_func del_from_htab, void *del_arg)
 | |
| {
 | |
| 	int cpu;
 | |
| 
 | |
| 	if (percpu) {
 | |
| 		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
 | |
| 		if (!lru->percpu_lru)
 | |
| 			return -ENOMEM;
 | |
| 
 | |
| 		for_each_possible_cpu(cpu) {
 | |
| 			struct bpf_lru_list *l;
 | |
| 
 | |
| 			l = per_cpu_ptr(lru->percpu_lru, cpu);
 | |
| 			bpf_lru_list_init(l);
 | |
| 		}
 | |
| 		lru->nr_scans = PERCPU_NR_SCANS;
 | |
| 	} else {
 | |
| 		struct bpf_common_lru *clru = &lru->common_lru;
 | |
| 
 | |
| 		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
 | |
| 		if (!clru->local_list)
 | |
| 			return -ENOMEM;
 | |
| 
 | |
| 		for_each_possible_cpu(cpu) {
 | |
| 			struct bpf_lru_locallist *loc_l;
 | |
| 
 | |
| 			loc_l = per_cpu_ptr(clru->local_list, cpu);
 | |
| 			bpf_lru_locallist_init(loc_l, cpu);
 | |
| 		}
 | |
| 
 | |
| 		bpf_lru_list_init(&clru->lru_list);
 | |
| 		lru->nr_scans = LOCAL_NR_SCANS;
 | |
| 	}
 | |
| 
 | |
| 	lru->percpu = percpu;
 | |
| 	lru->del_from_htab = del_from_htab;
 | |
| 	lru->del_arg = del_arg;
 | |
| 	lru->hash_offset = hash_offset;
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| void bpf_lru_destroy(struct bpf_lru *lru)
 | |
| {
 | |
| 	if (lru->percpu)
 | |
| 		free_percpu(lru->percpu_lru);
 | |
| 	else
 | |
| 		free_percpu(lru->common_lru.local_list);
 | |
| }
 |