Merge tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

 - The percpu sheaves caching layer was introduced as opt-in in 6.18 and
   now we enable it for all caches and remove the previous cpu (partial)
   slab caching mechanism.

   Besides the lower locking overhead and much more likely fastpath when
   freeing, this removes the rather complicated code related to the cpu
   slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all
   its complications for PREEMPT_RT or kmalloc_nolock().

   The lockless slab freelist+counters update operation using
   try_cmpxchg128/64 remains and is crucial for freeing remote NUMA
   objects, and to allow flushing objects from sheaves to slabs mostly
   without the node list_lock (Vlastimil Babka)

 - Eliminate slabobj_ext metadata overhead when possible. Instead of
   using kmalloc() to allocate the array for memcg and/or allocation
   profiling tag pointers, use leftover space in a slab or per-object
   padding due to alignment (Harry Yoo)

 - Various followup improvements to the above (Hao Li)

* tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (39 commits)
  slub: let need_slab_obj_exts() return false if SLAB_NO_OBJ_EXT is set
  mm/slab: only allow SLAB_OBJ_EXT_IN_OBJ for unmergeable caches
  mm/slab: place slabobj_ext metadata in unused space within s->size
  mm/slab: move [__]ksize and slab_ksize() to mm/slub.c
  mm/slab: save memory by allocating slabobj_ext array from leftover
  mm/memcontrol,alloc_tag: handle slabobj_ext access under KASAN poison
  mm/slab: use stride to access slabobj_ext
  mm/slab: abstract slabobj_ext access via new slab_obj_ext() helper
  ext4: specify the free pointer offset for ext4_inode_cache
  mm/slab: allow specifying free pointer offset when using constructor
  mm/slab: use unsigned long for orig_size to ensure proper metadata align
  slub: clarify object field layout comments
  mm/slab: avoid allocating slabobj_ext array from its own slab
  slub: avoid list_lock contention from __refill_objects_any()
  mm/slub: cleanup and repurpose some stat items
  mm/slub: remove DEACTIVATE_TO_* stat items
  slab: remove frozen slab checks from __slab_free()
  slab: update overview comments
  slab: refill sheaves from all nodes
  slab: remove unused PREEMPT_RT specific macros
  ...
This commit is contained in:
Linus Torvalds
2026-02-11 14:12:50 -08:00
9 changed files with 1773 additions and 2079 deletions

View File

@@ -1496,12 +1496,19 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
sizeof(struct ext4_inode_info), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
offsetof(struct ext4_inode_info, i_data),
sizeof_field(struct ext4_inode_info, i_data),
init_once);
struct kmem_cache_args args = {
.useroffset = offsetof(struct ext4_inode_info, i_data),
.usersize = sizeof_field(struct ext4_inode_info, i_data),
.use_freeptr_offset = true,
.freeptr_offset = offsetof(struct ext4_inode_info, i_flags),
.ctor = init_once,
};
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
&args,
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
return 0;

View File

@@ -58,8 +58,9 @@ enum _slab_flag_bits {
#endif
_SLAB_OBJECT_POISON,
_SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
_SLAB_NO_OBJ_EXT,
#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
_SLAB_OBJ_EXT_IN_OBJ,
#endif
_SLAB_FLAGS_LAST_BIT
};
@@ -239,10 +240,12 @@ enum _slab_flag_bits {
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/* Slab created using create_boot_cache */
#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
#else
#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED
#endif
/*
@@ -300,24 +303,26 @@ struct kmem_cache_args {
unsigned int usersize;
/**
* @freeptr_offset: Custom offset for the free pointer
* in &SLAB_TYPESAFE_BY_RCU caches
* in caches with &SLAB_TYPESAFE_BY_RCU or @ctor
*
* By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
* outside of the object. This might cause the object to grow in size.
* Cache creators that have a reason to avoid this can specify a custom
* free pointer offset in their struct where the free pointer will be
* placed.
* By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free
* pointer outside of the object. This might cause the object to grow
* in size. Cache creators that have a reason to avoid this can specify
* a custom free pointer offset in their data structure where the free
* pointer will be placed.
*
* Note that placing the free pointer inside the object requires the
* caller to ensure that no fields are invalidated that are required to
* guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
* details).
* For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that
* the free pointer does not overlay fields required to guard against
* object recycling (See &SLAB_TYPESAFE_BY_RCU for details).
*
* For caches with @ctor, the caller must ensure that the free pointer
* does not overlay fields initialized by the constructor.
*
* Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor
* may specify @freeptr_offset.
*
* Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
* is specified, %use_freeptr_offset must be set %true.
*
* Note that @ctor currently isn't supported with custom free pointers
* as a @ctor requires an external free pointer.
* is specified, @use_freeptr_offset must be set %true.
*/
unsigned int freeptr_offset;
/**
@@ -508,7 +513,6 @@ void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size
void kfree(const void *objp);
void kfree_nolock(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);
DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))

View File

@@ -247,17 +247,6 @@ config SLUB_STATS
out which slabs are relevant to a particular load.
Try running: slabinfo -DA
config SLUB_CPU_PARTIAL
default y
depends on SMP && !SLUB_TINY
bool "Enable per cpu partial caches"
help
Per cpu partial caches accelerate objects allocation and freeing
that is local to a processor at the price of more indeterminism
in the latency of the free. On overflow these caches will be cleared
which requires the taking of locks that may cause latency spikes.
Typically one would choose no for a realtime system.
config RANDOM_KMALLOC_CACHES
default n
depends on !SLUB_TINY

View File

@@ -838,6 +838,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord
struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
#define alloc_frozen_pages_nolock(...) \
alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
void free_frozen_pages_nolock(struct page *page, unsigned int order);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);

View File

@@ -2627,16 +2627,24 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p)
* Memcg membership data for each individual object is saved in
* slab->obj_exts.
*/
struct slabobj_ext *obj_exts;
unsigned long obj_exts;
struct slabobj_ext *obj_ext;
unsigned int off;
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return NULL;
get_slab_obj_exts(obj_exts);
off = obj_to_index(slab->slab_cache, slab, p);
if (obj_exts[off].objcg)
return obj_cgroup_memcg(obj_exts[off].objcg);
obj_ext = slab_obj_ext(slab, obj_exts, off);
if (obj_ext->objcg) {
struct obj_cgroup *objcg = obj_ext->objcg;
put_slab_obj_exts(obj_exts);
return obj_cgroup_memcg(objcg);
}
put_slab_obj_exts(obj_exts);
return NULL;
}
@@ -3222,6 +3230,9 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
}
for (i = 0; i < size; i++) {
unsigned long obj_exts;
struct slabobj_ext *obj_ext;
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
@@ -3244,29 +3255,35 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
slab_pgdat(slab), cache_vmstat_idx(s)))
return false;
obj_exts = slab_obj_exts(slab);
get_slab_obj_exts(obj_exts);
off = obj_to_index(s, slab, p[i]);
obj_ext = slab_obj_ext(slab, obj_exts, off);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
obj_ext->objcg = objcg;
put_slab_obj_exts(obj_exts);
}
return true;
}
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
void **p, int objects, unsigned long obj_exts)
{
size_t obj_size = obj_full_size(s);
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
struct slabobj_ext *obj_ext;
unsigned int off;
off = obj_to_index(s, slab, p[i]);
objcg = obj_exts[off].objcg;
obj_ext = slab_obj_ext(slab, obj_exts, off);
objcg = obj_ext->objcg;
if (!objcg)
continue;
obj_exts[off].objcg = NULL;
obj_ext->objcg = NULL;
refill_obj_stock(objcg, obj_size, true, -obj_size,
slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);

View File

@@ -3011,6 +3011,11 @@ void free_frozen_pages(struct page *page, unsigned int order)
__free_frozen_pages(page, order, FPI_NONE);
}
/*
 * free_frozen_pages_nolock - free a frozen page of the given order.
 *
 * Same as free_frozen_pages() except the free is performed with
 * FPI_TRYLOCK — presumably so this variant is safe in contexts that
 * must not block on internal zone locks. NOTE(review): confirm the
 * exact FPI_TRYLOCK semantics in __free_frozen_pages().
 */
void free_frozen_pages_nolock(struct page *page, unsigned int order)
{
	__free_frozen_pages(page, order, FPI_TRYLOCK);
}
/*
* Free a batch of folios
*/

213
mm/slab.h
View File

@@ -21,14 +21,12 @@
# define system_has_freelist_aba() system_has_cmpxchg128()
# define try_cmpxchg_freelist try_cmpxchg128
# endif
#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba() system_has_cmpxchg64()
# define try_cmpxchg_freelist try_cmpxchg64
# endif
#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */
@@ -55,6 +53,14 @@ struct freelist_counters {
* that the slab was corrupted
*/
unsigned frozen:1;
#ifdef CONFIG_64BIT
/*
* Some optimizations use free bits in 'counters' field
* to save memory. In case ->stride field is not available,
* such optimizations are disabled.
*/
unsigned short stride;
#endif
};
};
};
@@ -71,19 +77,7 @@ struct slab {
struct kmem_cache *slab_cache;
union {
struct {
union {
struct list_head slab_list;
struct { /* For deferred deactivate_slab() */
struct llist_node llnode;
void *flush_freelist;
};
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct {
struct slab *next;
int slabs; /* Nr of slabs left */
};
#endif
};
struct list_head slab_list;
/* Double-word boundary */
struct freelist_counters;
};
@@ -188,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab)
return PAGE_SIZE << slab_order(slab);
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_percpu_partial(c) ((c)->partial)
#define slub_set_percpu_partial(c, p) \
({ \
slub_percpu_partial(c) = (p)->next; \
})
#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c))
#else
#define slub_percpu_partial(c) NULL
#define slub_set_percpu_partial(c, p)
#define slub_percpu_partial_read_once(c) NULL
#endif // CONFIG_SLUB_CPU_PARTIAL
/*
* Word size structure that can be atomically updated or read and that
* contains both the order and the number of objects that a slab of the
@@ -218,8 +195,6 @@ struct kmem_cache_order_objects {
* Slab cache management.
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
struct lock_class_key lock_key;
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
@@ -228,12 +203,6 @@ struct kmem_cache {
unsigned int object_size; /* Object size without metadata */
struct reciprocal_value reciprocal_size;
unsigned int offset; /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Number of per cpu partial objects to keep around */
unsigned int cpu_partial;
/* Number of per cpu partial slabs to keep around */
unsigned int cpu_partial_slabs;
#endif
unsigned int sheaf_capacity;
struct kmem_cache_order_objects oo;
@@ -274,16 +243,35 @@ struct kmem_cache {
unsigned int usersize; /* Usercopy region size */
#endif
#ifdef CONFIG_SLUB_STATS
struct kmem_cache_stats __percpu *cpu_stats;
#endif
struct kmem_cache_node *node[MAX_NUMNODES];
};
/*
* Every cache has !NULL s->cpu_sheaves but they may point to the
* bootstrap_sheaf temporarily during init, or permanently for the boot caches
* and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This
* helper distinguishes whether cache has real non-bootstrap sheaves.
*/
static inline bool cache_has_sheaves(struct kmem_cache *s)
{
	/*
	 * Test CONFIG_SLUB_TINY for code elimination purposes: with
	 * SLUB_TINY enabled this is constant-false, letting the compiler
	 * drop the sheaf-only code paths entirely.
	 */
	return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity;
}
#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
#define SLAB_SUPPORTS_SYSFS 1
void sysfs_slab_unlink(struct kmem_cache *s);
void sysfs_slab_release(struct kmem_cache *s);
int sysfs_slab_alias(struct kmem_cache *s, const char *name);
#else
static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
static inline void sysfs_slab_release(struct kmem_cache *s) { }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{ return 0; }
#endif
void *fixup_red_left(struct kmem_cache *s, void *p);
@@ -400,11 +388,7 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int useroffset, unsigned int usersize);
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(unsigned size, unsigned align,
slab_flags_t flags, const char *name, void (*ctor)(void *));
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags);
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
@@ -502,6 +486,24 @@ bool slab_in_kunit_test(void);
static inline bool slab_in_kunit_test(void) { return false; }
#endif
/*
* slub is about to manipulate internal object metadata. This memory lies
* outside the range of the allocated object, so accessing it would normally
* be reported by kasan as a bounds error. metadata_access_enable() is used
* to tell kasan that these accesses are OK.
*/
static inline void metadata_access_enable(void)
{
	/* Suppress KASAN and KMSAN reports while slab metadata is touched. */
	kasan_disable_current();
	kmsan_disable_current();
}
static inline void metadata_access_disable(void)
{
	/* Re-arm the sanitizers, in the reverse order of enable. */
	kmsan_enable_current();
	kasan_enable_current();
}
#ifdef CONFIG_SLAB_OBJ_EXT
/*
@@ -509,10 +511,26 @@ static inline bool slab_in_kunit_test(void) { return false; }
* associated with a slab.
* @slab: a pointer to the slab struct
*
* Returns a pointer to the object extension vector associated with the slab,
* or NULL if no such vector has been associated yet.
* Returns the address of the object extension vector associated with the slab,
* or zero if no such vector has been associated yet.
* Do not dereference the return value directly; use get/put_slab_obj_exts()
* pair and slab_obj_ext() to access individual elements.
*
* Example usage:
*
* obj_exts = slab_obj_exts(slab);
* if (obj_exts) {
* get_slab_obj_exts(obj_exts);
* obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
* // do something with obj_ext
* put_slab_obj_exts(obj_exts);
* }
*
* Note that the get/put semantics do not involve reference counting.
* Instead, they update kasan/kmsan depth so that accesses to slabobj_ext
* won't be reported as access violations.
*/
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
static inline unsigned long slab_obj_exts(struct slab *slab)
{
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
@@ -525,7 +543,62 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
return obj_exts & ~OBJEXTS_FLAGS_MASK;
}
static inline void get_slab_obj_exts(unsigned long obj_exts)
{
	/* Callers must pass a non-zero vector obtained from slab_obj_exts(). */
	VM_WARN_ON_ONCE(!obj_exts);
	/*
	 * Not a refcount: this merely opens a metadata access window so
	 * that KASAN/KMSAN do not flag accesses to the obj_ext vector.
	 */
	metadata_access_enable();
}
static inline void put_slab_obj_exts(unsigned long obj_exts)
{
	/* Close the metadata access window opened by get_slab_obj_exts(). */
	metadata_access_disable();
}
#ifdef CONFIG_64BIT
/* 64-bit: struct slab carries a real ->stride field (see freelist_counters). */
static inline void slab_set_stride(struct slab *slab, unsigned short stride)
{
	slab->stride = stride;
}
static inline unsigned short slab_get_stride(struct slab *slab)
{
	return slab->stride;
}
#else
/*
 * 32-bit: no ->stride field is available, so the stride is fixed at
 * sizeof(struct slabobj_ext); attempting to set anything else is a bug.
 */
static inline void slab_set_stride(struct slab *slab, unsigned short stride)
{
	VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext));
}
static inline unsigned short slab_get_stride(struct slab *slab)
{
	return sizeof(struct slabobj_ext);
}
#endif
/*
* slab_obj_ext - get the pointer to the slab object extension metadata
* associated with an object in a slab.
* @slab: a pointer to the slab struct
* @obj_exts: a pointer to the object extension vector
* @index: an index of the object
*
* Returns a pointer to the object extension associated with the object.
* Must be called within a section covered by get/put_slab_obj_exts().
*/
static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
					       unsigned long obj_exts,
					       unsigned int index)
{
	struct slabobj_ext *obj_ext;

	/* Catch callers passing a stale vector belonging to another slab. */
	VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab));

	/* Elements are slab_get_stride() bytes apart, not sizeof(*obj_ext). */
	obj_ext = (struct slabobj_ext *)(obj_exts +
			slab_get_stride(slab) * index);
	/* Strip any KASAN pointer tag before handing the pointer out. */
	return kasan_reset_tag(obj_ext);
}
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
@@ -533,11 +606,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
#else /* CONFIG_SLAB_OBJ_EXT */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
static inline unsigned long slab_obj_exts(struct slab *slab)
{
return 0;
}
static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
unsigned long obj_exts,
unsigned int index)
{
return NULL;
}
static inline void slab_set_stride(struct slab *slab, unsigned int stride) { }
static inline unsigned int slab_get_stride(struct slab *slab) { return 0; }
#endif /* CONFIG_SLAB_OBJ_EXT */
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
@@ -550,38 +634,11 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
void **p, int objects, unsigned long obj_exts);
#endif
void kvfree_rcu_cb(struct rcu_head *head);
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
/*
* Debugging requires use of the padding between object
* and whatever may come after it.
*/
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
return s->object_size;
#endif
if (s->flags & SLAB_KASAN)
return s->object_size;
/*
* If we have the need to store the freelist pointer
* back there or track user information then we can
* only use the space before that information.
*/
if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
return s->inuse;
/*
* Else we can use all the padding etc for the allocation
*/
return s->size;
}
static inline unsigned int large_kmalloc_order(const struct page *page)
{
return page[1].flags.f & 0xff;

View File

@@ -43,11 +43,13 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
/*
* Set of flags that will prevent slab merging
* Set of flags that will prevent slab merging.
* Any flag that adds per-object metadata should be included,
* since slab merging can update s->inuse that affects the metadata layout.
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB | SLAB_NO_MERGE)
#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
SLAB_OBJ_EXT_IN_OBJ)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -163,9 +165,6 @@ int slab_unmergeable(struct kmem_cache *s)
return 1;
#endif
if (s->cpu_sheaves)
return 1;
/*
* We may have set a slab to be unmergeable during bootstrap.
*/
@@ -175,24 +174,35 @@ int slab_unmergeable(struct kmem_cache *s)
return 0;
}
struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
slab_flags_t flags, const char *name, void (*ctor)(void *))
bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
{
struct kmem_cache *s;
if (slab_nomerge)
return NULL;
return true;
if (ctor)
return NULL;
if (args->ctor)
return true;
flags = kmem_cache_flags(flags, name);
if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
return true;
if (flags & SLAB_NEVER_MERGE)
return true;
return false;
}
static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
const char *name, struct kmem_cache_args *args)
{
struct kmem_cache *s;
unsigned int align;
flags = kmem_cache_flags(flags, name);
if (slab_args_unmergeable(args, flags))
return NULL;
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
align = calculate_alignment(flags, args->align, size);
size = ALIGN(size, align);
list_for_each_entry_reverse(s, &slab_caches, list) {
@@ -231,7 +241,7 @@ static struct kmem_cache *create_cache(const char *name,
err = -EINVAL;
if (args->use_freeptr_offset &&
(args->freeptr_offset >= object_size ||
!(flags & SLAB_TYPESAFE_BY_RCU) ||
(!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
!IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
goto out;
@@ -253,6 +263,31 @@ out:
return ERR_PTR(err);
}
/*
 * Try to reuse (alias) an existing mergeable cache instead of creating a
 * new one. On success the existing cache's refcount is bumped and its
 * object_size/inuse are grown to cover the new user's @size.
 *
 * Returns the aliased cache, or NULL if no mergeable cache was found.
 */
static struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
		   struct kmem_cache_args *args)
{
	struct kmem_cache *s;

	s = find_mergeable(size, flags, name, args);
	if (s) {
		/* A failed sysfs alias is only logged; merging still proceeds. */
		if (sysfs_slab_alias(s, name))
			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
			       name);

		s->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		s->object_size = max(s->object_size, size);
		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
	}

	return s;
}
/**
* __kmem_cache_create_args - Create a kmem cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -305,6 +340,13 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
flags &= ~SLAB_DEBUG_FLAGS;
#endif
/*
* Caches with specific capacity are special enough. It's simpler to
* make them unmergeable.
*/
if (args->sheaf_capacity)
flags |= SLAB_NO_MERGE;
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, object_size);
@@ -324,9 +366,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
object_size - args->usersize < args->useroffset))
args->usersize = args->useroffset = 0;
if (!args->usersize && !args->sheaf_capacity)
s = __kmem_cache_alias(name, object_size, args->align, flags,
args->ctor);
s = __kmem_cache_alias(name, object_size, flags, args);
if (s)
goto out_unlock;
@@ -983,43 +1023,6 @@ void __init create_kmalloc_caches(void)
0, SLAB_NO_MERGE, NULL);
}
/**
* __ksize -- Report full size of underlying allocation
* @object: pointer to the object
*
* This should only be used internally to query the true size of allocations.
* It is not meant to be a way to discover the usable size of an allocation
* after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
* the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
* and/or FORTIFY_SOURCE.
*
* Return: size of the actual memory used by @object in bytes
*/
size_t __ksize(const void *object)
{
const struct page *page;
const struct slab *slab;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
page = virt_to_page(object);
if (unlikely(PageLargeKmalloc(page)))
return large_kmalloc_size(page);
slab = page_slab(page);
/* Delete this after we're sure there are no users */
if (WARN_ON(!slab))
return page_size(page);
#ifdef CONFIG_SLUB_DEBUG
skip_orig_size_check(slab->slab_cache, object);
#endif
return slab_ksize(slab->slab_cache);
}
gfp_t kmalloc_fix_flags(gfp_t flags)
{
gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
@@ -1235,30 +1238,6 @@ void kfree_sensitive(const void *p)
}
EXPORT_SYMBOL(kfree_sensitive);
size_t ksize(const void *objp)
{
/*
* We need to first check that the pointer to the object is valid.
* The KASAN report printed from ksize() is more useful, then when
* it's printed later when the behaviour could be undefined due to
* a potential use-after-free or double-free.
*
* We use kasan_check_byte(), which is supported for the hardware
* tag-based KASAN mode, unlike kasan_check_read/write().
*
* If the pointed to memory is invalid, we return 0 to avoid users of
* ksize() writing to and potentially corrupting the memory region.
*
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
return 0;
return kfence_ksize(objp) ?: __ksize(objp);
}
EXPORT_SYMBOL(ksize);
#ifdef CONFIG_BPF_SYSCALL
#include <linux/btf.h>
@@ -1625,11 +1604,8 @@ static bool kfree_rcu_sheaf(void *obj)
return false;
s = slab->slab_cache;
if (s->cpu_sheaves) {
if (likely(!IS_ENABLED(CONFIG_NUMA) ||
slab_nid(slab) == numa_mem_id()))
return __kfree_rcu_sheaf(s, obj);
}
if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
return __kfree_rcu_sheaf(s, obj);
return false;
}
@@ -2133,8 +2109,11 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
*/
void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
{
if (s->cpu_sheaves)
if (cache_has_sheaves(s)) {
flush_rcu_sheaves_on_cache(s);
rcu_barrier();
}
/*
* TODO: Introduce a version of __kvfree_rcu_barrier() that works
* on a specific slab cache.

3379
mm/slub.c

File diff suppressed because it is too large Load Diff