Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived, as get_file() is only valid if the caller
already holds a reference to the file. That hasn't changed; only the
underlying check has.
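A rough sketch of the kind of check meant here (the helper body, the
condition, and the warning text below are assumptions, not the exact
in-tree code):

static inline struct file *get_file(struct file *f)
{
        long prior = atomic_long_fetch_add_relaxed(1, &f->f_ref.refcnt);

        /*
         * With file_ref a single reference is counted as 0 and the
         * no-reference/dead states live in the negative range, so
         * reviving an already dead file shows up as an increment from
         * a negative value (assumed check).
         */
        WARN_ONCE(prior < 0, "get_file() on a dead struct file\n");
        return f;
}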
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from being freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken both in
__fput() and around epi_fget(), preventing the file from being freed
while it is polled and preventing a dead file from being resurrected.
CPU1                                    CPU2
fput(file)
-> __fput(file)
   -> eventpoll_release(file)
      -> eventpoll_release_file(file)
                                        mutex_lock(&ep->mtx)
                                        epi_item_poll()
                                        -> epi_fget()
                                           -> file_ref_get(file)
                                        mutex_unlock(&ep->mtx)
         mutex_lock(&ep->mtx);
         __ep_remove()
         mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
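For reference, a minimal sketch of how epi_fget() can refuse to
resurrect a file whose counter is already dead (simplified; the exact
in-tree body may differ):

static struct file *epi_fget(const struct epitem *epi)
{
        struct file *file = epi->ffd.file;

        /*
         * file_ref_get() fails once the counter has been marked dead,
         * so a file in the middle of __fput() is never revived here.
         */
        if (!file_ref_get(&file->f_ref))
                file = NULL;
        return file;
}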
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashes
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
        spinlock_t object_lock;
        struct dma_buf_ops ops;
        void (*dmabuf_release)(struct dma_buf *dma_buf);
        struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
        // copy original dma_buf_ops in private location
        tdev->ops = *ops;

        // stash the release method of the original struct dma_buf_ops
        tdev->dmabuf_release = tdev->ops.release;

        // override the release method in the copy of the struct dma_buf_ops
        // with ttm's own dmabuf release method
        tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overridden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
        dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then installs a file descriptor for
it in the caller's file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
   -> f_op->release::dma_buf_file_release()
   -> dput()
      -> d_op->d_release::dma_buf_release()
         -> dmabuf->ops->release::ttm_prime_dmabuf_release()
            mutex_lock(&prime->mutex);
            if (prime->dma_buf == dma_buf)
                    prime->dma_buf = NULL;
            mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1                                    CPU2
fput(file)
-> __fput(file)
   -> f_op->release::dma_buf_file_release()
   -> dput()
      -> d_op->d_release::dma_buf_release()
         -> dmabuf->ops->release::ttm_prime_dmabuf_release()
                                        ttm_prime_handle_to_fd()
                                        mutex_lock_interruptible(&prime->mutex)
                                        dma_buf = prime->dma_buf
                                        dma_buf && get_dma_buf_unless_doomed(dma_buf)
                                        -> file_ref_get(dma_buf->file)
                                        mutex_unlock(&prime->mutex);
            mutex_lock(&prime->mutex);
            if (prime->dma_buf == dma_buf)
                    prime->dma_buf = NULL;
            mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput(), preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetimes of the dma_buf and the file are tightly
coupled.
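A minimal sketch of what the ported get_dma_buf_unless_doomed() boils
down to (simplified; the exact in-tree body may differ). It is only
called with prime->mutex held, which keeps __fput() from freeing the
file underneath it:

static bool get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
{
        /* Fails once the file's counter has already been marked dead. */
        return file_ref_get(&dmabuf->file->f_ref);
}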
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
  cnt = atomic_long_dec_return()
  -> __file_ref_put(cnt)
     if (cnt == FILE_REF_NOREF)
             atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1                                    CPU2
file_ref_put()
  cnt = atomic_long_dec_return()
  -> __file_ref_put(cnt)
     if (cnt == FILE_REF_NOREF)
                                        file_ref_get()
                                        // Brings reference back to FILE_REF_ONEREF
                                        atomic_long_add_negative()
        atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
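For context, file_ref_get() is essentially an unconditional increment
that reports failure when the counter was already negative; roughly
(simplified sketch, not necessarily the exact in-tree code):

static bool file_ref_get(file_ref_t *ref)
{
        /*
         * In this scheme FILE_REF_ONEREF is 0 and FILE_REF_NOREF is
         * -1, so an increment from FILE_REF_NOREF lands back at
         * FILE_REF_ONEREF and succeeds (the revival above), while an
         * increment from FILE_REF_DEAD stays negative and fails.
         */
        return !atomic_long_add_negative(1, &ref->refcnt);
}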
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dmabuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1                                    CPU2
file_ref_put()
  cnt = atomic_long_dec_return()
  -> __file_ref_put(cnt)
     if (cnt == FILE_REF_NOREF)
                                        file_ref_get()
                                        // Brings reference back to FILE_REF_ONEREF
                                        atomic_long_add_negative()
        atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also succeeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) calls file_ref_get() it fails because
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) call file_ref_get() under
mutexes that __fput() must also acquire, preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. If it should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
        if (cnt < 0)
                return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
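Wrapped into a complete helper (the name below is made up for
illustration, such a helper does not exist in-tree) this could read:

/*
 * Hypothetical helper: refuse to take a reference once the counter has
 * dropped to FILE_REF_NOREF or below, blocking the FILE_REF_NOREF ->
 * FILE_REF_ONEREF revival described above.
 */
static bool file_ref_get_unless_noref(file_ref_t *ref)
{
        long cnt = atomic_long_read(&ref->refcnt);

        do {
                if (cnt < 0)
                        return false;
        } while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));

        return true;
}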
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero,
which means that a concurrent file_ref_get() can return true:
CPU1                                    CPU2
__get_file_rcu()
  rcu_dereference_raw()
                                        close()
                                        [frees file]
                                        alloc_empty_file()
                                        kmem_cache_zalloc()
                                        [reallocates same file]
                                        memset(..., 0, ...)
  file_ref_get()
  [increments 0->1, returns true]
                                        init_file()
                                        file_ref_init(..., 1)
                                        [sets to 0]
  rcu_dereference_raw()
  fput()
  file_ref_put()
  [decrements 0->FILE_REF_NOREF, frees file]
                                        [UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and to immediately free it
again once it notices that the file has been recycled. This causes a
UAF for the task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually, with
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
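A rough sketch of the resulting allocation pattern (illustrative only,
not the exact alloc_empty_file()/init_file() code; the helper name and
the selection of fields are assumptions):

/*
 * Allocate without zeroing, initialize fields explicitly, and publish
 * the counter last so a concurrent file_ref_get() on a
 * SLAB_TYPESAFE_BY_RCU-recycled object never observes a memset()-zeroed
 * f_ref.
 */
static struct file *alloc_empty_file_sketch(int flags, const struct cred *cred)
{
        struct file *f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);

        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        /* ... explicit per-field initialization ... */
        f->f_flags = flags;
        f->f_cred = get_cred(cred);
        spin_lock_init(&f->f_lock);

        /* the reference count is initialized last */
        file_ref_init(&f->f_ref, 1);
        return f;
}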
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
174 lines · 3.5 KiB · C
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include <linux/iosys-map.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>

#include "i915_drv.h"
#include "gem/i915_gem_object.h"
#include "gem/i915_gem_lmem.h"
#include "shmem_utils.h"

struct file *shmem_create_from_data(const char *name, void *data, size_t len)
{
        struct file *file;
        int err;

        file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE);
        if (IS_ERR(file))
                return file;

        err = shmem_write(file, 0, data, len);
        if (err) {
                fput(file);
                return ERR_PTR(err);
        }

        return file;
}

struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
{
        enum i915_map_type map_type;
        struct file *file;
        void *ptr;

        if (i915_gem_object_is_shmem(obj)) {
                file = obj->base.filp;
                get_file(file);
                return file;
        }

        map_type = i915_gem_object_is_lmem(obj) ? I915_MAP_WC : I915_MAP_WB;
        ptr = i915_gem_object_pin_map_unlocked(obj, map_type);
        if (IS_ERR(ptr))
                return ERR_CAST(ptr);

        file = shmem_create_from_data("", ptr, obj->base.size);
        i915_gem_object_unpin_map(obj);

        return file;
}

void *shmem_pin_map(struct file *file)
{
        struct page **pages;
        size_t n_pages, i;
        void *vaddr;

        n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT;
        pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return NULL;

        for (i = 0; i < n_pages; i++) {
                pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i,
                                                       GFP_KERNEL);
                if (IS_ERR(pages[i]))
                        goto err_page;
        }

        vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL);
        if (!vaddr)
                goto err_page;
        mapping_set_unevictable(file->f_mapping);
        return vaddr;
err_page:
        while (i--)
                put_page(pages[i]);
        kvfree(pages);
        return NULL;
}

void shmem_unpin_map(struct file *file, void *ptr)
{
        mapping_clear_unevictable(file->f_mapping);
        vfree(ptr);
}

static int __shmem_rw(struct file *file, loff_t off,
                      void *ptr, size_t len,
                      bool write)
{
        unsigned long pfn;

        for (pfn = off >> PAGE_SHIFT; len; pfn++) {
                unsigned int this =
                        min_t(size_t, PAGE_SIZE - offset_in_page(off), len);
                struct page *page;
                void *vaddr;

                page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
                                                   GFP_KERNEL);
                if (IS_ERR(page))
                        return PTR_ERR(page);

                vaddr = kmap(page);
                if (write) {
                        memcpy(vaddr + offset_in_page(off), ptr, this);
                        set_page_dirty(page);
                } else {
                        memcpy(ptr, vaddr + offset_in_page(off), this);
                }
                mark_page_accessed(page);
                kunmap(page);
                put_page(page);

                len -= this;
                ptr += this;
                off = 0;
        }

        return 0;
}

int shmem_read_to_iosys_map(struct file *file, loff_t off,
                            struct iosys_map *map, size_t map_off, size_t len)
{
        unsigned long pfn;

        for (pfn = off >> PAGE_SHIFT; len; pfn++) {
                unsigned int this =
                        min_t(size_t, PAGE_SIZE - offset_in_page(off), len);
                struct page *page;
                void *vaddr;

                page = shmem_read_mapping_page_gfp(file->f_mapping, pfn,
                                                   GFP_KERNEL);
                if (IS_ERR(page))
                        return PTR_ERR(page);

                vaddr = kmap(page);
                iosys_map_memcpy_to(map, map_off, vaddr + offset_in_page(off),
                                    this);
                mark_page_accessed(page);
                kunmap(page);
                put_page(page);

                len -= this;
                map_off += this;
                off = 0;
        }

        return 0;
}

int shmem_read(struct file *file, loff_t off, void *dst, size_t len)
{
        return __shmem_rw(file, off, dst, len, false);
}

int shmem_write(struct file *file, loff_t off, void *src, size_t len)
{
        return __shmem_rw(file, off, src, len, true);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "st_shmem_utils.c"
#endif