mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-07 15:09:15 +08:00
Pull MM updates from Andrew Morton:
- "mm, swap: improve cluster scan strategy" from Kairui Song improves
performance and reduces the failure rate of swap cluster allocation
- "support large align and nid in Rust allocators" from Vitaly Wool
permits Rust allocators to set NUMA node and large alignment when
performing slub and vmalloc reallocs
- "mm/damon/vaddr: support stat-purpose DAMOS" from Yueyang Pan extends
DAMOS_STAT's handling of the DAMON operations sets for virtual
address spaces for ops-level DAMOS filters
- "execute PROCMAP_QUERY ioctl under per-vma lock" from Suren
Baghdasaryan reduces mmap_lock contention during reads of
/proc/pid/maps
- "mm/mincore: minor clean up for swap cache checking" from Kairui Song
performs some cleanup in the swap code
- "mm: vm_normal_page*() improvements" from David Hildenbrand provides
code cleanup in the pagemap code
- "add persistent huge zero folio support" from Pankaj Raghav provides
a block layer speedup by optionally making the
huge_zero_page persistent, instead of releasing it when its refcount
falls to zero
- "kho: fixes and cleanups" from Mike Rapoport adds a few touchups to
the recently added Kexec Handover feature
- "mm: make mm->flags a bitmap and 64-bit on all arches" from Lorenzo
Stoakes turns mm_struct.flags into a bitmap, to end the constant
struggle with space shortage on 32-bit conflicting with 64-bit's
needs
- "mm/swapfile.c and swap.h cleanup" from Chris Li cleans up some swap
code
- "selftests/mm: Fix false positives and skip unsupported tests" from
Donet Tom fixes a few things in our selftests code
- "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised"
from David Hildenbrand "allows individual processes to opt-out of
THP=always into THP=madvise, without affecting other workloads on the
system".
It's a long story - the [1/N] changelog spells out the considerations
- "Add and use memdesc_flags_t" from Matthew Wilcox gets us started on
the memdesc project. Please see
https://kernelnewbies.org/MatthewWilcox/Memdescs and
https://blogs.oracle.com/linux/post/introducing-memdesc
- "Tiny optimization for large read operations" from Chi Zhiling
improves the efficiency of the pagecache read path
- "Better split_huge_page_test result check" from Zi Yan improves our
folio splitting selftest code
- "test that rmap behaves as expected" from Wei Yang adds some rmap
selftests
- "remove write_cache_pages()" from Christoph Hellwig removes that
function and converts its two remaining callers
- "selftests/mm: uffd-stress fixes" from Dev Jain fixes some UFFD
selftests issues
- "introduce kernel file mapped folios" from Boris Burkov introduces
the concept of "kernel file pages". Using these permits btrfs to
account its metadata pages to the root cgroup, rather than to the
cgroups of random inappropriate tasks
- "mm/pageblock: improve readability of some pageblock handling" from
Wei Yang provides some readability improvements to the page allocator
code
- "mm/damon: support ARM32 with LPAE" from SeongJae Park teaches DAMON
to understand arm32 highmem
- "tools: testing: Use existing atomic.h for vma/maple tests" from
Brendan Jackman performs some code cleanups and deduplication under
tools/testing/
- "maple_tree: Fix testing for 32bit compiles" from Liam Howlett fixes
a couple of 32-bit issues in tools/testing/radix-tree.c
- "kasan: unify kasan_enabled() and remove arch-specific
implementations" from Sabyrzhan Tasbolatov moves KASAN arch-specific
initialization code into a common arch-neutral implementation
- "mm: remove zpool" from Johannes Weiner removes zpool - an
indirection layer which now only redirects to a single thing
(zsmalloc)
- "mm: task_stack: Stack handling cleanups" from Pasha Tatashin makes a
couple of cleanups in the fork code
- "mm: remove nth_page()" from David Hildenbrand makes rather a lot of
adjustments at various nth_page() callsites, eventually permitting
the removal of that undesirable helper function
- "introduce kasan.write_only option in hw-tags" from Yeoreum Yun
creates a KASAN read-only mode for ARM, using that architecture's
memory tagging feature. It is felt that a read-only mode KASAN is
suitable for use in production systems rather than debug-only
- "mm: hugetlb: cleanup hugetlb folio allocation" from Kefeng Wang does
some tidying in the hugetlb folio allocation code
- "mm: establish const-correctness for pointer parameters" from Max
Kellermann makes quite a number of the MM API functions more accurate
about the constness of their arguments. This was getting in the way
of subsystems (in this case CEPH) when they attempt to improve
their own const/non-const accuracy
- "Cleanup free_pages() misuse" from Vishal Moola fixes a number of
code sites which were confused over when to use free_pages() vs
__free_pages()
- "Add Rust abstraction for Maple Trees" from Alice Ryhl makes the
mapletree code accessible to Rust. Required by nouveau and by its
forthcoming successor: the new Rust Nova driver
- "selftests/mm: split_huge_page_test: split_pte_mapped_thp
improvements" from David Hildenbrand adds a fix and some cleanups to
the thp selftesting code
- "mm, swap: introduce swap table as swap cache (phase I)" from Chris
Li and Kairui Song is the first step along the path to implementing
"swap tables" - a new approach to swap allocation and state tracking
which is expected to yield speed and space improvements. This
patchset itself yields a 5-20% performance benefit in some situations
- "Some ptdesc cleanups" from Matthew Wilcox utilizes the new memdesc
layer to clean up the ptdesc code a little
- "Fix va_high_addr_switch.sh test failure" from Chunyu Hu fixes some
issues in our 5-level pagetable selftesting code
- "Minor fixes for memory allocation profiling" from Suren Baghdasaryan
addresses a couple of minor issues in the relatively new memory
allocation profiling feature
- "Small cleanups" from Matthew Wilcox has a few cleanups in
preparation for more memdesc work
- "mm/damon: add addr_unit for DAMON_LRU_SORT and DAMON_RECLAIM" from
Quanmin Yan makes some changes to DAMON in furtherance of supporting
arm highmem
- "selftests/mm: Add -Wunreachable-code and fix warnings" from Muhammad
Anjum adds that compiler check to selftests code and fixes the
fallout, by removing dead code
- "Improvements to Victim Process Thawing and OOM Reaper Traversal
Order" from zhongjinji makes a number of improvements in the OOM
killer: mainly thawing a more appropriate group of victim threads so
they can release resources
- "mm/damon: misc fixups and improvements for 6.18" from SeongJae Park
is a bunch of small and unrelated fixups for DAMON
- "mm/damon: define and use DAMON initialization check function" from
SeongJae Park implements reliability and maintainability improvements
to a recently-added bug fix
- "mm/damon/stat: expose auto-tuned intervals and non-idle ages" from
SeongJae Park provides additional transparency to userspace clients
of the DAMON_STAT information
- "Expand scope of khugepaged anonymous collapse" from Dev Jain removes
some constraints on khugepaged's collapsing of anon VMAs. It also
increases the success rate of MADV_COLLAPSE against an anon vma
- "mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()"
from Lorenzo Stoakes moves us further towards removal of
file_operations.mmap(). This patchset concentrates upon clearing up
the treatment of stacked filesystems
- "mm: Improve mlock tracking for large folios" from Kiryl Shutsemau
provides some fixes and improvements to mlock's tracking of large
folios. /proc/meminfo's "Mlocked" field became more accurate
- "mm/ksm: Fix incorrect accounting of KSM counters during fork" from
Donet Tom fixes several user-visible KSM stats inaccuracies across
forks and adds selftest code to verify these counters
- "mm_slot: fix the usage of mm_slot_entry" from Wei Yang addresses
some potential but presently benign issues in KSM's mm_slot handling
* tag 'mm-stable-2025-10-01-19-00' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (372 commits)
mm: swap: check for stable address space before operating on the VMA
mm: convert folio_page() back to a macro
mm/khugepaged: use start_addr/addr for improved readability
hugetlbfs: skip VMAs without shareable locks in hugetlb_vmdelete_list
alloc_tag: fix boot failure due to NULL pointer dereference
mm: silence data-race in update_hiwater_rss
mm/memory-failure: don't select MEMORY_ISOLATION
mm/khugepaged: remove definition of struct khugepaged_mm_slot
mm/ksm: get mm_slot by mm_slot_entry() when slot is !NULL
hugetlb: increase number of reserving hugepages via cmdline
selftests/mm: add fork inheritance test for ksm_merging_pages counter
mm/ksm: fix incorrect KSM counter handling in mm_struct during fork
drivers/base/node: fix double free in register_one_node()
mm: remove PMD alignment constraint in execmem_vmalloc()
mm/memory_hotplug: fix typo 'esecially' -> 'especially'
mm/rmap: improve mlock tracking for large folios
mm/filemap: map entire large folio faultaround
mm/fault: try to map the entire file folio in finish_fault()
mm/rmap: mlock large folios in try_to_unmap_one()
mm/rmap: fix a mlock race condition in folio_referenced_one()
...
1226 lines
30 KiB
C
1226 lines
30 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/wait.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/blk-cgroup.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/module.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/device.h>
|
|
#include <trace/events/writeback.h>
|
|
#include "internal.h"
|
|
|
|
struct backing_dev_info noop_backing_dev_info;
|
|
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
|
|
|
|
static const char *bdi_unknown_name = "(unknown)";
|
|
|
|
/*
|
|
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
|
|
* reader side locking.
|
|
*/
|
|
DEFINE_SPINLOCK(bdi_lock);
|
|
static u64 bdi_id_cursor;
|
|
static struct rb_root bdi_tree = RB_ROOT;
|
|
LIST_HEAD(bdi_list);
|
|
|
|
/* bdi_wq serves all asynchronous writeback tasks */
|
|
struct workqueue_struct *bdi_wq;
|
|
|
|
#ifdef CONFIG_DEBUG_FS
|
|
#include <linux/debugfs.h>
|
|
#include <linux/seq_file.h>
|
|
|
|
/*
 * Snapshot of writeback counters reported through debugfs.
 * Filled in by collect_wb_stats(); dirty_thresh is an input that the
 * caller sets before collection.
 */
struct wb_stats {
	unsigned long nr_dirty;		/* entries on wb->b_dirty */
	unsigned long nr_io;		/* entries on wb->b_io */
	unsigned long nr_more_io;	/* entries on wb->b_more_io */
	unsigned long nr_dirty_time;	/* I_DIRTY_TIME entries on wb->b_dirty_time */
	unsigned long nr_writeback;	/* sum of wb_stat(WB_WRITEBACK) */
	unsigned long nr_reclaimable;	/* sum of wb_stat(WB_RECLAIMABLE) */
	unsigned long nr_dirtied;	/* sum of wb_stat(WB_DIRTIED) */
	unsigned long nr_written;	/* sum of wb_stat(WB_WRITTEN) */
	unsigned long dirty_thresh;	/* threshold handed to wb_calc_thresh() */
	unsigned long wb_thresh;	/* sum of wb_calc_thresh() results */
};
|
|
|
|
static struct dentry *bdi_debug_root;
|
|
|
|
static void bdi_debug_init(void)
|
|
{
|
|
bdi_debug_root = debugfs_create_dir("bdi", NULL);
|
|
}
|
|
|
|
static void collect_wb_stats(struct wb_stats *stats,
|
|
struct bdi_writeback *wb)
|
|
{
|
|
struct inode *inode;
|
|
|
|
spin_lock(&wb->list_lock);
|
|
list_for_each_entry(inode, &wb->b_dirty, i_io_list)
|
|
stats->nr_dirty++;
|
|
list_for_each_entry(inode, &wb->b_io, i_io_list)
|
|
stats->nr_io++;
|
|
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
|
|
stats->nr_more_io++;
|
|
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
|
|
if (inode->i_state & I_DIRTY_TIME)
|
|
stats->nr_dirty_time++;
|
|
spin_unlock(&wb->list_lock);
|
|
|
|
stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
|
|
stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
|
|
stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
|
|
stats->nr_written += wb_stat(wb, WB_WRITTEN);
|
|
stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
|
|
}
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
static void bdi_collect_stats(struct backing_dev_info *bdi,
|
|
struct wb_stats *stats)
|
|
{
|
|
struct bdi_writeback *wb;
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
|
|
if (!wb_tryget(wb))
|
|
continue;
|
|
|
|
collect_wb_stats(stats, wb);
|
|
wb_put(wb);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
#else
|
|
static void bdi_collect_stats(struct backing_dev_info *bdi,
|
|
struct wb_stats *stats)
|
|
{
|
|
collect_wb_stats(stats, &bdi->wb);
|
|
}
|
|
#endif
|
|
|
|
static int bdi_debug_stats_show(struct seq_file *m, void *v)
|
|
{
|
|
struct backing_dev_info *bdi = m->private;
|
|
unsigned long background_thresh;
|
|
unsigned long dirty_thresh;
|
|
struct wb_stats stats;
|
|
unsigned long tot_bw;
|
|
|
|
global_dirty_limits(&background_thresh, &dirty_thresh);
|
|
|
|
memset(&stats, 0, sizeof(stats));
|
|
stats.dirty_thresh = dirty_thresh;
|
|
bdi_collect_stats(bdi, &stats);
|
|
tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
|
|
|
|
seq_printf(m,
|
|
"BdiWriteback: %10lu kB\n"
|
|
"BdiReclaimable: %10lu kB\n"
|
|
"BdiDirtyThresh: %10lu kB\n"
|
|
"DirtyThresh: %10lu kB\n"
|
|
"BackgroundThresh: %10lu kB\n"
|
|
"BdiDirtied: %10lu kB\n"
|
|
"BdiWritten: %10lu kB\n"
|
|
"BdiWriteBandwidth: %10lu kBps\n"
|
|
"b_dirty: %10lu\n"
|
|
"b_io: %10lu\n"
|
|
"b_more_io: %10lu\n"
|
|
"b_dirty_time: %10lu\n"
|
|
"bdi_list: %10u\n"
|
|
"state: %10lx\n",
|
|
K(stats.nr_writeback),
|
|
K(stats.nr_reclaimable),
|
|
K(stats.wb_thresh),
|
|
K(dirty_thresh),
|
|
K(background_thresh),
|
|
K(stats.nr_dirtied),
|
|
K(stats.nr_written),
|
|
K(tot_bw),
|
|
stats.nr_dirty,
|
|
stats.nr_io,
|
|
stats.nr_more_io,
|
|
stats.nr_dirty_time,
|
|
!list_empty(&bdi->bdi_list), bdi->wb.state);
|
|
|
|
return 0;
|
|
}
|
|
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
|
|
|
|
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
|
|
struct wb_stats *stats)
|
|
{
|
|
|
|
seq_printf(m,
|
|
"WbCgIno: %10lu\n"
|
|
"WbWriteback: %10lu kB\n"
|
|
"WbReclaimable: %10lu kB\n"
|
|
"WbDirtyThresh: %10lu kB\n"
|
|
"WbDirtied: %10lu kB\n"
|
|
"WbWritten: %10lu kB\n"
|
|
"WbWriteBandwidth: %10lu kBps\n"
|
|
"b_dirty: %10lu\n"
|
|
"b_io: %10lu\n"
|
|
"b_more_io: %10lu\n"
|
|
"b_dirty_time: %10lu\n"
|
|
"state: %10lx\n\n",
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
cgroup_ino(wb->memcg_css->cgroup),
|
|
#else
|
|
1ul,
|
|
#endif
|
|
K(stats->nr_writeback),
|
|
K(stats->nr_reclaimable),
|
|
K(stats->wb_thresh),
|
|
K(stats->nr_dirtied),
|
|
K(stats->nr_written),
|
|
K(wb->avg_write_bandwidth),
|
|
stats->nr_dirty,
|
|
stats->nr_io,
|
|
stats->nr_more_io,
|
|
stats->nr_dirty_time,
|
|
wb->state);
|
|
}
|
|
|
|
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
|
|
{
|
|
struct backing_dev_info *bdi = m->private;
|
|
unsigned long background_thresh;
|
|
unsigned long dirty_thresh;
|
|
struct bdi_writeback *wb;
|
|
|
|
global_dirty_limits(&background_thresh, &dirty_thresh);
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
|
|
struct wb_stats stats = { .dirty_thresh = dirty_thresh };
|
|
|
|
if (!wb_tryget(wb))
|
|
continue;
|
|
|
|
collect_wb_stats(&stats, wb);
|
|
|
|
/*
|
|
* Calculate thresh of wb in writeback cgroup which is min of
|
|
* thresh in global domain and thresh in cgroup domain. Drop
|
|
* rcu lock because cgwb_calc_thresh may sleep in
|
|
* cgroup_rstat_flush. We can do so here because we have a ref.
|
|
*/
|
|
if (mem_cgroup_wb_domain(wb)) {
|
|
rcu_read_unlock();
|
|
stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
|
|
rcu_read_lock();
|
|
}
|
|
|
|
wb_stats_show(m, wb, &stats);
|
|
|
|
wb_put(wb);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return 0;
|
|
}
|
|
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
|
|
|
|
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
|
|
{
|
|
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
|
|
|
|
debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
|
|
&bdi_debug_stats_fops);
|
|
debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
|
|
&cgwb_debug_stats_fops);
|
|
}
|
|
|
|
static void bdi_debug_unregister(struct backing_dev_info *bdi)
|
|
{
|
|
debugfs_remove_recursive(bdi->debug_dir);
|
|
}
|
|
#else /* CONFIG_DEBUG_FS */
|
|
/* Empty stand-ins used when CONFIG_DEBUG_FS is disabled. */
static inline void bdi_debug_init(void)
{
}

static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}

static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
|
|
#endif /* CONFIG_DEBUG_FS */
|
|
|
|
/*
 * sysfs store handler for "read_ahead_kb".
 *
 * Parses @buf as a decimal number of kilobytes and stores it in
 * bdi->ra_pages, converted to pages.  Returns @count on success or the
 * negative error from kstrtoul().
 */
static ssize_t read_ahead_kb_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long kb;
	ssize_t err;

	err = kstrtoul(buf, 10, &kb);
	if (err < 0)
		return err;

	/* kB -> pages */
	bdi->ra_pages = kb >> (PAGE_SHIFT - 10);

	return count;
}
|
|
|
|
/*
 * BDI_SHOW() - generate the sysfs show handler and attribute for @name.
 *
 * Expands to name##_show(), which prints @expr as a signed decimal
 * followed by a newline, and to a read-write device attribute
 * dev_attr_##name.  @expr is evaluated with a local
 * "struct backing_dev_info *bdi" in scope.  DEVICE_ATTR_RW() also refers
 * to name##_store(), so that function must be defined before the macro
 * is used.
 *
 * @expr is parenthesized so the cast applies to the whole expression
 * instead of only to its first operand.
 */
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)(expr));		\
}									\
static DEVICE_ATTR_RW(name);
|
|
|
|
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
|
|
|
|
/*
 * sysfs store handler for "min_ratio": parse a decimal unsigned int from
 * @buf and pass it to bdi_set_min_ratio().  Returns @count on success,
 * otherwise the error from parsing or from the setter.
 */
static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_min_ratio(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)
|
|
|
|
/*
 * sysfs store handler for "min_ratio_fine": parse a decimal unsigned int
 * from @buf and pass it to bdi_set_min_ratio_no_scale().  Returns @count
 * on success, otherwise the error from parsing or from the setter.
 */
static ssize_t min_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_min_ratio_no_scale(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
BDI_SHOW(min_ratio_fine, bdi->min_ratio)
|
|
|
|
/*
 * sysfs store handler for "max_ratio": parse a decimal unsigned int from
 * @buf and pass it to bdi_set_max_ratio().  Returns @count on success,
 * otherwise the error from parsing or from the setter.
 */
static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_max_ratio(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)
|
|
|
|
/*
 * sysfs store handler for "max_ratio_fine": parse a decimal unsigned int
 * from @buf and pass it to bdi_set_max_ratio_no_scale().  Returns @count
 * on success, otherwise the error from parsing or from the setter.
 */
static ssize_t max_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_max_ratio_no_scale(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
BDI_SHOW(max_ratio_fine, bdi->max_ratio)
|
|
|
|
static ssize_t min_bytes_show(struct device *dev,
|
|
struct device_attribute *attr,
|
|
char *buf)
|
|
{
|
|
struct backing_dev_info *bdi = dev_get_drvdata(dev);
|
|
|
|
return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
|
|
}
|
|
|
|
/*
 * sysfs store handler for "min_bytes": parse a decimal 64-bit value from
 * @buf and pass it to bdi_set_min_bytes().  Returns @count on success,
 * otherwise the error from parsing or from the setter.
 */
static ssize_t min_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 val;
	ssize_t err;

	err = kstrtoull(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_min_bytes(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
static DEVICE_ATTR_RW(min_bytes);
|
|
|
|
static ssize_t max_bytes_show(struct device *dev,
|
|
struct device_attribute *attr,
|
|
char *buf)
|
|
{
|
|
struct backing_dev_info *bdi = dev_get_drvdata(dev);
|
|
|
|
return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
|
|
}
|
|
|
|
/*
 * sysfs store handler for "max_bytes": parse a decimal 64-bit value from
 * @buf and pass it to bdi_set_max_bytes().  Returns @count on success,
 * otherwise the error from parsing or from the setter.
 */
static ssize_t max_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 val;
	ssize_t err;

	err = kstrtoull(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_max_bytes(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
static DEVICE_ATTR_RW(max_bytes);
|
|
|
|
/*
 * sysfs show handler kept for the retired "stable_pages_required"
 * attribute: warns once on the device and always reports 0.
 */
static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	const int value = 0;

	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");

	return sysfs_emit(buf, "%d\n", value);
}
|
|
static DEVICE_ATTR_RO(stable_pages_required);
|
|
|
|
/*
 * sysfs store handler for "strict_limit": parse a decimal unsigned int
 * from @buf and pass it to bdi_set_strict_limit().  Returns @count on
 * success, otherwise the error from parsing or from the setter.
 */
static ssize_t strict_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_strict_limit(bdi, val);
	if (err)
		return err;

	return count;
}
|
|
|
|
static ssize_t strict_limit_show(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct backing_dev_info *bdi = dev_get_drvdata(dev);
|
|
|
|
return sysfs_emit(buf, "%d\n",
|
|
!!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
|
|
}
|
|
static DEVICE_ATTR_RW(strict_limit);
|
|
|
|
static struct attribute *bdi_dev_attrs[] = {
|
|
&dev_attr_read_ahead_kb.attr,
|
|
&dev_attr_min_ratio.attr,
|
|
&dev_attr_min_ratio_fine.attr,
|
|
&dev_attr_max_ratio.attr,
|
|
&dev_attr_max_ratio_fine.attr,
|
|
&dev_attr_min_bytes.attr,
|
|
&dev_attr_max_bytes.attr,
|
|
&dev_attr_stable_pages_required.attr,
|
|
&dev_attr_strict_limit.attr,
|
|
NULL,
|
|
};
|
|
ATTRIBUTE_GROUPS(bdi_dev);
|
|
|
|
static const struct class bdi_class = {
|
|
.name = "bdi",
|
|
.dev_groups = bdi_dev_groups,
|
|
};
|
|
|
|
static __init int bdi_class_init(void)
|
|
{
|
|
int ret;
|
|
|
|
ret = class_register(&bdi_class);
|
|
if (ret)
|
|
return ret;
|
|
|
|
bdi_debug_init();
|
|
|
|
return 0;
|
|
}
|
|
postcore_initcall(bdi_class_init);
|
|
|
|
/*
 * Allocate the "writeback" workqueue and publish it in bdi_wq.
 * Returns 0 on success and -ENOMEM when the allocation fails.
 */
static int __init default_bdi_init(void)
{
	const unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS;

	bdi_wq = alloc_workqueue("writeback", wq_flags, 0);

	return bdi_wq ? 0 : -ENOMEM;
}
|
|
subsys_initcall(default_bdi_init);
|
|
|
|
static void wb_update_bandwidth_workfn(struct work_struct *work)
|
|
{
|
|
struct bdi_writeback *wb = container_of(to_delayed_work(work),
|
|
struct bdi_writeback, bw_dwork);
|
|
|
|
wb_update_bandwidth(wb);
|
|
}
|
|
|
|
/*
|
|
* Initial write bandwidth: 100 MB/s
|
|
*/
|
|
#define INIT_BW MB_TO_PAGES(100)
|
|
|
|
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
|
|
gfp_t gfp)
|
|
{
|
|
int err;
|
|
|
|
memset(wb, 0, sizeof(*wb));
|
|
|
|
wb->bdi = bdi;
|
|
wb->last_old_flush = jiffies;
|
|
INIT_LIST_HEAD(&wb->b_dirty);
|
|
INIT_LIST_HEAD(&wb->b_io);
|
|
INIT_LIST_HEAD(&wb->b_more_io);
|
|
INIT_LIST_HEAD(&wb->b_dirty_time);
|
|
spin_lock_init(&wb->list_lock);
|
|
|
|
atomic_set(&wb->writeback_inodes, 0);
|
|
wb->bw_time_stamp = jiffies;
|
|
wb->balanced_dirty_ratelimit = INIT_BW;
|
|
wb->dirty_ratelimit = INIT_BW;
|
|
wb->write_bandwidth = INIT_BW;
|
|
wb->avg_write_bandwidth = INIT_BW;
|
|
|
|
spin_lock_init(&wb->work_lock);
|
|
INIT_LIST_HEAD(&wb->work_list);
|
|
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
|
|
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
|
|
|
|
err = fprop_local_init_percpu(&wb->completions, gfp);
|
|
if (err)
|
|
return err;
|
|
|
|
err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
|
|
if (err)
|
|
fprop_local_destroy_percpu(&wb->completions);
|
|
|
|
return err;
|
|
}
|
|
|
|
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
|
|
|
|
/*
|
|
* Remove bdi from the global list and shutdown any threads we have running
|
|
*/
|
|
static void wb_shutdown(struct bdi_writeback *wb)
|
|
{
|
|
/* Make sure nobody queues further work */
|
|
spin_lock_irq(&wb->work_lock);
|
|
if (!test_and_clear_bit(WB_registered, &wb->state)) {
|
|
spin_unlock_irq(&wb->work_lock);
|
|
return;
|
|
}
|
|
spin_unlock_irq(&wb->work_lock);
|
|
|
|
cgwb_remove_from_bdi_list(wb);
|
|
/*
|
|
* Drain work list and shutdown the delayed_work. !WB_registered
|
|
* tells wb_workfn() that @wb is dying and its work_list needs to
|
|
* be drained no matter what.
|
|
*/
|
|
mod_delayed_work(bdi_wq, &wb->dwork, 0);
|
|
flush_delayed_work(&wb->dwork);
|
|
WARN_ON(!list_empty(&wb->work_list));
|
|
flush_delayed_work(&wb->bw_dwork);
|
|
}
|
|
|
|
static void wb_exit(struct bdi_writeback *wb)
|
|
{
|
|
WARN_ON(delayed_work_pending(&wb->dwork));
|
|
percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
|
|
fprop_local_destroy_percpu(&wb->completions);
|
|
}
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
#include <linux/memcontrol.h>
|
|
|
|
/*
 * cgwb_lock guards bdi->cgwb_tree, blkcg->cgwb_list, memcg->cgwb_list
 * and offline_cgwbs.  bdi->cgwb_tree is additionally RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

/* wbs that cgwb_kill() has taken offline but which are not yet released */
static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
|
|
|
|
static void cgwb_free_rcu(struct rcu_head *rcu_head)
|
|
{
|
|
struct bdi_writeback *wb = container_of(rcu_head,
|
|
struct bdi_writeback, rcu);
|
|
|
|
percpu_ref_exit(&wb->refcnt);
|
|
kfree(wb);
|
|
}
|
|
|
|
static void cgwb_release_workfn(struct work_struct *work)
|
|
{
|
|
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
|
|
release_work);
|
|
struct backing_dev_info *bdi = wb->bdi;
|
|
|
|
mutex_lock(&wb->bdi->cgwb_release_mutex);
|
|
wb_shutdown(wb);
|
|
|
|
css_put(wb->memcg_css);
|
|
css_put(wb->blkcg_css);
|
|
mutex_unlock(&wb->bdi->cgwb_release_mutex);
|
|
|
|
/* triggers blkg destruction if no online users left */
|
|
blkcg_unpin_online(wb->blkcg_css);
|
|
|
|
fprop_local_destroy_percpu(&wb->memcg_completions);
|
|
|
|
spin_lock_irq(&cgwb_lock);
|
|
list_del(&wb->offline_node);
|
|
spin_unlock_irq(&cgwb_lock);
|
|
|
|
wb_exit(wb);
|
|
bdi_put(bdi);
|
|
WARN_ON_ONCE(!list_empty(&wb->b_attached));
|
|
WARN_ON_ONCE(work_pending(&wb->switch_work));
|
|
call_rcu(&wb->rcu, cgwb_free_rcu);
|
|
}
|
|
|
|
static void cgwb_release(struct percpu_ref *refcnt)
|
|
{
|
|
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
|
|
refcnt);
|
|
queue_work(cgwb_release_wq, &wb->release_work);
|
|
}
|
|
|
|
static void cgwb_kill(struct bdi_writeback *wb)
|
|
{
|
|
lockdep_assert_held(&cgwb_lock);
|
|
|
|
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
|
|
list_del(&wb->memcg_node);
|
|
list_del(&wb->blkcg_node);
|
|
list_add(&wb->offline_node, &offline_cgwbs);
|
|
percpu_ref_kill(&wb->refcnt);
|
|
}
|
|
|
|
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
|
|
{
|
|
spin_lock_irq(&cgwb_lock);
|
|
list_del_rcu(&wb->bdi_node);
|
|
spin_unlock_irq(&cgwb_lock);
|
|
}
|
|
|
|
/*
 * Create the wb of @bdi for @memcg_css unless a matching one exists.
 *
 * Returns 0 when a usable wb already exists, when a new one was linked,
 * or when another instance won the insertion race (-EEXIST is folded
 * into 0).  Returns -ENOMEM or another allocation error on failure, and
 * -ENODEV when the bdi, the memcg or the blkcg is no longer online.
 * The wb itself is not returned; callers look it up afterwards.
 */
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css);
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	/* this reference on blkcg_css is dropped at out_put */
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);

	/* an existing, matching wb means there is nothing to create */
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
	init_llist_head(&wb->switch_wbs_ctxs);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb carries the registered state of the whole bdi, and
	 * the next pointers of memcg_cgwb_list and blkcg_cgwb_list tell
	 * whether those cgroups are still online.  @wb is linked only if
	 * all three are alive; see wb_memcg_offline() and
	 * wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);

	if (!ret)
		goto out_put;

	/* losing the insertion race is not an error for the caller */
	if (ret == -EEXIST)
		ret = 0;

	/* unwind everything set up for the wb that was not linked */
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}
|
|
|
|
/**
|
|
* wb_get_lookup - get wb for a given memcg
|
|
* @bdi: target bdi
|
|
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
|
|
*
|
|
* Try to get the wb for @memcg_css on @bdi. The returned wb has its
|
|
* refcount incremented.
|
|
*
|
|
* This function uses css_get() on @memcg_css and thus expects its refcnt
|
|
* to be positive on invocation. IOW, rcu_read_lock() protection on
|
|
* @memcg_css isn't enough. try_get it before calling this function.
|
|
*
|
|
* A wb is keyed by its associated memcg. As blkcg implicitly enables
|
|
* memcg on the default hierarchy, memcg association is guaranteed to be
|
|
* more specific (equal or descendant to the associated blkcg) and thus can
|
|
* identify both the memcg and blkcg associations.
|
|
*
|
|
* Because the blkcg associated with a memcg may change as blkcg is enabled
|
|
* and disabled closer to root in the hierarchy, each wb keeps track of
|
|
* both the memcg and blkcg associated with it and verifies the blkcg on
|
|
* each lookup. On mismatch, the existing wb is discarded and a new one is
|
|
* created.
|
|
*/
|
|
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
|
|
struct cgroup_subsys_state *memcg_css)
|
|
{
|
|
struct bdi_writeback *wb;
|
|
|
|
if (!memcg_css->parent)
|
|
return &bdi->wb;
|
|
|
|
rcu_read_lock();
|
|
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
|
|
if (wb) {
|
|
struct cgroup_subsys_state *blkcg_css;
|
|
|
|
/* see whether the blkcg association has changed */
|
|
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
|
|
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
|
|
wb = NULL;
|
|
css_put(blkcg_css);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return wb;
|
|
}
|
|
|
|
/**
|
|
* wb_get_create - get wb for a given memcg, create if necessary
|
|
* @bdi: target bdi
|
|
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
|
|
* @gfp: allocation mask to use
|
|
*
|
|
* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
|
|
* create one. See wb_get_lookup() for more details.
|
|
*/
|
|
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
|
|
struct cgroup_subsys_state *memcg_css,
|
|
gfp_t gfp)
|
|
{
|
|
struct bdi_writeback *wb;
|
|
|
|
might_alloc(gfp);
|
|
|
|
do {
|
|
wb = wb_get_lookup(bdi, memcg_css);
|
|
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
|
|
|
|
return wb;
|
|
}
|
|
|
|
static int cgwb_bdi_init(struct backing_dev_info *bdi)
|
|
{
|
|
int ret;
|
|
|
|
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
|
|
mutex_init(&bdi->cgwb_release_mutex);
|
|
init_rwsem(&bdi->wb_switch_rwsem);
|
|
|
|
ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
|
|
if (!ret) {
|
|
bdi->wb.memcg_css = &root_mem_cgroup->css;
|
|
bdi->wb.blkcg_css = blkcg_root_css;
|
|
INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
|
|
init_llist_head(&bdi->wb.switch_wbs_ctxs);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
|
|
{
|
|
struct radix_tree_iter iter;
|
|
void **slot;
|
|
struct bdi_writeback *wb;
|
|
|
|
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
|
|
|
|
spin_lock_irq(&cgwb_lock);
|
|
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
|
|
cgwb_kill(*slot);
|
|
spin_unlock_irq(&cgwb_lock);
|
|
|
|
mutex_lock(&bdi->cgwb_release_mutex);
|
|
spin_lock_irq(&cgwb_lock);
|
|
while (!list_empty(&bdi->wb_list)) {
|
|
wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
|
|
bdi_node);
|
|
spin_unlock_irq(&cgwb_lock);
|
|
wb_shutdown(wb);
|
|
spin_lock_irq(&cgwb_lock);
|
|
}
|
|
spin_unlock_irq(&cgwb_lock);
|
|
mutex_unlock(&bdi->cgwb_release_mutex);
|
|
}
|
|
|
|
/*
|
|
* cleanup_offline_cgwbs_workfn - try to release dying cgwbs
|
|
*
|
|
* Try to release dying cgwbs by switching attached inodes to the nearest
|
|
* living ancestor's writeback. Processed wbs are placed at the end
|
|
* of the list to guarantee the forward progress.
|
|
*/
|
|
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
|
|
{
|
|
struct bdi_writeback *wb;
|
|
LIST_HEAD(processed);
|
|
|
|
spin_lock_irq(&cgwb_lock);
|
|
|
|
while (!list_empty(&offline_cgwbs)) {
|
|
wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
|
|
offline_node);
|
|
list_move(&wb->offline_node, &processed);
|
|
|
|
/*
|
|
* If wb is dirty, cleaning up the writeback by switching
|
|
* attached inodes will result in an effective removal of any
|
|
* bandwidth restrictions, which isn't the goal. Instead,
|
|
* it can be postponed until the next time, when all io
|
|
* will be likely completed. If in the meantime some inodes
|
|
* will get re-dirtied, they should be eventually switched to
|
|
* a new cgwb.
|
|
*/
|
|
if (wb_has_dirty_io(wb))
|
|
continue;
|
|
|
|
if (!wb_tryget(wb))
|
|
continue;
|
|
|
|
spin_unlock_irq(&cgwb_lock);
|
|
while (cleanup_offline_cgwb(wb))
|
|
cond_resched();
|
|
spin_lock_irq(&cgwb_lock);
|
|
|
|
wb_put(wb);
|
|
}
|
|
|
|
if (!list_empty(&processed))
|
|
list_splice_tail(&processed, &offline_cgwbs);
|
|
|
|
spin_unlock_irq(&cgwb_lock);
|
|
}
|
|
|
|
/**
|
|
* wb_memcg_offline - kill all wb's associated with a memcg being offlined
|
|
* @memcg: memcg being offlined
|
|
*
|
|
* Also prevents creation of any new wb's associated with @memcg.
|
|
*/
|
|
void wb_memcg_offline(struct mem_cgroup *memcg)
|
|
{
|
|
struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
|
|
struct bdi_writeback *wb, *next;
|
|
|
|
spin_lock_irq(&cgwb_lock);
|
|
list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
|
|
cgwb_kill(wb);
|
|
memcg_cgwb_list->next = NULL; /* prevent new wb's */
|
|
spin_unlock_irq(&cgwb_lock);
|
|
|
|
queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
|
|
}
|
|
|
|
/**
|
|
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
|
|
* @css: blkcg being offlined
|
|
*
|
|
* Also prevents creation of any new wb's associated with @blkcg.
|
|
*/
|
|
void wb_blkcg_offline(struct cgroup_subsys_state *css)
|
|
{
|
|
struct bdi_writeback *wb, *next;
|
|
struct list_head *list = blkcg_get_cgwb_list(css);
|
|
|
|
spin_lock_irq(&cgwb_lock);
|
|
list_for_each_entry_safe(wb, next, list, blkcg_node)
|
|
cgwb_kill(wb);
|
|
list->next = NULL; /* prevent new wb's */
|
|
spin_unlock_irq(&cgwb_lock);
|
|
}
|
|
|
|
static void cgwb_bdi_register(struct backing_dev_info *bdi)
|
|
{
|
|
spin_lock_irq(&cgwb_lock);
|
|
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
|
|
spin_unlock_irq(&cgwb_lock);
|
|
}
|
|
|
|
/*
 * Allocate the workqueue used for cgwb release work.
 *
 * Many release work items may be pending concurrently and could overwhelm
 * system_wq, so they get a workqueue of their own with limited concurrency;
 * there is no point in executing many of them in parallel.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated.
 */
static int __init cgwb_init(void)
{
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);

	return cgwb_release_wq ? 0 : -ENOMEM;
}
subsys_initcall(cgwb_init);
|
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
static int cgwb_bdi_init(struct backing_dev_info *bdi)
|
|
{
|
|
return wb_init(&bdi->wb, bdi, GFP_KERNEL);
|
|
}
|
|
|
|
/* !CONFIG_CGROUP_WRITEBACK variant: intentionally a no-op. */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
}
|
|
|
|
static void cgwb_bdi_register(struct backing_dev_info *bdi)
|
|
{
|
|
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
|
|
}
|
|
|
|
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
|
|
{
|
|
list_del_rcu(&wb->bdi_node);
|
|
}
|
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
int bdi_init(struct backing_dev_info *bdi)
|
|
{
|
|
bdi->dev = NULL;
|
|
|
|
kref_init(&bdi->refcnt);
|
|
bdi->min_ratio = 0;
|
|
bdi->max_ratio = 100 * BDI_RATIO_SCALE;
|
|
bdi->max_prop_frac = FPROP_FRAC_BASE;
|
|
INIT_LIST_HEAD(&bdi->bdi_list);
|
|
INIT_LIST_HEAD(&bdi->wb_list);
|
|
init_waitqueue_head(&bdi->wb_waitq);
|
|
bdi->last_bdp_sleep = jiffies;
|
|
|
|
return cgwb_bdi_init(bdi);
|
|
}
|
|
|
|
struct backing_dev_info *bdi_alloc(int node_id)
|
|
{
|
|
struct backing_dev_info *bdi;
|
|
|
|
bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
|
|
if (!bdi)
|
|
return NULL;
|
|
|
|
if (bdi_init(bdi)) {
|
|
kfree(bdi);
|
|
return NULL;
|
|
}
|
|
bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
|
|
bdi->ra_pages = VM_READAHEAD_PAGES;
|
|
bdi->io_pages = VM_READAHEAD_PAGES;
|
|
timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
|
|
return bdi;
|
|
}
|
|
EXPORT_SYMBOL(bdi_alloc);
|
|
|
|
/*
 * Descend bdi_tree looking for @id.  Must be called with bdi_lock held.
 *
 * Returns the address of the link where the search stopped: *link is the
 * matching node if @id is present, otherwise it is NULL and marks the spot
 * where a node with @id would be linked.  If @parentp is non-NULL it
 * receives the last node visited (the matching node itself on a hit, NULL
 * for an empty tree).
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **link = &bdi_tree.rb_node;
	struct rb_node *last = NULL;

	lockdep_assert_held(&bdi_lock);

	while (*link) {
		struct backing_dev_info *cur;

		last = *link;
		cur = rb_entry(last, struct backing_dev_info, rb_node);

		if (cur->id == id)
			break;

		/* Larger ids live to the right, smaller ones to the left. */
		link = (cur->id > id) ? &last->rb_left : &last->rb_right;
	}

	if (parentp)
		*parentp = last;

	return link;
}
|
|
|
|
/**
|
|
* bdi_get_by_id - lookup and get bdi from its id
|
|
* @id: bdi id to lookup
|
|
*
|
|
* Find bdi matching @id and get it. Returns NULL if the matching bdi
|
|
* doesn't exist or is already unregistered.
|
|
*/
|
|
struct backing_dev_info *bdi_get_by_id(u64 id)
|
|
{
|
|
struct backing_dev_info *bdi = NULL;
|
|
struct rb_node **p;
|
|
|
|
spin_lock_bh(&bdi_lock);
|
|
p = bdi_lookup_rb_node(id, NULL);
|
|
if (*p) {
|
|
bdi = rb_entry(*p, struct backing_dev_info, rb_node);
|
|
bdi_get(bdi);
|
|
}
|
|
spin_unlock_bh(&bdi_lock);
|
|
|
|
return bdi;
|
|
}
|
|
|
|
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
|
|
{
|
|
struct device *dev;
|
|
struct rb_node *parent, **p;
|
|
|
|
if (bdi->dev) /* The driver needs to use separate queues per device */
|
|
return 0;
|
|
|
|
vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
|
|
dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
|
|
if (IS_ERR(dev))
|
|
return PTR_ERR(dev);
|
|
|
|
cgwb_bdi_register(bdi);
|
|
bdi->dev = dev;
|
|
|
|
bdi_debug_register(bdi, dev_name(dev));
|
|
set_bit(WB_registered, &bdi->wb.state);
|
|
|
|
spin_lock_bh(&bdi_lock);
|
|
|
|
bdi->id = ++bdi_id_cursor;
|
|
|
|
p = bdi_lookup_rb_node(bdi->id, &parent);
|
|
rb_link_node(&bdi->rb_node, parent, p);
|
|
rb_insert_color(&bdi->rb_node, &bdi_tree);
|
|
|
|
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
|
|
|
|
spin_unlock_bh(&bdi_lock);
|
|
|
|
trace_writeback_bdi_register(bdi);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Variadic front end for bdi_register_va(): packs the trailing arguments
 * into a va_list and returns whatever bdi_register_va() returns.
 */
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	err = bdi_register_va(bdi, fmt, ap);
	va_end(ap);

	return err;
}
EXPORT_SYMBOL(bdi_register);
|
|
|
|
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
|
|
{
|
|
WARN_ON_ONCE(bdi->owner);
|
|
bdi->owner = owner;
|
|
get_device(owner);
|
|
}
|
|
|
|
/*
|
|
* Remove bdi from bdi_list, and ensure that it is no longer visible
|
|
*/
|
|
static void bdi_remove_from_list(struct backing_dev_info *bdi)
|
|
{
|
|
spin_lock_bh(&bdi_lock);
|
|
rb_erase(&bdi->rb_node, &bdi_tree);
|
|
list_del_rcu(&bdi->bdi_list);
|
|
spin_unlock_bh(&bdi_lock);
|
|
|
|
synchronize_rcu_expedited();
|
|
}
|
|
|
|
void bdi_unregister(struct backing_dev_info *bdi)
|
|
{
|
|
timer_delete_sync(&bdi->laptop_mode_wb_timer);
|
|
|
|
/* make sure nobody finds us on the bdi_list anymore */
|
|
bdi_remove_from_list(bdi);
|
|
wb_shutdown(&bdi->wb);
|
|
cgwb_bdi_unregister(bdi);
|
|
|
|
/*
|
|
* If this BDI's min ratio has been set, use bdi_set_min_ratio() to
|
|
* update the global bdi_min_ratio.
|
|
*/
|
|
if (bdi->min_ratio)
|
|
bdi_set_min_ratio(bdi, 0);
|
|
|
|
if (bdi->dev) {
|
|
bdi_debug_unregister(bdi);
|
|
device_unregister(bdi->dev);
|
|
bdi->dev = NULL;
|
|
}
|
|
|
|
if (bdi->owner) {
|
|
put_device(bdi->owner);
|
|
bdi->owner = NULL;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(bdi_unregister);
|
|
|
|
static void release_bdi(struct kref *ref)
|
|
{
|
|
struct backing_dev_info *bdi =
|
|
container_of(ref, struct backing_dev_info, refcnt);
|
|
|
|
WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
|
|
WARN_ON_ONCE(bdi->dev);
|
|
wb_exit(&bdi->wb);
|
|
kfree(bdi);
|
|
}
|
|
|
|
void bdi_put(struct backing_dev_info *bdi)
|
|
{
|
|
kref_put(&bdi->refcnt, release_bdi);
|
|
}
|
|
EXPORT_SYMBOL(bdi_put);
|
|
|
|
struct backing_dev_info *inode_to_bdi(struct inode *inode)
|
|
{
|
|
struct super_block *sb;
|
|
|
|
if (!inode)
|
|
return &noop_backing_dev_info;
|
|
|
|
sb = inode->i_sb;
|
|
#ifdef CONFIG_BLOCK
|
|
if (sb_is_blkdev_sb(sb))
|
|
return I_BDEV(inode)->bd_disk->bdi;
|
|
#endif
|
|
return sb->s_bdi;
|
|
}
|
|
EXPORT_SYMBOL(inode_to_bdi);
|
|
|
|
const char *bdi_dev_name(struct backing_dev_info *bdi)
|
|
{
|
|
if (!bdi || !bdi->dev)
|
|
return bdi_unknown_name;
|
|
return bdi->dev_name;
|
|
}
|
|
EXPORT_SYMBOL_GPL(bdi_dev_name);
|