mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-21 23:16:50 +08:00
sched/mmcid: Avoid full tasklist walks
Chasing vfork()'ed tasks on a CID ownership mode switch requires a full
task list walk, which is obviously expensive on large systems.
Avoid that by keeping a list of the tasks using an mm's MMCID entity in
mm::mm_cid and walking this list instead. This removes the counting logic,
which has proven to be flaky, and avoids a full task list walk in the case
of vfork()'ed tasks.
Fixes: fbd0e71dc3 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260310202526.183824481@kernel.org
This commit is contained in:
committed by
Peter Zijlstra
parent
7574ac6e49
commit
192d852129
@@ -133,10 +133,12 @@ struct rseq_data { };
|
||||
* @active: MM CID is active for the task
|
||||
* @cid: The CID associated to the task either permanently or
|
||||
* borrowed from the CPU
|
||||
* @node: Queued in the per MM MMCID list
|
||||
*/
|
||||
struct sched_mm_cid {
|
||||
unsigned int active;
|
||||
unsigned int cid;
|
||||
struct hlist_node node;
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -157,6 +159,7 @@ struct mm_cid_pcpu {
|
||||
* @work: Regular work to handle the affinity mode change case
|
||||
* @lock: Spinlock to protect against affinity setting which can't take @mutex
|
||||
* @mutex: Mutex to serialize forks and exits related to this mm
|
||||
* @user_list: List of the MM CID users of a MM
|
||||
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
|
||||
* is growth only.
|
||||
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
|
||||
@@ -177,13 +180,14 @@ struct mm_mm_cid {
|
||||
|
||||
raw_spinlock_t lock;
|
||||
struct mutex mutex;
|
||||
struct hlist_head user_list;
|
||||
|
||||
/* Low frequency modified */
|
||||
unsigned int nr_cpus_allowed;
|
||||
unsigned int users;
|
||||
unsigned int pcpu_thrs;
|
||||
unsigned int update_deferred;
|
||||
}____cacheline_aligned_in_smp;
|
||||
} ____cacheline_aligned;
|
||||
#else /* CONFIG_SCHED_MM_CID */
|
||||
struct mm_mm_cid { };
|
||||
struct sched_mm_cid { };
|
||||
|
||||
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
|
||||
#ifdef CONFIG_SCHED_MM_CID
|
||||
tsk->mm_cid.cid = MM_CID_UNSET;
|
||||
tsk->mm_cid.active = 0;
|
||||
INIT_HLIST_NODE(&tsk->mm_cid.node);
|
||||
#endif
|
||||
return tsk;
|
||||
|
||||
|
||||
@@ -10620,13 +10620,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
|
||||
}
|
||||
}
|
||||
|
||||
static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
|
||||
static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
|
||||
{
|
||||
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
|
||||
guard(task_rq_lock)(t);
|
||||
/* If the task is not active it is not in the users count */
|
||||
if (!t->mm_cid.active)
|
||||
return false;
|
||||
if (cid_on_task(t->mm_cid.cid)) {
|
||||
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
|
||||
if (task_rq(t)->curr == t)
|
||||
@@ -10634,51 +10631,21 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
|
||||
else
|
||||
mm_unset_cid_on_task(t);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
|
||||
{
|
||||
struct task_struct *p, *t;
|
||||
unsigned int users;
|
||||
|
||||
/*
|
||||
* This can obviously race with a concurrent affinity change, which
|
||||
* increases the number of allowed CPUs for this mm, but that does
|
||||
* not affect the mode and only changes the CID constraints. A
|
||||
* possible switch back to per task mode happens either in the
|
||||
* deferred handler function or in the next fork()/exit().
|
||||
*
|
||||
* The caller has already transferred so remove it from the users
|
||||
* count. The incoming task is already visible and has mm_cid.active,
|
||||
* but has task::mm_cid::cid == UNSET. Still it needs to be accounted
|
||||
* for. Concurrent fork()s might add more threads, but all of them have
|
||||
* task::mm_cid::active = 0, so they don't affect the accounting here.
|
||||
*/
|
||||
users = mm->mm_cid.users - 1;
|
||||
|
||||
guard(rcu)();
|
||||
for_other_threads(current, t) {
|
||||
if (mm_cid_fixup_task_to_cpu(t, mm))
|
||||
users--;
|
||||
}
|
||||
|
||||
if (!users)
|
||||
return;
|
||||
|
||||
/* Happens only for CLONE_VM processes. */
|
||||
for_each_process_thread(p, t) {
|
||||
if (t == current || t->mm != mm)
|
||||
continue;
|
||||
mm_cid_fixup_task_to_cpu(t, mm);
|
||||
}
|
||||
}
|
||||
|
||||
static void mm_cid_fixup_tasks_to_cpus(void)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct task_struct *t;
|
||||
|
||||
lockdep_assert_held(&mm->mm_cid.mutex);
|
||||
|
||||
hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
|
||||
/* Current has already transferred before invoking the fixup. */
|
||||
if (t != current)
|
||||
mm_cid_fixup_task_to_cpu(t, mm);
|
||||
}
|
||||
|
||||
mm_cid_do_fixup_tasks_to_cpus(mm);
|
||||
mm_cid_complete_transit(mm, MM_CID_ONCPU);
|
||||
}
|
||||
|
||||
@@ -10687,6 +10654,7 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
|
||||
lockdep_assert_held(&mm->mm_cid.lock);
|
||||
|
||||
t->mm_cid.active = 1;
|
||||
hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
|
||||
mm->mm_cid.users++;
|
||||
return mm_update_max_cids(mm);
|
||||
}
|
||||
@@ -10744,6 +10712,7 @@ static bool sched_mm_cid_remove_user(struct task_struct *t)
|
||||
/* Clear the transition bit */
|
||||
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
|
||||
mm_unset_cid_on_task(t);
|
||||
hlist_del_init(&t->mm_cid.node);
|
||||
t->mm->mm_cid.users--;
|
||||
return mm_update_max_cids(t->mm);
|
||||
}
|
||||
@@ -10886,6 +10855,7 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
|
||||
mutex_init(&mm->mm_cid.mutex);
|
||||
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
|
||||
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
|
||||
INIT_HLIST_HEAD(&mm->mm_cid.user_list);
|
||||
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
|
||||
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user