mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 11ea68f553
			
		
	
	
		11ea68f553
		
	
	
	
	
		
			
			The affinity of managed interrupts is completely handled in the kernel and cannot be changed via the /proc/irq/* interfaces from user space. As the kernel tries to spread out interrupts evenly across CPUs on x86 to prevent vector exhaustion, it can happen that a managed interrupt whose affinity mask contains both isolated and housekeeping CPUs is routed to an isolated CPU. As a consequence IO submitted on a housekeeping CPU causes interrupts on the isolated CPU. Add a new sub-parameter 'managed_irq' for 'isolcpus' and the corresponding logic in the interrupt affinity selection code. The subparameter indicates to the interrupt affinity selection logic that it should try to avoid the above scenario. This isolation is best effort and only effective if the automatically assigned interrupt mask of a device queue contains isolated and housekeeping CPUs. If housekeeping CPUs are online then such interrupts are directed to the housekeeping CPU so that IO submitted on the housekeeping CPU cannot disturb the isolated CPU. If a queue's affinity mask contains only isolated CPUs then this parameter has no effect on the interrupt routing decision, though interrupts are only happening when tasks running on those isolated CPUs submit IO. IO submitted on housekeeping CPUs has no influence on those queues. If the affinity mask contains both housekeeping and isolated CPUs, but none of the contained housekeeping CPUs is online, then the interrupt is also routed to an isolated CPU. Interrupts are only delivered when one of the isolated CPUs in the affinity mask submits IO. If one of the contained housekeeping CPUs comes online, the CPU hotplug logic migrates the interrupt automatically back to the upcoming housekeeping CPU. Depending on the type of interrupt controller, this can require that at least one interrupt is delivered to the isolated CPU in order to complete the migration. 
[ tglx: Removed unused parameter, added and edited comments/documentation and rephrased the changelog so it contains more details. ] Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20200120091625.17912-1-ming.lei@redhat.com
		
			
				
	
	
		
			234 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			234 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /*
 | |
|  * Generic cpu hotunplug interrupt migration code copied from the
 | |
|  * arch/arm implementation
 | |
|  *
 | |
|  * Copyright (C) Russell King
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License version 2 as
 | |
|  * published by the Free Software Foundation.
 | |
|  */
 | |
| #include <linux/interrupt.h>
 | |
| #include <linux/ratelimit.h>
 | |
| #include <linux/irq.h>
 | |
| #include <linux/sched/isolation.h>
 | |
| 
 | |
| #include "internals.h"
 | |
| 
 | |
| /* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */
 | |
| static inline bool irq_needs_fixup(struct irq_data *d)
 | |
| {
 | |
| 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
 | |
| 	unsigned int cpu = smp_processor_id();
 | |
| 
 | |
| #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
 | |
| 	/*
 | |
| 	 * The cpumask_empty() check is a workaround for interrupt chips,
 | |
| 	 * which do not implement effective affinity, but the architecture has
 | |
| 	 * enabled the config switch. Use the general affinity mask instead.
 | |
| 	 */
 | |
| 	if (cpumask_empty(m))
 | |
| 		m = irq_data_get_affinity_mask(d);
 | |
| 
 | |
| 	/*
 | |
| 	 * Sanity check. If the mask is not empty when excluding the outgoing
 | |
| 	 * CPU then it must contain at least one online CPU. The outgoing CPU
 | |
| 	 * has been removed from the online mask already.
 | |
| 	 */
 | |
| 	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
 | |
| 	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
 | |
| 		/*
 | |
| 		 * If this happens then there was a missed IRQ fixup at some
 | |
| 		 * point. Warn about it and enforce fixup.
 | |
| 		 */
 | |
| 		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
 | |
| 			cpumask_pr_args(m), d->irq, cpu);
 | |
| 		return true;
 | |
| 	}
 | |
| #endif
 | |
| 	return cpumask_test_cpu(cpu, m);
 | |
| }
 | |
| 
 | |
| static bool migrate_one_irq(struct irq_desc *desc)
 | |
| {
 | |
| 	struct irq_data *d = irq_desc_get_irq_data(desc);
 | |
| 	struct irq_chip *chip = irq_data_get_irq_chip(d);
 | |
| 	bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
 | |
| 	const struct cpumask *affinity;
 | |
| 	bool brokeaff = false;
 | |
| 	int err;
 | |
| 
 | |
| 	/*
 | |
| 	 * IRQ chip might be already torn down, but the irq descriptor is
 | |
| 	 * still in the radix tree. Also if the chip has no affinity setter,
 | |
| 	 * nothing can be done here.
 | |
| 	 */
 | |
| 	if (!chip || !chip->irq_set_affinity) {
 | |
| 		pr_debug("IRQ %u: Unable to migrate away\n", d->irq);
 | |
| 		return false;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * No move required, if:
 | |
| 	 * - Interrupt is per cpu
 | |
| 	 * - Interrupt is not started
 | |
| 	 * - Affinity mask does not include this CPU.
 | |
| 	 *
 | |
| 	 * Note: Do not check desc->action as this might be a chained
 | |
| 	 * interrupt.
 | |
| 	 */
 | |
| 	if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) {
 | |
| 		/*
 | |
| 		 * If an irq move is pending, abort it if the dying CPU is
 | |
| 		 * the sole target.
 | |
| 		 */
 | |
| 		irq_fixup_move_pending(desc, false);
 | |
| 		return false;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * Complete an eventually pending irq move cleanup. If this
 | |
| 	 * interrupt was moved in hard irq context, then the vectors need
 | |
| 	 * to be cleaned up. It can't wait until this interrupt actually
 | |
| 	 * happens and this CPU was involved.
 | |
| 	 */
 | |
| 	irq_force_complete_move(desc);
 | |
| 
 | |
| 	/*
 | |
| 	 * If there is a setaffinity pending, then try to reuse the pending
 | |
| 	 * mask, so the last change of the affinity does not get lost. If
 | |
| 	 * there is no move pending or the pending mask does not contain
 | |
| 	 * any online CPU, use the current affinity mask.
 | |
| 	 */
 | |
| 	if (irq_fixup_move_pending(desc, true))
 | |
| 		affinity = irq_desc_get_pending_mask(desc);
 | |
| 	else
 | |
| 		affinity = irq_data_get_affinity_mask(d);
 | |
| 
 | |
| 	/* Mask the chip for interrupts which cannot move in process context */
 | |
| 	if (maskchip && chip->irq_mask)
 | |
| 		chip->irq_mask(d);
 | |
| 
 | |
| 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 | |
| 		/*
 | |
| 		 * If the interrupt is managed, then shut it down and leave
 | |
| 		 * the affinity untouched.
 | |
| 		 */
 | |
| 		if (irqd_affinity_is_managed(d)) {
 | |
| 			irqd_set_managed_shutdown(d);
 | |
| 			irq_shutdown_and_deactivate(desc);
 | |
| 			return false;
 | |
| 		}
 | |
| 		affinity = cpu_online_mask;
 | |
| 		brokeaff = true;
 | |
| 	}
 | |
| 	/*
 | |
| 	 * Do not set the force argument of irq_do_set_affinity() as this
 | |
| 	 * disables the masking of offline CPUs from the supplied affinity
 | |
| 	 * mask and therefore might keep/reassign the irq to the outgoing
 | |
| 	 * CPU.
 | |
| 	 */
 | |
| 	err = irq_do_set_affinity(d, affinity, false);
 | |
| 	if (err) {
 | |
| 		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
 | |
| 				    d->irq, err);
 | |
| 		brokeaff = false;
 | |
| 	}
 | |
| 
 | |
| 	if (maskchip && chip->irq_unmask)
 | |
| 		chip->irq_unmask(d);
 | |
| 
 | |
| 	return brokeaff;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu
 | |
|  *
 | |
|  * The current CPU has been marked offline.  Migrate IRQs off this CPU.
 | |
|  * If the affinity settings do not allow other CPUs, force them onto any
 | |
|  * available CPU.
 | |
|  *
 | |
|  * Note: we must iterate over all IRQs, whether they have an attached
 | |
|  * action structure or not, as we need to get chained interrupts too.
 | |
|  */
 | |
| void irq_migrate_all_off_this_cpu(void)
 | |
| {
 | |
| 	struct irq_desc *desc;
 | |
| 	unsigned int irq;
 | |
| 
 | |
| 	for_each_active_irq(irq) {
 | |
| 		bool affinity_broken;
 | |
| 
 | |
| 		desc = irq_to_desc(irq);
 | |
| 		raw_spin_lock(&desc->lock);
 | |
| 		affinity_broken = migrate_one_irq(desc);
 | |
| 		raw_spin_unlock(&desc->lock);
 | |
| 
 | |
| 		if (affinity_broken) {
 | |
| 			pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
 | |
| 					    irq, smp_processor_id());
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
 | |
| {
 | |
| 	const struct cpumask *hk_mask;
 | |
| 
 | |
| 	if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
 | |
| 		return false;
 | |
| 
 | |
| 	hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
 | |
| 	if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
 | |
| 		return false;
 | |
| 
 | |
| 	return cpumask_test_cpu(cpu, hk_mask);
 | |
| }
 | |
| 
 | |
| static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
 | |
| {
 | |
| 	struct irq_data *data = irq_desc_get_irq_data(desc);
 | |
| 	const struct cpumask *affinity = irq_data_get_affinity_mask(data);
 | |
| 
 | |
| 	if (!irqd_affinity_is_managed(data) || !desc->action ||
 | |
| 	    !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
 | |
| 		return;
 | |
| 
 | |
| 	if (irqd_is_managed_and_shutdown(data)) {
 | |
| 		irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * If the interrupt can only be directed to a single target
 | |
| 	 * CPU then it is already assigned to a CPU in the affinity
 | |
| 	 * mask. No point in trying to move it around unless the
 | |
| 	 * isolation mechanism requests to move it to an upcoming
 | |
| 	 * housekeeping CPU.
 | |
| 	 */
 | |
| 	if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
 | |
| 		irq_set_affinity_locked(data, affinity, false);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * irq_affinity_online_cpu - Restore affinity for managed interrupts
 | |
|  * @cpu:	Upcoming CPU for which interrupts should be restored
 | |
|  */
 | |
| int irq_affinity_online_cpu(unsigned int cpu)
 | |
| {
 | |
| 	struct irq_desc *desc;
 | |
| 	unsigned int irq;
 | |
| 
 | |
| 	irq_lock_sparse();
 | |
| 	for_each_active_irq(irq) {
 | |
| 		desc = irq_to_desc(irq);
 | |
| 		raw_spin_lock_irq(&desc->lock);
 | |
| 		irq_restore_affinity_of_irq(desc, cpu);
 | |
| 		raw_spin_unlock_irq(&desc->lock);
 | |
| 	}
 | |
| 	irq_unlock_sparse();
 | |
| 
 | |
| 	return 0;
 | |
| }
 |