mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-04 20:19:47 +08:00
	cgroup/cpuset: Fix remote root partition creation problem
Since commit 181c8e091a ("cgroup/cpuset: Introduce remote partition"), a remote partition can be created underneath a non-partition root cpuset as long as its exclusive_cpus are set to distribute exclusive CPUs down to its children. The generate_sched_domains() function, however, doesn't take this new behavior into account and hence will fail to create the sched domain needed for a remote root (non-isolated) partition.

There are two issues related to remote partition support. First of all, generate_sched_domains() has a fast path that is activated if root_load_balance is true and top_cpuset.nr_subparts is non-zero. The latter condition isn't quite correct for remote partitions as nr_subparts just shows the number of local child partitions underneath it. There can be no local child partition under top_cpuset even if there are remote partitions further down the hierarchy. Fix that by checking subpartitions_cpus instead, which contains the exclusive CPUs allocated to both local and remote partitions.

Secondly, the valid partition check for subtree skipping in the csa[] generation loop isn't enough, as a remote partition does not need to have a partition root parent. Fix this problem by breaking the csa[] generation loop of generate_sched_domains() into v1 and v2 specific parts and checking a cpuset's exclusive_cpus before skipping its subtree in the v2 case.

Also simplify generate_sched_domains() for cgroup v2, as only non-isolating partition roots should be included in building the cpuset array, and none of the v1 scheduling attributes other than a different way to create an isolated partition are supported.

Fixes: 181c8e091a ("cgroup/cpuset: Introduce remote partition")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
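The heart of the fix is the descent decision in the v2 branch of the csa[] generation loop: a cpuset that is not itself a valid partition root may still have a remote partition below it when its exclusive_cpus is non-empty, so its subtree can no longer be skipped unconditionally. The following is a minimal userspace C sketch of that decision, not the kernel code; the struct layout, the PRS_ROOT stand-in, the boolean stand-ins for the cpumask tests, and the visit_v2() helper are all simplified assumptions for illustration.

/*
 * Userspace sketch of the fixed v2 descent logic, under the simplified
 * assumptions named above.  Build with any C compiler and run.
 */
#include <stdbool.h>
#include <stdio.h>

#define PRS_ROOT 1	/* stand-in for a valid, non-isolated root partition */

struct cpuset {
	const char *name;
	int partition_root_state;
	bool effective_cpus_empty;	/* ~ cpumask_empty(cp->effective_cpus) */
	bool exclusive_cpus_empty;	/* ~ cpumask_empty(cp->exclusive_cpus) */
};

static bool is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

/*
 * For one cpuset visited in pre-order: add it to csa[] if it is a valid
 * non-isolated partition root with effective CPUs, and decide whether the
 * walk may descend into its subtree.  Key point of the fix: a non-partition
 * cpuset with non-empty exclusive_cpus is still descended into, because a
 * remote partition may live further down.
 */
static void visit_v2(const struct cpuset *cp, const struct cpuset **csa,
		     int *csn, bool *descend)
{
	if (cp->partition_root_state == PRS_ROOT && !cp->effective_cpus_empty)
		csa[(*csn)++] = cp;

	*descend = is_partition_valid(cp) || !cp->exclusive_cpus_empty;
}

int main(void)
{
	/* Non-partition parent distributing exclusive CPUs downward. */
	struct cpuset mid = { "mid", 0, false, false };
	/* Its child: a remote root partition. */
	struct cpuset remote = { "remote", PRS_ROOT, false, false };
	const struct cpuset *csa[2];
	int csn = 0;
	bool descend;

	visit_v2(&mid, csa, &csn, &descend);
	printf("mid: in csa=%d descend=%d\n", csn == 1, descend);
	visit_v2(&remote, csa, &csn, &descend);
	printf("remote: in csa=%d\n", csn == 1);
	return 0;
}

Run as written, this prints that "mid" is descended into but not added to csa[], while the remote root partition beneath it is added, which is exactly the case the old is_partition_valid()-only check would have skipped.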
This commit is contained in:
parent 6fe960147e
commit ccac8e8de9
kernel/cgroup/cpuset.c:

@@ -169,7 +169,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
 	int nr_subparts;
 
 	/* partition root state */
@@ -957,13 +957,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
 single_root_domain:
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
@@ -992,16 +993,18 @@ single_root_domain:
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
+
+		if (cgrpv2)
+			goto v2;
+
 		/*
+		 * v1:
 		 * Continue traversing beyond @cp iff @cp has some CPUs and
 		 * isn't load balancing.  The former is obvious.  The
 		 * latter: All child cpusets contain a subset of the
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -1009,16 +1012,28 @@ single_root_domain:
 					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
-			continue;
-
 		if (is_sched_load_balance(cp) &&
 		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
+		continue;
+
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
@@ -1072,6 +1087,20 @@ restart:
 	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 			      GFP_KERNEL);
 
+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
 		struct cpumask *dp;
@@ -1231,7 +1260,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -4575,7 +4604,7 @@ static void cpuset_handle_hotplug(void)
 	 * In the rare case that hotplug removes all the cpus in
 	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
 		cpus_updated = true;
 
 	/* For v1, synchronize cpus_allowed to cpu_active_mask */