drm/xe/vf: Abort VF post migration recovery on failure

If VF post-migration recovery fails, the device is wedged. However, submission queues still need to be enabled for proper cleanup. In such cases, call into the GuC submission backend to restart all queues that were previously paused. v3: - s/Avort/Abort (Tomasz) Signed-off-by: Matthew Brost <matthew.brost@intel.com> Reviewed-by: Tomasz Lis <tomasz.lis@intel.com> Link: https://lore.kernel.org/r/20251008214532.3442967-26-matthew.brost@intel.com
2026-03-22 07:27:12 +08:00 · 2025-10-08 14:45:23 -07:00
parent 16b6dd1a90
commit 7c4b7e34c8
3 changed files with 31 additions and 0 deletions
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -2098,6 +2098,26 @@ void xe_guc_submit_unpause(struct xe_guc *guc)
 	wake_up_all(&guc->ct.wq);
 }

+/**
+ * xe_guc_submit_pause_abort - Abort all paused submission tasks on given GuC.
+ * @guc: the &xe_guc struct instance whose exec queue schedulers are restarted
+ *
+ * Restarts each queue's scheduler; killed/banned/wedged queues get cleanup.
+ */
+void xe_guc_submit_pause_abort(struct xe_guc *guc)
+{
+	unsigned long id;
+	struct xe_exec_queue *queue;
+
+	mutex_lock(&guc->submission_state.lock);
+	xa_for_each(&guc->submission_state.exec_queue_lookup, id, queue) {
+		xe_sched_submission_start(&queue->guc->sched);
+		if (exec_queue_killed_or_banned_or_wedged(queue))
+			xe_guc_exec_queue_trigger_cleanup(queue);
+	}
+	mutex_unlock(&guc->submission_state.lock);
+}
+
 static struct xe_exec_queue *
 g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id)
 {