2
0
mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-09-04 20:19:47 +08:00
linux/drivers/gpu/drm/amd/amdkfd
Srinivasan Shanmugam fddc450263 drm/amdkfd: Fix Circular Locking Dependency in 'svm_range_cpu_invalidate_pagetables'
This commit addresses a circular locking dependency in the
svm_range_cpu_invalidate_pagetables function. The function previously
held a lock while determining whether to perform an unmap or eviction
operation, which could lead to deadlocks.

Fixes the below:

[  223.418794] ======================================================
[  223.418820] WARNING: possible circular locking dependency detected
[  223.418845] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G     U     OE
[  223.418869] ------------------------------------------------------
[  223.418889] kfdtest/3939 is trying to acquire lock:
[  223.418906] ffff8957552eae38 (&dqm->lock_hidden){+.+.}-{3:3}, at: evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.419302]
               but task is already holding lock:
[  223.419303] ffff8957556b83b0 (&prange->lock){+.+.}-{3:3}, at: svm_range_cpu_invalidate_pagetables+0x9d/0x850 [amdgpu]
[  223.419447] Console: switching to colour dummy device 80x25
[  223.419477] [IGT] amd_basic: executing
[  223.419599]
               which lock already depends on the new lock.

[  223.419611]
               the existing dependency chain (in reverse order) is:
[  223.419621]
               -> #2 (&prange->lock){+.+.}-{3:3}:
[  223.419636]        __mutex_lock+0x85/0xe20
[  223.419647]        mutex_lock_nested+0x1b/0x30
[  223.419656]        svm_range_validate_and_map+0x2f1/0x15b0 [amdgpu]
[  223.419954]        svm_range_set_attr+0xe8c/0x1710 [amdgpu]
[  223.420236]        svm_ioctl+0x46/0x50 [amdgpu]
[  223.420503]        kfd_ioctl_svm+0x50/0x90 [amdgpu]
[  223.420763]        kfd_ioctl+0x409/0x6d0 [amdgpu]
[  223.421024]        __x64_sys_ioctl+0x95/0xd0
[  223.421036]        x64_sys_call+0x1205/0x20d0
[  223.421047]        do_syscall_64+0x87/0x140
[  223.421056]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  223.421068]
               -> #1 (reservation_ww_class_mutex){+.+.}-{3:3}:
[  223.421084]        __ww_mutex_lock.constprop.0+0xab/0x1560
[  223.421095]        ww_mutex_lock+0x2b/0x90
[  223.421103]        amdgpu_amdkfd_alloc_gtt_mem+0xcc/0x2b0 [amdgpu]
[  223.421361]        add_queue_mes+0x3bc/0x440 [amdgpu]
[  223.421623]        unhalt_cpsch+0x1ae/0x240 [amdgpu]
[  223.421888]        kgd2kfd_start_sched+0x5e/0xd0 [amdgpu]
[  223.422148]        amdgpu_amdkfd_start_sched+0x3d/0x50 [amdgpu]
[  223.422414]        amdgpu_gfx_enforce_isolation_handler+0x132/0x270 [amdgpu]
[  223.422662]        process_one_work+0x21e/0x680
[  223.422673]        worker_thread+0x190/0x330
[  223.422682]        kthread+0xe7/0x120
[  223.422690]        ret_from_fork+0x3c/0x60
[  223.422699]        ret_from_fork_asm+0x1a/0x30
[  223.422708]
               -> #0 (&dqm->lock_hidden){+.+.}-{3:3}:
[  223.422723]        __lock_acquire+0x16f4/0x2810
[  223.422734]        lock_acquire+0xd1/0x300
[  223.422742]        __mutex_lock+0x85/0xe20
[  223.422751]        mutex_lock_nested+0x1b/0x30
[  223.422760]        evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.423025]        kfd_process_evict_queues+0x8a/0x1d0 [amdgpu]
[  223.423285]        kgd2kfd_quiesce_mm+0x43/0x90 [amdgpu]
[  223.423540]        svm_range_cpu_invalidate_pagetables+0x4a7/0x850 [amdgpu]
[  223.423807]        __mmu_notifier_invalidate_range_start+0x1f5/0x250
[  223.423819]        copy_page_range+0x1e94/0x1ea0
[  223.423829]        copy_process+0x172f/0x2ad0
[  223.423839]        kernel_clone+0x9c/0x3f0
[  223.423847]        __do_sys_clone+0x66/0x90
[  223.423856]        __x64_sys_clone+0x25/0x30
[  223.423864]        x64_sys_call+0x1d7c/0x20d0
[  223.423872]        do_syscall_64+0x87/0x140
[  223.423880]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  223.423891]
               other info that might help us debug this:

[  223.423903] Chain exists of:
                 &dqm->lock_hidden --> reservation_ww_class_mutex --> &prange->lock

[  223.423926]  Possible unsafe locking scenario:

[  223.423935]        CPU0                    CPU1
[  223.423942]        ----                    ----
[  223.423949]   lock(&prange->lock);
[  223.423958]                                lock(reservation_ww_class_mutex);
[  223.423970]                                lock(&prange->lock);
[  223.423981]   lock(&dqm->lock_hidden);
[  223.423990]
                *** DEADLOCK ***

[  223.423999] 5 locks held by kfdtest/3939:
[  223.424006]  #0: ffffffffb82b4fc0 (dup_mmap_sem){.+.+}-{0:0}, at: copy_process+0x1387/0x2ad0
[  223.424026]  #1: ffff89575eda81b0 (&mm->mmap_lock){++++}-{3:3}, at: copy_process+0x13a8/0x2ad0
[  223.424046]  #2: ffff89575edaf3b0 (&mm->mmap_lock/1){+.+.}-{3:3}, at: copy_process+0x13e4/0x2ad0
[  223.424066]  #3: ffffffffb82e76e0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: copy_page_range+0x1cea/0x1ea0
[  223.424088]  #4: ffff8957556b83b0 (&prange->lock){+.+.}-{3:3}, at: svm_range_cpu_invalidate_pagetables+0x9d/0x850 [amdgpu]
[  223.424365]
               stack backtrace:
[  223.424374] CPU: 0 UID: 0 PID: 3939 Comm: kfdtest Tainted: G     U     OE      6.12.0-amdstaging-drm-next-lol-050225 #14
[  223.424392] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
[  223.424401] Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS PRO WIFI/X570 AORUS PRO WIFI, BIOS F36a 02/16/2022
[  223.424416] Call Trace:
[  223.424423]  <TASK>
[  223.424430]  dump_stack_lvl+0x9b/0xf0
[  223.424441]  dump_stack+0x10/0x20
[  223.424449]  print_circular_bug+0x275/0x350
[  223.424460]  check_noncircular+0x157/0x170
[  223.424469]  ? __bfs+0xfd/0x2c0
[  223.424481]  __lock_acquire+0x16f4/0x2810
[  223.424490]  ? srso_return_thunk+0x5/0x5f
[  223.424505]  lock_acquire+0xd1/0x300
[  223.424514]  ? evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.424783]  __mutex_lock+0x85/0xe20
[  223.424792]  ? evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.425058]  ? srso_return_thunk+0x5/0x5f
[  223.425067]  ? mark_held_locks+0x54/0x90
[  223.425076]  ? evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.425339]  ? srso_return_thunk+0x5/0x5f
[  223.425350]  mutex_lock_nested+0x1b/0x30
[  223.425358]  ? mutex_lock_nested+0x1b/0x30
[  223.425367]  evict_process_queues_cpsch+0x43/0x210 [amdgpu]
[  223.425631]  kfd_process_evict_queues+0x8a/0x1d0 [amdgpu]
[  223.425893]  kgd2kfd_quiesce_mm+0x43/0x90 [amdgpu]
[  223.426156]  svm_range_cpu_invalidate_pagetables+0x4a7/0x850 [amdgpu]
[  223.426423]  ? srso_return_thunk+0x5/0x5f
[  223.426436]  __mmu_notifier_invalidate_range_start+0x1f5/0x250
[  223.426450]  copy_page_range+0x1e94/0x1ea0
[  223.426461]  ? srso_return_thunk+0x5/0x5f
[  223.426474]  ? srso_return_thunk+0x5/0x5f
[  223.426484]  ? lock_acquire+0xd1/0x300
[  223.426494]  ? copy_process+0x1718/0x2ad0
[  223.426502]  ? srso_return_thunk+0x5/0x5f
[  223.426510]  ? sched_clock_noinstr+0x9/0x10
[  223.426519]  ? local_clock_noinstr+0xe/0xc0
[  223.426528]  ? copy_process+0x1718/0x2ad0
[  223.426537]  ? srso_return_thunk+0x5/0x5f
[  223.426550]  copy_process+0x172f/0x2ad0
[  223.426569]  kernel_clone+0x9c/0x3f0
[  223.426577]  ? __schedule+0x4c9/0x1b00
[  223.426586]  ? srso_return_thunk+0x5/0x5f
[  223.426594]  ? sched_clock_noinstr+0x9/0x10
[  223.426602]  ? srso_return_thunk+0x5/0x5f
[  223.426610]  ? local_clock_noinstr+0xe/0xc0
[  223.426619]  ? schedule+0x107/0x1a0
[  223.426629]  __do_sys_clone+0x66/0x90
[  223.426643]  __x64_sys_clone+0x25/0x30
[  223.426652]  x64_sys_call+0x1d7c/0x20d0
[  223.426661]  do_syscall_64+0x87/0x140
[  223.426671]  ? srso_return_thunk+0x5/0x5f
[  223.426679]  ? common_nsleep+0x44/0x50
[  223.426690]  ? srso_return_thunk+0x5/0x5f
[  223.426698]  ? trace_hardirqs_off+0x52/0xd0
[  223.426709]  ? srso_return_thunk+0x5/0x5f
[  223.426717]  ? syscall_exit_to_user_mode+0xcc/0x200
[  223.426727]  ? srso_return_thunk+0x5/0x5f
[  223.426736]  ? do_syscall_64+0x93/0x140
[  223.426748]  ? srso_return_thunk+0x5/0x5f
[  223.426756]  ? up_write+0x1c/0x1e0
[  223.426765]  ? srso_return_thunk+0x5/0x5f
[  223.426775]  ? srso_return_thunk+0x5/0x5f
[  223.426783]  ? trace_hardirqs_off+0x52/0xd0
[  223.426792]  ? srso_return_thunk+0x5/0x5f
[  223.426800]  ? syscall_exit_to_user_mode+0xcc/0x200
[  223.426810]  ? srso_return_thunk+0x5/0x5f
[  223.426818]  ? do_syscall_64+0x93/0x140
[  223.426826]  ? syscall_exit_to_user_mode+0xcc/0x200
[  223.426836]  ? srso_return_thunk+0x5/0x5f
[  223.426844]  ? do_syscall_64+0x93/0x140
[  223.426853]  ? srso_return_thunk+0x5/0x5f
[  223.426861]  ? irqentry_exit+0x6b/0x90
[  223.426869]  ? srso_return_thunk+0x5/0x5f
[  223.426877]  ? exc_page_fault+0xa7/0x2c0
[  223.426888]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  223.426898] RIP: 0033:0x7f46758eab57
[  223.426906] Code: ba 04 00 f3 0f 1e fa 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 41 89 c0 85 c0 75 2c 64 48 8b 04 25 10 00
[  223.426930] RSP: 002b:00007fff5c3e5188 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[  223.426943] RAX: ffffffffffffffda RBX: 00007f4675f8c040 RCX: 00007f46758eab57
[  223.426954] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[  223.426965] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[  223.426975] R10: 00007f4675e81a50 R11: 0000000000000246 R12: 0000000000000001
[  223.426986] R13: 00007fff5c3e5470 R14: 00007fff5c3e53e0 R15: 00007fff5c3e5410
[  223.427004]  </TASK>

v2: To resolve this issue, the allocation of the process context buffer
(`proc_ctx_bo`) has been moved from the `add_queue_mes` function to the
`pqm_create_queue` function. This change ensures that the buffer is
allocated only when the first queue for a process is created and only if
the Micro Engine Scheduler (MES) is enabled. (Felix)

v3: Fix typo s/Memory Execution Scheduler (MES)/Micro Engine Scheduler
in commit message. (Lijo)

Fixes: 438b39ac74 ("drm/amdkfd: pause autosuspend when creating pdd")
Cc: Jesse Zhang <jesse.zhang@amd.com>
Cc: Yunxiang Li <Yunxiang.Li@amd.com>
Cc: Philip Yang <Philip.Yang@amd.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2025-02-27 16:49:32 -05:00
..
cik_event_interrupt.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
cik_int.h
cik_regs.h
cwsr_trap_handler_gfx8.asm
cwsr_trap_handler_gfx9.asm drm/amdkfd: Clear MODE.VSKIP in gfx9 trap handler 2025-01-24 09:53:05 -05:00
cwsr_trap_handler_gfx10.asm drm/amdkfd: Move gfx12 trap handler to separate file 2025-01-09 16:02:56 -05:00
cwsr_trap_handler_gfx12.asm drm/amdkfd: Ensure consistent barrier state saved in gfx12 trap handler 2025-02-12 19:47:15 -05:00
cwsr_trap_handler.h drm/amdkfd: Ensure consistent barrier state saved in gfx12 trap handler 2025-02-12 19:47:15 -05:00
Kconfig drm/amdkfd: drop IOMMUv2 support 2023-08-11 14:47:25 -04:00
kfd_chardev.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_crat.c drm/amdgpu: simplify xgmi peer info calls 2025-02-25 11:45:12 -05:00
kfd_crat.h drm/amdkfd: Reconcile the definition and use of oem_id in struct kfd_topology_device 2024-05-08 18:46:26 -04:00
kfd_debug.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_debug.h drm/amdkfd: add gc 9.5.0 support on kfd 2024-12-10 10:26:51 -05:00
kfd_debugfs.c drm/amdkfd: Introduce kfd_node struct (v5) 2023-06-09 09:42:27 -04:00
kfd_device_queue_manager_cik.c drm/amdkfd: drop IOMMUv2 support 2023-08-11 14:47:25 -04:00
kfd_device_queue_manager_v9.c drm/amdkfd: add gc 9.5.0 support on kfd 2024-12-10 10:26:51 -05:00
kfd_device_queue_manager_v10.c drm/amdkfd: rename device_queue_manager_init_v10_navi10() 2023-08-11 14:47:39 -04:00
kfd_device_queue_manager_v11.c
kfd_device_queue_manager_v12.c drm/amdkfd: Added device queue manager files for GFX12. 2024-05-02 16:18:12 -04:00
kfd_device_queue_manager_vi.c drm/amdkfd: drop IOMMUv2 support 2023-08-11 14:47:25 -04:00
kfd_device_queue_manager.c drm/amdkfd: Fix Circular Locking Dependency in 'svm_range_cpu_invalidate_pagetables' 2025-02-27 16:49:32 -05:00
kfd_device_queue_manager.h drm/amdkfd: remove extra use of volatile 2024-10-24 18:04:49 -04:00
kfd_device.c drm/amdkfd: enable cooperative launch on gfx12 2025-02-25 11:45:12 -05:00
kfd_doorbell.c drm/amdkfd: get doorbell's absolute offset based on the db_size 2023-10-09 17:02:34 -04:00
kfd_events.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_events.h drm/amdkfd: add event_age tracking when receiving interrupt 2023-06-15 11:37:55 -04:00
kfd_flat_memory.c drm/amdkfd: Use device based logging for errors 2024-07-01 16:10:47 -04:00
kfd_int_process_v9.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_int_process_v10.c drm/amdkfd: clean up code for interrupt v10 2024-09-17 10:04:57 -04:00
kfd_int_process_v11.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_interrupt.c drm/amdgpu: Show warning message if IH ring overflow 2024-12-18 12:39:07 -05:00
kfd_kernel_queue.c drm/amdkfd: Use the correct wptr size 2024-11-21 15:55:20 -05:00
kfd_kernel_queue.h drm/amdkfd: Skip packet submission on fatal error 2024-02-26 11:14:31 -05:00
kfd_migrate.c drm/amdkfd: add a new flag to manage where VRAM allocations go 2025-02-12 21:04:08 -05:00
kfd_migrate.h drm/amdkfd: Use partial migrations/mapping for GPU/CPU page faults in SVM 2023-12-06 15:22:32 -05:00
kfd_module.c
kfd_mqd_manager_cik.c drm/amdkfd: Check preemption status on all XCDs 2024-03-20 13:38:12 -04:00
kfd_mqd_manager_v9.c drm/amdkfd: Preserve cp_hqd_pq_control on update_mqd 2025-02-25 11:43:58 -05:00
kfd_mqd_manager_v10.c drm/amdkfd: Preserve cp_hqd_pq_control on update_mqd 2025-02-25 11:43:58 -05:00
kfd_mqd_manager_v11.c drm/amdkfd: Preserve cp_hqd_pq_control on update_mqd 2025-02-25 11:43:58 -05:00
kfd_mqd_manager_v12.c drm/amdkfd: Preserve cp_hqd_pq_control on update_mqd 2025-02-25 11:43:58 -05:00
kfd_mqd_manager_vi.c drm/amdkfd: Check preemption status on all XCDs 2024-03-20 13:38:12 -04:00
kfd_mqd_manager.c drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer 2024-07-23 17:34:44 -04:00
kfd_mqd_manager.h drm/amdkfd: Check preemption status on all XCDs 2024-03-20 13:38:12 -04:00
kfd_packet_manager_v9.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_packet_manager_vi.c drm/amdkfd: Have kfd driver use same PASID values from graphic driver 2025-02-12 21:02:55 -05:00
kfd_packet_manager.c drm/amdkfd: add gc 9.5.0 support on kfd 2024-12-10 10:26:51 -05:00
kfd_pasid.c drm/amdkfd: Remove unused functions 2025-02-12 21:02:56 -05:00
kfd_pm4_headers_ai.h drm/amdkfd: Enable processes isolation on gfx9 2024-08-20 22:08:07 -04:00
kfd_pm4_headers_aldebaran.h drm/amdkfd: Enable processes isolation on gfx9 2024-08-20 22:08:07 -04:00
kfd_pm4_headers_vi.h
kfd_pm4_headers.h
kfd_pm4_opcodes.h
kfd_priv.h drm/amdkfd: Remove unused functions 2025-02-12 21:02:56 -05:00
kfd_process_queue_manager.c drm/amdkfd: Fix Circular Locking Dependency in 'svm_range_cpu_invalidate_pagetables' 2025-02-27 16:49:32 -05:00
kfd_process.c drm/amdkfd: Fix error handling for missing PASID in 'kfd_process_device_init_vm' 2025-02-19 15:16:41 -05:00
kfd_queue.c drm/amdkfd: Fix user queue validation on Gfx7/8 2025-02-17 14:09:29 -05:00
kfd_smi_events.c Revert "drm/amdkfd: SMI report dropped event count" 2024-10-22 17:51:20 -04:00
kfd_smi_events.h drm/amdkfd: Output migrate end event if migrate failed 2024-10-07 14:09:02 -04:00
kfd_svm.c drm/amdkfd: add a new flag to manage where VRAM allocations go 2025-02-12 21:04:08 -05:00
kfd_svm.h drm/amdkfd: add a new flag to manage where VRAM allocations go 2025-02-12 21:04:08 -05:00
kfd_topology.c drm/amdkfd: fix missing L2 cache info in topology 2025-02-12 21:05:49 -05:00
kfd_topology.h drm/amd: Refactor find_system_memory() 2025-02-12 21:04:09 -05:00
Makefile drm/amdkfd: Added device queue manager files for GFX12. 2024-05-02 16:18:12 -04:00
soc15_int.h drm/amdkfd: Check int source id for utcl2 poison event 2024-08-23 10:52:33 -04:00