Skip to content

Commit 5c1741a

Browse files
srishanmgregkh
authored andcommitted
drm/amdgpu: Replace Mutex with Spinlock for RLCG register access to avoid Priority Inversion in SRIOV
[ Upstream commit dc0297f ] RLCG Register Access is a way for virtual functions to safely access GPU registers in a virtualized environment., including TLB flushes and register reads. When multiple threads or VFs try to access the same registers simultaneously, it can lead to race conditions. By using the RLCG interface, the driver can serialize access to the registers. This means that only one thread can access the registers at a time, preventing conflicts and ensuring that operations are performed correctly. Additionally, when a low-priority task holds a mutex that a high-priority task needs, ie., If a thread holding a spinlock tries to acquire a mutex, it can lead to priority inversion. register access in amdgpu_virt_rlcg_reg_rw especially in a fast code path is critical. The call stack shows that the function amdgpu_virt_rlcg_reg_rw is being called, which attempts to acquire the mutex. This function is invoked from amdgpu_sriov_wreg, which in turn is called from gmc_v11_0_flush_gpu_tlb. The [ BUG: Invalid wait context ] indicates that a thread is trying to acquire a mutex while it is in a context that does not allow it to sleep (like holding a spinlock). Fixes the below: [ 253.013423] ============================= [ 253.013434] [ BUG: Invalid wait context ] [ 253.013446] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G U OE [ 253.013464] ----------------------------- [ 253.013475] kworker/0:1/10 is trying to lock: [ 253.013487] ffff9f30542e3cf8 (&adev->virt.rlcg_reg_lock){+.+.}-{3:3}, at: amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.013815] other info that might help us debug this: [ 253.013827] context-{4:4} [ 253.013835] 3 locks held by kworker/0:1/10: [ 253.013847] #0: ffff9f3040050f58 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680 [ 253.013877] #1: ffffb789c008be40 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680 [ 253.013905] #2: ffff9f3054281838 (&adev->gmc.invalidate_lock){+.+.}-{2:2}, at: gmc_v11_0_flush_gpu_tlb+0x198/0x4f0 [amdgpu] [ 253.014154] stack backtrace: [ 253.014164] CPU: 0 UID: 0 PID: 10 Comm: kworker/0:1 Tainted: G U OE 6.12.0-amdstaging-drm-next-lol-050225 #14 [ 253.014189] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 253.014203] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 11/18/2024 [ 253.014224] Workqueue: events work_for_cpu_fn [ 253.014241] Call Trace: [ 253.014250] <TASK> [ 253.014260] dump_stack_lvl+0x9b/0xf0 [ 253.014275] dump_stack+0x10/0x20 [ 253.014287] __lock_acquire+0xa47/0x2810 [ 253.014303] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.014321] lock_acquire+0xd1/0x300 [ 253.014333] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.014562] ? __lock_acquire+0xa6b/0x2810 [ 253.014578] __mutex_lock+0x85/0xe20 [ 253.014591] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.014782] ? sched_clock_noinstr+0x9/0x10 [ 253.014795] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.014808] ? local_clock_noinstr+0xe/0xc0 [ 253.014822] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.015012] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.015029] mutex_lock_nested+0x1b/0x30 [ 253.015044] ? mutex_lock_nested+0x1b/0x30 [ 253.015057] amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.015249] amdgpu_sriov_wreg+0xc5/0xd0 [amdgpu] [ 253.015435] gmc_v11_0_flush_gpu_tlb+0x44b/0x4f0 [amdgpu] [ 253.015667] gfx_v11_0_hw_init+0x499/0x29c0 [amdgpu] [ 253.015901] ? __pfx_smu_v13_0_update_pcie_parameters+0x10/0x10 [amdgpu] [ 253.016159] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.016173] ? smu_hw_init+0x18d/0x300 [amdgpu] [ 253.016403] amdgpu_device_init+0x29ad/0x36a0 [amdgpu] [ 253.016614] amdgpu_driver_load_kms+0x1a/0xc0 [amdgpu] [ 253.017057] amdgpu_pci_probe+0x1c2/0x660 [amdgpu] [ 253.017493] local_pci_probe+0x4b/0xb0 [ 253.017746] work_for_cpu_fn+0x1a/0x30 [ 253.017995] process_one_work+0x21e/0x680 [ 253.018248] worker_thread+0x190/0x330 [ 253.018500] ? __pfx_worker_thread+0x10/0x10 [ 253.018746] kthread+0xe7/0x120 [ 253.018988] ? __pfx_kthread+0x10/0x10 [ 253.019231] ret_from_fork+0x3c/0x60 [ 253.019468] ? __pfx_kthread+0x10/0x10 [ 253.019701] ret_from_fork_asm+0x1a/0x30 [ 253.019939] </TASK> v2: s/spin_trylock/spin_lock_irqsave to be safe (Christian). Fixes: e864180 ("drm/amdgpu: Add lock around VF RLCG interface") Cc: lin cao <[email protected]> Cc: Jingwen Chen <[email protected]> Cc: Victor Skvortsov <[email protected]> Cc: Zhigang Luo <[email protected]> Cc: Christian König <[email protected]> Cc: Alex Deucher <[email protected]> Signed-off-by: Srinivasan Shanmugam <[email protected]> Suggested-by: Alex Deucher <[email protected]> Reviewed-by: Christian König <[email protected]> Signed-off-by: Alex Deucher <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent ea350e1 commit 5c1741a

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4223,7 +4223,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
42234223
mutex_init(&adev->grbm_idx_mutex);
42244224
mutex_init(&adev->mn_lock);
42254225
mutex_init(&adev->virt.vf_errors.lock);
4226-
mutex_init(&adev->virt.rlcg_reg_lock);
42274226
hash_init(adev->mn_hash);
42284227
mutex_init(&adev->psp.mutex);
42294228
mutex_init(&adev->notifier_lock);
@@ -4249,6 +4248,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
42494248
spin_lock_init(&adev->se_cac_idx_lock);
42504249
spin_lock_init(&adev->audio_endpt_idx_lock);
42514250
spin_lock_init(&adev->mm_stats.lock);
4251+
spin_lock_init(&adev->virt.rlcg_reg_lock);
42524252
spin_lock_init(&adev->wb.lock);
42534253

42544254
INIT_LIST_HEAD(&adev->reset_list);

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
10171017
void *scratch_reg2;
10181018
void *scratch_reg3;
10191019
void *spare_int;
1020+
unsigned long flags;
10201021

10211022
if (!adev->gfx.rlc.rlcg_reg_access_supported) {
10221023
dev_err(adev->dev,
@@ -1038,7 +1039,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
10381039
scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2;
10391040
scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3;
10401041

1041-
mutex_lock(&adev->virt.rlcg_reg_lock);
1042+
spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags);
10421043

10431044
if (reg_access_ctrl->spare_int)
10441045
spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int;
@@ -1097,7 +1098,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
10971098

10981099
ret = readl(scratch_reg0);
10991100

1100-
mutex_unlock(&adev->virt.rlcg_reg_lock);
1101+
spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags);
11011102

11021103
return ret;
11031104
}

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,8 @@ struct amdgpu_virt {
279279
/* the ucode id to signal the autoload */
280280
uint32_t autoload_ucode_id;
281281

282-
struct mutex rlcg_reg_lock;
282+
/* Spinlock to protect access to the RLCG register interface */
283+
spinlock_t rlcg_reg_lock;
283284

284285
union amd_sriov_ras_caps ras_en_caps;
285286
union amd_sriov_ras_caps ras_telemetry_en_caps;

0 commit comments

Comments
 (0)