Skip to content

Commit 0a1123c

Browse files
committed
Merge tag 'amd-drm-fixes-6.8-2024-01-18' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-fixes-6.8-2024-01-18:

amdgpu:
- DSC fixes
- DC resource pool fixes
- OTG fix
- DML2 fixes
- Aux fix
- GFX10 RLC firmware handling fix
- Revert a broken workaround for SMU 13.0.2
- DC writeback fix
- Enable gfxoff when ROCm apps are active on gfx11 with the proper FW version

amdkfd:
- Fix dma-buf exports using GEM handles

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 205e18c + aa0901a commit 0a1123c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+554
-373
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -254,8 +254,6 @@ extern int amdgpu_agp;
254254

255255
extern int amdgpu_wbrf;
256256

257-
extern int fw_bo_location;
258-
259257
#define AMDGPU_VM_MAX_NUM_CTX 4096
260258
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
261259
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
@@ -1146,6 +1144,7 @@ struct amdgpu_device {
11461144
bool debug_vm;
11471145
bool debug_largebar;
11481146
bool debug_disable_soft_recovery;
1147+
bool debug_use_vram_fw_buf;
11491148
};
11501149

11511150
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
138138
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
139139
}
140140

141+
static const struct drm_client_funcs kfd_client_funcs = {
142+
.unregister = drm_client_release,
143+
};
141144
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
142145
{
143146
int i;
@@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
161164
.enable_mes = adev->enable_mes,
162165
};
163166

164-
ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL);
167+
ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs);
165168
if (ret) {
166169
dev_err(adev->dev, "Failed to init DRM client: %d\n", ret);
167170
return;
@@ -695,10 +698,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
695698
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
696699
{
697700
enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE;
698-
/* Temporary workaround to fix issues observed in some
699-
* compute applications when GFXOFF is enabled on GFX11.
700-
*/
701-
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) {
701+
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 &&
702+
((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) {
702703
pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
703704
amdgpu_gfx_off_ctrl(adev, idle);
704705
} else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) &&

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
311311
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
312312

313313
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
314-
struct dma_fence **ef);
314+
struct dma_fence __rcu **ef);
315315
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
316316
struct kfd_vm_fault_info *info);
317317
int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
28022802
put_task_struct(usertask);
28032803
}
28042804

2805-
static void replace_eviction_fence(struct dma_fence **ef,
2805+
static void replace_eviction_fence(struct dma_fence __rcu **ef,
28062806
struct dma_fence *new_ef)
28072807
{
28082808
struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
28372837
* 7. Add fence to all PD and PT BOs.
28382838
* 8. Unreserve all BOs
28392839
*/
2840-
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
2840+
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
28412841
{
28422842
struct amdkfd_process_info *process_info = info;
28432843
struct amdgpu_vm *peer_vm;

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
15441544
return true;
15451545

15461546
fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1547+
release_firmware(adev->pm.fw);
15471548
if (fw_ver < 0x00160e00)
15481549
return true;
15491550
}
@@ -5245,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52455246
struct amdgpu_device *tmp_adev = NULL;
52465247
bool need_full_reset, skip_hw_reset, vram_lost = false;
52475248
int r = 0;
5248-
bool gpu_reset_for_dev_remove = 0;
52495249

52505250
/* Try reset handler method first */
52515251
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5265,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52655265
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
52665266
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
52675267

5268-
gpu_reset_for_dev_remove =
5269-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5270-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5271-
52725268
/*
52735269
* ASIC reset has to be done on all XGMI hive nodes ASAP
52745270
* to allow proper links negotiation in FW (within 1 sec)
@@ -5311,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
53115307
amdgpu_ras_intr_cleared();
53125308
}
53135309

5314-
/* Since the mode1 reset affects base ip blocks, the
5315-
* phase1 ip blocks need to be resumed. Otherwise there
5316-
* will be a BIOS signature error and the psp bootloader
5317-
* can't load kdb on the next amdgpu install.
5318-
*/
5319-
if (gpu_reset_for_dev_remove) {
5320-
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5321-
amdgpu_device_ip_resume_phase1(tmp_adev);
5322-
5323-
goto end;
5324-
}
5325-
53265310
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
53275311
if (need_full_reset) {
53285312
/* post card */
@@ -5559,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
55595543
int i, r = 0;
55605544
bool need_emergency_restart = false;
55615545
bool audio_suspended = false;
5562-
bool gpu_reset_for_dev_remove = false;
5563-
5564-
gpu_reset_for_dev_remove =
5565-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5566-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
55675546

55685547
/*
55695548
* Special case: RAS triggered and full reset isn't supported
@@ -5601,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56015580
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
56025581
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
56035582
list_add_tail(&tmp_adev->reset_list, &device_list);
5604-
if (gpu_reset_for_dev_remove && adev->shutdown)
5583+
if (adev->shutdown)
56055584
tmp_adev->shutdown = true;
56065585
}
56075586
if (!list_is_first(&adev->reset_list, &device_list))
@@ -5686,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56865665

56875666
retry: /* Rest of adevs pre asic reset from XGMI hive. */
56885667
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5689-
if (gpu_reset_for_dev_remove) {
5690-
/* Workaroud for ASICs need to disable SMC first */
5691-
amdgpu_device_smu_fini_early(tmp_adev);
5692-
}
56935668
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
56945669
/*TODO Should we stop ?*/
56955670
if (r) {
@@ -5721,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57215696
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
57225697
if (r && r == -EAGAIN)
57235698
goto retry;
5724-
5725-
if (!r && gpu_reset_for_dev_remove)
5726-
goto recover_end;
57275699
}
57285700

57295701
skip_hw_reset:
@@ -5779,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57795751
amdgpu_ras_set_error_query_ready(tmp_adev, true);
57805752
}
57815753

5782-
recover_end:
57835754
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
57845755
reset_list);
57855756
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1963,8 +1963,6 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev)
19631963
amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block);
19641964
break;
19651965
case IP_VERSION(9, 4, 3):
1966-
if (!amdgpu_exp_hw_support)
1967-
return -EINVAL;
19681966
amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block);
19691967
break;
19701968
case IP_VERSION(10, 1, 10):

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 8 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,7 @@ enum AMDGPU_DEBUG_MASK {
128128
AMDGPU_DEBUG_VM = BIT(0),
129129
AMDGPU_DEBUG_LARGEBAR = BIT(1),
130130
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
131+
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
131132
};
132133

133134
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -210,7 +211,6 @@ int amdgpu_seamless = -1; /* auto */
210211
uint amdgpu_debug_mask;
211212
int amdgpu_agp = -1; /* auto */
212213
int amdgpu_wbrf = -1;
213-
int fw_bo_location = -1;
214214

215215
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
216216

@@ -990,10 +990,6 @@ MODULE_PARM_DESC(wbrf,
990990
"Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)");
991991
module_param_named(wbrf, amdgpu_wbrf, int, 0444);
992992

993-
MODULE_PARM_DESC(fw_bo_location,
994-
"location to put firmware bo for frontdoor loading (-1 = auto (default), 0 = on ram, 1 = on vram");
995-
module_param(fw_bo_location, int, 0644);
996-
997993
/* These devices are not supported by amdgpu.
998994
* They are supported by the mach64, r128, radeon drivers
999995
*/
@@ -2122,6 +2118,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
21222118
pr_info("debug: soft reset for GPU recovery disabled\n");
21232119
adev->debug_disable_soft_recovery = true;
21242120
}
2121+
2122+
if (amdgpu_debug_mask & AMDGPU_DEBUG_USE_VRAM_FW_BUF) {
2123+
pr_info("debug: place fw in vram for frontdoor loading\n");
2124+
adev->debug_use_vram_fw_buf = true;
2125+
}
21252126
}
21262127

21272128
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
@@ -2233,6 +2234,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
22332234

22342235
pci_set_drvdata(pdev, ddev);
22352236

2237+
amdgpu_init_debug_options(adev);
2238+
22362239
ret = amdgpu_driver_load_kms(adev, flags);
22372240
if (ret)
22382241
goto err_pci;
@@ -2313,8 +2316,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
23132316
amdgpu_get_secondary_funcs(adev);
23142317
}
23152318

2316-
amdgpu_init_debug_options(adev);
2317-
23182319
return 0;
23192320

23202321
err_pci:
@@ -2336,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
23362337
pm_runtime_forbid(dev->dev);
23372338
}
23382339

2339-
if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
2340-
!amdgpu_sriov_vf(adev)) {
2341-
bool need_to_reset_gpu = false;
2342-
2343-
if (adev->gmc.xgmi.num_physical_nodes > 1) {
2344-
struct amdgpu_hive_info *hive;
2345-
2346-
hive = amdgpu_get_xgmi_hive(adev);
2347-
if (hive->device_remove_count == 0)
2348-
need_to_reset_gpu = true;
2349-
hive->device_remove_count++;
2350-
amdgpu_put_xgmi_hive(hive);
2351-
} else {
2352-
need_to_reset_gpu = true;
2353-
}
2354-
2355-
/* Workaround for ASICs need to reset SMU.
2356-
* Called only when the first device is removed.
2357-
*/
2358-
if (need_to_reset_gpu) {
2359-
struct amdgpu_reset_context reset_context;
2360-
2361-
adev->shutdown = true;
2362-
memset(&reset_context, 0, sizeof(reset_context));
2363-
reset_context.method = AMD_RESET_METHOD_NONE;
2364-
reset_context.reset_req_dev = adev;
2365-
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2366-
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
2367-
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
2368-
}
2369-
}
2370-
23712340
amdgpu_driver_unload_kms(dev);
23722341

23732342
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1045,21 +1045,28 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
10451045
* seconds, so here, we just pick up three parts for emulation.
10461046
*/
10471047
ret = memcmp(vram_ptr, cptr, 10);
1048-
if (ret)
1049-
return ret;
1048+
if (ret) {
1049+
ret = -EIO;
1050+
goto release_buffer;
1051+
}
10501052

10511053
ret = memcmp(vram_ptr + (size / 2), cptr, 10);
1052-
if (ret)
1053-
return ret;
1054+
if (ret) {
1055+
ret = -EIO;
1056+
goto release_buffer;
1057+
}
10541058

10551059
ret = memcmp(vram_ptr + size - 10, cptr, 10);
1056-
if (ret)
1057-
return ret;
1060+
if (ret) {
1061+
ret = -EIO;
1062+
goto release_buffer;
1063+
}
10581064

1065+
release_buffer:
10591066
amdgpu_bo_free_kernel(&vram_bo, &vram_gpu,
10601067
&vram_ptr);
10611068

1062-
return 0;
1069+
return ret;
10631070
}
10641071

10651072
static ssize_t current_memory_partition_show(

drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
11051105
if (amdgpu_dpm_read_sensor(adev,
11061106
AMDGPU_PP_SENSOR_GPU_AVG_POWER,
11071107
(void *)&ui32, &ui32_size)) {
1108-
return -EINVAL;
1108+
/* fall back to input power for backwards compat */
1109+
if (amdgpu_dpm_read_sensor(adev,
1110+
AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
1111+
(void *)&ui32, &ui32_size)) {
1112+
return -EINVAL;
1113+
}
11091114
}
11101115
ui32 >>= 8;
11111116
break;

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -466,7 +466,7 @@ static int psp_sw_init(void *handle)
466466
}
467467

468468
ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
469-
(amdgpu_sriov_vf(adev) || fw_bo_location == 1) ?
469+
(amdgpu_sriov_vf(adev) || adev->debug_use_vram_fw_buf) ?
470470
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
471471
&psp->fw_pri_bo,
472472
&psp->fw_pri_mc_addr,

0 commit comments

Comments
 (0)