
Commit 74d1696

shroffni authored and keithbusch committed
nvmet-loop: avoid using mutex in IO hotpath
Using a mutex lock in the IO hot path causes the kernel BUG "sleeping while atomic". Shinichiro [1] first encountered this issue while running blktests nvme/052, shown below:

BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585
in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 996, name: (udev-worker)
preempt_count: 0, expected: 0
RCU nest depth: 1, expected: 0
2 locks held by (udev-worker)/996:
 #0: ffff8881004570c8 (mapping.invalidate_lock){.+.+}-{3:3}, at: page_cache_ra_unbounded+0x155/0x5c0
 #1: ffffffff8607eaa0 (rcu_read_lock){....}-{1:2}, at: blk_mq_flush_plug_list+0xa75/0x1950
CPU: 2 UID: 0 PID: 996 Comm: (udev-worker) Not tainted 6.12.0-rc3+ #339
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x6a/0x90
 __might_resched.cold+0x1f7/0x23d
 ? __pfx___might_resched+0x10/0x10
 ? vsnprintf+0xdeb/0x18f0
 __mutex_lock+0xf4/0x1220
 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet]
 ? __pfx_vsnprintf+0x10/0x10
 ? __pfx___mutex_lock+0x10/0x10
 ? snprintf+0xa5/0xe0
 ? xas_load+0x1ce/0x3f0
 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet]
 nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet]
 ? __pfx_nvmet_subsys_nsid_exists+0x10/0x10 [nvmet]
 nvmet_req_find_ns+0x24e/0x300 [nvmet]
 nvmet_req_init+0x694/0xd40 [nvmet]
 ? blk_mq_start_request+0x11c/0x750
 ? nvme_setup_cmd+0x369/0x990 [nvme_core]
 nvme_loop_queue_rq+0x2a7/0x7a0 [nvme_loop]
 ? __pfx___lock_acquire+0x10/0x10
 ? __pfx_nvme_loop_queue_rq+0x10/0x10 [nvme_loop]
 __blk_mq_issue_directly+0xe2/0x1d0
 ? __pfx___blk_mq_issue_directly+0x10/0x10
 ? blk_mq_request_issue_directly+0xc2/0x140
 blk_mq_plug_issue_direct+0x13f/0x630
 ? lock_acquire+0x2d/0xc0
 ? blk_mq_flush_plug_list+0xa75/0x1950
 blk_mq_flush_plug_list+0xa9d/0x1950
 ? __pfx_blk_mq_flush_plug_list+0x10/0x10
 ? __pfx_mpage_readahead+0x10/0x10
 __blk_flush_plug+0x278/0x4d0
 ? __pfx___blk_flush_plug+0x10/0x10
 ? lock_release+0x460/0x7a0
 blk_finish_plug+0x4e/0x90
 read_pages+0x51b/0xbc0
 ? __pfx_read_pages+0x10/0x10
 ? lock_release+0x460/0x7a0
 page_cache_ra_unbounded+0x326/0x5c0
 force_page_cache_ra+0x1ea/0x2f0
 filemap_get_pages+0x59e/0x17b0
 ? __pfx_filemap_get_pages+0x10/0x10
 ? lock_is_held_type+0xd5/0x130
 ? __pfx___might_resched+0x10/0x10
 ? find_held_lock+0x2d/0x110
 filemap_read+0x317/0xb70
 ? up_write+0x1ba/0x510
 ? __pfx_filemap_read+0x10/0x10
 ? inode_security+0x54/0xf0
 ? selinux_file_permission+0x36d/0x420
 blkdev_read_iter+0x143/0x3b0
 vfs_read+0x6ac/0xa20
 ? __pfx_vfs_read+0x10/0x10
 ? __pfx_vm_mmap_pgoff+0x10/0x10
 ? __pfx___seccomp_filter+0x10/0x10
 ksys_read+0xf7/0x1d0
 ? __pfx_ksys_read+0x10/0x10
 do_syscall_64+0x93/0x180
 ? lockdep_hardirqs_on_prepare+0x16d/0x400
 ? do_syscall_64+0x9f/0x180
 ? lockdep_hardirqs_on+0x78/0x100
 ? do_syscall_64+0x9f/0x180
 ? lockdep_hardirqs_on_prepare+0x16d/0x400
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f565bd1ce11
Code: 00 48 8b 15 09 90 0d 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 d0 ad 01 00 f3 0f 1e fa 80 3d 35 12 0e 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec
RSP: 002b:00007ffd6e7a20c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000001000 RCX: 00007f565bd1ce11
RDX: 0000000000001000 RSI: 00007f565babb000 RDI: 0000000000000014
RBP: 00007ffd6e7a2130 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000556000bfa610 R11: 0000000000000246 R12: 000000003ffff000
R13: 0000556000bfa5b0 R14: 0000000000000e00 R15: 0000556000c07328
 </TASK>

As the trace shows, the issue is caused by taking a mutex while in the IO hot path.
This is a regression introduced by commit 5053639 ("nvmet: fix nvme status code when namespace is disabled"). That commit takes the mutex ->su_mutex to find out whether a disabled nsid exists in the config group or not, so as to differentiate a nsid that is disabled from one that is non-existent.

To mitigate the above issue, we've worked out a fix [2]: we now insert the nsid in the subsys XArray as soon as it's created under the config group, and later, when that nsid is enabled, we add an XArray mark on it and set ns->enabled to true. The XArray mark is useful when we need to loop through all enabled namespaces under a subsystem using the xa_for_each_marked() API. If a nsid is later disabled, we clear the XArray mark from it and also set ns->enabled to false. Only when the nsid is deleted from the config group do we delete it from the XArray. With this change, we can now easily differentiate a nsid that is disabled (i.e. an XArray entry for the ns exists but ns->enabled is false) from one that is non-existent (i.e. no XArray entry for the ns exists).

Link: https://lore.kernel.org/linux-nvme/[email protected]/ [2]
Reported-by: Shinichiro Kawasaki <[email protected]>
Closes: https://lore.kernel.org/linux-nvme/tqcy3sveity7p56v7ywp7ssyviwcb3w4623cnxj3knoobfcanq@yxgt2mjkbkam/ [1]
Fixes: 5053639 ("nvmet: fix nvme status code when namespace is disabled")
Fix-suggested-by: Christoph Hellwig <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Chaitanya Kulkarni <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Nilay Shroff <[email protected]>
Signed-off-by: Keith Busch <[email protected]>
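For context, the XArray mark mechanism the fix builds on works roughly as follows. This is an illustrative kernel-style sketch using the real XArray API (xa_insert, xa_set_mark, xa_clear_mark, xa_load, xa_for_each_marked); the obj_* names are hypothetical and this is not code from the commit:

    #include <linux/xarray.h>

    #define OBJ_ENABLED XA_MARK_1 /* plays the role of NVMET_NS_ENABLED */

    static DEFINE_XARRAY(objects);

    /* Insert at creation time: the entry is findable but carries no mark. */
    static int obj_create(unsigned long id, void *obj)
    {
        return xa_insert(&objects, id, obj, GFP_KERNEL);
    }

    /* Enable/disable only toggle the mark; the entry itself stays put. */
    static void obj_enable(unsigned long id)
    {
        xa_set_mark(&objects, id, OBJ_ENABLED);
    }

    static void obj_disable(unsigned long id)
    {
        xa_clear_mark(&objects, id, OBJ_ENABLED);
    }

    /*
     * Hot-path lookup: xa_load() is RCU-safe and never sleeps, so no
     * mutex is needed. A NULL return means "never created"; a non-NULL
     * return with the enabled state clear means "created but disabled".
     */
    static void *obj_lookup(unsigned long id)
    {
        return xa_load(&objects, id);
    }

    /* Iterate only the enabled entries, as xa_for_each_marked() does. */
    static void obj_walk_enabled(void)
    {
        unsigned long idx;
        void *obj;

        xa_for_each_marked(&objects, idx, obj, OBJ_ENABLED)
            ; /* act on each enabled entry */
    }

The key property is that both the lookup and the marked iteration are lock-free readers, which is exactly what an IO hot path requires.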
1 parent b579d6f commit 74d1696

5 files changed (+79, −65 lines)

drivers/nvme/target/admin-cmd.c

Lines changed: 5 additions & 4 deletions
@@ -139,7 +139,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
     unsigned long idx;
 
     ctrl = req->sq->ctrl;
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
         /* we don't have the right data for file backed ns */
         if (!ns->bdev)
             continue;
@@ -331,9 +331,10 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
     u32 count = 0;
 
     if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
-        xa_for_each(&ctrl->subsys->namespaces, idx, ns)
+        nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
             if (ns->anagrpid == grpid)
                 desc->nsids[count++] = cpu_to_le32(ns->nsid);
+        }
     }
 
     desc->grpid = cpu_to_le32(grpid);
@@ -772,7 +773,7 @@ static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req)
         goto out;
     }
 
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
         if (ns->nsid <= min_endgid)
             continue;
 
@@ -815,7 +816,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
         goto out;
     }
 
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
         if (ns->nsid <= min_nsid)
             continue;
         if (match_css && req->ns->csi != req->cmd->identify.csi)

drivers/nvme/target/configfs.c

Lines changed: 0 additions & 12 deletions
@@ -810,18 +810,6 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
     NULL,
 };
 
-bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid)
-{
-    struct config_item *ns_item;
-    char name[12];
-
-    snprintf(name, sizeof(name), "%u", nsid);
-    mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex);
-    ns_item = config_group_find_item(&subsys->namespaces_group, name);
-    mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex);
-    return ns_item != NULL;
-}
-
 static void nvmet_ns_release(struct config_item *item)
 {
     struct nvmet_ns *ns = to_nvmet_ns(item);

drivers/nvme/target/core.c

Lines changed: 63 additions & 45 deletions
@@ -127,7 +127,7 @@ static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
     unsigned long idx;
     u32 nsid = 0;
 
-    xa_for_each(&subsys->namespaces, idx, cur)
+    nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur)
         nsid = cur->nsid;
 
     return nsid;
@@ -441,11 +441,14 @@ u16 nvmet_req_find_ns(struct nvmet_req *req)
     struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
     req->ns = xa_load(&subsys->namespaces, nsid);
-    if (unlikely(!req->ns)) {
+    if (unlikely(!req->ns || !req->ns->enabled)) {
         req->error_loc = offsetof(struct nvme_common_command, nsid);
-        if (nvmet_subsys_nsid_exists(subsys, nsid))
-            return NVME_SC_INTERNAL_PATH_ERROR;
-        return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
+        if (!req->ns) /* ns doesn't exist! */
+            return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
+
+        /* ns exists but it's disabled */
+        req->ns = NULL;
+        return NVME_SC_INTERNAL_PATH_ERROR;
     }
 
     percpu_ref_get(&req->ns->ref);
@@ -583,8 +586,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
         goto out_unlock;
 
     ret = -EMFILE;
-    if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
-        goto out_unlock;
 
     ret = nvmet_bdev_ns_enable(ns);
     if (ret == -ENOTBLK)
@@ -599,38 +600,19 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
     list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
         nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 
-    ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
-                0, GFP_KERNEL);
-    if (ret)
-        goto out_dev_put;
-
-    if (ns->nsid > subsys->max_nsid)
-        subsys->max_nsid = ns->nsid;
-
-    ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
-    if (ret)
-        goto out_restore_subsys_maxnsid;
-
     if (ns->pr.enable) {
         ret = nvmet_pr_init_ns(ns);
         if (ret)
-            goto out_remove_from_subsys;
+            goto out_dev_put;
     }
 
-    subsys->nr_namespaces++;
-
     nvmet_ns_changed(subsys, ns->nsid);
     ns->enabled = true;
+    xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
     ret = 0;
 out_unlock:
     mutex_unlock(&subsys->lock);
     return ret;
-
-out_remove_from_subsys:
-    xa_erase(&subsys->namespaces, ns->nsid);
-out_restore_subsys_maxnsid:
-    subsys->max_nsid = nvmet_max_nsid(subsys);
-    percpu_ref_exit(&ns->ref);
 out_dev_put:
     list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
         pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@@ -649,15 +631,37 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
         goto out_unlock;
 
     ns->enabled = false;
-    xa_erase(&ns->subsys->namespaces, ns->nsid);
-    if (ns->nsid == subsys->max_nsid)
-        subsys->max_nsid = nvmet_max_nsid(subsys);
+    xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
 
     list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
         pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 
     mutex_unlock(&subsys->lock);
 
+    if (ns->pr.enable)
+        nvmet_pr_exit_ns(ns);
+
+    mutex_lock(&subsys->lock);
+    nvmet_ns_changed(subsys, ns->nsid);
+    nvmet_ns_dev_disable(ns);
+out_unlock:
+    mutex_unlock(&subsys->lock);
+}
+
+void nvmet_ns_free(struct nvmet_ns *ns)
+{
+    struct nvmet_subsys *subsys = ns->subsys;
+
+    nvmet_ns_disable(ns);
+
+    mutex_lock(&subsys->lock);
+
+    xa_erase(&subsys->namespaces, ns->nsid);
+    if (ns->nsid == subsys->max_nsid)
+        subsys->max_nsid = nvmet_max_nsid(subsys);
+
+    mutex_unlock(&subsys->lock);
+
     /*
      * Now that we removed the namespaces from the lookup list, we
      * can kill the per_cpu ref and wait for any remaining references
@@ -671,21 +675,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
     wait_for_completion(&ns->disable_done);
     percpu_ref_exit(&ns->ref);
 
-    if (ns->pr.enable)
-        nvmet_pr_exit_ns(ns);
-
     mutex_lock(&subsys->lock);
-
     subsys->nr_namespaces--;
-    nvmet_ns_changed(subsys, ns->nsid);
-    nvmet_ns_dev_disable(ns);
-out_unlock:
     mutex_unlock(&subsys->lock);
-}
-
-void nvmet_ns_free(struct nvmet_ns *ns)
-{
-    nvmet_ns_disable(ns);
 
     down_write(&nvmet_ana_sem);
     nvmet_ana_group_enabled[ns->anagrpid]--;
@@ -699,15 +691,33 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 {
     struct nvmet_ns *ns;
 
+    mutex_lock(&subsys->lock);
+
+    if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+        goto out_unlock;
+
     ns = kzalloc(sizeof(*ns), GFP_KERNEL);
     if (!ns)
-        return NULL;
+        goto out_unlock;
 
     init_completion(&ns->disable_done);
 
     ns->nsid = nsid;
     ns->subsys = subsys;
 
+    if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL))
+        goto out_free;
+
+    if (ns->nsid > subsys->max_nsid)
+        subsys->max_nsid = nsid;
+
+    if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL))
+        goto out_exit;
+
+    subsys->nr_namespaces++;
+
+    mutex_unlock(&subsys->lock);
+
     down_write(&nvmet_ana_sem);
     ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
     nvmet_ana_group_enabled[ns->anagrpid]++;
@@ -718,6 +728,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
     ns->csi = NVME_CSI_NVM;
 
     return ns;
+out_exit:
+    subsys->max_nsid = nvmet_max_nsid(subsys);
+    percpu_ref_exit(&ns->ref);
+out_free:
+    kfree(ns);
+out_unlock:
+    mutex_unlock(&subsys->lock);
+    return NULL;
 }
 
 static void nvmet_update_sq_head(struct nvmet_req *req)
@@ -1394,7 +1412,7 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
 
     ctrl->p2p_client = get_device(req->p2p_client);
 
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns)
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns)
         nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 }
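Taken together, the core.c changes reduce the hot-path namespace lookup to a non-sleeping xa_load() plus an enabled-flag test. A condensed paraphrase of the new nvmet_req_find_ns() decision tree, for orientation only (see the hunk above for the actual code):

    req->ns = xa_load(&subsys->namespaces, nsid); /* RCU-safe, no mutex */
    if (unlikely(!req->ns))                 /* nsid was never created */
        return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
    if (unlikely(!req->ns->enabled)) {      /* created but disabled */
        req->ns = NULL;
        return NVME_SC_INTERNAL_PATH_ERROR;
    }
    percpu_ref_get(&req->ns->ref);          /* pin the namespace for IO */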

drivers/nvme/target/nvmet.h

Lines changed: 7 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 #define NVMET_DEFAULT_VS        NVME_VS(2, 1, 0)
 
+#define NVMET_NS_ENABLED        XA_MARK_1
 #define NVMET_ASYNC_EVENTS      4
 #define NVMET_ERROR_LOG_SLOTS   128
 #define NVMET_NO_ERROR_LOC      ((u16)-1)
@@ -33,6 +34,12 @@
 #define NVMET_FR_MAX_SIZE       8
 #define NVMET_PR_LOG_QUEUE_SIZE 64
 
+#define nvmet_for_each_ns(xa, index, entry) \
+    xa_for_each(xa, index, entry)
+
+#define nvmet_for_each_enabled_ns(xa, index, entry) \
+    xa_for_each_marked(xa, index, entry, NVMET_NS_ENABLED)
+
 /*
  * Supported optional AENs:
  */
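The two helpers make iteration intent explicit at call sites. A usage sketch of the difference (hypothetical caller; `subsys` is assumed to be a valid struct nvmet_subsys pointer):

    struct nvmet_ns *ns;
    unsigned long idx;

    /* All namespaces present in configfs, enabled or disabled. */
    nvmet_for_each_ns(&subsys->namespaces, idx, ns)
        pr_debug("nsid %u exists\n", ns->nsid);

    /* Only namespaces carrying the NVMET_NS_ENABLED XArray mark. */
    nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns)
        pr_debug("nsid %u is enabled\n", ns->nsid);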

drivers/nvme/target/pr.c

Lines changed: 4 additions & 4 deletions
@@ -60,7 +60,7 @@ u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask)
         goto success;
     }
 
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
         if (ns->pr.enable)
             WRITE_ONCE(ns->pr.notify_mask, mask);
     }
@@ -1056,7 +1056,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
     * nvmet_pr_init_ns(), see more details in nvmet_ns_enable().
     * So just check ns->pr.enable.
     */
-    xa_for_each(&subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) {
         if (ns->pr.enable) {
             ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
                     &ctrl->hostid);
@@ -1067,7 +1067,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
     return 0;
 
 free_per_ctrl_refs:
-    xa_for_each(&subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) {
         if (ns->pr.enable) {
             pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
             if (pc_ref)
@@ -1087,7 +1087,7 @@ void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
     kfifo_free(&ctrl->pr_log_mgr.log_queue);
     mutex_destroy(&ctrl->pr_log_mgr.lock);
 
-    xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+    nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
         if (ns->pr.enable) {
             pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
             if (pc_ref)
