From 9365ab4574bc003cdbe851d4797888d6205e8bc2 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 10 Apr 2025 09:53:22 +0200 Subject: [PATCH] nvme-core: allocate the namespaces in order I've done some performance tests. The results are better than expected; there are practically no significant differences between the fully async mode in use in the upstream kernel and the "chained" tasks implemented here. Performance is measured by using this patch: http://bsdbackstore.eu/misc/async_scan_perf_test.patch High latency NVMe/TCP, ~100ms ping, 100 namespaces Synchronous namespace scan (RHEL-9.6): 31175ms Fully async namespace scan (6.15-rc1): 1563ms Async namespace scan with dependency chain (6.15-rc1 + my patch): 1554ms Low latency NVMe/TCP, ~0.4ms ping, 100 namespaces Synchronous namespace scan (RHEL-9.6): 594ms Fully async namespace scan (6.15-rc1): 156ms Async namespace scan with dependency chain (6.15-rc1 + my patch): 160ms NVMe-PCI, 2 namespaces: Synchronous namespace scan (RHEL-9.6): 84ms Fully async namespace scan (6.15-rc1): 121ms Async namespace scan with dependency chain (6.15-rc1 + my patch): 123ms Signed-off-by: Maurizio Lombardi --- drivers/nvme/host/core.c | 175 ++++++++++++++++++++++++++++----------- drivers/nvme/host/nvme.h | 2 + 2 files changed, 128 insertions(+), 49 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cc23035148b4..67338cb30a35 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3876,16 +3876,11 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) list_add(&ns->list, &ns->ctrl->namespaces); } -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) +static void nvme_init_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_ns_info *info) { struct queue_limits lim = { }; - struct nvme_ns *ns; struct gendisk *disk; - int node = ctrl->numa_node; - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - return; if (ctrl->opts && 
ctrl->opts->data_digest) lim.features |= BLK_FEAT_STABLE_WRITES; @@ -3901,11 +3896,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) ns->disk = disk; ns->queue = disk->queue; - ns->ctrl = ctrl; - kref_init(&ns->kref); - - if (nvme_init_ns_head(ns, info)) - goto out_cleanup_disk; /* * If multipathing is enabled, the device name for all disks and not @@ -3978,7 +3968,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_cleanup_disk: put_disk(disk); out_free_ns: kfree(ns); @@ -4065,19 +4054,106 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) nvme_ns_remove(ns); } -static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) +/** + * struct async_scan_task - keeps track of controller & NSID to scan + * @ctrl: Controller on which namespaces are being scanned + * @nsid: The NSID to scan + * @prev_finished: Completed when the previous task in the chain has finished its scan + * @head: Entry in the controller's list of asynchronous scan tasks + */ +struct async_scan_task { + struct nvme_ctrl *ctrl; + u32 nsid; + struct completion prev_finished; + struct list_head head; +}; + +static struct async_scan_task *async_scan_task_init(struct nvme_ctrl *ctrl, + u32 nsid) +{ + struct async_scan_task *task = kzalloc(sizeof(*task), GFP_KERNEL); + if (!task) + return NULL; + + task->ctrl = ctrl; + task->nsid = nsid; + init_completion(&task->prev_finished); + INIT_LIST_HEAD(&task->head); + + spin_lock(&ctrl->scan_list_lock); + list_add_tail(&task->head, &ctrl->scan_list); + if (list_is_first(&task->head, &ctrl->scan_list)) + complete_all(&task->prev_finished); + spin_unlock(&ctrl->scan_list_lock); + + return task; +} + +static void async_scan_prev_task_wait(struct async_scan_task *task) +{ + if (!task) + return; + + wait_for_completion(&task->prev_finished); +} + 
+static void async_scan_task_complete(struct async_scan_task *task) +{ + struct nvme_ctrl *ctrl; + + if (!task) + return; + + ctrl = task->ctrl; + async_scan_prev_task_wait(task); + + spin_lock(&ctrl->scan_list_lock); + list_del(&task->head); + if (!list_empty(&ctrl->scan_list)) { + struct async_scan_task *next = list_entry(ctrl->scan_list.next, + struct async_scan_task, head); + complete_all(&next->prev_finished); + } + spin_unlock(&ctrl->scan_list_lock); + kfree(task); +} + + +static struct nvme_ns *nvme_alloc_ns(struct nvme_ctrl *ctrl, + struct nvme_ns_info *info) +{ + struct nvme_ns *ns; + int node = ctrl->numa_node; + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return NULL; + + ns->ctrl = ctrl; + kref_init(&ns->kref); + + if (nvme_init_ns_head(ns, info)) { + kfree(ns); + return NULL; + } + + return ns; +} + +static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct async_scan_task *task) { struct nvme_ns_info info = { .nsid = nsid }; struct nvme_ns *ns; int ret = 1; if (nvme_identify_ns_descs(ctrl, &info)) - return; + goto exit; if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) { dev_warn(ctrl->device, "command set not reported for nsid: %d\n", nsid); - return; + goto exit; } /* @@ -4100,44 +4179,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) * becomes ready and restart the scan. */ if (ret || !info.is_ready) - return; + goto exit; ns = nvme_find_get_ns(ctrl, nsid); if (ns) { + async_scan_task_complete(task); nvme_validate_ns(ns, &info); nvme_put_ns(ns); } else { - nvme_alloc_ns(ctrl, &info); + /* + * wait for the previous async task to finish + * before allocating the namespace. 
+ */ + async_scan_prev_task_wait(task); + ns = nvme_alloc_ns(ctrl, &info); + async_scan_task_complete(task); + if (ns) + nvme_init_ns(ctrl, ns, &info); } -} + return; -/** - * struct async_scan_info - keeps track of controller & NSIDs to scan - * @ctrl: Controller on which namespaces are being scanned - * @next_nsid: Index of next NSID to scan in ns_list - * @ns_list: Pointer to list of NSIDs to scan - * - * Note: There is a single async_scan_info structure shared by all instances - * of nvme_scan_ns_async() scanning a given controller, so the atomic - * operations on next_nsid are critical to ensure each instance scans a unique - * NSID. - */ -struct async_scan_info { - struct nvme_ctrl *ctrl; - atomic_t next_nsid; - __le32 *ns_list; -}; +exit: + async_scan_task_complete(task); +} static void nvme_scan_ns_async(void *data, async_cookie_t cookie) { - struct async_scan_info *scan_info = data; - int idx; - u32 nsid; + struct async_scan_task *task = data; - idx = (u32)atomic_fetch_inc(&scan_info->next_nsid); - nsid = le32_to_cpu(scan_info->ns_list[idx]); - - nvme_scan_ns(scan_info->ctrl, nsid); + nvme_scan_ns(task->ctrl, task->nsid, task); } static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -4167,14 +4237,12 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) u32 prev = 0; int ret = 0, i; ASYNC_DOMAIN(domain); - struct async_scan_info scan_info; + struct async_scan_task *task; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; - scan_info.ctrl = ctrl; - scan_info.ns_list = ns_list; for (;;) { struct nvme_command cmd = { .identify.opcode = nvme_admin_identify, @@ -4190,24 +4258,31 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) goto free; } - atomic_set(&scan_info.next_nsid, 0); for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); if (!nsid) /* end of the list? 
*/ goto out; - async_schedule_domain(nvme_scan_ns_async, &scan_info, + + task = async_scan_task_init(ctrl, nsid); + if (!task) { + ret = -ENOMEM; + goto out; + } + + async_schedule_domain(nvme_scan_ns_async, task, &domain); while (++prev < nsid) nvme_ns_remove_by_nsid(ctrl, prev); } - async_synchronize_full_domain(&domain); } out: + async_synchronize_full_domain(&domain); nvme_remove_invalid_namespaces(ctrl, prev); free: async_synchronize_full_domain(&domain); kfree(ns_list); + WARN_ON_ONCE(!list_empty(&ctrl->scan_list)); return ret; } @@ -4222,7 +4297,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) kfree(id); for (i = 1; i <= nn; i++) - nvme_scan_ns(ctrl, i); + nvme_scan_ns(ctrl, i, NULL); nvme_remove_invalid_namespaces(ctrl, nn); } @@ -4833,6 +4908,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->scan_list); + spin_lock_init(&ctrl->scan_list_lock); xa_init(&ctrl->cels); ctrl->dev = dev; ctrl->ops = ops; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 51e078642127..5ec40d96649e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -294,6 +294,8 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; + struct list_head scan_list; + spinlock_t scan_list_lock; struct mutex namespaces_lock; struct srcu_struct srcu; struct device ctrl_device; -- 2.43.5