Skip to content

Commit

Permalink
hw/nvme: reimplement dsm to allow cancellation
Browse files Browse the repository at this point in the history
Prior to this patch, a loop was used to issue multiple "fire and forget"
aios for each range in the command. Without a reference to the aiocb
returned from the blk_aio_pdiscard calls, the aios cannot be canceled.

Fix this by processing the ranges one after another.

As a bonus, this fixes how metadata is cleared (i.e. we only zero it out
if the data was successfully discarded).

Signed-off-by: Klaus Jensen <[email protected]>
Reviewed-by: Keith Busch <[email protected]>
  • Loading branch information
birkelund committed Jun 29, 2021
1 parent ff0ac2c commit d7d1474
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 70 deletions.
223 changes: 155 additions & 68 deletions hw/nvme/ctrl.c
Original file line number Diff line number Diff line change
Expand Up @@ -2015,26 +2015,6 @@ static void nvme_verify_mdata_in_cb(void *opaque, int ret)
nvme_verify_cb(ctx, ret);
}

/*
 * Completion callback shared by all discard aios issued for one DSM
 * request.  req->opaque holds the count of still-outstanding discards;
 * the request is completed only when the last one finishes.
 */
static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    uintptr_t *outstanding = (uintptr_t *)&req->opaque;

    trace_pci_nvme_aio_discard_cb(nvme_cid(req));

    if (ret) {
        /* record the error; remaining discards still have to drain */
        nvme_aio_err(req, ret);
    }

    if (--(*outstanding) == 0) {
        nvme_enqueue_req_completion(nvme_cq(req), req);
    }
}

struct nvme_zone_reset_ctx {
NvmeRequest *req;
NvmeZone *zone;
Expand Down Expand Up @@ -2495,75 +2475,182 @@ static void nvme_compare_data_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}

static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
typedef struct NvmeDSMAIOCB {
BlockAIOCB common;
BlockAIOCB *aiocb;
NvmeRequest *req;
QEMUBH *bh;
int ret;

NvmeDsmRange *range;
unsigned int nr;
unsigned int idx;
} NvmeDSMAIOCB;

/*
 * Cancellation hook (AIOCBInfo.cancel_async) for DSM.  Stops the range
 * loop in nvme_dsm_cb and cancels whatever single aio is in flight.
 */
static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

    /* make nvme_dsm_cb see an exhausted range list and bail out */
    iocb->idx = iocb->nr;
    iocb->ret = -ECANCELED;

    if (!iocb->aiocb) {
        /*
         * We only reach this if nvme_dsm_cancel() has already been called
         * or the command ran to completion and nvme_dsm_bh is scheduled
         * to run.
         */
        assert(iocb->idx == iocb->nr);
        return;
    }

    blk_aio_cancel_async(iocb->aiocb);
    iocb->aiocb = NULL;
}

/* AIOCB ops for DSM; cancel_async makes the command cancellable. */
static const AIOCBInfo nvme_dsm_aiocb_info = {
.aiocb_size = sizeof(NvmeDSMAIOCB),
.cancel_async = nvme_dsm_cancel,
};

static void nvme_dsm_bh(void *opaque)
{
NvmeDSMAIOCB *iocb = opaque;

iocb->common.cb(iocb->common.opaque, iocb->ret);

qemu_bh_delete(iocb->bh);
iocb->bh = NULL;
qemu_aio_unref(iocb);
}

static void nvme_dsm_cb(void *opaque, int ret);

static void nvme_dsm_md_cb(void *opaque, int ret)
{
NvmeDSMAIOCB *iocb = opaque;
NvmeRequest *req = iocb->req;
NvmeNamespace *ns = req->ns;
NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
NvmeDsmRange *range;
uint64_t slba;
uint32_t nlb;

uint32_t attr = le32_to_cpu(dsm->attributes);
uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
if (ret < 0) {
iocb->ret = ret;
goto done;
}

uint16_t status = NVME_SUCCESS;
if (!ns->lbaf.ms) {
nvme_dsm_cb(iocb, 0);
return;
}

trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
range = &iocb->range[iocb->idx - 1];
slba = le64_to_cpu(range->slba);
nlb = le32_to_cpu(range->nlb);

if (attr & NVME_DSMGMT_AD) {
int64_t offset;
size_t len;
NvmeDsmRange range[nr];
uintptr_t *discards = (uintptr_t *)&req->opaque;
/*
* Check that all block were discarded (zeroed); otherwise we do not zero
* the metadata.
*/

status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
if (status) {
return status;
ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
if (ret) {
if (ret < 0) {
iocb->ret = ret;
goto done;
}

/*
* AIO callbacks may be called immediately, so initialize discards to 1
* to make sure the the callback does not complete the request before
* all discards have been issued.
*/
*discards = 1;
nvme_dsm_cb(iocb, 0);
}

for (int i = 0; i < nr; i++) {
uint64_t slba = le64_to_cpu(range[i].slba);
uint32_t nlb = le32_to_cpu(range[i].nlb);
iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
nvme_dsm_cb, iocb);
return;

if (nvme_check_bounds(ns, slba, nlb)) {
continue;
}
done:
iocb->aiocb = NULL;
qemu_bh_schedule(iocb->bh);
}

trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
nlb);
static void nvme_dsm_cb(void *opaque, int ret)
{
NvmeDSMAIOCB *iocb = opaque;
NvmeRequest *req = iocb->req;
NvmeCtrl *n = nvme_ctrl(req);
NvmeNamespace *ns = req->ns;
NvmeDsmRange *range;
uint64_t slba;
uint32_t nlb;

if (nlb > n->dmrsl) {
trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
}
if (ret < 0) {
iocb->ret = ret;
goto done;
}

offset = nvme_l2b(ns, slba);
len = nvme_l2b(ns, nlb);
next:
if (iocb->idx == iocb->nr) {
goto done;
}

while (len) {
size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
range = &iocb->range[iocb->idx++];
slba = le64_to_cpu(range->slba);
nlb = le32_to_cpu(range->nlb);

(*discards)++;
trace_pci_nvme_dsm_deallocate(slba, nlb);

blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
nvme_aio_discard_cb, req);
if (nlb > n->dmrsl) {
trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
goto next;
}

offset += bytes;
len -= bytes;
}
}
if (nvme_check_bounds(ns, slba, nlb)) {
trace_pci_nvme_err_invalid_lba_range(slba, nlb,
ns->id_ns.nsze);
goto next;
}

/* account for the 1-initialization */
(*discards)--;
iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
nvme_l2b(ns, nlb),
nvme_dsm_md_cb, iocb);
return;

if (*discards) {
status = NVME_NO_COMPLETE;
} else {
status = req->status;
done:
iocb->aiocb = NULL;
qemu_bh_schedule(iocb->bh);
}

/*
 * Dataset Management command entry point.  For the deallocate (AD)
 * attribute, copy the range list from the host and kick off the
 * cancellable one-range-at-a-time discard chain; other attributes are
 * no-ops and complete immediately with success.
 */
static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
    uint32_t attr = le32_to_cpu(dsm->attributes);
    /* NR is 0's-based and capped at 255, so at most 256 ranges */
    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_dsm(nr, attr);

    if (attr & NVME_DSMGMT_AD) {
        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
                                         nvme_misc_cb, req);

        iocb->req = req;
        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
        iocb->ret = 0;
        iocb->range = g_new(NvmeDsmRange, nr);
        iocb->nr = nr;
        iocb->idx = 0;

        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                          req);
        if (status) {
            /*
             * The aiocb was never handed to the block layer and its bh
             * will never run, so tear it down here instead of leaking
             * the range buffer, the bh, and the aiocb reference.
             */
            g_free(iocb->range);
            qemu_bh_delete(iocb->bh);
            qemu_aio_unref(iocb);
            return status;
        }

        req->aiocb = &iocb->common;
        nvme_dsm_cb(iocb, 0);

        return NVME_NO_COMPLETE;
    }

    return status;
Expand Down
4 changes: 2 additions & 2 deletions hw/nvme/trace-events
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" bl
pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
Expand Down

0 comments on commit d7d1474

Please sign in to comment.