summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKrzysztof Karas <krzysztof.karas@intel.com>2022-09-14 11:57:03 +0200
committerTomasz Zawadzki <tomasz.zawadzki@intel.com>2022-09-22 19:18:30 +0000
commitdfc989439662457d39bac524be72e8ea1c20e817 (patch)
tree442ce39c2dbda5ba6fc6e8d61152d5d6c3f822c0
parenteafb489c0daedfccb8d56cf4aa586a1641a71573 (diff)
bdev: send bdev reset based on outstanding IO and a new timeout parameter
A new parameter io_drain_timeout has been added to spdk_bdev structure. If this value is unset, the bdev reset behavior does not change. The io_drain_timeout controls how long a bdev reset must wait for IO to complete prior to issuing a reset to the underlying device. If there is no outstanding IO at the end of that period, the reset is skipped. Change-Id: I585af427064ce234a4f60afc3d69bc9fc3252432 Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14501 Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com> Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Community-CI: Mellanox Build Bot
-rw-r--r--include/spdk/bdev_module.h30
-rw-r--r--lib/bdev/bdev.c85
-rw-r--r--module/bdev/lvol/vbdev_lvol.c7
-rw-r--r--test/unit/lib/bdev/mt/bdev.c/bdev_ut.c212
4 files changed, 330 insertions, 4 deletions
diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h
index 3081e236a..53d642f26 100644
--- a/include/spdk/bdev_module.h
+++ b/include/spdk/bdev_module.h
@@ -28,6 +28,11 @@
extern "C" {
#endif
+/* This parameter is best defined for bdevs that share an underlying bdev,
+ * such as multiple lvol bdevs sharing an nvme device, to avoid unnecessarily
+ * resetting the underlying bdev and affecting other bdevs that are sharing it. */
+#define BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE 5
+
/** Block device module */
struct spdk_bdev_module {
/**
@@ -431,6 +436,25 @@ struct spdk_bdev {
*/
bool media_events;
+ /* Upon receiving a reset request, this is the amount of time in seconds
+ * to wait for all I/O to complete before moving forward with the reset.
+ * If all I/O completes prior to this timeout, the reset will be skipped.
+ * A value of 0 is special and will always send resets immediately, even
+ * if there is no I/O outstanding.
+ *
+ * Use case example:
+ * A shared bdev (e.g. multiple lvol bdevs sharing an underlying nvme bdev)
+ * needs to be reset. For a non-zero value, the bdev reset code will wait
+ * `reset_io_drain_timeout` seconds for outstanding IO that are present
+ * on any bdev channel, before sending a reset down to the underlying device.
+ * That way we can avoid sending "empty" resets and interrupting work of
+ * other lvols that use the same bdev. BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE
+ * is a good choice for the value of this parameter.
+ *
+ * If this parameter remains equal to zero, the bdev reset will be forcefully
+ * sent down to the device, without any delays and waiting for outstanding IO. */
+ uint16_t reset_io_drain_timeout;
+
/**
* Pointer to the bdev module that registered this bdev.
*/
@@ -629,6 +653,12 @@ struct spdk_bdev_io {
struct {
/** Channel reference held while messages for this reset are in progress. */
struct spdk_io_channel *ch_ref;
+ struct {
+ /* Handle to timed poller that checks each channel for outstanding IO. */
+ struct spdk_poller *poller;
+ /* Store calculated time value, when a poller should stop its work. */
+ uint64_t stop_time_tsc;
+ } wait_poller;
} reset;
struct {
/** The outstanding request matching bio_cb_arg which this abort attempts to cancel. */
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index 4e7248c41..e3570c0e5 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -54,6 +54,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
* when splitting into children requests at a time.
*/
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
+#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
@@ -5276,15 +5277,91 @@ spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
return 0;
}
+static int bdev_reset_poll_for_outstanding_io(void *ctx);
+
static void
-bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
+bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
{
struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
struct spdk_bdev_io *bdev_io;
bdev_io = TAILQ_FIRST(&ch->queued_resets);
- TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
- bdev_io_submit_reset(bdev_io);
+
+ if (status == -EBUSY) {
+ if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
+ bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
+ ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+ } else {
+ /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed,
+ * start the reset. */
+ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+ bdev_io_submit_reset(bdev_io);
+ }
+ } else {
+ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+ SPDK_DEBUGLOG(bdev,
+ "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
+ ch->bdev->name);
+ /* Mark the completion status as a SUCCESS and complete the reset. */
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ }
+}
+
+static void
+bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch);
+ int status = 0;
+
+ if (cur_ch->io_outstanding > 0) {
+ /* If a channel has outstanding IO, set status to -EBUSY code. This will stop
+ * further iteration over the rest of the channels and pass non-zero status
+ * to the callback function. */
+ status = -EBUSY;
+ }
+ spdk_for_each_channel_continue(i, status);
+}
+
+static int
+bdev_reset_poll_for_outstanding_io(void *ctx)
+{
+ struct spdk_bdev_channel *ch = ctx;
+ struct spdk_bdev_io *bdev_io;
+
+ bdev_io = TAILQ_FIRST(&ch->queued_resets);
+
+ spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
+ spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
+ ch, bdev_reset_check_outstanding_io_done);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_bdev *bdev = ch->bdev;
+ struct spdk_bdev_io *bdev_io;
+
+ bdev_io = TAILQ_FIRST(&ch->queued_resets);
+
+ if (bdev->reset_io_drain_timeout == 0) {
+ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+
+ bdev_io_submit_reset(bdev_io);
+ return;
+ }
+
+ bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
+ (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
+
+ /* In case bdev->reset_io_drain_timeout is not equal to zero,
+ * submit the reset to the underlying module only if outstanding I/O
+ * remain after reset_io_drain_timeout seconds have passed. */
+ spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
+ ch, bdev_reset_check_outstanding_io_done);
}
static void
@@ -5331,7 +5408,7 @@ bdev_start_reset(void *ctx)
struct spdk_bdev_channel *ch = ctx;
spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
- ch, bdev_reset_dev);
+ ch, bdev_reset_freeze_channel_done);
}
static void
diff --git a/module/bdev/lvol/vbdev_lvol.c b/module/bdev/lvol/vbdev_lvol.c
index dadf6c01d..1ce041436 100644
--- a/module/bdev/lvol/vbdev_lvol.c
+++ b/module/bdev/lvol/vbdev_lvol.c
@@ -1041,6 +1041,13 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
bdev->fn_table = &vbdev_lvol_fn_table;
bdev->module = &g_lvol_if;
+ /* Set default bdev reset waiting time. This value indicates how much
+ * time a reset should wait before forcing a reset down to the underlying
+ * bdev module.
+ * Setting this parameter is mainly to avoid "empty" resets to a shared
+ * bdev that may be used by multiple lvols. */
+ bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+
rc = spdk_bdev_register(bdev);
if (rc) {
free(lvol_bdev);
diff --git a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
index bf9c52ef1..1af31c8ff 100644
--- a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
+++ b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
@@ -486,6 +486,8 @@ aborted_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
spdk_bdev_free_io(bdev_io);
}
+static void io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+
static void
aborted_reset(void)
{
@@ -543,6 +545,58 @@ aborted_reset(void)
}
static void
+aborted_reset_no_outstanding_io(void)
+{
+ struct spdk_io_channel *io_ch[2];
+ struct spdk_bdev_channel *bdev_ch[2];
+ struct spdk_bdev *bdev[2];
+ enum spdk_bdev_io_status status1 = SPDK_BDEV_IO_STATUS_PENDING,
+ status2 = SPDK_BDEV_IO_STATUS_PENDING;
+
+ setup_test();
+
+ /*
+ * This time we test the reset without any outstanding IO
+ * present on the bdev channel, so both resets should finish
+ * immediately.
+ */
+
+ set_thread(0);
+ /* Set reset_io_drain_timeout so the reset path polls for outstanding IO;
+ * with no IO present, the reset should complete immediately. */
+ io_ch[0] = spdk_bdev_get_io_channel(g_desc);
+ bdev_ch[0] = spdk_io_channel_get_ctx(io_ch[0]);
+ bdev[0] = bdev_ch[0]->bdev;
+ bdev[0]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+ CU_ASSERT(io_ch[0] != NULL);
+ spdk_bdev_reset(g_desc, io_ch[0], aborted_reset_done, &status1);
+ poll_threads();
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+ CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
+ spdk_put_io_channel(io_ch[0]);
+
+ set_thread(1);
+ /* Set reset_io_drain_timeout so the reset path polls for outstanding IO;
+ * with no IO present, the reset should complete immediately. */
+ io_ch[1] = spdk_bdev_get_io_channel(g_desc);
+ bdev_ch[1] = spdk_io_channel_get_ctx(io_ch[1]);
+ bdev[1] = bdev_ch[1]->bdev;
+ bdev[1]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+ CU_ASSERT(io_ch[1] != NULL);
+ spdk_bdev_reset(g_desc, io_ch[1], aborted_reset_done, &status2);
+ poll_threads();
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+ CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
+ spdk_put_io_channel(io_ch[1]);
+
+ stub_complete_io(g_bdev.io_target, 0);
+ poll_threads();
+
+ teardown_test();
+}
+
+
+static void
io_during_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
enum spdk_bdev_io_status *status = cb_arg;
@@ -655,6 +709,162 @@ io_during_reset(void)
teardown_test();
}
+static uint32_t
+count_queued_resets(void *io_target)
+{
+ struct spdk_io_channel *_ch = spdk_get_io_channel(io_target);
+ struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_bdev_io *io;
+ uint32_t submitted_resets = 0;
+
+ TAILQ_FOREACH(io, &ch->outstanding_io, module_link) {
+ if (io->type == SPDK_BDEV_IO_TYPE_RESET) {
+ submitted_resets++;
+ }
+ }
+
+ spdk_put_io_channel(_ch);
+
+ return submitted_resets;
+}
+
+static void
+reset_completions(void)
+{
+ struct spdk_io_channel *io_ch;
+ struct spdk_bdev_channel *bdev_ch;
+ struct spdk_bdev *bdev;
+ enum spdk_bdev_io_status status0, status_reset;
+ int rc, iter;
+
+ setup_test();
+
+ /* This test covers four test cases:
+ * 1) reset_io_drain_timeout of a bdev is greater than 0
+ * 2) No outstanding IO are present on any bdev channel
+ * 3) Outstanding IO finish during bdev reset
+ * 4) Outstanding IO do not finish before reset is done waiting
+ * for them.
+ *
+ * Above conditions mainly affect the timing of bdev reset completion
+ * and whether a reset should be skipped via spdk_bdev_io_complete()
+ * or sent down to the underlying bdev module via bdev_io_submit_reset(). */
+
+ /* Test preparation */
+ set_thread(0);
+ io_ch = spdk_bdev_get_io_channel(g_desc);
+ bdev_ch = spdk_io_channel_get_ctx(io_ch);
+ CU_ASSERT(bdev_ch->flags == 0);
+
+
+ /* Test case 1) reset_io_drain_timeout set to 0. Reset should be sent down immediately. */
+ bdev = &g_bdev.bdev;
+ bdev->reset_io_drain_timeout = 0;
+
+ status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+ rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+ CU_ASSERT(rc == 0);
+ poll_threads();
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
+
+ /* Call reset completion inside bdev module. */
+ stub_complete_io(g_bdev.io_target, 0);
+ poll_threads();
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+ /* Test case 2) no outstanding IO are present. Reset should perform one iteration over
+ * channels and then be skipped. */
+ bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+ status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+
+ rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+ CU_ASSERT(rc == 0);
+ poll_threads();
+ /* Reset was never submitted to the bdev module. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+ /* Test case 3) outstanding IO finish during bdev reset procedure. Reset should initiate
+ * wait poller to check for IO completions every second, until reset_io_drain_timeout is
+ * reached, but finish earlier than this threshold. */
+ status0 = SPDK_BDEV_IO_STATUS_PENDING;
+ status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+ rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
+ CU_ASSERT(rc == 0);
+
+ rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+ CU_ASSERT(rc == 0);
+ poll_threads();
+ /* The reset just started and should not have been submitted yet. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+
+ poll_threads();
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+ /* Let the poller wait for about half the time then complete outstanding IO. */
+ for (iter = 0; iter < 2; iter++) {
+ /* Reset is still processing and not submitted at this point. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+ spdk_delay_us(1000 * 1000);
+ poll_threads();
+ poll_threads();
+ }
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+ stub_complete_io(g_bdev.io_target, 0);
+ poll_threads();
+ spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+ poll_threads();
+ poll_threads();
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+ /* Sending reset to the bdev module has been skipped. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+ /* Test case 4) outstanding IO are still present after reset_io_drain_timeout
+ * seconds have passed. */
+ status0 = SPDK_BDEV_IO_STATUS_PENDING;
+ status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+ rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
+ CU_ASSERT(rc == 0);
+
+ rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+ CU_ASSERT(rc == 0);
+ poll_threads();
+ /* The reset just started and should not have been submitted yet. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+
+ poll_threads();
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+ /* Let the poller wait for reset_io_drain_timeout seconds. */
+ for (iter = 0; iter < bdev->reset_io_drain_timeout; iter++) {
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+ spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+ poll_threads();
+ poll_threads();
+ }
+
+ /* After timing out, the reset should have been sent to the module. */
+ CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
+ /* Complete reset submitted to the module and the read IO. */
+ stub_complete_io(g_bdev.io_target, 0);
+ poll_threads();
+ CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+ CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+ /* Destroy the channel and end the test. */
+ spdk_put_io_channel(io_ch);
+ poll_threads();
+
+ teardown_test();
+}
+
+
static void
basic_qos(void)
{
@@ -2092,7 +2302,9 @@ main(int argc, char **argv)
CU_ADD_TEST(suite, basic_qos);
CU_ADD_TEST(suite, put_channel_during_reset);
CU_ADD_TEST(suite, aborted_reset);
+ CU_ADD_TEST(suite, aborted_reset_no_outstanding_io);
CU_ADD_TEST(suite, io_during_reset);
+ CU_ADD_TEST(suite, reset_completions);
CU_ADD_TEST(suite, io_during_qos_queue);
CU_ADD_TEST(suite, io_during_qos_reset);
CU_ADD_TEST(suite, enomem);