summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 12:30:07 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 12:30:07 -1000
commit90d624af2e5a9945eedd5cafd6ae6d88f32cc977 (patch)
treee936a0cc8f2b613f327ab08280dccbad664703cf /drivers
parent4de520f1fcefd4ebb7dddcf28bde1b330c2f6b5d (diff)
parent0c696bb38f4cc0f0f90a8e06ae1eda21a9630cd0 (diff)
Merge tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - Improvements to the queue_rqs() support, and adding null_blk support for that as well (Chengming) - Series improving badblocks support (Coly) - Key store support for sed-opal (Greg) - IBM partition string handling improvements (Jan) - Make number of ublk devices supported configurable (Mike) - Cancelation improvements for ublk (Ming) - MD pull requests via Song: - Handle timeout in md-cluster, by Denis Plotnikov - Cleanup pers->prepare_suspend, by Yu Kuai - Rewrite mddev_suspend(), by Yu Kuai - Simplify md_seq_ops, by Yu Kuai - Reduce unnecessary locking array_state_store(), by Mariusz Tkaczyk - Make rdev add/remove independent from daemon thread, by Yu Kuai - Refactor code around quiesce() and mddev_suspend(), by Yu Kuai - NVMe pull request via Keith: - nvme-auth updates (Mark) - nvme-tcp tls (Hannes) - nvme-fc annotaions (Kees) - Misc cleanups and improvements (Jiapeng, Joel) * tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux: (95 commits) block: ublk_drv: Remove unused function md: cleanup pers->prepare_suspend() nvme-auth: allow mixing of secret and hash lengths nvme-auth: use transformed key size to create resp nvme-auth: alloc nvme_dhchap_key as single buffer nvmet-tcp: use 'spin_lock_bh' for state_lock() powerpc/pseries: PLPKS SED Opal keystore support block: sed-opal: keystore access for SED Opal keys block:sed-opal: SED Opal keystore ublk: simplify aborting request ublk: replace monitor with cancelable uring_cmd ublk: quiesce request queue when aborting queue ublk: rename mm_lock as lock ublk: move ublk_cancel_dev() out of ub->mutex ublk: make sure io cmd handled in submitter task context ublk: don't get ublk device reference in ublk_abort_queue() ublk: Make ublks_max configurable ublk: Limit dev_id/ub_number values md-cluster: check for timeout while a new disk adding nvme: rework NVME_AUTH Kconfig selection ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/aoe/aoenet.c3
-rw-r--r--drivers/block/null_blk/main.c22
-rw-r--r--drivers/block/ublk_drv.c342
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/cdrom/cdrom.c1
-rw-r--r--drivers/md/dm-raid.c17
-rw-r--r--drivers/md/md-autodetect.c4
-rw-r--r--drivers/md/md-bitmap.c61
-rw-r--r--drivers/md/md-cluster.c15
-rw-r--r--drivers/md/md-linear.c28
-rw-r--r--drivers/md/md-linear.h2
-rw-r--r--drivers/md/md.c822
-rw-r--r--drivers/md/md.h70
-rw-r--r--drivers/md/raid1.c6
-rw-r--r--drivers/md/raid10.c3
-rw-r--r--drivers/md/raid5-cache.c64
-rw-r--r--drivers/md/raid5.c103
-rw-r--r--drivers/nvme/common/Kconfig13
-rw-r--r--drivers/nvme/common/Makefile3
-rw-r--r--drivers/nvme/common/auth.c68
-rw-r--r--drivers/nvme/common/keyring.c182
-rw-r--r--drivers/nvme/host/Kconfig24
-rw-r--r--drivers/nvme/host/Makefile2
-rw-r--r--drivers/nvme/host/auth.c30
-rw-r--r--drivers/nvme/host/core.c14
-rw-r--r--drivers/nvme/host/fabrics.c67
-rw-r--r--drivers/nvme/host/fabrics.h9
-rw-r--r--drivers/nvme/host/nvme.h5
-rw-r--r--drivers/nvme/host/pci.c1
-rw-r--r--drivers/nvme/host/sysfs.c27
-rw-r--r--drivers/nvme/host/tcp.c184
-rw-r--r--drivers/nvme/target/Kconfig22
-rw-r--r--drivers/nvme/target/auth.c31
-rw-r--r--drivers/nvme/target/configfs.c128
-rw-r--r--drivers/nvme/target/fc.c3
-rw-r--r--drivers/nvme/target/nvmet.h11
-rw-r--r--drivers/nvme/target/tcp.c341
37 files changed, 1892 insertions, 838 deletions
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 63773a90581d..c51ea95bc2ce 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -39,8 +39,7 @@ static struct ktstate kts;
#ifndef MODULE
static int __init aoe_iflist_setup(char *str)
{
- strncpy(aoe_iflist, str, IFLISTSZ);
- aoe_iflist[IFLISTSZ - 1] = '\0';
+ strscpy(aoe_iflist, str, IFLISTSZ);
return 1;
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 968090935eb2..22a3cf7f32e2 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1750,6 +1750,25 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq));
}
+static void null_queue_rqs(struct request **rqlist)
+{
+ struct request *requeue_list = NULL;
+ struct request **requeue_lastp = &requeue_list;
+ struct blk_mq_queue_data bd = { };
+ blk_status_t ret;
+
+ do {
+ struct request *rq = rq_list_pop(rqlist);
+
+ bd.rq = rq;
+ ret = null_queue_rq(rq->mq_hctx, &bd);
+ if (ret != BLK_STS_OK)
+ rq_list_add_tail(&requeue_lastp, rq);
+ } while (!rq_list_empty(*rqlist));
+
+ *rqlist = requeue_list;
+}
+
static void cleanup_queue(struct nullb_queue *nq)
{
bitmap_free(nq->tag_map);
@@ -1802,6 +1821,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
+ .queue_rqs = null_queue_rqs,
.complete = null_complete_rq,
.timeout = null_timeout_rq,
.poll = null_poll,
@@ -1946,7 +1966,7 @@ static int null_gendisk_register(struct nullb *nullb)
else
disk->fops = &null_bio_ops;
disk->private_data = nullb;
- strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+ strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
int ret = null_register_zoned_dev(nullb);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 630ddfe6657b..83600b45e12a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -75,6 +75,7 @@ struct ublk_rq_data {
struct ublk_uring_cmd_pdu {
struct ublk_queue *ubq;
+ u16 tag;
};
/*
@@ -115,6 +116,9 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/* atomic RW with ubq->cancel_lock */
+#define UBLK_IO_FLAG_CANCELED 0x80000000
+
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@@ -138,13 +142,13 @@ struct ublk_queue {
unsigned int max_io_sz;
bool force_abort;
bool timeout;
+ bool canceling;
unsigned short nr_io_ready; /* how many ios setup */
+ spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
};
-#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
-
struct ublk_device {
struct gendisk *ub_disk;
@@ -166,7 +170,7 @@ struct ublk_device {
struct mutex mutex;
- spinlock_t mm_lock;
+ spinlock_t lock;
struct mm_struct *mm;
struct ublk_params params;
@@ -175,11 +179,6 @@ struct ublk_device {
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
- /*
- * Our ubq->daemon may be killed without any notification, so
- * monitor each queue's daemon periodically
- */
- struct delayed_work monitor_work;
struct work_struct quiesce_work;
struct work_struct stop_work;
};
@@ -190,10 +189,11 @@ struct ublk_params_header {
__u32 types;
};
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
-
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_USER_COPY;
@@ -470,6 +470,7 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
* It can be extended to one per-user limit in future or even controlled
* by cgroup.
*/
+#define UBLK_MAX_UBLKS UBLK_MINORS
static unsigned int ublks_max = 64;
static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
@@ -1083,13 +1084,10 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
- io->flags |= UBLK_IO_FLAG_ABORTED;
- if (ublk_queue_can_use_recovery_reissue(ubq))
- blk_mq_requeue_request(req, false);
- else
- ublk_put_req_ref(ubq, req);
- }
+ if (ublk_queue_can_use_recovery_reissue(ubq))
+ blk_mq_requeue_request(req, false);
+ else
+ ublk_put_req_ref(ubq, req);
}
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
@@ -1118,8 +1116,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_requeue_request(rq, false);
else
blk_mq_end_request(rq, BLK_STS_IOERR);
-
- mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
}
static inline void __ublk_rq_task_work(struct request *req,
@@ -1212,15 +1208,6 @@ static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
__ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
}
-static inline void ublk_abort_io_cmds(struct ublk_queue *ubq)
-{
- struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
- struct ublk_rq_data *data, *tmp;
-
- llist_for_each_entry_safe(data, tmp, io_cmds, node)
- __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
-}
-
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
@@ -1232,38 +1219,19 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
- struct ublk_io *io;
- if (!llist_add(&data->node, &ubq->io_cmds))
- return;
+ if (llist_add(&data->node, &ubq->io_cmds)) {
+ struct ublk_io *io = &ubq->ios[rq->tag];
- io = &ubq->ios[rq->tag];
- /*
- * If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
- * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
- * because this ioucmd's io_uring context may be freed now if no inflight
- * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
- *
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
- * the tag). Then the request is re-started(allocating the tag) and we are here.
- * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
- * guarantees that here is a re-issued request aborted previously.
- */
- if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
- ublk_abort_io_cmds(ubq);
- } else {
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->ubq = ubq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
}
}
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ unsigned int nr_inflight = 0;
+ int i;
if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
if (!ubq->timeout) {
@@ -1274,6 +1242,29 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_DONE;
}
+ if (!ubq_daemon_is_dying(ubq))
+ return BLK_EH_RESET_TIMER;
+
+ for (i = 0; i < ubq->q_depth; i++) {
+ struct ublk_io *io = &ubq->ios[i];
+
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ nr_inflight++;
+ }
+
+ /* cancelable uring_cmd can't help us if all commands are in-flight */
+ if (nr_inflight == ubq->q_depth) {
+ struct ublk_device *ub = ubq->dev;
+
+ if (ublk_abort_requests(ub, ubq)) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
+ }
+ return BLK_EH_DONE;
+ }
+
return BLK_EH_RESET_TIMER;
}
@@ -1301,13 +1292,12 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
return BLK_STS_IOERR;
- blk_mq_start_request(bd->rq);
-
- if (unlikely(ubq_daemon_is_dying(ubq))) {
+ if (unlikely(ubq->canceling)) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
+ blk_mq_start_request(bd->rq);
ublk_queue_cmd(ubq, rq);
return BLK_STS_OK;
@@ -1357,12 +1347,12 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
int q_id, ret = 0;
- spin_lock(&ub->mm_lock);
+ spin_lock(&ub->lock);
if (!ub->mm)
ub->mm = current->mm;
if (current->mm != ub->mm)
ret = -EINVAL;
- spin_unlock(&ub->mm_lock);
+ spin_unlock(&ub->lock);
if (ret)
return ret;
@@ -1411,17 +1401,14 @@ static void ublk_commit_completion(struct ublk_device *ub,
}
/*
- * When ->ubq_daemon is exiting, either new request is ended immediately,
- * or any queued io command is drained, so it is safe to abort queue
- * lockless
+ * Called from ubq_daemon context via cancel fn, meantime quiesce ublk
+ * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
+ * context, so everything is serialized.
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
- if (!ublk_get_device(ub))
- return;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1433,72 +1420,114 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq)
+ if (rq && blk_mq_request_started(rq)) {
+ io->flags |= UBLK_IO_FLAG_ABORTED;
__ublk_fail_req(ubq, io, rq);
+ }
}
}
- ublk_put_device(ub);
}
-static void ublk_daemon_monitor_work(struct work_struct *work)
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, monitor_work.work);
- int i;
+ struct gendisk *disk;
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
+ spin_lock(&ubq->cancel_lock);
+ if (ubq->canceling) {
+ spin_unlock(&ubq->cancel_lock);
+ return false;
+ }
+ ubq->canceling = true;
+ spin_unlock(&ubq->cancel_lock);
- if (ubq_daemon_is_dying(ubq)) {
- if (ublk_queue_can_use_recovery(ubq))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
+ spin_lock(&ub->lock);
+ disk = ub->ub_disk;
+ if (disk)
+ get_device(disk_to_dev(disk));
+ spin_unlock(&ub->lock);
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
- }
- }
+ /* Our disk has been dead */
+ if (!disk)
+ return false;
- /*
- * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
- * after ublk_remove() or __ublk_quiesce_dev() is started.
- *
- * No need ub->mutex, monitor work are canceled after state is marked
- * as not LIVE, so new state is observed reliably.
- */
- if (ub->dev_info.state == UBLK_S_DEV_LIVE)
- schedule_delayed_work(&ub->monitor_work,
- UBLK_DAEMON_MONITOR_PERIOD);
-}
+ /* Now we are serialized with ublk_queue_rq() */
+ blk_mq_quiesce_queue(disk->queue);
+ /* abort queue is for making forward progress */
+ ublk_abort_queue(ub, ubq);
+ blk_mq_unquiesce_queue(disk->queue);
+ put_device(disk_to_dev(disk));
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
-{
- return ubq->nr_io_ready == ubq->q_depth;
+ return true;
}
-static void ublk_cmd_cancel_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
+static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+ unsigned int issue_flags)
{
- io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+ bool done;
+
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ return;
+
+ spin_lock(&ubq->cancel_lock);
+ done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
+ if (!done)
+ io->flags |= UBLK_IO_FLAG_CANCELED;
+ spin_unlock(&ubq->cancel_lock);
+
+ if (!done)
+ io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
}
-static void ublk_cancel_queue(struct ublk_queue *ubq)
+/*
+ * The ublk char device won't be closed when calling cancel fn, so both
+ * ublk device and queue are guaranteed to be live
+ */
+static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
- int i;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+ struct task_struct *task;
+ struct ublk_device *ub;
+ bool need_schedule;
+ struct ublk_io *io;
- if (!ublk_queue_ready(ubq))
+ if (WARN_ON_ONCE(!ubq))
return;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
+ if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
+ return;
+
+ task = io_uring_cmd_get_task(cmd);
+ if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+ return;
+
+ ub = ubq->dev;
+ need_schedule = ublk_abort_requests(ub, ubq);
+
+ io = &ubq->ios[pdu->tag];
+ WARN_ON_ONCE(io->cmd != cmd);
+ ublk_cancel_cmd(ubq, io, issue_flags);
- if (io->flags & UBLK_IO_FLAG_ACTIVE)
- io_uring_cmd_complete_in_task(io->cmd,
- ublk_cmd_cancel_cb);
+ if (need_schedule) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
}
+}
- /* all io commands are canceled */
- ubq->nr_io_ready = 0;
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+ return ubq->nr_io_ready == ubq->q_depth;
+}
+
+static void ublk_cancel_queue(struct ublk_queue *ubq)
+{
+ int i;
+
+ for (i = 0; i < ubq->q_depth; i++)
+ ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -1545,16 +1574,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
blk_mq_quiesce_queue(ub->ub_disk->queue);
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
- ublk_cancel_dev(ub);
- /* we are going to release task_struct of ubq_daemon and resets
- * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
- * Besides, monitor_work is not necessary in QUIESCED state since we have
- * already scheduled quiesce_work and quiesced all ubqs.
- *
- * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
- * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
- */
- cancel_delayed_work_sync(&ub->monitor_work);
}
static void ublk_quiesce_work_fn(struct work_struct *work)
@@ -1568,6 +1587,7 @@ static void ublk_quiesce_work_fn(struct work_struct *work)
__ublk_quiesce_dev(ub);
unlock:
mutex_unlock(&ub->mutex);
+ ublk_cancel_dev(ub);
}
static void ublk_unquiesce_dev(struct ublk_device *ub)
@@ -1593,6 +1613,8 @@ static void ublk_unquiesce_dev(struct ublk_device *ub)
static void ublk_stop_dev(struct ublk_device *ub)
{
+ struct gendisk *disk;
+
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto unlock;
@@ -1602,14 +1624,18 @@ static void ublk_stop_dev(struct ublk_device *ub)
ublk_unquiesce_dev(ub);
}
del_gendisk(ub->ub_disk);
+
+ /* Sync with ublk_abort_queue() by holding the lock */
+ spin_lock(&ub->lock);
+ disk = ub->ub_disk;
ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1;
- put_disk(ub->ub_disk);
ub->ub_disk = NULL;
+ spin_unlock(&ub->lock);
+ put_disk(disk);
unlock:
- ublk_cancel_dev(ub);
mutex_unlock(&ub->mutex);
- cancel_delayed_work_sync(&ub->monitor_work);
+ ublk_cancel_dev(ub);
}
/* device can only be started after all IOs are ready */
@@ -1660,6 +1686,21 @@ static inline void ublk_fill_io_cmd(struct ublk_io *io,
io->addr = buf_addr;
}
+static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct ublk_queue *ubq, unsigned int tag)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ /*
+ * Safe to refer to @ubq since ublk_queue won't be died until its
+ * commands are completed
+ */
+ pdu->ubq = ubq;
+ pdu->tag = tag;
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1775,6 +1816,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
default:
goto out;
}
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
return -EIOCBQUEUED;
out:
@@ -1814,7 +1856,8 @@ fail_put:
return NULL;
}
-static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
/*
* Not necessary for async retry, but let's keep it simple and always
@@ -1828,9 +1871,33 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
.addr = READ_ONCE(ub_src->addr)
};
+ WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
+
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}
+static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ ublk_ch_uring_cmd_local(cmd, issue_flags);
+}
+
+static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+ ublk_uring_cmd_cancel_fn(cmd, issue_flags);
+ return 0;
+ }
+
+ /* well-implemented server won't run into unlocked */
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
+ return -EIOCBQUEUED;
+ }
+
+ return ublk_ch_uring_cmd_local(cmd, issue_flags);
+}
+
static inline bool ublk_check_ubuf_dir(const struct request *req,
int ubuf_dir)
{
@@ -1962,6 +2029,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
void *ptr;
int size;
+ spin_lock_init(&ubq->cancel_lock);
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
ubq->q_depth = ub->dev_info.queue_depth;
@@ -2026,7 +2094,8 @@ static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
+ err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
+ GFP_NOWAIT);
}
spin_unlock(&ublk_idr_lock);
@@ -2151,8 +2220,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
-
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
test_bit(UB_STATE_USED, &ub->state)) {
@@ -2305,6 +2372,12 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
return -EINVAL;
}
+ if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
+ pr_warn("%s: dev id is too large. Max supported is %d\n",
+ __func__, UBLK_MAX_UBLKS - 1);
+ return -EINVAL;
+ }
+
ublk_dump_dev_info(&info);
ret = mutex_lock_killable(&ublk_ctl_mutex);
@@ -2320,10 +2393,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (!ub)
goto out_unlock;
mutex_init(&ub->mutex);
- spin_lock_init(&ub->mm_lock);
+ spin_lock_init(&ub->lock);
INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
- INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2569,13 +2641,15 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
int i;
WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
+
/* All old ioucmds have to be completed */
- WARN_ON_ONCE(ubq->nr_io_ready);
+ ubq->nr_io_ready = 0;
/* old daemon is PF_EXITING, put it now */
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
ubq->timeout = false;
+ ubq->canceling = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2661,7 +2735,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
__func__, header->dev_id);
blk_mq_kick_requeue_list(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_LIVE;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);
@@ -2932,7 +3005,22 @@ static void __exit ublk_exit(void)
module_init(ublk_init);
module_exit(ublk_exit);
-module_param(ublks_max, int, 0444);
+static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
+}
+
+static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
+{
+ return sysfs_emit(buf, "%u\n", ublks_max);
+}
+
+static const struct kernel_param_ops ublk_max_ublks_ops = {
+ .set = ublk_set_max_ublks,
+ .get = ublk_get_max_ublks,
+};
+
+module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1fe011676d07..4689ac2e0c0e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -470,8 +470,6 @@ static bool virtblk_prep_rq_batch(struct request *req)
struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
- req->mq_hctx->tags->rqs[req->tag] = req;
-
return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
}
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index cc2839805983..a5e07270e0d4 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3655,7 +3655,6 @@ static struct ctl_table cdrom_table[] = {
.mode = 0644,
.proc_handler = cdrom_sysctl_handler
},
- { }
};
static struct ctl_table_header *cdrom_sysctl_header;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9755788e8b78..91ebdcc6e9a8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return ERR_PTR(-ENOMEM);
}
- mddev_init(&rs->md);
+ if (mddev_init(&rs->md)) {
+ kfree(rs);
+ ti->error = "Cannot initialize raid context";
+ return ERR_PTR(-ENOMEM);
+ }
rs->raid_disks = raid_devs;
rs->delta_disks = 0;
@@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev);
}
+ mddev_destroy(&rs->md);
kfree(rs);
}
@@ -3239,7 +3244,7 @@ size_check:
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
- mddev_lock_nointr(&rs->md);
+ mddev_suspend_and_lock_nointr(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3263,7 +3268,6 @@ size_check:
}
}
- mddev_suspend(&rs->md);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3793,9 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti)
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
- mddev_lock_nointr(&rs->md);
- mddev_suspend(&rs->md);
- mddev_unlock(&rs->md);
+ mddev_suspend(&rs->md, false);
}
}
@@ -4054,8 +4056,7 @@ static void raid_resume(struct dm_target *ti)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
- mddev_resume(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
}
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 6eaa0eab40f9..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
return;
}
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
goto out_mddev_put;
@@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
if (err)
pr_warn("md: starting %s failed\n", name);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
out_mddev_put:
mddev_put(mddev);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 6f9ff14971f9..9672f75c3050 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev)
md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2348,14 +2348,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
{
int rv;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
+
if (mddev->pers) {
- if (!mddev->pers->quiesce) {
- rv = -EBUSY;
- goto out;
- }
if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY;
goto out;
@@ -2369,11 +2366,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EBUSY;
goto out;
}
- if (mddev->pers) {
- mddev_suspend(mddev);
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- }
+
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
@@ -2383,6 +2377,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
+ struct bitmap *bitmap;
+
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
else if (strncmp(buf, "file:", 5) == 0) {
@@ -2406,25 +2402,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
goto out;
}
+
mddev->bitmap_info.offset = offset;
- if (mddev->pers) {
- struct bitmap *bitmap;
- bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
- if (IS_ERR(bitmap))
- rv = PTR_ERR(bitmap);
- else {
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
- if (rv)
- mddev->bitmap_info.offset = 0;
- }
- if (rv) {
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- goto out;
- }
- mddev_resume(mddev);
+ bitmap = md_bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ goto out;
+ }
+
+ mddev->bitmap = bitmap;
+ rv = md_bitmap_load(mddev);
+ if (rv) {
+ mddev->bitmap_info.offset = 0;
+ md_bitmap_destroy(mddev);
+ goto out;
}
}
}
@@ -2437,7 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
rv = 0;
out:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (rv)
return rv;
return len;
@@ -2546,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
@@ -2571,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, false);
+ mddev_destroy_serial_pool(mddev, NULL);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return len;
}
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1e26eb223349..8e36a0feec09 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -501,7 +501,7 @@ static void process_suspend_info(struct mddev *mddev,
mddev->pers->quiesce(mddev, 0);
}
-static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
+static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
char disk_uuid[64];
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -509,6 +509,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
char raid_slot[16];
char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
int len;
+ int res = 0;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
@@ -517,9 +518,14 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
- wait_for_completion_timeout(&cinfo->newdisk_completion,
- NEW_DEV_TIMEOUT);
+ if (!wait_for_completion_timeout(&cinfo->newdisk_completion,
+ NEW_DEV_TIMEOUT)) {
+ pr_err("md-cluster(%s:%d): timeout on a new disk adding\n",
+ __func__, __LINE__);
+ res = -1;
+ }
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+ return res;
}
@@ -594,7 +600,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
le64_to_cpu(msg->high));
break;
case NEWDISK:
- process_add_new_disk(mddev, msg);
+ if (process_add_new_disk(mddev, msg))
+ ret = -1;
break;
case REMOVE:
process_remove_disk(mddev, msg);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 71ac99646827..8eca7693b793 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
if (!conf)
return NULL;
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
cnt = 0;
conf->array_sectors = 0;
@@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
- /*
- * conf->raid_disks is copy of mddev->raid_disks. The reason to
- * keep a copy of mddev->raid_disks in struct linear_conf is,
- * mddev->raid_disks may not be consistent with pointers number of
- * conf->disks[] when it is updated in linear_add() and used to
- * iterate old conf->disks[] earray in linear_congested().
- * Here conf->raid_disks is always consitent with number of
- * pointers in conf->disks[] array, and mddev->private is updated
- * with rcu_assign_pointer() in linear_addr(), such race can be
- * avoided.
- */
- conf->raid_disks = raid_disks;
-
return conf;
out:
@@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
* in linear_congested(), therefore kfree_rcu() is used to free
* oldconf until no one uses it anymore.
*/
- mddev_suspend(mddev);
oldconf = rcu_dereference_protected(mddev->private,
lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
@@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
- mddev_resume(mddev);
kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
index 24e97db50ebb..5587eeedb882 100644
--- a/drivers/md/md-linear.h
+++ b/drivers/md/md-linear.h
@@ -12,6 +12,6 @@ struct linear_conf
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
- struct dev_info disks[];
+ struct dev_info disks[] __counted_by(raid_disks);
};
#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 839e79e567ee..411121a72df9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -91,6 +91,18 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -206,8 +218,7 @@ static int rdev_need_serial(struct md_rdev *rdev)
* 1. rdev is the first device which return true from rdev_enable_serial.
* 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/
-void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
int ret = 0;
@@ -215,15 +226,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
!test_bit(CollisionCheck, &rdev->flags))
return;
- if (!is_suspend)
- mddev_suspend(mddev);
-
if (!rdev)
ret = rdevs_init_serial(mddev);
else
ret = rdev_init_serial(rdev);
if (ret)
- goto abort;
+ return;
if (mddev->serial_info_pool == NULL) {
/*
@@ -238,10 +246,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
pr_err("can't alloc memory pool for serialization\n");
}
}
-
-abort:
- if (!is_suspend)
- mddev_resume(mddev);
}
/*
@@ -250,8 +254,7 @@ abort:
* 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
-void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
@@ -260,8 +263,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *temp;
int num = 0; /* used to track if other rdevs need the pool */
- if (!is_suspend)
- mddev_suspend(mddev);
rdev_for_each(temp, mddev) {
if (!rdev) {
if (!mddev->serialize_policy ||
@@ -283,8 +284,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
- if (!is_suspend)
- mddev_resume(mddev);
}
}
@@ -346,6 +345,10 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
+static bool is_md_suspended(struct mddev *mddev)
+{
+ return percpu_ref_is_dying(&mddev->active_io);
+}
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@@ -359,11 +362,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
if (bio_data_dir(bio) != WRITE)
return false;
- if (mddev->suspend_lo >= mddev->suspend_hi)
+ if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+ if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio_end_sector(bio) < mddev->suspend_lo)
+ if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
return false;
return true;
}
@@ -431,42 +434,73 @@ static void md_submit_bio(struct bio *bio)
md_handle_request(mddev, bio);
}
-/* mddev_suspend makes sure no new requests are submitted
- * to the device, and that any requests that have been submitted
- * are completely handled.
- * Once mddev_detach() is called and completes, the module will be
- * completely unused.
+/*
+ * Make sure no new requests are submitted to the device, and any requests that
+ * have been submitted are completely handled.
*/
-void mddev_suspend(struct mddev *mddev)
+int mddev_suspend(struct mddev *mddev, bool interruptible)
{
- struct md_thread *thread = rcu_dereference_protected(mddev->thread,
- lockdep_is_held(&mddev->reconfig_mutex));
+ int err = 0;
- WARN_ON_ONCE(thread && current == thread->tsk);
- if (mddev->suspended++)
- return;
- wake_up(&mddev->sb_wait);
- set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
- percpu_ref_kill(&mddev->active_io);
+ /*
+ * hold reconfig_mutex to wait for normal io will deadlock, because
+ * other context can't update super_block, and normal io can rely on
+ * updating super_block.
+ */
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
- if (mddev->pers->prepare_suspend)
- mddev->pers->prepare_suspend(mddev);
+ if (interruptible)
+ err = mutex_lock_interruptible(&mddev->suspend_mutex);
+ else
+ mutex_lock(&mddev->suspend_mutex);
+ if (err)
+ return err;
- wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
- clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
- wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
+ if (mddev->suspended) {
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
+ }
+
+ percpu_ref_kill(&mddev->active_io);
+ if (interruptible)
+ err = wait_event_interruptible(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ else
+ wait_event(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ if (err) {
+ percpu_ref_resurrect(&mddev->active_io);
+ mutex_unlock(&mddev->suspend_mutex);
+ return err;
+ }
+
+ /*
+ * For raid456, io might be waiting for reshape to make progress,
+ * allow new reshape to start while waiting for io to be done to
+ * prevent deadlock.
+ */
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
+
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- lockdep_assert_held(&mddev->reconfig_mutex);
- if (--mddev->suspended)
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
+
+ mutex_lock(&mddev->suspend_mutex);
+ WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
+ if (mddev->suspended) {
+ mutex_unlock(&mddev->suspend_mutex);
return;
+ }
/* entred the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
@@ -477,6 +511,8 @@ void mddev_resume(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
+ mutex_unlock(&mddev->suspend_mutex);
}
EXPORT_SYMBOL_GPL(mddev_resume);
@@ -616,34 +652,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
static void mddev_delayed_delete(struct work_struct *ws);
+static void __mddev_put(struct mddev *mddev)
+{
+ if (mddev->raid_disks || !list_empty(&mddev->disks) ||
+ mddev->ctime || mddev->hold_active)
+ return;
+
+ /* Array is not configured at all, and not held active, so destroy it */
+ set_bit(MD_DELETED, &mddev->flags);
+
+ /*
+ * Call queue_work inside the spinlock so that flush_workqueue() after
+ * mddev_find will succeed in waiting for the work to be done.
+ */
+ queue_work(md_misc_wq, &mddev->del_work);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
- if (!mddev->raid_disks && list_empty(&mddev->disks) &&
- mddev->ctime == 0 && !mddev->hold_active) {
- /* Array is not configured at all, and not held active,
- * so destroy it */
- set_bit(MD_DELETED, &mddev->flags);
- /*
- * Call queue_work inside the spinlock so that
- * flush_workqueue() after mddev_find will succeed in waiting
- * for the work to be done.
- */
- INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- queue_work(md_misc_wq, &mddev->del_work);
- }
+ __mddev_put(mddev);
spin_unlock(&all_mddevs_lock);
}
static void md_safemode_timeout(struct timer_list *t);
+static void md_start_sync(struct work_struct *ws);
+
+static void active_io_release(struct percpu_ref *ref)
+{
+ struct mddev *mddev = container_of(ref, struct mddev, active_io);
+
+ wake_up(&mddev->sb_wait);
+}
+
+static void no_op(struct percpu_ref *r) {}
-void mddev_init(struct mddev *mddev)
+int mddev_init(struct mddev *mddev)
{
+
+ if (percpu_ref_init(&mddev->active_io, active_io_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ percpu_ref_exit(&mddev->active_io);
+ return -ENOMEM;
+ }
+
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
+
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
+ mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -662,9 +727,21 @@ void mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
+
+ INIT_WORK(&mddev->sync_work, md_start_sync);
+ INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);
+void mddev_destroy(struct mddev *mddev)
+{
+ percpu_ref_exit(&mddev->active_io);
+ percpu_ref_exit(&mddev->writes_pending);
+}
+EXPORT_SYMBOL_GPL(mddev_destroy);
+
static struct mddev *mddev_find_locked(dev_t unit)
{
struct mddev *mddev;
@@ -708,13 +785,16 @@ static struct mddev *mddev_alloc(dev_t unit)
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
- mddev_init(new);
+
+ error = mddev_init(new);
+ if (error)
+ goto out_free_new;
spin_lock(&all_mddevs_lock);
if (unit) {
error = -EEXIST;
if (mddev_find_locked(unit))
- goto out_free_new;
+ goto out_destroy_new;
new->unit = unit;
if (MAJOR(unit) == MD_MAJOR)
new->md_minor = MINOR(unit);
@@ -725,7 +805,7 @@ static struct mddev *mddev_alloc(dev_t unit)
error = -ENODEV;
new->unit = mddev_alloc_unit();
if (!new->unit)
- goto out_free_new;
+ goto out_destroy_new;
new->md_minor = MINOR(new->unit);
new->hold_active = UNTIL_STOP;
}
@@ -733,8 +813,11 @@ static struct mddev *mddev_alloc(dev_t unit)
list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock);
return new;
-out_free_new:
+
+out_destroy_new:
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(new);
+out_free_new:
kfree(new);
return ERR_PTR(error);
}
@@ -745,6 +828,7 @@ static void mddev_free(struct mddev *mddev)
list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(mddev);
kfree(mddev);
}
@@ -2412,7 +2496,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2464,7 +2548,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2794,11 +2878,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
*/
super_types[mddev->major_version].
validate_super(mddev, rdev);
- if (add_journal)
- mddev_suspend(mddev);
err = mddev->pers->hot_add_disk(mddev, rdev);
- if (add_journal)
- mddev_resume(mddev);
if (err) {
md_kick_rdev_from_array(rdev);
return err;
@@ -2935,11 +3015,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_serial_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev);
need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
need_update_sb = true;
err = 0;
@@ -3551,6 +3631,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
+ bool suspend = false;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3558,17 +3639,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+ if (!mddev)
+ return -ENODEV;
- if (entry->store == state_store && cmd_match(page, "remove"))
- kn = sysfs_break_active_protection(kobj, attr);
+ if (entry->store == state_store) {
+ if (cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+ if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
+ cmd_match(page, "writemostly") ||
+ cmd_match(page, "-writemostly"))
+ suspend = true;
+ }
- rv = mddev ? mddev_lock(mddev) : -ENODEV;
+ rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
if (!rv) {
if (rdev->mddev == NULL)
rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
}
if (kn)
@@ -3867,12 +3956,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (slen == 0 || slen >= sizeof(clevel))
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
if (mddev->pers == NULL) {
- strncpy(mddev->clevel, buf, slen);
+ memcpy(mddev->clevel, buf, slen);
if (mddev->clevel[slen-1] == '\n')
slen--;
mddev->clevel[slen] = 0;
@@ -3905,7 +3994,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Now find the new personality */
- strncpy(clevel, buf, slen);
+ memcpy(clevel, buf, slen);
if (clevel[slen-1] == '\n')
slen--;
clevel[slen] = 0;
@@ -3960,7 +4049,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Looks like we have a winner */
- mddev_suspend(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
@@ -4046,14 +4134,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event();
rv = len;
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return rv;
}
@@ -4361,6 +4448,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
+ /* No lock dependent actions */
+ switch (st) {
+ case suspended: /* not supported yet */
+ case write_pending: /* cannot be set */
+ case active_idle: /* cannot be set */
+ case broken: /* cannot be set */
+ case bad_word:
+ return -EINVAL;
+ default:
+ break;
+ }
+
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
@@ -4385,23 +4484,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev);
if (err)
return err;
- err = -EINVAL;
- switch(st) {
- case bad_word:
- break;
- case clear:
- /* stopping an active array */
- err = do_md_stop(mddev, 0, NULL);
- break;
+
+ switch (st) {
case inactive:
- /* stopping an active array */
+ /* stop an active array, return 0 otherwise */
if (mddev->pers)
err = do_md_stop(mddev, 2, NULL);
- else
- err = 0; /* already inactive */
break;
- case suspended:
- break; /* not supported yet */
+ case clear:
+ err = do_md_stop(mddev, 0, NULL);
+ break;
case readonly:
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
@@ -4452,10 +4544,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = do_md_run(mddev);
}
break;
- case write_pending:
- case active_idle:
- case broken:
- /* these cannot be set */
+ default:
+ err = -EINVAL;
break;
}
@@ -4528,7 +4618,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->persistent) {
@@ -4549,14 +4639,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev)) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return PTR_ERR(rdev);
}
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
return err ? err : len;
@@ -4691,7 +4781,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
size_t namelen = len-9;
if (namelen >= sizeof(mddev->metadata_type))
namelen = sizeof(mddev->metadata_type)-1;
- strncpy(mddev->metadata_type, buf+9, namelen);
+ memcpy(mddev->metadata_type, buf+9, namelen);
mddev->metadata_type[namelen] = 0;
if (namelen && mddev->metadata_type[namelen-1] == '\n')
mddev->metadata_type[--namelen] = 0;
@@ -4865,6 +4955,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
@@ -5121,7 +5212,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_lo));
}
static ssize_t
@@ -5136,21 +5228,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_lo = new;
+
+ WRITE_ONCE(mddev->suspend_lo, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -5158,7 +5243,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_hi));
}
static ssize_t
@@ -5173,21 +5259,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_hi = new;
+ WRITE_ONCE(mddev->suspend_hi, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -5434,7 +5513,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
if (value == mddev->serialize_policy)
return len;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->pers == NULL || (mddev->pers->level != 1)) {
@@ -5443,15 +5522,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
goto unlock;
}
- mddev_suspend(mddev);
if (value)
- mddev_create_serial_pool(mddev, NULL, true);
+ mddev_create_serial_pool(mddev, NULL);
else
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mddev->serialize_policy = value;
- mddev_resume(mddev);
unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -5590,21 +5667,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
-static void no_op(struct percpu_ref *r) {}
-
-int mddev_init_writes_pending(struct mddev *mddev)
-{
- if (mddev->writes_pending.percpu_count_ptr)
- return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
- return -ENOMEM;
- /* We want to start with the refcount at zero */
- percpu_ref_put(&mddev->writes_pending);
- return 0;
-}
-EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-
struct mddev *md_alloc(dev_t dev, char *name)
{
/*
@@ -5776,12 +5838,6 @@ static void md_safemode_timeout(struct timer_list *t)
}
static int start_dirty_degraded;
-static void active_io_release(struct percpu_ref *ref)
-{
- struct mddev *mddev = container_of(ref, struct mddev, active_io);
-
- wake_up(&mddev->sb_wait);
-}
int md_run(struct mddev *mddev)
{
@@ -5862,15 +5918,10 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev);
}
- err = percpu_ref_init(&mddev->active_io, active_io_release,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
- if (err)
- return err;
-
if (!bioset_initialized(&mddev->bio_set)) {
err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (err)
- goto exit_active_io;
+ return err;
}
if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
@@ -6067,8 +6118,6 @@ exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
-exit_active_io:
- percpu_ref_exit(&mddev->active_io);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6242,7 +6291,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
/* disable policy to guarantee rdevs free resources for serialization */
mddev->serialize_policy = 0;
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
}
void md_stop_writes(struct mddev *mddev)
@@ -6284,7 +6333,6 @@ static void __md_stop(struct mddev *mddev)
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set);
@@ -6299,7 +6347,6 @@ void md_stop(struct mddev *mddev)
*/
__md_stop_writes(mddev);
__md_stop(mddev);
- percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -6536,13 +6583,13 @@ static void autorun_devices(int part)
if (IS_ERR(mddev))
break;
- if (mddev_lock(mddev))
+ if (mddev_suspend_and_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
pr_warn("md: %s already running, cannot run %pg\n",
mdname(mddev), rdev0->bdev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
} else {
pr_debug("md: created %s\n", mdname(mddev));
mddev->persistent = 1;
@@ -6552,7 +6599,7 @@ static void autorun_devices(int part)
export_rdev(rdev, mddev);
}
autorun_array(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
/* on success, candidates will be empty, on error
* it won't...
@@ -7102,7 +7149,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
@@ -7112,11 +7158,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
md_bitmap_destroy(mddev);
fd = -1;
}
- mddev_resume(mddev);
} else if (fd < 0) {
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
}
}
if (fd < 0) {
@@ -7405,7 +7448,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
@@ -7413,7 +7455,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = PTR_ERR(bitmap);
if (rv)
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
@@ -7438,9 +7479,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7511,6 +7550,20 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
+static bool md_ioctl_need_suspend(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case SET_BITMAP_FILE:
+ case SET_ARRAY_INFO:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
mdu_array_info_t info;
@@ -7639,7 +7692,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
- err = mddev_lock(mddev);
+
+ if (!md_is_rdwr(mddev))
+ flush_work(&mddev->sync_work);
+
+ err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
+ mddev_lock(mddev);
if (err) {
pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
err, cmd);
@@ -7767,7 +7825,10 @@ unlock:
if (mddev->hold_active == UNTIL_IOCTL &&
err != -EINVAL)
mddev->hold_active = 0;
- mddev_unlock(mddev);
+
+ md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
+ mddev_unlock(mddev);
+
out:
if(did_set_md_closing)
clear_bit(MD_CLOSING, &mddev->flags);
@@ -7879,7 +7940,6 @@ static void md_free_disk(struct gendisk *disk)
{
struct mddev *mddev = disk->private_data;
- percpu_ref_exit(&mddev->writes_pending);
mddev_free(mddev);
}
@@ -8195,105 +8255,46 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&all_mddevs_lock)
{
- struct list_head *tmp;
- loff_t l = *pos;
- struct mddev *mddev;
+ struct md_personality *pers;
- if (l == 0x10000) {
- ++*pos;
- return (void *)2;
- }
- if (l > 0x10000)
- return NULL;
- if (!l--)
- /* header */
- return (void*)1;
+ seq_puts(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_puts(seq, "\n");
+ seq->poll_event = atomic_read(&md_event_count);
spin_lock(&all_mddevs_lock);
- list_for_each(tmp,&all_mddevs)
- if (!l--) {
- mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (!mddev_get(mddev))
- continue;
- spin_unlock(&all_mddevs_lock);
- return mddev;
- }
- spin_unlock(&all_mddevs_lock);
- if (!l--)
- return (void*)2;/* tail */
- return NULL;
+
+ return seq_list_start(&all_mddevs, *pos);
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *tmp;
- struct mddev *next_mddev, *mddev = v;
- struct mddev *to_put = NULL;
-
- ++*pos;
- if (v == (void*)2)
- return NULL;
-
- spin_lock(&all_mddevs_lock);
- if (v == (void*)1) {
- tmp = all_mddevs.next;
- } else {
- to_put = mddev;
- tmp = mddev->all_mddevs.next;
- }
-
- for (;;) {
- if (tmp == &all_mddevs) {
- next_mddev = (void*)2;
- *pos = 0x10000;
- break;
- }
- next_mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (mddev_get(next_mddev))
- break;
- mddev = next_mddev;
- tmp = mddev->all_mddevs.next;
- }
- spin_unlock(&all_mddevs_lock);
-
- if (to_put)
- mddev_put(to_put);
- return next_mddev;
-
+ return seq_list_next(v, &all_mddevs, pos);
}
static void md_seq_stop(struct seq_file *seq, void *v)
+ __releases(&all_mddevs_lock)
{
- struct mddev *mddev = v;
-
- if (mddev && v != (void*)1 && v != (void*)2)
- mddev_put(mddev);
+ status_unused(seq);
+ spin_unlock(&all_mddevs_lock);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
- struct mddev *mddev = v;
+ struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
sector_t sectors;
struct md_rdev *rdev;
- if (v == (void*)1) {
- struct md_personality *pers;
- seq_printf(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
-
- spin_unlock(&pers_lock);
- seq_printf(seq, "\n");
- seq->poll_event = atomic_read(&md_event_count);
+ if (!mddev_get(mddev))
return 0;
- }
- if (v == (void*)2) {
- status_unused(seq);
- return 0;
- }
+ spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
@@ -8364,6 +8365,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ spin_lock(&all_mddevs_lock);
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
return 0;
}
@@ -8563,6 +8567,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -9148,12 +9153,90 @@ void md_do_sync(struct md_thread *thread)
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- wake_up(&mddev->sb_wait);
md_wakeup_thread(mddev->thread);
return;
}
EXPORT_SYMBOL_GPL(md_do_sync);
+static bool rdev_removeable(struct md_rdev *rdev)
+{
+ /* rdev is not used. */
+ if (rdev->raid_disk < 0)
+ return false;
+
+ /* There are still inflight io, don't remove this rdev. */
+ if (atomic_read(&rdev->nr_pending))
+ return false;
+
+ /*
+ * An error occurred but has not yet been acknowledged by the metadata
+ * handler, don't remove this rdev.
+ */
+ if (test_bit(Blocked, &rdev->flags))
+ return false;
+
+ /* Fautly rdev is not used, it's safe to remove it. */
+ if (test_bit(Faulty, &rdev->flags))
+ return true;
+
+ /* Journal disk can only be removed if it's faulty. */
+ if (test_bit(Journal, &rdev->flags))
+ return false;
+
+ /*
+ * 'In_sync' is cleared while 'raid_disk' is valid, which means
+ * replacement has just become active from pers->spare_active(), and
+ * then pers->hot_remove_disk() will replace this rdev with replacement.
+ */
+ if (!test_bit(In_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool rdev_is_spare(struct md_rdev *rdev)
+{
+ return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags);
+}
+
+static bool rdev_addable(struct md_rdev *rdev)
+{
+ /* rdev is already used, don't add it again. */
+ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
+ test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* Allow to add journal disk. */
+ if (test_bit(Journal, &rdev->flags))
+ return true;
+
+ /* Allow to add if array is read-write. */
+ if (md_is_rdwr(rdev->mddev))
+ return true;
+
+ /*
+ * For read-only array, only allow to readd a rdev. And if bitmap is
+ * used, don't allow to readd a rdev that is too old.
+ */
+ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool md_spares_need_change(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (rdev_removeable(rdev) || rdev_addable(rdev))
+ return true;
+ return false;
+}
+
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this)
{
@@ -9186,12 +9269,8 @@ static int remove_and_add_spares(struct mddev *mddev,
synchronize_rcu();
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) &&
- rdev->raid_disk >= 0 &&
- !test_bit(Blocked, &rdev->flags) &&
- ((test_bit(RemoveSynchronized, &rdev->flags) ||
- (!test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags))) &&
- atomic_read(&rdev->nr_pending)==0)) {
+ (test_bit(RemoveSynchronized, &rdev->flags) ||
+ rdev_removeable(rdev))) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
@@ -9213,25 +9292,12 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
- if (test_bit(Candidate, &rdev->flags))
- continue;
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
+ if (rdev_is_spare(rdev))
spares++;
- if (rdev->raid_disk >= 0)
- continue;
- if (test_bit(Faulty, &rdev->flags))
+ if (!rdev_addable(rdev))
continue;
- if (!test_bit(Journal, &rdev->flags)) {
- if (!md_is_rdwr(mddev) &&
- !(rdev->saved_raid_disk >= 0 &&
- !test_bit(Bitmap_sync, &rdev->flags)))
- continue;
-
+ if (!test_bit(Journal, &rdev->flags))
rdev->recovery_offset = 0;
- }
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
/* failure here is OK */
sysfs_link_rdev(mddev, rdev);
@@ -9247,9 +9313,86 @@ no_add:
return spares;
}
+static bool md_choose_sync_action(struct mddev *mddev, int *spares)
+{
+ /* Check if reshape is in progress first. */
+ if (mddev->reshape_position != MaxSector) {
+ if (mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev) != 0)
+ return false;
+
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /*
+ * Remove any failed drives, then add spares if possible. Spares are
+ * also removed and re-added, to allow the personality to fail the
+ * re-add.
+ */
+ *spares = remove_and_add_spares(mddev, NULL);
+ if (*spares) {
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+
+ /* Start new recovery. */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Check if recovery is in progress. */
+ if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Delay to choose resync/check/repair in md_do_sync(). */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ return true;
+
+ /* Nothing to be done */
+ return false;
+}
+
static void md_start_sync(struct work_struct *ws)
{
- struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ struct mddev *mddev = container_of(ws, struct mddev, sync_work);
+ int spares = 0;
+ bool suspend = false;
+
+ if (md_spares_need_change(mddev))
+ suspend = true;
+
+ suspend ? mddev_suspend_and_lock_nointr(mddev) :
+ mddev_lock_nointr(mddev);
+
+ if (!md_is_rdwr(mddev)) {
+ /*
+ * On a read-only array we can:
+ * - remove failed devices
+ * - add already-in_sync devices if the array itself is in-sync.
+ * As we only add devices that are already in-sync, we can
+ * activate the spares immediately.
+ */
+ remove_and_add_spares(mddev, NULL);
+ goto not_running;
+ }
+
+ if (!md_choose_sync_action(mddev, &spares))
+ goto not_running;
+
+ if (!mddev->pers->sync_request)
+ goto not_running;
+
+ /*
+ * We are adding a device or devices to an array which has the bitmap
+ * stored on all devices. So make sure all bitmap pages get written.
+ */
+ if (spares)
+ md_bitmap_write_all(mddev->bitmap);
rcu_assign_pointer(mddev->sync_thread,
md_register_thread(md_do_sync, mddev, "resync"));
@@ -9257,20 +9400,27 @@ static void md_start_sync(struct work_struct *ws)
pr_warn("%s: could not start resync thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- } else
- md_wakeup_thread(mddev->sync_thread);
+ goto not_running;
+ }
+
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
+ md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
+ return;
+
+not_running:
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
+
+ wake_up(&resync_wait);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+ mddev->sysfs_action)
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
}
/*
@@ -9297,19 +9447,7 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
- }
-
- if (is_md_suspended(mddev))
+ if (READ_ONCE(mddev->suspended))
return;
if (mddev->bitmap)
@@ -9338,7 +9476,6 @@ void md_check_recovery(struct mddev *mddev)
return;
if (mddev_trylock(mddev)) {
- int spares = 0;
bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
@@ -9346,30 +9483,43 @@ void md_check_recovery(struct mddev *mddev)
if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ /* sync_work already queued. */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+
if (!mddev->external && mddev->in_sync)
- /* 'Blocked' flag not needed as failed devices
+ /*
+ * 'Blocked' flag not needed as failed devices
* will be recorded if array switched to read/write.
* Leaving it set will prevent the device
* from being removed.
*/
rdev_for_each(rdev, mddev)
clear_bit(Blocked, &rdev->flags);
- /* On a read-only array we can:
- * - remove failed devices
- * - add already-in_sync devices if the array itself
- * is in-sync.
- * As we only add devices that are already in-sync,
- * we can activate the spares immediately.
- */
- remove_and_add_spares(mddev, NULL);
- /* There is no thread, but we need to call
+
+ /*
+ * There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+
+ /*
+ * Let md_start_sync() to remove and add rdevs to the
+ * array.
+ */
+ if (md_spares_need_change(mddev)) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ queue_work(md_misc_wq, &mddev->sync_work);
+ }
+
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+
goto unlock;
}
@@ -9425,56 +9575,14 @@ void md_check_recovery(struct mddev *mddev)
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
- if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- goto not_running;
- /* no recovery is running.
- * remove any failed drives, then
- * add spares if possible.
- * Spares are also removed and re-added, to allow
- * the personality to fail the re-add.
- */
-
- if (mddev->reshape_position != MaxSector) {
- if (mddev->pers->check_reshape == NULL ||
- mddev->pers->check_reshape(mddev) != 0)
- /* Cannot proceed */
- goto not_running;
- set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if ((spares = remove_and_add_spares(mddev, NULL))) {
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- /* nothing to be done ... */
- goto not_running;
-
- if (mddev->pers->sync_request) {
- if (spares) {
- /* We are adding a device or devices to an array
- * which has the bitmap stored on all devices.
- * So make sure all bitmap pages get written
- */
- md_bitmap_write_all(mddev->bitmap);
- }
- INIT_WORK(&mddev->del_work, md_start_sync);
- queue_work(md_misc_wq, &mddev->del_work);
- goto unlock;
- }
- not_running:
- if (!mddev->sync_thread) {
+ if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ queue_work(md_misc_wq, &mddev->sync_work);
+ } else {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
}
+
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 274e7d61d19f..ade83af123a2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -246,10 +246,6 @@ struct md_cluster_info;
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
- * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
- * without taking reconfig_mutex.
- * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
- * explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
@@ -266,8 +262,6 @@ enum mddev_flags {
MD_FAILFAST_SUPPORTED,
MD_HAS_PPL,
MD_HAS_MULTIPLE_PPLS,
- MD_ALLOW_SB_UPDATE,
- MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
MD_DELETED,
@@ -314,6 +308,7 @@ struct mddev {
unsigned long sb_flags;
int suspended;
+ struct mutex suspend_mutex;
struct percpu_ref active_io;
int ro;
int sysfs_active; /* set when sysfs deletes
@@ -451,7 +446,10 @@ struct mddev {
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
struct kernfs_node *sysfs_level; /*handle for 'level' */
- struct work_struct del_work; /* used for delayed sysfs removal */
+ /* used for delayed sysfs removal */
+ struct work_struct del_work;
+ /* used for register new sync thread */
+ struct work_struct sync_work;
/* "lock" protects:
* flush_bio transition from NULL to !NULL
@@ -565,23 +563,6 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static inline bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
-static inline bool is_md_suspended(struct mddev *mddev)
-{
- return percpu_ref_is_dying(&mddev->active_io);
-}
-
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -641,7 +622,6 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
- void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -766,7 +746,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
@@ -793,7 +772,8 @@ extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
-extern void mddev_init(struct mddev *mddev);
+extern int mddev_init(struct mddev *mddev);
+extern void mddev_destroy(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
@@ -804,15 +784,14 @@ extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
-extern void mddev_suspend(struct mddev *mddev);
+extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
-extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
-extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
+extern void mddev_destroy_serial_pool(struct mddev *mddev,
+ struct md_rdev *rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
@@ -850,6 +829,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+static inline int mddev_suspend_and_lock(struct mddev *mddev)
+{
+ int ret;
+
+ ret = mddev_suspend(mddev, true);
+ if (ret)
+ return ret;
+
+ ret = mddev_lock(mddev);
+ if (ret)
+ mddev_resume(mddev);
+
+ return ret;
+}
+
+static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
+{
+ mddev_suspend(mddev, false);
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline void mddev_unlock_and_resume(struct mddev *mddev)
+{
+ mddev_unlock(mddev);
+ mddev_resume(mddev);
+}
+
struct mdu_array_info_s;
struct mdu_disk_info_s;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2aabac773fe7..35d12948e0a9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
bool write_behind = false;
+ bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* write-mostly, which means we could allocate write behind
* bio later.
*/
- if (rdev && test_bit(WriteMostly, &rdev->flags))
+ if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -3122,8 +3123,7 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev));
return -EIO;
}
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
+
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 023413120851..a5927e98dc67 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev)
sector_t min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->private == NULL) {
conf = setup_conf(mddev);
if (IS_ERR(conf))
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 518b7cfa78b9..6157f5beb9fe 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
int total_cached;
+ struct r5l_log *log = READ_ONCE(conf->log);
- if (!r5c_is_writeback(conf->log))
+ if (!r5c_is_writeback(log))
return;
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
@@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
- if (!r5c_is_writeback(conf->log))
+ struct r5l_log *log = READ_ONCE(conf->log);
+
+ if (!r5c_is_writeback(log))
return;
/*
@@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!r5c_is_writeback(log))
return 0;
@@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log)
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
BUG_ON(!r5c_is_writeback(log));
@@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh)
*/
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
- struct r5l_log *log = sh->raid_conf->log;
+ struct r5l_log *log = READ_ONCE(sh->raid_conf->log);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
@@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
- int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
@@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
- conf->log == NULL ||
- (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
- (locked = mddev_trylock(mddev))));
- if (locked) {
- mddev_suspend(mddev);
+ !READ_ONCE(conf->log) ||
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
+ log = READ_ONCE(conf->log);
+ if (log) {
+ mddev_suspend(mddev, false);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
- mddev_unlock(mddev);
}
}
@@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
struct stripe_head *sh;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t new_cp;
unsigned long flags;
@@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
return log->next_checkpoint;
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
- if (list_empty(&conf->log->stripe_in_journal_list)) {
+ if (list_empty(&log->stripe_in_journal_list)) {
/* all stripes flushed */
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return log->next_checkpoint;
}
- sh = list_first_entry(&conf->log->stripe_in_journal_list,
+ sh = list_first_entry(&log->stripe_in_journal_list,
struct stripe_head, r5c);
new_cp = sh->log_start;
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
struct stripe_head *sh, *next;
lockdep_assert_held(&conf->device_lock);
- if (!conf->log)
+ if (!READ_ONCE(conf->log))
return;
count = 0;
@@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
static void r5c_do_reclaim(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
struct stripe_head *sh;
int count = 0;
unsigned long flags;
@@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
@@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
/* don't allow write if journal disk is missing */
if (!log)
@@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode)
mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
- mddev_resume(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
if (strlen(r5c_journal_mode_str[mode]) == len &&
!strncmp(page, r5c_journal_mode_str[mode], len))
break;
- ret = mddev_lock(mddev);
+ ret = mddev_suspend_and_lock(mddev);
if (ret)
return ret;
ret = r5c_journal_mode_set(mddev, mode);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return ret ?: length;
}
@@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf,
struct stripe_head_state *s,
int disks)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
struct r5dev *dev;
int to_cache = 0;
@@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
@@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t tree_index;
void *slot;
@@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
- conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
@@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- conf->log = log;
+ WRITE_ONCE(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf)
* 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
* ensure disable_writeback_work wakes up and exits.
*/
- conf->log = NULL;
+ WRITE_ONCE(conf->log, NULL);
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 284cd71bcc68..c84ccc97329b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
+static void raid5_quiesce(struct mddev *mddev, int quiesce);
+
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
@@ -2499,15 +2501,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
unsigned long cpu;
int err = 0;
- /*
- * Never shrink. And mddev_suspend() could deadlock if this is called
- * from raid5d. In that case, scribble_disks and scribble_sectors
- * should equal to new_disks and new_sectors
- */
+ /* Never shrink. */
if (conf->scribble_disks >= new_disks &&
conf->scribble_sectors >= new_sectors)
return 0;
- mddev_suspend(conf->mddev);
+
+ raid5_quiesce(conf->mddev, true);
cpus_read_lock();
for_each_present_cpu(cpu) {
@@ -2521,7 +2520,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
}
cpus_read_unlock();
- mddev_resume(conf->mddev);
+ raid5_quiesce(conf->mddev, false);
+
if (!err) {
conf->scribble_disks = new_disks;
conf->scribble_sectors = new_sectors;
@@ -5960,19 +5960,6 @@ out:
return ret;
}
-static bool reshape_inprogress(struct mddev *mddev)
-{
- return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_INTR, &mddev->recovery);
-}
-
-static bool reshape_disabled(struct mddev *mddev)
-{
- return is_md_suspended(mddev) || !md_is_rdwr(mddev);
-}
-
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
@@ -6004,8 +5991,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
+ return STRIPE_SCHEDULE_AND_RETRY;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6084,15 +6070,6 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
-out:
- if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) &&
- reshape_disabled(mddev)) {
- bi->bi_status = BLK_STS_IOERR;
- ret = STRIPE_FAIL;
- pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n",
- mdname(mddev));
- }
-
return ret;
}
@@ -7032,7 +7009,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
new != roundup_pow_of_two(new))
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
@@ -7056,7 +7033,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
goto out_unlock;
}
- mddev_suspend(mddev);
mutex_lock(&conf->cache_size_mutex);
size = conf->max_nr_stripes;
@@ -7071,10 +7047,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
err = -ENOMEM;
}
mutex_unlock(&conf->cache_size_mutex);
- mddev_resume(mddev);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7160,7 +7135,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
new = !!new;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
@@ -7169,15 +7144,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
- mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7232,15 +7205,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (new > 8192)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf)
err = -ENODEV;
else if (new != conf->worker_cnt_per_group) {
- mddev_suspend(mddev);
-
old_groups = conf->worker_groups;
if (old_groups)
flush_workqueue(raid5_wq);
@@ -7257,9 +7228,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups[0].workers);
kfree(old_groups);
}
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7785,9 +7755,6 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -8568,8 +8535,8 @@ static int raid5_start_reshape(struct mddev *mddev)
* the reshape wasn't running - like Discard or Read - have
* completed.
*/
- mddev_suspend(mddev);
- mddev_resume(mddev);
+ raid5_quiesce(mddev, true);
+ raid5_quiesce(mddev, false);
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
@@ -8984,12 +8951,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
struct r5conf *conf;
int err;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return -ENODEV;
}
@@ -8999,19 +8966,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err) {
- mddev_suspend(mddev);
+ if (err)
log_exit(conf);
- mddev_resume(mddev);
- }
}
} else
err = -EINVAL;
} else if (strncmp(buf, "resync", 6) == 0) {
if (raid5_has_ppl(conf)) {
- mddev_suspend(mddev);
log_exit(conf);
- mddev_resume(mddev);
err = resize_stripes(conf, conf->pool_size);
} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
r5l_log_disk_error(conf)) {
@@ -9024,11 +8986,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
break;
}
- if (!journal_dev_exists) {
- mddev_suspend(mddev);
+ if (!journal_dev_exists)
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- mddev_resume(mddev);
- } else /* need remove journal device first */
+ else /* need remove journal device first */
err = -EBUSY;
} else
err = -EINVAL;
@@ -9039,7 +8999,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
if (!err)
md_update_sb(mddev, 1);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err;
}
@@ -9051,22 +9011,6 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
-static void raid5_prepare_suspend(struct mddev *mddev)
-{
- struct r5conf *conf = mddev->private;
-
- wait_event(mddev->sb_wait, !reshape_inprogress(mddev) ||
- percpu_ref_is_zero(&mddev->active_io));
- if (percpu_ref_is_zero(&mddev->active_io))
- return;
-
- /*
- * Reshape is not in progress, and array is suspended, io that is
- * waiting for reshpape can never be done.
- */
- wake_up(&conf->wait_for_overlap);
-}
-
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -9087,7 +9031,6 @@ static struct md_personality raid6_personality =
.check_reshape = raid6_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9112,7 +9055,6 @@ static struct md_personality raid5_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
@@ -9138,7 +9080,6 @@ static struct md_personality raid4_personality =
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
- .prepare_suspend = raid5_prepare_suspend,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
index 4514f44362dd..06c8df00d1e2 100644
--- a/drivers/nvme/common/Kconfig
+++ b/drivers/nvme/common/Kconfig
@@ -2,3 +2,16 @@
config NVME_COMMON
tristate
+
+config NVME_KEYRING
+ bool
+ select KEYS
+
+config NVME_AUTH
+ bool
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_DH
+ select CRYPTO_DH_RFC7919_GROUPS
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
index 720c625b8a52..0cbd0b0b8d49 100644
--- a/drivers/nvme/common/Makefile
+++ b/drivers/nvme/common/Makefile
@@ -4,4 +4,5 @@ ccflags-y += -I$(src)
obj-$(CONFIG_NVME_COMMON) += nvme-common.o
-nvme-common-y += auth.o
+nvme-common-$(CONFIG_NVME_AUTH) += auth.o
+nvme-common-$(CONFIG_NVME_KEYRING) += keyring.o
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index d90e4f0c08b7..a8e87dfbeab2 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -150,6 +150,14 @@ size_t nvme_auth_hmac_hash_len(u8 hmac_id)
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
+u32 nvme_auth_key_struct_size(u32 key_len)
+{
+ struct nvme_dhchap_key key;
+
+ return struct_size(&key, key, key_len);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size);
+
struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
u8 key_hash)
{
@@ -163,14 +171,9 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
p = strrchr(secret, ':');
if (p)
allocated_len = p - secret;
- key = kzalloc(sizeof(*key), GFP_KERNEL);
+ key = nvme_auth_alloc_key(allocated_len, 0);
if (!key)
return ERR_PTR(-ENOMEM);
- key->key = kzalloc(allocated_len, GFP_KERNEL);
- if (!key->key) {
- ret = -ENOMEM;
- goto out_free_key;
- }
key_len = base64_decode(secret, allocated_len, key->key);
if (key_len < 0) {
@@ -187,14 +190,6 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
goto out_free_secret;
}
- if (key_hash > 0 &&
- (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
- pr_err("Mismatched key len %d for %s\n", key_len,
- nvme_auth_hmac_name(key_hash));
- ret = -EINVAL;
- goto out_free_secret;
- }
-
/* The last four bytes is the CRC in little-endian format */
key_len -= 4;
/*
@@ -213,37 +208,51 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
key->hash = key_hash;
return key;
out_free_secret:
- kfree_sensitive(key->key);
-out_free_key:
- kfree(key);
+ nvme_auth_free_key(key);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
+struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash)
+{
+ u32 num_bytes = nvme_auth_key_struct_size(len);
+ struct nvme_dhchap_key *key = kzalloc(num_bytes, GFP_KERNEL);
+
+ if (key) {
+ key->len = len;
+ key->hash = hash;
+ }
+ return key;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_alloc_key);
+
void nvme_auth_free_key(struct nvme_dhchap_key *key)
{
if (!key)
return;
- kfree_sensitive(key->key);
- kfree(key);
+ kfree_sensitive(key);
}
EXPORT_SYMBOL_GPL(nvme_auth_free_key);
-u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+struct nvme_dhchap_key *nvme_auth_transform_key(
+ struct nvme_dhchap_key *key, char *nqn)
{
const char *hmac_name;
struct crypto_shash *key_tfm;
struct shash_desc *shash;
- u8 *transformed_key;
- int ret;
+ struct nvme_dhchap_key *transformed_key;
+ int ret, key_len;
- if (!key || !key->key) {
+ if (!key) {
pr_warn("No key specified\n");
return ERR_PTR(-ENOKEY);
}
if (key->hash == 0) {
- transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
- return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+ key_len = nvme_auth_key_struct_size(key->len);
+ transformed_key = kmemdup(key, key_len, GFP_KERNEL);
+ if (!transformed_key)
+ return ERR_PTR(-ENOMEM);
+ return transformed_key;
}
hmac_name = nvme_auth_hmac_name(key->hash);
if (!hmac_name) {
@@ -253,7 +262,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(key_tfm))
- return (u8 *)key_tfm;
+ return ERR_CAST(key_tfm);
shash = kmalloc(sizeof(struct shash_desc) +
crypto_shash_descsize(key_tfm),
@@ -263,7 +272,8 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
goto out_free_key;
}
- transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+ key_len = crypto_shash_digestsize(key_tfm);
+ transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
goto out_free_shash;
@@ -282,7 +292,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
if (ret < 0)
goto out_free_transformed_key;
- ret = crypto_shash_final(shash, transformed_key);
+ ret = crypto_shash_final(shash, transformed_key->key);
if (ret < 0)
goto out_free_transformed_key;
@@ -292,7 +302,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
return transformed_key;
out_free_transformed_key:
- kfree_sensitive(transformed_key);
+ nvme_auth_free_key(transformed_key);
out_free_shash:
kfree(shash);
out_free_key:
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
new file mode 100644
index 000000000000..f8d9a208397b
--- /dev/null
+++ b/drivers/nvme/common/keyring.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Hannes Reinecke, SUSE Labs
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/nvme.h>
+#include <linux/nvme-tcp.h>
+#include <linux/nvme-keyring.h>
+
+static struct key *nvme_keyring;
+
+key_serial_t nvme_keyring_id(void)
+{
+ return nvme_keyring->serial;
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_id);
+
+static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+ seq_printf(m, ": %u", key->datalen);
+}
+
+static bool nvme_tls_psk_match(const struct key *key,
+ const struct key_match_data *match_data)
+{
+ const char *match_id;
+ size_t match_len;
+
+ if (!key->description) {
+ pr_debug("%s: no key description\n", __func__);
+ return false;
+ }
+ match_len = strlen(key->description);
+ pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len);
+
+ if (!match_data->raw_data) {
+ pr_debug("%s: no match data\n", __func__);
+ return false;
+ }
+ match_id = match_data->raw_data;
+ pr_debug("%s: match '%s' '%s' len %zd\n",
+ __func__, match_id, key->description, match_len);
+ return !memcmp(key->description, match_id, match_len);
+}
+
+static int nvme_tls_psk_match_preparse(struct key_match_data *match_data)
+{
+ match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE;
+ match_data->cmp = nvme_tls_psk_match;
+ return 0;
+}
+
+static struct key_type nvme_tls_psk_key_type = {
+ .name = "psk",
+ .flags = KEY_TYPE_NET_DOMAIN,
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .match_preparse = nvme_tls_psk_match_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = nvme_tls_psk_describe,
+ .read = user_read,
+};
+
+static struct key *nvme_tls_psk_lookup(struct key *keyring,
+ const char *hostnqn, const char *subnqn,
+ int hmac, bool generated)
+{
+ char *identity;
+ size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11;
+ key_ref_t keyref;
+ key_serial_t keyring_id;
+
+ identity = kzalloc(identity_len, GFP_KERNEL);
+ if (!identity)
+ return ERR_PTR(-ENOMEM);
+
+ snprintf(identity, identity_len, "NVMe0%c%02d %s %s",
+ generated ? 'G' : 'R', hmac, hostnqn, subnqn);
+
+ if (!keyring)
+ keyring = nvme_keyring;
+ keyring_id = key_serial(keyring);
+ pr_debug("keyring %x lookup tls psk '%s'\n",
+ keyring_id, identity);
+ keyref = keyring_search(make_key_ref(keyring, true),
+ &nvme_tls_psk_key_type,
+ identity, false);
+ if (IS_ERR(keyref)) {
+ pr_debug("lookup tls psk '%s' failed, error %ld\n",
+ identity, PTR_ERR(keyref));
+ kfree(identity);
+ return ERR_PTR(-ENOKEY);
+ }
+ kfree(identity);
+
+ return key_ref_to_ptr(keyref);
+}
+
+/*
+ * NVMe PSK priority list
+ *
+ * 'Retained' PSKs (ie 'generated == false')
+ * should be preferred to 'generated' PSKs,
+ * and SHA-384 should be preferred to SHA-256.
+ */
+struct nvme_tls_psk_priority_list {
+ bool generated;
+ enum nvme_tcp_tls_cipher cipher;
+} nvme_tls_psk_prio[] = {
+ { .generated = false,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+ { .generated = false,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+ { .generated = true,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+ { .generated = true,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+};
+
+/*
+ * nvme_tls_psk_default - Return the preferred PSK to use for TLS ClientHello
+ */
+key_serial_t nvme_tls_psk_default(struct key *keyring,
+ const char *hostnqn, const char *subnqn)
+{
+ struct key *tls_key;
+ key_serial_t tls_key_id;
+ int prio;
+
+ for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) {
+ bool generated = nvme_tls_psk_prio[prio].generated;
+ enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher;
+
+ tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn,
+ cipher, generated);
+ if (!IS_ERR(tls_key)) {
+ tls_key_id = tls_key->serial;
+ key_put(tls_key);
+ return tls_key_id;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_psk_default);
+
+int nvme_keyring_init(void)
+{
+ int err;
+
+ nvme_keyring = keyring_alloc(".nvme",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ (KEY_USR_ALL & ~KEY_USR_SETATTR),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(nvme_keyring))
+ return PTR_ERR(nvme_keyring);
+
+ err = register_key_type(&nvme_tls_psk_key_type);
+ if (err) {
+ key_put(nvme_keyring);
+ return err;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_init);
+
+void nvme_keyring_exit(void)
+{
+ unregister_key_type(&nvme_tls_psk_key_type);
+ key_revoke(nvme_keyring);
+ key_put(nvme_keyring);
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_exit);
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2f6a7f8c94e8..48f7d72de5e9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,16 +92,26 @@ config NVME_TCP
If unsure, say N.
-config NVME_AUTH
+config NVME_TCP_TLS
+ bool "NVMe over Fabrics TCP TLS encryption support"
+ depends on NVME_TCP
+ select NVME_COMMON
+ select NVME_KEYRING
+ select NET_HANDSHAKE
+ select KEYS
+ help
+ Enables TLS encryption for NVMe TCP using the netlink handshake API.
+
+ The TLS handshake daemon is availble at
+ https://github.com/oracle/ktls-utils.
+
+ If unsure, say N.
+
+config NVME_HOST_AUTH
bool "NVM Express over Fabrics In-Band Authentication"
depends on NVME_CORE
select NVME_COMMON
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA256
- select CRYPTO_SHA512
- select CRYPTO_DH
- select CRYPTO_DH_RFC7919_GROUPS
+ select NVME_AUTH
help
This provides support for NVMe over Fabrics In-Band Authentication.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index c7c3cf202d12..6414ec968f99 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -17,7 +17,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
-nvme-core-$(CONFIG_NVME_AUTH) += auth.o
+nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 064592a5d546..eaefebb2a799 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -23,6 +23,7 @@ struct nvme_dhchap_queue_context {
struct nvme_ctrl *ctrl;
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
+ struct nvme_dhchap_key *transformed_key;
void *buf;
int qid;
int error;
@@ -36,7 +37,6 @@ struct nvme_dhchap_queue_context {
u8 c1[64];
u8 c2[64];
u8 response[64];
- u8 *host_response;
u8 *ctrl_key;
u8 *host_key;
u8 *sess_key;
@@ -428,12 +428,12 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
__func__, chap->qid, chap->s1, chap->transaction);
- if (!chap->host_response) {
- chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+ if (!chap->transformed_key) {
+ chap->transformed_key = nvme_auth_transform_key(ctrl->host_key,
ctrl->opts->host->nqn);
- if (IS_ERR(chap->host_response)) {
- ret = PTR_ERR(chap->host_response);
- chap->host_response = NULL;
+ if (IS_ERR(chap->transformed_key)) {
+ ret = PTR_ERR(chap->transformed_key);
+ chap->transformed_key = NULL;
return ret;
}
} else {
@@ -442,7 +442,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
}
ret = crypto_shash_setkey(chap->shash_tfm,
- chap->host_response, ctrl->host_key->len);
+ chap->transformed_key->key, chap->transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
@@ -508,19 +508,19 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
- u8 *ctrl_response;
+ struct nvme_dhchap_key *transformed_key;
u8 buf[4], *challenge = chap->c2;
int ret;
- ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+ transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
ctrl->opts->subsysnqn);
- if (IS_ERR(ctrl_response)) {
- ret = PTR_ERR(ctrl_response);
+ if (IS_ERR(transformed_key)) {
+ ret = PTR_ERR(transformed_key);
return ret;
}
ret = crypto_shash_setkey(chap->shash_tfm,
- ctrl_response, ctrl->ctrl_key->len);
+ transformed_key->key, transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
@@ -586,7 +586,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
out:
if (challenge != chap->c2)
kfree(challenge);
- kfree(ctrl_response);
+ nvme_auth_free_key(transformed_key);
return ret;
}
@@ -648,8 +648,8 @@ gen_sesskey:
static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
{
- kfree_sensitive(chap->host_response);
- chap->host_response = NULL;
+ nvme_auth_free_key(chap->transformed_key);
+ chap->transformed_key = NULL;
kfree_sensitive(chap->host_key);
chap->host_key = NULL;
chap->host_key_len = 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21783aa2ee8e..62612f87aafa 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -25,6 +25,7 @@
#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>
+#include <linux/nvme-keyring.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -420,7 +421,7 @@ void nvme_complete_rq(struct request *req)
nvme_failover_req(req);
return;
case AUTHENTICATE:
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
nvme_retry_req(req);
#else
@@ -4399,7 +4400,7 @@ static void nvme_free_ctrl(struct device *dev)
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
-
+ key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
nvme_auth_stop(ctrl);
@@ -4723,12 +4724,16 @@ static int __init nvme_core_init(void)
result = PTR_ERR(nvme_ns_chr_class);
goto unregister_generic_ns;
}
-
- result = nvme_init_auth();
+ result = nvme_keyring_init();
if (result)
goto destroy_ns_chr;
+ result = nvme_init_auth();
+ if (result)
+ goto keyring_exit;
return 0;
+keyring_exit:
+ nvme_keyring_exit();
destroy_ns_chr:
class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
@@ -4752,6 +4757,7 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
+ nvme_keyring_exit();
class_destroy(nvme_ns_chr_class);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 8175d49f2909..4673ead69c5f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include "nvme.h"
#include "fabrics.h"
+#include <linux/nvme-keyring.h>
static LIST_HEAD(nvmf_transports);
static DECLARE_RWSEM(nvmf_transports_rwsem);
@@ -622,6 +623,23 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
return NULL;
}
+static struct key *nvmf_parse_key(int key_id)
+{
+ struct key *key;
+
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ key = key_lookup(key_id);
+ if (!IS_ERR(key))
+ pr_err("key id %08x not found\n", key_id);
+ else
+ pr_debug("Using key id %08x\n", key_id);
+ return key;
+}
+
static const match_table_t opt_tokens = {
{ NVMF_OPT_TRANSPORT, "transport=%s" },
{ NVMF_OPT_TRADDR, "traddr=%s" },
@@ -643,10 +661,17 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_TOS, "tos=%d" },
+#ifdef CONFIG_NVME_TCP_TLS
+ { NVMF_OPT_KEYRING, "keyring=%d" },
+ { NVMF_OPT_TLS_KEY, "tls_key=%d" },
+#endif
{ NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" },
{ NVMF_OPT_DISCOVERY, "discovery" },
{ NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" },
{ NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" },
+#ifdef CONFIG_NVME_TCP_TLS
+ { NVMF_OPT_TLS, "tls" },
+#endif
{ NVMF_OPT_ERR, NULL }
};
@@ -657,9 +682,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
- int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
+ int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO, key_id;
uuid_t hostid;
char hostnqn[NVMF_NQN_SIZE];
+ struct key *key;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -671,6 +697,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->hdr_digest = false;
opts->data_digest = false;
opts->tos = -1; /* < 0 == use transport default */
+ opts->tls = false;
+ opts->tls_key = NULL;
+ opts->keyring = NULL;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -924,6 +953,32 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->tos = token;
break;
+ case NVMF_OPT_KEYRING:
+ if (match_int(args, &key_id) || key_id <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ key = nvmf_parse_key(key_id);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+ key_put(opts->keyring);
+ opts->keyring = key;
+ break;
+ case NVMF_OPT_TLS_KEY:
+ if (match_int(args, &key_id) || key_id <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ key = nvmf_parse_key(key_id);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+ key_put(opts->tls_key);
+ opts->tls_key = key;
+ break;
case NVMF_OPT_DISCOVERY:
opts->discovery_nqn = true;
break;
@@ -955,6 +1010,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = p;
break;
+ case NVMF_OPT_TLS:
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->tls = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -1156,6 +1219,8 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
void nvmf_free_options(struct nvmf_ctrl_options *opts)
{
nvmf_host_put(opts->host);
+ key_put(opts->keyring);
+ key_put(opts->tls_key);
kfree(opts->transport);
kfree(opts->traddr);
kfree(opts->trsvcid);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 82e7a27ffbde..fbaee5a7be19 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -70,6 +70,9 @@ enum {
NVMF_OPT_DISCOVERY = 1 << 22,
NVMF_OPT_DHCHAP_SECRET = 1 << 23,
NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
+ NVMF_OPT_TLS = 1 << 25,
+ NVMF_OPT_KEYRING = 1 << 26,
+ NVMF_OPT_TLS_KEY = 1 << 27,
};
/**
@@ -102,6 +105,9 @@ enum {
* @dhchap_secret: DH-HMAC-CHAP secret
* @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
* authentication
+ * @keyring: Keyring to use for key lookups
+ * @tls_key: TLS key for encrypted connections (TCP)
+ * @tls: Start TLS encrypted connections (TCP)
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
@@ -128,6 +134,9 @@ struct nvmf_ctrl_options {
struct nvmf_host *host;
char *dhchap_secret;
char *dhchap_ctrl_secret;
+ struct key *keyring;
+ struct key *tls_key;
+ bool tls;
bool disable_sqflow;
bool hdr_digest;
bool data_digest;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f35647c470af..39a90b7cb125 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -349,7 +349,7 @@ struct nvme_ctrl {
struct work_struct ana_work;
#endif
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
struct work_struct dhchap_auth_work;
struct mutex dhchap_auth_mutex;
struct nvme_dhchap_queue_context *dhchap_ctxs;
@@ -357,6 +357,7 @@ struct nvme_ctrl {
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
#endif
+ struct key *tls_key;
/* Power saving configuration */
u64 ps_max_latency_us;
@@ -1048,7 +1049,7 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
int __init nvme_init_auth(void);
void __exit nvme_exit_auth(void);
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3f0c9ee09a12..507bc149046d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -924,7 +924,6 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- req->mq_hctx->tags->rqs[req->tag] = req;
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 212e1b05d298..c6b7fbd4d34d 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -409,7 +409,7 @@ static ssize_t dctype_show(struct device *dev,
}
static DEVICE_ATTR_RO(dctype);
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -527,6 +527,19 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+static ssize_t tls_key_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (!ctrl->tls_key)
+ return 0;
+ return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key));
+}
+static DEVICE_ATTR_RO(tls_key);
+#endif
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -550,10 +563,13 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_kato.attr,
&dev_attr_cntrltype.attr,
&dev_attr_dctype.attr,
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+ &dev_attr_tls_key.attr,
+#endif
NULL
};
@@ -577,12 +593,17 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
return 0;
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
return 0;
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
return 0;
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+ if (a == &dev_attr_tls_key.attr &&
+ (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")))
+ return 0;
+#endif
return a->mode;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 5b332d9f87fc..4714a902f4ca 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -8,9 +8,14 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/key.h>
#include <linux/nvme-tcp.h>
+#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
+#include <net/tls_prot.h>
+#include <net/handshake.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
@@ -31,6 +36,16 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+#ifdef CONFIG_NVME_TCP_TLS
+/*
+ * TLS handshake timeout
+ */
+static int tls_handshake_timeout = 10;
+module_param(tls_handshake_timeout, int, 0644);
+MODULE_PARM_DESC(tls_handshake_timeout,
+ "nvme TLS handshake timeout in seconds (default 10)");
+#endif
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -146,7 +161,10 @@ struct nvme_tcp_queue {
struct ahash_request *snd_hash;
__le32 exp_ddgst;
__le32 recv_ddgst;
-
+#ifdef CONFIG_NVME_TCP_TLS
+ struct completion tls_complete;
+ int tls_err;
+#endif
struct page_frag_cache pf_cache;
void (*state_change)(struct sock *);
@@ -1338,7 +1356,9 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
}
noreclaim_flag = memalloc_noreclaim_save();
- sock_release(queue->sock);
+ /* ->sock will be released by fput() */
+ fput(queue->sock->file);
+ queue->sock = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
kfree(queue->pdu);
@@ -1350,6 +1370,8 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq;
struct nvme_tcp_icresp_pdu *icresp;
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
+ u8 ctype;
struct msghdr msg = {};
struct kvec iov;
bool ctrl_hdgst, ctrl_ddgst;
@@ -1381,17 +1403,35 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
iov.iov_base = icreq;
iov.iov_len = sizeof(*icreq);
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
- if (ret < 0)
+ if (ret < 0) {
+ pr_warn("queue %d: failed to send icreq, error %d\n",
+ nvme_tcp_queue_id(queue), ret);
goto free_icresp;
+ }
memset(&msg, 0, sizeof(msg));
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
+ if (queue->ctrl->ctrl.opts->tls) {
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+ }
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
- if (ret < 0)
+ if (ret < 0) {
+ pr_warn("queue %d: failed to receive icresp, error %d\n",
+ nvme_tcp_queue_id(queue), ret);
goto free_icresp;
-
+ }
+ if (queue->ctrl->ctrl.opts->tls) {
+ ctype = tls_get_record_type(queue->sock->sk,
+ (struct cmsghdr *)cbuf);
+ if (ctype != TLS_RECORD_TYPE_DATA) {
+ pr_err("queue %d: unhandled TLS record %d\n",
+ nvme_tcp_queue_id(queue), ctype);
+ return -ENOTCONN;
+ }
+ }
ret = -EINVAL;
if (icresp->hdr.type != nvme_tcp_icresp) {
pr_err("queue %d: bad type returned %d\n",
@@ -1507,11 +1547,99 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
}
-static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
+#ifdef CONFIG_NVME_TCP_TLS
+static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
+{
+ struct nvme_tcp_queue *queue = data;
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+ struct key *tls_key;
+
+ dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n",
+ qid, pskid, status);
+
+ if (status) {
+ queue->tls_err = -status;
+ goto out_complete;
+ }
+
+ tls_key = key_lookup(pskid);
+ if (IS_ERR(tls_key)) {
+ dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
+ qid, pskid);
+ queue->tls_err = -ENOKEY;
+ } else {
+ ctrl->ctrl.tls_key = tls_key;
+ queue->tls_err = 0;
+ }
+
+out_complete:
+ complete(&queue->tls_complete);
+}
+
+static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
+ struct nvme_tcp_queue *queue,
+ key_serial_t pskid)
+{
+ int qid = nvme_tcp_queue_id(queue);
+ int ret;
+ struct tls_handshake_args args;
+ unsigned long tmo = tls_handshake_timeout * HZ;
+ key_serial_t keyring = nvme_keyring_id();
+
+ dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n",
+ qid, pskid);
+ memset(&args, 0, sizeof(args));
+ args.ta_sock = queue->sock;
+ args.ta_done = nvme_tcp_tls_done;
+ args.ta_data = queue;
+ args.ta_my_peerids[0] = pskid;
+ args.ta_num_peerids = 1;
+ if (nctrl->opts->keyring)
+ keyring = key_serial(nctrl->opts->keyring);
+ args.ta_keyring = keyring;
+ args.ta_timeout_ms = tls_handshake_timeout * 1000;
+ queue->tls_err = -EOPNOTSUPP;
+ init_completion(&queue->tls_complete);
+ ret = tls_client_hello_psk(&args, GFP_KERNEL);
+ if (ret) {
+ dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n",
+ qid, ret);
+ return ret;
+ }
+ ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+
+ dev_err(nctrl->device,
+ "queue %d: TLS handshake failed, error %d\n",
+ qid, ret);
+ tls_handshake_cancel(queue->sock->sk);
+ } else {
+ dev_dbg(nctrl->device,
+ "queue %d: TLS handshake complete, error %d\n",
+ qid, queue->tls_err);
+ ret = queue->tls_err;
+ }
+ return ret;
+}
+#else
+static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
+ struct nvme_tcp_queue *queue,
+ key_serial_t pskid)
+{
+ return -EPROTONOSUPPORT;
+}
+#endif
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
+ key_serial_t pskid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
int ret, rcv_pdu_size;
+ struct file *sock_file;
mutex_init(&queue->queue_lock);
queue->ctrl = ctrl;
@@ -1534,6 +1662,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
goto err_destroy_mutex;
}
+ sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
+ if (IS_ERR(sock_file)) {
+ ret = PTR_ERR(sock_file);
+ goto err_destroy_mutex;
+ }
nvme_tcp_reclassify_socket(queue->sock);
/* Single syn retry */
@@ -1624,6 +1757,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
goto err_rcv_pdu;
}
+ /* If PSKs are configured try to start TLS */
+ if (pskid) {
+ ret = nvme_tcp_start_tls(nctrl, queue, pskid);
+ if (ret)
+ goto err_init_connect;
+ }
+
ret = nvme_tcp_init_connection(queue);
if (ret)
goto err_init_connect;
@@ -1640,7 +1780,8 @@ err_crypto:
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
err_sock:
- sock_release(queue->sock);
+ /* ->sock will be released by fput() */
+ fput(queue->sock->file);
queue->sock = NULL;
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
@@ -1772,10 +1913,25 @@ out_stop_queues:
static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{
int ret;
+ key_serial_t pskid = 0;
- ret = nvme_tcp_alloc_queue(ctrl, 0);
+ if (ctrl->opts->tls) {
+ if (ctrl->opts->tls_key)
+ pskid = key_serial(ctrl->opts->tls_key);
+ else
+ pskid = nvme_tls_psk_default(ctrl->opts->keyring,
+ ctrl->opts->host->nqn,
+ ctrl->opts->subsysnqn);
+ if (!pskid) {
+ dev_err(ctrl->device, "no valid PSK found\n");
+ ret = -ENOKEY;
+ goto out_free_queue;
+ }
+ }
+
+ ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
if (ret)
- return ret;
+ goto out_free_queue;
ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
if (ret)
@@ -1792,8 +1948,13 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
+ if (ctrl->opts->tls && !ctrl->tls_key) {
+ dev_err(ctrl->device, "no PSK negotiated\n");
+ return -ENOKEY;
+ }
for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvme_tcp_alloc_queue(ctrl, i);
+ ret = nvme_tcp_alloc_queue(ctrl, i,
+ key_serial(ctrl->tls_key));
if (ret)
goto out_free_queues;
}
@@ -2621,7 +2782,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
- NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
+ NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
+ NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
.create_ctrl = nvme_tcp_create_ctrl,
};
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 79fc64035ee3..fa479c9f5c3d 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -84,16 +84,26 @@ config NVME_TARGET_TCP
If unsure, say N.
+config NVME_TARGET_TCP_TLS
+ bool "NVMe over Fabrics TCP target TLS encryption support"
+ depends on NVME_TARGET_TCP
+ select NVME_COMMON
+ select NVME_KEYRING
+ select NET_HANDSHAKE
+ select KEYS
+ help
+ Enables TLS encryption for the NVMe TCP target using the netlink handshake API.
+
+ The TLS handshake daemon is available at
+ https://github.com/oracle/ktls-utils.
+
+ If unsure, say N.
+
config NVME_TARGET_AUTH
bool "NVMe over Fabrics In-band Authentication support"
depends on NVME_TARGET
select NVME_COMMON
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA256
- select CRYPTO_SHA512
- select CRYPTO_DH
- select CRYPTO_DH_RFC7919_GROUPS
+ select NVME_AUTH
help
This enables support for NVMe over Fabrics In-band Authentication
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 4dcddcf95279..3ddbc3880cac 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -267,7 +267,8 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
struct shash_desc *shash;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
- u8 *challenge = req->sq->dhchap_c1, *host_response;
+ u8 *challenge = req->sq->dhchap_c1;
+ struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
@@ -291,14 +292,15 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
goto out_free_tfm;
}
- host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
- if (IS_ERR(host_response)) {
- ret = PTR_ERR(host_response);
+ transformed_key = nvme_auth_transform_key(ctrl->host_key,
+ ctrl->hostnqn);
+ if (IS_ERR(transformed_key)) {
+ ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
- ret = crypto_shash_setkey(shash_tfm, host_response,
- ctrl->host_key->len);
+ ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
+ transformed_key->len);
if (ret)
goto out_free_response;
@@ -365,7 +367,7 @@ out:
kfree(challenge);
kfree(shash);
out_free_response:
- kfree_sensitive(host_response);
+ nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return 0;
@@ -378,7 +380,8 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
struct shash_desc *shash;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
- u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
+ u8 *challenge = req->sq->dhchap_c2;
+ struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
@@ -402,15 +405,15 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
goto out_free_tfm;
}
- ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+ transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
ctrl->subsysnqn);
- if (IS_ERR(ctrl_response)) {
- ret = PTR_ERR(ctrl_response);
+ if (IS_ERR(transformed_key)) {
+ ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
- ret = crypto_shash_setkey(shash_tfm, ctrl_response,
- ctrl->ctrl_key->len);
+ ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
+ transformed_key->len);
if (ret)
goto out_free_response;
@@ -474,7 +477,7 @@ out:
kfree(challenge);
kfree(shash);
out_free_response:
- kfree_sensitive(ctrl_response);
+ nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return 0;
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 907143870da5..9eed6e6765ea 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -15,6 +15,7 @@
#ifdef CONFIG_NVME_TARGET_AUTH
#include <linux/nvme-auth.h>
#endif
+#include <linux/nvme-keyring.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
@@ -159,10 +160,14 @@ static const struct nvmet_type_name_map nvmet_addr_treq[] = {
{ NVMF_TREQ_NOT_REQUIRED, "not required" },
};
+static inline u8 nvmet_port_disc_addr_treq_mask(struct nvmet_port *port)
+{
+ return (port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK);
+}
+
static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page)
{
- u8 treq = to_nvmet_port(item)->disc_addr.treq &
- NVME_TREQ_SECURE_CHANNEL_MASK;
+ u8 treq = nvmet_port_disc_addr_treq_secure_channel(to_nvmet_port(item));
int i;
for (i = 0; i < ARRAY_SIZE(nvmet_addr_treq); i++) {
@@ -178,7 +183,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
- u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK;
+ u8 treq = nvmet_port_disc_addr_treq_mask(port);
int i;
if (nvmet_is_port_enabled(port, __func__))
@@ -193,6 +198,20 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
return -EINVAL;
found:
+ if (port->disc_addr.trtype == NVMF_TRTYPE_TCP &&
+ port->disc_addr.tsas.tcp.sectype == NVMF_TCP_SECTYPE_TLS13) {
+ switch (nvmet_addr_treq[i].type) {
+ case NVMF_TREQ_NOT_SPECIFIED:
+ pr_debug("treq '%s' not allowed for TLS1.3\n",
+ nvmet_addr_treq[i].name);
+ return -EINVAL;
+ case NVMF_TREQ_NOT_REQUIRED:
+ pr_warn("Allow non-TLS connections while TLS1.3 is enabled\n");
+ break;
+ default:
+ break;
+ }
+ }
treq |= nvmet_addr_treq[i].type;
port->disc_addr.treq = treq;
return count;
@@ -303,6 +322,11 @@ static void nvmet_port_init_tsas_rdma(struct nvmet_port *port)
port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM;
}
+static void nvmet_port_init_tsas_tcp(struct nvmet_port *port, int sectype)
+{
+ port->disc_addr.tsas.tcp.sectype = sectype;
+}
+
static ssize_t nvmet_addr_trtype_store(struct config_item *item,
const char *page, size_t count)
{
@@ -325,11 +349,99 @@ found:
port->disc_addr.trtype = nvmet_transport[i].type;
if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA)
nvmet_port_init_tsas_rdma(port);
+ else if (port->disc_addr.trtype == NVMF_TRTYPE_TCP)
+ nvmet_port_init_tsas_tcp(port, NVMF_TCP_SECTYPE_NONE);
return count;
}
CONFIGFS_ATTR(nvmet_, addr_trtype);
+static const struct nvmet_type_name_map nvmet_addr_tsas_tcp[] = {
+ { NVMF_TCP_SECTYPE_NONE, "none" },
+ { NVMF_TCP_SECTYPE_TLS13, "tls1.3" },
+};
+
+static const struct nvmet_type_name_map nvmet_addr_tsas_rdma[] = {
+ { NVMF_RDMA_QPTYPE_CONNECTED, "connected" },
+ { NVMF_RDMA_QPTYPE_DATAGRAM, "datagram" },
+};
+
+static ssize_t nvmet_addr_tsas_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+ int i;
+
+ if (port->disc_addr.trtype == NVMF_TRTYPE_TCP) {
+ for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) {
+ if (port->disc_addr.tsas.tcp.sectype == nvmet_addr_tsas_tcp[i].type)
+ return sprintf(page, "%s\n", nvmet_addr_tsas_tcp[i].name);
+ }
+ } else if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) {
+ for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_rdma); i++) {
+ if (port->disc_addr.tsas.rdma.qptype == nvmet_addr_tsas_rdma[i].type)
+ return sprintf(page, "%s\n", nvmet_addr_tsas_rdma[i].name);
+ }
+ }
+ return sprintf(page, "reserved\n");
+}
+
+static ssize_t nvmet_addr_tsas_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+ u8 treq = nvmet_port_disc_addr_treq_mask(port);
+ u8 sectype;
+ int i;
+
+ if (nvmet_is_port_enabled(port, __func__))
+ return -EACCES;
+
+ if (port->disc_addr.trtype != NVMF_TRTYPE_TCP)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) {
+ if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) {
+ sectype = nvmet_addr_tsas_tcp[i].type;
+ goto found;
+ }
+ }
+
+ pr_err("Invalid value '%s' for tsas\n", page);
+ return -EINVAL;
+
+found:
+ if (sectype == NVMF_TCP_SECTYPE_TLS13) {
+ if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ return -EINVAL;
+ }
+ if (!port->keyring) {
+ pr_err("TLS keyring not configured\n");
+ return -EINVAL;
+ }
+ }
+
+ nvmet_port_init_tsas_tcp(port, sectype);
+ /*
+ * If TLS is enabled TREQ should be set to 'required' per default
+ */
+ if (sectype == NVMF_TCP_SECTYPE_TLS13) {
+ u8 sc = nvmet_port_disc_addr_treq_secure_channel(port);
+
+ if (sc == NVMF_TREQ_NOT_SPECIFIED)
+ treq |= NVMF_TREQ_REQUIRED;
+ else
+ treq |= sc;
+ } else {
+ treq |= NVMF_TREQ_NOT_SPECIFIED;
+ }
+ port->disc_addr.treq = treq;
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_, addr_tsas);
+
/*
* Namespace structures & file operation functions below
*/
@@ -1731,6 +1843,7 @@ static void nvmet_port_release(struct config_item *item)
flush_workqueue(nvmet_wq);
list_del(&port->global_entry);
+ key_put(port->keyring);
kfree(port->ana_state);
kfree(port);
}
@@ -1741,6 +1854,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_traddr,
&nvmet_attr_addr_trsvcid,
&nvmet_attr_addr_trtype,
+ &nvmet_attr_addr_tsas,
&nvmet_attr_param_inline_data_size,
#ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_attr_param_pi_enable,
@@ -1779,6 +1893,14 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
return ERR_PTR(-ENOMEM);
}
+ if (nvme_keyring_id()) {
+ port->keyring = key_lookup(nvme_keyring_id());
+ if (IS_ERR(port->keyring)) {
+ pr_warn("NVMe keyring not available, disabling TLS\n");
+ port->keyring = NULL;
+ }
+ }
+
for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
if (i == NVMET_DEFAULT_ANA_GRPID)
port->ana_state[1] = NVME_ANA_OPTIMIZED;
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 1ab6601fdd5c..bd59990b5250 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -146,7 +146,8 @@ struct nvmet_fc_tgt_queue {
struct workqueue_struct *work_q;
struct kref ref;
struct rcu_head rcu;
- struct nvmet_fc_fcp_iod fod[]; /* array of fcp_iods */
+ /* array of fcp_iods */
+ struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize);
} __aligned(sizeof(unsigned long long));
struct nvmet_fc_hostport {
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 360e385be33b..6c8acebe1a1a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -159,6 +159,7 @@ struct nvmet_port {
struct config_group ana_groups_group;
struct nvmet_ana_group ana_default_group;
enum nvme_ana_state *ana_state;
+ struct key *keyring;
void *priv;
bool enabled;
int inline_data_size;
@@ -179,6 +180,16 @@ static inline struct nvmet_port *ana_groups_to_port(
ana_groups_group);
}
+static inline u8 nvmet_port_disc_addr_treq_secure_channel(struct nvmet_port *port)
+{
+ return (port->disc_addr.treq & NVME_TREQ_SECURE_CHANNEL_MASK);
+}
+
+static inline bool nvmet_port_secure_channel_required(struct nvmet_port *port)
+{
+ return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED;
+}
+
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_sq **sqs;
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 197fc2ecb164..92b74d0b8686 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -8,9 +8,14 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/key.h>
#include <linux/nvme-tcp.h>
+#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
+#include <net/tls_prot.h>
+#include <net/handshake.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>
@@ -66,6 +71,16 @@ device_param_cb(idle_poll_period_usecs, &set_param_ops,
MODULE_PARM_DESC(idle_poll_period_usecs,
"nvmet tcp io_work poll till idle time period in usecs: Default 0");
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+/*
+ * TLS handshake timeout
+ */
+static int tls_handshake_timeout = 10;
+module_param(tls_handshake_timeout, int, 0644);
+MODULE_PARM_DESC(tls_handshake_timeout,
+ "nvme TLS handshake timeout in seconds (default 10)");
+#endif
+
#define NVMET_TCP_RECV_BUDGET 8
#define NVMET_TCP_SEND_BUDGET 8
#define NVMET_TCP_IO_WORK_BUDGET 64
@@ -104,6 +119,7 @@ struct nvmet_tcp_cmd {
u32 pdu_len;
u32 pdu_recv;
int sg_idx;
+ char recv_cbuf[CMSG_LEN(sizeof(char))];
struct msghdr recv_msg;
struct bio_vec *iov;
u32 flags;
@@ -122,8 +138,10 @@ struct nvmet_tcp_cmd {
enum nvmet_tcp_queue_state {
NVMET_TCP_Q_CONNECTING,
+ NVMET_TCP_Q_TLS_HANDSHAKE,
NVMET_TCP_Q_LIVE,
NVMET_TCP_Q_DISCONNECTING,
+ NVMET_TCP_Q_FAILED,
};
struct nvmet_tcp_queue {
@@ -132,6 +150,7 @@ struct nvmet_tcp_queue {
struct work_struct io_work;
struct nvmet_cq nvme_cq;
struct nvmet_sq nvme_sq;
+ struct kref kref;
/* send state */
struct nvmet_tcp_cmd *cmds;
@@ -155,6 +174,10 @@ struct nvmet_tcp_queue {
struct ahash_request *snd_hash;
struct ahash_request *rcv_hash;
+ /* TLS state */
+ key_serial_t tls_pskid;
+ struct delayed_work tls_handshake_tmo_work;
+
unsigned long poll_end;
spinlock_t state_lock;
@@ -910,8 +933,10 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
- if (ret < 0)
+ if (ret < 0) {
+ queue->state = NVMET_TCP_Q_FAILED;
return ret; /* queue removal will cleanup */
+ }
queue->state = NVMET_TCP_Q_LIVE;
nvmet_prepare_receive_pdu(queue);
@@ -1096,20 +1121,65 @@ static inline bool nvmet_tcp_pdu_valid(u8 type)
return false;
}
+static int nvmet_tcp_tls_record_ok(struct nvmet_tcp_queue *queue,
+ struct msghdr *msg, char *cbuf)
+{
+ struct cmsghdr *cmsg = (struct cmsghdr *)cbuf;
+ u8 ctype, level, description;
+ int ret = 0;
+
+ ctype = tls_get_record_type(queue->sock->sk, cmsg);
+ switch (ctype) {
+ case 0:
+ break;
+ case TLS_RECORD_TYPE_DATA:
+ break;
+ case TLS_RECORD_TYPE_ALERT:
+ tls_alert_recv(queue->sock->sk, msg, &level, &description);
+ if (level == TLS_ALERT_LEVEL_FATAL) {
+ pr_err("queue %d: TLS Alert desc %u\n",
+ queue->idx, description);
+ ret = -ENOTCONN;
+ } else {
+ pr_warn("queue %d: TLS Alert desc %u\n",
+ queue->idx, description);
+ ret = -EAGAIN;
+ }
+ break;
+ default:
+ /* discard this record type */
+ pr_err("queue %d: TLS record %d unhandled\n",
+ queue->idx, ctype);
+ ret = -EAGAIN;
+ break;
+ }
+ return ret;
+}
+
static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
- int len;
+ int len, ret;
struct kvec iov;
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
recv:
iov.iov_base = (void *)&queue->pdu + queue->offset;
iov.iov_len = queue->left;
+ if (queue->tls_pskid) {
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+ }
len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
if (unlikely(len < 0))
return len;
+ if (queue->tls_pskid) {
+ ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
+ if (ret < 0)
+ return ret;
+ }
queue->offset += len;
queue->left -= len;
@@ -1162,16 +1232,22 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmd;
- int ret;
+ int len, ret;
while (msg_data_left(&cmd->recv_msg)) {
- ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
+ len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
cmd->recv_msg.msg_flags);
- if (ret <= 0)
- return ret;
+ if (len <= 0)
+ return len;
+ if (queue->tls_pskid) {
+ ret = nvmet_tcp_tls_record_ok(cmd->queue,
+ &cmd->recv_msg, cmd->recv_cbuf);
+ if (ret < 0)
+ return ret;
+ }
- cmd->pdu_recv += ret;
- cmd->rbytes_done += ret;
+ cmd->pdu_recv += len;
+ cmd->rbytes_done += len;
}
if (queue->data_digest) {
@@ -1189,20 +1265,30 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmd;
- int ret;
+ int ret, len;
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
.iov_len = queue->left
};
- ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ if (queue->tls_pskid) {
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+ }
+ len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
- if (unlikely(ret < 0))
- return ret;
+ if (unlikely(len < 0))
+ return len;
+ if (queue->tls_pskid) {
+ ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
+ if (ret < 0)
+ return ret;
+ }
- queue->offset += ret;
- queue->left -= ret;
+ queue->offset += len;
+ queue->left -= len;
if (queue->left)
return -EAGAIN;
@@ -1280,14 +1366,27 @@ done:
return ret;
}
+static void nvmet_tcp_release_queue(struct kref *kref)
+{
+ struct nvmet_tcp_queue *queue =
+ container_of(kref, struct nvmet_tcp_queue, kref);
+
+ WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
+ queue_work(nvmet_wq, &queue->release_work);
+}
+
static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
- spin_lock(&queue->state_lock);
+ spin_lock_bh(&queue->state_lock);
+ if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
+ /* Socket closed during handshake */
+ tls_handshake_cancel(queue->sock->sk);
+ }
if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
queue->state = NVMET_TCP_Q_DISCONNECTING;
- queue_work(nvmet_wq, &queue->release_work);
+ kref_put(&queue->kref, nvmet_tcp_release_queue);
}
- spin_unlock(&queue->state_lock);
+ spin_unlock_bh(&queue->state_lock);
}
static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue)
@@ -1369,6 +1468,10 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
if (!c->r2t_pdu)
goto out_free_data;
+ if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
+ c->recv_msg.msg_control = c->recv_cbuf;
+ c->recv_msg.msg_controllen = sizeof(c->recv_cbuf);
+ }
c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
list_add_tail(&c->entry, &queue->free_list);
@@ -1482,6 +1585,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
mutex_unlock(&nvmet_tcp_queue_mutex);
nvmet_tcp_restore_socket_callbacks(queue);
+ cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);
cancel_work_sync(&queue->io_work);
/* stop accepting incoming data */
queue->rcv_state = NVMET_TCP_RECV_ERR;
@@ -1490,12 +1594,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
nvmet_sq_destroy(&queue->nvme_sq);
cancel_work_sync(&queue->io_work);
nvmet_tcp_free_cmd_data_in_buffers(queue);
- sock_release(queue->sock);
+ /* ->sock will be released by fput() */
+ fput(queue->sock->file);
nvmet_tcp_free_cmds(queue);
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_queue_ida, queue->idx);
-
page = virt_to_head_page(queue->pf_cache.va);
__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
kfree(queue);
@@ -1509,8 +1613,13 @@ static void nvmet_tcp_data_ready(struct sock *sk)
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
- if (likely(queue))
- queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+ if (likely(queue)) {
+ if (queue->data_ready)
+ queue->data_ready(sk);
+ if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
+ &queue->io_work);
+ }
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1618,31 +1727,174 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
return ret;
}
-static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue)
+{
+ struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+ int len, ret;
+ struct kvec iov = {
+ .iov_base = (u8 *)&queue->pdu + queue->offset,
+ .iov_len = sizeof(struct nvme_tcp_hdr),
+ };
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
+ struct msghdr msg = {
+ .msg_control = cbuf,
+ .msg_controllen = sizeof(cbuf),
+ .msg_flags = MSG_PEEK,
+ };
+
+ if (nvmet_port_secure_channel_required(queue->port->nport))
+ return 0;
+
+ len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (unlikely(len < 0)) {
+ pr_debug("queue %d: peek error %d\n",
+ queue->idx, len);
+ return len;
+ }
+
+ ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
+ if (ret < 0)
+ return ret;
+
+ if (len < sizeof(struct nvme_tcp_hdr)) {
+ pr_debug("queue %d: short read, %d bytes missing\n",
+ queue->idx, (int)iov.iov_len - len);
+ return -EAGAIN;
+ }
+ pr_debug("queue %d: hdr type %d hlen %d plen %d size %d\n",
+ queue->idx, hdr->type, hdr->hlen, hdr->plen,
+ (int)sizeof(struct nvme_tcp_icreq_pdu));
+ if (hdr->type == nvme_tcp_icreq &&
+ hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) &&
+ hdr->plen == (__le32)sizeof(struct nvme_tcp_icreq_pdu)) {
+ pr_debug("queue %d: icreq detected\n",
+ queue->idx);
+ return len;
+ }
+ return 0;
+}
+
+static void nvmet_tcp_tls_handshake_done(void *data, int status,
+ key_serial_t peerid)
+{
+ struct nvmet_tcp_queue *queue = data;
+
+ pr_debug("queue %d: TLS handshake done, key %x, status %d\n",
+ queue->idx, peerid, status);
+ spin_lock_bh(&queue->state_lock);
+ if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) {
+ spin_unlock_bh(&queue->state_lock);
+ return;
+ }
+ if (!status) {
+ queue->tls_pskid = peerid;
+ queue->state = NVMET_TCP_Q_CONNECTING;
+ } else
+ queue->state = NVMET_TCP_Q_FAILED;
+ spin_unlock_bh(&queue->state_lock);
+
+ cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);
+ if (status)
+ nvmet_tcp_schedule_release_queue(queue);
+ else
+ nvmet_tcp_set_queue_sock(queue);
+ kref_put(&queue->kref, nvmet_tcp_release_queue);
+}
+
+static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w)
+{
+ struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w),
+ struct nvmet_tcp_queue, tls_handshake_tmo_work);
+
+ pr_warn("queue %d: TLS handshake timeout\n", queue->idx);
+ /*
+ * If tls_handshake_cancel() fails we've lost the race with
+ * nvmet_tcp_tls_handshake_done() */
+ if (!tls_handshake_cancel(queue->sock->sk))
+ return;
+ spin_lock_bh(&queue->state_lock);
+ if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) {
+ spin_unlock_bh(&queue->state_lock);
+ return;
+ }
+ queue->state = NVMET_TCP_Q_FAILED;
+ spin_unlock_bh(&queue->state_lock);
+ nvmet_tcp_schedule_release_queue(queue);
+ kref_put(&queue->kref, nvmet_tcp_release_queue);
+}
+
+static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue)
+{
+ int ret = -EOPNOTSUPP;
+ struct tls_handshake_args args;
+
+ if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) {
+ pr_warn("cannot start TLS in state %d\n", queue->state);
+ return -EINVAL;
+ }
+
+ kref_get(&queue->kref);
+ pr_debug("queue %d: TLS ServerHello\n", queue->idx);
+ memset(&args, 0, sizeof(args));
+ args.ta_sock = queue->sock;
+ args.ta_done = nvmet_tcp_tls_handshake_done;
+ args.ta_data = queue;
+ args.ta_keyring = key_serial(queue->port->nport->keyring);
+ args.ta_timeout_ms = tls_handshake_timeout * 1000;
+
+ ret = tls_server_hello_psk(&args, GFP_KERNEL);
+ if (ret) {
+ kref_put(&queue->kref, nvmet_tcp_release_queue);
+ pr_err("failed to start TLS, err=%d\n", ret);
+ } else {
+ queue_delayed_work(nvmet_wq, &queue->tls_handshake_tmo_work,
+ tls_handshake_timeout * HZ);
+ }
+ return ret;
+}
+#endif
+
+static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
struct socket *newsock)
{
struct nvmet_tcp_queue *queue;
+ struct file *sock_file = NULL;
int ret;
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
- if (!queue)
- return -ENOMEM;
+ if (!queue) {
+ ret = -ENOMEM;
+ goto out_release;
+ }
INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+ kref_init(&queue->kref);
queue->sock = newsock;
queue->port = port;
queue->nr_cmds = 0;
spin_lock_init(&queue->state_lock);
- queue->state = NVMET_TCP_Q_CONNECTING;
+ if (queue->port->nport->disc_addr.tsas.tcp.sectype ==
+ NVMF_TCP_SECTYPE_TLS13)
+ queue->state = NVMET_TCP_Q_TLS_HANDSHAKE;
+ else
+ queue->state = NVMET_TCP_Q_CONNECTING;
INIT_LIST_HEAD(&queue->free_list);
init_llist_head(&queue->resp_list);
INIT_LIST_HEAD(&queue->resp_send_list);
+ sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
+ if (IS_ERR(sock_file)) {
+ ret = PTR_ERR(sock_file);
+ goto out_free_queue;
+ }
+
queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL);
if (queue->idx < 0) {
ret = queue->idx;
- goto out_free_queue;
+ goto out_sock;
}
ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
@@ -1659,11 +1911,33 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
mutex_unlock(&nvmet_tcp_queue_mutex);
+#ifdef CONFIG_NVME_TARGET_TCP_TLS
+ INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work,
+ nvmet_tcp_tls_handshake_timeout);
+ if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
+ struct sock *sk = queue->sock->sk;
+
+ /* Restore the default callbacks before starting upcall */
+ read_lock_bh(&sk->sk_callback_lock);
+ sk->sk_user_data = NULL;
+ sk->sk_data_ready = port->data_ready;
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (!nvmet_tcp_try_peek_pdu(queue)) {
+ if (!nvmet_tcp_tls_handshake(queue))
+ return;
+ /* TLS handshake failed, terminate the connection */
+ goto out_destroy_sq;
+ }
+ /* Not a TLS connection, continue with normal processing */
+ queue->state = NVMET_TCP_Q_CONNECTING;
+ }
+#endif
+
ret = nvmet_tcp_set_queue_sock(queue);
if (ret)
goto out_destroy_sq;
- return 0;
+ return;
out_destroy_sq:
mutex_lock(&nvmet_tcp_queue_mutex);
list_del_init(&queue->queue_list);
@@ -1673,9 +1947,14 @@ out_free_connect:
nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
ida_free(&nvmet_tcp_queue_ida, queue->idx);
+out_sock:
+ fput(queue->sock->file);
out_free_queue:
kfree(queue);
- return ret;
+out_release:
+ pr_err("failed to allocate queue, error %d\n", ret);
+ if (!sock_file)
+ sock_release(newsock);
}
static void nvmet_tcp_accept_work(struct work_struct *w)
@@ -1692,11 +1971,7 @@ static void nvmet_tcp_accept_work(struct work_struct *w)
pr_warn("failed to accept err=%d\n", ret);
return;
}
- ret = nvmet_tcp_alloc_queue(port, newsock);
- if (ret) {
- pr_err("failed to allocate queue\n");
- sock_release(newsock);
- }
+ nvmet_tcp_alloc_queue(port, newsock);
}
}