1 files changed, 166 insertions, 214 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a128672472ec..f8db70ae172d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -27,7 +27,6 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kdev_t.h>
-#include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -39,6 +38,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/t10-pi.h>
+#include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <asm/unaligned.h>
@@ -57,18 +57,6 @@
 #define NVME_NR_AEN_COMMANDS	1
 #define NVME_AQ_BLKMQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
 
-unsigned char admin_timeout = 60;
-module_param(admin_timeout, byte, 0644);
-MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
-
-unsigned char nvme_io_timeout = 30;
-module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
-MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
-
-unsigned char shutdown_timeout = 5;
-module_param(shutdown_timeout, byte, 0644);
-MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -76,24 +64,19 @@ static bool use_cmb_sqes = true;
 module_param(use_cmb_sqes, bool, 0644);
 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
 
-static LIST_HEAD(dev_list);
-static struct task_struct *nvme_thread;
 static struct workqueue_struct *nvme_workq;
-static wait_queue_head_t nvme_kthread_wait;
 
 struct nvme_dev;
 struct nvme_queue;
 
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
 struct nvme_dev {
-	struct list_head node;
 	struct nvme_queue **queues;
 	struct blk_mq_tag_set tagset;
 	struct blk_mq_tag_set admin_tagset;
@@ -111,6 +94,8 @@ struct nvme_dev {
 	struct work_struct reset_work;
 	struct work_struct scan_work;
 	struct work_struct remove_work;
+	struct work_struct async_work;
+	struct timer_list watchdog_timer;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	void __iomem *cmb;
@@ -120,6 +105,7 @@ struct nvme_dev {
 	unsigned long flags;
 
 #define NVME_CTRL_RESETTING    0
+#define NVME_CTRL_REMOVING     1
 
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
@@ -148,7 +134,6 @@ struct nvme_queue {
 	u32 __iomem *q_db;
 	u16 q_depth;
 	s16 cq_vector;
-	u16 sq_head;
 	u16 sq_tail;
 	u16 cq_head;
 	u16 qid;
@@ -286,23 +271,37 @@ static int nvme_init_request(void *data, struct request *req,
 	return 0;
 }
 
+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+	/*
+	 * Do not queue new scan work when a controller is reset during
+	 * removal.
+	 */
+	if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+		return;
+	queue_work(nvme_workq, &dev->scan_work);
+}
+
 static void nvme_complete_async_event(struct nvme_dev *dev,
 		struct nvme_completion *cqe)
 {
 	u16 status = le16_to_cpu(cqe->status) >> 1;
 	u32 result = le32_to_cpu(cqe->result);
 
-	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
+	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
 		++dev->ctrl.event_limit;
+		queue_work(nvme_workq, &dev->async_work);
+	}
+
 	if (status != NVME_SC_SUCCESS)
 		return;
 
 	switch (result & 0xff07) {
 	case NVME_AER_NOTICE_NS_CHANGED:
-		dev_info(dev->dev, "rescanning\n");
-		queue_work(nvme_workq, &dev->scan_work);
+		dev_info(dev->ctrl.device, "rescanning\n");
+		nvme_queue_scan(dev);
 	default:
-		dev_warn(dev->dev, "async event result %08x\n", result);
+		dev_warn(dev->ctrl.device, "async event result %08x\n", result);
 	}
 }
 
@@ -679,7 +678,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	spin_lock_irq(&nvmeq->q_lock);
 	if (unlikely(nvmeq->cq_vector < 0)) {
-		ret = BLK_MQ_RQ_QUEUE_BUSY;
+		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+			ret = BLK_MQ_RQ_QUEUE_BUSY;
+		else
+			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out;
 	}
@@ -713,7 +715,7 @@ static void nvme_complete_rq(struct request *req)
 	}
 
 	if (unlikely(iod->aborted)) {
-		dev_warn(dev->dev,
+		dev_warn(dev->ctrl.device,
 			"completing aborted command with status: %04x\n",
 			req->errors);
 	}
@@ -735,7 +737,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 
 		if ((status & 1) != phase)
 			break;
-		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
 		if (++head == nvmeq->q_depth) {
 			head = 0;
 			phase = !phase;
@@ -745,7 +746,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 			*tag = -1;
 
 		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
-			dev_warn(nvmeq->q_dmadev,
+			dev_warn(nvmeq->dev->ctrl.device,
 				"invalid id %d completed on queue %d\n",
 				cqe.command_id, le16_to_cpu(cqe.sq_id));
 			continue;
@@ -764,10 +765,8 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 		}
 
 		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
-			u32 result = le32_to_cpu(cqe.result);
-			req->special = (void *)(uintptr_t)result;
-		}
+		if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
+			memcpy(req->special, &cqe, sizeof(cqe));
 		blk_mq_complete_request(req, status >> 1);
 
 	}
@@ -832,15 +831,22 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 	return 0;
 }
 
-static void nvme_submit_async_event(struct nvme_dev *dev)
+static void nvme_async_event_work(struct work_struct *work)
 {
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work);
+	struct nvme_queue *nvmeq = dev->queues[0];
 	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_admin_async_event;
-	c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;
 
-	__nvme_submit_cmd(dev->queues[0], &c);
+	spin_lock_irq(&nvmeq->q_lock);
+	while (dev->ctrl.event_limit > 0) {
+		c.common.command_id = NVME_AQ_BLKMQ_DEPTH +
+			--dev->ctrl.event_limit;
+		__nvme_submit_cmd(nvmeq, &c);
+	}
+	spin_unlock_irq(&nvmeq->q_lock);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -910,12 +916,10 @@ static void abort_endio(struct request *req, int error)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = iod->nvmeq;
-	u32 result = (u32)(uintptr_t)req->special;
 	u16 status = req->errors;
 
-	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
+	dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
 	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
-
 	blk_mq_free_request(req);
 }
 
@@ -934,7 +938,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * shutdown, so we return BLK_EH_HANDLED.
 	 */
 	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
-		dev_warn(dev->dev,
+		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
@@ -948,7 +952,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
  	 * returned to the driver, or if this is the admin queue.
 	 */
 	if (!nvmeq->qid || iod->aborted) {
-		dev_warn(dev->dev,
+		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
@@ -974,8 +978,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	cmd.abort.cid = req->tag;
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
 
-	dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
-				 req->tag, nvmeq->qid);
+	dev_warn(nvmeq->dev->ctrl.device,
+		"I/O %d QID %d timeout, aborting\n",
+		 req->tag, nvmeq->qid);
 
 	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
 			BLK_MQ_REQ_NOWAIT);
@@ -1004,7 +1009,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
 	if (!blk_mq_request_started(req))
 		return;
 
-	dev_dbg_ratelimited(nvmeq->q_dmadev,
+	dev_dbg_ratelimited(nvmeq->dev->ctrl.device,
 		 "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
 
 	status = NVME_SC_ABORT_REQ;
@@ -1159,9 +1164,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->qid = qid;
 	nvmeq->cq_vector = -1;
 	dev->queues[qid] = nvmeq;
-
-	/* make sure queue descriptor is set before queue count, for kthread */
-	mb();
 	dev->queue_count++;
 
 	return nvmeq;
@@ -1250,6 +1252,12 @@ static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 		blk_cleanup_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
@@ -1340,53 +1348,31 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-static int nvme_kthread(void *data)
-{
-	struct nvme_dev *dev, *next;
-
-	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		spin_lock(&dev_list_lock);
-		list_for_each_entry_safe(dev, next, &dev_list, node) {
-			int i;
-			u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-			/*
-			 * Skip controllers currently under reset.
-			 */
-			if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
-				continue;
-
-			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
-							csts & NVME_CSTS_CFS) {
-				if (queue_work(nvme_workq, &dev->reset_work)) {
-					dev_warn(dev->dev,
-						"Failed status: %x, reset controller\n",
-						readl(dev->bar + NVME_REG_CSTS));
-				}
-				continue;
-			}
-			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq = dev->queues[i];
-				if (!nvmeq)
-					continue;
-				spin_lock_irq(&nvmeq->q_lock);
-				nvme_process_cq(nvmeq);
-
-				while (i == 0 && dev->ctrl.event_limit > 0)
-					nvme_submit_async_event(dev);
-				spin_unlock_irq(&nvmeq->q_lock);
-			}
+static void nvme_watchdog_timer(unsigned long data)
+{
+	struct nvme_dev *dev = (struct nvme_dev *)data;
+	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+	/*
+	 * Skip controllers currently under reset.
+	 */
+	if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) &&
+	    ((csts & NVME_CSTS_CFS) ||
+	     (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) {
+		if (queue_work(nvme_workq, &dev->reset_work)) {
+			dev_warn(dev->dev,
+				"Failed status: 0x%x, reset controller.\n",
+				csts);
 		}
-		spin_unlock(&dev_list_lock);
-		schedule_timeout(round_jiffies_relative(HZ));
+		return;
 	}
-	return 0;
+
+	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
 }
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i;
+	unsigned i, max;
 	int ret = 0;
 
 	for (i = dev->queue_count; i <= dev->max_qid; i++) {
@@ -1396,7 +1382,8 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 		}
 	}
 
-	for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
+	max = min(dev->max_qid, dev->queue_count - 1);
+	for (i = dev->online_queues; i <= max; i++) {
 		ret = nvme_create_queue(dev->queues[i], i);
 		if (ret) {
 			nvme_free_queues(dev, i);
@@ -1487,7 +1474,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * access to the admin queue, as that might be only way to fix them up.
 	 */
 	if (result > 0) {
-		dev_err(dev->dev, "Could not set queue count (%d)\n", result);
+		dev_err(dev->ctrl.device,
+			"Could not set queue count (%d)\n", result);
 		nr_io_queues = 0;
 		result = 0;
 	}
@@ -1553,9 +1541,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		adminq->cq_vector = -1;
 		goto free_queues;
 	}
-
-	/* Free previously allocated queues that are no longer usable */
-	nvme_free_queues(dev, nr_io_queues + 1);
 	return nvme_create_io_queues(dev);
 
  free_queues:
@@ -1689,15 +1674,21 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		if (blk_mq_alloc_tag_set(&dev->tagset))
 			return 0;
 		dev->ctrl.tagset = &dev->tagset;
+	} else {
+		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+
+		/* Free previously allocated queues that are no longer usable */
+		nvme_free_queues(dev, dev->online_queues);
 	}
-	queue_work(nvme_workq, &dev->scan_work);
+
+	nvme_queue_scan(dev);
 	return 0;
 }
 
-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	u64 cap;
-	int bars, result = -ENOMEM;
+	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_enable_device_mem(pdev))
@@ -1705,24 +1696,14 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
 	dev->entry[0].vector = pdev->irq;
 	pci_set_master(pdev);
-	bars = pci_select_bars(pdev, IORESOURCE_MEM);
-	if (!bars)
-		goto disable_pci;
-
-	if (pci_request_selected_regions(pdev, bars, "nvme"))
-		goto disable_pci;
 
 	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
 	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;
 
-	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-	if (!dev->bar)
-		goto disable;
-
 	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
 		result = -ENODEV;
-		goto unmap;
+		goto disable;
 	}
 
 	/*
@@ -1732,7 +1713,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	if (!pdev->irq) {
 		result = pci_enable_msix(pdev, dev->entry, 1);
 		if (result < 0)
-			goto unmap;
+			goto disable;
 	}
 
 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1759,18 +1740,20 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	pci_save_state(pdev);
 	return 0;
 
- unmap:
-	iounmap(dev->bar);
-	dev->bar = NULL;
  disable:
-	pci_release_regions(pdev);
- disable_pci:
 	pci_disable_device(pdev);
 	return result;
 }
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
 {
+	if (dev->bar)
+		iounmap(dev->bar);
+	pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
+{
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pdev->msi_enabled)
@@ -1778,71 +1761,21 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
 	else if (pdev->msix_enabled)
 		pci_disable_msix(pdev);
 
-	if (dev->bar) {
-		iounmap(dev->bar);
-		dev->bar = NULL;
-		pci_release_regions(pdev);
-	}
-
 	if (pci_is_enabled(pdev)) {
 		pci_disable_pcie_error_reporting(pdev);
 		pci_disable_device(pdev);
 	}
 }
 
-static int nvme_dev_list_add(struct nvme_dev *dev)
-{
-	bool start_thread = false;
-
-	spin_lock(&dev_list_lock);
-	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
-		start_thread = true;
-		nvme_thread = NULL;
-	}
-	list_add(&dev->node, &dev_list);
-	spin_unlock(&dev_list_lock);
-
-	if (start_thread) {
-		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
-		wake_up_all(&nvme_kthread_wait);
-	} else
-		wait_event_killable(nvme_kthread_wait, nvme_thread);
-
-	if (IS_ERR_OR_NULL(nvme_thread))
-		return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
-
-	return 0;
-}
-
-/*
-* Remove the node from the device list and check
-* for whether or not we need to stop the nvme_thread.
-*/
-static void nvme_dev_list_remove(struct nvme_dev *dev)
-{
-	struct task_struct *tmp = NULL;
-
-	spin_lock(&dev_list_lock);
-	list_del_init(&dev->node);
-	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
-		tmp = nvme_thread;
-		nvme_thread = NULL;
-	}
-	spin_unlock(&dev_list_lock);
-
-	if (tmp)
-		kthread_stop(tmp);
-}
-
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
 	int i;
 	u32 csts = -1;
 
-	nvme_dev_list_remove(dev);
+	del_timer_sync(&dev->watchdog_timer);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (dev->bar) {
+	if (pci_is_enabled(to_pci_dev(dev->dev))) {
 		nvme_stop_queues(&dev->ctrl);
 		csts = readl(dev->bar + NVME_REG_CSTS);
 	}
@@ -1855,7 +1788,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
-	nvme_dev_unmap(dev);
+	nvme_pci_disable(dev);
 
 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_clear_queue(dev->queues[i]);
@@ -1899,10 +1832,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+
+	kref_get(&dev->ctrl.kref);
+	nvme_dev_disable(dev, false);
+	if (!schedule_work(&dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
 static void nvme_reset_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
-	int result;
+	int result = -ENODEV;
 
 	if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
 		goto out;
@@ -1911,44 +1854,43 @@ static void nvme_reset_work(struct work_struct *work)
 	 * If we're called to reset a live controller first shut it down before
 	 * moving on.
 	 */
-	if (dev->bar)
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
 	set_bit(NVME_CTRL_RESETTING, &dev->flags);
 
-	result = nvme_dev_map(dev);
+	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
 
 	result = nvme_configure_admin_queue(dev);
 	if (result)
-		goto unmap;
+		goto out;
 
 	nvme_init_queue(dev->queues[0], 0);
 	result = nvme_alloc_admin_tags(dev);
 	if (result)
-		goto disable;
+		goto out;
 
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
-		goto free_tags;
+		goto out;
 
 	result = nvme_setup_io_queues(dev);
 	if (result)
-		goto free_tags;
+		goto out;
 
 	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
+	queue_work(nvme_workq, &dev->async_work);
 
-	result = nvme_dev_list_add(dev);
-	if (result)
-		goto remove;
+	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
 
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
 	 */
 	if (dev->online_queues < 2) {
-		dev_warn(dev->dev, "IO queues not created\n");
+		dev_warn(dev->ctrl.device, "IO queues not created\n");
 		nvme_remove_namespaces(&dev->ctrl);
 	} else {
 		nvme_start_queues(&dev->ctrl);
@@ -1958,19 +1900,8 @@ static void nvme_reset_work(struct work_struct *work)
 	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
 	return;
 
- remove:
-	nvme_dev_list_remove(dev);
- free_tags:
-	nvme_dev_remove_admin(dev);
-	blk_put_queue(dev->ctrl.admin_q);
-	dev->ctrl.admin_q = NULL;
-	dev->queues[0]->tags = NULL;
- disable:
-	nvme_disable_admin_queue(dev, false);
- unmap:
-	nvme_dev_unmap(dev);
  out:
-	nvme_remove_dead_ctrl(dev);
+	nvme_remove_dead_ctrl(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1978,19 +1909,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
-	dev_warn(dev->dev, "Removing after probe failure\n");
-	kref_get(&dev->ctrl.kref);
-	if (!schedule_work(&dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2034,6 +1958,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
 }
 
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
+	.module			= THIS_MODULE,
 	.reg_read32		= nvme_pci_reg_read32,
 	.reg_write32		= nvme_pci_reg_write32,
 	.reg_read64		= nvme_pci_reg_read64,
@@ -2042,6 +1967,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.free_ctrl		= nvme_pci_free_ctrl,
 };
 
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+	int bars;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (!bars)
+		return -ENODEV;
+	if (pci_request_selected_regions(pdev, bars, "nvme"))
+		return -ENODEV;
+
+	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+	if (!dev->bar)
+		goto release;
+
+       return 0;
+  release:
+       pci_release_regions(pdev);
+       return -ENODEV;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2066,10 +2012,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);
 
-	INIT_LIST_HEAD(&dev->node);
+	result = nvme_dev_map(dev);
+	if (result)
+		goto free;
+
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	INIT_WORK(&dev->async_work, nvme_async_event_work);
+	setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
+		(unsigned long)dev);
 	mutex_init(&dev->shutdown_lock);
 	init_completion(&dev->ioq_wait);
 
@@ -2082,6 +2034,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+
 	queue_work(nvme_workq, &dev->reset_work);
 	return 0;
 
@@ -2089,6 +2043,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_release_prp_pools(dev);
  put_pci:
 	put_device(dev->dev);
+	nvme_dev_unmap(dev);
  free:
 	kfree(dev->queues);
 	kfree(dev->entry);
@@ -2112,11 +2067,20 @@ static void nvme_shutdown(struct pci_dev *pdev)
 	nvme_dev_disable(dev, true);
 }
 
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
 static void nvme_remove(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
 
+	del_timer_sync(&dev->watchdog_timer);
+
+	set_bit(NVME_CTRL_REMOVING, &dev->flags);
 	pci_set_drvdata(pdev, NULL);
+	flush_work(&dev->async_work);
 	flush_work(&dev->scan_work);
 	nvme_remove_namespaces(&dev->ctrl);
 	nvme_uninit_ctrl(&dev->ctrl);
@@ -2126,6 +2090,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_free_queues(dev, 0);
 	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
+	nvme_dev_unmap(dev);
 	nvme_put_ctrl(&dev->ctrl);
 }
 
@@ -2161,7 +2126,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
 	 * shutdown the controller to quiesce. The controller will be restarted
 	 * after the slot reset through driver's slot_reset callback.
 	 */
-	dev_warn(&pdev->dev, "error detected: state:%d\n", state);
+	dev_warn(dev->ctrl.device, "error detected: state:%d\n", state);
 	switch (state) {
 	case pci_channel_io_normal:
 		return PCI_ERS_RESULT_CAN_RECOVER;
@@ -2178,7 +2143,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
 
-	dev_info(&pdev->dev, "restart after slot reset\n");
+	dev_info(dev->ctrl.device, "restart after slot reset\n");
 	pci_restore_state(pdev);
 	queue_work(nvme_workq, &dev->reset_work);
 	return PCI_ERS_RESULT_RECOVERED;
@@ -2201,7 +2166,8 @@ static const struct pci_error_handlers nvme_err_handler = {
 
 static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_VDEVICE(INTEL, 0x0953),
-		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
+		.driver_data = NVME_QUIRK_STRIPE_SIZE |
+				NVME_QUIRK_DISCARD_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
@@ -2226,34 +2192,20 @@ static int __init nvme_init(void)
 {
 	int result;
 
-	init_waitqueue_head(&nvme_kthread_wait);
-
 	nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
 	if (!nvme_workq)
 		return -ENOMEM;
 
-	result = nvme_core_init();
-	if (result < 0)
-		goto kill_workq;
-
 	result = pci_register_driver(&nvme_driver);
 	if (result)
-		goto core_exit;
-	return 0;
-
- core_exit:
-	nvme_core_exit();
- kill_workq:
-	destroy_workqueue(nvme_workq);
+		destroy_workqueue(nvme_workq);
 	return result;
 }
 
 static void __exit nvme_exit(void)
 {
 	pci_unregister_driver(&nvme_driver);
-	nvme_core_exit();
 	destroy_workqueue(nvme_workq);
-	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
 	_nvme_check_size();
 }