From 259bb458cb1a142ba0af7a6a0cd2b64e2174b7a8 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 21 May 2019 16:46:23 -0700 Subject: [PATCH] RH7: PCI: hv: Fix the affinity setting for the NVMe crash In the case of cpumask_equal(mask, cpu_online_mask) == false, "mask" may be a superset of "cfg->domain", and the real affinity is still saved in "cfg->domain", after __ioapic_set_affinity() returns. See the line "cpumask_copy(cfg->domain, tmp_mask);" in RHEL 7.x's kernel function __assign_irq_vector(). So we should always use "cfg->domain", otherwise the NVMe driver may fail to receive the expected interrupt, and later the buggy error handling code in nvme_dev_disable() can cause the below panic: [ 71.695565] nvme nvme7: I/O 19 QID 0 timeout, disable controller [ 71.724221] ------------[ cut here ]------------ [ 71.725067] WARNING: CPU: 4 PID: 11317 at kernel/irq/manage.c:1348 __free_irq+0xb3/0x280 [ 71.725067] Trying to free already-free IRQ 226 [ 71.725067] Modules linked in: ... [ 71.725067] CPU: 4 PID: 11317 Comm: kworker/4:1H Tainted: G OE ------------ T 3.10.0-957.10.1.el7.x86_64 #1 [ 71.725067] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090007 05/18/2018 [ 71.725067] Workqueue: kblockd blk_mq_timeout_work [ 71.725067] Call Trace: [ 71.725067] [] dump_stack+0x19/0x1b [ 71.725067] [] __warn+0xd8/0x100 [ 71.725067] [] warn_slowpath_fmt+0x5f/0x80 [ 71.725067] [] __free_irq+0xb3/0x280 [ 71.725067] [] free_irq+0x39/0x90 [ 71.725067] [] nvme_dev_disable+0x11c/0x4b0 [nvme] [ 71.725067] [] ? dev_warn+0x6c/0x90 [ 71.725067] [] nvme_timeout+0x204/0x2d0 [nvme] [ 71.725067] [] ? blk_mq_do_dispatch_sched+0x9d/0x130 [ 71.725067] [] ? update_curr+0x14c/0x1e0 [ 71.725067] [] blk_mq_rq_timed_out+0x32/0x80 [ 71.725067] [] blk_mq_check_expired+0x5c/0x60 [ 71.725067] [] bt_iter+0x54/0x60 [ 71.725067] [] blk_mq_queue_tag_busy_iter+0x11b/0x290 [ 71.725067] [] ? blk_mq_rq_timed_out+0x80/0x80 [ 71.725067] [] ? 
blk_mq_rq_timed_out+0x80/0x80 [ 71.725067] [] blk_mq_timeout_work+0x8b/0x180 [ 71.725067] [] process_one_work+0x17f/0x440 [ 71.725067] [] worker_thread+0x126/0x3c0 [ 71.725067] [] ? manage_workers.isra.25+0x2a0/0x2a0 [ 71.725067] [] kthread+0xd1/0xe0 [ 71.725067] [] ? insert_kthread_work+0x40/0x40 [ 71.725067] [] ret_from_fork_nospec_begin+0xe/0x21 [ 71.725067] [] ? insert_kthread_work+0x40/0x40 [ 71.725067] ---[ end trace b3257623bc50d02a ]--- [ 72.196556] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048 [ 72.211013] IP: [] free_irq+0x39/0x90 It looks like the bug is more easily triggered when the VM has a lot of vCPUs, e.g. L64v2 or L80v2 VM sizes. Presumably, in such a VM, the NVMe driver can pass a "mask" which has multiple bits of 1, but is not equal to "cpu_online_mask". Previously we incorrectly assumed the "mask" either contains only 1 bit of "1" or equals to "cpu_online_mask". Fixes: 9c8bbaee92bf ("RH7: PCI: hv: respect the affinity setting") Signed-off-by: Dexuan Cui --- hv-rhel7.x/hv/pci-hyperv.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hv-rhel7.x/hv/pci-hyperv.c b/hv-rhel7.x/hv/pci-hyperv.c index 8ca591d4d..67e253738 100644 --- a/hv-rhel7.x/hv/pci-hyperv.c +++ b/hv-rhel7.x/hv/pci-hyperv.c @@ -811,12 +811,11 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) } /* Interrupt management hooks */ -static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask, +static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { struct msi_desc *msi_desc = data->msi_desc; struct irq_cfg *cfg = irqd_cfg(data); - const struct cpumask *dest; struct retarget_msi_interrupt *params; struct hv_pcibus_device *hbus; struct pci_bus *pbus; @@ -827,10 +826,6 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask, u64 res; u32 var_size = 0; - if (cpumask_equal(mask, cpu_online_mask)) - dest = cfg->domain; - else - dest = mask; ret = 
__ioapic_set_affinity(data, dest, &dest_id); if (ret) return ret; @@ -880,7 +875,7 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask, */ var_size = 1 + HV_VP_SET_BANK_COUNT_MAX; - for_each_cpu_and(cpu, dest, cpu_online_mask) { + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) { cpu_vmbus = hv_cpu_number_to_vp_number(cpu); if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) { @@ -894,7 +889,7 @@ static int hv_set_affinity(struct irq_data *data, const struct cpumask *mask, (1ULL << (cpu_vmbus & 63)); } } else { - for_each_cpu_and(cpu, dest, cpu_online_mask) { + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) { params->int_target.vp_mask |= (1ULL << hv_cpu_number_to_vp_number(cpu)); }