Skip to content

Commit 09a3c1e

Browse files
Gaurav Batrampe
Gaurav Batra
authored and committed
powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV
When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to NULL pointer exception: Kernel attempted to read user page (0) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000000 Faulting instruction address: 0xc000000020847ad4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c REGS: c000000029162ca0 TRAP: 0300 Not tainted (6.4.0-Test102+) MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 48288244 XER: 00000008 CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ... NIP _find_next_zero_bit+0x24/0x110 LR bitmap_find_next_zero_area_off+0x5c/0xe0 Call Trace: dev_printk_emit+0x38/0x48 (unreliable) iommu_area_alloc+0xc4/0x180 iommu_range_alloc+0x1e8/0x580 iommu_alloc+0x60/0x130 iommu_alloc_coherent+0x158/0x2b0 dma_iommu_alloc_coherent+0x3c/0x50 dma_alloc_attrs+0x170/0x1f0 mlx5_cmd_init+0xc0/0x760 [mlx5_core] mlx5_function_setup+0xf0/0x510 [mlx5_core] mlx5_init_one+0x84/0x210 [mlx5_core] probe_one+0x118/0x2c0 [mlx5_core] local_pci_probe+0x68/0x110 pci_call_probe+0x68/0x200 pci_device_probe+0xbc/0x1a0 really_probe+0x104/0x540 __driver_probe_device+0xb4/0x230 driver_probe_device+0x54/0x130 __driver_attach+0x158/0x2b0 bus_for_each_dev+0xa8/0x130 driver_attach+0x34/0x50 bus_add_driver+0x16c/0x300 driver_register+0xa4/0x1b0 __pci_register_driver+0x68/0x80 mlx5_init+0xb8/0x100 [mlx5_core] do_one_initcall+0x60/0x300 do_init_module+0x7c/0x2b0 At the time of LPAR dump, before kexec hands over control to kdump kernel, DDWs (Dynamic DMA Windows) are scanned and added to the FDT. 
For the SR-IOV case, default DMA window "ibm,dma-window" is removed from the FDT and DDW added, for the device. Now, kexec hands over control to the kdump kernel. When the kdump kernel initializes, PCI busses are scanned and IOMMU group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV case, there is no "ibm,dma-window". The original commit: b1fc44e, fixes the path where memory is pre-mapped (direct mapped) to the DDW. When TCEs are direct mapped, there is no need to initialize IOMMU tables. iommu_table_setparms_lpar() only considers "ibm,dma-window" property when initializing IOMMU table. In the scenario where TCEs are dynamically allocated for SR-IOV, newly created IOMMU table is not initialized. Later, when the device driver tries to enter TCEs for the SR-IOV device, NULL pointer exception is thrown from iommu_area_alloc(). The fix is to initialize the IOMMU table with DDW property stored in the FDT. There are 2 points to remember: 1. For the dedicated adapter, kdump kernel would encounter both default and DDW in FDT. In this case, DDW property is used to initialize the IOMMU table. 2. A DDW could be direct or dynamic mapped. kdump kernel would initialize IOMMU table and mark the existing DDW as "dynamic". This works fine since, at the time of table initialization, iommu_table_clear() makes some space in the DDW, for some predefined number of TCEs which are needed for kdump to succeed. Fixes: b1fc44e ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window") Signed-off-by: Gaurav Batra <[email protected]> Reviewed-by: Brian King <[email protected]> Signed-off-by: Michael Ellerman <[email protected]> Link: https://msgid.link/[email protected]
1 parent 20c8c4d commit 09a3c1e

File tree

1 file changed

+105
-51
lines changed
  • arch/powerpc/platforms/pseries

1 file changed

+105
-51
lines changed

arch/powerpc/platforms/pseries/iommu.c

+105-51
Original file line numberDiff line numberDiff line change
@@ -574,29 +574,6 @@ static void iommu_table_setparms(struct pci_controller *phb,
574574

575575
struct iommu_table_ops iommu_table_lpar_multi_ops;
576576

577-
/*
578-
* iommu_table_setparms_lpar
579-
*
580-
* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
581-
*/
582-
static void iommu_table_setparms_lpar(struct pci_controller *phb,
583-
struct device_node *dn,
584-
struct iommu_table *tbl,
585-
struct iommu_table_group *table_group,
586-
const __be32 *dma_window)
587-
{
588-
unsigned long offset, size, liobn;
589-
590-
of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
591-
592-
iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
593-
&iommu_table_lpar_multi_ops);
594-
595-
596-
table_group->tce32_start = offset;
597-
table_group->tce32_size = size;
598-
}
599-
600577
struct iommu_table_ops iommu_table_pseries_ops = {
601578
.set = tce_build_pSeries,
602579
.clear = tce_free_pSeries,
@@ -724,44 +701,92 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
724701
* dynamic 64bit DMA window, walking up the device tree.
725702
*/
726703
static struct device_node *pci_dma_find(struct device_node *dn,
727-
const __be32 **dma_window)
704+
struct dynamic_dma_window_prop *prop)
728705
{
729-
const __be32 *dw = NULL;
706+
const __be32 *default_prop = NULL;
707+
const __be32 *ddw_prop = NULL;
708+
struct device_node *rdn = NULL;
709+
bool default_win = false, ddw_win = false;
730710

731711
for ( ; dn && PCI_DN(dn); dn = dn->parent) {
732-
dw = of_get_property(dn, "ibm,dma-window", NULL);
733-
if (dw) {
734-
if (dma_window)
735-
*dma_window = dw;
736-
return dn;
712+
default_prop = of_get_property(dn, "ibm,dma-window", NULL);
713+
if (default_prop) {
714+
rdn = dn;
715+
default_win = true;
716+
}
717+
ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
718+
if (ddw_prop) {
719+
rdn = dn;
720+
ddw_win = true;
721+
break;
722+
}
723+
ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
724+
if (ddw_prop) {
725+
rdn = dn;
726+
ddw_win = true;
727+
break;
737728
}
738-
dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
739-
if (dw)
740-
return dn;
741-
dw = of_get_property(dn, DMA64_PROPNAME, NULL);
742-
if (dw)
743-
return dn;
729+
730+
/* At least found default window, which is the case for normal boot */
731+
if (default_win)
732+
break;
744733
}
745734

746-
return NULL;
735+
/* For PCI devices there will always be a DMA window, either on the device
736+
* or parent bus
737+
*/
738+
WARN_ON(!(default_win | ddw_win));
739+
740+
/* caller doesn't want to get DMA window property */
741+
if (!prop)
742+
return rdn;
743+
744+
/* parse DMA window property. During normal system boot, only default
745+
* DMA window is passed in OF. But, for kdump, a dedicated adapter might
746+
* have both default and DDW in FDT. In this scenario, DDW takes precedence
747+
* over default window.
748+
*/
749+
if (ddw_win) {
750+
struct dynamic_dma_window_prop *p;
751+
752+
p = (struct dynamic_dma_window_prop *)ddw_prop;
753+
prop->liobn = p->liobn;
754+
prop->dma_base = p->dma_base;
755+
prop->tce_shift = p->tce_shift;
756+
prop->window_shift = p->window_shift;
757+
} else if (default_win) {
758+
unsigned long offset, size, liobn;
759+
760+
of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
761+
762+
prop->liobn = cpu_to_be32((u32)liobn);
763+
prop->dma_base = cpu_to_be64(offset);
764+
prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);
765+
prop->window_shift = cpu_to_be32(order_base_2(size));
766+
}
767+
768+
return rdn;
747769
}
748770

749771
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
750772
{
751773
struct iommu_table *tbl;
752774
struct device_node *dn, *pdn;
753775
struct pci_dn *ppci;
754-
const __be32 *dma_window = NULL;
776+
struct dynamic_dma_window_prop prop;
755777

756778
dn = pci_bus_to_OF_node(bus);
757779

758780
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
759781
dn);
760782

761-
pdn = pci_dma_find(dn, &dma_window);
783+
pdn = pci_dma_find(dn, &prop);
762784

763-
if (dma_window == NULL)
764-
pr_debug(" no ibm,dma-window property !\n");
785+
/* In PPC architecture, there will always be DMA window on bus or one of the
786+
* parent bus. During reboot, there will be ibm,dma-window property to
787+
* define DMA window. For kdump, there will at least be default window or DDW
788+
* or both.
789+
*/
765790

766791
ppci = PCI_DN(pdn);
767792

@@ -771,13 +796,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
771796
if (!ppci->table_group) {
772797
ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
773798
tbl = ppci->table_group->tables[0];
774-
if (dma_window) {
775-
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
776-
ppci->table_group, dma_window);
777799

778-
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
779-
panic("Failed to initialize iommu table");
780-
}
800+
iommu_table_setparms_common(tbl, ppci->phb->bus->number,
801+
be32_to_cpu(prop.liobn),
802+
be64_to_cpu(prop.dma_base),
803+
1ULL << be32_to_cpu(prop.window_shift),
804+
be32_to_cpu(prop.tce_shift), NULL,
805+
&iommu_table_lpar_multi_ops);
806+
807+
/* Only for normal boot with default window. Doesn't matter even
808+
* if we set these with DDW which is 64bit during kdump, since
809+
* these will not be used during kdump.
810+
*/
811+
ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
812+
ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
813+
814+
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
815+
panic("Failed to initialize iommu table");
816+
781817
iommu_register_group(ppci->table_group,
782818
pci_domain_nr(bus), 0);
783819
pr_debug(" created table: %p\n", ppci->table_group);
@@ -968,6 +1004,12 @@ static void find_existing_ddw_windows_named(const char *name)
9681004
continue;
9691005
}
9701006

1007+
/* If at the time of system initialization, there are DDWs in OF,
1008+
* it means this is during kexec. DDW could be direct or dynamic.
1009+
* We will just mark DDWs as "dynamic" since this is kdump path,
1010+
* no need to worry about performance. ddw_list_new_entry() will
1011+
* set window->direct = false.
1012+
*/
9711013
window = ddw_list_new_entry(pdn, dma64);
9721014
if (!window) {
9731015
of_node_put(pdn);
@@ -1524,8 +1566,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15241566
{
15251567
struct device_node *pdn, *dn;
15261568
struct iommu_table *tbl;
1527-
const __be32 *dma_window = NULL;
15281569
struct pci_dn *pci;
1570+
struct dynamic_dma_window_prop prop;
15291571

15301572
pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
15311573

@@ -1538,7 +1580,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15381580
dn = pci_device_to_OF_node(dev);
15391581
pr_debug(" node is %pOF\n", dn);
15401582

1541-
pdn = pci_dma_find(dn, &dma_window);
1583+
pdn = pci_dma_find(dn, &prop);
15421584
if (!pdn || !PCI_DN(pdn)) {
15431585
printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
15441586
"no DMA window found for pci dev=%s dn=%pOF\n",
@@ -1551,8 +1593,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15511593
if (!pci->table_group) {
15521594
pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
15531595
tbl = pci->table_group->tables[0];
1554-
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1555-
pci->table_group, dma_window);
1596+
1597+
iommu_table_setparms_common(tbl, pci->phb->bus->number,
1598+
be32_to_cpu(prop.liobn),
1599+
be64_to_cpu(prop.dma_base),
1600+
1ULL << be32_to_cpu(prop.window_shift),
1601+
be32_to_cpu(prop.tce_shift), NULL,
1602+
&iommu_table_lpar_multi_ops);
1603+
1604+
/* Only for normal boot with default window. Doesn't matter even
1605+
* if we set these with DDW which is 64bit during kdump, since
1606+
* these will not be used during kdump.
1607+
*/
1608+
pci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
1609+
pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
15561610

15571611
iommu_init_table(tbl, pci->phb->node, 0, 0);
15581612
iommu_register_group(pci->table_group,

0 commit comments

Comments
 (0)