From b7b3c01b191596d27a6980d1a42504f5b607f802 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Tue, 13 Oct 2020 16:50:34 -0700
Subject: [PATCH] mm/memremap_pages: support multiple ranges per invocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In support of device-dax growing the ability to front physically
dis-contiguous ranges of memory, update devm_memremap_pages() to track
multiple ranges with a single reference counter and devm instance.

Convert all [devm_]memremap_pages() users to specify the number of
ranges they are mapping in their 'struct dev_pagemap' instance.

Signed-off-by: Dan Williams
Signed-off-by: Andrew Morton
Cc: Paul Mackerras
Cc: Michael Ellerman
Cc: Benjamin Herrenschmidt
Cc: Vishal Verma
Cc: Vivek Goyal
Cc: Dave Jiang
Cc: Ben Skeggs
Cc: David Airlie
Cc: Daniel Vetter
Cc: Ira Weiny
Cc: Bjorn Helgaas
Cc: Boris Ostrovsky
Cc: Juergen Gross
Cc: Stefano Stabellini
Cc: "Jérôme Glisse"
Cc: Ard Biesheuvel
Cc: Ard Biesheuvel
Cc: Borislav Petkov
Cc: Brice Goglin
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David Hildenbrand
Cc: Greg Kroah-Hartman
Cc: "H. Peter Anvin"
Cc: Hulk Robot
Cc: Ingo Molnar
Cc: Jason Gunthorpe
Cc: Jason Yan
Cc: Jeff Moyer
Cc: "Jérôme Glisse"
Cc: Jia He
Cc: Joao Martins
Cc: Jonathan Cameron
Cc: kernel test robot
Cc: Mike Rapoport
Cc: Pavel Tatashin
Cc: Peter Zijlstra
Cc: "Rafael J. Wysocki"
Cc: Randy Dunlap
Cc: Thomas Gleixner
Cc: Tom Lendacky
Cc: Wei Yang
Cc: Will Deacon
Link: https://lkml.kernel.org/r/159643103789.4062302.18426128170217903785.stgit@dwillia2-desk3.amr.corp.intel.com
Link: https://lkml.kernel.org/r/160106116293.30709.13350662794915396198.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Linus Torvalds
---
 arch/powerpc/kvm/book3s_hv_uvmem.c     |   1 +
 drivers/dax/device.c                   |   1 +
 drivers/gpu/drm/nouveau/nouveau_dmem.c |   1 +
 drivers/nvdimm/pfn_devs.c              |   1 +
 drivers/nvdimm/pmem.c                  |   1 +
 drivers/pci/p2pdma.c                   |   1 +
 drivers/xen/unpopulated-alloc.c        |   1 +
 include/linux/memremap.h               |  10 +-
 lib/test_hmm.c                         |   1 +
 mm/memremap.c                          | 274 ++++++++++++++-----------
 10 files changed, 174 insertions(+), 118 deletions(-)
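For illustration only, not part of the patch: a minimal sketch of how a
hypothetical multi-range caller could use the new 'nr_range'/'ranges[]'
fields. The function name map_discontig_ranges(), its parameters, and the
devm_kzalloc()/struct_size() allocation are assumptions for the sketch;
only devm_memremap_pages() and the dev_pagemap fields come from the patch
itself.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/range.h>

/* Hypothetical caller sketch; not part of this patch. */
static void *map_discontig_ranges(struct device *dev,
				  const struct range *res, int nr)
{
	struct dev_pagemap *pgmap;
	int i;

	/* ranges[0] overlays the embedded 'range', so reserve nr - 1 extra slots */
	pgmap = devm_kzalloc(dev, struct_size(pgmap, ranges, nr - 1),
			     GFP_KERNEL);
	if (!pgmap)
		return ERR_PTR(-ENOMEM);

	pgmap->type = MEMORY_DEVICE_GENERIC;
	pgmap->nr_range = nr;	/* must now be non-zero before mapping */
	for (i = 0; i < nr; i++)
		pgmap->ranges[i] = res[i];

	/* one reference count and one devm instance cover all ranges */
	return devm_memremap_pages(dev, pgmap);
}

On failure, memremap_pages() has already unwound whatever ranges were
mapped before the error and an ERR_PTR() comes back; on success the
returned address corresponds to ranges[0].start.
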
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 29ec555055c2..84e5a2dc8be5 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1172,6 +1172,7 @@ int kvmppc_uvmem_init(void)
 	kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
 	kvmppc_uvmem_pgmap.range.start = res->start;
 	kvmppc_uvmem_pgmap.range.end = res->end;
+	kvmppc_uvmem_pgmap.nr_range = 1;
 	kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
 	/* just one global instance: */
 	kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index a14448bca83d..5f808617672a 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -417,6 +417,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 		if (!pgmap)
 			return -ENOMEM;
 		pgmap->range = *range;
+		pgmap->nr_range = 1;
 	}
 	pgmap->type = MEMORY_DEVICE_GENERIC;
 	addr = devm_memremap_pages(dev, pgmap);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 25811ed7e274..a13c6215bba8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -251,6 +251,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
 	chunk->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	chunk->pagemap.range.start = res->start;
 	chunk->pagemap.range.end = res->end;
+	chunk->pagemap.nr_range = 1;
 	chunk->pagemap.ops = &nouveau_dmem_pagemap_ops;
 	chunk->pagemap.owner = drm->dev;
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3c4787b92a6a..b499df630d4d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -693,6 +693,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		.start = nsio->res.start + start_pad,
 		.end = nsio->res.end - end_trunc,
 	};
+	pgmap->nr_range = 1;
 	if (nd_pfn->mode == PFN_MODE_RAM) {
 		if (offset < reserve)
 			return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1f394f44838f..875076b0ea6c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -441,6 +441,7 @@ static int pmem_attach_disk(struct device *dev,
 	} else if (pmem_should_map_pages(dev)) {
 		pmem->pgmap.range.start = res->start;
 		pmem->pgmap.range.end = res->end;
+		pmem->pgmap.nr_range = 1;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 256850513813..9d53c16b7329 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -187,6 +187,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	pgmap = &p2p_pgmap->pgmap;
 	pgmap->range.start = pci_resource_start(pdev, bar) + offset;
 	pgmap->range.end = pgmap->range.start + size - 1;
+	pgmap->nr_range = 1;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 
 	p2p_pgmap->provider = pdev;
diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c
index 091b8669eca3..8c512ea550bb 100644
--- a/drivers/xen/unpopulated-alloc.c
+++ b/drivers/xen/unpopulated-alloc.c
@@ -47,6 +47,7 @@ static int fill_list(unsigned int nr_pages)
 		.start = res->start,
 		.end = res->end,
 	};
+	pgmap->nr_range = 1;
 	pgmap->owner = res;
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index d0dd261d87c0..79c49e7f5c30 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -94,7 +94,6 @@ struct dev_pagemap_ops {
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @range: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @internal_ref: internal reference if @ref is not provided by the caller
  * @done: completion for @internal_ref
@@ -104,10 +103,12 @@ struct dev_pagemap_ops {
  * @owner: an opaque pointer identifying the entity that manages this
  *	instance. Used by various helpers to make sure that no
  *	foreign ZONE_DEVICE memory is accessed.
+ * @nr_range: number of ranges to be mapped
+ * @range: range to be mapped when nr_range == 1
+ * @ranges: array of ranges to be mapped when nr_range > 1
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
-	struct range range;
 	struct percpu_ref *ref;
 	struct percpu_ref internal_ref;
 	struct completion done;
@@ -115,6 +116,11 @@ struct dev_pagemap {
 	unsigned int flags;
 	const struct dev_pagemap_ops *ops;
 	void *owner;
+	int nr_range;
+	union {
+		struct range range;
+		struct range ranges[0];
+	};
 };
 
 static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e97ca8ec0bce..c710b4c5714d 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -472,6 +472,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.range.start = res->start;
 	devmem->pagemap.range.end = res->end;
+	devmem->pagemap.nr_range = 1;
 	devmem->pagemap.ops = &dmirror_devmem_ops;
 	devmem->pagemap.owner = mdevice;
 
diff --git a/mm/memremap.c b/mm/memremap.c
index d958d348b3ca..532ec3d36ab4 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -77,15 +77,19 @@ static void pgmap_array_delete(struct range *range)
 	synchronize_rcu();
 }
 
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
 {
-	return PHYS_PFN(pgmap->range.start) +
-		vmem_altmap_offset(pgmap_altmap(pgmap));
+	struct range *range = &pgmap->ranges[range_id];
+	unsigned long pfn = PHYS_PFN(range->start);
+
+	if (range_id)
+		return pfn;
+	return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
 {
-	const struct range *range = &pgmap->range;
+	const struct range *range = &pgmap->ranges[range_id];
 
 	return (range->start + range_len(range)) >> PAGE_SHIFT;
 }
@@ -97,8 +101,8 @@ static unsigned long pfn_next(unsigned long pfn)
 	return pfn + 1;
 }
 
-#define for_each_device_pfn(pfn, map) \
-	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+#define for_each_device_pfn(pfn, map, i) \
+	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
 
 static void dev_pagemap_kill(struct dev_pagemap *pgmap)
 {
@@ -124,20 +128,14 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
 	pgmap->ref = NULL;
 }
 
-void memunmap_pages(struct dev_pagemap *pgmap)
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
 {
-	struct range *range = &pgmap->range;
+	struct range *range = &pgmap->ranges[range_id];
 	struct page *first_page;
-	unsigned long pfn;
 	int nid;
 
-	dev_pagemap_kill(pgmap);
-	for_each_device_pfn(pfn, pgmap)
-		put_page(pfn_to_page(pfn));
-	dev_pagemap_cleanup(pgmap);
-
 	/* make sure to access a memmap that was actually initialized */
-	first_page = pfn_to_page(pfn_first(pgmap));
+	first_page = pfn_to_page(pfn_first(pgmap, range_id));
 
 	/* pages are dead and unused, undo the arch mapping */
 	nid = page_to_nid(first_page);
@@ -157,6 +155,22 @@ void memunmap_pages(struct dev_pagemap *pgmap)
 
 	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
 	pgmap_array_delete(range);
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+	unsigned long pfn;
+	int i;
+
+	dev_pagemap_kill(pgmap);
+	for (i = 0; i < pgmap->nr_range; i++)
+		for_each_device_pfn(pfn, pgmap, i)
+			put_page(pfn_to_page(pfn));
+	dev_pagemap_cleanup(pgmap);
+
+	for (i = 0; i < pgmap->nr_range; i++)
+		pageunmap_range(pgmap, i);
+
 	WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
 	devmap_managed_enable_put();
 }
@@ -175,6 +189,114 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
 	complete(&pgmap->done);
 }
 
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+		int range_id, int nid)
+{
+	struct range *range = &pgmap->ranges[range_id];
+	struct dev_pagemap *conflict_pgmap;
+	int error, is_ram;
+
+	if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+				"altmap not supported for multiple ranges\n"))
+		return -EINVAL;
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+	if (conflict_pgmap) {
+		WARN(1, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return -ENOMEM;
+	}
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+	if (conflict_pgmap) {
+		WARN(1, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return -ENOMEM;
+	}
+
+	is_ram = region_intersects(range->start, range_len(range),
+		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+	if (is_ram != REGION_DISJOINT) {
+		WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+				is_ram == REGION_MIXED ? "mixed" : "ram",
+				range->start, range->end);
+		return -ENXIO;
+	}
+
+	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+				PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+	if (error)
+		return error;
+
+	if (nid < 0)
+		nid = numa_mem_id();
+
+	error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+			range_len(range));
+	if (error)
+		goto err_pfn_remap;
+
+	mem_hotplug_begin();
+
+	/*
+	 * For device private memory we call add_pages() as we only need to
+	 * allocate and initialize struct page for the device memory. More-
+	 * over the device memory is un-accessible thus we do not want to
+	 * create a linear mapping for the memory like arch_add_memory()
+	 * would do.
+	 *
+	 * For all other device memory types, which are accessible by
+	 * the CPU, we do want the linear mapping and thus use
+	 * arch_add_memory().
+	 */
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		error = add_pages(nid, PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), params);
+	} else {
+		error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+		if (error) {
+			mem_hotplug_done();
+			goto err_kasan;
+		}
+
+		error = arch_add_memory(nid, range->start, range_len(range),
+				params);
+	}
+
+	if (!error) {
+		struct zone *zone;
+
+		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+		move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), params->altmap);
+	}
+
+	mem_hotplug_done();
+	if (error)
+		goto err_add_memory;
+
+	/*
+	 * Initialization of the pages has been deferred until now in order
+	 * to allow us to do the work while not holding the hotplug lock.
+	 */
+	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+				PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), pgmap);
+	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+			- pfn_first(pgmap, range_id));
+	return 0;
+
+err_add_memory:
+	kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+	pgmap_array_delete(range);
+	return error;
+}
+
+
 /*
  * Not device managed version of dev_memremap_pages, undone by
  * memunmap_pages().  Please use dev_memremap_pages if you have a struct
@@ -182,17 +304,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
  */
 void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 {
-	struct range *range = &pgmap->range;
-	struct dev_pagemap *conflict_pgmap;
 	struct mhp_params params = {
-		/*
-		 * We do not want any optional features only our own memmap
-		 */
 		.altmap = pgmap_altmap(pgmap),
 		.pgprot = PAGE_KERNEL,
 	};
-	int error, is_ram;
+	const int nr_range = pgmap->nr_range;
 	bool need_devmap_managed = true;
+	int error, i;
+
+	if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+		return ERR_PTR(-EINVAL);
 
 	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
@@ -251,106 +372,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			return ERR_PTR(error);
 	}
 
-	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
-	if (conflict_pgmap) {
-		WARN(1, "Conflicting mapping in same section\n");
-		put_dev_pagemap(conflict_pgmap);
-		error = -ENOMEM;
-		goto err_array;
-	}
-
-	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
-	if (conflict_pgmap) {
-		WARN(1, "Conflicting mapping in same section\n");
-		put_dev_pagemap(conflict_pgmap);
-		error = -ENOMEM;
-		goto err_array;
-	}
-
-	is_ram = region_intersects(range->start, range_len(range),
-		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
-	if (is_ram != REGION_DISJOINT) {
-		WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
-				is_ram == REGION_MIXED ? "mixed" : "ram",
-				range->start, range->end);
-		error = -ENXIO;
-		goto err_array;
-	}
-
-	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
-				PHYS_PFN(range->end), pgmap, GFP_KERNEL));
-	if (error)
-		goto err_array;
-
-	if (nid < 0)
-		nid = numa_mem_id();
-
-	error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(range->start), 0,
-			range_len(range));
-	if (error)
-		goto err_pfn_remap;
-
-	mem_hotplug_begin();
-
 	/*
-	 * For device private memory we call add_pages() as we only need to
-	 * allocate and initialize struct page for the device memory. More-
-	 * over the device memory is un-accessible thus we do not want to
-	 * create a linear mapping for the memory like arch_add_memory()
-	 * would do.
-	 *
-	 * For all other device memory types, which are accessible by
-	 * the CPU, we do want the linear mapping and thus use
-	 * arch_add_memory().
+	 * Clear the pgmap nr_range as it will be incremented for each
+	 * successfully processed range. This communicates how many
+	 * regions to unwind in the abort case.
 	 */
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
-		error = add_pages(nid, PHYS_PFN(range->start),
-				PHYS_PFN(range_len(range)), &params);
-	} else {
-		error = kasan_add_zero_shadow(__va(range->start), range_len(range));
-		if (error) {
-			mem_hotplug_done();
-			goto err_kasan;
-		}
-
-		error = arch_add_memory(nid, range->start, range_len(range),
-				&params);
+	pgmap->nr_range = 0;
+	error = 0;
+	for (i = 0; i < nr_range; i++) {
+		error = pagemap_range(pgmap, &params, i, nid);
+		if (error)
+			break;
+		pgmap->nr_range++;
 	}
 
-	if (!error) {
-		struct zone *zone;
-
-		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
-		move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
-				PHYS_PFN(range_len(range)), params.altmap);
+	if (i < nr_range) {
+		memunmap_pages(pgmap);
+		pgmap->nr_range = nr_range;
+		return ERR_PTR(error);
 	}
 
-	mem_hotplug_done();
-	if (error)
-		goto err_add_memory;
-
-	/*
-	 * Initialization of the pages has been deferred until now in order
-	 * to allow us to do the work while not holding the hotplug lock.
-	 */
-	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
-			PHYS_PFN(range->start),
-			PHYS_PFN(range_len(range)), pgmap);
-	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
-	return __va(range->start);
-
- err_add_memory:
-	kasan_remove_zero_shadow(__va(range->start), range_len(range));
- err_kasan:
-	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
- err_pfn_remap:
-	pgmap_array_delete(range);
- err_array:
-	dev_pagemap_kill(pgmap);
-	dev_pagemap_cleanup(pgmap);
-	devmap_managed_enable_put();
-	return ERR_PTR(error);
+	return __va(pgmap->ranges[0].start);
 }
 EXPORT_SYMBOL_GPL(memremap_pages);
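
A note on the new layout, illustrative rather than part of the patch:
because 'range' and 'ranges[0]' share storage in the union added to
include/linux/memremap.h, existing single-range users that keep assigning
pgmap->range continue to work while mm/memremap.c now indexes
pgmap->ranges[range_id]. A minimal sketch of that invariant, assuming
static_assert() from <linux/build_bug.h>:

#include <linux/build_bug.h>
#include <linux/memremap.h>
#include <linux/stddef.h>

/* 'range' and 'ranges[0]' are two names for the same storage. */
static_assert(offsetof(struct dev_pagemap, range) ==
	      offsetof(struct dev_pagemap, ranges));

This is why the driver conversions above only need to add 'nr_range = 1'
next to their existing range assignments, and why callers with more than
one range must allocate the trailing ranges[] storage themselves; the
altmap remains limited to a single range, as the WARN_ONCE() in
pagemap_range() enforces.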