[PATCH v2] ndctl: Add support for get bus and region persistent domain
by Dave Jiang
Adding helper functions to iterate through sysfs region persistence domain
attribute. The region will display the domain with the most persistence for the
region. The bus will display the domain attribute with the least persistence
amongst all the regions. ndctl_bus_get_persistence_domain() and
ndctl_region_get_persistence_domain() are exported. ndctl list will also display
the region persistence domain as well.
Signed-off-by: Dave Jiang <dave.jiang(a)intel.com>
---
v2:
- Simplified scanning of persistence domain from Ross's comments.
ndctl/lib/libndctl.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++
ndctl/lib/libndctl.sym | 2 +
ndctl/lib/private.h | 1 +
ndctl/libndctl.h | 10 ++++++
ndctl/list.c | 16 +++++++++
5 files changed, 116 insertions(+)
diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index a165e697..8f4e1745 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -180,6 +180,7 @@ struct ndctl_region {
} iset;
FILE *badblocks;
struct badblock bb;
+ enum ndctl_persistence persistence_domain;
};
/**
@@ -755,6 +756,7 @@ static void *add_bus(void *parent, int id, const char *ctl_base)
list_head_init(&bus->regions);
bus->ctx = ctx;
bus->id = id;
+ bus->persistence_domain = PERSISTENCE_UNKNOWN;
sprintf(path, "%s/dev", ctl_base);
if (sysfs_read_attr(ctx, path, buf) < 0
@@ -916,6 +918,17 @@ NDCTL_EXPORT struct ndctl_bus *ndctl_bus_get_by_provider(struct ndctl_ctx *ctx,
return NULL;
}
+NDCTL_EXPORT unsigned int
+ndctl_bus_get_persistence_domain(struct ndctl_bus *bus)
+{
+ struct ndctl_region *region;
+
+ /* iterate through region to get the region persistence domain */
+ ndctl_region_foreach(bus, region) {}
+
+ return bus->persistence_domain;
+}
+
NDCTL_EXPORT struct ndctl_btt *ndctl_region_get_btt_seed(struct ndctl_region *region)
{
struct ndctl_ctx *ctx = ndctl_region_get_ctx(region);
@@ -1755,6 +1768,62 @@ static int region_set_type(struct ndctl_region *region, char *path)
return 0;
}
+static enum ndctl_persistence region_get_pd_type(char *name)
+{
+ if (strncmp("cpu_cache", name, 9) == 0)
+ return PERSISTENCE_CPU_CACHE;
+ else if (strncmp("memory_controller", name, 17) == 0)
+ return PERSISTENCE_MEM_CTRL;
+ else
+ return PERSISTENCE_UNKNOWN;
+}
+
+static int region_persistence_scan(struct ndctl_region *region)
+{
+ struct ndctl_ctx *ctx = ndctl_region_get_ctx(region);
+ char *pd_path;
+ FILE *pf;
+ char buf[64];
+ int rc = 0;
+ enum ndctl_persistence pd = PERSISTENCE_NONE;
+
+ if (asprintf(&pd_path, "%s/persistence_domain",
+ region->region_path) < 0) {
+ rc = -errno;
+ err(ctx, "region persist domain path allocation failure\n");
+ return rc;
+ }
+
+ pf = fopen(pd_path, "re");
+ if (!pf) {
+ rc = -errno;
+ free(pd_path);
+ return rc;
+ }
+
+ region->persistence_domain = PERSISTENCE_NONE;
+ do {
+ rc = fscanf(pf, "%s", buf);
+ if (rc == EOF) {
+ if (ferror(pf)) {
+ rc = -errno;
+ goto out;
+ }
+ } else if (rc == 1)
+ pd = region_get_pd_type(buf);
+
+ if (region->persistence_domain < pd)
+ region->persistence_domain = pd;
+ } while (rc != EOF);
+
+ rc = 0;
+
+out:
+ fclose(pf);
+ free(pd_path);
+ return rc;
+}
+
static void *add_region(void *parent, int id, const char *region_base)
{
char buf[SYSFS_ATTR_SIZE];
@@ -1762,6 +1831,7 @@ static void *add_region(void *parent, int id, const char *region_base)
struct ndctl_bus *bus = parent;
struct ndctl_ctx *ctx = bus->ctx;
char *path = calloc(1, strlen(region_base) + 100);
+ int rc;
if (!path)
return NULL;
@@ -1831,6 +1901,17 @@ static void *add_region(void *parent, int id, const char *region_base)
list_add(&bus->regions, ®ion->list);
free(path);
+
+ /* get the persistence domain attribs */
+ rc = region_persistence_scan(region);
+ if (rc < 0)
+ err(ctx, "%s: region persistence scan failed\n",
+ ndctl_region_get_devname(region));
+
+ /* we are looking for the least persistence domain */
+ if (region->bus->persistence_domain > region->persistence_domain)
+ region->bus->persistence_domain = region->persistence_domain;
+
return region;
err_read:
@@ -2093,6 +2174,12 @@ NDCTL_EXPORT struct badblock *ndctl_region_get_first_badblock(struct ndctl_regio
return ndctl_region_get_next_badblock(region);
}
+NDCTL_EXPORT unsigned int
+ndctl_region_get_persistence_domain(struct ndctl_region *region)
+{
+ return region->persistence_domain;
+}
+
static struct nd_cmd_vendor_tail *to_vendor_tail(struct ndctl_cmd *cmd)
{
struct nd_cmd_vendor_tail *tail = (struct nd_cmd_vendor_tail *)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index 21276614..3209aefe 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -350,4 +350,6 @@ global:
ndctl_dimm_cmd_new_ack_shutdown_count;
ndctl_region_get_numa_node;
ndctl_dimm_fw_update_supported;
+ ndctl_region_get_persistence_domain;
+ ndctl_bus_get_persistence_domain;
} LIBNDCTL_14;
diff --git a/ndctl/lib/private.h b/ndctl/lib/private.h
index 1cad06b7..e7787ce8 100644
--- a/ndctl/lib/private.h
+++ b/ndctl/lib/private.h
@@ -171,6 +171,7 @@ struct ndctl_bus {
char *scrub_path;
unsigned long cmd_mask;
unsigned long nfit_dsm_mask;
+ enum ndctl_persistence persistence_domain;
};
/**
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index f3a27411..151d48f0 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -115,6 +115,7 @@ int ndctl_bus_is_cmd_supported(struct ndctl_bus *bus, int cmd);
unsigned int ndctl_bus_get_revision(struct ndctl_bus *bus);
unsigned int ndctl_bus_get_id(struct ndctl_bus *bus);
const char *ndctl_bus_get_provider(struct ndctl_bus *bus);
+unsigned int ndctl_bus_get_persistence_domain(struct ndctl_bus *bus);
int ndctl_bus_wait_probe(struct ndctl_bus *bus);
int ndctl_bus_wait_for_scrub_completion(struct ndctl_bus *bus);
unsigned int ndctl_bus_get_scrub_count(struct ndctl_bus *bus);
@@ -305,6 +306,14 @@ struct badblock {
unsigned long long offset;
unsigned int len;
};
+
+enum ndctl_persistence {
+ PERSISTENCE_NONE = 0,
+ PERSISTENCE_MEM_CTRL,
+ PERSISTENCE_CPU_CACHE,
+ PERSISTENCE_UNKNOWN,
+};
+
struct ndctl_region;
struct ndctl_region *ndctl_region_get_first(struct ndctl_bus *bus);
struct ndctl_region *ndctl_region_get_next(struct ndctl_region *region);
@@ -347,6 +356,7 @@ struct ndctl_region *ndctl_bus_get_region_by_physical_address(struct ndctl_bus *
for (dimm = ndctl_region_get_first_dimm(region); \
dimm != NULL; \
dimm = ndctl_region_get_next_dimm(region, dimm))
+unsigned int ndctl_region_get_persistence_domain(struct ndctl_region *region);
int ndctl_region_is_enabled(struct ndctl_region *region);
int ndctl_region_enable(struct ndctl_region *region);
int ndctl_region_disable_invalidate(struct ndctl_region *region);
diff --git a/ndctl/list.c b/ndctl/list.c
index 0ca5b6de..f3701ea9 100644
--- a/ndctl/list.c
+++ b/ndctl/list.c
@@ -73,6 +73,7 @@ static struct json_object *region_to_json(struct ndctl_region *region,
struct ndctl_interleave_set *iset;
struct ndctl_mapping *mapping;
unsigned int bb_count = 0;
+ enum ndctl_persistence pd;
int numa;
if (!jregion)
@@ -174,6 +175,21 @@ static struct json_object *region_to_json(struct ndctl_region *region,
if ((flags & UTIL_JSON_MEDIA_ERRORS) && jbbs)
json_object_object_add(jregion, "badblocks", jbbs);
+ pd = ndctl_region_get_persistence_domain(region);
+ switch (pd) {
+ case PERSISTENCE_CPU_CACHE:
+ jobj = json_object_new_string("cpu_cache");
+ break;
+ case PERSISTENCE_MEM_CTRL:
+ jobj = json_object_new_string("memory_controller");
+ break;
+ default:
+ jobj = NULL;
+ }
+
+ if (jobj)
+ json_object_object_add(jregion, "persistence_domain", jobj);
+
return jregion;
err:
fail("\n");
4 years, 3 months
[PATCH] ndctl: Add support for region persistent domain
by Dave Jiang
Adding helper functions to iterate through sysfs region persistence domain
attribute. The region will display the domain with the most persistence for the
region. The bus will display the domain attribute with the least persistence
amongst all the regions. ndctl_bus_get_persistence_domain() and
ndctl_region_get_persistence_domain() are exported. ndctl list will also display
the region persistence domain as well.
Signed-off-by: Dave Jiang <dave.jiang(a)intel.com>
---
0 files changed
diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index a165e697..94a1dbcd 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -180,6 +180,8 @@ struct ndctl_region {
} iset;
FILE *badblocks;
struct badblock bb;
+ FILE *persist_fp;
+ unsigned int persist_domain;
};
/**
@@ -916,6 +918,17 @@ NDCTL_EXPORT struct ndctl_bus *ndctl_bus_get_by_provider(struct ndctl_ctx *ctx,
return NULL;
}
+NDCTL_EXPORT unsigned int
+ndctl_bus_get_persistence_domain(struct ndctl_bus *bus)
+{
+ struct ndctl_region *region;
+
+ /* iterate through region to get the region persist domain */
+ ndctl_region_foreach(bus, region) {}
+
+ return bus->persist_domain;
+}
+
NDCTL_EXPORT struct ndctl_btt *ndctl_region_get_btt_seed(struct ndctl_region *region)
{
struct ndctl_ctx *ctx = ndctl_region_get_ctx(region);
@@ -1755,6 +1768,79 @@ static int region_set_type(struct ndctl_region *region, char *path)
return 0;
}
+#define region_pd_foreach(region, pd) \
+ for (pd = region_get_first_pd(region); \
+ pd != PERSISTENCE_NONE; \
+ pd = region_get_next_pd(region))
+
+static unsigned int region_get_pd_type(char *name)
+{
+ if (strncmp("cpu_cache", name, 9) == 0)
+ return PERSISTENCE_CPU_CACHE;
+ else if (strncmp("memory_controller", name, 17) == 0)
+ return PERSISTENCE_MEM_CTRL;
+ else
+ return PERSISTENCE_NONE;
+}
+
+static int region_persistence_init(struct ndctl_region *region)
+{
+ struct ndctl_ctx *ctx = ndctl_region_get_ctx(region);
+ char *pd_path;
+ int rc = 0;
+
+ if (region->persist_fp) {
+ fclose(region->persist_fp);
+ region->persist_fp = NULL;
+ }
+
+ if (asprintf(&pd_path, "%s/persistence_domain",
+ region->region_path) < 0) {
+ rc = -errno;
+ err(ctx, "region persist domain path allocation failure\n");
+ return rc;
+ }
+
+ region->persist_fp = fopen(pd_path, "re");
+ if (!region->persist_fp) {
+ rc = -errno;
+ free(pd_path);
+ return rc;
+ }
+
+ free(pd_path);
+ return rc;
+}
+
+static unsigned int region_get_next_pd(struct ndctl_region *region)
+{
+ int rc;
+ char buf[32];
+
+ if (!region->persist_fp)
+ return PERSISTENCE_NONE;
+
+ rc = fscanf(region->persist_fp, "%s", buf);
+ if (rc != 1) {
+ fclose(region->persist_fp);
+ region->persist_fp = NULL;
+ return PERSISTENCE_NONE;
+ }
+
+ return region_get_pd_type(buf);
+}
+
+static unsigned int region_get_first_pd(struct ndctl_region *region)
+{
+ int rc;
+
+ rc = region_persistence_init(region);
+ if (rc < 0)
+ return PERSISTENCE_NONE;
+
+ return region_get_next_pd(region);
+}
+
static void *add_region(void *parent, int id, const char *region_base)
{
char buf[SYSFS_ATTR_SIZE];
@@ -1762,6 +1848,7 @@ static void *add_region(void *parent, int id, const char *region_base)
struct ndctl_bus *bus = parent;
struct ndctl_ctx *ctx = bus->ctx;
char *path = calloc(1, strlen(region_base) + 100);
+ unsigned int pd = 0;
if (!path)
return NULL;
@@ -1831,6 +1918,19 @@ static void *add_region(void *parent, int id, const char *region_base)
list_add(&bus->regions, ®ion->list);
free(path);
+
+ /* get the persistence domain attribs */
+ region_pd_foreach(region, pd) {
+ if (region->persist_domain == 0 || region->persist_domain > pd)
+ region->persist_domain = pd;
+ }
+ pd = region->persist_domain;
+
+ /* we are looking for the least persistence domain */
+ if (region->bus->persist_domain == 0 ||
+ region->bus->persist_domain < pd)
+ region->bus->persist_domain = pd;
+
return region;
err_read:
@@ -2093,6 +2193,12 @@ NDCTL_EXPORT struct badblock *ndctl_region_get_first_badblock(struct ndctl_regio
return ndctl_region_get_next_badblock(region);
}
+NDCTL_EXPORT unsigned int
+ndctl_region_get_persistence_domain(struct ndctl_region *region)
+{
+ return region->persist_domain;
+}
+
static struct nd_cmd_vendor_tail *to_vendor_tail(struct ndctl_cmd *cmd)
{
struct nd_cmd_vendor_tail *tail = (struct nd_cmd_vendor_tail *)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index 21276614..3209aefe 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -350,4 +350,6 @@ global:
ndctl_dimm_cmd_new_ack_shutdown_count;
ndctl_region_get_numa_node;
ndctl_dimm_fw_update_supported;
+ ndctl_region_get_persistence_domain;
+ ndctl_bus_get_persistence_domain;
} LIBNDCTL_14;
diff --git a/ndctl/lib/private.h b/ndctl/lib/private.h
index 1cad06b7..6394c50c 100644
--- a/ndctl/lib/private.h
+++ b/ndctl/lib/private.h
@@ -171,6 +171,7 @@ struct ndctl_bus {
char *scrub_path;
unsigned long cmd_mask;
unsigned long nfit_dsm_mask;
+ unsigned int persist_domain;
};
/**
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index f3a27411..be5b196c 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -115,6 +115,7 @@ int ndctl_bus_is_cmd_supported(struct ndctl_bus *bus, int cmd);
unsigned int ndctl_bus_get_revision(struct ndctl_bus *bus);
unsigned int ndctl_bus_get_id(struct ndctl_bus *bus);
const char *ndctl_bus_get_provider(struct ndctl_bus *bus);
+unsigned int ndctl_bus_get_persistence_domain(struct ndctl_bus *bus);
int ndctl_bus_wait_probe(struct ndctl_bus *bus);
int ndctl_bus_wait_for_scrub_completion(struct ndctl_bus *bus);
unsigned int ndctl_bus_get_scrub_count(struct ndctl_bus *bus);
@@ -305,6 +306,14 @@ struct badblock {
unsigned long long offset;
unsigned int len;
};
+
+enum {
+ PERSISTENCE_NONE = 0,
+/* order these in distance to CPU persistence domain */
+ PERSISTENCE_CPU_CACHE,
+ PERSISTENCE_MEM_CTRL,
+};
+
struct ndctl_region;
struct ndctl_region *ndctl_region_get_first(struct ndctl_bus *bus);
struct ndctl_region *ndctl_region_get_next(struct ndctl_region *region);
@@ -318,6 +327,7 @@ struct badblock *ndctl_region_get_next_badblock(struct ndctl_region *region);
for (badblock = ndctl_region_get_first_badblock(region); \
badblock != NULL; \
badblock = ndctl_region_get_next_badblock(region))
+
unsigned int ndctl_region_get_id(struct ndctl_region *region);
const char *ndctl_region_get_devname(struct ndctl_region *region);
unsigned int ndctl_region_get_interleave_ways(struct ndctl_region *region);
@@ -347,6 +357,7 @@ struct ndctl_region *ndctl_bus_get_region_by_physical_address(struct ndctl_bus *
for (dimm = ndctl_region_get_first_dimm(region); \
dimm != NULL; \
dimm = ndctl_region_get_next_dimm(region, dimm))
+unsigned int ndctl_region_get_persistence_domain(struct ndctl_region *region);
int ndctl_region_is_enabled(struct ndctl_region *region);
int ndctl_region_enable(struct ndctl_region *region);
int ndctl_region_disable_invalidate(struct ndctl_region *region);
diff --git a/ndctl/list.c b/ndctl/list.c
index 0ca5b6de..87326544 100644
--- a/ndctl/list.c
+++ b/ndctl/list.c
@@ -72,7 +72,7 @@ static struct json_object *region_to_json(struct ndctl_region *region,
struct json_object *jobj, *jbbs, *jmappings = NULL;
struct ndctl_interleave_set *iset;
struct ndctl_mapping *mapping;
- unsigned int bb_count = 0;
+ unsigned int bb_count = 0, pd;
int numa;
if (!jregion)
@@ -174,6 +174,21 @@ static struct json_object *region_to_json(struct ndctl_region *region,
if ((flags & UTIL_JSON_MEDIA_ERRORS) && jbbs)
json_object_object_add(jregion, "badblocks", jbbs);
+ pd = ndctl_region_get_persistence_domain(region);
+ switch (pd) {
+ case PERSISTENCE_CPU_CACHE:
+ jobj = json_object_new_string("cpu_cache");
+ break;
+ case PERSISTENCE_MEM_CTRL:
+ jobj = json_object_new_string("memory_controller");
+ break;
+ default:
+ jobj = NULL;
+ }
+
+ if (jobj)
+ json_object_object_add(jregion, "persistence_domain", jobj);
+
return jregion;
err:
fail("\n");
4 years, 3 months
[PATCH v6 00/15] dax: fix dma vs truncate/hole-punch
by Dan Williams
Changes since v5 [1]:
* Split the introduction of dax-specific address_space_operations into
its own patch, and place them in fs/libfs.c (Christoph)
* Kill some more straggling dead code implementing dax support for block
devices.
* Mark devm_memremap_pages EXPORT_SYMBOL_GPL (Christoph)
* Introduce {xfs,ext4,ext2}_dax_writepages() and kill the dynamic check
for IS_DAX() in the typical _writepages() implementations in these
filesystems. (Christoph)
* Rework xfs_break_layouts() to assume the XFS_MMAPLOCK_EXCL is held at
entry. (Christoph)
* Replace the XFS_BREAK_WRITE and XFS_BREAK_MAPS flags with the
BREAK_WRITE and BREAK_TRUNCATE enum values since BREAK_WRITE is a
subset of BREAK_TRUNCATE. (Christoph)
* Replace wait_for_atomic_one() with Peter's new wait_var_event()
facility [2]. (Peter)
[1]: https://lists.01.org/pipermail/linux-nvdimm/2018-March/014585.html
[2]: https://patchwork.kernel.org/patch/10284383/
---
Background:
get_user_pages() in the filesystem pins file backed memory pages for
access by devices performing dma. However, it only pins the memory pages
not the page-to-file offset association. If a file is truncated the
pages are mapped out of the file and dma may continue indefinitely into
a page that is owned by a device driver. This breaks coherency of the
file vs dma, but the assumption is that if userspace wants the
file-space truncated it does not matter what data is inbound from the
device, it is not relevant anymore. The only expectation is that dma can
safely continue while the filesystem reallocates the block(s).
Problem:
This expectation that dma can safely continue while the filesystem
changes the block map is broken by dax. With dax the target dma page
*is* the filesystem block. The model of leaving the page pinned for dma,
but truncating the file block out of the file, means that the filesytem
is free to reallocate a block under active dma to another file and now
the expected data-incoherency situation has turned into active
data-corruption.
Solution:
Defer all filesystem operations (fallocate(), truncate()) on a dax mode
file while any page/block in the file is under active dma. This solution
assumes that dma is transient. Cases where dma operations are known to
not be transient, like RDMA, have been explicitly disabled via
commits like 5f1d43de5416 "IB/core: disable memory registration of
filesystem-dax vmas".
The dax_layout_busy_page() routine is called by filesystems with a lock
held against mm faults (i_mmap_lock) to find pinned / busy dax pages.
The process of looking up a busy page invalidates all mappings
to trigger any subsequent get_user_pages() to block on i_mmap_lock.
The filesystem continues to call dax_layout_busy_page() until it finally
returns no more active pages. This approach assumes that the page
pinning is transient, if that assumption is violated the system would
have likely hung from the uncompleted I/O.
---
Dan Williams (15):
dax: store pfns in the radix
fs, dax: prepare for dax-specific address_space_operations
block, dax: remove dead code in blkdev_writepages()
xfs, dax: introduce xfs_dax_aops
ext4, dax: introduce ext4_dax_aops
ext2, dax: introduce ext2_dax_aops
fs, dax: use page->mapping to warn if truncate collides with a busy page
mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
mm, dev_pagemap: introduce CONFIG_DEV_PAGEMAP_OPS
memremap: mark devm_memremap_pages() EXPORT_SYMBOL_GPL
mm, fs, dax: handle layout changes to pinned dax mappings
xfs: require mmap lock for xfs_break_layouts()
xfs: communicate lock drop events from xfs_break_layouts()
xfs: prepare xfs_break_layouts() for another layout type
xfs, dax: introduce xfs_break_dax_layouts()
drivers/dax/super.c | 96 ++++++++++++++++---
drivers/nvdimm/pmem.c | 3 -
fs/Kconfig | 1
fs/block_dev.c | 5 -
fs/dax.c | 232 ++++++++++++++++++++++++++++++++++++----------
fs/ext2/ext2.h | 1
fs/ext2/inode.c | 43 +++++----
fs/ext2/namei.c | 18 ----
fs/ext2/super.c | 6 +
fs/ext4/inode.c | 38 ++++++--
fs/ext4/super.c | 6 +
fs/libfs.c | 27 +++++
fs/xfs/xfs_aops.c | 21 +++-
fs/xfs/xfs_aops.h | 1
fs/xfs/xfs_file.c | 87 ++++++++++++++++-
fs/xfs/xfs_inode.h | 16 +++
fs/xfs/xfs_ioctl.c | 8 --
fs/xfs/xfs_iops.c | 21 +++-
fs/xfs/xfs_pnfs.c | 17 ++-
fs/xfs/xfs_pnfs.h | 4 -
fs/xfs/xfs_super.c | 20 ++--
include/linux/dax.h | 51 +++++++++-
include/linux/fs.h | 3 +
include/linux/memremap.h | 28 ++----
include/linux/mm.h | 61 +++++++++---
kernel/memremap.c | 32 +++++-
mm/Kconfig | 5 +
mm/gup.c | 5 +
mm/hmm.c | 13 ---
mm/swap.c | 3 -
30 files changed, 653 insertions(+), 219 deletions(-)
4 years, 3 months
[PATCH] acpi, numa: fix pxm to online numa node associations
by Dan Williams
Commit 99759869faf1 "acpi: Add acpi_map_pxm_to_online_node()" added
support for mapping a given proximity to its nearest, by SLIT distance,
online node. However, it sometimes returns unexpected results due to the
fact that it switches from comparing the PXM node to the last node that
was closer than the current max.
for_each_online_node(n) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
node = n; <---- from this point we're using the
wrong node for node_distance()
Fixes: 99759869faf1 ("acpi: Add acpi_map_pxm_to_online_node()")
Cc: <stable(a)vger.kernel.org>
Cc: Toshi Kani <toshi.kani(a)hp.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
Rafael, I can take this through the nvdimm tree with your ack. I have a
few other nvdimm fixes pending for 4.16.
drivers/acpi/numa.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 8ccaae3550d2..85167603b9c9 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -103,25 +103,27 @@ int acpi_map_pxm_to_node(int pxm)
*/
int acpi_map_pxm_to_online_node(int pxm)
{
- int node, n, dist, min_dist;
+ int node, min_node;
node = acpi_map_pxm_to_node(pxm);
if (node == NUMA_NO_NODE)
node = 0;
+ min_node = node;
if (!node_online(node)) {
- min_dist = INT_MAX;
+ int min_dist = INT_MAX, dist, n;
+
for_each_online_node(n) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
- node = n;
+ min_node = n;
}
}
}
- return node;
+ return min_node;
}
EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
4 years, 3 months
[ndctl PATCH v2] ndctl, list: fix namespace json object parenting
by Dan Williams
When listing namespaces and regions a generic jplatform object is
created to house the regions array. However, commit f8cc6fee4e4d "ndctl,
list: refactor core topology walking into util_filter_walk()"
inadvertently added namespaces to that jplatform, and otherwise failed
to parent namespaces to their proper parent region.
# ndctl list -RNu
{
"regions":[
{
"dev":"region1",
"size":"511.00 GiB (548.68 GB)",
"available_size":0,
"type":"pmem",
"numa_node":0,
"namespaces":[
{
"dev":"namespace1.0",
"mode":"raw",
"size":"511.00 GiB (548.68 GB)",
"sector_size":512,
"blockdev":"pmem1",
"numa_node":0
},
{
"dev":"namespace0.0", <------- wrong region
"mode":"fsdax",
"size":"4.00 GiB (4.29 GB)",
"sector_size":512,
"blockdev":"pmem0",
"numa_node":0
}
]
},
{
"dev":"region0",
"size":"4.00 GiB (4.29 GB)",
"available_size":0,
"type":"pmem"
}
],
"namespaces":[ <------ we already listed the namespaces
{
"dev":"namespace1.0",
"mode":"raw",
"size":"511.00 GiB (548.68 GB)",
"sector_size":512,
"blockdev":"pmem1",
"numa_node":0
},
{
"dev":"namespace0.0",
"mode":"fsdax",
"size":"4.00 GiB (4.29 GB)",
"sector_size":512,
"blockdev":"pmem0",
"numa_node":0
}
]
}
ndctl: json_object.c:188: json_object_put: Assertion `jso->_ref_count > 0' failed.
Aborted (core dumped) <------ not ideal
Clear out the jnamespaces tracking as the filter walk transitions to the
next region, and make sure that jnamespaces is not added to jplatform
when regions are being displayed.
Fixes: f8cc6fee4e4d ("ndctl, list: refactor core topology walking...")
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
v2: fix breaking namespace arrays on region boundaries
ndctl/list.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/ndctl/list.c b/ndctl/list.c
index 0ca5b6dee5eb..fe8036eabc3b 100644
--- a/ndctl/list.c
+++ b/ndctl/list.c
@@ -245,6 +245,13 @@ static bool filter_region(struct ndctl_region *region,
lfa->jregion = jregion;
/*
+ * We've started a new region, any previous jnamespaces will
+ * have been parented to the last region. Clear out jnamespaces
+ * so we start a new array per region.
+ */
+ lfa->jnamespaces = NULL;
+
+ /*
* Without a bus we are collecting regions anonymously across
* the platform.
*/
@@ -366,7 +373,7 @@ static int list_display(struct list_filter_arg *lfa)
json_object_object_add(jplatform, "dimms", jdimms);
if (jregions)
json_object_object_add(jplatform, "regions", jregions);
- if (jnamespaces)
+ if (jnamespaces && !jregions)
json_object_object_add(jplatform, "namespaces",
jnamespaces);
printf("%s\n", json_object_to_json_string_ext(jplatform,
4 years, 3 months
[ndctl PATCH] ndctl, test: fix module-taint sanity-check
by Dan Williams
nfit_test_init() validates that the libnvdimm modules that a test would
use are the properly instrumented external versions. If module signing
is enabled the sanity check will fail because the check expects only the
'O' flag, but unsigned external modules will have the 'O' flag and the
'E' flag set. Relax the constraint to just check for 'O'.
Fixes: 00fc65075c89 ("test: validate nfit_test modules...")
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
test/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/core.c b/test/core.c
index ca983e4913f7..4b36b2d14d7b 100644
--- a/test/core.c
+++ b/test/core.c
@@ -195,7 +195,7 @@ retry:
break;
}
- if (strcmp(attr, "O") != 0) {
+ if (!strchr(attr, 'O')) {
log_err(&log_ctx, "%s.ko: expected taint: O got: %s\n",
name, attr);
break;
4 years, 3 months
[ndctl PATCH] ndctl, documentation: remove '...' from label-options.txt
by Vishal Verma
The '...' in label-options.txt ended up getting rendered as an ellipsis
character in the online man pages, and made filtering for certain things
a little harder. Remove it since the plural already indicates multiple
devices can be passed.
Cc: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
Documentation/ndctl/labels-options.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/ndctl/labels-options.txt b/Documentation/ndctl/labels-options.txt
index f39c213..e13cdc5 100644
--- a/Documentation/ndctl/labels-options.txt
+++ b/Documentation/ndctl/labels-options.txt
@@ -1,4 +1,4 @@
-<memory device(s)>...::
+<memory device(s)>::
One or more 'nmemX' device names. The keyword 'all' can be specified to
operate on every dimm in the system, optionally filtered by bus id (see
--bus= option).
--
2.14.3
4 years, 3 months
[PATCH v5 00/11] dax: fix dma vs truncate/hole-punch
by Dan Williams
Changes since v4 [1]:
* Kill the DEFINE_FSDAX_AOPS macro and just open code new
address_space_operations instances for each fs (Matthew, Jan, Dave,
Christoph)
* Rename routines that had a 'dma_' prefix with 'dax_layout_' and merge
the dax-layout-break into xfs_break_layouts() (Dave, Christoph)
* Rework the implementation to have the fsdax core find the pages, but
leave the responsibility of waiting on those pages to the filesystem
(Dave).
* Drop the nfit_test infrastructure for testing this mechanism, I plan
to investigate better mechanisms for injecting arbitrary put_page()
delays for dax pages relative to an extent unmap operation. The
dm_delay target does not do what I want since it operates at whole
device level. A better test interface would be a mechanism to delay
I/O completion based on whether a bio referenced a given LBA.
Not changed since v4:
* This implementation still relies on RCU for synchronizing
get_user_pages() and get_user_pages_fast() against
dax_layout_busy_page(). We could perform the operation with just
barriers if we knew at get_user_pages() time that the pages were flagged
for truncation. However, dax_layout_busy_page() does not have the
information to flag that a page is actually going to be truncated, only
that it *might* be truncated.
[1]: https://lists.01.org/pipermail/linux-nvdimm/2017-December/013704.html
----
Background:
get_user_pages() in the filesystem pins file backed memory pages for
access by devices performing dma. However, it only pins the memory pages
not the page-to-file offset association. If a file is truncated the
pages are mapped out of the file and dma may continue indefinitely into
a page that is owned by a device driver. This breaks coherency of the
file vs dma, but the assumption is that if userspace wants the
file-space truncated it does not matter what data is inbound from the
device, it is not relevant anymore. The only expectation is that dma can
safely continue while the filesystem reallocates the block(s).
Problem:
This expectation that dma can safely continue while the filesystem
changes the block map is broken by dax. With dax the target dma page
*is* the filesystem block. The model of leaving the page pinned for dma,
but truncating the file block out of the file, means that the filesytem
is free to reallocate a block under active dma to another file and now
the expected data-incoherency situation has turned into active
data-corruption.
Solution:
Defer all filesystem operations (fallocate(), truncate()) on a dax mode
file while any page/block in the file is under active dma. This solution
assumes that dma is transient. Cases where dma operations are known to
not be transient, like RDMA, have been explicitly disabled via
commits like 5f1d43de5416 "IB/core: disable memory registration of
filesystem-dax vmas".
The dax_layout_busy_page() routine is called by filesystems with a lock
held against mm faults (i_mmap_lock) to find pinned / busy dax pages.
The process of looking up a busy page invalidates all mappings
to trigger any subsequent get_user_pages() to block on i_mmap_lock.
The filesystem continues to call dax_layout_busy_page() until it finally
returns no more active pages. This approach assumes that the page
pinning is transient, if that assumption is violated the system would
have likely hung from the uncompleted I/O.
---
Dan Williams (11):
dax: store pfns in the radix
xfs, dax: introduce xfs_dax_aops
ext4, dax: introduce ext4_dax_aops
ext2, dax: introduce ext2_dax_aops
fs, dax: use page->mapping to warn if truncate collides with a busy page
mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
mm, dev_pagemap: introduce CONFIG_DEV_PAGEMAP_OPS
wait_bit: introduce {wait_on,wake_up}_atomic_one
mm, fs, dax: handle layout changes to pinned dax mappings
xfs: prepare xfs_break_layouts() for another layout type
xfs, dax: introduce xfs_break_dax_layouts()
drivers/dax/super.c | 96 +++++++++++++++--
drivers/nvdimm/pmem.c | 3 -
fs/Kconfig | 1
fs/dax.c | 259 +++++++++++++++++++++++++++++++++++++---------
fs/ext2/ext2.h | 1
fs/ext2/inode.c | 28 ++++-
fs/ext2/namei.c | 18 ---
fs/ext2/super.c | 6 +
fs/ext4/inode.c | 11 ++
fs/ext4/super.c | 6 +
fs/xfs/xfs_aops.c | 7 +
fs/xfs/xfs_aops.h | 1
fs/xfs/xfs_file.c | 94 ++++++++++++++++-
fs/xfs/xfs_inode.h | 9 ++
fs/xfs/xfs_ioctl.c | 9 +-
fs/xfs/xfs_iops.c | 17 ++-
fs/xfs/xfs_pnfs.c | 8 +
fs/xfs/xfs_pnfs.h | 4 -
fs/xfs/xfs_super.c | 20 ++--
include/linux/dax.h | 45 +++++++-
include/linux/memremap.h | 28 ++---
include/linux/mm.h | 61 ++++++++---
include/linux/wait_bit.h | 13 ++
kernel/memremap.c | 30 +++++
kernel/sched/wait_bit.c | 59 +++++++++-
mm/Kconfig | 5 +
mm/gup.c | 5 +
mm/hmm.c | 13 --
mm/swap.c | 3 -
29 files changed, 663 insertions(+), 197 deletions(-)
4 years, 3 months
[PATCH 0/3] Introduce module_nd_driver
by Johannes Thumshirn
Provide a module_nd_driver() wrapper and move over the appliccable
drivers nd_pmem.ko and dax_pmem.ko.
Johannes Thumshirn (3):
libnvdimm: provide module_nd_driver wrapper
libnvdimm, pmem: use module_nd_driver
device-dax: use module_nd_driver
drivers/dax/pmem.c | 12 +-----------
drivers/nvdimm/pmem.c | 12 +-----------
include/linux/nd.h | 6 ++++++
3 files changed, 8 insertions(+), 22 deletions(-)
--
2.13.6
4 years, 3 months
[ndctl PATCH v2] ndctl, docs: cleanup the man page for create-namespace
by Vishal Verma
Clean up some rendering artifacts in the man page for
ndctl-create-namespace.
Cc: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
Documentation/ndctl/ndctl-create-namespace.txt | 54 +++++++++++++-------------
1 file changed, 26 insertions(+), 28 deletions(-)
v2: Make the "In the latter case.." paragraph a continuation of the
first bullet.
diff --git a/Documentation/ndctl/ndctl-create-namespace.txt b/Documentation/ndctl/ndctl-create-namespace.txt
index c8b1c99..8de6689 100644
--- a/Documentation/ndctl/ndctl-create-namespace.txt
+++ b/Documentation/ndctl/ndctl-create-namespace.txt
@@ -171,32 +171,31 @@ OPTIONS
Section 6.5.10 NVDIMM Label Methods) support "labelled
namespace" operation.
- There are two cases where the kernel will default to
- label-less operation:
-
- * NVDIMM does not support labels
-
- * The NVDIMM supports labels, but the Label Index Block (see
- UEFI 2.7) is not present and there is no capacity aliasing
- between 'blk' and 'pmem' regions.
-
- In the latter case the configuration can be upgraded to
- labelled operation by writing an index block on all DIMMs in a
- region and re-enabling that region. The 'autolabel' capability
- of 'ndctl create-namespace --reconfig' tries to do this by
- default if it can determine that all DIMM capacity is
- referenced by the namespace being reconfigured. It will
- otherwise fail to autolabel and remain in label-less mode if
- it finds a DIMM contributes capacity to more than one region.
- This check prevents inadvertent data loss of that other region
- is in active use. The --autolabel option is implied by
- default, the --no-autolabel option can be used to disable this
- behavior. When automatic labeling fails and labelled operation
- is still desired the safety policy can be bypassed by the
- following commands, note that all data on all regions is
- forfeited by running these commands:
-
- [verse]
+ - There are two cases where the kernel will default to
+ label-less operation:
+
+ * NVDIMM does not support labels
+
+ * The NVDIMM supports labels, but the Label Index Block (see
+ UEFI 2.7) is not present and there is no capacity aliasing
+ between 'blk' and 'pmem' regions.
+
+ - In the latter case the configuration can be upgraded to
+ labelled operation by writing an index block on all DIMMs in a
+ region and re-enabling that region. The 'autolabel' capability
+ of 'ndctl create-namespace --reconfig' tries to do this by
+ default if it can determine that all DIMM capacity is
+ referenced by the namespace being reconfigured. It will
+ otherwise fail to autolabel and remain in label-less mode if
+ it finds a DIMM contributes capacity to more than one region.
+ This check prevents inadvertent data loss of that other region
+ is in active use. The --autolabel option is implied by
+ default, the --no-autolabel option can be used to disable this
+ behavior. When automatic labeling fails and labelled operation
+ is still desired the safety policy can be bypassed by the
+ following commands, note that all data on all regions is
+ forfeited by running these commands:
+
ndctl disable-region all
ndctl init-labels all
ndctl enable-region all
@@ -222,5 +221,4 @@ linkndctl:ndctl-zero-labels[1],
linkndctl:ndctl-init-labels[1],
linkndctl:ndctl-disable-namespace[1],
linkndctl:ndctl-enable-namespace[1],
-http://www.uefi.org/sites/default/files/resources/UEFI_Spec_2_7.pdf[UEFI NVDIMM Label Protocol
-]
+http://www.uefi.org/sites/default/files/resources/UEFI_Spec_2_7.pdf[UEFI NVDIMM Label Protocol]
--
2.14.3
4 years, 3 months