[PATCH v5] ndctl: daxctl: Adding io option for daxctl
by Dave Jiang
The daxctl io command allows I/O to be performed between a file descriptor
and a device dax device, in either direction. It also provides a way to zero
a device dax device.
e.g. daxctl io --input=/home/myfile --output=/dev/dax1.0
Signed-off-by: Dave Jiang <dave.jiang(a)intel.com>
---
v5:
- Fixed up libpmem dependencies in configure.ac and spec file suggested
by Dan.
v4:
- Added documentation on size suffix suggested by Ross.
v3:
- Added support for size suffix suggested by Ross.
- Fixed the checking of __do_io() return value >32bit problem.
v2:
- Removed dependency on ndctl to match device and address other comments
by Dan.
Documentation/daxctl/Makefile.am | 3
Documentation/daxctl/daxctl-io.txt | 72 +++++
configure.ac | 1
daxctl/Makefile.am | 5
daxctl/daxctl.c | 2
daxctl/io.c | 576 ++++++++++++++++++++++++++++++++++++
ndctl.spec.in | 1
7 files changed, 658 insertions(+), 2 deletions(-)
create mode 100644 Documentation/daxctl/daxctl-io.txt
create mode 100644 daxctl/io.c
diff --git a/Documentation/daxctl/Makefile.am b/Documentation/daxctl/Makefile.am
index 5913c94..032d48c 100644
--- a/Documentation/daxctl/Makefile.am
+++ b/Documentation/daxctl/Makefile.am
@@ -16,7 +16,8 @@ asciidoc.conf: ../asciidoc.conf.in
man1_MANS = \
daxctl.1 \
- daxctl-list.1
+ daxctl-list.1 \
+ daxctl-io.1
CLEANFILES = $(man1_MANS)
diff --git a/Documentation/daxctl/daxctl-io.txt b/Documentation/daxctl/daxctl-io.txt
new file mode 100644
index 0000000..c4ad9ef
--- /dev/null
+++ b/Documentation/daxctl/daxctl-io.txt
@@ -0,0 +1,72 @@
+daxctl-io(1)
+===========
+
+NAME
+----
+daxctl-io - Perform I/O on Device-DAX devices or zero a Device-DAX device.
+
+SYNOPSIS
+--------
+[verse]
+'daxctl io' [<options>]
+
+A Device-DAX device must be involved as either the input or the output
+device. Data can be read from a Device-DAX device and written to a file
+descriptor or another Device-DAX device, or written to a Device-DAX device
+from a file descriptor or another Device-DAX device.
+
+If no length is specified, it defaults to the input file/device length. If the
+input is a special char file, the length is the output file/device length.
+
+No input will default to stdin. No output will default to stdout.
+
+For a Device-DAX device, attempts to clear badblocks within range of writes
+will be performed.
+
+EXAMPLE
+-------
+[verse]
+# daxctl io --zero /dev/dax1.0
+
+# daxctl io --input=/dev/dax1.0 --output=/home/myfile --len=2M --seek=4096
+
+# cat /dev/zero | daxctl io --output=/dev/dax1.0
+
+# daxctl io --input=/dev/zero --output=/dev/dax1.0 --skip=4096
+
+OPTIONS
+-------
+-i::
+--input=::
+ Input device or file to read from.
+
+-o::
+--output=::
+ Output device or file to write to.
+
+-z::
+--zero::
+ Zero the output device for 'len' bytes, or the entire device if no
+ length was provided. The output device must be a Device-DAX device.
+
+-l::
+--len::
+ The length in bytes of the I/O to perform. The following suffixes are
+ supported to make passing in a size easier for kibi, mebi, gibi, and
+ tebi bytes: k/K, m/M, g/G, t/T. e.g. 20m - 20 mebibytes
+
+-s::
+--seek::
+ The number of bytes to skip over on the output before performing a
+ write.
+
+-k::
+--skip::
+ The number of bytes to skip over on the input before performing a read.
+
+COPYRIGHT
+---------
+Copyright (c) 2017, Intel Corporation. License GPLv2: GNU GPL
+version 2 <http://gnu.org/licenses/gpl.html>. This is free software:
+you are free to change and redistribute it. There is NO WARRANTY, to
+the extent permitted by law.
diff --git a/configure.ac b/configure.ac
index 316f5b7..a16d555 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,6 +93,7 @@ PKG_CHECK_MODULES([KMOD], [libkmod])
PKG_CHECK_MODULES([UDEV], [libudev])
PKG_CHECK_MODULES([UUID], [uuid])
PKG_CHECK_MODULES([JSON], [json-c])
+PKG_CHECK_MODULES([PMEM], [libpmem])
AC_ARG_WITH([bash-completion-dir],
AS_HELP_STRING([--with-bash-completion-dir[=PATH]],
diff --git a/daxctl/Makefile.am b/daxctl/Makefile.am
index fe467d0..4321292 100644
--- a/daxctl/Makefile.am
+++ b/daxctl/Makefile.am
@@ -5,10 +5,13 @@ bin_PROGRAMS = daxctl
daxctl_SOURCES =\
daxctl.c \
list.c \
+ io.c \
../util/json.c
daxctl_LDADD =\
lib/libdaxctl.la \
+ ../ndctl/lib/libndctl.la \
../libutil.a \
$(UUID_LIBS) \
- $(JSON_LIBS)
+ $(JSON_LIBS) \
+ $(PMEM_LIBS)
diff --git a/daxctl/daxctl.c b/daxctl/daxctl.c
index 91a4600..db2e495 100644
--- a/daxctl/daxctl.c
+++ b/daxctl/daxctl.c
@@ -67,11 +67,13 @@ static int cmd_help(int argc, const char **argv, void *ctx)
}
int cmd_list(int argc, const char **argv, void *ctx);
+int cmd_io(int argc, const char **argv, void *ctx);
static struct cmd_struct commands[] = {
{ "version", cmd_version },
{ "list", cmd_list },
{ "help", cmd_help },
+ { "io", cmd_io },
};
int main(int argc, const char **argv)
diff --git a/daxctl/io.c b/daxctl/io.c
new file mode 100644
index 0000000..27e7463
--- /dev/null
+++ b/daxctl/io.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright(c) 2015-2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <libgen.h>
+#include <libpmem.h>
+#include <util/json.h>
+#include <util/filter.h>
+#include <util/size.h>
+#include <json-c/json.h>
+#include <daxctl/libdaxctl.h>
+#include <ccan/short_types/short_types.h>
+#include <util/parse-options.h>
+#include <ccan/array_size/array_size.h>
+#include <ndctl/ndctl.h>
+
+enum io_direction {
+ IO_READ = 0,
+ IO_WRITE,
+};
+
+struct io_dev {
+ int fd;
+ int major;
+ int minor;
+ void *mmap;
+ const char *parm_path;
+ char *real_path;
+ uint64_t offset;
+ enum io_direction direction;
+ bool is_dax;
+ bool is_char;
+ bool is_new;
+ bool need_trunc;
+ struct ndctl_ctx *ndctl_ctx;
+ struct ndctl_region *region;
+ struct ndctl_dax *dax;
+ uint64_t size;
+};
+
+static struct {
+ struct io_dev dev[2];
+ bool zero;
+ uint64_t len;
+ struct ndctl_cmd *ars_cap;
+ struct ndctl_cmd *clear_err;
+} io = {
+ .dev[0].fd = -1,
+ .dev[1].fd = -1,
+};
+
+#define fail(fmt, ...) \
+do { \
+ fprintf(stderr, "daxctl-%s:%s:%d: " fmt, \
+ VERSION, __func__, __LINE__, ##__VA_ARGS__); \
+} while (0)
+
+static bool is_stdinout(struct io_dev *io_dev)
+{
+ return (io_dev->fd == STDIN_FILENO ||
+ io_dev->fd == STDOUT_FILENO) ? true : false;
+}
+
+static int setup_device(struct io_dev *io_dev, size_t size)
+{
+ int flags, rc;
+
+ if (is_stdinout(io_dev))
+ return 0;
+
+ if (io_dev->is_new)
+ flags = O_CREAT|O_WRONLY|O_TRUNC;
+ else if (io_dev->need_trunc)
+ flags = O_RDWR | O_TRUNC;
+ else
+ flags = O_RDWR;
+
+ io_dev->fd = open(io_dev->parm_path, flags, S_IRUSR|S_IWUSR);
+ if (io_dev->fd == -1) {
+ rc = -errno;
+ perror("open");
+ return rc;
+ }
+
+ if (!io_dev->is_dax)
+ return 0;
+
+ flags = (io_dev->direction == IO_READ) ? PROT_READ : PROT_WRITE;
+ io_dev->mmap = mmap(NULL, size, flags, MAP_SHARED, io_dev->fd, 0);
+ if (io_dev->mmap == MAP_FAILED) {
+ rc = -errno;
+ perror("mmap");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int match_device(struct io_dev *io_dev, struct daxctl_region *dregion)
+{
+ struct daxctl_dev *dev;
+
+ daxctl_dev_foreach(dregion, dev) {
+ if (io_dev->major == daxctl_dev_get_major(dev) &&
+ io_dev->minor == daxctl_dev_get_minor(dev)) {
+ io_dev->is_dax = true;
+ io_dev->size = daxctl_dev_get_size(dev);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+struct ndctl_dax *find_ndctl_dax(struct ndctl_ctx *ndctl_ctx,
+ struct io_dev *io_dev)
+{
+ struct ndctl_bus *bus;
+ struct ndctl_region *region;
+ struct ndctl_dax *dax;
+ struct daxctl_region *dregion;
+
+ ndctl_bus_foreach(ndctl_ctx, bus)
+ ndctl_region_foreach(bus, region)
+ ndctl_dax_foreach(region, dax) {
+ dregion = ndctl_dax_get_daxctl_region(dax);
+ if (match_device(io_dev, dregion))
+ return dax;
+ }
+
+ return NULL;
+}
+
+static int find_dax_device(struct io_dev *io_dev,
+ struct daxctl_ctx *daxctl_ctx, struct ndctl_ctx *ndctl_ctx,
+ enum io_direction dir)
+{
+ struct daxctl_region *dregion;
+ struct stat st;
+ int rc;
+
+ if (is_stdinout(io_dev)) {
+ io_dev->size = ULONG_MAX;
+ return 0;
+ }
+
+ rc = stat(io_dev->parm_path, &st);
+ if (rc == -1) {
+ rc = -errno;
+ if (rc == -ENOENT && dir == IO_WRITE) {
+ io_dev->is_new = true;
+ io_dev->size = ULONG_MAX;
+ return 0;
+ }
+ perror("stat");
+ return rc;
+ }
+
+ if (S_ISREG(st.st_mode)) {
+ if (dir == IO_WRITE) {
+ io_dev->need_trunc = true;
+ io_dev->size = ULONG_MAX;
+ } else
+ io_dev->size = st.st_size;
+ return 0;
+ } else if (S_ISBLK(st.st_mode)) {
+ io_dev->size = st.st_size;
+ return 0;
+ } else if (S_ISCHR(st.st_mode)) {
+ io_dev->size = ULONG_MAX;
+ io_dev->is_char = true;
+ io_dev->major = major(st.st_rdev);
+ io_dev->minor = minor(st.st_rdev);
+ } else
+ return -ENODEV;
+
+ /* grab the ndctl matches if they exist */
+ io_dev->dax = find_ndctl_dax(ndctl_ctx, io_dev);
+ if (io_dev->dax) {
+ io_dev->region = ndctl_dax_get_region(io_dev->dax);
+ return 1;
+ }
+
+ daxctl_region_foreach(daxctl_ctx, dregion)
+ if (match_device(io_dev, dregion))
+ return 1;
+
+ return 0;
+}
+
+static int send_clear_error(struct ndctl_bus *bus, uint64_t start, uint64_t size)
+{
+ uint64_t cleared;
+ int rc;
+
+ io.clear_err = ndctl_bus_cmd_new_clear_error(start, size, io.ars_cap);
+ if (!io.clear_err) {
+ fail("bus: %s failed to create cmd\n",
+ ndctl_bus_get_provider(bus));
+ return -ENXIO;
+ }
+
+ rc = ndctl_cmd_submit(io.clear_err);
+ if (rc) {
+ fail("bus: %s failed to submit cmd: %d\n",
+ ndctl_bus_get_provider(bus), rc);
+ ndctl_cmd_unref(io.clear_err);
+ return rc;
+ }
+
+ cleared = ndctl_cmd_clear_error_get_cleared(io.clear_err);
+ if (cleared != size) {
+ fail("bus: %s expected to clear: %ld actual: %ld\n",
+ ndctl_bus_get_provider(bus),
+ size, cleared);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int get_ars_cap(struct ndctl_bus *bus, uint64_t start, uint64_t size)
+{
+ int rc;
+
+ io.ars_cap = ndctl_bus_cmd_new_ars_cap(bus, start, size);
+ if (!io.ars_cap) {
+ fail("bus: %s failed to create cmd\n",
+ ndctl_bus_get_provider(bus));
+ return -ENOTTY;
+ }
+
+ rc = ndctl_cmd_submit(io.ars_cap);
+ if (rc) {
+ fail("bus: %s failed to submit cmd: %d\n",
+ ndctl_bus_get_provider(bus), rc);
+ ndctl_cmd_unref(io.ars_cap);
+ return rc;
+ }
+
+ if (ndctl_cmd_ars_cap_get_size(io.ars_cap) <
+ sizeof(struct nd_cmd_ars_status)) {
+ fail("bus: %s expected size >= %zd got: %d\n",
+ ndctl_bus_get_provider(bus),
+ sizeof(struct nd_cmd_ars_status),
+ ndctl_cmd_ars_cap_get_size(io.ars_cap));
+ ndctl_cmd_unref(io.ars_cap);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+int clear_errors(struct ndctl_bus *bus, uint64_t start, uint64_t len)
+{
+ int rc;
+
+ rc = get_ars_cap(bus, start, len);
+ if (rc) {
+ fail("get_ars_cap failed\n");
+ return rc;
+ }
+
+ rc = send_clear_error(bus, start, len);
+ if (rc) {
+ fail("send_clear_error failed\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int clear_badblocks(struct io_dev *dev, uint64_t len)
+{
+ unsigned long long dax_begin, dax_size, dax_end;
+ unsigned long long region_begin, offset;
+ unsigned long long size, io_begin, io_end, io_len;
+ struct badblock *bb;
+ int rc;
+
+ dax_begin = ndctl_dax_get_resource(dev->dax);
+ if (dax_begin == ULLONG_MAX)
+ return -ERANGE;
+
+ dax_size = ndctl_dax_get_size(dev->dax);
+ if (dax_size == ULLONG_MAX)
+ return -ERANGE;
+
+ dax_end = dax_begin + dax_size - 1;
+
+ region_begin = ndctl_region_get_resource(dev->region);
+ if (region_begin == ULLONG_MAX)
+ return -ERANGE;
+
+ ndctl_region_badblock_foreach(dev->region, bb) {
+ unsigned long long bb_begin, bb_end, begin, end;
+
+ bb_begin = region_begin + (bb->offset << 9);
+ bb_end = bb_begin + (bb->len << 9) - 1;
+
+ if (bb_end <= dax_begin || bb_begin >= dax_end)
+ continue;
+
+ if (bb_begin < dax_begin)
+ begin = dax_begin;
+ else
+ begin = bb_begin;
+
+ if (bb_end > dax_end)
+ end = dax_end;
+ else
+ end = bb_end;
+
+ offset = begin - dax_begin;
+ size = end - begin + 1;
+
+ /*
+ * If end of I/O is before badblock or the offset of the
+ * I/O is greater than the actual size of badblock range
+ */
+ if (dev->offset + len - 1 < offset || dev->offset > size)
+ continue;
+
+ io_begin = (dev->offset < offset) ? offset : dev->offset;
+ if ((dev->offset + len) < (offset + size))
+ io_end = offset + len;
+ else
+ io_end = offset + size;
+
+ io_len = io_end - io_begin;
+ io_begin += dax_begin;
+ rc = clear_errors(ndctl_region_get_bus(dev->region),
+ io_begin, io_len);
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int64_t __do_io(struct io_dev *dst_dev, struct io_dev *src_dev,
+ uint64_t len, bool zero)
+{
+ void *src, *dst;
+ ssize_t rc, count = 0;
+
+ if (zero && dst_dev->is_dax) {
+ dst = (uint8_t *)dst_dev->mmap + dst_dev->offset;
+ memset(dst, 0, len);
+ pmem_persist(dst, len);
+ rc = len;
+ } else if (dst_dev->is_dax && src_dev->is_dax) {
+ src = (uint8_t *)src_dev->mmap + src_dev->offset;
+ dst = (uint8_t *)dst_dev->mmap + dst_dev->offset;
+ pmem_memcpy_persist(dst, src, len);
+ rc = len;
+ } else if (src_dev->is_dax) {
+ src = (uint8_t *)src_dev->mmap + src_dev->offset;
+ if (dst_dev->offset) {
+ rc = lseek(dst_dev->fd, dst_dev->offset, SEEK_SET);
+ if (rc < 0) {
+ rc = -errno;
+ perror("lseek");
+ return rc;
+ }
+ }
+
+ do {
+ rc = write(dst_dev->fd, (uint8_t *)src + count,
+ len - count);
+ if (rc == -1) {
+ rc = -errno;
+ perror("write");
+ return rc;
+ }
+ count += rc;
+ } while (count != (ssize_t)len);
+ rc = count;
+ if (rc != (ssize_t)len)
+ printf("Requested size %lu larger than source.\n",
+ len);
+ } else if (dst_dev->is_dax) {
+ dst = (uint8_t *)dst_dev->mmap + dst_dev->offset;
+ if (src_dev->offset) {
+ rc = lseek(src_dev->fd, src_dev->offset, SEEK_SET);
+ if (rc < 0) {
+ rc = -errno;
+ perror("lseek");
+ return rc;
+ }
+ }
+
+ do {
+ rc = read(src_dev->fd, (uint8_t *)dst + count,
+ len - count);
+ if (rc == -1) {
+ rc = -errno;
+ perror("pread");
+ return rc;
+ }
+ /* end of file */
+ if (rc == 0)
+ break;
+ count += rc;
+ } while (count != (ssize_t)len);
+ pmem_persist(dst, count);
+ rc = count;
+ if (rc != (ssize_t)len)
+ printf("Requested size %lu larger than destination.\n", len);
+ } else
+ return -EINVAL;
+
+ return rc;
+}
+
+static int do_io(struct daxctl_ctx *daxctl_ctx, struct ndctl_ctx *ndctl_ctx)
+{
+ int i, dax_devs = 0;
+ ssize_t rc;
+
+ /* if we are zeroing the device, we just need output */
+ i = io.zero ? 1 : 0;
+ for (; i < 2; i++) {
+ if (!io.dev[i].parm_path)
+ continue;
+ rc = find_dax_device(&io.dev[i], daxctl_ctx, ndctl_ctx, i);
+ if (rc < 0)
+ return rc;
+
+ if (rc == 1)
+ dax_devs++;
+ }
+
+ if (dax_devs == 0) {
+ fail("No DAX devices for input or output, fail\n");
+ return -ENODEV;
+ }
+
+ if (io.len == 0) {
+ if (is_stdinout(&io.dev[0]))
+ io.len = io.dev[1].size;
+ else
+ io.len = io.dev[0].size;
+ }
+
+ io.dev[1].direction = IO_WRITE;
+ i = io.zero ? 1 : 0;
+ for (; i < 2; i++) {
+ if (!io.dev[i].parm_path)
+ continue;
+ rc = setup_device(&io.dev[i], io.len);
+ if (rc < 0)
+ return rc;
+ }
+
+ /* make sure we are DAX and we have ndctl related bits */
+ if (io.dev[1].is_dax && io.dev[1].dax) {
+ rc = clear_badblocks(&io.dev[1], io.len);
+ if (rc < 0) {
+ fail("Failed to clear badblocks on %s\n",
+ io.dev[1].parm_path);
+ return rc;
+ }
+ }
+
+ rc = __do_io(&io.dev[1], &io.dev[0], io.len, io.zero);
+ if (rc < 0) {
+ fail("Failed to perform I/O: %ld\n", rc);
+ return rc;
+ }
+
+ printf("Data copied %lu bytes to device %s\n",
+ rc, io.dev[1].parm_path);
+
+ return 0;
+}
+
+static void cleanup(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ if (is_stdinout(&io.dev[i]))
+ continue;
+ close(io.dev[i].fd);
+ }
+}
+
+int cmd_io(int argc, const char **argv, void *daxctl_ctx)
+{
+ const char *len_str;
+ const struct option options[] = {
+ OPT_STRING('i', "input", &io.dev[0].parm_path, "in device",
+ "input device/file"),
+ OPT_STRING('o', "output", &io.dev[1].parm_path, "out device",
+ "output device/file"),
+ OPT_BOOLEAN('z', "zero", &io.zero, "zeroing the device"),
+ OPT_STRING('l', "len", &len_str, "I/O length", "total length to perform the I/O"),
+ OPT_U64('s', "seek", &io.dev[1].offset, "seek offset for output"),
+ OPT_U64('k', "skip", &io.dev[0].offset, "skip offset for input"),
+ };
+ const char * const u[] = {
+ "daxctl io [<options>]",
+ NULL
+ };
+ int i, rc;
+ struct ndctl_ctx *ndctl_ctx;
+
+ argc = parse_options(argc, argv, options, u, 0);
+ for (i = 0; i < argc; i++) {
+ fail("Unknown parameter \"%s\"\n", argv[i]);
+ return -EINVAL;
+ }
+
+ if (argc) {
+ usage_with_options(u, options);
+ return 0;
+ }
+
+ if (!io.dev[0].parm_path && !io.dev[1].parm_path) {
+ usage_with_options(u, options);
+ return 0;
+ }
+
+ if (len_str) {
+ io.len = parse_size64(len_str);
+ if (io.len == ULLONG_MAX) {
+ fail("Incorrect len param entered: %s\n", len_str);
+ return -EINVAL;
+ }
+ } else
+ io.len = 0;
+
+ if (!io.dev[0].parm_path) {
+ io.dev[0].fd = STDIN_FILENO;
+ io.dev[0].offset = 0;
+ }
+
+ if (!io.dev[1].parm_path) {
+ io.dev[1].fd = STDOUT_FILENO;
+ io.dev[1].offset = 0;
+ }
+
+ rc = ndctl_new(&ndctl_ctx);
+ if (rc)
+ return -ENOMEM;
+
+ rc = do_io(daxctl_ctx, ndctl_ctx);
+ if (rc < 0)
+ goto out;
+
+ rc = 0;
+out:
+ cleanup();
+ ndctl_unref(ndctl_ctx);
+ return rc;
+}
diff --git a/ndctl.spec.in b/ndctl.spec.in
index b481762..6febbf2 100644
--- a/ndctl.spec.in
+++ b/ndctl.spec.in
@@ -20,6 +20,7 @@ BuildRequires: pkgconfig(libudev)
BuildRequires: pkgconfig(uuid)
BuildRequires: pkgconfig(json-c)
BuildRequires: pkgconfig(bash-completion)
+BuildRequires: pkgconfig(libpmem)
%description
Utility library for managing the "libnvdimm" subsystem. The "libnvdimm"
4 years, 8 months
[PATCH] nvdimm: Remove minimum size requirement
by Matthew Wilcox
From: Matthew Wilcox <mawilcox(a)microsoft.com>
There was no need to have a minimum size of 4MB for NV-DIMMs; it was
just a sanity check. Keep a check that it's at least one page in size
because we really can't add less than a page to the memory map. Promote
the print statement from 'debug' level to 'warning', since there was no
information for my colleague who stumbled over this problem while
attempting to add a 2MB chunk of memory.
Reported-by: Cheng-mean Liu <soccerl(a)microsoft.com>
Signed-off-by: Matthew Wilcox <mawilcox(a)microsoft.com>
---
drivers/nvdimm/namespace_devs.c | 6 +++---
include/uapi/linux/ndctl.h | 4 ----
2 files changed, 3 insertions(+), 7 deletions(-)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 5f1c6756e57c..95169308078a 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1689,9 +1689,9 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
}
size = nvdimm_namespace_capacity(ndns);
- if (size < ND_MIN_NAMESPACE_SIZE) {
- dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
- &size, ND_MIN_NAMESPACE_SIZE);
+ if (size < PAGE_SIZE) {
+ dev_warn(&ndns->dev, "%pa, too small must be at least %ld\n",
+ &size, PAGE_SIZE);
return ERR_PTR(-ENODEV);
}
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 6d3c54264d8e..3ad1623bb585 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -299,10 +299,6 @@ enum nd_driver_flags {
ND_DRIVER_DAX_PMEM = 1 << ND_DEVICE_DAX_PMEM,
};
-enum {
- ND_MIN_NAMESPACE_SIZE = 0x00400000,
-};
-
enum ars_masks {
ARS_STATUS_MASK = 0x0000FFFF,
ARS_EXT_STATUS_SHIFT = 16,
--
2.11.0
4 years, 8 months
[PATCH 0/2] MAP_VALIDATE and mmap flags validation
by Dan Williams
As noted in patch2:
The mmap(2) syscall suffers from the ABI anti-pattern of not validating
unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
mechanism to define new behavior that is known to fail on older kernels
without the support. Define a new MAP_VALIDATE flag pattern that is
guaranteed to fail on all legacy mmap implementations.
On the assumption that it is too late to finalize either MAP_SYNC or
MAP_DIRECT for 4.14 inclusion I would still like to pursue getting at
least patch1 in for 4.14. This allows development of these new flags for
4.15 without worrying about new ->mmap() operation instances added
during the cycle. I.e. I would rebase these from v4.13-rc5 to the state
of the tree right before v4.14-rc1 and re-run the Coccinelle script.
Questions:
1/ Are there any objections to MAP_VALIDATE? I think we bottomed out on
the parisc compatibility concern with the realization that it is
missing fundamental pmem pre-requisite features, like ZONE_DEVICE,
and can otherwise define a new mmap syscall variant.
2/ Linus, are you open to taking a rebased version of patch1 late in the
4.14 window, or have a different suggestion?
---
Dan Williams (2):
vfs: add flags parameter to ->mmap() in 'struct file_operations'
mm: introduce MAP_VALIDATE, a mechanism for safely defining new mmap flags
arch/arc/kernel/arc_hostlink.c | 3 +
arch/mips/kernel/vdso.c | 2 -
arch/powerpc/kernel/proc_powerpc.c | 3 +
arch/powerpc/kvm/book3s_64_vio.c | 3 +
arch/powerpc/platforms/cell/spufs/file.c | 21 ++++++--
arch/powerpc/platforms/powernv/opal-prd.c | 3 +
arch/um/drivers/mmapper_kern.c | 3 +
drivers/android/binder.c | 3 +
drivers/char/agp/frontend.c | 3 +
drivers/char/bsr.c | 3 +
drivers/char/hpet.c | 6 ++
drivers/char/mbcs.c | 3 +
drivers/char/mbcs.h | 3 +
drivers/char/mem.c | 11 +++-
drivers/char/mspec.c | 9 ++--
drivers/char/uv_mmtimer.c | 6 ++
drivers/dax/device.c | 3 +
drivers/dma-buf/dma-buf.c | 4 +-
drivers/firewire/core-cdev.c | 3 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 +
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +-
drivers/gpu/drm/arc/arcpgu_drv.c | 5 +-
drivers/gpu/drm/ast/ast_drv.h | 3 +
drivers/gpu/drm/ast/ast_ttm.c | 3 +
drivers/gpu/drm/bochs/bochs.h | 3 +
drivers/gpu/drm/bochs/bochs_mm.c | 3 +
drivers/gpu/drm/cirrus/cirrus_drv.h | 3 +
drivers/gpu/drm/cirrus/cirrus_ttm.c | 3 +
drivers/gpu/drm/drm_gem.c | 3 +
drivers/gpu/drm/drm_gem_cma_helper.c | 6 ++
drivers/gpu/drm/drm_vm.c | 3 +
drivers/gpu/drm/etnaviv/etnaviv_drv.h | 3 +
drivers/gpu/drm/etnaviv/etnaviv_gem.c | 5 +-
drivers/gpu/drm/exynos/exynos_drm_gem.c | 5 +-
drivers/gpu/drm/exynos/exynos_drm_gem.h | 3 +
drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h | 3 +
drivers/gpu/drm/hisilicon/hibmc/hibmc_ttm.c | 3 +
drivers/gpu/drm/i810/i810_dma.c | 3 +
drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 -
drivers/gpu/drm/mediatek/mtk_drm_gem.c | 5 +-
drivers/gpu/drm/mediatek/mtk_drm_gem.h | 3 +
drivers/gpu/drm/mgag200/mgag200_drv.h | 3 +
drivers/gpu/drm/mgag200/mgag200_ttm.c | 3 +
drivers/gpu/drm/msm/msm_drv.h | 3 +
drivers/gpu/drm/msm/msm_gem.c | 5 +-
drivers/gpu/drm/nouveau/nouveau_ttm.c | 5 +-
drivers/gpu/drm/nouveau/nouveau_ttm.h | 2 -
drivers/gpu/drm/omapdrm/omap_drv.h | 3 +
drivers/gpu/drm/omapdrm/omap_gem.c | 5 +-
drivers/gpu/drm/qxl/qxl_drv.h | 3 +
drivers/gpu/drm/qxl/qxl_ttm.c | 3 +
drivers/gpu/drm/radeon/radeon_drv.c | 3 +
drivers/gpu/drm/radeon/radeon_ttm.c | 3 +
drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 5 +-
drivers/gpu/drm/rockchip/rockchip_drm_gem.h | 3 +
drivers/gpu/drm/tegra/gem.c | 5 +-
drivers/gpu/drm/tegra/gem.h | 3 +
drivers/gpu/drm/udl/udl_drv.h | 3 +
drivers/gpu/drm/udl/udl_gem.c | 5 +-
drivers/gpu/drm/vc4/vc4_bo.c | 5 +-
drivers/gpu/drm/vc4/vc4_drv.h | 3 +
drivers/gpu/drm/vgem/vgem_drv.c | 7 ++-
drivers/gpu/drm/virtio/virtgpu_drv.h | 3 +
drivers/gpu/drm/virtio/virtgpu_ttm.c | 3 +
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 3 +
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 3 +
drivers/hsi/clients/cmt_speech.c | 3 +
drivers/hwtracing/intel_th/msu.c | 3 +
drivers/hwtracing/stm/core.c | 3 +
drivers/infiniband/core/uverbs_main.c | 3 +
drivers/infiniband/hw/hfi1/file_ops.c | 6 ++
drivers/infiniband/hw/qib/qib_file_ops.c | 5 +-
drivers/media/v4l2-core/v4l2-dev.c | 3 +
drivers/misc/aspeed-lpc-ctrl.c | 3 +
drivers/misc/cxl/api.c | 5 +-
drivers/misc/cxl/cxl.h | 3 +
drivers/misc/cxl/file.c | 3 +
drivers/misc/genwqe/card_dev.c | 3 +
drivers/misc/mic/scif/scif_fd.c | 3 +
drivers/misc/mic/vop/vop_vringh.c | 3 +
drivers/misc/sgi-gru/grufile.c | 3 +
drivers/mtd/mtdchar.c | 3 +
drivers/pci/proc.c | 3 +
drivers/rapidio/devices/rio_mport_cdev.c | 3 +
drivers/sbus/char/flash.c | 3 +
drivers/sbus/char/jsflash.c | 3 +
drivers/scsi/cxlflash/superpipe.c | 5 +-
drivers/scsi/sg.c | 3 +
drivers/staging/android/ashmem.c | 3 +
drivers/staging/comedi/comedi_fops.c | 3 +
.../staging/lustre/lustre/llite/llite_internal.h | 3 +
drivers/staging/lustre/lustre/llite/llite_mmap.c | 5 +-
drivers/staging/vboxvideo/vbox_drv.h | 3 +
drivers/staging/vboxvideo/vbox_ttm.c | 3 +
drivers/staging/vme/devices/vme_user.c | 3 +
drivers/uio/uio.c | 3 +
drivers/usb/core/devio.c | 3 +
drivers/usb/mon/mon_bin.c | 3 +
drivers/vfio/vfio.c | 7 ++-
drivers/video/fbdev/core/fbmem.c | 3 +
drivers/video/fbdev/pxa3xx-gcu.c | 3 +
drivers/xen/gntalloc.c | 3 +
drivers/xen/gntdev.c | 3 +
drivers/xen/privcmd.c | 3 +
drivers/xen/xenbus/xenbus_dev_backend.c | 3 +
drivers/xen/xenfs/xenstored.c | 3 +
fs/9p/vfs_file.c | 10 ++--
fs/aio.c | 3 +
fs/btrfs/file.c | 3 +
fs/ceph/addr.c | 3 +
fs/ceph/super.h | 3 +
fs/cifs/cifsfs.h | 6 ++
fs/cifs/file.c | 10 ++--
fs/coda/file.c | 5 +-
fs/ecryptfs/file.c | 5 +-
fs/ext2/file.c | 5 +-
fs/ext4/file.c | 3 +
fs/f2fs/file.c | 3 +
fs/fuse/file.c | 8 ++-
fs/gfs2/file.c | 3 +
fs/hugetlbfs/inode.c | 3 +
fs/kernfs/file.c | 3 +
fs/ncpfs/mmap.c | 3 +
fs/ncpfs/ncp_fs.h | 2 -
fs/nfs/file.c | 5 +-
fs/nfs/internal.h | 2 -
fs/nilfs2/file.c | 3 +
fs/ocfs2/mmap.c | 3 +
fs/ocfs2/mmap.h | 3 +
fs/orangefs/file.c | 5 +-
fs/proc/inode.c | 7 ++-
fs/proc/vmcore.c | 6 ++
fs/ramfs/file-nommu.c | 6 ++
fs/romfs/mmap-nommu.c | 3 +
fs/ubifs/file.c | 5 +-
fs/xfs/xfs_file.c | 5 +-
include/drm/drm_gem.h | 3 +
include/drm/drm_gem_cma_helper.h | 3 +
include/drm/drm_legacy.h | 3 +
include/linux/fs.h | 14 ++++--
include/linux/mm.h | 2 -
include/linux/mman.h | 50 ++++++++++++++++++++
include/misc/cxl.h | 3 +
include/uapi/asm-generic/mman-common.h | 1
ipc/shm.c | 5 +-
kernel/events/core.c | 3 +
kernel/kcov.c | 3 +
kernel/relay.c | 3 +
mm/filemap.c | 14 ++++--
mm/mmap.c | 22 ++++++++-
mm/nommu.c | 4 +-
mm/shmem.c | 3 +
net/socket.c | 6 ++
security/selinux/selinuxfs.c | 6 ++
sound/core/compress_offload.c | 3 +
sound/core/hwdep.c | 3 +
sound/core/info.c | 3 +
sound/core/init.c | 3 +
sound/core/oss/pcm_oss.c | 3 +
sound/core/pcm_native.c | 3 +
sound/oss/soundcard.c | 3 +
sound/oss/swarm_cs4297a.c | 3 +
virt/kvm/kvm_main.c | 3 +
164 files changed, 481 insertions(+), 231 deletions(-)
4 years, 8 months
[PATCH v7 0/9] libnvdimm: add DMA supported blk-mq pmem driver
by Dave Jiang
- Per Dan's suggestions
- Moved all common code from attach_disk to pmem_core as helper functions.
- Fixed up Kconfig dependencies
- Cleaned up header file inclusions
- Removed module parameters
- Split pmem_core refactor into own patch
- Removed REQ_FLUSH define
v6:
- Put all common code for pmem drivers in pmem_core per Dan's suggestion.
- Added support code to get number of available DMA chans
- Fixed up Kconfig so that when pmem is built into the kernel, pmem_dma won't
show up.
v5:
- Added support to report descriptor transfer capability limit from dmaengine.
- Fixed up scatterlist support for dma_unmap_data per Dan's comments.
- Made the driver a separate pmem blk driver per Christoph's suggestion
and also fixed up all the issues pointed out by Christoph.
- Added pmem badblock checking/handling per Robert and also made DMA op to
be used by all buffer sizes.
v4:
- Addressed kbuild test bot issues. Passed kbuild test bot, 179 configs.
v3:
- Added patch to rename DMA_SG to DMA_SG_SG to make it explicit
- Added DMA_MEMCPY_SG transaction type to dmaengine
- Misc patch to add verification of DMA_MEMSET_SG that was missing
- Addressed all nd_pmem driver comments from Ross.
v2:
- Make dma_prep_memcpy_* into one function per Dan.
- Addressed various comments from Ross with code formatting and etc.
- Replaced open code with offset_in_page() macro per Johannes.
The following series implements a DMA driven blk-mq pmem driver and
also adds infrastructure code to ioatdma and dmaengine in order to
support copying to and from scatterlist in order to process block
requests provided by blk-mq. The use of DMA engines available on certain
platforms allows us to drastically reduce CPU utilization while at the same time
maintaining performance that is good enough. Experiments on a
DRAM-backed pmem block device showed that using the DMA engine is
beneficial. By default nd_pmem.ko will be loaded. This can be overridden
through module blacklisting in order to load nd_pmem_dma.ko.
---
Dave Jiang (9):
dmaengine: ioatdma: revert 7618d035 to allow sharing of DMA channels
dmaengine: Add DMA_MEMCPY_SG transaction op
dmaengine: ioatdma: dma_prep_memcpy_sg support
dmaengine: add function to provide per descriptor xfercap for dma engine
dmaengine: add SG support to dmaengine_unmap
dmaengine: provide number of available channels
libnvdimm: remove definition of REQ_FLUSH
libnvdimm: move common function for pmem to pmem_core
libnvdimm: Add DMA based blk-mq pmem driver
Documentation/dmaengine/provider.txt | 3
drivers/dma/dmaengine.c | 72 +++++
drivers/dma/ioat/dma.h | 4
drivers/dma/ioat/init.c | 6
drivers/dma/ioat/prep.c | 57 ++++
drivers/nvdimm/Kconfig | 28 ++
drivers/nvdimm/Makefile | 6
drivers/nvdimm/pmem.c | 408 +----------------------------
drivers/nvdimm/pmem.h | 55 ++++
drivers/nvdimm/pmem_core.c | 451 ++++++++++++++++++++++++++++++++
drivers/nvdimm/pmem_dma.c | 475 ++++++++++++++++++++++++++++++++++
include/linux/dmaengine.h | 49 +++-
12 files changed, 1213 insertions(+), 401 deletions(-)
create mode 100644 drivers/nvdimm/pmem_core.c
create mode 100644 drivers/nvdimm/pmem_dma.c
--
Signature
4 years, 8 months
[PATCH v7 0/6] BTT error clearing rework
by Vishal Verma
changes in v7:
- There were cases where any problem while attempting to clear errors
would result in the BTT thread doing IO to be locked in an infinite
retry loop. Fix those conditions by detecting any error in clearing
and bailing on that write. (Toshi, Dan).
changes in v6:
- Remove the refactoring patch where we move the namespace offset
calculations to all call sites, instead add a helper for the final
initial_offset calculation, and call it directly from btt_is_badblock
changes in v5:
- Add patch 6 that refactors initial_offset calculations, and fix a bug
that caused error clearing to be missed in some cases (Toshi)
(I have a unit test for this that is mostly ready, but it depends
on a better error injection capability in nfit_test, so I will send
it out once that is ready).
Changes in v4:
- move the deadlock fix to before enabling the BTT error clear paths (Dan)
- No need for an error lock per freelist entry, just have one per arena (Dan)
Changes in v3:
- Change the dynamically allocated (during IO) zerobuf to the kernel's
ZERO_PAGE for error clearing (patch 5) (Dan).
- Move the NOIO fixes a level down into nvdimm_clear_poison since both
btt and pmem poison clearing goes through that (Dan).
Changes in v2:
- Drop the ACPI allocation change patch. Instead use
memalloc_noio_{save,restore} to set the GFP_NOIO flag around anything
that can be expected to call into ACPI for clearing errors. (Rafael, Dan).
Clearing errors or badblocks during a BTT write requires sending an ACPI
DSM, which means potentially sleeping. Since a BTT IO happens in atomic
context (preemption disabled, spinlocks may be held), we cannot perform
error clearing in the course of an IO. Due to this, error clearing for
BTT IOs has hitherto been disabled.
This series fixes these problems by moving the error clearing out of
the atomic sections in the BTT.
Also fix a potential deadlock that can occur while clearing errors
from either BTT or pmem due to memory allocations in the IO path.
Vishal Verma (6):
btt: fix a missed NVDIMM_IO_ATOMIC case in the write path
btt: refactor map entry operations with macros
btt: ensure that flags were also unchanged during a map_read
btt: cache sector_size in arena_info
libnvdimm: fix potential deadlock while clearing errors
libnvdimm, btt: rework error clearing
drivers/nvdimm/btt.c | 137 ++++++++++++++++++++++++++++++++++++++++++-------
drivers/nvdimm/btt.h | 11 ++++
drivers/nvdimm/bus.c | 6 +++
drivers/nvdimm/claim.c | 9 +---
4 files changed, 136 insertions(+), 27 deletions(-)
--
2.9.5
4 years, 8 months
[PATCH v2] nd_blk: Remove mmio_flush_range()
by Robin Murphy
mmio_flush_range() suffers from a lack of clearly-defined semantics,
and is somewhat ambiguous to port to other architectures where the
scope of the writeback implied by "flush" and ordering might matter,
but MMIO would tend to imply non-cacheable anyway. Per the rationale
in 67a3e8fe9015 ("nd_blk: change aperture mapping from WC to WB"), the
only existing use is actually to invalidate clean cache lines for
ARCH_MEMREMAP_PMEM type mappings *without* writeback. Since the recent
cleanup of the pmem API, that also now happens to be the exact purpose
of arch_invalidate_pmem(), which would be a far more well-defined tool
for the job.
Rather than risk potentially inconsistent implementations of
mmio_flush_range() for the sake of one callsite, streamline things by
removing it entirely and instead move the ARCH_MEMREMAP_PMEM related
definitions up to the libnvdimm level, so they can be shared by NFIT
as well. This allows NFIT to be enabled for arm64.
Signed-off-by: Robin Murphy <robin.murphy(a)arm.com>
---
arch/x86/Kconfig | 1 -
arch/x86/include/asm/cacheflush.h | 2 --
drivers/acpi/nfit/Kconfig | 2 +-
drivers/acpi/nfit/core.c | 2 +-
drivers/nvdimm/pmem.h | 14 --------------
include/linux/libnvdimm.h | 15 +++++++++++++++
lib/Kconfig | 3 ---
tools/testing/nvdimm/test/nfit.c | 4 ++--
8 files changed, 19 insertions(+), 24 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..5f3b756ec0d3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,7 +53,6 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
- select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 8b4140f6724f..cb9a1af109b4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,6 +7,4 @@
void clflush_cache_range(void *addr, unsigned int size);
-#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
-
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
index 6d3351452ea2..929ba4da0b30 100644
--- a/drivers/acpi/nfit/Kconfig
+++ b/drivers/acpi/nfit/Kconfig
@@ -2,7 +2,7 @@ config ACPI_NFIT
tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
- depends on ARCH_HAS_MMIO_FLUSH
+ depends on ARCH_HAS_PMEM_API
select LIBNVDIMM
help
Infrastructure to probe ACPI 6 compliant platforms for
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d091587..ee7726a16693 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1930,7 +1930,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
else {
if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
- mmio_flush_range((void __force *)
+ arch_invalidate_pmem((void __force *)
mmio->addr.aperture + offset, c);
memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 5434321cad67..c5917f040fa7 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,20 +5,6 @@
#include <linux/pfn_t.h>
#include <linux/fs.h>
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
-void arch_wb_cache_pmem(void *addr, size_t size);
-void arch_invalidate_pmem(void *addr, size_t size);
-#else
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
-}
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
-}
-#endif
-
/* this definition is in it's own header for tools/testing/nvdimm to consume */
struct pmem_device {
/* One contiguous memory region per device */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f3d3e6af8838..d11bc9881206 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -173,4 +173,19 @@ u64 nd_fletcher64(void *addr, size_t len, bool le);
void nvdimm_flush(struct nd_region *nd_region);
int nvdimm_has_flush(struct nd_region *nd_region);
int nvdimm_has_cache(struct nd_region *nd_region);
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
+void arch_wb_cache_pmem(void *addr, size_t size);
+void arch_invalidate_pmem(void *addr, size_t size);
+#else
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
+{
+}
+static inline void arch_invalidate_pmem(void *addr, size_t size)
+{
+}
+#endif
+
#endif /* __LIBNVDIMM_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 6762529ad9e4..527da69e3be1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -559,9 +559,6 @@ config ARCH_HAS_PMEM_API
config ARCH_HAS_UACCESS_FLUSHCACHE
bool
-config ARCH_HAS_MMIO_FLUSH
- bool
-
config STACKDEPOT
bool
select STACKTRACE
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4c2fa98ef39d..d20791c3f499 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1546,8 +1546,8 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
else {
memcpy(iobuf, mmio->addr.base + dpa, len);
- /* give us some some coverage of the mmio_flush_range() API */
- mmio_flush_range(mmio->addr.base + dpa, len);
+ /* give us some some coverage of the arch_invalidate_pmem() API */
+ arch_invalidate_pmem(mmio->addr.base + dpa, len);
}
nd_region_release_lane(nd_region, lane);
--
2.13.4.dirty
4 years, 8 months
[PATCH] libnvdimm, nfit: export an 'ecc_unit_size' sysfs attribute
by Dan Williams
When the nfit driver initializes it runs an ARS (Address Range Scrub)
operation across every pmem range. Part of that process involves
determining the ARS capabilities of a given address range. One of the
capabilities that is reported is the 'Clear Uncorrectable Error Range
Length Unit Size' (see: ACPI 6.2 section 9.20.7.4 Function Index 1 -
Query ARS Capabilities). This property is of interest to userspace
software as it indicates the boundary at which the NVDIMM may need to
perform read-modify-write cycles to maintain ECC blocks.
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/acpi/nfit/core.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 2c5608b92578..03105648f9b1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1674,8 +1674,19 @@ static ssize_t range_index_show(struct device *dev,
}
static DEVICE_ATTR_RO(range_index);
+static ssize_t ecc_unit_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+
+ return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
+}
+static DEVICE_ATTR_RO(ecc_unit_size);
+
static struct attribute *acpi_nfit_region_attributes[] = {
&dev_attr_range_index.attr,
+ &dev_attr_ecc_unit_size.attr,
NULL,
};
4 years, 8 months
[PATCH v3 0/4] fs, dax: lookup dax_device at mount time
by Dan Williams
Changes since v2 [1]:
* Split fs_dax_get_by_bdev() to its own patch (Christoph)
* Push dax_device reference management into <fs>_{fill,put}_super()
rather than the generic vfs. (Christoph)
[1]: https://lists.01.org/pipermail/linux-nvdimm/2017-August/012133.html
---
Christoph notes:
I just noticed that we now do a fs_dax_get_by_host in every
iomap_begin call for DAX. This function iterates a list, does a
string compare and igrab. I really think we need to cache this in
the superblock (possibly even the fs superblock) similar to what we
do for the block device.
This passes the libnvdimm unit tests.
---
Dan Williams (4):
dax: introduce a fs_dax_get_by_bdev() helper
xfs: perform dax_device lookup at mount
ext2: perform dax_device lookup at mount
ext4: perform dax_device lookup at mount
drivers/dax/super.c | 10 ++++++++++
fs/ext2/ext2.h | 1 +
fs/ext2/inode.c | 11 +++--------
fs/ext2/super.c | 5 +++++
fs/ext4/ext4.h | 1 +
fs/ext4/inode.c | 11 +++--------
fs/ext4/super.c | 5 +++++
fs/xfs/xfs_aops.c | 13 +++++++++++++
fs/xfs/xfs_aops.h | 1 +
fs/xfs/xfs_buf.c | 4 +++-
fs/xfs/xfs_buf.h | 3 ++-
fs/xfs/xfs_iomap.c | 10 +---------
fs/xfs/xfs_super.c | 25 +++++++++++++++++++++----
include/linux/dax.h | 6 ++++++
14 files changed, 75 insertions(+), 31 deletions(-)
4 years, 8 months
[PATCH v2] Fix ext4 fault handling when mounted with -o dax,ro
by rdodgen@gmail.com
From: Randy Dodgen <dodgen(a)google.com>
If an ext4 filesystem is mounted with both the DAX and read-only
options, executables on that filesystem will fail to start (claiming
'Segmentation fault') due to the fault handler returning
VM_FAULT_SIGBUS.
This is due to the DAX fault handler (see ext4_dax_huge_fault)
attempting to write to the journal when FAULT_FLAG_WRITE is set. This is
the wrong behavior for write faults which will lead to a COW page; in
particular, this fails for readonly mounts.
This change replicates some checks from dax_iomap_fault to more
precisely reason about when a journal-write is needed.
It might be the case that this could be better handled in
ext4_iomap_begin / ext4_iomap_end (called via iomap_ops inside
dax_iomap_fault). There is some overlap already (e.g. grabbing journal
handles).
Signed-off-by: Randy Dodgen <dodgen(a)google.com>
---
I'm resending for some DMARC-proofing (thanks Ted for the explanation), a
missing Signed-off-by, and some extra cc's. Oops!
fs/ext4/file.c | 26 +++++++++++++++++++++++++-
1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d7cf0cc9b87..d512fb85a3e3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -279,7 +279,31 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool write;
+
+ /*
+ * We have to distinguish real writes from writes which will result in a
+ * COW page
+ * - COW writes need to fall-back to installing PTEs. See
+ * dax_iomap_pmd_fault.
+ * - COW writes should *not* poke the journal (the file will not be
+ * changed). Doing so would cause unintended failures when mounted
+ * read-only.
+ */
+ if (pe_size == PE_SIZE_PTE) {
+ /* See dax_iomap_pte_fault. */
+ write = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+ } else if (pe_size == PE_SIZE_PMD) {
+ /* See dax_iomap_pmd_fault. */
+ write = vmf->flags & FAULT_FLAG_WRITE;
+ if (write && !(vmf->vma->vm_flags & VM_SHARED)) {
+ split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ } else {
+ return VM_FAULT_FALLBACK;
+ }
if (write) {
sb_start_pagefault(sb);
--
2.14.1.480.gb18f417b89-goog
4 years, 8 months