[ndctl PATCH] test: Add a unit test for dax error handling
by Vishal Verma
When we have a namespace with media errors, DAX should fail when trying
to map the bad blocks for direct access, but a regular write() to the
same sector should go through the driver and clear the error.
This test checks for all of the above happening - failure for a read()
on a file with a bad block, failure on an mmap-read for the same, and
finally a successful write that clears the bad block.
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
Makefile.am | 5 +-
test/dax-errors.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++
test/dax-errors.sh | 118 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 260 insertions(+), 2 deletions(-)
create mode 100644 test/dax-errors.c
create mode 100755 test/dax-errors.sh
diff --git a/Makefile.am b/Makefile.am
index 8458277..e9231b8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -144,8 +144,8 @@ EXTRA_DIST += lib/libndctl.pc.in
CLEANFILES += lib/libndctl.pc
TESTS = test/libndctl test/dpa-alloc test/parent-uuid test/create.sh \
- test/clear.sh
-check_PROGRAMS = test/libndctl test/dpa-alloc test/parent-uuid
+ test/clear.sh test/dax-errors.sh
+check_PROGRAMS = test/libndctl test/dpa-alloc test/parent-uuid test/dax-errors
if ENABLE_DESTRUCTIVE
TESTS += test/blk-ns test/pmem-ns test/pcommit
@@ -178,3 +178,4 @@ test_dax_dev_LDADD = lib/libndctl.la
test_dax_pmd_SOURCES = test/dax-pmd.c
test_mmap_SOURCES = test/mmap.c
+test_dax_errors_SOURCES = test/dax-errors.c
diff --git a/test/dax-errors.c b/test/dax-errors.c
new file mode 100644
index 0000000..4e9bb04
--- /dev/null
+++ b/test/dax-errors.c
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <linux/fs.h>
+#include <linux/fiemap.h>
+#include <setjmp.h>
+
+#define fail() fprintf(stderr, "%s: failed at: %d\n", __func__, __LINE__)
+
+static sigjmp_buf sj_env;
+static int sig_count;
+
+static void sigbus_hdl(int sig, siginfo_t *siginfo, void *ptr)
+{
+        fprintf(stderr, "** Received a SIGBUS **\n");
+        sig_count++;
+        siglongjmp(sj_env, 1);
+}
+
+static int test_dax_read_err(int fd)
+{
+        void *base, *buf;
+        int rc = 0;
+
+        if (fd < 0) {
+                fail();
+                return -ENXIO;
+        }
+
+        if (posix_memalign(&buf, 4096, 4096) != 0)
+                return -ENOMEM;
+
+        base = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (base == MAP_FAILED) {
+                perror("mmap");
+                rc = -ENXIO;
+                goto err_mmap;
+        }
+
+        if (sigsetjmp(sj_env, 1)) {
+                if (sig_count == 1) {
+                        fprintf(stderr, "Failed to read from mapped file\n");
+                        free(buf);
+                        if (base) {
+                                if (munmap(base, 4096) < 0) {
+                                        fail();
+                                        return 1;
+                                }
+                        }
+                        return 1;
+                }
+                return sig_count;
+        }
+
+        /* read a page through DAX (should fail due to a bad block) */
+        memcpy(buf, base, 4096);
+
+ err_mmap:
+        free(buf);
+        return rc;
+}
+
+static int test_dax_write_clear(int fd)
+{
+        void *buf;
+        int rc = 0;
+
+        if (fd < 0) {
+                fail();
+                return -ENXIO;
+        }
+
+        if (posix_memalign(&buf, 4096, 4096) != 0)
+                return -ENOMEM;
+        memset(buf, 0, 4096);
+
+        /*
+         * Attempt to write zeroes to the first page of the file using
+         * write(). This should clear the pmem errors/bad blocks.
+         */
+        printf("Attempting to write\n");
+        if (write(fd, buf, 4096) < 0)
+                rc = errno;
+
+        free(buf);
+        return rc;
+}
+
+int main(int argc, char *argv[])
+{
+        int fd, rc;
+        struct sigaction act;
+
+        if (argc < 2)
+                return -EINVAL;
+
+        memset(&act, 0, sizeof(act));
+        act.sa_sigaction = sigbus_hdl;
+        act.sa_flags = SA_SIGINFO;
+
+        if (sigaction(SIGBUS, &act, 0)) {
+                fail();
+                return 1;
+        }
+
+        fd = open(argv[1], O_RDWR | O_DIRECT);
+
+        /* Start the test. First, we do an mmap-read, and expect it to fail */
+        rc = test_dax_read_err(fd);
+        if (rc == 0) {
+                fprintf(stderr, "Expected read to fail, but it succeeded\n");
+                rc = -ENXIO;
+                goto out;
+        }
+        if (rc > 1) {
+                fprintf(stderr, "Received a second SIGBUS, exiting.\n");
+                rc = -ENXIO;
+                goto out;
+        }
+        printf(" mmap-read failed as expected\n");
+
+        /* Next, do a regular (O_DIRECT) write() */
+        rc = test_dax_write_clear(fd);
+        if (rc)
+                perror("write");
+
+ out:
+        if (fd >= 0)
+                close(fd);
+        return rc;
+}
diff --git a/test/dax-errors.sh b/test/dax-errors.sh
new file mode 100755
index 0000000..72f8771
--- /dev/null
+++ b/test/dax-errors.sh
@@ -0,0 +1,118 @@
+#!/bin/bash -x
+
+DEV=""
+NDCTL="./ndctl"
+BUS="-b nfit_test.0"
+BUS1="-b nfit_test.1"
+MNT=test_dax_mnt
+FILE=image
+json2var="s/[{}\",]//g; s/:/=/g"
+rc=77
+
+err() {
+        rc=1
+        echo "test/dax-errors: failed at line $1"
+        rm -f $FILE
+        rm -f $MNT/$FILE
+        if [ -n "$blockdev" ]; then
+                umount /dev/$blockdev
+        else
+                rc=77
+        fi
+        rmdir $MNT
+        exit $rc
+}
+
+set -e
+mkdir -p $MNT
+trap 'err $LINENO' ERR
+
+# setup (reset nfit_test dimms)
+modprobe nfit_test
+$NDCTL disable-region $BUS all
+$NDCTL zero-labels $BUS all
+$NDCTL enable-region $BUS all
+
+rc=1
+
+# create pmem
+dev="x"
+json=$($NDCTL create-namespace $BUS -t pmem -m raw)
+eval $(echo $json | sed -e "$json2var")
+[ $dev = "x" ] && echo "fail: $LINENO" && exit 1
+[ $mode != "raw" ] && echo "fail: $LINENO" && exit 1
+
+# check for expected errors in the middle of the namespace
+read sector len < /sys/block/$blockdev/badblocks
+[ $((sector * 2)) -ne $((size / 512)) ] && echo "fail: $LINENO" && exit 1
+if dd if=/dev/$blockdev of=/dev/null iflag=direct bs=512 skip=$sector count=$len; then
+        echo "fail: $LINENO" && exit 1
+fi
+
+size_raw=$size
+sector_raw=$sector
+
+# convert pmem to memory mode
+json=$($NDCTL create-namespace -m memory -f -e $dev)
+eval $(echo $json | sed -e "$json2var")
+[ $mode != "memory" ] && echo "fail: $LINENO" && exit 1
+
+# check for errors relative to the offset injected by the pfn device
+read sector len < /sys/block/$blockdev/badblocks
+[ $((sector_raw - sector)) -ne $(((size_raw - size) / 512)) ] && echo "fail: $LINENO" && exit 1
+
+# check that writing clears the errors
+if ! dd of=/dev/$blockdev if=/dev/zero oflag=direct bs=512 seek=$sector count=$len; then
+        echo "fail: $LINENO" && exit 1
+fi
+
+if read sector len < /sys/block/$blockdev/badblocks; then
+        # fail if reading badblocks returns data
+        echo "fail: $LINENO" && exit 1
+fi
+
+#mkfs.xfs /dev/$blockdev -b size=4096 -f
+mkfs.ext4 /dev/$blockdev -b 4096
+mount /dev/$blockdev $MNT -o dax
+
+# prepare an image file with random data
+dd if=/dev/urandom of=$FILE bs=4096 count=4
+test -s $FILE
+
+# copy it to the dax file system
+cp $FILE $MNT/$FILE
+
+# Get the start sector for the file
+start_sect=$(filefrag -v -b512 $MNT/$FILE | grep -E "^[ ]+[0-9]+.*" | head -1 | awk '{ print $4 }' | cut -d. -f1)
+test -n "$start_sect"
+echo "start sector of the file is $start_sect"
+
+# inject badblocks for one page at the start of the file
+echo $start_sect 8 > /sys/block/$blockdev/badblocks
+
+# make sure reading the first block of the file fails as expected
+: The following 'dd' is expected to hit an I/O Error
+dd if=$MNT/$FILE of=/dev/null iflag=direct bs=4096 count=1 && err $LINENO || true
+
+# run the dax-errors test
+test -x test/dax-errors
+test/dax-errors $MNT/$FILE
+
+if read sector len < /sys/block/$blockdev/badblocks; then
+        # fail if reading badblocks returns data
+        echo "fail: $LINENO" && exit 1
+fi
+
+# cleanup
+rm -f $FILE
+rm -f $MNT/$FILE
+if [ -n "$blockdev" ]; then
+        umount /dev/$blockdev
+fi
+rmdir $MNT
+
+$NDCTL disable-region $BUS all
+$NDCTL disable-region $BUS1 all
+modprobe -r nfit_test
+
+exit 0
--
2.5.5
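An aside on the json2var idiom used in dax-errors.sh above: the sed
expression strips the JSON punctuation and turns each "key":"value" pair
into a shell assignment that eval can consume. A minimal sketch with
hypothetical create-namespace output (the trick relies on ndctl emitting
values without embedded spaces):

    json='{"dev":"namespace0.0", "mode":"raw", "size":17179869184}'
    json2var='s/[{}",]//g; s/:/=/g'
    eval $(echo $json | sed -e "$json2var")
    echo $dev $mode $size    # namespace0.0 raw 17179869184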
[ndctl PATCH 0/2] fix the nfit_test fallback for 'ndctl bat'
by Dan Williams
Per test results from Ross [1] we don't want 'ndctl bat' to require
nfit_test. Patch2 rearranges the fallback to only trigger when no
ACPI.NFIT provider is found.
This uncovered an interesting failure mode whereby
ndctl_bus_get_by_provider() failed to locate an expected bus. The fix
for that led to the creation of a new ndctl_invalidate() api, see
patch1 for the details.
[1]: https://lists.01.org/pipermail/linux-nvdimm/2016-March/005017.html
---
Dan Williams (2):
ndctl: provide a method to invalidate the bus list
ndctl: don't skip bat tests if nfit_test not available
lib/libndctl.c | 22 ++++++++++++++---
lib/libndctl.sym | 1 +
lib/ndctl/libndctl.h.in | 1 +
test/blk_namespaces.c | 62 +++++++++++++++++++++++------------------------
test/pmem_namespaces.c | 55 ++++++++++++++++++++----------------------
5 files changed, 76 insertions(+), 65 deletions(-)
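For context, a rough sketch of how a test might use the api described
above; the exact signature of ndctl_invalidate() is an assumption drawn
from this cover letter rather than a copy of the final library code:

    /* sketch: retry a bus lookup after invalidating the cached bus list
     * (ndctl_invalidate()'s signature is assumed from the cover letter) */
    #include <ndctl/libndctl.h>

    static struct ndctl_bus *get_bus_fresh(struct ndctl_ctx *ctx,
                    const char *provider)
    {
            struct ndctl_bus *bus = ndctl_bus_get_by_provider(ctx, provider);

            if (!bus) {
                    /* the bus list may predate a module load (e.g.
                     * nfit_test); drop it and re-enumerate */
                    ndctl_invalidate(ctx);
                    bus = ndctl_bus_get_by_provider(ctx, provider);
            }
            return bus;
    }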
Re: [PATCH 12/12] dax: New fault locking
by Jan Kara
On Wed 23-03-16 08:10:42, NeilBrown wrote:
> On Sat, Mar 19 2016, Jan Kara wrote:
> >
> > Actually, after some thought I don't think the wakeup is needed except for
> > dax_pfn_mkwrite(). In the other cases we know there is no radix tree
> > exceptional entry and thus there can be no waiters for its lock...
> >
>
> I think that is fragile logic - though it may be correct at present.
>
> A radix tree slot can transition from "Locked exception" to "unlocked
> exception" to "deleted" to "struct page".
Yes.
> So it is absolutely certain that a thread cannot go to sleep after
> finding a "locked exception" and wake up to find a "struct page" ??
With current implementation this should not happen but I agree entry
locking code should not rely on this.
> How about a much simpler change.
> - new local variable "slept" in lookup_unlocked_mapping_entry() which
> is set if prepare_to_wait_exclusive() gets called.
> - if after __radix_tree_lookup() returns:
> (ret==NULL || !radix_tree_exceptional_entry(ret)) && slept
> then it calls wakeup immediately - because if it was waiting,
> something else might be too.
>
> That would cover all vaguely possible cases except dax_pfn_mkwrite()
But how does this really help? If lookup_unlocked_mapping_entry() finds
there is no entry (and it was there before), the process deleting the entry
(or replacing it with something else) is responsible for waking up
everybody. So your change would only duplicate what
dax_delete_mapping_entry() does. The potential for breakage is that callers
of lookup_unlocked_mapping_entry() are responsible for waking up other
waiters *even if* they do not lock or delete the entry in the end. Maybe
I'll rename lookup_unlocked_mapping_entry() to get_unlocked_mapping_entry()
so that it is clearer that one must call either put_unlocked_mapping_entry()
or put_locked_mapping_entry() on it.
Honza
--
Jan Kara <jack(a)suse.com>
SUSE Labs, CR
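To make the contract Jan describes concrete, here is a condensed sketch of
the lookup loop; the function names follow the mail above, while
slot_locked() and wait_for_slot() are illustrative stand-ins, not the final
kernel code:

    /* Sketch: return an entry with its lock bit clear. The caller must
     * later call put_unlocked_mapping_entry() or
     * put_locked_mapping_entry(), either of which wakes the next
     * exclusive waiter. */
    static void *get_unlocked_mapping_entry(struct address_space *mapping,
                    pgoff_t index, void ***slotp)
    {
            void *entry;

            for (;;) {
                    entry = __radix_tree_lookup(&mapping->page_tree, index,
                                    NULL, slotp);
                    if (!entry || !radix_tree_exceptional_entry(entry) ||
                        !slot_locked(*slotp))
                            return entry;
                    /* exclusive wait on the entry's lock bit */
                    wait_for_slot(mapping, index);
            }
    }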
[PATCH RFC 1/1] Add support for ZONE_DEVICE IO memory with struct pages.
by Stephen Bates
From: Logan Gunthorpe <logang(a)deltatee.com>
Introduction
------------
This RFC patch adds struct page backing for IO memory and as such
allows IO memory to be used as a DMA target.
Patch Summary
-------------
We build on recent work that adds memory regions owned by a device
driver (ZONE_DEVICE) [1] and adds struct page support for these new
regions of memory [2]. The patch should apply cleanly to 4.5-rc7.
1. Add a new flag (MEMREMAP_WC) to the enumerated type in include/linux/io.h.
2. Add an extra flags argument into devm_memremap_pages to take in a
MEMREMAP_XX argument. We update the existing calls to this function to
reflect the change.
3. For completeness, we add MEMREMAP_WC support to memremap();
however we have no actual need for this functionality.
4. We add the static functions, add_zone_device_pages and
remove_zone_device_pages. These are similar to arch_add_memory except
they don't create the memory mapping. We don't believe these need to be
made arch specific, but are open to other opinions.
5. devm_memremap_pages and devm_memremap_pages_release are updated to
treat IO memory slightly differently. For IO memory we use a combination
of the appropriate io_remap function and the zone_device pages functions
created above. A flags variable and kaddr pointer are added to struct
page_map to facilitate this for the release function.
Motivation and Use Cases
------------------------
PCIe IO devices are getting faster. It is not uncommon now to find PCIe
network and storage devices that can generate and consume several GB/s.
Almost always these devices have either a high performance DMA engine, a
number of exposed PCIe BARs or both.
Until this patch, any high-performance transfer of information between
two PCIe devices has required the use of a staging buffer in system
memory. With this patch the bandwidth to system memory is not compromised
when high-throughput transfers occurs between PCIe devices. This means
that more system memory bandwidth is available to the CPU cores for data
processing and manipulation. In addition, in systems where the two PCIe
devices reside behind a PCIe switch the datapath avoids the CPU entirely.
Consumers
---------
In order to test this patch and to give an example of a consumer of this
patch we have developed a PCIe driver which can be used to bind to any
PCIe device that exposes IO memory via a PCIe BAR. A basic out of tree
module can be found at [3] which is intended only for testing and example
purposes. Assuming this RFC is received well, the first in-kernel user
would be a patchset which facilitates the Controller Memory Buffer (CMB)
feature for NVMe. This would enable RDMA devices to move data directly
to/from files on NVMe devices without the need for a round trip through
system memory.
The example out of tree module allows for the following accesses:
1. Block based access. For this we borrowed heavily from the pmem device
driver. Note that this mode allows for DAX filesystems to be mounted
on the block device. The driver uses devm_memremap_pages on a block of
PCIe memory such that files on this FS can then be mmapped using direct
access and used as a DMA target.
2. MMAP based access. The driver again uses devm_memremap_pages, then
hands out ZONE_DEVICE backed pages with its fault function. Thus, with
this patch, these mappings would now be usable as DMA targets.
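A minimal sketch of how such a consumer might call the extended interface
proposed in this RFC; the PCI plumbing is elided, the BAR choice is
arbitrary, and the error handling is illustrative:

    /* sketch: give a PCIe BAR struct page backing with a write-combined
     * mapping via the extended devm_memremap_pages() */
    static int iopmem_enable_bar_sketch(struct pci_dev *pdev,
                    struct percpu_ref *ref)
    {
            struct resource *res = &pdev->resource[0];  /* example BAR */
            void *addr;

            addr = devm_memremap_pages(&pdev->dev, res, ref, NULL,
                            MEMREMAP_WC);
            if (IS_ERR(addr))
                    return PTR_ERR(addr);

            /* pages backing this range now exist and can be handed out
             * as DMA targets by a peer device */
            return 0;
    }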
Testing and Performance
-----------------------
We have done a moderate amount of testing of this patch in a QEMU
environment and a real hardware environment. On real hardware we have
observed peer-to-peer writes of up to 4GB/s and reads of up to 1.2 GB/s.
In both cases these numbers are limitations of our consumer hardware. In
addition, we have observed that the CPU DRAM bandwidth is not impacted
when using IOPMEM, which is not the case when a traditional path through
system memory is taken.
For more information on the testing and performance results see the
GitHub site [4].
Known Issues
------------
1. Address Translation. Suggestions have been made that in certain
architectures and topologies the dma_addr_t passed to the DMA master
in a peer-2-peer transfer will not correctly route to the IO memory
intended. However in our testing to date we have not seen this to be
an issue, even in systems with IOMMUs and PCIe switches. It is our
understanding that an IOMMU only maps system memory and would not
interfere with device memory regions. (It certainly has no opportunity
to do so if the transfer gets routed through a switch).
2. Memory Segment Spacing. This patch has the same limitations that
ZONE_DEVICE does in that memory regions must be spaced at least
SECTION_SIZE bytes apart. On x86 this is 128MB and there are cases where
BARs can be placed closer together than this. Thus ZONE_DEVICE would not
be usable on neighboring BARs. For our purposes, this is not an issue as
we'd only be looking at enabling a single BAR in a given PCIe device.
More exotic use cases may have problems with this.
3. Coherency Issues. When IOMEM is written from both the CPU and a PCIe
peer there is potential for coherency issues and for writes to occur out
of order. This is something that users of this feature need to be
cognizant of and may necessitate the use of CONFIG_EXPERT. Though really,
this isn't much different than the existing situation with RDMA: if
userspace sets up an MR for remote use, they need to be careful about
using that memory region themselves.
4. Architecture. Currently this patch is applicable only to x86
architectures. The same is true for much of the code pertaining to
PMEM and ZONE_DEVICE. It is hoped that the work will be extended to other
ARCH over time.
References
----------
[1] https://lists.01.org/pipermail/linux-nvdimm/2015-August/001810.html
[2] https://lists.01.org/pipermail/linux-nvdimm/2015-October/002387.html
[3] https://github.com/sbates130272/linux-donard/tree/iopmem-rfc
[4] https://github.com/sbates130272/zone-device
Signed-off-by: Stephen Bates <stephen.bates(a)pmcs.com>
Signed-off-by: Logan Gunthorpe <logang(a)deltatee.com>
---
drivers/nvdimm/pmem.c | 4 +--
include/linux/io.h | 1 +
include/linux/memremap.h | 5 ++--
kernel/memremap.c | 78 ++++++++++++++++++++++++++++++++++++++++++++----
4 files changed, 79 insertions(+), 9 deletions(-)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8d0b546..095f358 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -184,7 +184,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
pmem->pfn_flags = PFN_DEV;
if (pmem_should_map_pages(dev)) {
pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
- &q->q_usage_counter, NULL);
+ &q->q_usage_counter, NULL, ARCH_MEMREMAP_PMEM);
pmem->pfn_flags |= PFN_MAP;
} else
pmem->virt_addr = (void __pmem *) devm_memremap(dev,
@@ -410,7 +410,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
q = pmem->pmem_queue;
devm_memunmap(dev, (void __force *) pmem->virt_addr);
pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
- &q->q_usage_counter, altmap);
+ &q->q_usage_counter, altmap, ARCH_MEMREMAP_PMEM);
pmem->pfn_flags |= PFN_MAP;
if (IS_ERR(pmem->virt_addr)) {
rc = PTR_ERR(pmem->virt_addr);
diff --git a/include/linux/io.h b/include/linux/io.h
index 32403b5..e2c8419 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -135,6 +135,7 @@ enum {
/* See memremap() kernel-doc for usage description... */
MEMREMAP_WB = 1 << 0,
MEMREMAP_WT = 1 << 1,
+ MEMREMAP_WC = 1 << 2,
};
void *memremap(resource_size_t offset, size_t size, unsigned long flags);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634..ad5cf23 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -51,12 +51,13 @@ struct dev_pagemap {
#ifdef CONFIG_ZONE_DEVICE
void *devm_memremap_pages(struct device *dev, struct resource *res,
- struct percpu_ref *ref, struct vmem_altmap *altmap);
+ struct percpu_ref *ref, struct vmem_altmap *altmap,
+ unsigned long flags);
struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, unsigned long flags)
{
/*
* Fail attempts to call devm_memremap_pages() without
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b981a7b..a739286 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -41,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
- * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ * @flags: either MEMREMAP_WB, MEMREMAP_WT or MEMREMAP_WC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
@@ -57,6 +57,9 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
* map "System RAM" with this mapping type will fail.
+ *
+ * MEMREMAP_WC - like MEMREMAP_WT but enables write combining.
+ *
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
@@ -101,6 +104,12 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
addr = ioremap_wt(offset, size);
}
+
+ if (!addr && (flags & MEMREMAP_WC)) {
+ flags &= ~MEMREMAP_WC;
+ addr = ioremap_wc(offset, size);
+ }
+
return addr;
}
EXPORT_SYMBOL(memremap);
@@ -164,13 +173,41 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+enum {
+ PAGEMAP_IO_MEM = 1 << 0,
+};
+
struct page_map {
struct resource res;
struct percpu_ref *ref;
struct dev_pagemap pgmap;
struct vmem_altmap altmap;
+ void *kaddr;
+ int flags;
};
+static int add_zone_device_pages(int nid, u64 start, u64 size)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones + ZONE_DEVICE;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ return __add_pages(nid, zone, start_pfn, nr_pages);
+}
+
+static void remove_zone_device_pages(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+}
+
void get_zone_device_page(struct page *page)
{
percpu_ref_get(page->pgmap->ref);
@@ -235,8 +272,15 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
- arch_remove_memory(align_start, align_size);
+
+ if (page_map->flags & PAGEMAP_IO_MEM) {
+ remove_zone_device_pages(align_start, align_size);
+ iounmap(page_map->kaddr);
+ } else {
+ arch_remove_memory(align_start, align_size);
+ }
pgmap_radix_release(res);
+
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
"%s: failed to free all reserved pages\n", __func__);
}
@@ -258,6 +302,8 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
* @res: "host memory" address range
* @ref: a live per-cpu reference count
* @altmap: optional descriptor for allocating the memmap from @res
+ * @flags: either MEMREMAP_WB, MEMREMAP_WT or MEMREMAP_WC
+ * see memremap() for a description of the flags
*
* Notes:
* 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
@@ -268,7 +314,8 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
* this is not enforced.
*/
void *devm_memremap_pages(struct device *dev, struct resource *res,
- struct percpu_ref *ref, struct vmem_altmap *altmap)
+ struct percpu_ref *ref, struct vmem_altmap *altmap,
+ unsigned long flags)
{
int is_ram = region_intersects(res->start, resource_size(res),
"System RAM");
@@ -277,6 +324,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
struct page_map *page_map;
unsigned long pfn;
int error, nid;
+ void *addr = NULL;
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -344,10 +392,28 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (nid < 0)
nid = numa_mem_id();
- error = arch_add_memory(nid, align_start, align_size, true);
+ if (flags & MEMREMAP_WB || !flags) {
+ error = arch_add_memory(nid, align_start, align_size, true);
+ addr = __va(res->start);
+ } else {
+ page_map->flags |= PAGEMAP_IO_MEM;
+ error = add_zone_device_pages(nid, align_start, align_size);
+ }
+
if (error)
goto err_add_memory;
+ if (!addr && (flags & MEMREMAP_WT))
+ addr = ioremap_wt(res->start, resource_size(res));
+
+ if (!addr && (flags & MEMREMAP_WC))
+ addr = ioremap_wc(res->start, resource_size(res));
+
+ if (!addr && page_map->flags & PAGEMAP_IO_MEM) {
+ remove_zone_device_pages(res->start, resource_size(res));
+ goto err_add_memory;
+ }
+
for_each_device_pfn(pfn, page_map) {
struct page *page = pfn_to_page(pfn);
@@ -355,8 +421,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_force_poison(&page->lru);
page->pgmap = pgmap;
}
+
+ page_map->kaddr = addr;
devres_add(dev, page_map);
- return __va(res->start);
+ return addr;
err_add_memory:
err_radix:
--
2.1.4
[RFC v2] [PATCH 0/10] DAX page fault locking
by Jan Kara
Hello,
this is my second attempt at DAX page fault locking rewrite. Things now
work reasonably well, it has survived full xfstests run on ext4. I guess
I need to do more mmap targetted tests to unveil issues. Guys what do you
used for DAX testing?
Changes since v1:
- handle wakeups of exclusive waiters properly
- fix cow fault races
- other minor stuff
General description
The basic idea is that we use a bit in an exceptional radix tree entry as
a lock bit and use it similarly to how page lock is used for normal faults.
That way we fix races between hole instantiation and read faults of the
same index. For now I have disabled PMD faults since there the issues with
page fault locking are even worse. Now that Matthew's multi-order radix tree
has landed, I can have a look into using that for proper locking of PMD faults
but first I want normal pages sorted out.
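As a sketch of that scheme (the exact bit assignment in the series may
differ), the lock bit lives in the exceptional entry value itself, so it
can be tested and flipped under the tree lock much like PG_locked on a
page:

    /* illustrative encoding; see the patches for the real definitions */
    #define RADIX_DAX_ENTRY_LOCK    (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)

    static inline bool dax_entry_locked(void *entry)
    {
            return (unsigned long)entry & RADIX_DAX_ENTRY_LOCK;
    }

    /* lock: set the bit while holding tree_lock; unlock: clear it and
     * wake any waiters, mirroring lock_page()/unlock_page() */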
In the end I have decided to implement the bit locking directly in the DAX
code. Originally I was thinking we could provide something generic directly
in the radix tree code but the functions DAX needs are rather specific.
Maybe someone else will have a good idea how to distill some generally useful
functions out of what I've implemented for DAX but for now I didn't bother
with that.
Honza
Re: [PATCH] pmem: don't allocate unused major device number
by Boaz Harrosh
On 03/09/2016 12:21 AM, NeilBrown wrote:
>
> When alloc_disk(0) or alloc_disk-node(0, XX) is used, the ->major
> number is completely ignored: all devices are allocated with a
> major of BLOCK_EXT_MAJOR.
>
> So there is no point allocating pmem_major.
>
> Signed-off-by: NeilBrown <neilb(a)suse.com>
Tested-by: Boaz Harrosh <boaz(a)plexistor.com>
Yes I sent in the same exact patch several times. This is not the
only driver that has this "wasted code" BTW.
I hope it will be finally removed. Thanks Neil
Boaz
> ---
> drivers/nvdimm/pmem.c | 19 +------------------
> 1 file changed, 1 insertion(+), 18 deletions(-)
>
> Hi Dan et al,
> I was recently educating myself about the behavior of alloc_disk(0).
> As I understand it, the ->major is ignored and all device numbers for all
> partitions (including '0') are allocated on demand with major number of
> BLOCK_EXT_MAJOR.
>
> So I was a little surprised to find that pmem.c allocated a major
> number which is never used - historical anomaly I suspect.
> I was a bit more surprised at the comment in:
>
> Commit: 9f53f9fa4ad1 ("libnvdimm, pmem: add libnvdimm support to the pmem driver")
>
> "The minor numbers are also more predictable by passing 0 to alloc_disk()."
>
> How can they possibly be more predictable given that they are allocated
> on-demand? Maybe discovery order is very predictable???
>
> In any case, I propose this patch but cannot test it (beyond compiling)
> as I don't have relevant hardware. And maybe some user-space code greps
> /proc/devices for "pmem" to determine if "pmem" is compiled in (though
> I sincerely hope not).
> So I cannot be certain that this patch won't break anything, but am
> hoping that if you like it you might test it.
>
> If it does prove acceptable, then similar changes would be appropriate
> for btt.c and blk.c. And drivers/memstick/core/ms_block.c and
> drivers/nvme/host/core.c. (gotta stamp out this cargo cult)
>
> drivers/lightnvm/core.c is the only driver which uses alloc_disk(0) and
> doesn't provide a 'major' number. :-(
>
> Thanks,
> NeilBrown
>
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 8d0b54670184..ec7e9e6a768e 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -47,8 +47,6 @@ struct pmem_device {
> struct badblocks bb;
> };
>
> -static int pmem_major;
> -
> static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
> {
> if (bb->count) {
> @@ -228,8 +226,6 @@ static int pmem_attach_disk(struct device *dev,
> return -ENOMEM;
> }
>
> - disk->major = pmem_major;
> - disk->first_minor = 0;
> disk->fops = &pmem_fops;
> disk->private_data = pmem;
> disk->queue = pmem->pmem_queue;
> @@ -502,26 +498,13 @@ static struct nd_device_driver nd_pmem_driver = {
>
> static int __init pmem_init(void)
> {
> - int error;
> -
> - pmem_major = register_blkdev(0, "pmem");
> - if (pmem_major < 0)
> - return pmem_major;
> -
> - error = nd_driver_register(&nd_pmem_driver);
> - if (error) {
> - unregister_blkdev(pmem_major, "pmem");
> - return error;
> - }
> -
> - return 0;
> + return nd_driver_register(&nd_pmem_driver);
> }
> module_init(pmem_init);
>
> static void pmem_exit(void)
> {
> driver_unregister(&nd_pmem_driver.drv);
> - unregister_blkdev(pmem_major, "pmem");
> }
> module_exit(pmem_exit);
>
>
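For readers unfamiliar with the convention under discussion, a minimal
sketch of the alloc_disk(0) registration pattern (illustrative, not a copy
of pmem.c):

    /* with alloc_disk(0) no driver major is needed: the block core
     * assigns BLOCK_EXT_MAJOR and a dynamic minor per device */
    disk = alloc_disk_node(0, nid);
    if (!disk)
            return -ENOMEM;
    disk->fops = &pmem_fops;
    disk->queue = queue;
    disk->flags = GENHD_FL_EXT_DEVT;  /* ->major/->first_minor unused */
    add_disk(disk);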
[PATCH v7 0/7] nvdimm: Add an IOCTL pass thru for DSM calls
by Jerry Hoemann
The NVDIMM code in the kernel supports an IOCTL interface to user
space based upon the Intel Example DSM:
http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
This interface cannot be used by other NVDIMM DSMs that support
incompatible functions.
An alternative DSM specification for Type N DSM being developed
by Hewlett Packard Enterprise can be found at:
https://github.com/HewlettPackard/hpe-nvm/tree/master/Documentation
To accommodate multiple and conflicting DSM specifications, this patch
set adds a generic "passthru" IOCTL interface which is not tied to
a particular DSM.
A new _IOC_NR ND_CMD_CALL == "10" is added for the pass thru call.
The new data structure nd_cmd_pkg serves as a wrapper for the
passthru calls. This wrapper supplies the data that the kernel
needs to make the _DSM call.
Unlike the definitions of the _DSM functions themselves, the nd_cmd_pkg
provides the calling information (input/output sizes) in a uniform
manner, making the kernel marshaling of the arguments straightforward.
This shifts the marshaling burden from the kernel to the user
space application while still permitting the kernel to internally
call _DSM functions.
The kernel functions __nd_ioctl and acpi_nfit_ctl were modified
to accommodate ND_CMD_CALL.
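To illustrate the envelope, a hypothetical user-space invocation might look
like the sketch below. The field names are reconstructed from the changelog
that follows (nd_family, nd_fw_size, and friends) and the constants are
assumptions; consult the series' ndctl.h for the authoritative layout:

    /* hypothetical sketch of an ND_CMD_CALL round trip */
    struct nd_cmd_pkg *pkg;
    size_t in = 16, out = 128;            /* DSM function payload sizes */

    pkg = calloc(1, sizeof(*pkg) + in + out);
    pkg->nd_family = NVDIMM_FAMILY_HPE1;  /* which DSM spec to use */
    pkg->nd_command = 1;                  /* DSM function number */
    pkg->nd_size_in = in;
    pkg->nd_size_out = out;
    /* caller fills pkg->nd_payload[0..in-1] with the function's input */
    rc = ioctl(dimm_fd, ND_IOCTL_CALL, pkg);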
Changes in version 7:
--------------------
0. change name ND_CMD_CALL_DSM to ND_CMD_CALL
- part of abstracting out DSM missed in version 6.
1. change name in struct nd_call_dsm
a) "ncp_" -> "nd_"
b) ncp_pot_size -> nd_fw_size
c) ncp_type -> nd_family
o) cascade name changes to other patches
2. Expanded comment around data structure nd_cmd_pkg
3. At Dan's request, hard coding "root" UUID.
a) retract extension of dsm_uuid to nvdimm_bus_descriptor.
b) reverted nfit.c/acpi_nfit_init_dsms() with the exception of
allowing function 0 in mask.
4. At Dan's request, removed "rev" from nd_cmd_pkg. Hard-coding
use of rev "1" in acpi_nfit_ctl.
Changes in version 6:
---------------------
Built against
git://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm.git
libnvdimm-pending
0. Patches "Clean-up access mode check" and "Fix security issue with DSM IOCTL"
already in above libnvdimm-pending. So omitted here.
1. Incorporated changes from Dan's RFC patch set
https://lists.01.org/pipermail/linux-nvdimm/2016-January/004049.html
2. Dan asked me to abstract out the DSM aspects from the ndm_cmd_dsmcall_pkg.
This became nd_cmd_pkg. UUIDs are no longer passed in from
user applications.
3. To accommodate multiple UUIDS, added table cmd_type_tbl which is used
to determine UUID for the acpi object by calling function 0 for
each UUID in table until success.
This table also provides a MASK field that the kernel can use
to exclude functions being called.
This table can be thought of a list of "acceptable" DSMs.
4. The cmd_type_tbl is also used by acpi_nfit_ctl to map the
external handle of calls to internal handle, UUID.
Note, code only validates that the requested type of call is one in
cmd_type_tbl, but it might not necessarily be the same found during
acpi_nfit_add_dimm. The ACPI spec appears to allow, and firmware
does implement, multiple UUIDs per object.
In the case where type is in table, but the UUID isn't supported
by the underlying firmware, firmware shall return an error when
called.
This allows for use of a secondary DSM on an object. This could
be considered a feature or a defect. This can be tightened
up if needed.
Changes in version 5:
---------------------
0. Fixed submit comment for drivers/acpi/utils.c.
Changes in version 4:
---------------------
0. Added patch to correct parameter type passed to acpi_evaluate_dsm
ACPI defines the arguments rev and func as 64 bit quantities and the ioctl
exports rev and func to user space. We want those to match the ACPI spec.
Also modified acpi_evaluate_dsm_typed and acpi_check dsm which had
similar issue.
1. nd_cmd_dsmcall_pkg rearrange a reserve and rounded up total size
to 16 byte boundary.
2. Created stand alone patch for the pre-existing security issue related
to "read only" IOCTL calls.
3. Added patch for increasing envelope size of IOCTL. Needed to
be able to read in the wrapper to know remaining size to copy in.
Note: in_env, out_env are statics sized based upon this change.
4. Moved copyin code to table driven nd_cmd_desc
Note, the last 40 lines or so of acpi_nfit_ctl will not return _DSM
data unless the size allocated in user space buffer equals
out_obj->buffer.length.
The semantic we want in the pass thru case is to return as much
of the _DSM data as the user space buffer would accommodate.
Hence, in acpi_nfit_ctl I have retained the line:
memcpy(pkg->dsm_buf + pkg->h.dsm_in,
out_obj->buffer.pointer,
min(pkg->h.dsm_size, pkg->h.dsm_out));
and the early return from the function.
Changes in version 3:
---------------------
1. Changed name ND_CMD_PASSTHRU to ND_CMD_CALL_DSM.
2. Value of ND_CMD_CALL_DSM is 10, not 100.
3. Changed name of nd_passthru_pkg to nd_cmd_dsmcall_pkg.
4. Removed separate functions for handling ND_CMD_CALL_DSM.
Moved functionality to __nd_ioctl and acpi_nfit_ctl proper.
The resultant code looks very different from prior versions.
5. BUGFIX: __nd_ioctl: Change the if read_only switch to use
_IOC_NR cmd (not ioctl_cmd) for better protection.
Do we want to make a stand alone patch for this issue?
Changes in version 2:
---------------------
1. Cleanup access mode check in nd_ioctl and nvdimm_ioctl.
2. Change name of ndn_pkg to nd_passthru_pkg
3. Adjust sizes in nd_passthru_pkg. DSM integers are 64 bit.
4. No new ioctl type, instead tunnel into the existing number space.
5. Push down one function level where determine ioctl cmd type.
6. re-work diagnostic print/dump message in pass-thru functions.
Jerry Hoemann (7):
ACPI / util: Fix acpi_evaluate_dsm() argument type
nvdimm: Add wrapper for IOCTL pass thru
nvdimm: Increase max envelope size for IOCTL
nvdimm: Add UUIDs
libnvdimm: advertise 'call_dsm' support
nvdimm: Add IOCTL pass thru functions
tools/testing/nvdimm: 'call_dsm' support
drivers/acpi/nfit.c | 142 ++++++++++++++++++++++++++++++++++-----
drivers/acpi/nfit.h | 5 ++
drivers/acpi/utils.c | 4 +-
drivers/nvdimm/bus.c | 45 ++++++++++++-
drivers/nvdimm/core.c | 3 +
drivers/nvdimm/dimm_devs.c | 8 ++-
include/acpi/acpi_bus.h | 6 +-
include/linux/libnvdimm.h | 3 +-
include/uapi/linux/ndctl.h | 53 +++++++++++++++
tools/testing/nvdimm/test/nfit.c | 13 +++-
10 files changed, 254 insertions(+), 28 deletions(-)
--
1.7.11.3
[ndctl PATCH 0/5] refresh 'ndctl bat' tests and other misc updates
by Dan Williams
After discovering a segfault in test/blk-ns (fixed in patch 2) it became
clear that these tests could stand to have some more exposure. These
updates make the tests fallback to nfit_test resources when run on a
platform that lacks an ACPI.NFIT.
---
Dan Williams (5):
ndctl: update pkg-config description
ndctl: fix blk-ns test cleanup
Revert "ndctl: fix error handling in ND_BLK & PMEM tests"
ndctl: convert 'ndctl bat' to use struct ndctl_test
ndctl: fall back to nfit_test for test/{blk|pmem}-ns
Makefile.am | 12 ++--
builtin-bat.c | 27 +++++++--
lib/libndctl.pc.in | 2 -
test.h | 6 +-
test/blk_namespaces.c | 149 +++++++++++++++++++++++++++++++-----------------
test/pcommit.c | 19 +++++-
test/pmem_namespaces.c | 124 ++++++++++++++++++++++++++++------------
7 files changed, 232 insertions(+), 107 deletions(-)
Re: [PATCH 12/12] dax: New fault locking
by Jan Kara
On Wed 16-03-16 08:34:28, NeilBrown wrote:
> On Fri, Mar 11 2016, NeilBrown wrote:
>
> > On Fri, Mar 11 2016, Jan Kara wrote:
> >
> >> Currently DAX page fault locking is racy.
> >>
> >> CPU0 (write fault)                        CPU1 (read fault)
> >>
> >> __dax_fault()                             __dax_fault()
> >>   get_block(inode, block, &bh, 0) -> not mapped
> >>                                             get_block(inode, block, &bh, 0)
> >>                                               -> not mapped
> >>   if (!buffer_mapped(&bh))
> >>     if (vmf->flags & FAULT_FLAG_WRITE)
> >>       get_block(inode, block, &bh, 1) -> allocates blocks
> >>   if (page) -> no
> >>                                             if (!buffer_mapped(&bh))
> >>                                               if (vmf->flags & FAULT_FLAG_WRITE) {
> >>                                               } else {
> >>                                                 dax_load_hole();
> >>                                               }
> >>   dax_insert_mapping()
> >>
> >> And we are in a situation where we fail in dax_radix_entry() with -EIO.
> >>
> >> Another problem with the current DAX page fault locking is that there is
> >> no race-free way to clear dirty tag in the radix tree. We can always
> >> end up with clean radix tree and dirty data in CPU cache.
> >>
> >> We fix the first problem by introducing locking of exceptional radix
> >> tree entries in DAX mappings acting very similarly to page lock and thus
> >> synchronizing properly faults against the same mapping index. The same
> >> lock can later be used to avoid races when clearing radix tree dirty
> >> tag.
> >
> > Hi,
> > I think the exception locking bits look good - I cannot comment on the
> > rest.
> > I looks like it was a good idea to bring the locking into dax.c instead
> > of trying to make it generic.
> >
>
> Actually ... I'm still bothered by the exclusive waiting. If an entry
> is locked and there are two threads in dax_pfn_mkwrite() then one would
> be woken up when the entry is unlocked and it will just set the TAG_DIRTY
> flag and then continue without ever waking the next waiter on the
> wait queue.
>
> I *think* that any thread which gets an exclusive wakeup is responsible
> for performing another wakeup. In this case it must either lock the
> slot, or call __wakeup.
> That means:
> grab_mapping_entry needs to call wakeup:
> if radix_tree_preload() fails
> if radix_tree_insert fails other than with -EEXIST
> if a valid page was found
Why would we need to call wake up when a valid page was found? In that case
there should not be any process waiting for the radix tree entry lock.
Otherwise I agree with you. Thanks for pointing this out, you've likely
saved me quite some debugging ;).
> dax_delete_mapping_entry needs to call wakeup
> if the fail case, though as that isn't expect (WARN_ON_ONCE)
> it should be a problem not to wakeup here
> dax_pfn_mkwrite needs to call wakeup unconditionally
Honza
--
Jan Kara <jack(a)suse.com>
SUSE Labs, CR
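Summarizing the rule the thread converges on, in sketch form (the helper
names come from the earlier mail; the snippet is illustrative):

    /* A thread woken exclusively must either take the entry lock or
     * pass the wakeup on; put_unlocked_mapping_entry() embodies the
     * latter for callers that bail out without locking. */
    entry = get_unlocked_mapping_entry(mapping, index, &slot);
    if (!entry || !radix_tree_exceptional_entry(entry)) {
            /* we will not lock the entry, so hand any exclusive wakeup
             * we may have consumed on to the next waiter */
            put_unlocked_mapping_entry(mapping, index, entry);
    }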
Re: [PATCH 0/4] Remove un-needed 'major' registration when alloc_disk(0) is used.
by Jens Axboe
On 03/15/2016 03:15 PM, NeilBrown wrote:
> On Tue, Mar 15 2016, Ross Zwisler wrote:
>
>> On Thu, Mar 10, 2016 at 08:59:28AM +1100, NeilBrown wrote:
>>> When alloc_disk(0) is used, the ->major number is ignored and
>>> irrelevant. Yet several drivers register a major number anyway.
>>>
>>> This series of patches removes the pointless registrations. The pmem
>>> driver also does this, but a patch has already been sent for that
>>> driver.
>>>
>>> Note that I am not in a position to test these beyond simple compile
>>> testing.
>>>
>>> Thanks,
>>> NeilBrown
>>>
>>>
>>> ---
>>>
>>> NeilBrown (4):
>>> nvdimm/blk: don't allocate unused major device number
>>> nvdimm/btt: don't allocate unused major device number
>>> memstick: don't allocate unused major for ms_block
>>> NVMe: don't allocate unused nvme_major
>>>
>>>
>>> drivers/memstick/core/ms_block.c | 17 ++---------------
>>> drivers/nvdimm/blk.c | 18 +-----------------
>>> drivers/nvdimm/btt.c | 19 ++-----------------
>>> drivers/nvme/host/core.c | 16 +---------------
>>> 4 files changed, 6 insertions(+), 64 deletions(-)
>>
>> There are several other drivers that allocate a major, but then use it for
>> some small number of minors (1 for null_blk.c and 16 for virtio_blk.c). They
>> both have GENHD_FL_EXT_DEVT set, so I think what happens is that after we
>> exhaust the allocated minors they hop over to having BLOCK_EXT_MAJOR as a
>> major and a dynamically assigned minor.
>
> null_blk looks like it would be safe to convert - it is just used for
> testing. Jens Axboe would probably know for sure.
>
> virtio_blk is a much older and there may will be code which has some
> sort of expectations about minor numbers. I think it would not be worth
> the risks to change it.
Agree on both - null_blk can be trivially converted, and I too would be
worried about virtio_blk changes breaking existing assumptions.
--
Jens Axboe