[PATCH v2 1/4] test: Don't skip tests if nfit modules are missing
by Santosh Sivaraj
For NFIT to be available ACPI is a must, so don't fail when nfit modules
are missing on a platform that doesn't support ACPI.
Signed-off-by: Santosh Sivaraj <santosh(a)fossix.org>
---
test.h | 2 +-
test/ack-shutdown-count-set.c | 2 +-
test/blk_namespaces.c | 2 +-
test/core.c | 23 +++++++++++++++++++++--
test/dpa-alloc.c | 2 +-
test/dsm-fail.c | 2 +-
test/libndctl.c | 2 +-
test/multi-pmem.c | 2 +-
test/parent-uuid.c | 2 +-
test/pmem_namespaces.c | 2 +-
10 files changed, 30 insertions(+), 11 deletions(-)
Changelog:
v2:
* Patch 2: Fix a bug, I skip erroring out if PAPR family, but condition had INTEL family instead.
That change was there to test the same code on x86, but was accidentally committed. Now there is
an environment variable to force testing the PAPR family on x86.
* Patch 4: Remove stray code, artifact of refactoring in patch 1.
diff --git a/test.h b/test.h
index 3f6212e..94d8936 100644
--- a/test.h
+++ b/test.h
@@ -30,7 +30,7 @@ void builtin_xaction_namespace_reset(void);
struct kmod_ctx;
struct kmod_module;
-int nfit_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
+int ndctl_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
struct ndctl_ctx *nd_ctx, int log_level,
struct ndctl_test *test);
diff --git a/test/ack-shutdown-count-set.c b/test/ack-shutdown-count-set.c
index 742e976..6315a94 100644
--- a/test/ack-shutdown-count-set.c
+++ b/test/ack-shutdown-count-set.c
@@ -99,7 +99,7 @@ static int test_ack_shutdown_count_set(int loglevel, struct ndctl_test *test,
int result = EXIT_FAILURE, err;
ndctl_set_log_priority(ctx, loglevel);
- err = nfit_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
if (err < 0) {
result = 77;
ndctl_test_skip(test);
diff --git a/test/blk_namespaces.c b/test/blk_namespaces.c
index 437fcad..dfb0332 100644
--- a/test/blk_namespaces.c
+++ b/test/blk_namespaces.c
@@ -240,7 +240,7 @@ int test_blk_namespaces(int log_level, struct ndctl_test *test,
if (!bus) {
fprintf(stderr, "ACPI.NFIT unavailable falling back to nfit_test\n");
- rc = nfit_test_init(&kmod_ctx, &mod, NULL, log_level, test);
+ rc = ndctl_test_init(&kmod_ctx, &mod, NULL, log_level, test);
ndctl_invalidate(ctx);
bus = ndctl_bus_get_by_provider(ctx, "nfit_test.0");
if (rc < 0 || !bus) {
diff --git a/test/core.c b/test/core.c
index 5118d86..8e48fd6 100644
--- a/test/core.c
+++ b/test/core.c
@@ -21,6 +21,7 @@
#include <util/log.h>
#include <util/sysfs.h>
#include <ndctl/libndctl.h>
+#include <ndctl/ndctl.h>
#include <ccan/array_size/array_size.h>
#define KVER_STRLEN 20
@@ -116,11 +117,11 @@ int ndctl_test_get_skipped(struct ndctl_test *test)
return test->skip;
}
-int nfit_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
+int ndctl_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
struct ndctl_ctx *nd_ctx, int log_level,
struct ndctl_test *test)
{
- int rc;
+ int rc, family = NVDIMM_FAMILY_INTEL;
unsigned int i;
const char *name;
struct ndctl_bus *bus;
@@ -137,6 +138,19 @@ int nfit_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
"nd_e820",
"nd_pmem",
};
+ char *test_env;
+
+ /* Do we want to force test PAPR? */
+ test_env = getenv("NDCTL_TEST_FAMILY");
+ if (test_env && strcmp(test_env, "PAPR") == 0)
+ family = NVDIMM_FAMILY_PAPR;
+
+ /* ACPI is a must for nfit, so if ACPI is not available let's default to
+ * PAPR */
+ if (access("/sys/bus/acpi", F_OK) == -1) {
+ if (errno == ENOENT)
+ family = NVDIMM_FAMILY_PAPR;
+ }
log_init(&log_ctx, "test/init", "NDCTL_TEST");
log_ctx.log_priority = log_level;
@@ -195,6 +209,11 @@ retry:
path = kmod_module_get_path(*mod);
if (!path) {
+ if (family == NVDIMM_FAMILY_PAPR &&
+ (strcmp(name, "nfit") == 0 ||
+ strcmp(name, "nd_e820") == 0))
+ continue;
+
log_err(&log_ctx, "%s.ko: failed to get path\n", name);
break;
}
diff --git a/test/dpa-alloc.c b/test/dpa-alloc.c
index b757b9a..10af189 100644
--- a/test/dpa-alloc.c
+++ b/test/dpa-alloc.c
@@ -299,7 +299,7 @@ int test_dpa_alloc(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx)
return 77;
ndctl_set_log_priority(ctx, loglevel);
- err = nfit_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
if (err < 0) {
ndctl_test_skip(test);
fprintf(stderr, "nfit_test unavailable skipping tests\n");
diff --git a/test/dsm-fail.c b/test/dsm-fail.c
index b2c51db..1d03470 100644
--- a/test/dsm-fail.c
+++ b/test/dsm-fail.c
@@ -356,7 +356,7 @@ int test_dsm_fail(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx)
int result = EXIT_FAILURE, err;
ndctl_set_log_priority(ctx, loglevel);
- err = nfit_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
if (err < 0) {
result = 77;
ndctl_test_skip(test);
diff --git a/test/libndctl.c b/test/libndctl.c
index 994e0fa..5043ae0 100644
--- a/test/libndctl.c
+++ b/test/libndctl.c
@@ -2696,7 +2696,7 @@ int test_libndctl(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx)
daxctl_set_log_priority(daxctl_ctx, loglevel);
ndctl_set_private_data(ctx, test);
- err = nfit_test_init(&kmod_ctx, &mod, ctx, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, ctx, loglevel, test);
if (err < 0) {
ndctl_test_skip(test);
fprintf(stderr, "nfit_test unavailable skipping tests\n");
diff --git a/test/multi-pmem.c b/test/multi-pmem.c
index cb7cd40..111aa28 100644
--- a/test/multi-pmem.c
+++ b/test/multi-pmem.c
@@ -259,7 +259,7 @@ int test_multi_pmem(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx
ndctl_set_log_priority(ctx, loglevel);
- err = nfit_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
if (err < 0) {
result = 77;
ndctl_test_skip(test);
diff --git a/test/parent-uuid.c b/test/parent-uuid.c
index f41ca2c..1e5a503 100644
--- a/test/parent-uuid.c
+++ b/test/parent-uuid.c
@@ -230,7 +230,7 @@ int test_parent_uuid(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ct
return 77;
ndctl_set_log_priority(ctx, loglevel);
- err = nfit_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
+ err = ndctl_test_init(&kmod_ctx, &mod, NULL, loglevel, test);
if (err < 0) {
ndctl_test_skip(test);
fprintf(stderr, "nfit_test unavailable skipping tests\n");
diff --git a/test/pmem_namespaces.c b/test/pmem_namespaces.c
index eac56ce..afa79a2 100644
--- a/test/pmem_namespaces.c
+++ b/test/pmem_namespaces.c
@@ -203,7 +203,7 @@ int test_pmem_namespaces(int log_level, struct ndctl_test *test,
if (!bus) {
fprintf(stderr, "ACPI.NFIT unavailable falling back to nfit_test\n");
- rc = nfit_test_init(&kmod_ctx, &mod, NULL, log_level, test);
+ rc = ndctl_test_init(&kmod_ctx, &mod, NULL, log_level, test);
ndctl_invalidate(ctx);
bus = ndctl_bus_get_by_provider(ctx, "nfit_test.0");
if (rc < 0 || !bus) {
--
2.29.2
1 month, 3 weeks
[0/7] PMEM device emulation without nfit dependency
by Santosh Sivaraj
The current test module cannot be used for testing platforms (make check)
that do not have support for NFIT. In order to get the ndctl tests working,
we need a module which can emulate NVDIMM devices without relying on
ACPI/NFIT.
The emulated PMEM device is made part of the PAPR family.
Corresponding changes for ndctl are also required, to add attributes needed
for the test, which will be sent as a reply to this patch.
None of the tests passed on PAPR before; now there are 16 tests that pass. Error
injection tests and SMART are not yet implemented.
Santosh Sivaraj (7):
testing/nvdimm: Add test module for non-nfit platforms
ndtest: Add compatability string to treat it as PAPR family
ndtest: Add dimms to the two buses
ndtest: Add dimm attributes
ndtest: Add regions and mappings to the test buses
ndtest: Add nvdimm control functions
ndtest: Add papr health related flags
tools/testing/nvdimm/config_check.c | 3 +-
tools/testing/nvdimm/test/Kbuild | 6 +-
tools/testing/nvdimm/test/ndtest.c | 1138 +++++++++++++++++++++++++++
tools/testing/nvdimm/test/ndtest.h | 109 +++
4 files changed, 1254 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/nvdimm/test/ndtest.c
create mode 100644 tools/testing/nvdimm/test/ndtest.h
--
2.26.2
1 month, 3 weeks
[GIT PULL] libnvdimm + device-dax for v5.12-rc1
by Dan Williams
Hi Linus, please pull from:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-for-5.12
...to receive some miscellaneous cleanups and a fix for v5.12. This
mainly continues the kernel wide effort to remove a return code from
the remove() callback in the driver model. The fix addresses a return
code polarity typo in the new sysfs attribute to manually specify a
device-dax instance mapping range. This has all appeared in -next with
no reported issues.
---
The following changes since commit 1048ba83fb1c00cd24172e23e8263972f6b5d9ac:
Linux 5.11-rc6 (2021-01-31 13:50:09 -0800)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-for-5.12
for you to fetch changes up to 64ffe84320745ea836555ad207ebfb0e896b6167:
Merge branch 'for-5.12/dax' into for-5.12/libnvdimm (2021-02-23
18:13:45 -0800)
----------------------------------------------------------------
libnvdimm + device-dax for 5.12
- Fix the error code polarity for the device-dax/mapping attribute
- For the device-dax and libnvdimm bus implementations stop implementing
a useless return code for the remove() callback.
- Miscellaneous cleanups
----------------------------------------------------------------
Dan Williams (1):
Merge branch 'for-5.12/dax' into for-5.12/libnvdimm
Shiyang Ruan (1):
device-dax: Fix default return code of range_parse()
Uwe Kleine-König (7):
libnvdimm/dimm: Simplify nvdimm_remove()
libnvdimm: Make remove callback return void
device-dax: Prevent registering drivers without probe callback
device-dax: Properly handle drivers without remove callback
device-dax: Fix error path in dax_driver_register
device-dax: Drop an empty .remove callback
dax-device: Make remove callback return void
drivers/dax/bus.c | 24 +++++++++++++++++++++---
drivers/dax/bus.h | 2 +-
drivers/dax/device.c | 8 +-------
drivers/dax/kmem.c | 7 ++-----
drivers/dax/pmem/compat.c | 3 +--
drivers/nvdimm/blk.c | 3 +--
drivers/nvdimm/bus.c | 13 +++++--------
drivers/nvdimm/dimm.c | 7 +------
drivers/nvdimm/pmem.c | 4 +---
drivers/nvdimm/region.c | 4 +---
include/linux/nd.h | 2 +-
11 files changed, 36 insertions(+), 41 deletions(-)
1 month, 3 weeks
[GIT PULL] Compute Express Link (CXL) for v5.12-rc1
by Dan Williams
Hi Linus, please pull from:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm tags/cxl-for-5.12
...to receive an initial driver for CXL 2.0 Memory Devices. Technical
details are in the tag message and Documentation/. I am taking this
through nvdimm.git this first cycle until the cxl.git repository and
maintainer team can be set up on git.kernel.org.
In terms of why merge this initial driver now, it establishes just
enough functionality to enumerate these devices and issue all
administrative commands. It sets a v5.12 baseline to develop the more
complicated higher order functionality like memory device
interleaving, persistent memory support, and hotplug which entangle
with ACPI, LIBNVDIMM, and PCI.
The focus of this release is establishing the ioctl UAPI for the
management commands. Similar to NVME there are a set of standard
commands as well as the possibility for vendor specific commands.
Unlike the NVME driver the CXL driver does not enable vendor specific
command functionality by default. This conservatism is out of concern
for the fact that CXL interleaves memory across devices and implements
host memory. The system integrity implications of some commands are
more severe than NVME and vendor specific functionality is mostly
unauditable. This will be an ongoing topic of discussion with the
wider CXL community for the next few months.
The driver has been developed in the open since November against a
work-in-progress QEMU emulation of the CXL device model. That QEMU
effort has recently attracted contributions from multiple hardware
vendors.
The driver has appeared in -next. It collected some initial static
analysis fixes and build-robot reports, but all quiet in -next for the
past week.
A list of review tags that arrived after the branch for -next was cut
is appended to the tag message below.
---
The following changes since commit 1048ba83fb1c00cd24172e23e8263972f6b5d9ac:
Linux 5.11-rc6 (2021-01-31 13:50:09 -0800)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm tags/cxl-for-5.12
for you to fetch changes up to 88ff5d466c0250259818f3153dbdc4af1f8615dd:
cxl/mem: Fix potential memory leak (2021-02-22 14:44:39 -0800)
----------------------------------------------------------------
cxl for 5.12
Introduce an initial driver for CXL 2.0 Type-3 Memory Devices. CXL is
Compute Express Link which released the 2.0 specification in November.
The Linux relevant changes in CXL 2.0 are support for an OS to
dynamically assign address space to memory devices, support for
switches, persistent memory, and hotplug. A Type-3 Memory Device is a
PCI enumerated device presenting the CXL Memory Device Class Code and
implementing the CXL.mem protocol. CXL.mem allows a device to advertise
CPU and I/O coherent memory to the system, i.e. typical "System RAM" and
"Persistent Memory" in Linux /proc/iomem terms.
In addition to the CXL.mem fast path there is an administrative command
hardware mailbox interface for maintenance and provisioning. It is this
command interface that is the focus of the initial driver. With this
driver a CXL device that is mapped by the BIOS can be administered by
Linux. Linux support for CXL PMEM and dynamic CXL address space
management are to be implemented post v5.12.
4cdadfd5e0a7 cxl/mem: Introduce a driver for CXL-2.0-Type-3 endpoints
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
8adaf747c9f0 cxl/mem: Find device capabilities
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
b39cb1052a5c cxl/mem: Register CXL memX devices
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
13237183c735 cxl/mem: Add a "RAW" send command
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
472b1ce6e9d6 cxl/mem: Enable commands via CEL
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
57ee605b976c cxl/mem: Add set of informational commands
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
----------------------------------------------------------------
Ben Widawsky (7):
cxl/mem: Find device capabilities
cxl/mem: Add basic IOCTL interface
cxl/mem: Add a "RAW" send command
cxl/mem: Enable commands via CEL
cxl/mem: Add set of informational commands
MAINTAINERS: Add maintainers of the CXL driver
cxl/mem: Fix potential memory leak
Dan Carpenter (1):
cxl/mem: Return -EFAULT if copy_to_user() fails
Dan Williams (2):
cxl/mem: Introduce a driver for CXL-2.0-Type-3 endpoints
cxl/mem: Register CXL memX devices
.clang-format | 1 +
Documentation/ABI/testing/sysfs-bus-cxl | 26 +
Documentation/driver-api/cxl/index.rst | 12 +
Documentation/driver-api/cxl/memory-devices.rst | 46 +
Documentation/driver-api/index.rst | 1 +
Documentation/userspace-api/ioctl/ioctl-number.rst | 1 +
MAINTAINERS | 11 +
drivers/Kconfig | 1 +
drivers/Makefile | 1 +
drivers/cxl/Kconfig | 53 +
drivers/cxl/Makefile | 7 +
drivers/cxl/bus.c | 29 +
drivers/cxl/cxl.h | 95 ++
drivers/cxl/mem.c | 1552 ++++++++++++++++++++
drivers/cxl/pci.h | 31 +
include/linux/pci_ids.h | 1 +
include/uapi/linux/cxl_mem.h | 172 +++
17 files changed, 2040 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-bus-cxl
create mode 100644 Documentation/driver-api/cxl/index.rst
create mode 100644 Documentation/driver-api/cxl/memory-devices.rst
create mode 100644 drivers/cxl/Kconfig
create mode 100644 drivers/cxl/Makefile
create mode 100644 drivers/cxl/bus.c
create mode 100644 drivers/cxl/cxl.h
create mode 100644 drivers/cxl/mem.c
create mode 100644 drivers/cxl/pci.h
create mode 100644 include/uapi/linux/cxl_mem.h
1 month, 3 weeks
Re: [PATCH RFC 0/9] mm, sparse-vmemmap: Introduce compound pagemaps
by Dan Williams
On Tue, Feb 23, 2021 at 5:00 PM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
>
> On Tue, Feb 23, 2021 at 04:14:01PM -0800, Dan Williams wrote:
> > [ add Ralph ]
> >
> > On Tue, Feb 23, 2021 at 3:07 PM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
> > >
> > > On Tue, Feb 23, 2021 at 02:48:20PM -0800, Dan Williams wrote:
> > > > On Tue, Feb 23, 2021 at 10:54 AM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
> > > > >
> > > > > On Tue, Feb 23, 2021 at 08:44:52AM -0800, Dan Williams wrote:
> > > > >
> > > > > > > The downside would be one extra lookup in dev_pagemap tree
> > > > > > > for other pgmap->types (P2P, FSDAX, PRIVATE). But just one
> > > > > > > per gup-fast() call.
> > > > > >
> > > > > > I'd guess a dev_pagemap lookup is faster than a get_user_pages slow
> > > > > > path. It should be measurable that this change is at least as fast or
> > > > > > faster than falling back to the slow path, but it would be good to
> > > > > > measure.
> > > > >
> > > > > What is the dev_pagemap thing doing in gup fast anyhow?
> > > > >
> > > > > I've been wondering for a while..
> > > >
> > > > It's there to synchronize against dax-device removal. The device will
> > > > suspend removal awaiting all page references to be dropped, but
> > > > gup-fast could be racing device removal. So gup-fast checks for
> > > > pte_devmap() to grab a live reference to the device before assuming it
> > > > can pin a page.
> > >
> > > From the perspective of CPU A it can't tell if CPU B is doing a HW
> > > page table walk or a GUP fast when it invalidates a page table. The
> > > design of gup-fast is supposed to be the same as the design of a HW
> > > page table walk, and the tlb invalidate CPU A does when removing a
> > > page from a page table is supposed to serialize against both a HW page
> > > table walk and gup-fast.
> > >
> > > Given that the HW page table walker does not do dev_pagemap stuff, why
> > > does gup-fast?
> >
> > gup-fast historically assumed that the 'struct page' and memory
> > backing the page-table walk could not physically be removed from the
> > system during its walk because those pages were allocated from the
> > page allocator before being mapped into userspace.
>
> No, I'd say gup-fast assumes that any non-special PTE it finds in a
> page table must have a struct page.
>
> If something wants to remove that struct page it must first remove all
> the PTEs pointing at it from the entire system and flush the TLBs,
> which directly prevents a future gup-fast from running and trying to
> access the struct page. No extra locking needed
>
> > implied elevated reference on any page that gup-fast would be asked to
> > walk, or pte_special() is there to "say wait, nevermind this isn't a
> > page allocator page fallback to gup-slow()".
>
> pte_special says there is no struct page, and some of those cases can
> be fixed up in gup-slow.
>
> > > Can you sketch the exact race this is protecting against?
> >
> > Thread1 mmaps /mnt/daxfile1 from a "mount -o dax" filesystem and
> > issues direct I/O with that mapping as the target buffer, Thread2 does
> > "echo "namespace0.0" > /sys/bus/nd/drivers/nd_pmem/unbind". Without
> > the dev_pagemap check reference gup-fast could execute
> > get_page(pte_page(pte)) on a page that doesn't even exist anymore
> > because the driver unbind has already performed remove_pages().
>
> Surely the unbind either waits for all the VMAs to be destroyed or
> zaps them before allowing things to progress to remove_pages()?
If we're talking about device-dax this is precisely what it does, zaps
and prevents new faults from resolving, but filesystem-dax...
> Having a situation where the CPU page tables still point at physical
> pages that have been removed sounds so crazy/insecure, that can't be
> what is happening, can it??
Hmm, that may be true and an original dax bug! The unbind of a
block-device from underneath the filesystem does trigger the
filesystem to emergency shutdown / go read-only, but unless that
process also includes a global zap of all dax mappings not only is
that violating expectations of "page-tables to disappearing memory",
but the filesystem may also want to guarantee that no further dax
writes can happen after shutdown. Right now I believe it only assumes
that mmap I/O will come from page writeback so there's no need to
bother applications with mappings to page cache, but dax mappings need
to be ripped away.
/me goes to look at what filesystems guarantee when the block-device is
surprise removed out from under them.
In any event, this accelerates the effort to go implement
fs-global-dax-zap at the request of the device driver.
1 month, 3 weeks
Re: [PATCH RFC 0/9] mm, sparse-vmemmap: Introduce compound pagemaps
by Dan Williams
[ add Ralph ]
On Tue, Feb 23, 2021 at 3:07 PM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
>
> On Tue, Feb 23, 2021 at 02:48:20PM -0800, Dan Williams wrote:
> > On Tue, Feb 23, 2021 at 10:54 AM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
> > >
> > > On Tue, Feb 23, 2021 at 08:44:52AM -0800, Dan Williams wrote:
> > >
> > > > > The downside would be one extra lookup in dev_pagemap tree
> > > > > for other pgmap->types (P2P, FSDAX, PRIVATE). But just one
> > > > > per gup-fast() call.
> > > >
> > > > I'd guess a dev_pagemap lookup is faster than a get_user_pages slow
> > > > path. It should be measurable that this change is at least as fast or
> > > > faster than falling back to the slow path, but it would be good to
> > > > measure.
> > >
> > > What is the dev_pagemap thing doing in gup fast anyhow?
> > >
> > > I've been wondering for a while..
> >
> > It's there to synchronize against dax-device removal. The device will
> > suspend removal awaiting all page references to be dropped, but
> > gup-fast could be racing device removal. So gup-fast checks for
> > pte_devmap() to grab a live reference to the device before assuming it
> > can pin a page.
>
> From the perspective of CPU A it can't tell if CPU B is doing a HW
> page table walk or a GUP fast when it invalidates a page table. The
> design of gup-fast is supposed to be the same as the design of a HW
> page table walk, and the tlb invalidate CPU A does when removing a
> page from a page table is supposed to serialize against both a HW page
> table walk and gup-fast.
>
> Given that the HW page table walker does not do dev_pagemap stuff, why
> does gup-fast?
gup-fast historically assumed that the 'struct page' and memory
backing the page-table walk could not physically be removed from the
system during its walk because those pages were allocated from the
page allocator before being mapped into userspace. So there is an
implied elevated reference on any page that gup-fast would be asked to
walk, or pte_special() is there to "say wait, nevermind this isn't a
page allocator page fallback to gup-slow()". pte_devmap() is there to
say "wait, there is no implied elevated reference for this page, check
and hold dev_pagemap alive until a page reference can be taken". So it
splits the difference between pte_special() and typical page allocator
pages.
> Can you sketch the exact race this is protecting against?
Thread1 mmaps /mnt/daxfile1 from a "mount -o dax" filesystem and
issues direct I/O with that mapping as the target buffer, Thread2 does
"echo "namespace0.0" > /sys/bus/nd/drivers/nd_pmem/unbind". Without
the dev_pagemap check reference gup-fast could execute
get_page(pte_page(pte)) on a page that doesn't even exist anymore
because the driver unbind has already performed remove_pages().
Effectively the same percpu_ref that protects the pmem0 block device
from new command submissions while the device is dying also prevents
new dax page references being taken while the device is dying.
This could be solved with the traditional gup-fast rules if the device
driver could tell the filesystem to unmap all dax files and force them
to re-fault through the gup-slow path to see that the device is now
dying. I'll likely be working on that sooner rather than later given
some of the expectations of the CXL persistent memory "dirty shutdown"
detection.
1 month, 3 weeks
Re: [PATCH RFC 0/9] mm, sparse-vmemmap: Introduce compound pagemaps
by Dan Williams
On Tue, Feb 23, 2021 at 10:54 AM Jason Gunthorpe <jgg(a)ziepe.ca> wrote:
>
> On Tue, Feb 23, 2021 at 08:44:52AM -0800, Dan Williams wrote:
>
> > > The downside would be one extra lookup in dev_pagemap tree
> > > for other pgmap->types (P2P, FSDAX, PRIVATE). But just one
> > > per gup-fast() call.
> >
> > I'd guess a dev_pagemap lookup is faster than a get_user_pages slow
> > path. It should be measurable that this change is at least as fast or
> > faster than falling back to the slow path, but it would be good to
> > measure.
>
> What is the dev_pagemap thing doing in gup fast anyhow?
>
> I've been wondering for a while..
It's there to synchronize against dax-device removal. The device will
suspend removal awaiting all page references to be dropped, but
gup-fast could be racing device removal. So gup-fast checks for
pte_devmap() to grab a live reference to the device before assuming it
can pin a page.
1 month, 3 weeks
[ndctl PATCH v2 00/13] Initial CXL support
by Vishal Verma
Changes since v1[1]:
- Add 'firmware_version' retrieval for memdevs via sysfs attribute
- Add private data storage and accessors for libcxl
- Add a local copy of the UAPI header (cxl_mem.h)
- Refactor 'Identify' command support into a single patch
- Add libcxl APIs for get_lsa
- Add libcxl APIs for get_health_info
- Add libcxl APIs for firmware_status and out.size from cmd response
- Refactor common test helpers to make them more generic
- Add a hexdump helper in util/
- Add a new unit test, test/libcxl which tests:
- Basic sanity tests
- Module unload/load
- identify device command
- set_lsa (via RAW mode) command
- get_lsa command
- fuzzes command input/output payload sizes
- Fix install location of cxl headers
- Add section 3 man pages for libcxl API documentation (only two pages
added so far).
[1]: https://lore.kernel.org/linux-cxl/20210112003403.2944568-1-vishal.l.verma...
---
Add a new utility and library to support CXL devices. This comprehends
the kernel's sysfs layout for CXL devices, and implements a command
submission harness for CXL mailbox commands via ioctl()s defined by the
cxl_mem driver.
A 'cxl-list' command is added which uses some of the libcxl APIs to
display a listing of CXL devices that includes attributes obtained via
sysfs.
Additionally, a new unit test is added to test the library and kernel
(ioctl) interfaces. This includes basic functionality tests for a subset
of the mailbox commands, as well as some negative tests to validate
graceful handling of malformed commands with unexpected buffer sizing
for payloads.
The unit tests are tied to the QEMU implementation[2] of CXL devices.
The latest kernel patches can be found at [3].
An ndctl branch with these patches is also available at [4]
[2]: https://lore.kernel.org/linux-cxl/20210202005948.241655-1-ben.widawsky@in...
[3]: https://lore.kernel.org/linux-cxl/20210217040958.1354670-1-ben.widawsky@i...
[4]: https://github.com/pmem/ndctl/tree/cxl-2.0v2
Vishal Verma (13):
cxl: add a cxl utility and libcxl library
cxl: add a local copy of the cxl_mem UAPI header
libcxl: add support for command query and submission
libcxl: add support for the 'Identify Device' command
test: rename 'ndctl_test' to 'test_ctx'
test: rename 'ndctl_test_*' helpers to 'test_*'
test: introduce a libcxl unit test
libcxl: add GET_HEALTH_INFO mailbox command and accessors
libcxl: add support for the 'GET_LSA' command
util/hexdump: Add a util helper to print a buffer in hex
test/libcxl: add a test for {set, get}_lsa commands
Documentation/cxl: add library API documentation
test/libcxl: introduce a command size fuzzing test
Documentation/cxl/cxl-list.txt | 65 ++
Documentation/cxl/cxl.txt | 34 ++
Documentation/cxl/human-option.txt | 8 +
Documentation/cxl/lib/cxl_new.txt | 43 ++
Documentation/cxl/lib/libcxl.txt | 56 ++
Documentation/cxl/verbose-option.txt | 5 +
configure.ac | 4 +
Makefile.am | 10 +-
Makefile.am.in | 5 +
cxl/lib/private.h | 97 +++
cxl/lib/libcxl.c | 879 +++++++++++++++++++++++++++
cxl/builtin.h | 8 +
cxl/cxl_mem.h | 181 ++++++
cxl/libcxl.h | 82 +++
test.h | 40 +-
test/libcxl-expect.h | 13 +
util/filter.h | 2 +
util/hexdump.h | 8 +
util/json.h | 3 +
util/main.h | 3 +
cxl/cxl.c | 95 +++
cxl/list.c | 113 ++++
ndctl/bat.c | 8 +-
ndctl/test.c | 8 +-
test/ack-shutdown-count-set.c | 16 +-
test/blk_namespaces.c | 14 +-
test/core.c | 32 +-
test/dax-dev.c | 10 +-
test/dax-pmd.c | 13 +-
test/dax-poison.c | 6 +-
test/daxdev-errors.c | 2 +-
test/device-dax.c | 24 +-
test/dpa-alloc.c | 14 +-
test/dsm-fail.c | 14 +-
test/libcxl.c | 514 ++++++++++++++++
test/libndctl.c | 84 +--
test/multi-pmem.c | 23 +-
test/parent-uuid.c | 13 +-
test/pmem_namespaces.c | 14 +-
test/revoke-devmem.c | 12 +-
util/filter.c | 20 +
util/hexdump.c | 53 ++
util/json.c | 26 +
.gitignore | 5 +
Documentation/cxl/Makefile.am | 58 ++
Documentation/cxl/lib/Makefile.am | 58 ++
README.md | 2 +-
cxl/Makefile.am | 21 +
cxl/lib/Makefile.am | 32 +
cxl/lib/libcxl.pc.in | 11 +
cxl/lib/libcxl.sym | 57 ++
test/Makefile.am | 15 +-
52 files changed, 2754 insertions(+), 179 deletions(-)
create mode 100644 Documentation/cxl/cxl-list.txt
create mode 100644 Documentation/cxl/cxl.txt
create mode 100644 Documentation/cxl/human-option.txt
create mode 100644 Documentation/cxl/lib/cxl_new.txt
create mode 100644 Documentation/cxl/lib/libcxl.txt
create mode 100644 Documentation/cxl/verbose-option.txt
create mode 100644 cxl/lib/private.h
create mode 100644 cxl/lib/libcxl.c
create mode 100644 cxl/builtin.h
create mode 100644 cxl/cxl_mem.h
create mode 100644 cxl/libcxl.h
create mode 100644 test/libcxl-expect.h
create mode 100644 util/hexdump.h
create mode 100644 cxl/cxl.c
create mode 100644 cxl/list.c
create mode 100644 test/libcxl.c
create mode 100644 util/hexdump.c
create mode 100644 Documentation/cxl/Makefile.am
create mode 100644 Documentation/cxl/lib/Makefile.am
create mode 100644 cxl/Makefile.am
create mode 100644 cxl/lib/Makefile.am
create mode 100644 cxl/lib/libcxl.pc.in
create mode 100644 cxl/lib/libcxl.sym
--
2.29.2
1 month, 3 weeks