[ndctl PATCH v2 1/4] libndctl: add support for NVDIMM_FAMILY_HYPERV's _DSM Function 1
by Dexuan Cui
This patch retrieves the health info by Hyper-V _DSM method Function 1:
Get Health Information (Function Index 1)
See http://www.uefi.org/RFIC_LIST ("Virtual NVDIMM 0x1901").
Now "ndctl list --dimms --health --idle" can show a line "health_state":"ok",
e.g.
{
"dev":"nmem0",
"id":"04d5-01-1701-00000000",
"handle":0,
"phys_id":0,
"health":{
"health_state":"ok"
}
}
If there is an error with the NVDIMM, the "ok" will be replaced with "unknown",
"fatal", "critical", or "non-critical".
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
---
ndctl/lib/Makefile.am | 1 +
ndctl/lib/hyperv.c | 129 ++++++++++++++++++++++++++++++++++++++++++
ndctl/lib/hyperv.h | 51 +++++++++++++++++
ndctl/lib/libndctl.c | 2 +
ndctl/lib/private.h | 3 +
ndctl/ndctl.h | 1 +
6 files changed, 187 insertions(+)
create mode 100644 ndctl/lib/hyperv.c
create mode 100644 ndctl/lib/hyperv.h
diff --git a/ndctl/lib/Makefile.am b/ndctl/lib/Makefile.am
index 7797039..fb75fda 100644
--- a/ndctl/lib/Makefile.am
+++ b/ndctl/lib/Makefile.am
@@ -20,6 +20,7 @@ libndctl_la_SOURCES =\
intel.c \
hpe1.c \
msft.c \
+ hyperv.c \
ars.c \
firmware.c \
libndctl.c
diff --git a/ndctl/lib/hyperv.c b/ndctl/lib/hyperv.c
new file mode 100644
index 0000000..b303d50
--- /dev/null
+++ b/ndctl/lib/hyperv.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2019, Microsoft Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU Lesser General Public License,
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ */
+#include <stdlib.h>
+#include <limits.h>
+#include <util/bitmap.h>
+#include <util/log.h>
+#include <ndctl/libndctl.h>
+#include "private.h"
+#include "hyperv.h"
+
+#define CMD_HYPERV(_c) ((_c)->hyperv)
+#define CMD_HYPERV_STATUS(_c) (CMD_HYPERV(_c)->u.status)
+#define CMD_HYPERV_SMART_DATA(_c) (CMD_HYPERV(_c)->u.smart.data)
+
+static struct ndctl_cmd *hyperv_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
+{
+ struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
+ struct ndctl_ctx *ctx = ndctl_bus_get_ctx(bus);
+ struct ndctl_cmd *cmd;
+ size_t size;
+ struct nd_pkg_hyperv *hyperv;
+
+ if (!ndctl_dimm_is_cmd_supported(dimm, ND_CMD_CALL)) {
+ dbg(ctx, "unsupported cmd\n");
+ return NULL;
+ }
+
+ if (test_dimm_dsm(dimm, ND_HYPERV_CMD_GET_HEALTH_INFO) ==
+ DIMM_DSM_UNSUPPORTED) {
+ dbg(ctx, "unsupported function\n");
+ return NULL;
+ }
+
+ size = sizeof(*cmd) + sizeof(struct nd_pkg_hyperv);
+ cmd = calloc(1, size);
+ if (!cmd)
+ return NULL;
+
+ cmd->dimm = dimm;
+ ndctl_cmd_ref(cmd);
+ cmd->type = ND_CMD_CALL;
+ cmd->size = size;
+ cmd->status = 1;
+
+ hyperv = CMD_HYPERV(cmd);
+ hyperv->gen.nd_family = NVDIMM_FAMILY_HYPERV;
+ hyperv->gen.nd_command = ND_HYPERV_CMD_GET_HEALTH_INFO;
+ hyperv->gen.nd_fw_size = 0;
+ hyperv->gen.nd_size_in = offsetof(struct nd_hyperv_smart, status);
+ hyperv->gen.nd_size_out = sizeof(hyperv->u.smart);
+ hyperv->u.smart.status = 0;
+
+ cmd->firmware_status = &hyperv->u.smart.status;
+
+ return cmd;
+}
+
+static int hyperv_smart_valid(struct ndctl_cmd *cmd)
+{
+ if (cmd->type != ND_CMD_CALL ||
+ cmd->size != sizeof(*cmd) + sizeof(struct nd_pkg_hyperv) ||
+ CMD_HYPERV(cmd)->gen.nd_family != NVDIMM_FAMILY_HYPERV ||
+ CMD_HYPERV(cmd)->gen.nd_command != ND_HYPERV_CMD_GET_HEALTH_INFO ||
+ cmd->status != 0 ||
+ CMD_HYPERV_STATUS(cmd) != 0)
+ return cmd->status < 0 ? cmd->status : -EINVAL;
+ return 0;
+}
+
+static int hyperv_cmd_xlat_firmware_status(struct ndctl_cmd *cmd)
+{
+ return CMD_HYPERV_STATUS(cmd) == 0 ? 0 : -EINVAL;
+}
+
+static unsigned int hyperv_cmd_smart_get_flags(struct ndctl_cmd *cmd)
+{
+ int rc;
+
+ rc = hyperv_smart_valid(cmd);
+ if (rc < 0) {
+ errno = -rc;
+ return 0;
+ }
+
+ return ND_SMART_HEALTH_VALID;
+}
+
+static unsigned int hyperv_cmd_smart_get_health(struct ndctl_cmd *cmd)
+{
+ unsigned int health = 0;
+ __u32 num;
+ int rc;
+
+ rc = hyperv_smart_valid(cmd);
+ if (rc < 0) {
+ errno = -rc;
+ return UINT_MAX;
+ }
+
+ num = CMD_HYPERV_SMART_DATA(cmd)->health & 0x3F;
+
+ if (num & (BIT(0) | BIT(1)))
+ health |= ND_SMART_CRITICAL_HEALTH;
+
+ if (num & BIT(2))
+ health |= ND_SMART_FATAL_HEALTH;
+
+ if (num & (BIT(3) | BIT(4) | BIT(5)))
+ health |= ND_SMART_NON_CRITICAL_HEALTH;
+
+ return health;
+}
+
+struct ndctl_dimm_ops * const hyperv_dimm_ops = &(struct ndctl_dimm_ops) {
+ .new_smart = hyperv_dimm_cmd_new_smart,
+ .smart_get_flags = hyperv_cmd_smart_get_flags,
+ .smart_get_health = hyperv_cmd_smart_get_health,
+ .xlat_firmware_status = hyperv_cmd_xlat_firmware_status,
+};
diff --git a/ndctl/lib/hyperv.h b/ndctl/lib/hyperv.h
new file mode 100644
index 0000000..8e55a97
--- /dev/null
+++ b/ndctl/lib/hyperv.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2019, Microsoft Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU Lesser General Public License,
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ */
+#ifndef __NDCTL_HYPERV_H__
+#define __NDCTL_HYPERV_H__
+
+/* See http://www.uefi.org/RFIC_LIST ("Virtual NVDIMM 0x1901") */
+enum {
+ ND_HYPERV_CMD_QUERY = 0,
+
+ /* non-root commands */
+ ND_HYPERV_CMD_GET_HEALTH_INFO = 1,
+};
+
+/*
+ * This is actually Function 1's data,
+ * This is the closest I can find to match the "smart".
+ * Hyper-V _DSM methods don't have a smart function.
+ */
+struct nd_hyperv_smart_data {
+ __u32 health;
+} __attribute__((packed));
+
+struct nd_hyperv_smart {
+ __u32 status;
+ union {
+ __u8 buf[4];
+ struct nd_hyperv_smart_data data[0];
+ };
+} __attribute__((packed));
+
+union nd_hyperv_cmd {
+ __u32 status;
+ struct nd_hyperv_smart smart;
+} __attribute__((packed));
+
+struct nd_pkg_hyperv {
+ struct nd_cmd_pkg gen;
+ union nd_hyperv_cmd u;
+} __attribute__((packed));
+
+#endif /* __NDCTL_HYPERV_H__ */
diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index c9e2875..48bdb27 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -1492,6 +1492,8 @@ static void *add_dimm(void *parent, int id, const char *dimm_base)
dimm->ops = hpe1_dimm_ops;
if (dimm->cmd_family == NVDIMM_FAMILY_MSFT)
dimm->ops = msft_dimm_ops;
+ if (dimm->cmd_family == NVDIMM_FAMILY_HYPERV)
+ dimm->ops = hyperv_dimm_ops;
sprintf(path, "%s/nfit/dsm_mask", dimm_base);
if (sysfs_read_attr(ctx, path, buf) == 0)
diff --git a/ndctl/lib/private.h b/ndctl/lib/private.h
index a387b0b..a9d35c5 100644
--- a/ndctl/lib/private.h
+++ b/ndctl/lib/private.h
@@ -31,6 +31,7 @@
#include "intel.h"
#include "hpe1.h"
#include "msft.h"
+#include "hyperv.h"
struct nvdimm_data {
struct ndctl_cmd *cmd_read;
@@ -270,6 +271,7 @@ struct ndctl_cmd {
struct nd_cmd_pkg pkg[0];
struct ndn_pkg_hpe1 hpe1[0];
struct ndn_pkg_msft msft[0];
+ struct nd_pkg_hyperv hyperv[0];
struct nd_pkg_intel intel[0];
struct nd_cmd_get_config_size get_size[0];
struct nd_cmd_get_config_data_hdr get_data[0];
@@ -344,6 +346,7 @@ struct ndctl_dimm_ops {
struct ndctl_dimm_ops * const intel_dimm_ops;
struct ndctl_dimm_ops * const hpe1_dimm_ops;
struct ndctl_dimm_ops * const msft_dimm_ops;
+struct ndctl_dimm_ops * const hyperv_dimm_ops;
static inline struct ndctl_bus *cmd_to_bus(struct ndctl_cmd *cmd)
{
diff --git a/ndctl/ndctl.h b/ndctl/ndctl.h
index c6aaa4c..008f81c 100644
--- a/ndctl/ndctl.h
+++ b/ndctl/ndctl.h
@@ -262,6 +262,7 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_HPE1 1
#define NVDIMM_FAMILY_HPE2 2
#define NVDIMM_FAMILY_MSFT 3
+#define NVDIMM_FAMILY_HYPERV 4
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
--
2.19.1
3 years, 4 months
Re: [PATCH 0/5] Page demotion for memory reclaim
by Keith Busch
On Thu, Mar 21, 2019 at 02:20:51PM -0700, Zi Yan wrote:
> 1. The name of “page demotion” seems confusing to me, since I thought it was about large pages
> demote to small pages as opposite to promoting small pages to THPs. Am I the only
> one here?
If you have a THP, we'll skip the page migration and fall through to
split_huge_page_to_list(), then the smaller pages can be considered,
migrated and reclaimed individually. Not that we couldn't try to migrate
a THP directly. It was just a simpler implementation for this first attempt.
> 2. For the demotion path, a common case would be from high-performance memory, like HBM
> or Multi-Channel DRAM, to DRAM, then to PMEM, and finally to disks, right? More general
> case for demotion path would be derived from the memory performance description from HMAT[1],
> right? Do you have any algorithm to form such a path from HMAT?
Yes, I have a PoC for the kernel setting up a demotion path based on
HMAT properties here:
https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/commit/?...
The above is just from an experimental branch.
> 3. Do you have a plan for promoting pages from lower-level memory to higher-level memory,
> like from PMEM to DRAM? Will this one-way demotion make all pages sink to PMEM and disk?
Promoting previously demoted pages would require the application do
something to make that happen if you turn demotion on with this series.
Kernel auto-promotion is still being investigated, and it's a little
trickier than reclaim.
If it sinks to disk, though, the next access behavior is the same as
before, without this series.
> 4. In your patch 3, you created a new method migrate_demote_mapping() to migrate pages to
> other memory node, is there any problem of reusing existing migrate_pages() interface?
Yes, we may not want to migrate everything in the shrink_page_list()
pages. We might want to keep a page, so we have to do those checks first. At
the point we know we want to attempt migration, the page is already
locked and not in a list, so it is just easier to directly invoke the
new __unmap_and_move_locked() that migrate_pages() eventually also calls.
> 5. In addition, you only migrate base pages, is there any performance concern on migrating THPs?
> Is it too costly to migrate THPs?
It was just easier to consider single pages first, so we let a THP split
if possible. I'm not sure of the cost in migrating THPs directly.
3 years, 4 months
[PATCH 0/6] security/keys/encrypted: Break module dependency chain
by Dan Williams
With v5.1-rc1 all the nvdimm sub-system regression tests started failing
because the libnvdimm module failed to load in the qemu-kvm test
environment. Critically that environment does not have a TPM. Commit
240730437deb "KEYS: trusted: explicitly use tpm_chip structure..."
started to require a TPM to be present for the trusted.ko module to load
where there was no requirement for that before.
Rather than undo the "fail if no hardware" behavior James points out
that the module dependencies can be broken by looking up the key-type by
name. Remove the dependencies on the "key_type_trusted" and
"key_type_encrypted" symbol exports, and clean up other boilerplate that
supported those exports in different configurations.
---
Dan Williams (6):
security/keys/encrypted: Allow operation without trusted.ko
security/keys/encrypted: Clean up request_trusted_key()
libnvdimm/security: Drop direct dependency on key_type_encrypted
security/keys/ecryptfs: Drop direct dependency on key_type_encrypted
security/integrity/evm: Drop direct dependency on key_type_encrypted
security/keys/encrypted: Drop export of key_type_encrypted
drivers/nvdimm/security.c | 11 ++++-
fs/ecryptfs/ecryptfs_kernel.h | 22 -----------
fs/ecryptfs/keystore.c | 12 ++++++
include/keys/encrypted-type.h | 2 -
include/linux/key.h | 1
security/integrity/evm/evm_crypto.c | 9 ++++
security/keys/encrypted-keys/Makefile | 3 -
security/keys/encrypted-keys/encrypted.c | 35 ++++++++++++++++-
security/keys/encrypted-keys/encrypted.h | 12 ------
security/keys/encrypted-keys/masterkey_trusted.c | 46 ----------------------
security/keys/internal.h | 2 -
security/keys/key.c | 1
12 files changed, 65 insertions(+), 91 deletions(-)
delete mode 100644 security/keys/encrypted-keys/masterkey_trusted.c
3 years, 4 months
Re: [PATCH v10, RESEND 5/6] KEYS: trusted: explicitly use tpm_chip structure from tpm_default_chip()
by Dan Williams
On Wed, Feb 6, 2019 at 10:30 AM Roberto Sassu <roberto.sassu(a)huawei.com> wrote:
>
> When crypto agility support will be added to the TPM driver, users of the
> driver have to retrieve the allocated banks from chip->allocated_banks and
> use this information to prepare the array of tpm_digest structures to be
> passed to tpm_pcr_extend().
>
> This patch retrieves a tpm_chip pointer from tpm_default_chip() so that the
> pointer can be used to prepare the array of tpm_digest structures.
>
> Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
> Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
> Tested-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
> ---
> security/keys/trusted.c | 38 ++++++++++++++++++++++++--------------
> 1 file changed, 24 insertions(+), 14 deletions(-)
>
> diff --git a/security/keys/trusted.c b/security/keys/trusted.c
> index 4d98f4f87236..5b852263eae1 100644
> --- a/security/keys/trusted.c
> +++ b/security/keys/trusted.c
> @@ -34,6 +34,7 @@
>
> static const char hmac_alg[] = "hmac(sha1)";
> static const char hash_alg[] = "sha1";
> +static struct tpm_chip *chip;
>
> struct sdesc {
> struct shash_desc shash;
> @@ -362,7 +363,7 @@ int trusted_tpm_send(unsigned char *cmd, size_t buflen)
> int rc;
>
> dump_tpm_buf(cmd);
> - rc = tpm_send(NULL, cmd, buflen);
> + rc = tpm_send(chip, cmd, buflen);
> dump_tpm_buf(cmd);
> if (rc > 0)
> /* Can't return positive return codes values to keyctl */
> @@ -384,10 +385,10 @@ static int pcrlock(const int pcrnum)
>
> if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> - ret = tpm_get_random(NULL, hash, SHA1_DIGEST_SIZE);
> + ret = tpm_get_random(chip, hash, SHA1_DIGEST_SIZE);
> if (ret != SHA1_DIGEST_SIZE)
> return ret;
> - return tpm_pcr_extend(NULL, pcrnum, hash) ? -EINVAL : 0;
> + return tpm_pcr_extend(chip, pcrnum, hash) ? -EINVAL : 0;
> }
>
> /*
> @@ -400,7 +401,7 @@ static int osap(struct tpm_buf *tb, struct osapsess *s,
> unsigned char ononce[TPM_NONCE_SIZE];
> int ret;
>
> - ret = tpm_get_random(NULL, ononce, TPM_NONCE_SIZE);
> + ret = tpm_get_random(chip, ononce, TPM_NONCE_SIZE);
> if (ret != TPM_NONCE_SIZE)
> return ret;
>
> @@ -496,7 +497,7 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype,
> if (ret < 0)
> goto out;
>
> - ret = tpm_get_random(NULL, td->nonceodd, TPM_NONCE_SIZE);
> + ret = tpm_get_random(chip, td->nonceodd, TPM_NONCE_SIZE);
> if (ret != TPM_NONCE_SIZE)
> goto out;
> ordinal = htonl(TPM_ORD_SEAL);
> @@ -606,7 +607,7 @@ static int tpm_unseal(struct tpm_buf *tb,
>
> ordinal = htonl(TPM_ORD_UNSEAL);
> keyhndl = htonl(SRKHANDLE);
> - ret = tpm_get_random(NULL, nonceodd, TPM_NONCE_SIZE);
> + ret = tpm_get_random(chip, nonceodd, TPM_NONCE_SIZE);
> if (ret != TPM_NONCE_SIZE) {
> pr_info("trusted_key: tpm_get_random failed (%d)\n", ret);
> return ret;
> @@ -751,7 +752,7 @@ static int getoptions(char *c, struct trusted_key_payload *pay,
> int i;
> int tpm2;
>
> - tpm2 = tpm_is_tpm2(NULL);
> + tpm2 = tpm_is_tpm2(chip);
> if (tpm2 < 0)
> return tpm2;
>
> @@ -920,7 +921,7 @@ static struct trusted_key_options *trusted_options_alloc(void)
> struct trusted_key_options *options;
> int tpm2;
>
> - tpm2 = tpm_is_tpm2(NULL);
> + tpm2 = tpm_is_tpm2(chip);
> if (tpm2 < 0)
> return NULL;
>
> @@ -970,7 +971,7 @@ static int trusted_instantiate(struct key *key,
> size_t key_len;
> int tpm2;
>
> - tpm2 = tpm_is_tpm2(NULL);
> + tpm2 = tpm_is_tpm2(chip);
> if (tpm2 < 0)
> return tpm2;
>
> @@ -1011,7 +1012,7 @@ static int trusted_instantiate(struct key *key,
> switch (key_cmd) {
> case Opt_load:
> if (tpm2)
> - ret = tpm_unseal_trusted(NULL, payload, options);
> + ret = tpm_unseal_trusted(chip, payload, options);
> else
> ret = key_unseal(payload, options);
> dump_payload(payload);
> @@ -1021,13 +1022,13 @@ static int trusted_instantiate(struct key *key,
> break;
> case Opt_new:
> key_len = payload->key_len;
> - ret = tpm_get_random(NULL, payload->key, key_len);
> + ret = tpm_get_random(chip, payload->key, key_len);
> if (ret != key_len) {
> pr_info("trusted_key: key_create failed (%d)\n", ret);
> goto out;
> }
> if (tpm2)
> - ret = tpm_seal_trusted(NULL, payload, options);
> + ret = tpm_seal_trusted(chip, payload, options);
> else
> ret = key_seal(payload, options);
> if (ret < 0)
> @@ -1225,17 +1226,26 @@ static int __init init_trusted(void)
> {
> int ret;
>
> + chip = tpm_default_chip();
> + if (!chip)
> + return -ENOENT;
This change causes a regression loading the encrypted_keys module on
systems that don't have a tpm.
Module init functions should not have hardware dependencies.
The effect is that the libnvdimm module, which is an encrypted_keys
user, fails to load, but up until this change encrypted_keys did not
have a hard dependency on TPM presence.
3 years, 4 months
Re: [PATCH 2/2] mm/dax: Don't enable huge dax mapping by default
by Aneesh Kumar K.V
Dan Williams <dan.j.williams(a)intel.com> writes:
> On Thu, Feb 28, 2019 at 1:40 AM Oliver <oohall(a)gmail.com> wrote:
>>
>> On Thu, Feb 28, 2019 at 7:35 PM Aneesh Kumar K.V
>> <aneesh.kumar(a)linux.ibm.com> wrote:
>> >
>> > Add a flag to indicate the ability to do huge page dax mapping. On architecture
>> > like ppc64, the hypervisor can disable huge page support in the guest. In
>> > such a case, we should not enable huge page dax mapping. This patch adds
>> > a flag which the architecture code will update to indicate huge page
>> > dax mapping support.
>>
>> *groan*
>>
>> > Architectures mostly do transparent_hugepage_flag = 0; if they can't
>> > do hugepages. That also takes care of disabling dax hugepage mapping
>> > with this change.
>> >
>> > Without this patch we get the below error with kvm on ppc64.
>> >
>> > [ 118.849975] lpar: Failed hash pte insert with error -4
>> >
>> > NOTE: The patch also use
>> >
>> > echo never > /sys/kernel/mm/transparent_hugepage/enabled
>> > to disable dax huge page mapping.
>> >
>> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
>> > ---
>> > TODO:
>> > * Add Fixes: tag
>> >
>> > include/linux/huge_mm.h | 4 +++-
>> > mm/huge_memory.c | 4 ++++
>> > 2 files changed, 7 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> > index 381e872bfde0..01ad5258545e 100644
>> > --- a/include/linux/huge_mm.h
>> > +++ b/include/linux/huge_mm.h
>> > @@ -53,6 +53,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>> > pud_t *pud, pfn_t pfn, bool write);
>> > enum transparent_hugepage_flag {
>> > TRANSPARENT_HUGEPAGE_FLAG,
>> > + TRANSPARENT_HUGEPAGE_DAX_FLAG,
>> > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
>> > TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
>> > TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
>> > @@ -111,7 +112,8 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
>> > if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
>> > return true;
>> >
>> > - if (vma_is_dax(vma))
>> > + if (vma_is_dax(vma) &&
>> > + (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_DAX_FLAG)))
>> > return true;
>>
>> Forcing PTE sized faults should be fine for fsdax, but it'll break
>> devdax. The devdax driver requires the fault size be >= the namespace
>> alignment since devdax tries to guarantee hugepage mappings will be
>> used and PMD alignment is the default. We can probably have devdax
>> fall back to the largest size the hypervisor has made available, but
>> it does run contrary to the design. Ah well, I suppose it's better off
>> being degraded rather than unusable.
>
> Given this is an explicit setting I think device-dax should explicitly
> fail to enable in the presence of this flag to preserve the
> application visible behavior.
>
> I.e. if device-dax was enabled after this setting was made then I
> think future faults should fail as well.
Not sure I understood that. Now we are disabling the ability to map
pages as huge pages. I am now considering that this should not be
user configurable. Ie, this is something that platform can use to avoid
dax forcing huge page mapping, but if the architecture can enable huge
dax mapping, we should always default to using that.
Now w.r.t. failures, can device-dax do an opportunistic huge page
usage? I haven't looked at the device-dax details fully yet. Do we make the
assumption of the mapping page size as a format w.r.t device-dax? Is that
derived from nd_pfn->align value?
Here is what I am working on:
1) If the platform doesn't support huge page and if the device superblock
indicated that it was created with huge page support, we fail the device
init.
2) Now if we are creating a new namespace without huge page support in
the platform, then we force the align details to PAGE_SIZE. In such a
configuration when handling dax fault even with THP enabled during
the build, we should not try to use hugepage. This I think we can
achieve by using TRANSPARENT_HUGEPAGE_DAX_FLAG.
Also even if the user decided to not use THP, by
echo "never" > transparent_hugepage/enabled , we should continue to map
dax fault using huge page on platforms that can support huge pages.
This still doesn't cover the details of a device-dax created with
PAGE_SIZE align later booted with a kernel that can do hugepage dax. How
should we handle that? That makes me think, this should be a VMA flag
which got derived from device config? Maybe use VM_HUGEPAGE to indicate
if device should use a hugepage mapping or not?
-aneesh
3 years, 4 months
[ndctl PATCH 0/8] Improve support + testing for labels + info-blocks
by Dan Williams
As noted in the kernel patches for this issue:
Lately Linux has encountered platforms that collide Persistent
Memory regions between each other, specifically cases where
->start_pad needed to be non-zero. This led to commit ae86cbfef381
"libnvdimm, pfn: Pad pfn namespaces relative to other regions". That
commit allowed namespaces to be mapped with devm_memremap_pages().
However dax operations on those configurations currently fail if
attempted within the ->start_pad range because
pmem_device->data_offset was still relative to raw resource base not
relative to the section aligned resource range mapped by
devm_memremap_pages().
Luckily __bdev_dax_supported() caught these failures and simply
disabled dax. However, to fix this situation a non-backwards
compatible change needs to be made to the interpretation of the
nd_pfn info-block. ->start_pad needs to be accounted in
->map.map_offset (formerly ->data_offset), and ->map.map_base
(formerly ->phys_addr) needs to be adjusted to the section aligned
resource base used to establish ->map.map (formerly
->virt_addr).
Towards preventing similar bugs in this area introduce a regression
test "test/collide.sh" to validate support for pre- and post-fixed
kernels. In the course of developing this test a few missing
capabilities and fixes also surfaced.
---
Dan Williams (8):
ndctl/dimm: Add 'flags' field to read-labels output
ndctl/dimm: Add --human support to read-labels
ndctl/build: Drop -Wpointer-arith
ndctl/namespace: Add read-info-block command
ndctl/test: Update dax-dev to handle multiple e820 ranges
ndctl/test: Make dax.sh more robust vs small namespaces
ndctl/namespace: Always zero info-blocks
ndctl/test: Test inter-region collision handling
configure.ac | 1
ndctl/action.h | 1
ndctl/builtin.h | 1
ndctl/check.c | 20 --
ndctl/dimm.c | 21 ++-
ndctl/namespace.c | 416 +++++++++++++++++++++++++++++++++++++++++++++++++-
ndctl/namespace.h | 51 ++++++
ndctl/ndctl.c | 1
test/Makefile.am | 1
test/collide.sh | 226 +++++++++++++++++++++++++++
test/dax-dev.c | 17 ++
test/dax.sh | 4
test/fsdax-info0.xxd | 11 +
test/fsdax-info1.xxd | 11 +
test/fsdax-info2.xxd | 11 +
test/fsdax-info3.xxd | 11 +
util/fletcher.h | 1
util/size.h | 1
18 files changed, 763 insertions(+), 43 deletions(-)
create mode 100755 test/collide.sh
create mode 100644 test/fsdax-info0.xxd
create mode 100644 test/fsdax-info1.xxd
create mode 100644 test/fsdax-info2.xxd
create mode 100644 test/fsdax-info3.xxd
3 years, 4 months
公司级的质量管理组织如何定位
by 范女士
-------- 转发邮件信息 --------
发件人:fg(a)rav.com
发送日期:2019-3-20 7:07:21
收件人:linux-nvdimm(a)lists.01.org
---请---查---阅---附---件---大---纲
7:07:21
3 years, 4 months
failure to setup devdax over fake nvdimm
by Brice Goglin
Hello Dan,
On a machine emulating nvdimms over DDR using the memmap kernel parameter,
I can't enable devdax anymore with ndctl, when using your libnvdimm-pending
branch (1ffc664f9b8ca7346f2af0fb4f62b923193bf65c):
# ndctl create-namespace -f -e namespace1.0 -t pmem -m devdax
libndctl: ndctl_dax_enable: dax1.0: failed to enable
Error: namespace1.0: failed to enable
failed to reconfigure namespace: No such device or address
In the previous version of your libnvdimm-pending branch that I used before
(8d4a7e3da68e88a6efd3acb5cc971d0c2ca219a4 from January 8th) this ndctl line
worked fine right after boot.
The difference between strace ndctl doesn't show anything obvious.
* /sys/devices/platform/e820_pmem/ndbus0/region1/namespace1.0/holder_class contains nothing instead of "dax"
* /sys/devices/platform/e820_pmem/ndbus0/region1/dax_seed contains dax1.0 instead of dax1.1
Any idea where to look?
Thanks
Brice
3 years, 4 months
[PATCH] tools/testing/nvdimm: add watermarks for dax_pmem* modules
by Vishal Verma
Add nfit_test 'watermarks' for the dax_pmem, dax_pmem_core, and
dax_pmem_compat modules. This causes the nfit_test module to fail
loading in case any of these modules are also not overridden with the
ldconfig wrapped modules. Without this, nfit_test would sometimes fail
creation of device-dax namespaces on the nfit_test_bus with an unhelpful
error log such as:
dax_pmem dax5.0: could not reserve metadata
dax_pmem: probe of dax5.0 failed with error -16
This was caused by the unwrapped version of
devm_request_mem_region() being called.
Cc: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
tools/testing/nvdimm/Kbuild | 3 +++
tools/testing/nvdimm/dax_pmem_compat_test.c | 8 ++++++++
tools/testing/nvdimm/dax_pmem_core_test.c | 8 ++++++++
tools/testing/nvdimm/dax_pmem_test.c | 8 ++++++++
tools/testing/nvdimm/test/nfit.c | 3 +++
tools/testing/nvdimm/watermark.h | 3 +++
6 files changed, 33 insertions(+)
create mode 100644 tools/testing/nvdimm/dax_pmem_compat_test.c
create mode 100644 tools/testing/nvdimm/dax_pmem_core_test.c
create mode 100644 tools/testing/nvdimm/dax_pmem_test.c
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index e1286d2cdfbf..c4a9196d794c 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -68,8 +68,11 @@ device_dax-y += device_dax_test.o
device_dax-y += config_check.o
dax_pmem-y := $(DAX_SRC)/pmem/pmem.o
+dax_pmem-y += dax_pmem_test.o
dax_pmem_core-y := $(DAX_SRC)/pmem/core.o
+dax_pmem_core-y += dax_pmem_core_test.o
dax_pmem_compat-y := $(DAX_SRC)/pmem/compat.o
+dax_pmem_compat-y += dax_pmem_compat_test.o
dax_pmem-y += config_check.o
libnvdimm-y := $(NVDIMM_SRC)/core.o
diff --git a/tools/testing/nvdimm/dax_pmem_compat_test.c b/tools/testing/nvdimm/dax_pmem_compat_test.c
new file mode 100644
index 000000000000..7cd1877f3765
--- /dev/null
+++ b/tools/testing/nvdimm/dax_pmem_compat_test.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2019 Intel Corporation. All rights reserved.
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include "watermark.h"
+
+nfit_test_watermark(dax_pmem_compat);
diff --git a/tools/testing/nvdimm/dax_pmem_core_test.c b/tools/testing/nvdimm/dax_pmem_core_test.c
new file mode 100644
index 000000000000..a4249cdbeec1
--- /dev/null
+++ b/tools/testing/nvdimm/dax_pmem_core_test.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2019 Intel Corporation. All rights reserved.
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include "watermark.h"
+
+nfit_test_watermark(dax_pmem_core);
diff --git a/tools/testing/nvdimm/dax_pmem_test.c b/tools/testing/nvdimm/dax_pmem_test.c
new file mode 100644
index 000000000000..fd4c94a5aa02
--- /dev/null
+++ b/tools/testing/nvdimm/dax_pmem_test.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2019 Intel Corporation. All rights reserved.
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include "watermark.h"
+
+nfit_test_watermark(dax_pmem);
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index b579f962451d..9e7a6ea2ac91 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3162,6 +3162,9 @@ static __init int nfit_test_init(void)
acpi_nfit_test();
device_dax_test();
mcsafe_test();
+ dax_pmem_test();
+ dax_pmem_core_test();
+ dax_pmem_compat_test();
nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
diff --git a/tools/testing/nvdimm/watermark.h b/tools/testing/nvdimm/watermark.h
index ed0528757bd4..43fc4f3e7927 100644
--- a/tools/testing/nvdimm/watermark.h
+++ b/tools/testing/nvdimm/watermark.h
@@ -6,6 +6,9 @@ int pmem_test(void);
int libnvdimm_test(void);
int acpi_nfit_test(void);
int device_dax_test(void);
+int dax_pmem_test(void);
+int dax_pmem_core_test(void);
+int dax_pmem_compat_test(void);
/*
* dummy routine for nfit_test to validate it is linking to the properly
--
2.20.1
3 years, 4 months
[ndctl PATCH] ndctl/test: add dax_pmem* modules to the test-core
by Vishal Verma
Add a check for the nfit_test overridden modules for dax_pmem,
dax_pmem_core, and dax_pmem_compat to the test core to ensure that the
regular production versions aren't the ones being used.
Cc: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
test/core.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/test/core.c b/test/core.c
index 4b36b2d..b9e3bbf 100644
--- a/test/core.c
+++ b/test/core.c
@@ -129,6 +129,8 @@ int nfit_test_init(struct kmod_ctx **ctx, struct kmod_module **mod,
"nfit",
"device_dax",
"dax_pmem",
+ "dax_pmem_core",
+ "dax_pmem_compat",
"libnvdimm",
"nd_blk",
"nd_btt",
--
2.20.1
3 years, 4 months