simplify follow_pte a bit
by Christoph Hellwig
Hi Andrew,
this small series drops the not needed follow_pte_pmd exports, and
simplifies the follow_pte family of functions a bit.
1 year, 6 months
[PATCH RFC PKS/PMEM 00/58] PMEM: Introduce stray write protection for PMEM
by ira.weiny@intel.com
From: Ira Weiny <ira.weiny(a)intel.com>
Should a stray write in the kernel occur, persistent memory is affected more
than regular memory. A write to the wrong area of memory could result in
latent data corruption which will persist after a reboot. PKS provides a
nice way to restrict access to persistent memory kernel mappings, while
providing fast access when needed.
Since the last RFC[1] this patch set has grown quite a bit. It now depends on
the core patches submitted separately.
https://lore.kernel.org/lkml/20201009194258.3207172-1-ira.weiny@intel.com/
And contained in the git tree here:
https://github.com/weiny2/linux-kernel/tree/pks-rfc-v3
However, functionally there is only 1 major change from the last RFC.
Specifically, kmap() is most often used within a single thread in a 'map/do
something/unmap' pattern. In fact this is the pattern used in ~90% of the
callers of kmap(). This pattern works very well for the pmem use case and the
testing which was done. However, there were another ~20-30 kmap users which do
not follow this pattern. Some of them seem to expect the mapping to be
'global' while others require a detailed audit to be sure.[2][3]
While we don't anticipate global mappings to pmem there is a danger in
changing the semantics of kmap(). Effectively, this would cause an unresolved
page fault with little to no information about why.
There were a number of options considered.
1) Attempt to change all the thread local kmap() calls to kmap_atomic()
2) Introduce a flags parameter to kmap() to indicate if the mapping should be
global or not
3) Change ~20-30 call sites to 'kmap_global()' to indicate that they require a
global mapping of the pages
4) Change ~209 call sites to 'kmap_thread()' to indicate that the mapping is to
be used within that thread of execution only
Option 1 is simply not feasible because kmap_atomic() does not have the same
semantics as kmap() within a single thread. Option 2 would require all of the call sites of
kmap() to change. Option 3 seems like a good minimal change but there is a
danger that new code may miss the semantic change of kmap() and not get the
behavior intended for future users. Therefore, option #4 was chosen.
To handle the global PKRS state in the most efficient manner possible, we
lazily override the thread specific PKRS key value only when needed, because we
anticipate that PKS will not be needed most of the time. And even
when it is used, 90% of the time it is a thread local call.
[1] https://lore.kernel.org/lkml/20200717072056.73134-1-ira.weiny@intel.com/
[2] The following list of callers continue calling kmap() (utilizing the global
PKRS). It would be nice if more of them could be converted to kmap_thread()
drivers/firewire/net.c: ptr = kmap(dev->broadcast_rcv_buffer.pages[u]);
drivers/gpu/drm/i915/gem/i915_gem_pages.c: return kmap(sg_page(sgt->sgl));
drivers/gpu/drm/ttm/ttm_bo_util.c: map->virtual = kmap(map->page);
drivers/infiniband/hw/qib/qib_user_sdma.c: mpage = kmap(page);
drivers/misc/vmw_vmci/vmci_host.c: context->notify = kmap(context->notify_page) + (uva & (PAGE_SIZE - 1));
drivers/misc/xilinx_sdfec.c: addr = kmap(pages[i]);
drivers/mmc/host/usdhi6rol0.c: host->pg.mapped = kmap(host->pg.page);
drivers/mmc/host/usdhi6rol0.c: host->pg.mapped = kmap(host->pg.page);
drivers/mmc/host/usdhi6rol0.c: host->pg.mapped = kmap(host->pg.page);
drivers/nvme/target/tcp.c: iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
drivers/scsi/libiscsi_tcp.c: segment->sg_mapped = kmap(sg_page(sg));
drivers/target/iscsi/iscsi_target.c: iov[i].iov_base = kmap(sg_page(sg)) + sg->offset + page_off;
drivers/target/target_core_transport.c: return kmap(sg_page(sg)) + sg->offset;
fs/btrfs/check-integrity.c: block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
fs/ceph/dir.c: cache_ctl->dentries = kmap(cache_ctl->page);
fs/ceph/inode.c: ctl->dentries = kmap(ctl->page);
fs/erofs/zpvec.h: kmap_atomic(ctor->curr) : kmap(ctor->curr);
lib/scatterlist.c: miter->addr = kmap(miter->page) + miter->__offset;
net/ceph/pagelist.c: pl->mapped_tail = kmap(page);
net/ceph/pagelist.c: pl->mapped_tail = kmap(page);
virt/kvm/kvm_main.c: hva = kmap(page);
[3] The following appear to follow the same pattern as ext2 which was converted
after some code audit. So I _think_ they too could be converted to
k[un]map_thread().
fs/freevxfs/vxfs_subr.c|75| kmap(pp);
fs/jfs/jfs_metapage.c|102| kmap(page);
fs/jfs/jfs_metapage.c|156| kmap(page);
fs/minix/dir.c|72| kmap(page);
fs/nilfs2/dir.c|195| kmap(page);
fs/nilfs2/ifile.h|24| void *kaddr = kmap(ibh->b_page);
fs/ntfs/aops.h|78| kmap(page);
fs/ntfs/compress.c|574| kmap(page);
fs/qnx6/dir.c|32| kmap(page);
fs/qnx6/dir.c|58| kmap(*p = page);
fs/qnx6/inode.c|190| kmap(page);
fs/qnx6/inode.c|557| kmap(page);
fs/reiserfs/inode.c|2397| kmap(bh_result->b_page);
fs/reiserfs/xattr.c|444| kmap(page);
fs/sysv/dir.c|60| kmap(page);
fs/sysv/dir.c|262| kmap(page);
fs/ufs/dir.c|194| kmap(page);
fs/ufs/dir.c|562| kmap(page);
Ira Weiny (58):
x86/pks: Add a global pkrs option
x86/pks/test: Add testing for global option
memremap: Add zone device access protection
kmap: Add stray access protection for device pages
kmap: Introduce k[un]map_thread
kmap: Introduce k[un]map_thread debugging
drivers/drbd: Utilize new kmap_thread()
drivers/firmware_loader: Utilize new kmap_thread()
drivers/gpu: Utilize new kmap_thread()
drivers/rdma: Utilize new kmap_thread()
drivers/net: Utilize new kmap_thread()
fs/afs: Utilize new kmap_thread()
fs/btrfs: Utilize new kmap_thread()
fs/cifs: Utilize new kmap_thread()
fs/ecryptfs: Utilize new kmap_thread()
fs/gfs2: Utilize new kmap_thread()
fs/nilfs2: Utilize new kmap_thread()
fs/hfs: Utilize new kmap_thread()
fs/hfsplus: Utilize new kmap_thread()
fs/jffs2: Utilize new kmap_thread()
fs/nfs: Utilize new kmap_thread()
fs/f2fs: Utilize new kmap_thread()
fs/fuse: Utilize new kmap_thread()
fs/freevxfs: Utilize new kmap_thread()
fs/reiserfs: Utilize new kmap_thread()
fs/zonefs: Utilize new kmap_thread()
fs/ubifs: Utilize new kmap_thread()
fs/cachefiles: Utilize new kmap_thread()
fs/ntfs: Utilize new kmap_thread()
fs/romfs: Utilize new kmap_thread()
fs/vboxsf: Utilize new kmap_thread()
fs/hostfs: Utilize new kmap_thread()
fs/cramfs: Utilize new kmap_thread()
fs/erofs: Utilize new kmap_thread()
fs: Utilize new kmap_thread()
fs/ext2: Use ext2_put_page
fs/ext2: Utilize new kmap_thread()
fs/isofs: Utilize new kmap_thread()
fs/jffs2: Utilize new kmap_thread()
net: Utilize new kmap_thread()
drivers/target: Utilize new kmap_thread()
drivers/scsi: Utilize new kmap_thread()
drivers/mmc: Utilize new kmap_thread()
drivers/xen: Utilize new kmap_thread()
drivers/firmware: Utilize new kmap_thread()
drivers/staging: Utilize new kmap_thread()
drivers/mtd: Utilize new kmap_thread()
drivers/md: Utilize new kmap_thread()
drivers/misc: Utilize new kmap_thread()
drivers/android: Utilize new kmap_thread()
kernel: Utilize new kmap_thread()
mm: Utilize new kmap_thread()
lib: Utilize new kmap_thread()
powerpc: Utilize new kmap_thread()
samples: Utilize new kmap_thread()
dax: Stray access protection for dax_direct_access()
nvdimm/pmem: Stray access protection for pmem->virt_addr
[dax|pmem]: Enable stray access protection
Documentation/core-api/protection-keys.rst | 11 +-
arch/powerpc/mm/mem.c | 4 +-
arch/x86/entry/common.c | 28 +++
arch/x86/include/asm/pkeys.h | 6 +-
arch/x86/include/asm/pkeys_common.h | 8 +-
arch/x86/kernel/process.c | 74 ++++++-
arch/x86/mm/fault.c | 193 ++++++++++++++----
arch/x86/mm/pkeys.c | 88 ++++++--
drivers/android/binder_alloc.c | 4 +-
drivers/base/firmware_loader/fallback.c | 4 +-
drivers/base/firmware_loader/main.c | 4 +-
drivers/block/drbd/drbd_main.c | 4 +-
drivers/block/drbd/drbd_receiver.c | 12 +-
drivers/dax/device.c | 2 +
drivers/dax/super.c | 2 +
drivers/firmware/efi/capsule-loader.c | 6 +-
drivers/firmware/efi/capsule.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +-
drivers/gpu/drm/gma500/gma_display.c | 4 +-
drivers/gpu/drm/gma500/mmu.c | 10 +-
drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 4 +-
.../drm/i915/gem/selftests/i915_gem_context.c | 4 +-
.../drm/i915/gem/selftests/i915_gem_mman.c | 8 +-
drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 4 +-
drivers/gpu/drm/i915/gt/intel_gtt.c | 4 +-
drivers/gpu/drm/i915/gt/shmem_utils.c | 4 +-
drivers/gpu/drm/i915/i915_gem.c | 8 +-
drivers/gpu/drm/i915/i915_gpu_error.c | 4 +-
drivers/gpu/drm/i915/selftests/i915_perf.c | 4 +-
drivers/gpu/drm/radeon/radeon_ttm.c | 4 +-
drivers/infiniband/hw/hfi1/sdma.c | 4 +-
drivers/infiniband/hw/i40iw/i40iw_cm.c | 10 +-
drivers/infiniband/sw/siw/siw_qp_tx.c | 14 +-
drivers/md/bcache/request.c | 4 +-
drivers/misc/vmw_vmci/vmci_queue_pair.c | 12 +-
drivers/mmc/host/mmc_spi.c | 4 +-
drivers/mmc/host/sdricoh_cs.c | 4 +-
drivers/mtd/mtd_blkdevs.c | 12 +-
drivers/net/ethernet/intel/igb/igb_ethtool.c | 4 +-
.../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 +-
drivers/nvdimm/pmem.c | 6 +
drivers/scsi/ipr.c | 8 +-
drivers/scsi/pmcraid.c | 8 +-
drivers/staging/rts5208/rtsx_transport.c | 4 +-
drivers/target/target_core_iblock.c | 4 +-
drivers/target/target_core_rd.c | 4 +-
drivers/target/target_core_transport.c | 4 +-
drivers/xen/gntalloc.c | 4 +-
fs/afs/dir.c | 16 +-
fs/afs/dir_edit.c | 16 +-
fs/afs/mntpt.c | 4 +-
fs/afs/write.c | 4 +-
fs/aio.c | 4 +-
fs/binfmt_elf.c | 4 +-
fs/binfmt_elf_fdpic.c | 4 +-
fs/btrfs/check-integrity.c | 4 +-
fs/btrfs/compression.c | 4 +-
fs/btrfs/inode.c | 16 +-
fs/btrfs/lzo.c | 24 +--
fs/btrfs/raid56.c | 34 +--
fs/btrfs/reflink.c | 8 +-
fs/btrfs/send.c | 4 +-
fs/btrfs/zlib.c | 32 +--
fs/btrfs/zstd.c | 20 +-
fs/cachefiles/rdwr.c | 4 +-
fs/cifs/cifsencrypt.c | 6 +-
fs/cifs/file.c | 16 +-
fs/cifs/smb2ops.c | 8 +-
fs/cramfs/inode.c | 10 +-
fs/ecryptfs/crypto.c | 8 +-
fs/ecryptfs/read_write.c | 8 +-
fs/erofs/super.c | 4 +-
fs/erofs/xattr.c | 4 +-
fs/exec.c | 10 +-
fs/ext2/dir.c | 8 +-
fs/ext2/ext2.h | 8 +
fs/ext2/namei.c | 15 +-
fs/f2fs/f2fs.h | 8 +-
fs/freevxfs/vxfs_immed.c | 4 +-
fs/fuse/readdir.c | 4 +-
fs/gfs2/bmap.c | 4 +-
fs/gfs2/ops_fstype.c | 4 +-
fs/hfs/bnode.c | 14 +-
fs/hfs/btree.c | 20 +-
fs/hfsplus/bitmap.c | 20 +-
fs/hfsplus/bnode.c | 102 ++++-----
fs/hfsplus/btree.c | 18 +-
fs/hostfs/hostfs_kern.c | 12 +-
fs/io_uring.c | 4 +-
fs/isofs/compress.c | 4 +-
fs/jffs2/file.c | 8 +-
fs/jffs2/gc.c | 4 +-
fs/nfs/dir.c | 20 +-
fs/nilfs2/alloc.c | 34 +--
fs/nilfs2/cpfile.c | 4 +-
fs/ntfs/aops.c | 4 +-
fs/reiserfs/journal.c | 4 +-
fs/romfs/super.c | 4 +-
fs/splice.c | 4 +-
fs/ubifs/file.c | 16 +-
fs/vboxsf/file.c | 12 +-
fs/zonefs/super.c | 4 +-
include/linux/entry-common.h | 3 +
include/linux/highmem.h | 63 +++++-
include/linux/memremap.h | 1 +
include/linux/mm.h | 43 ++++
include/linux/pkeys.h | 6 +-
include/linux/sched.h | 8 +
include/trace/events/kmap_thread.h | 56 +++++
init/init_task.c | 6 +
kernel/fork.c | 18 ++
kernel/kexec_core.c | 8 +-
lib/Kconfig.debug | 8 +
lib/iov_iter.c | 12 +-
lib/pks/pks_test.c | 138 +++++++++++--
lib/test_bpf.c | 4 +-
lib/test_hmm.c | 8 +-
mm/Kconfig | 13 ++
mm/debug.c | 23 +++
mm/memory.c | 8 +-
mm/memremap.c | 90 ++++++++
mm/swapfile.c | 4 +-
mm/userfaultfd.c | 4 +-
net/ceph/messenger.c | 4 +-
net/core/datagram.c | 4 +-
net/core/sock.c | 8 +-
net/ipv4/ip_output.c | 4 +-
net/sunrpc/cache.c | 4 +-
net/sunrpc/xdr.c | 8 +-
net/tls/tls_device.c | 4 +-
samples/vfio-mdev/mbochs.c | 4 +-
131 files changed, 1284 insertions(+), 565 deletions(-)
create mode 100644 include/trace/events/kmap_thread.h
--
2.28.0.rc0.12.gb6a658bd00c9
1 year, 6 months
[PATCH] daxctl: phys_index value 0 is valid
by Aneesh Kumar K.V
On power platforms we can find
# cat /sys/devices/system/memory/memory0/phys_index
00000000
This results in
libdaxctl: memblock_in_dev: dax1.0: memory0: Unable to determine phys_index: Success
Avoid considering phys_index == 0 as error.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
daxctl/lib/libdaxctl.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/daxctl/lib/libdaxctl.c b/daxctl/lib/libdaxctl.c
index ee4a069eb463..3cb89c755978 100644
--- a/daxctl/lib/libdaxctl.c
+++ b/daxctl/lib/libdaxctl.c
@@ -1229,7 +1229,7 @@ static int memblock_in_dev(struct daxctl_memory *mem, const char *memblock)
rc = sysfs_read_attr(ctx, path, buf);
if (rc == 0) {
phys_index = strtoul(buf, NULL, 16);
- if (phys_index == 0 || phys_index == ULONG_MAX) {
+ if (phys_index == ULONG_MAX) {
rc = -errno;
err(ctx, "%s: %s: Unable to determine phys_index: %s\n",
devname, memblock, strerror(-rc));
--
2.26.2
1 year, 6 months
[PATCH] x86/mm: Fix phys_to_target_node() export
by Dan Williams
The core-mm has a default __weak implementation of phys_to_target_node()
when the architecture does not override it. That symbol is exported
for modules. However, while the export in mm/memory_hotplug.c exported
the symbol in the configuration cases of:
CONFIG_NUMA_KEEP_MEMINFO=y
CONFIG_MEMORY_HOTPLUG=y
...and:
CONFIG_NUMA_KEEP_MEMINFO=n
CONFIG_MEMORY_HOTPLUG=y
...it failed to export the symbol in the case of:
CONFIG_NUMA_KEEP_MEMINFO=y
CONFIG_MEMORY_HOTPLUG=n
Always export the symbol from the CONFIG_NUMA_KEEP_MEMINFO section of
arch/x86/mm/numa.c, and teach mm/memory_hotplug.c to optionally export
in case arch/x86/mm/numa.c has already performed the export.
The dependency on NUMA_KEEP_MEMINFO for DEV_DAX_HMEM_DEVICES is invalid
now that the symbol is properly exported in all combinations of
CONFIG_NUMA_KEEP_MEMINFO and CONFIG_MEMORY_HOTPLUG. Note that in the
CONFIG_NUMA=n case no export is needed since there is a dummy static
inline implementation of phys_to_target_node() in that case.
Reported-by: Randy Dunlap <rdunlap(a)infradead.org>
Reported-by: Thomas Gleixner <tglx(a)linutronix.de>
Reported-by: kernel test robot <lkp(a)intel.com>
Fixes: a035b6bf863e ("mm/memory_hotplug: introduce default phys_to_target_node() implementation")
Cc: Joao Martins <joao.m.martins(a)oracle.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: x86(a)kernel.org
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
arch/x86/mm/numa.c | 1 +
drivers/dax/Kconfig | 1 -
mm/memory_hotplug.c | 5 +++++
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 44148691d78b..e025947f19e0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -938,6 +938,7 @@ int phys_to_target_node(phys_addr_t start)
return meminfo_to_nid(&numa_reserved_meminfo, start);
}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
int memory_add_physaddr_to_nid(u64 start)
{
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 567428e10b7b..d2834c2cfa10 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -50,7 +50,6 @@ config DEV_DAX_HMEM
Say M if unsure.
config DEV_DAX_HMEM_DEVICES
- depends on NUMA_KEEP_MEMINFO # for phys_to_target_node()
depends on DEV_DAX_HMEM && DAX=y
def_bool y
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b44d4c7ba73b..ed326b489674 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -365,9 +365,14 @@ int __weak phys_to_target_node(u64 start)
start);
return 0;
}
+
+/* If the arch did not export a strong symbol, export the weak one. */
+#ifndef CONFIG_NUMA_KEEP_MEMINFO
EXPORT_SYMBOL_GPL(phys_to_target_node);
#endif
+#endif
+
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
1 year, 6 months
Re: linux-next: Tree for Oct 29 [drivers/nvdimm/nd_e820.ko]
by Randy Dunlap
On 10/28/20 9:55 PM, Stephen Rothwell wrote:
> Hi all,
>
> Changes since 20201028:
>
on x86_64:
ERROR: modpost: "phys_to_target_node" [drivers/nvdimm/nd_e820.ko] undefined!
Full randconfig file is attached.
--
~Randy
Reported-by: Randy Dunlap <rdunlap(a)infradead.org>
1 year, 6 months
[PATCH -next] ACPI: NFIT: Fix judgment of rc is '-ENXIO'
by Zhang Qilong
Initial value of rc is '-ENXIO', and we should
use the initial value to check it.
Signed-off-by: Zhang Qilong <zhangqilong3(a)huawei.com>
---
drivers/acpi/nfit/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 756227837b3b..3a3c209ed3d3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1564,7 +1564,7 @@ static ssize_t format1_show(struct device *dev,
le16_to_cpu(nfit_dcr->dcr->code));
break;
}
- if (rc != ENXIO)
+ if (rc != -ENXIO)
break;
}
mutex_unlock(&acpi_desc->init_mutex);
--
2.17.1
1 year, 6 months
[PATCH 00/10] PKS: Add Protection Keys Supervisor (PKS) support
by ira.weiny@intel.com
From: Ira Weiny <ira.weiny(a)intel.com>
Changes from RFC V3[3]
Rebase to TIP master
Update test error output
Standardize on 'irq_state' for state variables
From Dave Hansen
Update commit messages
Add/clean up comments
Add X86_FEATURE_PKS to disabled-features.h and remove some
explicit CONFIG checks
Move saved_pkrs member of thread_struct
Remove superfluous preempt_disable()
s/irq_save_pks/irq_save_set_pks/
Ensure PKRS is not seen in faults if not configured or not
supported
s/pks_mknoaccess/pks_mk_noaccess/
s/pks_mkread/pks_mk_readonly/
s/pks_mkrdwr/pks_mk_readwrite/
Change pks_key_alloc return to -EOPNOTSUPP when not supported
From Peter Zijlstra
Clean up Attribution
Remove superfluous preempt_disable()
Add union to differentiate exit_rcu/lockdep use in
irqentry_state_t
From Thomas Gleixner
Add preliminary clean up patch and adjust series as needed
Introduce a new page protection mechanism for supervisor pages, Protection Key
Supervisor (PKS).
2 use cases for PKS are being developed, trusted keys and PMEM. Trusted keys
is a newer use case which is still being explored. PMEM was submitted as part
of the RFC (v2) series[1]. However, since then it was found that some callers
of kmap() require a global implementation of PKS. Specifically some users of
kmap() expect mappings to be available to all kernel threads. While global use
of PKS is rare it needs to be included for correctness. Unfortunately the
kmap() updates required a large patch series to make the needed changes at the
various kmap() call sites so that patch set has been split out. Because the
global PKS feature is only required for that use case it will be deferred to
that set as well.[2] This patch set is being submitted as a precursor to both
of the use cases.
For an overview of the entire PKS ecosystem, a git tree including this series
and 2 proposed use cases can be found here:
https://lore.kernel.org/lkml/20201009195033.3208459-1-ira.weiny@intel.com/
https://lore.kernel.org/lkml/20201009201410.3209180-1-ira.weiny@intel.com/
PKS enables protections on 'domains' of supervisor pages to limit supervisor
mode access to those pages beyond the normal paging protections. PKS works in
a similar fashion to user space pkeys, PKU. As with PKU, supervisor pkeys are
checked in addition to normal paging protections and Access or Writes can be
disabled via a MSR update without TLB flushes when permissions change. Also
like PKU, a page mapping is assigned to a domain by setting pkey bits in the
page table entry for that mapping.
Access is controlled through a PKRS register which is updated via WRMSR/RDMSR.
XSAVE is not supported for the PKRS MSR. Therefore the implementation
saves/restores the MSR across context switches and during exceptions. Nested
exceptions are supported by each exception getting a new PKS state.
For consistent behavior with current paging protections, pkey 0 is reserved and
configured to allow full access via the pkey mechanism, thus preserving the
default paging protections on mappings with the default pkey value of 0.
Other keys, (1-15) are allocated by an allocator which prepares us for key
contention from day one. Kernel users should be prepared for the allocator to
fail either because of key exhaustion or due to PKS not being supported on the
arch and/or CPU instance.
The following are key attributes of PKS.
1) Fast switching of permissions
1a) Prevents access without page table manipulations
1b) No TLB flushes required
2) Works on a per thread basis
PKS is available with 4 and 5 level paging. Like PKRU it consumes 4 bits from
the PTE to store the pkey within the entry.
[1] https://lore.kernel.org/lkml/20200717072056.73134-1-ira.weiny@intel.com/
[2] https://lore.kernel.org/lkml/20201009195033.3208459-2-ira.weiny@intel.com/
[3] https://lore.kernel.org/lkml/20201009194258.3207172-1-ira.weiny@intel.com/
Fenghua Yu (2):
x86/pks: Enable Protection Keys Supervisor (PKS)
x86/pks: Add PKS kernel API
Ira Weiny (7):
x86/pkeys: Create pkeys_common.h
x86/fpu: Refactor arch_set_user_pkey_access() for PKS support
x86/pks: Preserve the PKRS MSR on context switch
x86/entry: Pass irqentry_state_t by reference
x86/entry: Preserve PKRS MSR across exceptions
x86/fault: Report the PKRS state on fault
x86/pks: Add PKS test code
Thomas Gleixner (1):
x86/entry: Move nmi entry/exit into common code
Documentation/core-api/protection-keys.rst | 102 ++-
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 65 +-
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/idtentry.h | 28 +-
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/include/asm/pgtable.h | 13 +-
arch/x86/include/asm/pgtable_types.h | 12 +
arch/x86/include/asm/pkeys.h | 15 +
arch/x86/include/asm/pkeys_common.h | 40 ++
arch/x86/include/asm/processor.h | 14 +
arch/x86/include/uapi/asm/processor-flags.h | 2 +
arch/x86/kernel/cpu/common.c | 15 +
arch/x86/kernel/cpu/mce/core.c | 6 +-
arch/x86/kernel/fpu/xstate.c | 22 +-
arch/x86/kernel/kvm.c | 6 +-
arch/x86/kernel/nmi.c | 6 +-
arch/x86/kernel/process.c | 26 +
arch/x86/kernel/traps.c | 24 +-
arch/x86/mm/fault.c | 87 ++-
arch/x86/mm/pkeys.c | 191 +++++-
include/linux/entry-common.h | 46 +-
include/linux/pgtable.h | 4 +
include/linux/pkeys.h | 22 +
kernel/entry/common.c | 62 +-
lib/Kconfig.debug | 12 +
lib/Makefile | 3 +
lib/pks/Makefile | 3 +
lib/pks/pks_test.c | 691 ++++++++++++++++++++
mm/Kconfig | 2 +
tools/testing/selftests/x86/Makefile | 3 +-
tools/testing/selftests/x86/test_pks.c | 66 ++
33 files changed, 1441 insertions(+), 158 deletions(-)
create mode 100644 arch/x86/include/asm/pkeys_common.h
create mode 100644 lib/pks/Makefile
create mode 100644 lib/pks/pks_test.c
create mode 100644 tools/testing/selftests/x86/test_pks.c
--
2.28.0.rc0.12.gb6a658bd00c9
1 year, 6 months
[PATCH] MIPS: export has_transparent_hugepage() for modules
by Randy Dunlap
MIPS should export its local version of "has_transparent_hugepage"
so that loadable modules (dax) can use it.
Fixes this build error:
ERROR: modpost: "has_transparent_hugepage" [drivers/dax/dax.ko] undefined!
Fixes: fd8cfd300019 ("arch: fix has_transparent_hugepage()")
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: linux-mips(a)vger.kernel.org
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: linux-nvdimm(a)lists.01.org
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/mips/mm/tlb-r4k.c | 1 +
1 file changed, 1 insertion(+)
--- linux-next-20201022.orig/arch/mips/mm/tlb-r4k.c
+++ linux-next-20201022/arch/mips/mm/tlb-r4k.c
@@ -438,6 +438,7 @@ int has_transparent_hugepage(void)
}
return mask == PM_HUGE_MASK;
}
+EXPORT_SYMBOL(has_transparent_hugepage);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1 year, 6 months
[PATCH v7 0/7] mm: introduce memfd_secret system call to create "secret" memory areas
by Mike Rapoport
From: Mike Rapoport <rppt(a)linux.ibm.com>
Hi,
This is an implementation of "secret" mappings backed by a file descriptor.
The file descriptor backing secret memory mappings is created using a
dedicated memfd_secret system call. The desired protection mode for the
memory is configured using flags parameter of the system call. The mmap()
of the file descriptor created with memfd_secret() will create a "secret"
memory mapping. The pages in that mapping will be marked as not present in
the direct map and will have desired protection bits set in the user page
table. For instance, current implementation allows uncached mappings.
Although normally Linux userspace mappings are protected from other users,
such secret mappings are useful for environments where a hostile tenant is
trying to trick the kernel into giving them access to other tenants'
mappings.
Additionally, in the future the secret mappings may be used as a means to
protect guest memory in a virtual machine host.
For demonstration of secret memory usage we've created a userspace library
https://git.kernel.org/pub/scm/linux/kernel/git/jejb/secret-memory-preloa...
that does two things: the first is act as a preloader for openssl to
redirect all the OPENSSL_malloc calls to secret memory meaning any secret
keys get automatically protected this way and the other thing it does is
expose the API to the user who needs it. We anticipate that a lot of the
use cases would be like the openssl one: many toolkits that deal with
secret keys already have special handling for the memory to try to give
them greater protection, so this would simply be pluggable into the
toolkits without any need for user application modification.
Hiding secret memory mappings behind an anonymous file allows (ab)use of
the page cache for tracking pages allocated for the "secret" mappings as
well as using address_space_operations for e.g. page migration callbacks.
The anonymous file may be also used implicitly, like hugetlb files, to
implement mmap(MAP_SECRET) and use the secret memory areas with "native" mm
ABIs in the future.
To limit fragmentation of the direct map to splitting only PUD-size pages,
I've added an amortizing cache of PMD-size pages to each file descriptor
that is used as an allocation pool for the secret memory areas.
It is easy to add boot time reservation of the memory for secretmem
needs. There was an implementation in earlier version of this set, but I've
dropped it for now as there is no consensus whether the boot time
reservation should be done from memblock or from CMA. I believe we can have
this discussion after straightening out the basic implementation.
v7:
* Use set_direct_map() instead of __kernel_map_pages() to ensure error
handling in case the direct map update fails
* Add accounting of large pages used to reduce the direct map fragmentation
* Teach get_user_pages() and friends to refuse get/pin secretmem pages
v6: https://lore.kernel.org/lkml/20200924132904.1391-1-rppt@kernel.org
* Silence the warning about missing syscall, thanks to Qian Cai
* Replace spaces with tabs in Kconfig additions, per Randy
* Add a selftest.
v5: https://lore.kernel.org/lkml/20200916073539.3552-1-rppt@kernel.org
* rebase on v5.9-rc5
* drop boot time memory reservation patch
v4: https://lore.kernel.org/lkml/20200818141554.13945-1-rppt@kernel.org
* rebase on v5.9-rc1
* Do not redefine PMD_PAGE_ORDER in fs/dax.c, thanks Kirill
* Make secret mappings exclusive by default and only require flags to
memfd_secret() system call for uncached mappings, thanks again Kirill :)
v3: https://lore.kernel.org/lkml/20200804095035.18778-1-rppt@kernel.org
* Squash kernel-parameters.txt update into the commit that added the
command line option.
* Make uncached mode explicitly selectable by architectures. For now enable
it only on x86.
v2: https://lore.kernel.org/lkml/20200727162935.31714-1-rppt@kernel.org
* Follow Michael's suggestion and name the new system call 'memfd_secret'
* Add kernel-parameters documentation about the boot option
* Fix i386-tinyconfig regression reported by the kbuild bot.
CONFIG_SECRETMEM now depends on !EMBEDDED to disable it on small systems
from one side and still make it available unconditionally on
architectures that support SET_DIRECT_MAP.
v1: https://lore.kernel.org/lkml/20200720092435.17469-1-rppt@kernel.org
Mike Rapoport (8):
mm: add definition of PMD_PAGE_ORDER
mmap: make mlock_future_check() global
set_memory: allow set_direct_map_*_noflush() for multiple pages
mm: introduce memfd_secret system call to create "secret" memory areas
arch, mm: wire up memfd_secret system call where relevant
mm: secretmem: use PMD-size pages to amortize direct map fragmentation
secretmem: test: add basic selftest for memfd_secret(2)
mm: secretmem: add ability to reserve memory at boot
arch/Kconfig | 7 +
arch/arm64/include/asm/cacheflush.h | 4 +-
arch/arm64/include/asm/unistd.h | 2 +-
arch/arm64/include/asm/unistd32.h | 2 +
arch/arm64/include/uapi/asm/unistd.h | 1 +
arch/arm64/mm/pageattr.c | 10 +-
arch/riscv/include/asm/set_memory.h | 4 +-
arch/riscv/include/asm/unistd.h | 1 +
arch/riscv/mm/pageattr.c | 8 +-
arch/x86/Kconfig | 1 +
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/x86/include/asm/set_memory.h | 4 +-
arch/x86/mm/pat/set_memory.c | 8 +-
fs/dax.c | 11 +-
include/linux/pgtable.h | 3 +
include/linux/set_memory.h | 4 +-
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 7 +-
include/uapi/linux/magic.h | 1 +
include/uapi/linux/secretmem.h | 8 +
kernel/sys_ni.c | 2 +
mm/Kconfig | 4 +
mm/Makefile | 1 +
mm/gup.c | 10 +
mm/internal.h | 3 +
mm/mmap.c | 5 +-
mm/secretmem.c | 487 ++++++++++++++++++++++
mm/vmalloc.c | 5 +-
scripts/checksyscalls.sh | 4 +
tools/testing/selftests/vm/.gitignore | 1 +
tools/testing/selftests/vm/Makefile | 3 +-
tools/testing/selftests/vm/memfd_secret.c | 296 +++++++++++++
tools/testing/selftests/vm/run_vmtests | 17 +
34 files changed, 892 insertions(+), 35 deletions(-)
create mode 100644 include/uapi/linux/secretmem.h
create mode 100644 mm/secretmem.c
create mode 100644 tools/testing/selftests/vm/memfd_secret.c
--
2.28.0
1 year, 6 months
[PATCH] mm/mremap_pages: Fix static key devmap_managed_key updates
by Ralph Campbell
commit 6f42193fd86e ("memremap: don't use a separate devm action for
devmap_managed_enable_get") changed the static key updates such that we
now call devmap_managed_enable_put() without doing the equivalent
devmap_managed_enable_get().
devmap_managed_enable_get() is only called for MEMORY_DEVICE_PRIVATE and
MEMORY_DEVICE_FS_DAX, but memunmap_pages() gets called for other pgmap
types too. This results in the below warning when switching between
system-ram and devdax mode for devdax namespace.
jump label: negative count!
WARNING: CPU: 52 PID: 1335 at kernel/jump_label.c:235 static_key_slow_try_dec+0x88/0xa0
Modules linked in:
....
NIP [c000000000433318] static_key_slow_try_dec+0x88/0xa0
LR [c000000000433314] static_key_slow_try_dec+0x84/0xa0
Call Trace:
[c000000025c1f660] [c000000000433314] static_key_slow_try_dec+0x84/0xa0
[c000000025c1f6d0] [c000000000433664] __static_key_slow_dec_cpuslocked+0x34/0xd0
[c000000025c1f700] [c0000000004337a4] static_key_slow_dec+0x54/0xf0
[c000000025c1f770] [c00000000059c49c] memunmap_pages+0x36c/0x500
[c000000025c1f820] [c000000000d91d10] devm_action_release+0x30/0x50
[c000000025c1f840] [c000000000d92e34] release_nodes+0x2f4/0x3e0
[c000000025c1f8f0] [c000000000d8b15c] device_release_driver_internal+0x17c/0x280
[c000000025c1f930] [c000000000d883a4] bus_remove_device+0x124/0x210
[c000000025c1f9b0] [c000000000d80ef4] device_del+0x1d4/0x530
[c000000025c1fa70] [c000000000e341e8] unregister_dev_dax+0x48/0xe0
[c000000025c1fae0] [c000000000d91d10] devm_action_release+0x30/0x50
[c000000025c1fb00] [c000000000d92e34] release_nodes+0x2f4/0x3e0
[c000000025c1fbb0] [c000000000d8b15c] device_release_driver_internal+0x17c/0x280
[c000000025c1fbf0] [c000000000d87000] unbind_store+0x130/0x170
[c000000025c1fc30] [c000000000d862a0] drv_attr_store+0x40/0x60
[c000000025c1fc50] [c0000000006d316c] sysfs_kf_write+0x6c/0xb0
[c000000025c1fc90] [c0000000006d2328] kernfs_fop_write+0x118/0x280
[c000000025c1fce0] [c0000000005a79f8] vfs_write+0xe8/0x2a0
[c000000025c1fd30] [c0000000005a7d94] ksys_write+0x84/0x140
[c000000025c1fd80] [c00000000003a430] system_call_exception+0x120/0x270
[c000000025c1fe20] [c00000000000c540] system_call_common+0xf0/0x27c
Cc: Christoph Hellwig <hch(a)infradead.org>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: linux-nvdimm(a)lists.01.org
Cc: Jason Gunthorpe <jgg(a)mellanox.com>
Signed-off-by: Ralph Campbell <rcampbell(a)nvidia.com>
Reported-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Tested-by: Sachin Sant <sachinp(a)linux.vnet.ibm.com>
---
Andrew, I guess this is for the merge window since it fixes a top-of-tree
problem.
mm/memremap.c | 39 ++++++++++++++++-----------------------
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/mm/memremap.c b/mm/memremap.c
index 73a206d0f645..16b2fb482da1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -41,28 +41,24 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
-static void devmap_managed_enable_put(void)
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
- static_branch_dec(&devmap_managed_key);
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_dec(&devmap_managed_key);
}
-static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
- (!pgmap->ops || !pgmap->ops->page_free)) {
- WARN(1, "Missing page_free method\n");
- return -EINVAL;
- }
-
- static_branch_inc(&devmap_managed_key);
- return 0;
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_inc(&devmap_managed_key);
}
#else
-static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- return -EINVAL;
}
-static void devmap_managed_enable_put(void)
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
@@ -169,7 +165,7 @@ void memunmap_pages(struct dev_pagemap *pgmap)
pageunmap_range(pgmap, i);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
- devmap_managed_enable_put();
+ devmap_managed_enable_put(pgmap);
}
EXPORT_SYMBOL_GPL(memunmap_pages);
@@ -307,7 +303,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
.pgprot = PAGE_KERNEL,
};
const int nr_range = pgmap->nr_range;
- bool need_devmap_managed = true;
int error, i;
if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
@@ -323,6 +318,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
if (!pgmap->owner) {
WARN(1, "Missing owner\n");
return ERR_PTR(-EINVAL);
@@ -336,11 +335,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_GENERIC:
- need_devmap_managed = false;
break;
case MEMORY_DEVICE_PCI_P2PDMA:
params.pgprot = pgprot_noncached(params.pgprot);
- need_devmap_managed = false;
break;
default:
WARN(1, "Invalid pgmap type %d\n", pgmap->type);
@@ -364,11 +361,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
}
- if (need_devmap_managed) {
- error = devmap_managed_enable_get(pgmap);
- if (error)
- return ERR_PTR(error);
- }
+ devmap_managed_enable_get(pgmap);
/*
* Clear the pgmap nr_range as it will be incremented for each
--
2.20.1
1 year, 7 months