[PATCH] dax: Fix missed PMD wakeups
by Dan Williams
Ever since the conversion of DAX to the Xarray a RocksDB benchmark has
been encountering intermittent lockups. In the failing case a thread
that is taking a PMD-fault is awaiting a wakeup while holding the
'mmap_sem' for read. As soon as the next mmap() event occurs that tries
to take the 'mmap_sem' for write it causes ps(1) and any new 'mmap_sem'
reader to block.
Debug shows that there are no outstanding Xarray entry-lock holders in
the hang state which indicates that a PTE lock-holder thread caused a
PMD thread to wait. When the PTE index-lock is released it may wake the
wrong waitqueue depending on how the index hashes. Brute-force fix this
by arranging for PTE-aligned indices within a PMD-span to hash to the
same waitqueue as the PMD-index.
This fix may increase waitqueue contention, but a fix for that is saved
for a larger rework. In the meantime this fix is suitable for -stable
backports.
Link: https://lore.kernel.org/linux-fsdevel/CAPcyv4hwHpX-MkUEqxwdTj7wCCZCN4RV-L...>
Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray")
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Boaz Harrosh <openosd(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Reported-by: Robert Barror <robert.barror(a)intel.com>
Reported-by: Seema Pandit <seema.pandit(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
fs/dax.c | 34 ++++++++++++----------------------
1 file changed, 12 insertions(+), 22 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 9fd908f3df32..592944c522b8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -144,19 +144,14 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
-static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
- void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_index_waitqueue(struct xa_state *xas,
+ struct exceptional_entry_key *key)
{
unsigned long hash;
unsigned long index = xas->xa_index;
- /*
- * If 'entry' is a PMD, align the 'index' that we use for the wait
- * queue to the start of that PMD. This ensures that all offsets in
- * the range covered by the PMD map to the same bit lock.
- */
- if (dax_is_pmd_entry(entry))
- index &= ~PG_PMD_COLOUR;
+ /* PMD-align the index to ensure PTE events wakeup PMD waiters */
+ index &= ~PG_PMD_COLOUR;
key->xa = xas->xa;
key->entry_start = index;
@@ -177,17 +172,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
return autoremove_wake_function(wait, mode, sync, NULL);
}
-/*
- * @entry may no longer be the entry at the index in the mapping.
- * The important information it's conveying is whether the entry at
- * this index used to be a PMD entry.
- */
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
+static void dax_wake_index(struct xa_state *xas, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
- wq = dax_entry_waitqueue(xas, entry, &key);
+ wq = dax_index_waitqueue(xas, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -222,7 +212,7 @@ static void *get_unlocked_entry(struct xa_state *xas)
!dax_is_locked(entry))
return entry;
- wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ wq = dax_index_waitqueue(xas, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
xas_unlock_irq(xas);
@@ -246,7 +236,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
init_wait(&ewait.wait);
ewait.wait.func = wake_exceptional_entry_func;
- wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ wq = dax_index_waitqueue(xas, &ewait.key);
/*
* Unlike get_unlocked_entry() there is no guarantee that this
* path ever successfully retrieves an unlocked entry before an
@@ -263,7 +253,7 @@ static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
/* If we were the only waiter woken, wake the next one */
if (entry)
- dax_wake_entry(xas, entry, false);
+ dax_wake_index(xas, false);
}
/*
@@ -281,7 +271,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry)
old = xas_store(xas, entry);
xas_unlock_irq(xas);
BUG_ON(!dax_is_locked(old));
- dax_wake_entry(xas, entry, false);
+ dax_wake_index(xas, false);
}
/*
@@ -522,7 +512,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
dax_disassociate_entry(entry, mapping, false);
xas_store(xas, NULL); /* undo the PMD join */
- dax_wake_entry(xas, entry, true);
+ dax_wake_index(xas, true);
mapping->nrexceptional--;
entry = NULL;
xas_set(xas, index);
@@ -915,7 +905,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_lock_irq(xas);
xas_store(xas, entry);
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
- dax_wake_entry(xas, entry, false);
+ dax_wake_index(xas, false);
trace_dax_writeback_one(mapping->host, index, count);
return ret;
2 years, 9 months
转发:GD&T形位公差与尺寸链计算
by 致企业的一封信
---- 原邮件信息 -----
发件人:致企业的一封信<rycb(a)gx.com>
收件人:linux-nvdimm <linux-nvdimm(a)lists.01.org>;
发送时间:2019-7-29 19:16:41
2 years, 9 months
xfs quota test xfs/050 fails with dax mount option and "-d su=2m,sw=1" mkfs option
by Murphy Zhou
Hi,
As subject.
-d su=2m,sw=1 && -o dax fail
-d su=2m,sw=1 && NO dax pass
no su mkfs option && -o dax pass
no su mkfs option && NO dax pass
On latest Linus tree. Reproduce every time.
Testing on older kernels is ongoing to see if it's a regression.
Is this failure expected?
Thanks,
M
# fail with 2m su mkfs option and dax mount option:
FSTYP -- xfs (debug)
PLATFORM -- Linux/x86_64 7u 5.3.0-rc1-master-ad5e427+ #126 SMP Wed Jul 24 14:46:09 CST 2019
MKFS_OPTIONS -- -f -f -b size=4096 -d su=2m,sw=1 /dev/pmem1
MOUNT_OPTIONS -- -o dax -o context=system_u:object_r:root_t:s0 /dev/pmem1 /test1
xfs/050 4s ... [05:30:52] [05:30:56]- output mismatch (see /root/xfstests-dev/results//xfs/050.out.bad)
--- tests/xfs/050.out 2019-05-07 02:34:03.391107482 -0400
+++ /root/xfstests-dev/results//xfs/050.out.bad 2019-07-24 05:30:56.483044548 -0400
@@ -29,6 +29,7 @@
*** push past the hard block limit (expect EDQUOT)
[ROOT] 0 0 0 00 [--------] 3 0 0 00 [--------] 0 0 0 00 [--------]
[NAME] =OK= 200 1000 0 [7 days] 10 4 10 00 [7 days] 0 0 0 00 [--------]
+ URK 99: 2097152 is out of range! [3481600,4096000]
*** unmount
*** group
...
(Run 'diff -u /root/xfstests-dev/tests/xfs/050.out /root/xfstests-dev/results//xfs/050.out.bad' to see the entire diff)
Ran: xfs/050
Failures: xfs/050
Failed 1 of 1 tests
~
[root@7u ~]# diff -u /root/xfstests-dev/tests/xfs/050.out /root/xfstests-dev/results//xfs/050.out.bad
--- /root/xfstests-dev/tests/xfs/050.out 2019-05-07 02:34:03.391107482 -0400
+++ /root/xfstests-dev/results//xfs/050.out.bad 2019-07-24 05:30:56.483044548 -0400
@@ -29,6 +29,7 @@
*** push past the hard block limit (expect EDQUOT)
[ROOT] 0 0 0 00 [--------] 3 0 0 00 [--------] 0 0 0 00 [--------]
[NAME] =OK= 200 1000 0 [7 days] 10 4 10 00 [7 days] 0 0 0 00 [--------]
+ URK 99: 2097152 is out of range! [3481600,4096000]
*** unmount
*** group
@@ -61,6 +62,7 @@
*** push past the hard block limit (expect EDQUOT)
[ROOT] 0 0 0 00 [--------] 3 0 0 00 [--------] 0 0 0 00 [--------]
[NAME] =OK= 200 1000 0 [7 days] 10 4 10 00 [7 days] 0 0 0 00 [--------]
+ URK 99: 2097152 is out of range! [3481600,4096000]
*** unmount
*** uqnoenforce
@@ -157,6 +159,7 @@
*** push past the hard block limit (expect EDQUOT)
[ROOT] 0 0 0 00 [--------] 3 0 0 00 [--------] 0 0 0 00 [--------]
[NAME] =OK= 200 1000 0 [7 days] 9 4 10 00 [7 days] 0 0 0 00 [--------]
+ URK 1: 2097152 is out of range! [3481600,4096000]
*** unmount
*** pqnoenforce
#Pass without dax:
FSTYP -- xfs (debug)
PLATFORM -- Linux/x86_64 7u 5.3.0-rc1-master-ad5e427+ #126 SMP Wed Jul 24 14:46:09 CST 2019
MKFS_OPTIONS -- -f -f -b size=4096 /dev/pmem1
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/pmem1 /test1
xfs/050 [05:30:35] [05:30:39] 4s
Ran: xfs/050
Passed all 1 tests
#Pass without 2m su mkfs option and with dax option:
FSTYP -- xfs (debug)
PLATFORM -- Linux/x86_64 7u 5.3.0-rc1-master-ad5e427+ #126 SMP Wed Jul 24 14:46:09 CST 2019
MKFS_OPTIONS -- -f -f -b size=4096 /dev/pmem1
MOUNT_OPTIONS -- -o dax -o context=system_u:object_r:root_t:s0 /dev/pmem1 /test1
xfs/050 4s ... [05:34:13] [05:34:17] 4s
Ran: xfs/050
Passed all 1 tests
# Pass with 2m su mkfs option and without dax mount option:
FSTYP -- xfs (debug)
PLATFORM -- Linux/x86_64 7u 5.3.0-rc1-master-ad5e427+ #126 SMP Wed Jul 24 14:46:09 CST 2019
MKFS_OPTIONS -- -f -f -b size=4096 -d su=2m,sw=1 /dev/pmem1
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/pmem1 /test1
xfs/050 4s ... [05:36:08] [05:36:12] 4s
Ran: xfs/050
Passed all 1 tests
2 years, 9 months
Оценка по KPI
by Helena
Оплата по результату KPI – мотивация 4.0
Как с ограниченным бюджетом на
персонал добиться роста
эффективности персонала и компании
Оплата по результату: KPI-мотивация –
это практичная и эффективная
технология, которая позволит:
- разработать систему оценки
сотрудников сбытовых и сервисных
подразделений
- разработать систему материального и
нематериального стимулирования
- построить систему так, чтобы оплата
премий и бонусов формировалась за
счет дополнительно полученной
прибыли вплоть до должности а так же
по факту практикума
- Участники получат проекты систем
оплаты и премирования для ряда
должностей, а также примеры от
Авторов.
Для кого: Программа предназначена
руководителям и топ-менеджерам
компаний, руководителям
подразделений и HR-специалистам,
ответственным за разработку и
внедрение систем оценки и
материального и нематериального
стимулирования персонала.
Бизнес-программа:
Модуль №1. Как добиваться пиковой
эффективности с помощью правильной
системы KPI-мотивации
- Какие показатели действительно
имеют значение
- Как оценить работу каждого
сотрудника
- Как различать лучших сотрудников
- Какой размер фонда оплаты труда
должен быть в компании
Порог и потолок премиального фонда,
Процентомания: плюсы и минусы
- Пример распределения премиального
фонда в зависимости от сезонности
продаж
- Связь KPI с бонусами
- Как внедрить систему KPI-мотивации
- Ошибки внедрения KPI-мотивации и как
их преодолеть
- Пример Плана внедрения KPI-мотивации
Модуль №2. KPI-МОТИВАЦИЯ В ПРОДАЖАХ И
МАРКЕТИНГЕ
- Как обеспечить ПРОЗРАЧНОСТЬ отдела
продаж и каждого сотрудника?
- Какие планы продаж установить на
текущий год и месяц для компании и
каждого торгового представителя?
- Если у вас недостаточный прирост
новых клиентов, какие показатели
установить, чтобы обеспечить
существенный прирост.
- Как мотивировать уменьшить
просроченную дебиторскую
задолженность и оборачиваемость ДЗ
- Как мотивировать на сохранение,
расширение клиентской базы
- Как мотивировать на рост
прибыльности клиентов
- Как обеспечить и измерить
удовлетворенность целевых клиентов
- Управление затратами на сбыт.
Затраты на приращённый объем продаж
Модуль №3. KPI-МОТИВАЦИЯ В СНАБЖЕНИИ И
ПРОИЗВОДСТВЕ
- Как оценить работу начальника
отдела производства, бригадиров, ОТК,
службы по охране труда, инженеров,
мастеров участков, начальника отдела
снабжения, менеджеров по снабжению
- Как определить качественные
показатели
- Схемы расчета фонда оплаты труда:
ставки и бонусы
- Как сформировать бюджет
производства
- Как планировать остатки готовой
продукции
МОДУЛЬ №4. KPI-МОТИВАЦИЯ ДЛЯ СКЛАДА И
ТРАНСПОРТНОЙ СЛУЖБЫ.
- Как оценить работу сотрудников
склада и ТЭС
- За достижение каких показателей
платятся бонусы
- Как сформировать бюджет
складирования
- Как планировать штат складских
работников и ТЭС
- Как сформировать бюджет ТЭС
МОДУЛЬ №5. KPI-МОТИВАЦИЯ СОТРУДНИКОВ
БЭК-ОФИСА: БУХГАЛТЕРИИ, HR-СЛУЖБЫ, ИТ.
- Как оценить объем выполняемой
работы сотрудниками
- Как оценить качество работы
- Как рассчитать потребность в
персонале по отделам
- Как сократить фонд оплаты труда, не
снижая мотивации сотрудников?
МОДУЛЬ №6. КАК ЗАСТАВИТЬ ЛЮДЕЙ "ГОРЕТЬ"
НА РАБОТЕ
Как превратить ленивых бездельников
в амбициозных трудоголиков.
- Как измерить Счастье на работе и как
Счастье сотрудника влияет на его
результативность
- Как изменить поведение сотрудников.
"Правильные люди в команде". Поиск
людей с высокой внутренней мотивацией
- Как цели зажигают сотрудников?
- Тестируем на себе. Что Вас
мотивирует кроме зарплаты
- Влейте в работу душу. Нематериальное
стимулирование
Дата: 8 августа
Заполнить заявку для участия >
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/9...
Если у Вас возникнут дополнительные
вопросы -
мы всегда на связи:
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/f...
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/e...
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/8...
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/4...
http://stata.kpi4.in.ua/campaigns/ae67408eja1ea/track-url/mc072o9kbl74a/2...
Уважаемый подписчик. Данное письмо не
требует ответа.
Сooбщение пoдгoтoвленo и aдресoвaнo нa
электрoнный aдрес linux-nvdimm(a)lists.01.org.
Для отказа от рассылки щёлкните по
ссылке отказ от почты List-Unsubscribe
http://stata.kpi4.in.ua/lists/xx572kqs3sa91/unsubscribe/mc072o9kbl74a/ae6...
или пожаловаться на Спам
http://stata.kpi4.in.ua/lists/xx572kqs3sa91/unsubscribe/mc072o9kbl74a/ae6...
2 years, 9 months
[GIT PULL] libnvdimm fixes for 5.3-rc2
by Dan Williams
Hi Linus, please pull from:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.3-rc2
...to receive a collection of locking and async operations fixes for
v5.3-rc2. These had been soaking in a branch targeting the merge
window, but missed due to a regression hunt. This fixed up version has
otherwise been in -next this past week with no reported issues.
In order to gain confidence in the locking changes the pull also
includes a debug / instrumentation patch to enable lockdep coverage
for libnvdimm subsystem operations that depend on the device_lock for
exclusion. As mentioned in the changelog it is a hack, but it works
and documents the locking expectations of the sub-system in a way that
others can use lockdep to verify. The driver core touches got an ack
from Greg.
Please pull, but I'll understand if you want a resend with the debug
patch dropped.
---
The following changes since commit d1fdb6d8f6a4109a4263176c84b899076a5f8008:
Linux 5.2-rc4 (2019-06-08 20:24:46 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.3-rc2
for you to fetch changes up to 87a30e1f05d73a34e6d1895065541369131aaf1c:
driver-core, libnvdimm: Let device subsystems add local lockdep
coverage (2019-07-18 16:23:27 -0700)
----------------------------------------------------------------
libnvdimm fixes v5.3-rc2
- Fix duplicate device_unregister() calls (multiple threads competing to
do unregister work when scheduling device removal from a sysfs attribute
of the self-same device).
- Fix badblocks registration order bug. Ensure region badblocks are
initialized in advance of namespace registration.
- Fix a deadlock between the bus lock and probe operations.
- Export device-core infrastructure to coordinate async operations via
the device ->dead state.
- Add device-core infrastructure to validate device_lock() usage with
lockdep.
----------------------------------------------------------------
Dan Williams (7):
drivers/base: Introduce kill_device()
libnvdimm/bus: Prevent duplicate device_unregister() calls
libnvdimm/region: Register badblocks before namespaces
libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant
libnvdimm/bus: Stop holding nvdimm_bus_list_mutex over __nd_ioctl()
libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock
driver-core, libnvdimm: Let device subsystems add local lockdep coverage
drivers/acpi/nfit/core.c | 28 +++---
drivers/acpi/nfit/nfit.h | 24 +++++
drivers/base/core.c | 30 ++++--
drivers/nvdimm/btt_devs.c | 16 +--
drivers/nvdimm/bus.c | 210 ++++++++++++++++++++++++++--------------
drivers/nvdimm/core.c | 10 +-
drivers/nvdimm/dimm_devs.c | 4 +-
drivers/nvdimm/namespace_devs.c | 36 +++----
drivers/nvdimm/nd-core.h | 71 +++++++++++++-
drivers/nvdimm/pfn_devs.c | 24 ++---
drivers/nvdimm/pmem.c | 4 +-
drivers/nvdimm/region.c | 24 ++---
drivers/nvdimm/region_devs.c | 12 ++-
include/linux/device.h | 6 ++
14 files changed, 343 insertions(+), 156 deletions(-)
2 years, 10 months
[ndctl PATCH v7 00/13] daxctl: add a new reconfigure-device command
by Vishal Verma
Changes in v7:
- Fix a couple of checkpatch type errors in the new lines added in v6 (Dan).
- Get rid of daxctl_dev_get_mode. daxctl_dev_get_memory is sufficient to
both check the mode and allocate the memory related structures on its
first call. (Dan)
- Due to the above, daxctl_dev_mode is now private to libdaxctl, and not
part of the API exported through libdaxctl.h
- Add a large enough buffer at init time to construct dynamic paths, and avoid
asprintf() type allocations for memory blocks at runtime (Dan).
Changes in v6:
- For memory block online/offline operations, the kernel responds with
an EINVAL for both 'real' errors, and if the memory was already in the
requested state. Since there is a TOCTOU hole between checking the
state and storing it, just perform a second check if the store results
in an error. If the check shows the state to be the same as the one
we're attempting, it means that another agent (usually udev) won the
race, but we don't care so long as the state change happened, so don't
report an error. (Fan Du)
Changes in v5:
- device.c: correctly set loglevel for daxctl_ctx for --verbose
- drop the subsys caching, its complexity started to exceed its
benefit. dax-class device models will simply error out during
reconfigure. (Dan)
- Add a note to the man page for the above.
- Clarify the onlining policy (online_movable) in the man page
- rename "numa_node" to "target_node" in device listings (Dan)
- When printing a device 'mode', assume devdax if !system-ram,
avoiding a "mode: unknown" situation which can be confusing. (Dan)
- Add a "state: disabled" attribute to the device listing if a driver
is not bound. This is more apt than the previous "mode: unknown"
listing.
- add an api to get 'dev->resource' parsing /proc/iomem as a
fallback for when the kernel doesn't provide the attribute (Dan)
- convert node_* apis to 'memory_* apis that act on a new daxctl_memory
object (Dan)
- online only memory sections belonging to the device in question by
cross referencing block indices with the dax device resource (Dan)
- Refuse to reconfigure a device that is already in the target mode.
Until now, reconfiguring a system-ram device back to system-ram would
result in a 'online memory may not be hot-removed' kernel warning.
- If the device was already in the system-ram mode, skip
disabling/enabling, but still try to online the memory unless the
--no-online option is in effect.
- In daxctl_unbind, also 'remove_id' to prevent devices automatically
binding to the kmem driver on a disable + re-enable, which can be
surprising (Dan).
- Rewrite the top half of daxctl/device.c to borrow elements from
ndctl/namespace.c so that it can support growing additional commands
that operate on devices (online-memory and offline-memory)
- Refactor the bottom half of daxctl/device.c so we only do the
disabling/offlining steps if the device was enabled.
- Add new commands to online and offline memory sections
associated with a given dax device (Dan)
- Add a new test - daxctl-device.sh - to test daxctl reconfigure-device,
online-memory, and offline-memory commands.
- Add an example in documentation demonstrating how to use numactl
to bind a process to a node surfaced from a dax device (Andy Rudoff)
Changes in v4:
- Don't fail add_dax_dev for kmod failures. Instead fail only when the kmod
list is actually used, i.e. during daxctl-reconfigure-device
Changes in v3:
- In daxctl_dev_get_mode(), remove the subsystem warning, detect dax-class
and simply make it return devdax
Changes in v2:
- Add examples to the documentation page (Dave Hansen)
- Clarify documentation regarding the conversion from system-ram to devdax
- Remove any references to a persistent config from the documentation -
those can be added when the feature is added.
- device.c: validate option compatibility
- daxctl-list: display numa_node for device listings
- daxctl-list: display mode for device listings
- make the options more consistent by adding a '-O' short option
for --attempt-offline
Add a new daxctl-reconfigure-device command that lets us reconfigure DAX
devices back and forth between 'system-ram' and 'device-dax' modes. It
also includes facilities to online any newly hot-plugged memory
(default), and attempt to offline memory before converting away from the
system-ram mode (not default, requires a --attempt-offline option).
Currently missing from this series is a way to persistently store which
devices have been 'marked' for use as system-ram. This depends on a
config system overhaul in ndctl, and patches for those will follow
separately and are independent of this work.
Example invocations:
1. Reconfigure dax0.0 to system-ram mode, don’t online the memory
# daxctl reconfigure-device --mode=system-ram --no-online dax0.0
[
{
"chardev":"dax0.0",
"size":16777216000,
"target_node":2,
"mode":"system-ram"
}
]
2. Reconfigure dax0.0 to devdax mode, attempt to offline the memory
# daxctl reconfigure-device --human --mode=devdax --attempt-offline dax0.0
{
"chardev":"dax0.0",
"size":"15.63 GiB (16.78 GB)",
"target_node":2,
"mode":"devdax"
}
3. Reconfigure all dax devices on region0 to system-ram mode
# daxctl reconfigure-device --mode=system-ram --region=0 all
[
{
"chardev":"dax0.0",
"size":16777216000,
"target_node":2,
"mode":"system-ram"
},
{
"chardev":"dax0.1",
"size":16777216000,
"target_node":3,
"mode":"system-ram"
}
]
These patches can also be found in the 'kmem-pending' branch on github:
https://github.com/pmem/ndctl/tree/kmem-pending
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Vishal Verma (13):
libdaxctl: add interfaces to get ctx and check device state
libdaxctl: add interfaces to enable/disable devices
libdaxctl: add an interface to retrieve the device resource
libdaxctl: add a 'daxctl_memory' object for memory based operations
daxctl/list: add target_node for device listings
daxctl/list: display the mode for a dax device
daxctl: add a new reconfigure-device command
Documentation/daxctl: add a man page for daxctl-reconfigure-device
daxctl: add commands to online and offline memory
Documentation: Add man pages for daxctl-{on,off}line-memory
contrib/ndctl: fix region-id completions for daxctl
contrib/ndctl: add bash-completion for the new daxctl commands
test: Add a unit test for daxctl-reconfigure-device and friends
Documentation/daxctl/Makefile.am | 5 +-
.../daxctl/daxctl-offline-memory.txt | 72 ++
Documentation/daxctl/daxctl-online-memory.txt | 80 ++
.../daxctl/daxctl-reconfigure-device.txt | 139 ++++
Makefile.am | 3 +-
contrib/ndctl | 38 +-
daxctl/Makefile.am | 2 +
daxctl/builtin.h | 3 +
daxctl/daxctl.c | 3 +
daxctl/device.c | 476 ++++++++++++
daxctl/lib/Makefile.am | 5 +-
daxctl/lib/libdaxctl-private.h | 38 +
daxctl/lib/libdaxctl.c | 685 ++++++++++++++++++
daxctl/lib/libdaxctl.sym | 18 +
daxctl/libdaxctl.h | 16 +
test/Makefile.am | 3 +-
test/common | 19 +-
test/daxctl-devices.sh | 81 +++
util/iomem.c | 37 +
util/iomem.h | 12 +
util/json.c | 22 +
21 files changed, 1743 insertions(+), 14 deletions(-)
create mode 100644 Documentation/daxctl/daxctl-offline-memory.txt
create mode 100644 Documentation/daxctl/daxctl-online-memory.txt
create mode 100644 Documentation/daxctl/daxctl-reconfigure-device.txt
create mode 100644 daxctl/device.c
create mode 100755 test/daxctl-devices.sh
create mode 100644 util/iomem.c
create mode 100644 util/iomem.h
--
2.20.1
2 years, 10 months
[PATCH AUTOSEL 5.2 67/85] device-dax: fix memory and resource leak if hotplug fails
by Sasha Levin
From: Pavel Tatashin <pasha.tatashin(a)soleen.com>
[ Upstream commit 31e4ca92a7dd4cdebd7fe1456b3b0b6ace9a816f ]
Patch series ""Hotremove" persistent memory", v6.
Recently, support for using persistent memory like regular RAM was
added to Linux. This work extends this functionality to also allow hot
removing persistent memory.
We (Microsoft) have an important use case for this functionality.
The requirement is for physical machines with a small amount of RAM (~8G)
to be able to reboot in a very short period of time (<1s). Yet, there
is a userland state that is expensive to recreate (~2G).
The solution is to boot machines with 2G preserved for persistent
memory.
Copy the state, and hotadd the persistent memory so machine still has
all 8G available for runtime. Before reboot, offline and hotremove
device-dax 2G, copy the memory that is needed to be preserved to pmem0
device, and reboot.
The series of operations looks like this:
1. After boot restore /dev/pmem0 to ramdisk to be consumed by apps.
and free ramdisk.
2. Convert raw pmem0 to devdax
ndctl create-namespace --mode devdax --map mem -e namespace0.0 -f
3. Hotadd to System RAM
echo dax0.0 > /sys/bus/dax/drivers/device_dax/unbind
echo dax0.0 > /sys/bus/dax/drivers/kmem/new_id
echo online_movable > /sys/devices/system/memoryXXX/state
4. Before reboot hotremove device-dax memory from System RAM
echo offline > /sys/devices/system/memoryXXX/state
echo dax0.0 > /sys/bus/dax/drivers/kmem/unbind
5. Create raw pmem0 device
ndctl create-namespace --mode raw -e namespace0.0 -f
6. Copy the state that was stored by apps to ramdisk to pmem device
7. Do kexec reboot or reboot through firmware if firmware does not
zero memory in pmem0 region (These machines have only regular
volatile memory). So to have pmem0 device either memmap kernel
parameter is used, or devices nodes in dtb are specified.
This patch (of 3):
When add_memory() fails, the resource and the memory should be freed.
Link: http://lkml.kernel.org/r/20190517215438.6487-2-pasha.tatashin@soleen.com
Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM")
Signed-off-by: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Reviewed-by: Dave Hansen <dave.hansen(a)intel.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Fengguang Wu <fengguang.wu(a)intel.com>
Cc: Huang Ying <ying.huang(a)intel.com>
Cc: James Morris <jmorris(a)namei.org>
Cc: Jérôme Glisse <jglisse(a)redhat.com>
Cc: Keith Busch <keith.busch(a)intel.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Ross Zwisler <zwisler(a)kernel.org>
Cc: Sasha Levin <sashal(a)kernel.org>
Cc: Takashi Iwai <tiwai(a)suse.de>
Cc: Tom Lendacky <thomas.lendacky(a)amd.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Yaowei Bai <baiyaowei(a)cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/dax/kmem.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index a02318c6d28a..4c0131857133 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -66,8 +66,11 @@ int dev_dax_kmem_probe(struct device *dev)
new_res->name = dev_name(dev);
rc = add_memory(numa_node, new_res->start, resource_size(new_res));
- if (rc)
+ if (rc) {
+ release_resource(new_res);
+ kfree(new_res);
return rc;
+ }
return 0;
}
--
2.20.1
2 years, 10 months
[PATCH v2 0/1] mm/memory-failure: Poison read receives SIGKILL instead of SIGBUS issue
by Jane Chu
Changes in v2:
- move 'tk' allocations internal to add_to_kill(), suggested by Dan;
- ran checkpatch.pl check, pointed out by Matthew;
- Naoya pointed out that v1 would have missed the SIGKILL
if "tk->addr == -EFAULT", since the code returns early.
Incorporated Naoya's suggestion; also, skip VMAs where
"tk->size_shift == 0" for zone device page, and deliver SIGBUS
when "tk->size_shift != 0" so the payload is helpful;
- added Suggested-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Jane Chu (1):
mm/memory-failure: Poison read receives SIGKILL instead of SIGBUS if
mmaped more than once
mm/memory-failure.c | 62 ++++++++++++++++++++++-------------------------------
1 file changed, 26 insertions(+), 36 deletions(-)
--
1.8.3.1
2 years, 10 months
[ndctl PATCH v6 00/13] daxctl: add a new reconfigure-device command
by Vishal Verma
Changes in v6:
- For memory block online/offline operations, the kernel responds with
an EINVAL for both 'real' errors, and if the memory was already in the
requested state. Since there is a TOCTOU hole between checking the
state and storing it, just perform a second check if the store results
in an error. If the check shows the state to be the same as the one
we're attempting, it means that another agent (usually udev) won the
race, but we don't care so long as the state change happened, so don't
report an error. (Fan Du)
Changes in v5:
- device.c: correctly set loglevel for daxctl_ctx for --verbose
- drop the subsys caching, its complexity started to exceed its
benefit. dax-class device models will simply error out during
reconfigure. (Dan)
- Add a note to the man page for the above.
- Clarify the onlining policy (online_movable) in the man page
- rename "numa_node" to "target_node" in device listings (Dan)
- When printing a device 'mode', assume devdax if !system-ram,
avoiding a "mode: unknown" situation which can be confusing. (Dan)
- Add a "state: disabled" attribute to the device listing if a driver
is not bound. This is more apt than the previous "mode: unknown"
listing.
- add an api to get 'dev->resource' parsing /proc/iomem as a
fallback for when the kernel doesn't provide the attribute (Dan)
- convert 'node_*' apis to 'memory_*' apis that act on a new daxctl_memory
object (Dan)
- online only memory sections belonging to the device in question by
cross referencing block indices with the dax device resource (Dan)
- Refuse to reconfigure a device that is already in the target mode.
Until now, reconfiguring a system-ram device back to system-ram would
result in a 'online memory may not be hot-removed' kernel warning.
- If the device was already in the system-ram mode, skip
disabling/enabling, but still try to online the memory unless the
--no-online option is in effect.
- In daxctl_unbind, also 'remove_id' to prevent devices automatically
binding to the kmem driver on a disable + re-enable, which can be
surprising (Dan).
- Rewrite the top half of daxctl/device.c to borrow elements from
ndctl/namespace.c so that it can support growing additional commands
that operate on devices (online-memory and offline-memory)
- Refactor the bottom half of daxctl/device.c so we only do the
disabling/offlining steps if the device was enabled.
- Add new commands to online and offline memory sections
associated with a given dax device (Dan)
- Add a new test - daxctl-device.sh - to test daxctl reconfigure-device,
online-memory, and offline-memory commands.
- Add an example in documentation demonstrating how to use numactl
to bind a process to a node surfaced from a dax device (Andy Rudoff)
Changes in v4:
- Don't fail add_dax_dev for kmod failures. Instead fail only when the kmod
list is actually used, i.e. during daxctl-reconfigure-device
Changes in v3:
- In daxctl_dev_get_mode(), remove the subsystem warning, detect dax-class
and simply make it return devdax
Changes in v2:
- Add examples to the documentation page (Dave Hansen)
- Clarify documentation regarding the conversion from system-ram to devdax
- Remove any references to a persistent config from the documentation -
those can be added when the feature is added.
- device.c: validate option compatibility
- daxctl-list: display numa_node for device listings
- daxctl-list: display mode for device listings
- make the options more consistent by adding a '-O' short option
for --attempt-offline
Add a new daxctl-reconfigure-device command that lets us reconfigure DAX
devices back and forth between 'system-ram' and 'device-dax' modes. It
also includes facilities to online any newly hot-plugged memory
(default), and attempt to offline memory before converting away from the
system-ram mode (not default, requires a --attempt-offline option).
Currently missing from this series is a way to persistently store which
devices have been 'marked' for use as system-ram. This depends on a
config system overhaul in ndctl, and patches for those will follow
separately and are independent of this work.
Example invocations:
1. Reconfigure dax0.0 to system-ram mode, don’t online the memory
# daxctl reconfigure-device --mode=system-ram --no-online dax0.0
[
{
"chardev":"dax0.0",
"size":16777216000,
"target_node":2,
"mode":"system-ram"
}
]
2. Reconfigure dax0.0 to devdax mode, attempt to offline the memory
# daxctl reconfigure-device --human --mode=devdax --attempt-offline dax0.0
{
"chardev":"dax0.0",
"size":"15.63 GiB (16.78 GB)",
"target_node":2,
"mode":"devdax"
}
3. Reconfigure all dax devices on region0 to system-ram mode
# daxctl reconfigure-device --mode=system-ram --region=0 all
[
{
"chardev":"dax0.0",
"size":16777216000,
"target_node":2,
"mode":"system-ram"
},
{
"chardev":"dax0.1",
"size":16777216000,
"target_node":3,
"mode":"system-ram"
}
]
These patches can also be found in the 'kmem-pending' branch on github:
https://github.com/pmem/ndctl/tree/kmem-pending
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Vishal Verma (13):
libdaxctl: add interfaces to get ctx and check device state
libdaxctl: add interfaces to enable/disable devices
libdaxctl: add an interface to retrieve the device resource
libdaxctl: add a 'daxctl_memory' object for memory based operations
daxctl/list: add target_node for device listings
libdaxctl: add an interface to get the mode for a dax device
daxctl: add a new reconfigure-device command
Documentation/daxctl: add a man page for daxctl-reconfigure-device
daxctl: add commands to online and offline memory
Documentation: Add man pages for daxctl-{on,off}line-memory
contrib/ndctl: fix region-id completions for daxctl
contrib/ndctl: add bash-completion for the new daxctl commands
test: Add a unit test for daxctl-reconfigure-device and friends
Documentation/daxctl/Makefile.am | 5 +-
.../daxctl/daxctl-offline-memory.txt | 72 ++
Documentation/daxctl/daxctl-online-memory.txt | 80 +++
.../daxctl/daxctl-reconfigure-device.txt | 139 ++++
Makefile.am | 3 +-
contrib/ndctl | 38 +-
daxctl/Makefile.am | 2 +
daxctl/builtin.h | 3 +
daxctl/daxctl.c | 3 +
daxctl/device.c | 484 +++++++++++++
daxctl/lib/Makefile.am | 5 +-
daxctl/lib/libdaxctl-private.h | 30 +
daxctl/lib/libdaxctl.c | 665 ++++++++++++++++++
daxctl/lib/libdaxctl.sym | 19 +
daxctl/libdaxctl.h | 23 +
test/Makefile.am | 3 +-
test/common | 19 +-
test/daxctl-devices.sh | 81 +++
util/iomem.c | 37 +
util/iomem.h | 12 +
util/json.c | 25 +
21 files changed, 1734 insertions(+), 14 deletions(-)
create mode 100644 Documentation/daxctl/daxctl-offline-memory.txt
create mode 100644 Documentation/daxctl/daxctl-online-memory.txt
create mode 100644 Documentation/daxctl/daxctl-reconfigure-device.txt
create mode 100644 daxctl/device.c
create mode 100755 test/daxctl-devices.sh
create mode 100644 util/iomem.c
create mode 100644 util/iomem.h
--
2.20.1
2 years, 10 months
[PATCH] mm/memory-failure: Poison read receives SIGKILL instead of SIGBUS if mmaped more than once
by Jane Chu
Mmap /dev/dax more than once, then read the poison location using an address
from one of the mappings. The other mappings, because they do not have the page
mapped in, will cause SIGKILLs to be delivered to the process. SIGKILL takes
precedence over SIGBUS, so the user process loses the opportunity to handle the UE.
Although one may add MAP_POPULATE to mmap(2) to work around the issue,
MAP_POPULATE makes mapping 128GB of pmem several orders of magnitude slower, so
it isn't always an option.
Details -
ndctl inject-error --block=10 --count=1 namespace6.0
./read_poison -x dax6.0 -o 5120 -m 2
mmaped address 0x7f5bb6600000
mmaped address 0x7f3cf3600000
doing local read at address 0x7f3cf3601400
Killed
Console messages in instrumented kernel -
mce: Uncorrected hardware memory error in user-access at edbe201400
Memory failure: tk->addr = 7f5bb6601000
Memory failure: address edbe201: call dev_pagemap_mapping_shift
dev_pagemap_mapping_shift: page edbe201: no PUD
Memory failure: tk->size_shift == 0
Memory failure: Unable to find user space address edbe201 in read_poison
Memory failure: tk->addr = 7f3cf3601000
Memory failure: address edbe201: call dev_pagemap_mapping_shift
Memory failure: tk->size_shift = 21
Memory failure: 0xedbe201: forcibly killing read_poison:22434 because of failure to unmap corrupted page
=> to deliver SIGKILL
Memory failure: 0xedbe201: Killing read_poison:22434 due to hardware memory corruption
=> to deliver SIGBUS
Signed-off-by: Jane Chu <jane.chu(a)oracle.com>
---
mm/memory-failure.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d9cc660..7038abd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -315,7 +315,6 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
if (*tkc) {
tk = *tkc;
- *tkc = NULL;
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
@@ -331,16 +330,21 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
- * In theory we don't have to kill when the page was
- * munmaped. But it could be also a mremap. Since that's
- * likely very rare kill anyways just out of paranoia, but use
- * a SIGKILL because the error is not contained anymore.
+ * Indeed a page could be mmapped N times within a process. And it's possible
+ * that not all of those N VMAs contain valid mapping for the page. In which
+ * case we don't want to send SIGKILL to the process on behalf of the VMAs
+ * that don't have the valid mapping, because doing so will eclipse the SIGBUS
+ * delivered on behalf of the active VMA.
*/
if (tk->addr == -EFAULT || tk->size_shift == 0) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
- tk->addr_valid = 0;
+ if (tk != *tkc)
+ kfree(tk);
+ return;
}
+ if (tk == *tkc)
+ *tkc = NULL;
get_task_struct(tsk);
tk->tsk = tsk;
list_add_tail(&tk->nd, to_kill);
--
1.8.3.1
2 years, 10 months