On Tue, Feb 7, 2017 at 11:22 PM, Xiong Zhou <xzhou(a)redhat.com> wrote:
On Wed, Feb 08, 2017 at 03:09:07PM +0800, Xiong Zhou wrote:
> On Wed, Feb 08, 2017 at 02:56:51PM +0800, Xiong Zhou wrote:
> > On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
> > > On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou(a)redhat.com> wrote:
> > > > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> > > >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou
<xzhou(a)redhat.com> wrote:
> > > >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> > > >> >> Hi,
> > > >> >>
> > > >> >> At first, I am not sure whether this is an issue.
> > > >> >>
> > > >> >> mmap a file in a DAX mountpoint, open another file
> > > >> >> in a non-DAX mountpoint with O_DIRECT, write the
> > > >> >> mapped area to the other file.
> > > >> >>
> > > >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> > > >> >> This write Fail(Bad address) on nvdimm pmem devices.
> > > >> >> This write Fail(Bad address) on brd based ramdisk.
> > > >> >>
> > > >> >> If we skip the O_DIRECT flag, all tests pass.
> > > >> >>
> > > >> >> If we write from DAX to DAX, all tests pass.
> > > >> >> If we write from non-DAX to DAX, all tests pass.
> > > >> >>
> > > >> > snip..
> > > >> >
> > > >> > After switching to falloc instead of pwrite to initialize the
> > > >> > test files (thanks Ross! :), the write call returned success;
> > > >> > however, the following read back into the mmapped area FAILED
> > > >> > the same way:
> > > >> >
> > > >> > return (Bad address) on raw-mode nvdimm device;
> > > >> > return (Success) on memory-mode nvdimm device;
> > > >> > return (Bad address) on brd based ramdisk.
> > > >> >
> > > >> > Also, this only happens with the O_DIRECT flag on.
> > > >> >
> > > >> > This smells like an issue to me, still looking into why
> > > >> > read can't get that page..
> > > >> >
> > > >>
> > > >> Why does it smell like an issue? Any path that calls
get_user_pages()
> > > >
> > > > Because the write call gets its page and succeeds, while read back
fails.
> > > > __get_user_pages on the same address first passes, then fails.
> > >
> > > Ok, I might have misread your description. Can you tell me the exact
> > > reproduction steps so I can give it a try?
> >
> > Reproducer attached.
> >
Attachment issue..
You need root to run this, assuming your pmem device is /dev/pmem0.
Steps:
sh test.sh /dev/pmem0
Thanks for your time!
----- test.sh --------------------------------------
#!/bin/bash
#
# Driver for the t_mmap_dio reproducer.
#
# Usage: test.sh <dev>
#
# WARNING: <dev> is wiped (wipefs + mkfs.xfs) and mounted with -o dax on
# $MNT. The test program is run twice against freshly fallocated files:
# once with O_DIRECT on the destination file, once with buffered I/O (-b).
[ -z "$1" ] && { echo "$0 <dev>"; exit 1; }
DEV="$1"
MNT=/tbdmnt
SRC="$MNT/ts"		# mmap()ed source file, on the DAX mount
DST=/root/td		# destination file, outside the DAX mount
FSIZE=268435456		# size of each preallocated test file
IOLEN=16777216		# length handed to the test program

# Nothing below is meaningful without the test binary.
cc t_mmap_dio.c || { echo "cc t_mmap_dio.c failed"; exit 1; }

mkdir -p "$MNT"
wipefs -af "$DEV" > /dev/null
#mkfs.xfs -fq -d su=2m,sw=1 "$DEV" && \
# Stop here if the DAX filesystem is not mounted; otherwise the test files
# would silently land in a plain directory and the results would be bogus.
mkfs.xfs -fq "$DEV" && mount -o dax "$DEV" "$MNT" ||
	{ echo "mkfs.xfs/mount -o dax of $DEV failed"; exit 1; }

# Recreate both test files as unwritten (fallocated) extents.
prep_files()
{
	rm -f "$SRC" "$DST"
	#xfs_io -f -c "w 0 $FSIZE" "$SRC" > /dev/null
	#xfs_io -f -c "w 0 $FSIZE" "$DST" > /dev/null
	xfs_io -f -c "falloc 0 $FSIZE" "$SRC" > /dev/null &&
	xfs_io -f -c "falloc 0 $FSIZE" "$DST" > /dev/null
}

# run_test <label> [a.out options...]
# Prepares the files, runs the test program and prints "<label> PASS|FAIL".
run_test()
{
	local label="$1"
	shift
	prep_files || {
		echo "failed to allocate test files"
		umount "$MNT"
		exit 1
	}
	if ./a.out "$@" "$SRC" "$DST" "$IOLEN" "$DEV" ; then
		echo "$label PASS"
	else
		echo "$label FAIL"
	fi
}

run_test "dio"
run_test "buffered IO" -b
umount "$MNT"
--------------------------------------------------------
----- t_mmap_dio.c ----------------------------------
/*
* This programme was originally written by
* Jeff Moyer <jmoyer(a)redhat.com>
*
* Copyright (C) 2016, Red Hat, Inc.
*/
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <libaio.h>
#include <errno.h>
#include <sys/time.h>
/*
 * Print the command-line synopsis to stderr and terminate with status 1.
 *
 * prog: program name substituted into the synopsis.
 */
void usage(char *prog)
{
	/*
	 * Keep the format string on a single source line: a string literal
	 * cannot contain a raw newline, so a wrapped literal does not compile.
	 */
	fprintf(stderr,
		"usage: %s <src file> <dest file> <size> <msg>\n",
		prog);
	exit(1);
}
/*
 * Report a failed operation on stderr and terminate with status 1.
 *
 * op:  label of the operation that failed
 * len: length value printed alongside the failure
 * s:   free-form tag appended to the message
 *
 * The message includes strerror() of the errno value current at the
 * time of the call.
 */
void err_exit(char *op, unsigned long len, char *s)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s(%s) len %lu %s\n", op, reason, len, s);
	exit(1);
}
/*
 * Usage: prog [-b] <src file> <dest file> <size> [<msg>]
 *
 * Maps the first <size> bytes of <src file> with a shared read/write
 * mmap(), then
 *   1. write()s the mapped area to <dest file>, fsync()s it and seeks
 *      back to offset 0;
 *   2. read()s <dest file> back into the mapped area and msync()s it.
 *
 * <dest file> is opened with O_DIRECT unless -b is given. <msg> is an
 * optional tag appended to every error message. Any failure, including a
 * short write or read, terminates the program with status 1 via
 * err_exit(); success exits with status 0.
 */
int main(int argc, char **argv)
{
	int fd, fd2, opt, dio = 1;
	int dflags = O_RDWR;
	ssize_t ret;
	char *map;
	char *msg;
	char *sfile;
	char *dfile;
	char *end;
	unsigned long len;

	/* getopt() returns int; anything other than -b is a usage error. */
	while ((opt = getopt(argc, argv, "b")) != -1) {
		if (opt != 'b')
			usage(basename(argv[0]));
		dio = 0;
	}

	/*
	 * Count positional arguments only after option parsing, so that
	 * argv[optind + 2] is known to exist whether or not -b was given.
	 */
	if (argc - optind < 3)
		usage(basename(argv[0]));
	sfile = argv[optind];
	dfile = argv[optind + 1];
	/* msg is optional: never hand a NULL pointer to "%s". */
	msg = (argc - optind > 3) ? argv[optind + 3] : "";

	/* strtoul() reports overflow only through errno, so clear it first. */
	errno = 0;
	len = strtoul(argv[optind + 2], &end, 10);
	if (errno == ERANGE)
		err_exit("strtoul", 0, msg);
	if (end == argv[optind + 2] || *end != '\0')
		usage(basename(argv[0]));

	/* Open source file and mmap */
	fd = open(sfile, O_RDWR, 0644);
	if (fd < 0)
		err_exit("open src", len, msg);
	map = (char *)mmap(NULL, len,
			   PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err_exit("mmap", len, msg);

	/* Open dest file, with O_DIRECT unless -b was given */
	if (dio == 1)
		dflags |= O_DIRECT;
	fd2 = open(dfile, dflags, 0644);
	if (fd2 < 0)
		err_exit("open dest", len, msg);

	/*
	 * First, test storing to dest file from source mapping.
	 * ret is ssize_t so that large transfer counts are not truncated
	 * before being compared with len.
	 */
	ret = write(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("write", len, msg);
	if (fsync(fd2) != 0)
		err_exit("fsync", len, msg);
	if (lseek(fd2, 0, SEEK_SET) == (off_t)-1)
		err_exit("lseek", len, msg);

	/* Next, test reading from dest file into source mapping */
	ret = read(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("read", len, msg);
	if (msync(map, len, MS_SYNC) < 0)
		err_exit("msync", len, msg);
	if (munmap(map, len) < 0)
		err_exit("munmap", len, msg);
	if (close(fd) < 0)
		err_exit("close fd", len, msg);
	if (close(fd2) < 0)
		err_exit("close fd2", len, msg);
	exit(0);
}
----------------------------------------------
----- my log -------------
Thanks for the reproducer!
sh-4.2# uname -r
4.10.0-rc7-master-f7d6040+
sh-4.2# whoami
root
sh-4.2# pwd
/root
sh-4.2# sh test.sh /dev/pmem0
dio PASS
buffered IO PASS
sh-4.2# sh test.sh /dev/pmem2
read(Bad address) len 16777216 /dev/pmem2
dio FAIL
buffered IO PASS
This is expected. In the raw case we can't do the direct-I/O access to
read() into the buffer since there's no page. The reason the write()
from the buffer succeeds is because the extent is unwritten, so the
filesystem uses the zero page.
This is why the:
xfs_io -f -c 'w 0 268435456' /tbdmnt/ts
...setup fails at the write(), while the:
xfs_io -f -c 'falloc 0 268435456' /tbdmnt/ts
...setup fails later at the read() when the test switches from hitting
the zero page to trying to lookup a "dax" page.
sh-4.2# modprobe brd rd_size=$((1*1024*1024))
sh-4.2# sh test.sh /dev/ram0
read(Bad address) len 16777216 /dev/ram0
dio FAIL
This fails because dax on /dev/ramX does not support direct-I/O. The
write() works for the same "zero-page" reason above, but the read()
fails because the pte entry for the mapping is marked pte_special()
and we don't have a ->find_special_page() in the vm_ops to go from pte
back to the page that the brd driver is using. I don't think this is
a problem worth solving since brd is more of a test vehicle than a
production driver.