NFS problems


Rudd-O

Jul 11, 2010, 11:10:39 PM
to zfs-fuse
When I use Nautilus to drag and drop a folder or file into another
folder that is served via NFSv4, and the served folder is on a ZFS-FUSE
partition, the file copy fails instantly with Permission denied
(EACCES). When I inspect the destination folder, there IS a file
there, but its permissions are 000.

Nothing like this happens when dragging and dropping a folder into an
ext3 filesystem shared over NFSv4; the copy succeeds without a hitch.

What is going on?

Rudd-O

Jul 12, 2010, 2:09:00 AM
to zfs-fuse
Sorry for replying to myself. You can test this theory yourself. Export
your ZFS file system over NFSv4, then do this:

- on the server
cd /path/to/zfs/mount
umask 777
nano testfile
<type some text, save file>
ls -la testfile
<should give you a file with permissions 000, testfile, with the
contents of what you typed>

- on the client
cd /path/to/nfs/mounted/zfs/volume
umask 777
nano testfile
<type some text, save file>
<this will fail>
ls -la testfile
<will give you a file with permissions 000, but with ZERO BYTES>

Basically, when a file is created with a mode that is not writable by
its owner (which Nautilus, for example, does), the file does get
created, but the first write() to the still-open file descriptor is
denied by the NFS server.

WHY?
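
For reference, the same failure can be reproduced with a tiny C program
that follows the same pattern -- create a file whose mode forbids
writing, then write through the still-open descriptor. This is only an
illustrative sketch (the path is a placeholder for an NFS-mounted ZFS
directory), not code taken from zfs-fuse or the NFS client:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Create a brand-new file whose mode bits forbid writing, the way
       umask 777 or Nautilus' temporary modes do.  On a local filesystem
       the creating descriptor is still writable; over NFSv4 on zfs-fuse
       the write (or the creating open itself) is reported to fail with
       Permission denied. */
    int fd = open("/path/to/nfs/mounted/zfs/testfile",
                  O_WRONLY | O_CREAT | O_EXCL, 0000);
    if (fd < 0) {
        fprintf(stderr, "open: %s\n", strerror(errno));
        return 1;
    }
    if (write(fd, "hello\n", 6) < 0)
        fprintf(stderr, "write: %s\n", strerror(errno));
    close(fd);
    return 0;
}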

Rudd-O

Jul 12, 2010, 3:27:47 AM
to zfs-fuse
As usual, more information:

---------------------------------------

strace cp Shared\ disk-s001.vmdk /var/shared/tmp2/   # <- ZFS volume exported via NFS is mounted here
execve("/bin/cp", ["cp", "Shared disk-s001.vmdk", "/var/shared/
tmp2/"], [/* 54 vars */]) = 0
brk(0) = 0xe84000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f92843a3000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or
directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=199790, ...}) = 0
mmap(NULL, 199790, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f9284372000
close(3) = 0
open("/lib64/libselinux.so.1", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0PU@=>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=120480, ...}) = 0
mmap(0x3e3d400000, 2217736, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e3d400000
mprotect(0x3e3d41c000, 2093056, PROT_NONE) = 0
mmap(0x3e3d61b000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x1b000) = 0x3e3d61b000
mmap(0x3e3d61d000, 1800, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_ANONYMOUS, -1, 0) = 0x3e3d61d000
close(3) = 0
open("/lib64/librt.so.1", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0@!\300<>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=47064, ...}) = 0
mmap(0x3e3cc00000, 2128816, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e3cc00000
mprotect(0x3e3cc07000, 2093056, PROT_NONE) = 0
mmap(0x3e3ce06000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x6000) = 0x3e3ce06000
close(3) = 0
open("/lib64/libacl.so.1", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000\35\200L>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=34336, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f9284371000
mmap(0x3e4c800000, 2126992, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e4c800000
mprotect(0x3e4c807000, 2097152, PROT_NONE) = 0
mmap(0x3e4ca07000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x7000) = 0x3e4ca07000
close(3) = 0
open("/lib64/libattr.so.1", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P\23@J>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=20312, ...}) = 0
mmap(0x3e4a400000, 2113104, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e4a400000
mprotect(0x3e4a404000, 2093056, PROT_NONE) = 0
mmap(0x3e4a603000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x3000) = 0x3e4a603000
close(3) = 0
open("/lib64/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\355\201;>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1838312, ...}) = 0
mmap(0x3e3b800000, 3664040, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e3b800000
mprotect(0x3e3b975000, 2097152, PROT_NONE) = 0
mmap(0x3e3bb75000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x175000) = 0x3e3bb75000
mmap(0x3e3bb7a000, 18600, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_ANONYMOUS, -1, 0) = 0x3e3bb7a000
close(3) = 0
open("/lib64/libdl.so.2", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\r\0<>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=22536, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f9284370000
mmap(0x3e3c000000, 2109696, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e3c000000
mprotect(0x3e3c002000, 2097152, PROT_NONE) = 0
mmap(0x3e3c202000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x2000) = 0x3e3c202000
close(3) = 0
open("/lib64/libpthread.so.0", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20\\@<>
\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=141592, ...}) = 0
mmap(0x3e3c400000, 2208672, PROT_READ|PROT_EXEC, MAP_PRIVATE|
MAP_DENYWRITE, 3, 0) = 0x3e3c400000
mprotect(0x3e3c417000, 2093056, PROT_NONE) = 0
mmap(0x3e3c616000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_DENYWRITE, 3, 0x16000) = 0x3e3c616000
mmap(0x3e3c618000, 13216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|
MAP_ANONYMOUS, -1, 0) = 0x3e3c618000
close(3) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f928436f000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f928436d000
arch_prctl(ARCH_SET_FS, 0x7f928436d7a0) = 0
mprotect(0x3e3d61b000, 4096, PROT_READ) = 0
mprotect(0x3e3ce06000, 4096, PROT_READ) = 0
mprotect(0x3e3bb75000, 16384, PROT_READ) = 0
mprotect(0x3e3c202000, 4096, PROT_READ) = 0
mprotect(0x3e3b61e000, 4096, PROT_READ) = 0
mprotect(0x3e3c616000, 4096, PROT_READ) = 0
munmap(0x7f9284372000, 199790) = 0
set_tid_address(0x7f928436da70) = 9830
set_robust_list(0x7f928436da80, 0x18) = 0
futex(0x7fff370eb04c, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x7fff370eb04c, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME,
1, NULL, 7f928436d7a0) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigaction(SIGRTMIN, {0x3e3c405a90, [], SA_RESTORER|SA_SIGINFO,
0x3e3c40f440}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x3e3c405b20, [], SA_RESTORER|SA_RESTART|
SA_SIGINFO, 0x3e3c40f440}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY})
= 0
statfs("/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0,
f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0},
f_namelen=255, f_frsize=4096}) = 0
brk(0) = 0xe84000
brk(0xea5000) = 0xea5000
open("/usr/lib/locale/locale-archive", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=99158752, ...}) = 0
mmap(NULL, 99158752, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f927e4dc000
close(3) = 0
gettid() = 9830
open("/proc/self/task/9830/attr/current", O_RDONLY) = 3
read(3, "kernel\0", 4095) = 7
close(3) = 0
geteuid() = 501
stat("/var/shared/tmp2/", {st_mode=S_IFDIR|0755, st_size=2, ...}) = 0
stat("Shared disk-s001.vmdk", {st_mode=S_IFREG|0600,
st_size=372834304, ...}) = 0
stat("/var/shared/tmp2/Shared disk-s001.vmdk", 0x7fff370eaba0) = -1
ENOENT (No such file or directory)
open("Shared disk-s001.vmdk", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0600, st_size=372834304, ...}) = 0
open("/var/shared/tmp2/Shared disk-s001.vmdk", O_WRONLY|O_CREAT|
O_EXCL, 0600) = -1 EACCES (Permission denied)
open("/usr/share/locale/locale.alias", O_RDONLY) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=2512, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x7f92843a2000
read(4, "# Locale name alias data base.\n#"..., 4096) = 2512
read(4, "", 4096) = 0
close(4) = 0
munmap(0x7f92843a2000, 4096) = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo",
O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo",
O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) =
-1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY)
= -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) =
-1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=435, ...}) = 0
mmap(NULL, 435, PROT_READ, MAP_PRIVATE, 4, 0) = 0x7f92843a2000
close(4) = 0
write(2, "cp: ", 4cp: ) = 4
write(2, "cannot create regular file `/var"..., 67cannot create
regular file `/var/shared/tmp2/Shared disk-s001.vmdk') = 67
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) =
-1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) =
-1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
write(2, ": Permission denied", 19: Permission denied) = 19
write(2, "\n", 1
) = 1
close(3) = 0
close(0) = 0
close(1) = 0
close(2) = 0
exit_group(1) = ?

Rudd-O

Jul 12, 2010, 3:29:07 AM
to zfs-fuse
And more:

http://osdir.com/ml/linux.nfsv4/2006-11/msg00148.html

Rudd-O

Jul 12, 2010, 3:33:08 AM
to zfs-fuse
And more:

IT DOES NOT HAPPEN WITH NFSv3!!!

Only with NFSv4.

sgheeren

Jul 12, 2010, 5:34:59 AM
to zfs-...@googlegroups.com
On 07/12/2010 09:33 AM, Rudd-O wrote:
> And more:
>
> IT DOES NOT HAPPEN WITH NFS V3!!!
>
> only with nfs4
>
>
I can only imagine that the options used are not identical (they can't
be, since otherwise you wouldn't have both NFSv3 and NFSv4, AFAICT).

Could you at least post the export options from /var/lib/nfs/etab?

That would seem like the most obvious cause of trouble. Also,

(a) did you use the exact same options when testing ext[234]?
(b) what happens when you test with another fuse-fs? [i.e. is this a
fuse issue?]
(c) what happens if you add/remove the
-o default_permissions
-o debug
fuse mount options?

(HINT: -o debug results in libfuse debug output when in foreground)

Rudd-O

Jul 12, 2010, 5:52:02 AM
to zfs-fuse
I cannot restart my ZFS server -- someone else is using it. But that
does not matter -- NFSv4 gives me this problem, and NFSv3 just results
in a whole lot of ESTALE errors, which I presume is because of this:

http://marc.info/?l=linux-kernel&m=110538645308163&w=2

(I suspect that as soon as the FUSE module forgets about an inode
associated to a file in the ZFS file system, ESTALE is returned to the
client when it tries to access the file handle.)

NFS is just not production-quality with FUSE, I guess.

etab has lots of lines like this one:

/export/shared/Entertainment/Music
10.254.102.0/24(ro,sync,wdelay,nohide,nocrossmnt,insecure,no_root_squash,no_all_squash,no_subtree_check,secure_locks,acl,fsid=5,mountpoint,anonuid=65534,anongid=65534)
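
For comparison, the /etc/exports line behind an etab entry like that
would look roughly as follows. This is an illustrative sketch (the path
and network are taken from the entry above, the option list is trimmed);
note the explicit fsid=N, which exports of FUSE filesystems need:

/export/shared/Entertainment/Music  10.254.102.0/24(ro,sync,insecure,no_root_squash,no_subtree_check,fsid=5)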

Rudd-O

Jul 12, 2010, 5:59:57 AM
to zfs-fuse
This is what FUSE's README.NFS says:

----------------------

NFS exporting is supported in Linux kernels 2.6.27 or later.

You need to add an fsid=NNN option to /etc/exports to make exporting a
FUSE directory work.

Filesystem support
------------------

NFS exporting works to some extent on all fuse filesystems, but not
perfectly. This is due to the stateless nature of the protocol, the
server has no way of knowing whether the client is keeping a reference
to a file or not, and hence that file may be removed from the server's
cache. In that case there has to be a way to look up that object
using the inode number, otherwise an ESTALE error will be returned.

1) low-level interface

Filesystems need to implement special lookups for the names "." and
"..". The former may be requested on any inode, including
non-directories, while the latter is only requested for directories.
Otherwise these special lookups should behave identically to ordinary
lookups.

2) high-level interface

Because the high-level interface is path based, it is not possible to
delegate looking up by inode to the filesystem.

To work around this, currently a "noforget" option is provided, which
makes the library remember nodes forever. This will make the NFS
server happy, but also results in an ever growing memory footprint for
the filesystem. For this reason if the filesystem is large (or the
memory is small), then this option is not recommended.

-------------------------

I do NOT believe that we implement looking up "." on files, nor do I
believe that we implement a way to look up by inode. Do we? Because
if we don't, that means we do not support NFS at all.
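
To illustrate what the README is asking for: in the low-level libfuse
interface, the NFS export path ends up calling the filesystem's lookup
operation with the special name ".", and the filesystem is expected to
resolve that to the inode itself. A hedged sketch of such a handler (not
zfs-fuse's actual code; fill_attr_for_inode is a hypothetical helper):

#define FUSE_USE_VERSION 26
#include <fuse_lowlevel.h>
#include <sys/stat.h>
#include <errno.h>
#include <string.h>

/* hypothetical helper: fill a struct stat for the given inode number */
extern int fill_attr_for_inode(fuse_ino_t ino, struct stat *st);

static void my_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
{
    struct fuse_entry_param e;
    memset(&e, 0, sizeof(e));

    if (strcmp(name, ".") == 0) {
        /* "." must resolve to the "parent" inode itself -- even when that
           inode is a regular file -- so that a bare inode number can be
           turned back into a node after the kernel has forgotten it. */
        e.ino = parent;
        if (fill_attr_for_inode(parent, &e.attr) != 0) {
            fuse_reply_err(req, ENOENT);
            return;
        }
        fuse_reply_entry(req, &e);
        return;
    }

    /* ... ordinary lookup of `name` inside `parent` goes here ... */
    fuse_reply_err(req, ENOENT);
}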

sgheeren

Jul 12, 2010, 6:10:05 AM
to zfs-...@googlegroups.com
On 07/12/2010 11:59 AM, Rudd-O wrote:
> NFS exporting works to some extent on all fuse filesystems, but not
> perfectly.  This is due to the stateless nature of the protocol, the
> server has no way of knowing whether the client is keeping a reference
> to a file or not, and hence that file may be removed from the server's
> cache.  In that case there has to be a way to look up that object
> using the inode number, otherwise an ESTALE error will be returned.
I venture the guess that this might be remedied by using maint[1], or by passing --disable-page-cache (which does not disable the page cache, but disables FUSE caching of some sort at zfsfuse_opencreate -- you should recognize this as the keep_cache flag[2]).

[1] http://gitweb.zfs-fuse.net/?p=official;a=snapshot;h=maint;sf=tgz
or via http://zfs-fuse.net/releases/0.6.9

[2] our benchmarks were not able to confirm any (significant) loss of performance using bonnie tests, see http://zfs-fuse.net/issues/65 and git log f138e5b

sgheeren

Jul 12, 2010, 6:12:46 AM
to zfs-...@googlegroups.com
On 07/12/2010 11:59 AM, Rudd-O wrote:
> I do NOT believe that we implement looking up "." on files
I don't have a clue what they/you mean by that.

Which call, which params and what supposed return value(s)?


PS. congratulations on Spain's victory in the world cup!

Rudd-O

Jul 12, 2010, 6:39:28 AM
to zfs-fuse
Thanks, but I was rooting for Germany :-( Damn octopus.

Rudd-O

Jul 12, 2010, 6:41:15 AM
to zfs-fuse
I may test --disable-page-cache, but not right now, as the ZFS server
is currently being used through SAMBA.


Rudd-O

Jul 28, 2010, 7:41:06 PM
to zfs-fuse
BTW, --disable-page-cache DOES disable the page cache, dude.
zfs-fuse/zfs-operations.c, line 900:

fi->keep_cache = page_cache;

This means that the kernel will NOT CACHE the contents of the blocks
of files read from any ZFS file system, AT ALL. Of course, if you use
bonnie tests you will not see much performance loss, because bonnie is
designed by default to smash the cache so as to minimize the effects
of caching on the test results. BUT if you pay close attention to the
sound of your disks, every time you read a large file (one that would
fit in the page cache) you will hear heavy disk activity when
--disable-page-cache is specified.
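
For reference, keep_cache is a flag on struct fuse_file_info that the
filesystem sets from its open/create handler, and what it controls is
whether the kernel keeps pages it already cached for the file when the
file is opened again. A minimal sketch of a low-level open handler (not
zfs-fuse's actual zfsfuse_opencreate):

#define FUSE_USE_VERSION 26
#include <fuse_lowlevel.h>

static void my_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
    (void) ino;
    /* keep_cache = 1: the kernel may reuse pages cached by earlier opens.
       keep_cache = 0: cached pages for this file are invalidated on every
       open, so re-reading the file goes back to the filesystem. */
    fi->keep_cache = 1;   /* zfs-fuse sets this from its page_cache setting */
    fuse_reply_open(req, fi);
}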

Plus I don't think that disable-page-cache will fix the NFS problems
at all. Perhaps -o default_permissions would change things, but I
cannot afford to test this theory now.

Ah, I did not mention it, but CIFS mounts sort of work (apart from the
brokenness I already shared about inodes). However, and this is the
big however, it's SLOOOOW over high-latency links, as directory
entries and inode info are not cached -- only file data is. So even ls
on a remotely CIFS-mounted directory of a ZFS-FUSE file system is a
pain.


Rudd-O

Jul 29, 2010, 12:32:46 AM
to zfs-fuse
Great news! I have managed to export ZFS filesystems STABLY and
WITHOUT the kernel errors by starting unfs3's unfsd
(http://unfs3.sourceforge.net/) rather than knfsd. Problem = SOLVED.
Performance is likely not that great, but M E H, this is better than
SAMBA (the NFS client caches readdirs) and knfsd (the lack of
persistent inodes sucks big time).
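
For anyone who wants to try the same workaround, the rough shape of it
is below. Treat this as a sketch: service names vary by distro, and
unfsd's defaults (it is unfs3's user-space NFSv3 server and, as far as I
know, reads /etc/exports by default) should be checked against its man
page.

# on the server: stop the kernel NFS server and run unfs3's unfsd instead
service nfs stop        # or however knfsd is stopped on your distro
unfsd

# on the client: mount the export as plain NFSv3
mount -t nfs -o vers=3 server:/export/shared /mnt/shared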


Emmanuel Anne

Jul 30, 2010, 3:04:22 PM
to zfs-...@googlegroups.com
Sorry for the very late reply to this. I can just confirm that everything works fine with knfsd v3 (never tested with v4).
It's probably a fuse bug related to nfs v4 then.


Manuel Amador (Rudd-O)

Aug 1, 2010, 1:02:56 AM
to zfs-...@googlegroups.com
You didn't confirm it properly.  ESTALE is a situation that only happens sporadically (basically when the kernel drops the cached metadata for the inode).  If you want to confirm this, what you have to do is take a really, really large music collection, 150 GB or so, export it through NFS and mount it on another machine.  Then scan your collection using Amarok.  Enable auditd with this rule:

-a exit,always -F arch=b64 -F success=0 -F exit=-116 -S all

Now check the audit log while you do the scan.  You will see LOTS of ESTALE errors.
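
Concretely, the whole procedure looks like this (the mount point is a
placeholder; the rule matches exit=-116 because ESTALE is errno 116 on
Linux):

# on the machine doing the scanning (the NFS client): load the rule
auditctl -a exit,always -F arch=b64 -F success=0 -F exit=-116 -S all

# walk the whole collection to force lots of lookups
find /path/to/nfs/mounted/zfs > /dev/null

# then look for the failing syscalls in the audit log
ausearch -m SYSCALL -ts recent | grep 'exit=-116'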

ZFSRC details:

fuse-attr-timeout = 3600
fuse-entry-timeout = 3600

Emmanuel Anne

Aug 1, 2010, 3:51:34 AM
to zfs-...@googlegroups.com
Maybe hitting the 32-bit limit on inode numbers for NFSv3 would be a problem? Normally it's handled -- the high part of the 64-bit value is used as the generation -- and it would be very hard to reproduce (2^32 is a very big number!).


Emmanuel Anne

Aug 1, 2010, 3:48:16 AM
to zfs-...@googlegroups.com
Well, isn't there an easier way than building a 150 GB MP3 collection? Mine must be 10 times smaller for now, or even more than 10 times smaller!
I regularly use NFSv3 on ZFS, but only for reading -- no writes at all -- and I have never noticed any ESTALE. It wouldn't make sense anyway: FUSE uses the inode numbers directly from ZFS, so it can't forget them. The only problem is when there is more than one filesystem in a subtree; in that case the fsid parameter becomes mandatory to isolate them.
Anyway, I'll experiment with this auditd thing (tonight!), but I don't think it will find anything unusual.


Manuel Amador (Rudd-O)

Aug 1, 2010, 8:04:32 AM
to zfs-...@googlegroups.com
Dunno about that.  I will try to reproduce the problem again and isolate the set of circumstances under which it is a problem.

Manuel Amador (Rudd-O)

Aug 1, 2010, 8:06:14 AM
to zfs-...@googlegroups.com
NFS doesn't work like that.  The inode is irrelevant.  What matters is that the client's filehandle is mapped to the correct file in the served file system.  When the filehandle becomes invalid (e.g. the file is deleted on the server without the client's knowledge), ESTALE is returned.  If the file is "deleted" and then "reappears" in ZFS from the standpoint of the kernel NFS server, then ESTALE is the logical consequence.

Rudd-O

Aug 4, 2010, 4:33:16 AM
to zfs-fuse
New information.

Server: vanilla 2.6.35 kernel
Client: latest Fedora kernel

NFSv4:

- creation of a file using cp or any other tool that opens files with
O_EXCL causes the file to be created with mode 000, and the app to get
a Permission Denied reply from the creat() syscall
- Problem exclusive to ZFS-FUSE and other FUSE filesystems exported
via NFS -- above problem is not reproducible in tmpfs and ext4
filesystems exported via NFS

NFSv3:

- stale file handles, a LOT of them. Run find /mountpoint or use
something like amarokcollectionscanner on a big disk mounted via
NFSv3, enable auditd with a rule to trap return status -116, and
check the audit log; you will see massive amounts of ESTALE errors.
- Problem exclusive to ZFS-FUSE and other FUSE filesystems exported
via NFS -- above problem is not reproducible in tmpfs and ext4
filesystems exported via NFS

NFSv2:

- same problems as NFSv3
- additional limitations: maxfilesize 2G

SAMBA:

- inodes are reused for mountpoints in ZFS file systems that contain
child ZFS file systems mounted atop them. This will inevitably
lead to impossible-to-diagnose data loss scenarios.

Changing mount options (proto=tcp, proto=udp, whatnot) or ZFS-FUSE
options (FUSE mount options, FUSE entry timeout / attr timeout,
disable page cache) M A K E S N O D I F F E R E N C E.

UNFSD

- works.
- INCREDIBLY SLOW.

-------------------------------

Conclusion: if you value your data, do not use any of the above
mentioned mechanisms to share ZFS-FUSE volumes.

Emmanuel Anne

Aug 4, 2010, 5:23:30 AM
to zfs-...@googlegroups.com
You could have just said "nothing changes in 2.6.35".
Well, I haven't had time yet to check all this, but as I said, I keep
using NFSv3 without problems, so I strongly suspect the problem is
related to the size of the exported partition (and probably, more
precisely, to the maximum inode numbers).

More on that when I have more time...


Rudd-O

Aug 4, 2010, 5:40:53 AM
to zfs-fuse
I do not have two billion files, so there is no way this is caused by
inode numbers. This may simply be the same problem in nfsd for v2, v3
and v4 manifesting itself differently.

If the explicit mount option serverino is given to CIFS mounts of ZFS
filesystems exported using SAMBA, the vanishing-files problem stops.
This is GOOD.
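
For reference, a CIFS mount with that option spelled out looks roughly
like this (server, share, mount point and user are placeholders):

mount -t cifs //server/share /mnt/share -o serverino,username=someuser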

Rudd-O

Aug 4, 2010, 5:41:52 AM
to zfs-fuse
BTW, I am pretty sure you also have problems; you just haven't noticed
them, because apps aren't exactly going to pop up error messages for
the infrequent ESTALE. Likely they just silently lose data and you
haven't found out yet, just as happened to me.


sgheeren

Aug 4, 2010, 5:53:14 AM
to zfs-...@googlegroups.com
On 08/04/2010 11:41 AM, Rudd-O wrote:
> Btw I am pretty sure you also have problems, you just haven't noticed
> them because apps aren't exactly going to pop up error messages for
> the infrequent ESTALE. Likely they just silently lose data and you
> haven't found out yet, just as it happened to me.
>

Please tell me what happened to you (NFS only; Samba is not supported
anyway). The way I see it,

* NFSv4 has usability issues, not integrity issues
* NFSv3 might have integrity issues, but I'm not sure I understand. A
real-life example would help.

If you say you found out about silent data loss, could you provide a
summary of what happened and how you found out?

Once we can explain this, I will put up a notice on the site.

Seth

Manuel Amador (Rudd-O)

Aug 4, 2010, 6:49:16 PM
to zfs-...@googlegroups.com, sgheeren
My conclusions:

Bottom line, if you want to share ZFS file systems:

- upgrade to the latest 2.6.35 vanilla kernel
- use either UNFSD or SAMBA on the server
- on SAMBA clients, use the explicit mount option serverino, or use Windows

Ironically, SAMBA works, which means that SAMBA ought to be supported, and NFS
ought NOT to be supported.

----------------------------------

This is my report:

----------------------------------

Server: vanilla 2.6.35 kernel
Client: vanilla 2.6.35 kernel



NFSv4:

- creation of a file using cp or any other tool that opens files with
O_EXCL causes the file to be created with mode 000, and the app to get
a Permission Denied reply from the creat() syscall
- Problem exclusive to ZFS-FUSE and other FUSE filesystems exported
via NFS -- above problem is not reproducible in tmpfs and ext4
filesystems exported via NFS

NFSv3:

- stale file handles, a LOT of them. Run find /mountpoint or use
something like amarokcollectionscanner on a big disk mounted via
NFSv3, enable auditd with a rule to trap return status -116, and
check the audit log; you will see massive amounts of ESTALE errors.
- Problem exclusive to ZFS-FUSE and other FUSE filesystems exported
via NFS -- above problem is not reproducible in tmpfs and ext4
filesystems exported via NFS

NFSv2:

- same problems as NFSv3
- additional limitations: maxfilesize 2G

SAMBA:

- inodes are reused for mountpoints in ZFS file systems that contain
child ZFS file systems mounted atop them. This will inevitably
lead to impossible-to-diagnose data loss scenarios. One such scenario is
easy to verify: run find /mountpoint once, then run it again, and you will see
that the two finds return different lists of files.
- the only way to make SAMBA work is with the explicit serverino mount option,
and this has only been tested with kernel 2.6.35 on both client and server

UNFSD

- works.
- INCREDIBLY SLOW.


Changing mount options (proto=tcp, proto=udp, whatnot) or ZFS-FUSE
options (FUSE mount options, FUSE entry timeout / attr timeout,
disable page cache) M A K E S N O D I F F E R E N C E.

------------------------------------

Emmanuel Anne

Aug 5, 2010, 3:17:49 PM
to zfs-...@googlegroups.com
OK, I finally took the time to test all this:
1) auditd with my SMALL collection: no problem at all over NFSv3.
2) I even took the trouble to test 64-bit inode numbers over NFS using FUSE, by modifying hello_ll.c so that the only file has inode number 0x100000000. It works perfectly well, locally or over NFS, even when using a 32-bit OS to connect to the NFS share!

So, no problem at all for me; the problem is impossible to reproduce here.
If you really want to investigate this seriously, you might want to use some of the simple FUSE example source code for your tests.


Rudd-O

Aug 6, 2010, 1:55:24 AM
to zfs-fuse
Posted the bug here:

https://bugzilla.redhat.com/show_bug.cgi?id=621774

But my kernel is actually the vanilla 2.6.35 one.

Do not limit yourself to testing FUSE. It appears it might be a bug in
zfs-fuse itself. Do you have a big partition with many files that you
can mount on a client using NFS? Do that. Then run
find /mountpoint > /tmp/log1 2>&1 and look for errors in the log1 file.

What distro / kernel version are you using?

Emmanuel Anne

Aug 6, 2010, 5:11:44 AM
to zfs-...@googlegroups.com
Nothing particular here, 2.6.34.1 or 2.6.32.x, standard kernel nfs server v3.
See my thread about auditd and zfs-fuse also.
