For details on the manager's policy, please see
fs/virtiofs/virtiofs_dax.hh.
Signed-off-by: Fotis Xenakis <
fo...@windowslive.com>
---
Makefile | 39 +++---
fs/virtiofs/virtiofs_dax.cc | 268 ++++++++++++++++++++++++++++++++++++
fs/virtiofs/virtiofs_dax.hh | 109 +++++++++++++++
3 files changed, 397 insertions(+), 19 deletions(-)
create mode 100644 fs/virtiofs/virtiofs_dax.cc
create mode 100644 fs/virtiofs/virtiofs_dax.hh
diff --git a/Makefile b/Makefile
index 20ddf3b1..12366794 100644
--- a/Makefile
+++ b/Makefile
@@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o
bsd += bsd/porting/pcpu.o
bsd += bsd/porting/bus_dma.o
bsd += bsd/porting/kobj.o
-bsd += bsd/sys/netinet/if_ether.o
-bsd += bsd/sys/compat/linux/linux_socket.o
-bsd += bsd/sys/compat/linux/linux_ioctl.o
-bsd += bsd/sys/net/if_ethersubr.o
-bsd += bsd/sys/net/if_llatbl.o
-bsd += bsd/sys/net/radix.o
-bsd += bsd/sys/net/route.o
-bsd += bsd/sys/net/raw_cb.o
-bsd += bsd/sys/net/raw_usrreq.o
-bsd += bsd/sys/net/rtsock.o
-bsd += bsd/sys/net/netisr.o
-bsd += bsd/sys/net/netisr1.o
-bsd += bsd/sys/net/if_dead.o
-bsd += bsd/sys/net/if_clone.o
-bsd += bsd/sys/net/if_loop.o
-bsd += bsd/sys/net/if.o
-bsd += bsd/sys/net/pfil.o
+bsd += bsd/sys/netinet/if_ether.o
+bsd += bsd/sys/compat/linux/linux_socket.o
+bsd += bsd/sys/compat/linux/linux_ioctl.o
+bsd += bsd/sys/net/if_ethersubr.o
+bsd += bsd/sys/net/if_llatbl.o
+bsd += bsd/sys/net/radix.o
+bsd += bsd/sys/net/route.o
+bsd += bsd/sys/net/raw_cb.o
+bsd += bsd/sys/net/raw_usrreq.o
+bsd += bsd/sys/net/rtsock.o
+bsd += bsd/sys/net/netisr.o
+bsd += bsd/sys/net/netisr1.o
+bsd += bsd/sys/net/if_dead.o
+bsd += bsd/sys/net/if_clone.o
+bsd += bsd/sys/net/if_loop.o
+bsd += bsd/sys/net/if.o
+bsd += bsd/sys/net/pfil.o
bsd += bsd/sys/net/routecache.o
bsd += bsd/sys/netinet/in.o
bsd += bsd/sys/netinet/in_pcb.o
@@ -1769,7 +1769,8 @@ fs_objs += rofs/rofs_vfsops.o \
rofs/rofs_common.o
fs_objs += virtiofs/virtiofs_vfsops.o \
- virtiofs/virtiofs_vnops.o
+ virtiofs/virtiofs_vnops.o \
+ virtiofs/virtiofs_dax.o
fs_objs += pseudofs/pseudofs.o
fs_objs += procfs/procfs_vnops.o
@@ -1976,7 +1977,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri
define libuutil-includes
bsd/cddl/contrib/opensolaris/lib/libuutil/common
- bsd/cddl/compat/opensolaris/include
+ bsd/cddl/compat/opensolaris/include
bsd/sys/cddl/contrib/opensolaris/uts/common
bsd/sys/cddl/compat/opensolaris
bsd/cddl/contrib/opensolaris/head
diff --git a/fs/virtiofs/virtiofs_dax.cc b/fs/virtiofs/virtiofs_dax.cc
new file mode 100644
index 00000000..8e612eb5
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <algorithm>
+#include <mutex>
+
+#include <osv/debug.h>
+#include <osv/uio.h>
+
+#include "fuse_kernel.h"
+#include "virtiofs.hh"
+#include "virtiofs_dax.hh"
+#include "virtiofs_i.hh"
+
+namespace virtiofs {
+
+int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+    struct uio& uio, bool aggressive)
+{
+    std::lock_guard<mutex> guard {_lock};
+
+    // Position of the read, split into a file chunk and an offset within it
+    const chunk first_chunk = uio.uio_offset / _chunk_size;
+    const off_t in_chunk = uio.uio_offset % _chunk_size;
+    // Part of the window backing the file from first_chunk onwards, either
+    // found among the existing mappings or freshly mapped below
+    mapping_part mp;
+    if (find(inode.nodeid, first_chunk, mp)) {
+        // Requested data (at least its beginning) is already mapped
+        auto cached_len = std::min<size_t>(read_amt,
+            (mp.nchunks * _chunk_size) - in_chunk);
+        virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, "
+            "moffset=%lld)\n", inode.nodeid, uio.uio_offset, cached_len,
+            (mp.mstart * _chunk_size) + in_chunk);
+    } else {
+        // Bytes the new mapping has to span, from the start of first_chunk
+        size_t span = in_chunk;
+        if (aggressive) {
+            // Map the rest of the file
+            span += inode.attr.size - uio.uio_offset;
+        } else {
+            // Map just enough chunks to satisfy read_amt
+            span += read_amt;
+        }
+        // Round up to whole chunks
+        chunk nchunks = span / _chunk_size;
+        if (span % _chunk_size > 0) {
+            nchunks++;
+        }
+        // NOTE: This relies on the fact that requesting a mapping longer than
+        // the remaining file works (see mmap() on the host). If that didn't
+        // work, we would have to request exact mappings (byte-granularity,
+        // rather than chunk-granularity).
+        int error = map(inode.nodeid, file_handle, nchunks, first_chunk, mp,
+            true);
+        if (error) {
+            return error;
+        }
+    }
+
+    // Copy out no more than what the contiguous mapping at hand covers
+    auto src = _window->addr + (mp.mstart * _chunk_size) + in_chunk;
+    auto len = std::min<size_t>(read_amt,
+        (mp.nchunks * _chunk_size) - in_chunk);
+    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
+    // accesses). From the spec: "Drivers map this shared memory region with
+    // writeback caching as if it were regular RAM."
+    int error = uiomove(const_cast<void*>(src), len, &uio);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
+    }
+    return error;
+}
+
+int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, mapping_part& mapped, bool evict)
+{
+    // Chunks still free at the top of the window
+    auto free_chunks = _window_chunks - first_empty();
+    if (evict && free_chunks < nchunks) {
+        // Make room by dropping (bookkeeping only) just enough recent chunks
+        mapping_part evicted;
+        auto error = unmap(nchunks - free_chunks, evicted, false);
+        if (error) {
+            return error;
+        }
+        free_chunks += evicted.nchunks;
+    }
+
+    // New chunks always go right after the existing mappings
+    const auto window_pos = _window_chunks - free_chunks;
+    const auto granted = std::min<chunk>(nchunks, free_chunks);
+    if (granted == 0) {
+        // The window is full and evict is false, or nchunks is 0
+        mapped.mstart = window_pos;
+        mapped.nchunks = 0;
+        return (nchunks == 0) ? 0 : ENOBUFS;
+    }
+    auto error = map_ll(nodeid, file_handle, granted, fstart, window_pos);
+    if (error) {
+        return error;
+    }
+
+    // Record the new chunks, growing the last mapping if they continue it
+    const bool extends_last = !_mappings.empty() &&
+        _mappings.back().nodeid == nodeid &&
+        _mappings.back().fstart + _mappings.back().nchunks == fstart;
+    if (extends_last) {
+        _mappings.back().nchunks += granted;
+    } else {
+        _mappings.emplace_back(nodeid, granted, fstart, window_pos);
+    }
+    mapped.mstart = window_pos;
+    mapped.nchunks = granted;
+    return 0;
+}
+
+int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep)
+{
+ // Plan the changes: walk the mappings from the last one backwards
+ chunk to_unmap = 0; // chunks selected for unmapping so far
+ auto erase_first {_mappings.cend()}; // first mapping to be dropped entirely
+ chunk to_unmap_from_last = 0; // chunks to trim off the last mapping kept
+ for (auto it {_mappings.crbegin()};
+ to_unmap < nchunks && it != _mappings.crend(); it++) {
+
+ if (it->nchunks <= nchunks - to_unmap) {
+ // *it fits entirely in what is still to be unmapped: drop it
+ erase_first = it.base() - 1; // forward iterator to *it
+ to_unmap += it->nchunks;
+ } else {
+ // Only the tail of *it is needed: trim it, which ends the loop
+ to_unmap_from_last = nchunks - to_unmap;
+ to_unmap = nchunks;
+ }
+ }
+ if (to_unmap == 0) {
+ // Nothing to unmap: the window is empty, or nchunks is 0
+ unmapped.mstart = first_empty();
+ unmapped.nchunks = 0;
+ return (nchunks == 0) ? 0 : ENODATA;
+ }
+
+ // Apply the plan: on the device first (if deep), then in the bookkeeping
+ if (deep) {
+ auto mstart = first_empty() - to_unmap; // window chunk where removal starts
+ auto error = unmap_ll(to_unmap, mstart);
+ if (error) {
+ return error;
+ }
+ }
+ _mappings.erase(erase_first, _mappings.cend());
+ if (to_unmap_from_last > 0) {
+ _mappings.back().nchunks -= to_unmap_from_last; // back() is the trimmed one
+ }
+
+ unmapped.mstart = first_empty();
+ unmapped.nchunks = to_unmap;
+ return 0;
+}
+
+int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
+    chunk fstart, chunk mstart)
+{
+    // The target range must lie entirely within the window
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: FUSE_SETUPMAPPING restricts its arguments. From the spec:
+    // "Alignment constraints for FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
+    // requests are communicated during FUSE_INIT negotiation". That is:
+    // - foffset: multiple of map_alignment from FUSE_INIT
+    // - len: not larger than remaining file?
+    // - moffset: multiple of map_alignment from FUSE_INIT
+    // In practice, map_alignment is the host's page size, because foffset and
+    // moffset are passed to mmap() on the host. Both are satisfied here since
+    // _chunk_size is a multiple of map_alignment.
+
+    // Allocation failure is reported as ENOMEM rather than thrown
+    std::unique_ptr<fuse_setupmapping_in> req {
+        new (std::nothrow) fuse_setupmapping_in()};
+    if (!req) {
+        return ENOMEM;
+    }
+    // The request takes byte quantities, so scale every chunk count
+    req->fh = file_handle;
+    req->flags = 0; // Read-only
+    req->foffset = fstart * _chunk_size;
+    req->moffset = mstart * _chunk_size;
+    req->len = nchunks * _chunk_size;
+
+    virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
+        "moffset=%lld)\n", nodeid, req->foffset, req->len,
+        req->moffset);
+    const auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING,
+        nodeid, req.get(), sizeof(*req), nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid);
+    }
+    return error;
+}
+
+int dax_manager::unmap_ll(chunk nchunks, chunk mstart)
+{
+    assert(mstart + nchunks <= _window_chunks);
+
+    // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by
+    // fuse_removemapping_in.count fuse_removemapping_one arguments in general.
+    // Here a single contiguous range is removed, so count is always 1.
+    auto in_args_size = sizeof(fuse_removemapping_in) +
+        sizeof(fuse_removemapping_one);
+    // The buffer comes from new[], so it must be owned as an array (u8[]):
+    // a unique_ptr<u8> would release it with scalar delete (undefined).
+    std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]};
+    if (!in_args) {
+        return ENOMEM;
+    }
+    auto r_in = new (in_args.get()) fuse_removemapping_in();
+    auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in))
+        fuse_removemapping_one();
+    r_in->count = 1;
+    r_one->moffset = mstart * _chunk_size;
+    r_one->len = nchunks * _chunk_size;
+
+    // The nodeid is irrelevant for the current implementation of
+    // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a
+    // request per inode?
+    uint64_t nodeid = 0;
+    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
+        nodeid, r_one->moffset, r_one->len);
+    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING,
+        nodeid, in_args.get(), in_args_size, nullptr, 0);
+    if (error) {
+        kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid);
+    }
+    return error;
+}
+
+bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const
+{
+    // First mapping of nodeid whose chunk range contains fstart, if any
+    const auto hit = std::find_if(_mappings.cbegin(), _mappings.cend(),
+        [nodeid, fstart](const mapping& m) {
+            return m.nodeid == nodeid && m.fstart <= fstart &&
+                m.fstart + m.nchunks > fstart;
+        });
+    if (hit == _mappings.cend()) {
+        return false;
+    }
+    const auto skipped = fstart - hit->fstart; // chunks of *hit before fstart
+    found.nchunks = hit->nchunks - skipped;
+    found.mstart = hit->mstart + skipped;
+    return true;
+}
+
+dax_manager::chunk dax_manager::first_empty() const
+{
+    // Mappings are stacked from chunk 0 upwards, so the first free chunk is
+    // right after the last mapping (or chunk 0 when nothing is mapped).
+    return _mappings.empty()
+        ? 0
+        : _mappings.back().mstart + _mappings.back().nchunks;
+}
+
+}
diff --git a/fs/virtiofs/virtiofs_dax.hh b/fs/virtiofs/virtiofs_dax.hh
new file mode 100644
index 00000000..2b9fa341
--- /dev/null
+++ b/fs/virtiofs/virtiofs_dax.hh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2020 Fotis Xenakis
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include <vector>
+
+#include <api/assert.h>
+#include <osv/mutex.h>
+#include <osv/uio.h>
+
+#include "drivers/virtio-fs.hh"
+#include "virtiofs.hh"
+
+namespace virtiofs {
+
+// A manager for the DAX window of a virtio-fs device. This implements a
+// straightforward scheme for file mappings:
+// - The window is split into equally-sized chunks. Each mapping occupies an
+//   integer number of consecutive chunks.
+// - New mappings are placed on the lowest available chunks in the window.
+// - When there are not enough chunks available for a new mapping, the highest
+//   (i.e. most recently mapped) chunks occupied are evicted. Thus, chunks are
+//   mapped in a LIFO manner (the window resembles a stack).
+class dax_manager {
+public:
+ static constexpr size_t DEFAULT_CHUNK_SIZE = 1 << 21; // 2MiB
+
+ // Construct a new manager for the DAX window associated with @drv (as
+ // returned by drv.get_dax()). @chunk_size must be a multiple of the device's
+ // mapping alignment, i.e. of 1 << drv.get_map_alignment() bytes; this is
+ // asserted.
+ dax_manager(virtio::fs& drv, size_t chunk_size = DEFAULT_CHUNK_SIZE)
+ : _drv {drv},
+ _window {drv.get_dax()},
+ _chunk_size {chunk_size},
+ _window_chunks {_window->len / _chunk_size} {
+
+ assert(_chunk_size % (1ull << _drv.get_map_alignment()) == 0);
+
+ // NOTE: If _window->len % _chunk_size > 0, that remainder (< _chunk_size)
+ // is effectively ignored.
+ }
+
+ // Read up to @read_amt bytes of @inode (opened as @file_handle) into @uio,
+ // using the DAX window. If @aggressive, try to map the rest of the file too.
+ int read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
+ struct uio& uio, bool aggressive = false);
+
+private:
+ // Helper type to better distinguish referring to chunks vs bytes
+ using chunk = size_t;
+
+ struct mapping { // A run of consecutive chunks of one file in the window
+ mapping(uint64_t _nodeid, chunk _nchunks, chunk _fstart, chunk _mstart)
+ : nodeid {_nodeid},
+ nchunks {_nchunks},
+ fstart {_fstart},
+ mstart {_mstart} {}
+ uint64_t nodeid; // node id of the mapped file
+ chunk nchunks; // length of the mapping, in chunks
+ chunk fstart; // first mapped chunk, counted within the file
+ chunk mstart; // first occupied chunk, counted within the window
+ };
+
+ struct mapping_part { // A run of consecutive chunks in the window
+ chunk nchunks; // length, in chunks
+ chunk mstart; // first chunk, counted within the window
+ };
+
+ // Map up to @nchunks chunks of the file with @nodeid, starting at chunk
+ // @fstart of the file, after all other mappings. If @evict, evict other
+ // chunks if necessary. Returns in @mapped the new mapping and non-zero on
+ // failure. Called with _lock held (for writing).
+ int map(uint64_t nodeid, uint64_t file_handle, chunk nchunks, chunk fstart,
+ mapping_part& mapped, bool evict = false);
+ // Unmap up to @nchunks of the last mapped chunks, also unmapping them on the
+ // device if @deep. Returns in @unmapped what was unmapped and non-zero on
+ // failure. Called with _lock held (for writing).
+ int unmap(chunk nchunks, mapping_part& unmapped, bool deep = false);
+ // Map @nchunks chunks of the file with @nodeid (opened as @fh), starting at
+ // chunk @fstart of the file and chunk @mstart of the window. Returns
+ // non-zero on failure. Called with _lock held (for writing).
+ int map_ll(uint64_t nodeid, uint64_t fh, chunk nchunks, chunk fstart,
+ chunk mstart);
+ // Unmap @nchunks chunks, starting at chunk @mstart of the window. Returns
+ // non-zero on failure. Called with _lock held (for writing).
+ int unmap_ll(chunk nchunks, chunk mstart);
+
+ // Return in @found the largest contiguous existing mapping for @nodeid
+ // starting at @fstart. If none found, returns false. Called with _lock held
+ // (for reading).
+ bool find(uint64_t nodeid, chunk fstart, mapping_part& found) const;
+ // Returns the first empty chunk in the window, or one-past-the-last if the
+ // window is full. Called with _lock held (for reading).
+ chunk first_empty() const;
+
+ virtio::fs& _drv; // device the FUSE mapping requests are sent to
+ const virtio::fs::dax_window* const _window; // as returned by drv.get_dax()
+ const size_t _chunk_size; // bytes per chunk
+ const chunk _window_chunks; // whole chunks that fit in the window
+ // TODO OPT: Switch to rwlock
+ mutex _lock;
+ std::vector<mapping> _mappings; // in mapping order; the last one is highest
+};
+
+}
--
2.27.0