[PATCH 1/3] vfs: store and track filesystem ID


Waldemar Kozaczuk

Apr 4, 2020, 12:58:53 AM
to osv...@googlegroups.com, Waldemar Kozaczuk
This patch adds a new field, m_fsid, to the filesystem configuration structure
to identify which filesystem (zfs, rofs, etc.) a given vnode belongs to.
This becomes necessary to properly refactor the logic in the pagecache.
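
For illustration, here is a minimal sketch (not part of this patch) of how a
caller can branch on the new static filesystem id; the helper name
is_zfs_vnode() is hypothetical:

static inline bool is_zfs_vnode(struct vnode *vp)
{
    // m_fsid is copied from the matching vfssw entry by sys_mount() (see the
    // vfs_mount.cc hunk below), so __val[1] holds one of the *_ID constants
    return vp->v_mount->m_fsid.__val[1] == ZFS_ID;
}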

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
.../opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 10 ++++++----
fs/vfs/vfs.h | 9 +++++++++
fs/vfs/vfs_conf.cc | 18 +++++++++---------
fs/vfs/vfs_mount.cc | 1 +
include/osv/mount.h | 1 +
5 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index cd08ff4f..d79e0360 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -1005,11 +1005,13 @@ zfs_domount(vfs_t *vfsp, const char *osname)
* The 8-bit fs type must be put in the low bits of fsid[1]
* because that's where other Solaris filesystems put it.
*/
- fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
- ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
- vfsp->vfs_fsid.__val[0] = fsid_guid;
+ //DISABLE the logic below to make sure that vfs_fsid
+ //is static and stays the same as it is set in config
+ //fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ //ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ //vfsp->vfs_fsid.__val[0] = fsid_guid;
/* fsid type not included */
- vfsp->vfs_fsid.__val[1] = fsid_guid >> 32;
+ //vfsp->vfs_fsid.__val[1] = fsid_guid >> 32;

/*
* Set features for file system.
diff --git a/fs/vfs/vfs.h b/fs/vfs/vfs.h
index d86ef957..ef165a62 100644
--- a/fs/vfs/vfs.h
+++ b/fs/vfs/vfs.h
@@ -76,6 +76,15 @@ extern int vfs_debug;

#define OPEN_MAX 256

+#define RAMFS_ID 1
+#define DEVFS_ID 2
+#define NFS_ID 3
+#define PROCFS_ID 4
+#define SYSFS_ID 5
+#define ZFS_ID 6
+#define ROFS_ID 7
+#define VIRTIOFS_ID 8
+
/*
* per task data
*/
diff --git a/fs/vfs/vfs_conf.cc b/fs/vfs/vfs_conf.cc
index 4a54cb97..1976092b 100644
--- a/fs/vfs/vfs_conf.cc
+++ b/fs/vfs/vfs_conf.cc
@@ -67,13 +67,13 @@ extern "C" int zfs_init(void);
* VFS switch table
*/
const struct vfssw vfssw[] = {
- {"ramfs", ramfs_init, &ramfs_vfsops},
- {"devfs", devfs_init, &devfs_vfsops},
- {"nfs", nfs_init, &nfs_vfsops},
- {"procfs", procfs_init, &procfs_vfsops},
- {"sysfs", sysfs_init, &sysfs_vfsops},
- {"zfs", zfs_init, &zfs_vfsops},
- {"rofs", rofs_init, &rofs_vfsops},
- {"virtiofs", virtiofs_init, &virtiofs_vfsops},
- {nullptr, fs_noop, nullptr},
+ {{0, RAMFS_ID}, "ramfs", ramfs_init, &ramfs_vfsops},
+ {{0, DEVFS_ID}, "devfs", devfs_init, &devfs_vfsops},
+ {{0, NFS_ID}, "nfs", nfs_init, &nfs_vfsops},
+ {{0, PROCFS_ID}, "procfs", procfs_init, &procfs_vfsops},
+ {{0, SYSFS_ID}, "sysfs", sysfs_init, &sysfs_vfsops},
+ {{0, ZFS_ID}, "zfs", zfs_init, &zfs_vfsops},
+ {{0, ROFS_ID}, "rofs", rofs_init, &rofs_vfsops},
+ {{0, VIRTIOFS_ID}, "virtiofs", virtiofs_init, &virtiofs_vfsops},
+ {{0, 0}, nullptr, fs_noop, nullptr},
};
diff --git a/fs/vfs/vfs_mount.cc b/fs/vfs/vfs_mount.cc
index dac4d09c..bcc9568a 100644
--- a/fs/vfs/vfs_mount.cc
+++ b/fs/vfs/vfs_mount.cc
@@ -147,6 +147,7 @@ sys_mount(const char *dev, const char *dir, const char *fsname, int flags, const
mp->m_flags = flags;
mp->m_dev = device;
mp->m_data = nullptr;
+ mp->m_fsid = fs->m_fsid;
strlcpy(mp->m_path, dir, sizeof(mp->m_path));
strlcpy(mp->m_special, dev, sizeof(mp->m_special));

diff --git a/include/osv/mount.h b/include/osv/mount.h
index 7268d8ce..99eb8766 100755
--- a/include/osv/mount.h
+++ b/include/osv/mount.h
@@ -102,6 +102,7 @@ struct mount {
* Filesystem type switch table.
*/
struct vfssw {
+ fsid_t m_fsid; /* filesystem id */
const char *vs_name; /* name of file system */
int (*vs_init)(void); /* initialize routine */
struct vfsops *vs_op; /* pointer to vfs operation */
--
2.20.1

Waldemar Kozaczuk

Apr 4, 2020, 12:59:04 AM
to osv...@googlegroups.com, Waldemar Kozaczuk
So far the pagecache layer has been tightly integrated with ZFS,
and more specifically with its ARC cache layer. This patch refactors
the pagecache implementation to support filesystems other than ZFS.

In essence, we modify all the relevant places, such as retrieving and releasing
cached file pages, to behave slightly differently depending on the filesystem
(ZFS versus non-ZFS) the given vnode belongs to. The changes only apply to the
read cache, where ZFS page caching requires tighter integration with ARC.

This patch also adds a new integration function - map_read_cached_page() -
intended to be used by non-ZFS filesystem implementations to register
cached file pages with the page cache.
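
For a non-ZFS filesystem, the intended usage of the new hook looks roughly like
the sketch below; the names myfs_map_cached_page() and myfs_get_resident_page()
are hypothetical (the real ROFS implementation follows in the next patch):

static int myfs_map_cached_page(struct vnode *vnode, struct file *fp, struct uio *uio)
{
    void *page_address = nullptr;
    // filesystem-specific step: make sure the 4K page at uio->uio_offset is
    // resident in the filesystem's own cache and obtain its address
    int error = myfs_get_resident_page(vnode, uio->uio_offset, &page_address);
    if (!error) {
        // the pagecache passes the hashkey through uio_iov->iov_base
        pagecache::map_read_cached_page((pagecache::hashkey*)uio->uio_iov->iov_base,
                                        page_address);
        uio->uio_resid = 0; // signal to the caller that the page was mapped
    }
    return error;
}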

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
core/pagecache.cc | 176 ++++++++++++++++++++++++++++-----------
fs/vfs/vfs_fops.cc | 2 +-
include/osv/pagecache.hh | 1 +
include/osv/vfs_file.hh | 2 +-
4 files changed, 131 insertions(+), 50 deletions(-)

diff --git a/core/pagecache.cc b/core/pagecache.cc
index d5d2ee86..426e2a1c 100644
--- a/core/pagecache.cc
+++ b/core/pagecache.cc
@@ -161,7 +161,7 @@ protected:
public:
cached_page(hashkey key, void* page) : _key(key), _page(page) {
}
- ~cached_page() {
+ virtual ~cached_page() {
}

void map(mmu::hw_ptep<0> ptep) {
@@ -198,7 +198,7 @@ public:
_vp = fp->f_dentry->d_vnode;
vref(_vp);
}
- ~cached_page_write() {
+ virtual ~cached_page_write() {
if (_page) {
if (_dirty) {
writeback();
@@ -238,7 +238,7 @@ public:

class cached_page_arc;

-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush = true);
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush = true);

class cached_page_arc : public cached_page {
public:
@@ -267,7 +267,7 @@ private:

public:
cached_page_arc(hashkey key, void* page, arc_buf_t* ab) : cached_page(key, page), _ab(ref(ab, this)) {}
- ~cached_page_arc() {
+ virtual ~cached_page_arc() {
if (!_removed && unref(_ab, this)) {
arc_unshare_buf(_ab);
}
@@ -282,7 +282,7 @@ public:
std::for_each(it.first, it.second, [&count](arc_map::value_type& p) {
auto cp = p.second;
cp->_removed = true;
- count += drop_read_cached_page(cp, false);
+ count += drop_arc_read_cached_page(cp, false);
});
arc_cache_map.erase(ab);
if (count) {
@@ -296,10 +296,14 @@ static bool operator==(const cached_page_arc::arc_map::value_type& l, const cach
}

std::unordered_multimap<arc_buf_t*, cached_page_arc*> cached_page_arc::arc_cache_map;
-static std::unordered_map<hashkey, cached_page_arc*> read_cache;
+//Map used to store read cache pages for ZFS filesystem interacting with ARC
+static std::unordered_map<hashkey, cached_page_arc*> arc_read_cache;
+//Map used to store read cache pages for non-ZFS filesystems
+static std::unordered_map<hashkey, cached_page*> read_cache;
static std::unordered_map<hashkey, cached_page_write*> write_cache;
static std::deque<cached_page_write*> write_lru;
-static mutex arc_lock; // protects against parallel access to the read cache
+static mutex arc_read_lock; // protects against parallel access to the ARC read cache
+static mutex read_lock; // protects against parallel access to the read cache
static mutex write_lock; // protect against parallel access to the write cache

template<typename T>
@@ -314,38 +318,57 @@ static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key)
}
}

+static void add_read_mapping(cached_page *cp, mmu::hw_ptep<0> ptep)
+{
+ cp->map(ptep);
+}
+
TRACEPOINT(trace_add_read_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void add_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> ptep)
+static void add_arc_read_mapping(cached_page_arc *cp, mmu::hw_ptep<0> ptep)
{
trace_add_read_mapping(cp->arcbuf(), cp->addr(), ptep.release());
- cp->map(ptep);
+ add_read_mapping(cp, ptep);
}

-TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void remove_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> ptep)
+template<typename T>
+static void remove_read_mapping(std::unordered_map<hashkey, T>& cache, cached_page* cp, mmu::hw_ptep<0> ptep)
{
- trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release());
if (cp->unmap(ptep) == 0) {
- read_cache.erase(cp->key());
+ cache.erase(cp->key());
delete cp;
}
}

+TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
+static void remove_arc_read_mapping(cached_page_arc* cp, mmu::hw_ptep<0> ptep)
+{
+ trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release());
+ remove_read_mapping(arc_read_cache, cp, ptep);
+}
+
void remove_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
{
- SCOPE_LOCK(arc_lock);
- cached_page_arc* cp = find_in_cache(read_cache, key);
+ SCOPE_LOCK(read_lock);
+ cached_page* cp = find_in_cache(read_cache, key);
if (cp) {
- remove_read_mapping(cp, ptep);
+ remove_read_mapping(read_cache, cp, ptep);
}
}

-TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
+void remove_arc_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
+{
+ SCOPE_LOCK(arc_read_lock);
+ cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+ if (cp) {
+ remove_arc_read_mapping(cp, ptep);
+ }
+}
+
+template<typename T>
+static unsigned drop_read_cached_page(std::unordered_map<hashkey, T>& cache, cached_page* cp, bool flush)
{
- trace_drop_read_cached_page(cp->arcbuf(), cp->addr());
int flushed = cp->flush();
- read_cache.erase(cp->key());
+ cache.erase(cp->key());

if (flush && flushed > 1) { // if there was only one pte it is the one we are faulting on; no need to flush.
mmu::flush_tlb_all();
@@ -356,12 +379,28 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
return flushed;
}

-void drop_read_cached_page(hashkey& key)
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush)
+{
+ return drop_read_cached_page(arc_read_cache, cp, flush);
+}
+
+static void drop_read_cached_page(hashkey& key)
{
- SCOPE_LOCK(arc_lock);
- cached_page_arc* cp = find_in_cache(read_cache, key);
+ SCOPE_LOCK(read_lock);
+ cached_page* cp = find_in_cache(read_cache, key);
if (cp) {
- drop_read_cached_page(cp, true);
+ drop_read_cached_page(read_cache, cp, true);
+ }
+}
+
+TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
+static void drop_arc_read_cached_page(hashkey& key)
+{
+ SCOPE_LOCK(arc_read_lock);
+ cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+ if (cp) {
+ trace_drop_read_cached_page(cp->arcbuf(), cp->addr());
+ drop_read_cached_page(arc_read_cache, cp, true);
}
}

@@ -369,7 +408,7 @@ TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*);
void unmap_arc_buf(arc_buf_t* ab)
{
trace_unmap_arc_buf(ab);
- SCOPE_LOCK(arc_lock);
+ SCOPE_LOCK(arc_read_lock);
cached_page_arc::unmap_arc_buf(ab);
}

@@ -377,15 +416,22 @@ TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", void*, void*);
void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page)
{
trace_map_arc_buf(ab, page);
- SCOPE_LOCK(arc_lock);
+ SCOPE_LOCK(arc_read_lock);
cached_page_arc* pc = new cached_page_arc(*key, page, ab);
- read_cache.emplace(*key, pc);
+ arc_read_cache.emplace(*key, pc);
arc_share_buf(ab);
}

+void map_read_cached_page(hashkey *key, void *page)
+{
+ SCOPE_LOCK(read_lock);
+ cached_page* pc = new cached_page(*key, page);
+ read_cache.emplace(*key, pc);
+}
+
static int create_read_cached_page(vfs_file* fp, hashkey& key)
{
- return fp->get_arcbuf(&key, key.offset);
+ return fp->read_page_from_cache(&key, key.offset);
}

static std::unique_ptr<cached_page_write> create_write_cached_page(vfs_file* fp, hashkey& key)
@@ -422,6 +468,8 @@ static void insert(cached_page_write* cp) {
}
}

+#define IS_ZFS(fsid) (fsid.__val[1] == ZFS_ID)
+
bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared)
{
struct stat st;
@@ -430,6 +478,8 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
SCOPE_LOCK(write_lock);
cached_page_write* wcp = find_in_cache(write_cache, key);

+ bool zfs = IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid);
+
if (write) {
if (!wcp) {
auto newcp = create_write_cached_page(fp, key);
@@ -437,17 +487,25 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
// write fault into shared mapping, there page is not in write cache yet, add it.
wcp = newcp.release();
insert(wcp);
- // page is moved from ARC to write cache
- // drop ARC page if exists, removing all mappings
- drop_read_cached_page(key);
+ // page is moved from read cache to write cache
+ // drop read page if exists, removing all mappings
+ if (zfs) {
+ drop_arc_read_cached_page(key);
+ } else {
+ drop_read_cached_page(key);
+ }
} else {
- // remove mapping to ARC page if exists
- remove_read_mapping(key, ptep);
- // cow of private page from ARC
+ // remove mapping to read cache page if exists
+ if (zfs) {
+ remove_arc_read_mapping(key, ptep);
+ } else {
+ remove_read_mapping(key, ptep);
+ }
+ // cow (copy-on-write) of private page from read cache
return mmu::write_pte(newcp->release(), ptep, pte);
}
} else if (!shared) {
- // cow of private page from write cache
+ // cow (copy-on-write) of private page from write cache
void* page = memory::alloc_page();
memcpy(page, wcp->addr(), mmu::page_size);
return mmu::write_pte(page, ptep, pte);
@@ -456,11 +514,22 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
int ret;
// read fault and page is not in write cache yet, return one from ARC, mark it cow
do {
- WITH_LOCK(arc_lock) {
- cached_page_arc* cp = find_in_cache(read_cache, key);
- if (cp) {
- add_read_mapping(cp, ptep);
- return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ if (zfs) {
+ WITH_LOCK(arc_read_lock) {
+ cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+ if (cp) {
+ add_arc_read_mapping(cp, ptep);
+ return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ }
+ }
+ }
+ else {
+ WITH_LOCK(read_lock) {
+ cached_page* cp = find_in_cache(read_cache, key);
+ if (cp) {
+ add_read_mapping(cp, ptep);
+ return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ }
}
}

@@ -513,12 +582,23 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep)
}
}

- WITH_LOCK(arc_lock) {
- cached_page_arc* rcp = find_in_cache(read_cache, key);
- if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
- // page is in ARC
- remove_read_mapping(rcp, ptep);
- return false;
+ if (IS_ZFS(fp->f_dentry->d_vnode->v_mount->m_fsid)) {
+ WITH_LOCK(arc_read_lock) {
+ cached_page_arc* rcp = find_in_cache(arc_read_cache, key);
+ if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+ // page is in ARC read cache
+ remove_arc_read_mapping(rcp, ptep);
+ return false;
+ }
+ }
+ } else {
+ WITH_LOCK(read_lock) {
+ cached_page* rcp = find_in_cache(read_cache, key);
+ if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+ // page is in regular read cache
+ remove_read_mapping(read_cache, rcp, ptep);
+ return false;
+ }
}
}

@@ -595,7 +675,7 @@ private:
auto start = sched::thread::current()->thread_clock();
auto deadline = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::nanoseconds(static_cast<unsigned long>(work/_freq))) + start;

- WITH_LOCK(arc_lock) {
+ WITH_LOCK(arc_read_lock) {
while (sched::thread::current()->thread_clock() < deadline && buckets_scanned < bucket_count) {
if (current_bucket >= cached_page_arc::arc_cache_map.bucket_count()) {
current_bucket = 0;
@@ -617,7 +697,7 @@ private:

// mark ARC buffers as accessed when we have 1024 of them
if (!(cleared % 1024)) {
- DROP_LOCK(arc_lock) {
+ DROP_LOCK(arc_read_lock) {
flush = mark_accessed(accessed);
}
}
diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc
index 3a8f98b4..0b73cecd 100644
--- a/fs/vfs/vfs_fops.cc
+++ b/fs/vfs/vfs_fops.cc
@@ -157,7 +157,7 @@ void vfs_file::sync(off_t start, off_t end)
// eviction that will hold the mmu-side lock that protects the mappings
// Always follow that order. We however can't just get rid of the mmu-side lock,
// because not all invalidations will be synchronous.
-int vfs_file::get_arcbuf(void* key, off_t offset)
+int vfs_file::read_page_from_cache(void* key, off_t offset)
{
struct vnode *vp = f_dentry->d_vnode;

diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh
index f3881b19..efb7b32d 100644
--- a/include/osv/pagecache.hh
+++ b/include/osv/pagecache.hh
@@ -35,4 +35,5 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep);
void sync(vfs_file* fp, off_t start, off_t end);
void unmap_arc_buf(arc_buf_t* ab);
void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page);
+void map_read_cached_page(hashkey *key, void *page);
}
diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh
index ffda5a4c..bb02555a 100644
--- a/include/osv/vfs_file.hh
+++ b/include/osv/vfs_file.hh
@@ -26,7 +26,7 @@ public:
virtual bool put_page(void *addr, uintptr_t offset, mmu::hw_ptep<0> ptep);
virtual void sync(off_t start, off_t end);

- int get_arcbuf(void *key, off_t offset);
+ int read_page_from_cache(void *key, off_t offset);
};

#endif /* VFS_FILE_HH_ */
--
2.20.1

Waldemar Kozaczuk

Apr 4, 2020, 12:59:08 AM
to osv...@googlegroups.com, Waldemar Kozaczuk
This patch optimizes memory utilization by integrating ROFS with the page cache.
In essence, it eliminates the second copy of file data in memory when mapping files using mmap().

The crux of the changes involves adding a new vnops function of type VOP_CACHE -
rofs_map_cached_page() - which ensures that the requested page of a file is loaded
from disk into the ROFS cache (by triggering a read from disk if missing) and
then registers the page with the pagecache by calling pagecache::map_read_cached_page().
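
The resulting mmap read-fault path for a ROFS-backed file then looks roughly as
follows (a sketch based only on the code in this series; exact call sites may
differ):

  pagecache::get(fp, offset, ...)                        // read fault, page not yet cached
    -> create_read_cached_page(fp, key)
       -> vfs_file::read_page_from_cache(&key, offset)   // formerly get_arcbuf()
          -> VOP_CACHE -> rofs_map_cached_page(vnode, fp, uio)
             -> rofs::cache_get_page_address(...)        // load the 4K page into the ROFS cache if missing
             -> pagecache::map_read_cached_page(key, page_address)
  pagecache::get() then finds the page in read_cache and maps it copy-on-write.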

This partially addresses #979

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
fs/rofs/rofs.hh | 2 ++
fs/rofs/rofs_cache.cc | 63 ++++++++++++++++++++++++++++++++++++++++---
fs/rofs/rofs_vnops.cc | 38 ++++++++++++++++++++++++--
3 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh
index 16f811b0..401cad8d 100644
--- a/fs/rofs/rofs.hh
+++ b/fs/rofs/rofs.hh
@@ -128,6 +128,8 @@ struct rofs_info {
namespace rofs {
int
cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio);
+ int
+ cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, off_t offset, void **addr);
}

int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t blocks_count, void* buf);
diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc
index cfb5287d..533d902b 100644
--- a/fs/rofs/rofs_cache.cc
+++ b/fs/rofs/rofs_cache.cc
@@ -10,8 +10,10 @@
#include <list>
#include <unordered_map>
#include <include/osv/uio.h>
+#include <include/osv/contiguous_alloc.hh>
#include <osv/debug.h>
#include <osv/sched.hh>
+#include <sys/mman.h>

/*
* From cache perspective let us divide each file into sequence of contiguous 32K segments.
@@ -56,20 +58,29 @@ public:
this->starting_block = _starting_block;
this->block_count = _block_count;
this->data_ready = false; // Data has to be loaded from disk
- this->data = malloc(_cache->sb->block_size * _block_count);
+ //Round-up allocation size up to whole pages
+ auto pages_to_allocate = _block_count / 8; // 4K pages is made of 8 512-byte blocks
+ if (_block_count % 8) {
+ pages_to_allocate++;
+ }
+ this->data = memory::alloc_phys_contiguous_aligned(pages_to_allocate * mmu::page_size, mmu::page_size);
#if defined(ROFS_DIAGNOSTICS_ENABLED)
rofs_block_allocated += block_count;
#endif
}

~file_cache_segment() {
- free(this->data);
+ memory::free_phys_contiguous_aligned(this->data);
}

uint64_t length() {
return this->block_count * this->cache->sb->block_size;
}

+ void* memory_address(off_t offset) {
+ return this->data + offset;
+ }
+
bool is_data_ready() {
return this->data_ready;
}
@@ -190,8 +201,8 @@ plan_cache_transactions(struct file_cache *cache, struct uio *uio) {
bytes_to_read -= transaction.bytes_to_read;
transactions.push_back(transaction);
}
- //
- // Miss -> read from disk
+ //
+ // Miss -> read from disk
else {
print("[rofs] [%d] -> rofs_cache_get_segment_operations i-node: %d, cache segment %d MISS at file offset %d\n",
sched::thread::current()->id(), cache->inode->inode_no, cache_segment_index, file_offset);
@@ -271,4 +282,48 @@ cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_bl
return error;
}

+// Ensure a page (4096 bytes) of a file specified by offset is in memory in cache. Otherwise
+// load it from disk and eventually return address of the page in memory.
+int
+cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, off_t offset, void **addr)
+{
+ //
+ // Find existing one or create new file cache
+ struct file_cache *cache = get_or_create_file_cache(inode, sb);
+
+ struct uio _uio;
+ _uio.uio_offset = offset;
+ _uio.uio_resid = mmu::page_size;
+ //
+ // Prepare a cache transaction (copy from memory
+ // or read from disk into cache memory and then copy into memory)
+ auto segment_transactions = plan_cache_transactions(cache, &_uio);
+ print("[rofs] [%d] rofs_get_page_address called for i-node [%d] at %d with %d ops\n",
+ sched::thread::current()->id(), inode->inode_no, offset, segment_transactions.size());
+
+ int error = 0;
+
+ auto it = segment_transactions.begin();
+ assert(it != segment_transactions.end()); // There should be at least ONE transaction
+ auto transaction = *it;
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+ rofs_cache_reads += 1;
+#endif
+ if (transaction.transaction_type == CacheTransactionType::READ_FROM_DISK) {
+ // Read from disk into segment missing in cache or empty segment that was in cache but had not data because
+ // of failure to read
+ error = transaction.segment->read_from_disk(device);
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+ rofs_cache_misses += 1;
+#endif
+ }
+
+ if( !error)
+ *addr = transaction.segment->memory_address(transaction.segment_offset);
+ else
+ *addr = nullptr;
+
+ return error;
+}
+
}
diff --git a/fs/rofs/rofs_vnops.cc b/fs/rofs/rofs_vnops.cc
index 99b2bfbd..15d7fc3b 100644
--- a/fs/rofs/rofs_vnops.cc
+++ b/fs/rofs/rofs_vnops.cc
@@ -40,6 +40,7 @@
#include <sys/types.h>
#include <osv/device.h>
#include <osv/sched.hh>
+#include <osv/pagecache.hh>

#include "rofs.hh"

@@ -272,6 +273,39 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr)
return 0;
}

+static int rofs_map_cached_page(struct vnode *vnode, struct file* fp, struct uio *uio) {
+ struct rofs_info *rofs = (struct rofs_info *) vnode->v_mount->m_data;
+ struct rofs_super_block *sb = rofs->sb;
+ struct rofs_inode *inode = (struct rofs_inode *) vnode->v_data;
+ struct device *device = vnode->v_mount->m_dev;
+
+ if (vnode->v_type == VDIR)
+ return EISDIR;
+ /* Cant read anything but reg */
+ if (vnode->v_type != VREG)
+ return EINVAL;
+ /* Cant start reading before the first byte */
+ if (uio->uio_offset < 0)
+ return EINVAL;
+ /* Cant read after the end of the file */
+ if (uio->uio_offset >= (off_t)vnode->v_size)
+ return 0;
+ if (uio->uio_resid != mmu::page_size)
+ return EINVAL;
+ if (uio->uio_offset % mmu::page_size)
+ return EINVAL;
+
+ void *page_address;
+ int ret = rofs::cache_get_page_address(inode, device, sb, uio->uio_offset, &page_address);
+
+ if (!ret) {
+ pagecache::map_read_cached_page((pagecache::hashkey*)uio->uio_iov->iov_base, page_address);
+ uio->uio_resid = 0;
+ }
+
+ return ret;
+}
+
#define rofs_write ((vnop_write_t)vop_erofs)
#define rofs_seek ((vnop_seek_t)vop_nullop)
#define rofs_ioctl ((vnop_ioctl_t)vop_nullop)
@@ -284,7 +318,6 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr)
#define rofs_inactive ((vnop_inactive_t)vop_nullop)
#define rofs_truncate ((vnop_truncate_t)vop_erofs)
#define rofs_link ((vnop_link_t)vop_erofs)
-#define rofs_arc ((vnop_cache_t) nullptr)
#define rofs_fallocate ((vnop_fallocate_t)vop_erofs)
#define rofs_fsync ((vnop_fsync_t)vop_nullop)
#define rofs_symlink ((vnop_symlink_t)vop_erofs)
@@ -309,7 +342,7 @@ struct vnops rofs_vnops = {
rofs_inactive, /* inactive */
rofs_truncate, /* truncate - returns error when called*/
rofs_link, /* link - returns error when called*/
- rofs_arc, /* arc */ //TODO: Implement to allow memory re-use when mapping files
+ rofs_map_cached_page, /* map page in cache */
rofs_fallocate, /* fallocate - returns error when called*/
rofs_readlink, /* read link */
rofs_symlink /* symbolic link - returns error when called*/
@@ -317,4 +350,5 @@ struct vnops rofs_vnops = {

extern "C" void rofs_disable_cache() {
rofs_vnops.vop_read = rofs_read_without_cache;
+ rofs_vnops.vop_cache = (vnop_cache_t)vop_nullop;
}
--
2.20.1

Waldemar Kozaczuk

Apr 7, 2020, 1:09:12 AM
to osv...@googlegroups.com, Waldemar Kozaczuk
This patch optimizes memory utilization by integrating ROFS with the page cache.
In essence, it eliminates the second copy of file data in memory when mapping files using mmap().
For example, a simple Java example needs 9MB less memory to run.

The crux of the changes involves adding a new vnops function of type VOP_CACHE -
rofs_map_cached_page() - which ensures that the requested page of a file is loaded
from disk into the ROFS cache (by triggering a read from disk if missing) and
then registers the page with the pagecache by calling pagecache::map_read_cached_page().

This partially addresses #979

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
fs/rofs/rofs.hh | 2 ++
fs/rofs/rofs_cache.cc | 70 ++++++++++++++++++++++++++++++++++++++++---
fs/rofs/rofs_vnops.cc | 38 +++++++++++++++++++++--
3 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh
index 16f811b0..401cad8d 100644
--- a/fs/rofs/rofs.hh
+++ b/fs/rofs/rofs.hh
@@ -128,6 +128,8 @@ struct rofs_info {
namespace rofs {
int
cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio);
+ int
+ cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, off_t offset, void **addr);
}

int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t blocks_count, void* buf);
diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc
index cfb5287d..71c39081 100644
--- a/fs/rofs/rofs_cache.cc
+++ b/fs/rofs/rofs_cache.cc
@@ -10,8 +10,10 @@
#include <list>
#include <unordered_map>
#include <include/osv/uio.h>
+#include <include/osv/contiguous_alloc.hh>
#include <osv/debug.h>
#include <osv/sched.hh>
+#include <sys/mman.h>

/*
* From cache perspective let us divide each file into sequence of contiguous 32K segments.
@@ -56,20 +58,36 @@ public:
this->starting_block = _starting_block;
this->block_count = _block_count;
this->data_ready = false; // Data has to be loaded from disk
- this->data = malloc(_cache->sb->block_size * _block_count);
+ auto size = _cache->sb->block_size * _block_count;
+ // Only allocate contigous page-aligned memory if size greater or equal a page
+ // to make sure page-cache mapping works properly
+ if (size >= mmu::page_size) {
+ this->data = memory::alloc_phys_contiguous_aligned(size, mmu::page_size);
+ } else {
+ this->data = malloc(size);
+ }
#if defined(ROFS_DIAGNOSTICS_ENABLED)
rofs_block_allocated += block_count;
#endif
}

~file_cache_segment() {
- free(this->data);
+ auto size = this->cache->sb->block_size * this->block_count;
+ if (size >= mmu::page_size) {
+ memory::free_phys_contiguous_aligned(this->data);
+ } else {
+ free(this->data);
+ }
}

uint64_t length() {
return this->block_count * this->cache->sb->block_size;
}

+ void* memory_address(off_t offset) {
+ return this->data + offset;
+ }
+
bool is_data_ready() {
return this->data_ready;
}
@@ -190,8 +208,8 @@ plan_cache_transactions(struct file_cache *cache, struct uio *uio) {
bytes_to_read -= transaction.bytes_to_read;
transactions.push_back(transaction);
}
- //
- // Miss -> read from disk
+ //
+ // Miss -> read from disk
else {
print("[rofs] [%d] -> rofs_cache_get_segment_operations i-node: %d, cache segment %d MISS at file offset %d\n",
sched::thread::current()->id(), cache->inode->inode_no, cache_segment_index, file_offset);
@@ -271,4 +289,48 @@ cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_bl

Waldek Kozaczuk

Apr 27, 2020, 2:05:41 AM
to OSv Development
Add this here: ->
mmu::flush_tlb_all();

I was about to push this series of patches a couple of days ago when I came across crashes while stress-testing one of the apps with wrk. More specifically, it was the graalvm-netty-plot app using isolates as described in this article - https://medium.com/graalvm/isolates-and-compressed-references-more-flexible-and-efficient-memory-management-for-graalvm-a044cc50b67e. I do not exactly understand the nature of the crashes - it looks like an internal app null-pointer-assert type of error that happens 60-70% of the time, with no error on the OSv side; the app simply exits most of the time with an error message. The same app works perfectly on OSv without these patches, and on Linux.

It took me a while to figure out what is wrong, and honestly, I am not 100% sure I am right. But here is my theory: I noticed that the crash happens only at the beginning of the test, if it happens at all. If I execute a single request with curl and then stress test with wrk, there are no crashes. So my observation was that a crash would happen under stress when multiple app threads create mappings and at the same time execute code causing faults, making ROFS load data from disk into its cache. Once all the code was in the ROFS cache in memory, the crashes would not happen.

After putting all kinds of assertions in the right places to make sure that read_cache is properly locked when accessed for read and write, and validating many other things in the ROFS layer, I could not find any problems with the code. Then I came up with a speculative scenario that might be the source of the crashes. Imagine pagecache::get() is called during a page fault to handle an attempt to write to a page that is set as read-only (for COW): the code first removes the read mapping for that virtual page from the page table (see remove_read_mapping() with 2 parameters) and then updates the mapping to point to the newly allocated page. If the application thread afterwards continues to execute on the same CPU 1, it should see the new mapping without having to flush the TLB (which we do not do here), as the old mapping would have been discarded due to the protection fault (is my thinking correct?). But what if the same thread was migrated to another CPU 2 and accessed data through the same virtual address - quite likely the TLB of that CPU might still point to the old read-only physical frame, and the thread would simply see wrong data. I do not think we are handling this case in any way.
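
To make this scenario concrete, here is a rough sketch of the suspected
interleaving (the CPU numbers are only for illustration; P and P' stand for the
read-only cached page and its newly allocated private copy):

  CPU 1, thread T: read fault on a MAP_PRIVATE page
                     -> pte maps read-only page P (marked COW); CPU 1's TLB caches it
  CPU 1, thread T: write to the same page -> write-protection fault
                     pagecache::get(write=true):
                       remove_read_mapping(key, ptep);              // drop the mapping to P
                       mmu::write_pte(newcp->release(), ptep, pte); // pte now points to P'
  ... T (or another thread of the app) later runs on CPU 2 ...
  CPU 2:           access through the same virtual address
                     -> CPU 2's TLB may still hold the stale read-only translation to P
                        and return the old data

Calling mmu::flush_tlb_all() in remove_read_mapping() shoots the stale entry
down on all CPUs, which matches the fix described below.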

After adding mmu::flush_tlb_all() to remove_read_mapping() (which is only used with ROFS), the crashes went away. So is my theory right? And is the solution correct? Flushing all TLBs is pretty expensive, but do we have another choice?

Also, there are holes in my theory. First, the crashes would happen with a single CPU as well, with no migration to a different CPU, so the same CPU should always see data consistently. So what might be going on in this case? Could the vCPU be migrated to a different physical CPU on the host and cause a similar effect?

Secondly, I could see these crashes with ZFS images as well - why?

What do you think about my theory and the solution - flush_tlb_all()?

Waldemar Kozaczuk

Apr 28, 2020, 10:24:03 PM
to osv...@googlegroups.com, Waldemar Kozaczuk
So far the pagecache layer has been tightly integrated with ZFS,
and more specifically with its ARC cache layer. This patch refactors
the pagecache implementation to support filesystems other than ZFS.

In essence, we modify all the relevant places, such as retrieving and releasing
cached file pages, to behave slightly differently depending on the filesystem
(ZFS versus non-ZFS) the given vnode belongs to. The changes only apply to the
read cache, where ZFS page caching requires tighter integration with ARC.

This patch also adds a new integration function - map_read_cached_page() -
intended to be used by non-ZFS filesystem implementations to register
cached file pages with the page cache.

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
core/pagecache.cc | 200 +++++++++++++++++++++++++++++----------
fs/vfs/vfs_fops.cc | 2 +-
include/osv/pagecache.hh | 1 +
include/osv/vfs_file.hh | 2 +-
4 files changed, 154 insertions(+), 51 deletions(-)

diff --git a/core/pagecache.cc b/core/pagecache.cc
index d5d2ee86..b58a97fb 100644
--- a/core/pagecache.cc
+++ b/core/pagecache.cc
@@ -14,6 +14,7 @@
#include <osv/pagecache.hh>
#include <osv/mempool.hh>
#include <fs/vfs/vfs.h>
+#include <fs/vfs/vfs_id.h>
#include <osv/trace.hh>
#include <osv/prio.hh>
#include <chrono>
@@ -161,7 +162,7 @@ protected:
public:
cached_page(hashkey key, void* page) : _key(key), _page(page) {
}
- ~cached_page() {
+ virtual ~cached_page() {
}

void map(mmu::hw_ptep<0> ptep) {
@@ -198,7 +199,7 @@ public:
_vp = fp->f_dentry->d_vnode;
vref(_vp);
}
- ~cached_page_write() {
+ virtual ~cached_page_write() {
if (_page) {
if (_dirty) {
writeback();
@@ -238,7 +239,7 @@ public:

class cached_page_arc;

-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush = true);
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush = true);

class cached_page_arc : public cached_page {
public:
@@ -267,7 +268,7 @@ private:

public:
cached_page_arc(hashkey key, void* page, arc_buf_t* ab) : cached_page(key, page), _ab(ref(ab, this)) {}
- ~cached_page_arc() {
+ virtual ~cached_page_arc() {
if (!_removed && unref(_ab, this)) {
arc_unshare_buf(_ab);
}
@@ -282,7 +283,7 @@ public:
std::for_each(it.first, it.second, [&count](arc_map::value_type& p) {
auto cp = p.second;
cp->_removed = true;
- count += drop_read_cached_page(cp, false);
+ count += drop_arc_read_cached_page(cp, false);
});
arc_cache_map.erase(ab);
if (count) {
@@ -296,10 +297,14 @@ static bool operator==(const cached_page_arc::arc_map::value_type& l, const cach
}

std::unordered_multimap<arc_buf_t*, cached_page_arc*> cached_page_arc::arc_cache_map;
-static std::unordered_map<hashkey, cached_page_arc*> read_cache;
+//Map used to store read cache pages for ZFS filesystem interacting with ARC
+static std::unordered_map<hashkey, cached_page_arc*> arc_read_cache;
+//Map used to store read cache pages for non-ZFS filesystems
+static std::unordered_map<hashkey, cached_page*> read_cache;
static std::unordered_map<hashkey, cached_page_write*> write_cache;
static std::deque<cached_page_write*> write_lru;
-static mutex arc_lock; // protects against parallel access to the read cache
+static mutex arc_read_lock; // protects against parallel access to the ARC read cache
+static mutex read_lock; // protects against parallel access to the read cache
static mutex write_lock; // protect against parallel access to the write cache

template<typename T>
@@ -314,38 +319,76 @@ static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key)
+ // The method remove_read_mapping() is called by pagecache::get()
+ // to handle MAP_PRIVATE COW (Copy-On-Write) scenario triggered by an attempt to write
+ // to read-only page in read_cache (write protection page-fault). To handle it properly
+ // we need to remove the existing mapping for specified file hash (ino, offset) so that
+ // the physical address of newly allocated read-write page with the exact copy of
+ // the original data can be placed in pte instead.
+ // Normally this works fine and the application continues after and reads from/writes to
+ // new allocated page. But sometimes when applications "mmap" same portions of the same file
+ // with MAP_PRIVATE flag at the same time from multiple threads under load,
+ // those threads get migrated to different CPUs so it is important we flush tlb
+ // on all cpus. Otherwise given thread after migrated to different CPU may still see old
+ // read-only page with stale data and cause spectacular crash.
+ // The fact we have to flush tlb is very unfortunate as it is pretty expensive operation.
+ // On positive side most applications loaded from Read-Only FS will experience pretty limited
+ // number of COW faults mostly caused by ELF linker writing to GOT or PLT section.
+ // TODO 1: Investigate if there is an alternative "cheaper" way to solve this problem.
+ // TODO 2: Investigate why flushing TLB is necessary in single CPU scenario.
+ // TODO 3: Investigate why we do not have to flush tlb when removing read mapping with ZFS.
+ mmu::flush_tlb_all();
}
}

-TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
+void remove_arc_read_mapping(hashkey& key, mmu::hw_ptep<0> ptep)
+{
+ SCOPE_LOCK(arc_read_lock);
+ cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+ if (cp) {
+ remove_arc_read_mapping(cp, ptep);
+ }
+}
+
+template<typename T>
+static unsigned drop_read_cached_page(std::unordered_map<hashkey, T>& cache, cached_page* cp, bool flush)
{
- trace_drop_read_cached_page(cp->arcbuf(), cp->addr());
int flushed = cp->flush();
- read_cache.erase(cp->key());
+ cache.erase(cp->key());

if (flush && flushed > 1) { // if there was only one pte it is the one we are faulting on; no need to flush.
mmu::flush_tlb_all();
@@ -356,12 +399,28 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
@@ -369,7 +428,7 @@ TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*);
void unmap_arc_buf(arc_buf_t* ab)
{
trace_unmap_arc_buf(ab);
- SCOPE_LOCK(arc_lock);
+ SCOPE_LOCK(arc_read_lock);
cached_page_arc::unmap_arc_buf(ab);
}

@@ -377,15 +436,22 @@ TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", void*, void*);
void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page)
{
trace_map_arc_buf(ab, page);
- SCOPE_LOCK(arc_lock);
+ SCOPE_LOCK(arc_read_lock);
cached_page_arc* pc = new cached_page_arc(*key, page, ab);
- read_cache.emplace(*key, pc);
+ arc_read_cache.emplace(*key, pc);
arc_share_buf(ab);
}

+void map_read_cached_page(hashkey *key, void *page)
+{
+ SCOPE_LOCK(read_lock);
+ cached_page* pc = new cached_page(*key, page);
+ read_cache.emplace(*key, pc);
+}
+
static int create_read_cached_page(vfs_file* fp, hashkey& key)
{
- return fp->get_arcbuf(&key, key.offset);
+ return fp->read_page_from_cache(&key, key.offset);
}

static std::unique_ptr<cached_page_write> create_write_cached_page(vfs_file* fp, hashkey& key)
@@ -422,6 +488,8 @@ static void insert(cached_page_write* cp) {
}
}

+#define IS_ZFS(st_dev) ((st_dev & (0xffULL<<56)) == ZFS_ID)
+
bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared)
{
struct stat st;
@@ -437,17 +505,27 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
// write fault into shared mapping, there page is not in write cache yet, add it.
wcp = newcp.release();
insert(wcp);
- // page is moved from ARC to write cache
- // drop ARC page if exists, removing all mappings
- drop_read_cached_page(key);
+ // page is moved from read cache to write cache
+ // drop read page if exists, removing all mappings
+ if (IS_ZFS(st.st_dev)) {
+ drop_arc_read_cached_page(key);
+ } else {
+ // ROFS (at least for now)
+ drop_read_cached_page(key);
+ }
} else {
- // remove mapping to ARC page if exists
- remove_read_mapping(key, ptep);
- // cow of private page from ARC
+ // remove mapping to read cache page if exists
+ if (IS_ZFS(st.st_dev)) {
+ remove_arc_read_mapping(key, ptep);
+ } else {
+ // ROFS (at least for now)
+ remove_read_mapping(key, ptep);
+ }
+ // cow (copy-on-write) of private page from read cache
return mmu::write_pte(newcp->release(), ptep, pte);
}
} else if (!shared) {
- // cow of private page from write cache
+ // cow (copy-on-write) of private page from write cache
void* page = memory::alloc_page();
memcpy(page, wcp->addr(), mmu::page_size);
return mmu::write_pte(page, ptep, pte);
@@ -456,11 +534,23 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt
int ret;
// read fault and page is not in write cache yet, return one from ARC, mark it cow
do {
- WITH_LOCK(arc_lock) {
- cached_page_arc* cp = find_in_cache(read_cache, key);
- if (cp) {
- add_read_mapping(cp, ptep);
- return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ if (IS_ZFS(st.st_dev)) {
+ WITH_LOCK(arc_read_lock) {
+ cached_page_arc* cp = find_in_cache(arc_read_cache, key);
+ if (cp) {
+ add_arc_read_mapping(cp, ptep);
+ return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ }
+ }
+ }
+ else {
+ // ROFS (at least for now)
+ WITH_LOCK(read_lock) {
+ cached_page* cp = find_in_cache(read_cache, key);
+ if (cp) {
+ add_read_mapping(cp, ptep);
+ return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+ }
}
}

@@ -480,7 +570,7 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pt

} while (ret != -1);

- // try to access a hole in a file, map by zero_page }
+ // try to access a hole in a file, map by zero_page
return mmu::write_pte(zero_page, ptep, mmu::pte_mark_cow(pte, true));
}

@@ -513,12 +603,24 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep)
}
}

- WITH_LOCK(arc_lock) {
- cached_page_arc* rcp = find_in_cache(read_cache, key);
- if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
- // page is in ARC
- remove_read_mapping(rcp, ptep);
- return false;
+ if (IS_ZFS(st.st_dev)) {
+ WITH_LOCK(arc_read_lock) {
+ cached_page_arc* rcp = find_in_cache(arc_read_cache, key);
+ if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+ // page is in ARC read cache
+ remove_arc_read_mapping(rcp, ptep);
+ return false;
+ }
+ }
+ } else {
+ // ROFS (at least for now)
+ WITH_LOCK(read_lock) {
+ cached_page* rcp = find_in_cache(read_cache, key);
+ if (rcp && mmu::virt_to_phys(rcp->addr()) == old.addr()) {
+ // page is in regular read cache
+ remove_read_mapping(read_cache, rcp, ptep);
+ return false;
+ }
}
}

@@ -595,7 +697,7 @@ private:
auto start = sched::thread::current()->thread_clock();
auto deadline = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::nanoseconds(static_cast<unsigned long>(work/_freq))) + start;

- WITH_LOCK(arc_lock) {
+ WITH_LOCK(arc_read_lock) {
while (sched::thread::current()->thread_clock() < deadline && buckets_scanned < bucket_count) {
if (current_bucket >= cached_page_arc::arc_cache_map.bucket_count()) {
current_bucket = 0;
@@ -617,7 +719,7 @@ private:

Waldemar Kozaczuk

Apr 28, 2020, 10:24:06 PM
to osv...@googlegroups.com, Waldemar Kozaczuk
This patch optimizes memory utilization by integrating ROFS with the page cache.
In essence, it eliminates the second copy of file data in memory when mapping files using mmap().
For example, a simple Java example needs 9MB less memory to run.

The crux of the changes involves adding a new vnops function of type VOP_CACHE -
rofs_map_cached_page() - which ensures that the requested page of a file is loaded
from disk into the ROFS cache (by triggering a read from disk if missing) and
then registers the page with the pagecache by calling pagecache::map_read_cached_page().

This partially addresses #979

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>
---
fs/rofs/rofs.hh | 2 ++
fs/rofs/rofs_cache.cc | 73 +++++++++++++++++++++++++++++++++++++++----
fs/rofs/rofs_vnops.cc | 40 ++++++++++++++++++++++--
3 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh
index 16f811b0..eff816f7 100644
--- a/fs/rofs/rofs.hh
+++ b/fs/rofs/rofs.hh
@@ -128,6 +128,8 @@ struct rofs_info {
namespace rofs {
int
cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio);
+ int
+ cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio, void **addr);
}

int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t blocks_count, void* buf);
diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc
index cfb5287d..fe9b624f 100644
--- a/fs/rofs/rofs_cache.cc
+++ b/fs/rofs/rofs_cache.cc
@@ -10,8 +10,10 @@
#include <list>
#include <unordered_map>
#include <include/osv/uio.h>
+#include <include/osv/contiguous_alloc.hh>
#include <osv/debug.h>
#include <osv/sched.hh>
+#include <sys/mman.h>

/*
* From cache perspective let us divide each file into sequence of contiguous 32K segments.
@@ -56,20 +58,36 @@ public:
this->starting_block = _starting_block;
this->block_count = _block_count;
this->data_ready = false; // Data has to be loaded from disk
- this->data = malloc(_cache->sb->block_size * _block_count);
+ auto size = _cache->sb->block_size * _block_count;
+ // Only allocate contiguous page-aligned memory if size greater or equal a page
@@ -93,12 +111,16 @@ public:
blocks_remaining++;
}
auto block_count_to_read = std::min(block_count, blocks_remaining);
- print("[rofs] [%d] -> file_cache_segment::write() i-node: %d, starting block %d, reading [%d] blocks at disk offset [%d]\n",
+ print("[rofs] [%d] -> file_cache_segment::read_from_disk() i-node: %d, starting block %d, reading [%d] blocks at disk offset [%d]\n",
sched::thread::current()->id(), cache->inode->inode_no, starting_block, block_count_to_read, block);
auto error = rofs_read_blocks(device, block, block_count_to_read, data);
this->data_ready = (error == 0);
if (error) {
- print("!!!!! Error reading from disk\n");
+ printf("!!!!! Error reading from disk\n");
+ } else {
+ if (bytes_remaining < this->length()) {
+ memset(data + bytes_remaining, 0, this->length() - bytes_remaining);
+ }
}
return error;
}
@@ -190,8 +212,8 @@ plan_cache_transactions(struct file_cache *cache, struct uio *uio) {
bytes_to_read -= transaction.bytes_to_read;
transactions.push_back(transaction);
}
- //
- // Miss -> read from disk
+ //
+ // Miss -> read from disk
else {
print("[rofs] [%d] -> rofs_cache_get_segment_operations i-node: %d, cache segment %d MISS at file offset %d\n",
sched::thread::current()->id(), cache->inode->inode_no, cache_segment_index, file_offset);
@@ -271,4 +293,43 @@ cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_bl
return error;
}

+// Ensure a page (4096 bytes) of a file specified by offset is in memory in cache. Otherwise
+// load it from disk and eventually return address of the page in memory.
+int
+cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio, void **addr)
+{
+ // Find existing one or create new file cache
+ struct file_cache *cache = get_or_create_file_cache(inode, sb);
+
+ //
+ // Prepare a cache transaction (copy from memory
+ // or read from disk into cache memory and then copy into memory)
+ auto segment_transactions = plan_cache_transactions(cache, uio);
+ print("[rofs] [%d] rofs_get_page_address called for i-node [%d] at %d with %d ops\n",
+ sched::thread::current()->id(), inode->inode_no, offset, segment_transactions.size());
+
+ int error = 0;
+
+ assert(segment_transactions.size() == 1);
+ auto transaction = segment_transactions[0];
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+ rofs_cache_reads += 1;
+#endif
+ if (transaction.transaction_type == CacheTransactionType::READ_FROM_DISK) {
+ // Read from disk into segment missing in cache or empty segment that was in cache but had not data because
+ // of failure to read
+ error = transaction.segment->read_from_disk(device);
+#if defined(ROFS_DIAGNOSTICS_ENABLED)
+ rofs_cache_misses += 1;
+#endif
+ }
+
+ if( !error)
+ *addr = transaction.segment->memory_address(transaction.segment_offset);
+ else
+ *addr = nullptr;
+
+ return error;
+}
+
}
diff --git a/fs/rofs/rofs_vnops.cc b/fs/rofs/rofs_vnops.cc
index cd953b5f..f7b53c56 100644
--- a/fs/rofs/rofs_vnops.cc
+++ b/fs/rofs/rofs_vnops.cc
@@ -40,6 +40,7 @@
#include <sys/types.h>
#include <osv/device.h>
#include <osv/sched.hh>
+#include <osv/pagecache.hh>

#include "rofs.hh"

@@ -275,6 +276,41 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr)
return 0;
}

+int rofs_map_cached_page(struct vnode *vnode, struct file* fp, struct uio *uio) {
+ struct rofs_info *rofs = (struct rofs_info *) vnode->v_mount->m_data;
+ struct rofs_super_block *sb = rofs->sb;
+ struct rofs_inode *inode = (struct rofs_inode *) vnode->v_data;
+ struct device *device = vnode->v_mount->m_dev;
+
+ if (vnode->v_type == VDIR)
+ return EISDIR;
+ /* Cant read anything but reg */
+ if (vnode->v_type != VREG)
+ return EINVAL;
+ /* Cant start reading before the first byte */
+ if (uio->uio_offset < 0)
+ return EINVAL;
+ /* Cant read after the end of the file */
+ if (uio->uio_offset >= (off_t)vnode->v_size)
+ return 0;
+ if (uio->uio_resid != mmu::page_size)
+ return EINVAL;
+ if (uio->uio_offset % mmu::page_size)
+ return EINVAL;
+
+ void *page_address;
+ int ret = rofs::cache_get_page_address(inode, device, sb, uio, &page_address);
+
+ if (!ret) {
+ pagecache::map_read_cached_page((pagecache::hashkey*)uio->uio_iov->iov_base, page_address);
+ uio->uio_resid = 0;
+ } else {
+ abort("ROFS cache failed!");
+ }
+
+ return ret;
+}
+
#define rofs_write ((vnop_write_t)vop_erofs)
#define rofs_seek ((vnop_seek_t)vop_nullop)
#define rofs_ioctl ((vnop_ioctl_t)vop_nullop)
@@ -287,7 +323,6 @@ static int rofs_getattr(struct vnode *vnode, struct vattr *attr)
#define rofs_inactive ((vnop_inactive_t)vop_nullop)
#define rofs_truncate ((vnop_truncate_t)vop_erofs)
#define rofs_link ((vnop_link_t)vop_erofs)
-#define rofs_arc ((vnop_cache_t) nullptr)
#define rofs_fallocate ((vnop_fallocate_t)vop_erofs)
#define rofs_fsync ((vnop_fsync_t)vop_nullop)
#define rofs_symlink ((vnop_symlink_t)vop_erofs)
@@ -312,7 +347,7 @@ struct vnops rofs_vnops = {
rofs_inactive, /* inactive */
rofs_truncate, /* truncate - returns error when called*/
rofs_link, /* link - returns error when called*/
- rofs_arc, /* arc */ //TODO: Implement to allow memory re-use when mapping files
+ rofs_map_cached_page,
rofs_fallocate, /* fallocate - returns error when called*/
rofs_readlink, /* read link */
rofs_symlink /* symbolic link - returns error when called*/
@@ -320,4 +355,5 @@ struct vnops rofs_vnops = {

extern "C" void rofs_disable_cache() {
rofs_vnops.vop_read = rofs_read_without_cache;
+ rofs_vnops.vop_cache = (vnop_cache_t) nullptr;
}
--
2.20.1

Commit Bot

Apr 30, 2020, 12:53:18 PM
to osv...@googlegroups.com, Waldemar Kozaczuk
From: Waldemar Kozaczuk <jwkoz...@gmail.com>
Committer: Waldemar Kozaczuk <jwkoz...@gmail.com>
Branch: master

pagecache: refactor to allow integration with non-ZFS filesystems

So far the pagecache layer has been tightly integrated with ZFS,
and more specifically with its ARC cache layer. This patch refactors
the pagecache implementation to support filesystems other than ZFS.

In essence, we modify all the relevant places, such as retrieving and releasing
cached file pages, to behave slightly differently depending on the filesystem
(ZFS versus non-ZFS) the given vnode belongs to. The changes only apply to the
read cache, where ZFS page caching requires tighter integration with ARC.

This patch also adds a new integration function - map_read_cached_page() -
intended to be used by non-ZFS filesystem implementations to register
cached file pages with the page cache.

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>

---
diff --git a/core/pagecache.cc b/core/pagecache.cc
--- a/core/pagecache.cc
+++ b/core/pagecache.cc
@@ -14,6 +14,7 @@
#include <osv/pagecache.hh>
#include <osv/mempool.hh>
#include <fs/vfs/vfs.h>
+#include <fs/vfs/vfs_id.h>
#include <osv/trace.hh>
#include <osv/prio.hh>
#include <chrono>
@@ -161,7 +162,7 @@ class cached_page {
public:
cached_page(hashkey key, void* page) : _key(key), _page(page) {
}
- ~cached_page() {
+ virtual ~cached_page() {
}

void map(mmu::hw_ptep<0> ptep) {
@@ -198,7 +199,7 @@ class cached_page_write : public cached_page {
_vp = fp->f_dentry->d_vnode;
vref(_vp);
}
- ~cached_page_write() {
+ virtual ~cached_page_write() {
if (_page) {
if (_dirty) {
writeback();
@@ -238,7 +239,7 @@ class cached_page_write : public cached_page {

class cached_page_arc;

-unsigned drop_read_cached_page(cached_page_arc* cp, bool flush = true);
+static unsigned drop_arc_read_cached_page(cached_page_arc* cp, bool flush = true);

class cached_page_arc : public cached_page {
public:
@@ -267,7 +268,7 @@ class cached_page_arc : public cached_page {

public:
cached_page_arc(hashkey key, void* page, arc_buf_t* ab) : cached_page(key, page), _ab(ref(ab, this)) {}
- ~cached_page_arc() {
+ virtual ~cached_page_arc() {
if (!_removed && unref(_ab, this)) {
arc_unshare_buf(_ab);
}
@@ -282,7 +283,7 @@ class cached_page_arc : public cached_page {
@@ -356,36 +399,59 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*);
void unmap_arc_buf(arc_buf_t* ab)
{
trace_unmap_arc_buf(ab);
- SCOPE_LOCK(arc_lock);
+ SCOPE_LOCK(arc_read_lock);
cached_page_arc::unmap_arc_buf(ab);
}

@@ -595,7 +697,7 @@ static class access_scanner {
auto start = sched::thread::current()->thread_clock();
auto deadline = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::nanoseconds(static_cast<unsigned long>(work/_freq))) + start;

- WITH_LOCK(arc_lock) {
+ WITH_LOCK(arc_read_lock) {
while (sched::thread::current()->thread_clock() < deadline && buckets_scanned < bucket_count) {
if (current_bucket >= cached_page_arc::arc_cache_map.bucket_count()) {
current_bucket = 0;
@@ -617,7 +719,7 @@ static class access_scanner {

// mark ARC buffers as accessed when we have 1024 of them
if (!(cleared % 1024)) {
- DROP_LOCK(arc_lock) {
+ DROP_LOCK(arc_read_lock) {
flush = mark_accessed(accessed);
}
}
diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc
--- a/fs/vfs/vfs_fops.cc
+++ b/fs/vfs/vfs_fops.cc
@@ -157,7 +157,7 @@ void vfs_file::sync(off_t start, off_t end)
// eviction that will hold the mmu-side lock that protects the mappings
// Always follow that order. We however can't just get rid of the mmu-side lock,
// because not all invalidations will be synchronous.
-int vfs_file::get_arcbuf(void* key, off_t offset)
+int vfs_file::read_page_from_cache(void* key, off_t offset)
{
struct vnode *vp = f_dentry->d_vnode;

diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh
--- a/include/osv/pagecache.hh
+++ b/include/osv/pagecache.hh
@@ -35,4 +35,5 @@ bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep<0> ptep);
void sync(vfs_file* fp, off_t start, off_t end);
void unmap_arc_buf(arc_buf_t* ab);
void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page);
+void map_read_cached_page(hashkey *key, void *page);
}
diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh

Commit Bot

Apr 30, 2020, 12:53:20 PM
to osv...@googlegroups.com, Waldemar Kozaczuk
From: Waldemar Kozaczuk <jwkoz...@gmail.com>
Committer: Waldemar Kozaczuk <jwkoz...@gmail.com>
Branch: master

rofs: optimize memory utilization by integrating with page cache

This patch optimizes memory utilization by integrating ROFS with the page cache.
In essence, it eliminates the second copy of file data in memory when mapping files using mmap().
For example, a simple Java example needs 9MB less memory to run.

The crux of the changes involves adding a new vnops function of type VOP_CACHE -
rofs_map_cached_page() - which ensures that the requested page of a file is loaded
from disk into the ROFS cache (by triggering a read from disk if missing) and
then registers the page with the pagecache by calling pagecache::map_read_cached_page().

This partially addresses #979

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>

---
diff --git a/fs/rofs/rofs.hh b/fs/rofs/rofs.hh
--- a/fs/rofs/rofs.hh
+++ b/fs/rofs/rofs.hh
@@ -128,6 +128,8 @@ struct rofs_info {
namespace rofs {
int
cache_read(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio);
+ int
+ cache_get_page_address(struct rofs_inode *inode, struct device *device, struct rofs_super_block *sb, struct uio *uio, void **addr);
}

int rofs_read_blocks(struct device *device, uint64_t starting_block, uint64_t blocks_count, void* buf);
diff --git a/fs/rofs/rofs_cache.cc b/fs/rofs/rofs_cache.cc
--- a/fs/rofs/rofs_cache.cc
+++ b/fs/rofs/rofs_cache.cc
@@ -10,8 +10,10 @@
#include <list>
#include <unordered_map>
#include <include/osv/uio.h>
+#include <include/osv/contiguous_alloc.hh>
#include <osv/debug.h>
#include <osv/sched.hh>
+#include <sys/mman.h>

/*
* From cache perspective let us divide each file into sequence of contiguous 32K segments.
@@ -56,20 +58,36 @@ class file_cache_segment {
@@ -93,12 +111,16 @@ class file_cache_segment {
@@ -312,12 +347,13 @@ struct vnops rofs_vnops = {
rofs_inactive, /* inactive */
rofs_truncate, /* truncate - returns error when called*/
rofs_link, /* link - returns error when called*/
- rofs_arc, /* arc */ //TODO: Implement to allow memory re-use when mapping files
+ rofs_map_cached_page,
rofs_fallocate, /* fallocate - returns error when called*/
rofs_readlink, /* read link */
rofs_symlink /* symbolic link - returns error when called*/
};