[COMMIT osv master] mmap: avoid collisions with linear map

16 views
Skip to first unread message

Commit Bot

unread,
Mar 30, 2022, 11:36:44 PM3/30/22
to osv...@googlegroups.com, Waldemar Kozaczuk
From: Waldemar Kozaczuk <jwkoz...@gmail.com>
Committer: Waldemar Kozaczuk <jwkoz...@gmail.com>
Branch: master

mmap: avoid collisions with linear map

This patch enhances virtual memory mapping code to track both
linear (linear_map) and non-linear (mmap()) virtual memory mappings.

It does so by adding a simple struct - vma_range - and a vma_range_set
collection to store all mappings. It then modifies mmu::find_hole()
implementation to use vma_range_set instead of vma_list to find next hole.
That way it can avoid collisions described in the issue #1135.

Fixes #1135

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>

---
diff --git a/core/mmu.cc b/core/mmu.cc
--- a/core/mmu.cc
+++ b/core/mmu.cc
@@ -47,6 +47,17 @@ extern const char text_start[], text_end[];

namespace mmu {

+struct vma_range_compare {
+ bool operator()(const vma_range& a, const vma_range& b) {
+ return a.start() < b.start();
+ }
+};
+
+//Set of all vma ranges - both linear and non-linear ones
+__attribute__((init_priority((int)init_prio::vma_range_set)))
+std::set<vma_range, vma_range_compare> vma_range_set;
+rwlock_t vma_range_set_mutex;
+
struct linear_vma_compare {
bool operator()(const linear_vma* a, const linear_vma* b) {
return a->_virt_addr < b->_virt_addr;
@@ -66,6 +77,9 @@ class vma_compare {
}
};

+constexpr uintptr_t lower_vma_limit = 0x0;
+constexpr uintptr_t upper_vma_limit = 0x800000000000;
+
typedef boost::intrusive::set<vma,
bi::compare<vma_compare>,
bi::member_hook<vma,
@@ -78,9 +92,15 @@ struct vma_list_type : vma_list_base {
vma_list_type() {
// insert markers for the edges of allocatable area
// simplifies searches
- insert(*new anon_vma(addr_range(0, 0), 0, 0));
- uintptr_t e = 0x800000000000;
- insert(*new anon_vma(addr_range(e, e), 0, 0));
+ auto lower_edge = new anon_vma(addr_range(lower_vma_limit, lower_vma_limit), 0, 0);
+ insert(*lower_edge);
+ auto upper_edge = new anon_vma(addr_range(upper_vma_limit, upper_vma_limit), 0, 0);
+ insert(*upper_edge);
+
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(lower_edge));
+ vma_range_set.insert(vma_range(upper_edge));
+ }
}
};

@@ -954,27 +974,41 @@ static error protect(const void *addr, size_t size, unsigned int perm)
return no_error();
}

+class vma_range_addr_compare {
+public:
+ bool operator()(const vma_range& x, uintptr_t y) const { return x.start() < y; }
+ bool operator()(uintptr_t x, const vma_range& y) const { return x < y.start(); }
+};
+
uintptr_t find_hole(uintptr_t start, uintptr_t size)
{
bool small = size < huge_page_size;
uintptr_t good_enough = 0;

- // FIXME: use lower_bound or something
- auto p = vma_list.begin();
+ SCOPE_LOCK(vma_range_set_mutex.for_read());
+ //Find first vma range which starts before the start parameter or is the 1st one
+ auto p = std::lower_bound(vma_range_set.begin(), vma_range_set.end(), start, vma_range_addr_compare());
+ if (p != vma_range_set.begin()) {
+ --p;
+ }
auto n = std::next(p);
- while (n != vma_list.end()) {
+ while (n->start() <= upper_vma_limit) { //we only go up to the upper mmap vma limit
+ //See if desired hole fits between p and n vmas
if (start >= p->end() && start + size <= n->start()) {
return start;
}
+ //See if shifting start to the end of p makes desired hole fit between p and n
if (p->end() >= start && n->start() - p->end() >= size) {
good_enough = p->end();
if (small) {
return good_enough;
}
+ //See if huge hole fits between p and n
if (n->start() - align_up(good_enough, huge_page_size) >= size) {
return align_up(good_enough, huge_page_size);
}
}
+ //If nothing worked move next in the list
p = n;
++n;
}
@@ -999,6 +1033,9 @@ ulong evacuate(uintptr_t start, uintptr_t end)
memory::stats::on_jvm_heap_free(size);
}
vma_list.erase(dead);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.erase(vma_range(&dead));
+ }
delete &dead;
}
}
@@ -1140,6 +1177,9 @@ uintptr_t allocate(vma *v, uintptr_t start, size_t size, bool search)
v->set(start, start+size);

vma_list.insert(*v);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(v));
+ }

return start;
}
@@ -1493,6 +1533,9 @@ void anon_vma::split(uintptr_t edge)
vma* n = new anon_vma(addr_range(edge, _range.end()), _perm, _flags);
set(_range.start(), edge);
vma_list.insert(*n);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(n));
+ }
}

error anon_vma::sync(uintptr_t start, uintptr_t end)
@@ -1600,6 +1643,9 @@ jvm_balloon_vma::~jvm_balloon_vma()
// for a dangling mapping representing a balloon that was already moved
// out.
vma_list.erase(*this);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.erase(vma_range(this));
+ }
assert(!(_real_flags & mmap_jvm_balloon));
mmu::map_anon(addr(), size(), _real_flags, _real_perm);

@@ -1667,6 +1713,9 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b)
// Since we will change its position in the tree, for the sake of future
// lookups we need to reinsert it.
vma_list.erase(*jvma);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.erase(vma_range(jvma));
+ }
if (jvma->start() < start) {
assert(jvma->partial() >= (jvma->end() - start));
jvma->set(jvma->start(), start);
@@ -1675,11 +1724,17 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b)
jvma->set(end, jvma->end());
}
vma_list.insert(*jvma);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(jvma));
+ }
} else {
// Note how v and jvma are different. This is because this one,
// we will delete.
auto& v = *i--;
vma_list.erase(v);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.erase(vma_range(&v));
+ }
// Finish the move. In practice, it will temporarily remap an
// anon mapping here, but this should be rare. Let's not
// complicate the code to optimize it. There are no
@@ -1692,6 +1747,9 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b)

evacuate(start, start + size);
vma_list.insert(*vma);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(vma));
+ }
return vma->size();
}
return 0;
@@ -1753,6 +1811,9 @@ void file_vma::split(uintptr_t edge)
vma *n = _file->mmap(addr_range(edge, _range.end()), _flags, _perm, off).release();
set(_range.start(), edge);
vma_list.insert(*n);
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(n));
+ }
}

error file_vma::sync(uintptr_t start, uintptr_t end)
@@ -1903,6 +1964,9 @@ void linear_map(void* _virt, phys addr, size_t size, const char* name,
WITH_LOCK(linear_vma_set_mutex.for_write()) {
linear_vma_set.insert(_vma);
}
+ WITH_LOCK(vma_range_set_mutex.for_write()) {
+ vma_range_set.insert(vma_range(_vma));
+ }
}

void free_initial_memory_range(uintptr_t addr, size_t size)
diff --git a/include/osv/mmu.hh b/include/osv/mmu.hh
--- a/include/osv/mmu.hh
+++ b/include/osv/mmu.hh
@@ -90,6 +90,37 @@ public:
boost::intrusive::set_member_hook<> _vma_list_hook;
};

+struct vma_range {
+ const void* _vma;
+ bool _is_linear;
+
+ vma_range(const linear_vma* v) {
+ _vma = v;
+ _is_linear = true;
+ }
+
+ vma_range(const vma* v) {
+ _vma = v;
+ _is_linear = false;
+ }
+
+ uintptr_t start() const {
+ if (_is_linear) {
+ return static_cast<const linear_vma*>(_vma)->v_start();
+ } else {
+ return static_cast<const vma*>(_vma)->start();
+ }
+ }
+
+ uintptr_t end() const {
+ if (_is_linear) {
+ return static_cast<const linear_vma*>(_vma)->v_end();
+ } else {
+ return static_cast<const vma*>(_vma)->end();
+ }
+ }
+};
+
class anon_vma : public vma {
public:
anon_vma(addr_range range, unsigned perm, unsigned flags);
diff --git a/include/osv/prio.hh b/include/osv/prio.hh
--- a/include/osv/prio.hh
+++ b/include/osv/prio.hh
@@ -16,6 +16,7 @@ enum {
cpus,
fpranges,
pt_root,
+ vma_range_set,
linear_vma_set,
mempool,
routecache,

Nadav Har'El

unread,
Mar 31, 2022, 8:54:24 AM3/31/22
to Waldemar Kozaczuk, Osv Dev
Do we still need both vma_range_set and vma_list? Why?
Unfortunately I'm very rusty with this area of the code, so I have a question that might be stupid:

How come we didn't need a mutex here before, but need one now?
Could adding a mutex here be a problem - namely that maybe we need to call some of this code from non-preemptable code and this will now break?
 
--
You received this message because you are subscribed to the Google Groups "OSv Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+u...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/000000000000ea29e705db7b5e8f%40google.com.

Waldek Kozaczuk

unread,
Mar 31, 2022, 1:45:43 PM3/31/22
to OSv Development
I have actually thought hard about using only one set and one data structure to represent both non-linear and linear VMAs, but this seemed a harder road. The information is quite different, and in most places (except for find_hole()) we either need to access the regular (non-linear) vma_list or the new linear_vma_set (which I added in one of the previous patches). The find_hole() is the ONLY place where we need information about all ranges to avoid a collision. If we put everything in one set, then in most places when we iterate over/access it we would need to differentiate between the two types, and I reasoned it would only complicate things and possibly break existing code. So the solution was to add another set - vma_range_set (a superset) - where we register ALL ranges (only start address and size), which we then use in find_hole().
Yeah, it is a good point. The intention was to guard against concurrent calls from linear_map() (which inserts into both linear_vma_set and vma_range_set) and find_hole(). Maybe it is not necessary, as this code is called from allocate(), which is called with the SCOPE_LOCK(vma_list_mutex.for_write()) from both map_anon() and map_file(). Could these be called from non-preemptable code?

Also I have added this code to many places in core/mmu.cc (other patch):

WITH_LOCK(vma_range_set_mutex.for_write()) {
       vma_range_set.erase(vma_range(&dead));
}
 
but I think they all would never be called from non-preemptable code, would they?
Reply all
Reply to author
Forward
0 new messages