RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.
The kernel problems have now been indentified[1] and so we can
can re-enable it. Since each filesystem was fixed in a different
kernel version, we need to plumb down this information from
posix_file_impl and friends all the way down to the reactor backend,
via io_request. This is annoying but cannot be helped.
Tested via the unit tests and also verified that the nowait paths
are exercised with strace.
[1]
https://lore.kernel.org/linux-xfs/20210117213...@dread.disaster.area/
---
include/seastar/core/internal/io_request.hh | 35 +++++++++----
src/core/file-impl.hh | 23 +++++----
src/core/file.cc | 54 +++++++++++++++------
src/core/reactor.cc | 5 +-
src/core/reactor_backend.cc | 8 +--
tests/unit/io_queue_test.cc | 2 +-
tests/unit/timer_test.cc | 1 +
7 files changed, 88 insertions(+), 40 deletions(-)
diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
index caa24b6f6..bf2e92060 100644
--- a/include/seastar/core/internal/io_request.hh
+++ b/include/seastar/core/internal/io_request.hh
@@ -60,6 +60,8 @@ class io_request {
socklen_t socklen;
} _size;
+ bool _nowait_works;
+
explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
: _op(op)
, _fd(fd)
@@ -93,9 +95,20 @@ class io_request {
_size.len = size;
}
- explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+ explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
+ : _op(op)
+ , _fd(fd)
+ , _nowait_works(nowait_works)
+ {
+ _attr.pos = pos;
+ _ptr.addr = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size, bool nowait_works)
: _op(op)
, _fd(fd)
+ , _nowait_works(nowait_works)
{
_attr.pos = pos;
_ptr.iovec = ptr;
@@ -198,12 +211,16 @@ class io_request {
return _size.socklen_ptr;
}
- static io_request make_read(int fd, uint64_t pos, void* address, size_t size) {
- return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size);
+ bool nowait_works() const {
+ return _nowait_works;
+ }
+
+ static io_request make_read(int fd, uint64_t pos, void* address, size_t size, bool nowait_works) {
+ return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size, nowait_works);
}
- static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::readv, fd, pos, iov.data(), iov.size());
+ static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::readv, fd, pos, iov.data(), iov.size(), nowait_works);
}
static io_request make_recv(int fd, void* address, size_t size, int flags) {
@@ -222,12 +239,12 @@ class io_request {
return io_request(operation::sendmsg, fd, flags, msg);
}
- static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
- return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ static io_request make_write(int fd, uint64_t pos, const void* address, size_t size, bool nowait_works) {
+ return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size, nowait_works);
}
- static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::writev, fd, pos, iov.data(), iov.size());
+ static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::writev, fd, pos, iov.data(), iov.size(), nowait_works);
}
static io_request make_fdatasync(int fd) {
diff --git a/src/core/file-impl.hh b/src/core/file-impl.hh
index 601691733..c3c541ab0 100644
--- a/src/core/file-impl.hh
+++ b/src/core/file-impl.hh
@@ -49,17 +49,20 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
uint32_t _disk_read_dma_alignment;
uint32_t _disk_write_dma_alignment;
uint32_t _disk_overwrite_dma_alignment;
+ bool _nowait_works;
public:
posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _fd(fd), _refcount(refcount), _device_id(device_id), _open_flags(f)
, _memory_dma_alignment(memory_dma_alignment)
, _disk_read_dma_alignment(disk_read_dma_alignment)
, _disk_write_dma_alignment(disk_write_dma_alignment)
- , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment) {
+ , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment)
+ , _nowait_works(nowait_works) {
}
virtual ~posix_file_handle_impl();
posix_file_handle_impl(const posix_file_handle_impl&) = delete;
@@ -71,17 +74,19 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
class posix_file_impl : public file_impl {
std::atomic<unsigned>* _refcount = nullptr;
dev_t _device_id;
+ bool _nowait_works;
io_queue* _io_queue;
open_flags _open_flags;
public:
int _fd;
posix_file_impl(int fd, open_flags, file_open_options options, dev_t device_id,
- uint32_t block_size);
+ uint32_t block_size, bool nowait_works);
posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment);
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works);
virtual ~posix_file_impl() override;
future<> flush(void) noexcept override;
future<struct stat> stat(void) noexcept override;
@@ -154,11 +159,11 @@ class posix_file_impl : public file_impl {
class posix_file_real_impl final : public posix_file_impl {
public:
- posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size)
- : posix_file_impl(fd, of, std::move(options), device_id, block_size) {}
+ posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
+ : posix_file_impl(fd, of, std::move(options), device_id, block_size, nowait_works) {}
posix_file_real_impl(int fd, open_flags of, std::atomic<unsigned>* refcount, dev_t device_id,
- uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment)
- : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment) {}
+ uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment, bool nowait_works)
+ : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment, nowait_works) {}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
@@ -235,7 +240,7 @@ class append_challenged_posix_file_impl final : public posix_file_impl, public e
}
}
public:
- append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size);
+ append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works);
~append_challenged_posix_file_impl() override;
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
diff --git a/src/core/file.cc b/src/core/file.cc
index ebf3ff544..29e8e3040 100644
--- a/src/core/file.cc
+++ b/src/core/file.cc
@@ -75,8 +75,9 @@ file_handle::to_file() && {
return file(std::move(*_impl).to_file());
}
-posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size)
+posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
: _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd)
@@ -116,7 +117,8 @@ posix_file_impl::dup() {
_refcount = new std::atomic<unsigned>(1u);
}
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment,
+ _nowait_works);
_refcount->fetch_add(1, std::memory_order_relaxed);
return ret;
}
@@ -125,9 +127,11 @@ posix_file_impl::posix_file_impl(int fd, open_flags f, std::atomic<unsigned>* re
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _refcount(refcount)
, _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd) {
@@ -336,27 +340,27 @@ posix_file_impl::list_directory(std::function<future<> (directory_entry de)> nex
future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_write(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_write(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent);
}
future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment);
- auto req = internal::io_request::make_writev(_fd, pos, iov);
+ auto req = internal::io_request::make_writev(_fd, pos, iov, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}
future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_read(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_read(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent);
}
future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_read_dma_alignment);
- auto req = internal::io_request::make_readv(_fd, pos, iov);
+ auto req = internal::io_request::make_readv(_fd, pos, iov, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}
@@ -489,8 +493,10 @@ posix_file_impl::read_maybe_eof(uint64_t pos, size_t len, const io_priority_clas
});
}
+static bool blockdev_nowait_works = kernel_uname().whitelisted({"4.13"});
+
blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size) {
+ : posix_file_impl(fd, f, options, device_id, block_size, blockdev_nowait_works) {
}
future<>
@@ -541,8 +547,8 @@ blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_p
}
append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options,
- unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size)
+ unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works)
+ : posix_file_impl(fd, f, options, device_id, block_size, nowait_works)
, _max_size_changing_ops(max_size_changing_ops)
, _fsync_is_exclusive(fsync_is_exclusive) {
auto r = ::lseek(fd, 0, SEEK_END);
@@ -841,7 +847,7 @@ posix_file_handle_impl::~posix_file_handle_impl() {
std::unique_ptr<seastar::file_handle_impl>
posix_file_handle_impl::clone() const {
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
if (_refcount) {
_refcount->fetch_add(1, std::memory_order_relaxed);
}
@@ -851,7 +857,7 @@ posix_file_handle_impl::clone() const {
shared_ptr<file_impl>
posix_file_handle_impl::to_file() && {
auto ret = ::seastar::make_shared<posix_file_real_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
_fd = -1;
_refcount = nullptr;
return ret;
@@ -889,12 +895,13 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
if ((flags & O_ACCMODE) == O_RDONLY || S_ISDIR(st.st_mode)) {
// Directories don't care about block size, so we need not
// query it here. Just provide something reasonable.
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096, false));
}
struct append_support {
bool append_challenged;
unsigned append_concurrency;
bool fsync_is_exclusive;
+ bool nowait_works;
};
struct fs_info {
append_support append_support_;
@@ -911,30 +918,47 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
static auto xc = xfs_concurrency_from_kernel_version();
as.append_concurrency = xc;
as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
break;
case 0x6969: /* NFS */
as.append_challenged = false;
as.append_concurrency = 0;
as.fsync_is_exclusive = false;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
break;
case 0xEF53: /* EXT4 */
as.append_challenged = true;
as.append_concurrency = 0;
as.fsync_is_exclusive = false;
+ as.nowait_works = kernel_uname().whitelisted({"5.5"});
+ break;
+ case 0x9123683E: /* BTRFS */
+ as.append_challenged = true;
+ as.append_concurrency = 0;
+ as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"5.9"});
+ break;
+ case 0x01021994: /* TMPFS */
+ case 0x65735546: /* FUSE */
+ as.append_challenged = false;
+ as.append_concurrency = 999;
+ as.fsync_is_exclusive = false;
+ as.nowait_works = false;
break;
default:
as.append_challenged = true;
as.append_concurrency = 0;
as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
}
s_fstype[st_dev] = fs_info{as, block_size};
});
return get_fs_info.then([st_dev, fd, flags, options = std::move(options)] () mutable {
auto [as, block_size] = s_fstype[st_dev];
if (!as.append_challenged) {
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), std::move(options), st_dev, block_size));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), std::move(options), st_dev, block_size, as.nowait_works));
}
- return make_ready_future<shared_ptr<file_impl>>(make_shared<append_challenged_posix_file_impl>(fd, open_flags(flags), std::move(options), as.append_concurrency, as.fsync_is_exclusive, st_dev, block_size));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<append_challenged_posix_file_impl>(fd, open_flags(flags), std::move(options), as.append_concurrency, as.fsync_is_exclusive, st_dev, block_size, as.nowait_works));
});
}
});
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 6b5316857..b6871e9f8 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -533,8 +533,9 @@ constexpr std::chrono::milliseconds lowres_clock_impl::_granularity;
constexpr unsigned reactor::max_queues;
constexpr unsigned reactor::max_aio_per_queue;
-// Broken (returns spurious EIO). Cause/fix unknown.
-bool aio_nowait_supported = false;
+// Base version where this works; some filesystems were only fixed later, so
+// this value is mixed in with filesystem-provided values later.
+bool aio_nowait_supported = kernel_uname().whitelisted({"4.13"});
static bool sched_debug() {
return false;
diff --git a/src/core/reactor_backend.cc b/src/core/reactor_backend.cc
index e0c1eea0f..04dc1f40b 100644
--- a/src/core/reactor_backend.cc
+++ b/src/core/reactor_backend.cc
@@ -59,19 +59,19 @@ void prepare_iocb(io_request& req, io_completion* desc, iocb& iocb) {
break;
case io_request::operation::write:
iocb = make_write_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::writev:
iocb = make_writev_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::read:
iocb = make_read_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::readv:
iocb = make_readv_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
default:
seastar_logger.error("Invalid operation for iocb: {}", req.opname());
diff --git a/tests/unit/io_queue_test.cc b/tests/unit/io_queue_test.cc
index 7753a8193..a5e669883 100644
--- a/tests/unit/io_queue_test.cc
+++ b/tests/unit/io_queue_test.cc
@@ -41,7 +41,7 @@ struct fake_file {
static internal::io_request make_write_req(size_t idx, int val) {
int* buf = new int(val);
- return internal::io_request::make_write(0, idx, buf, 1);
+ return internal::io_request::make_write(0, idx, buf, 1, false);
}
void execute_write_req(internal::io_request& rq, io_completion* desc) {
diff --git a/tests/unit/timer_test.cc b/tests/unit/timer_test.cc
index ca698b7eb..7713e60cf 100644
--- a/tests/unit/timer_test.cc
+++ b/tests/unit/timer_test.cc
@@ -79,6 +79,7 @@ struct timer_test {
}
future<> test_timer_cancelling() {
+ print("test timer cancelling\n");
timer<Clock>& t1 = *new timer<Clock>();
t1.set_callback([] { BUG(); });
t1.arm(100ms);
--
2.29.2