[PATCH v1] file, reactor: reinstate RWF_NOWAIT support

14 views
Skip to first unread message

Avi Kivity

<avi@scylladb.com>
unread,
Mar 6, 2021, 3:06:36 PM3/6/21
to seastar-dev@googlegroups.com
RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.

The kernel problems have now been indentified[1] and so we can
can re-enable it. Since each filesystem was fixed in a different
kernel version, we need to plumb down this information from
posix_file_impl and friends all the way down to the reactor backend,
via io_request. This is annoying but cannot be helped.

Tested via the unit tests and also verified that the nowait paths
are exercised with strace.

[1] https://lore.kernel.org/linux-xfs/20210117213...@dread.disaster.area/
---
include/seastar/core/internal/io_request.hh | 35 +++++++++----
src/core/file-impl.hh | 23 +++++----
src/core/file.cc | 54 +++++++++++++++------
src/core/reactor.cc | 5 +-
src/core/reactor_backend.cc | 8 +--
tests/unit/io_queue_test.cc | 2 +-
tests/unit/timer_test.cc | 1 +
7 files changed, 88 insertions(+), 40 deletions(-)

diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
index caa24b6f6..bf2e92060 100644
--- a/include/seastar/core/internal/io_request.hh
+++ b/include/seastar/core/internal/io_request.hh
@@ -60,6 +60,8 @@ class io_request {
socklen_t socklen;
} _size;

+ bool _nowait_works;
+
explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
: _op(op)
, _fd(fd)
@@ -93,9 +95,20 @@ class io_request {
_size.len = size;
}

- explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+ explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
+ : _op(op)
+ , _fd(fd)
+ , _nowait_works(nowait_works)
+ {
+ _attr.pos = pos;
+ _ptr.addr = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size, bool nowait_works)
: _op(op)
, _fd(fd)
+ , _nowait_works(nowait_works)
{
_attr.pos = pos;
_ptr.iovec = ptr;
@@ -198,12 +211,16 @@ class io_request {
return _size.socklen_ptr;
}

- static io_request make_read(int fd, uint64_t pos, void* address, size_t size) {
- return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size);
+ bool nowait_works() const {
+ return _nowait_works;
+ }
+
+ static io_request make_read(int fd, uint64_t pos, void* address, size_t size, bool nowait_works) {
+ return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size, nowait_works);
}

- static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::readv, fd, pos, iov.data(), iov.size());
+ static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::readv, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_recv(int fd, void* address, size_t size, int flags) {
@@ -222,12 +239,12 @@ class io_request {
return io_request(operation::sendmsg, fd, flags, msg);
}

- static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
- return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ static io_request make_write(int fd, uint64_t pos, const void* address, size_t size, bool nowait_works) {
+ return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size, nowait_works);
}

- static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::writev, fd, pos, iov.data(), iov.size());
+ static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::writev, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_fdatasync(int fd) {
diff --git a/src/core/file-impl.hh b/src/core/file-impl.hh
index 601691733..c3c541ab0 100644
--- a/src/core/file-impl.hh
+++ b/src/core/file-impl.hh
@@ -49,17 +49,20 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
uint32_t _disk_read_dma_alignment;
uint32_t _disk_write_dma_alignment;
uint32_t _disk_overwrite_dma_alignment;
+ bool _nowait_works;
public:
posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _fd(fd), _refcount(refcount), _device_id(device_id), _open_flags(f)
, _memory_dma_alignment(memory_dma_alignment)
, _disk_read_dma_alignment(disk_read_dma_alignment)
, _disk_write_dma_alignment(disk_write_dma_alignment)
- , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment) {
+ , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment)
+ , _nowait_works(nowait_works) {
}
virtual ~posix_file_handle_impl();
posix_file_handle_impl(const posix_file_handle_impl&) = delete;
@@ -71,17 +74,19 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
class posix_file_impl : public file_impl {
std::atomic<unsigned>* _refcount = nullptr;
dev_t _device_id;
+ bool _nowait_works;
io_queue* _io_queue;
open_flags _open_flags;
public:
int _fd;
posix_file_impl(int fd, open_flags, file_open_options options, dev_t device_id,
- uint32_t block_size);
+ uint32_t block_size, bool nowait_works);
posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment);
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works);
virtual ~posix_file_impl() override;
future<> flush(void) noexcept override;
future<struct stat> stat(void) noexcept override;
@@ -154,11 +159,11 @@ class posix_file_impl : public file_impl {

class posix_file_real_impl final : public posix_file_impl {
public:
- posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size)
- : posix_file_impl(fd, of, std::move(options), device_id, block_size) {}
+ posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
+ : posix_file_impl(fd, of, std::move(options), device_id, block_size, nowait_works) {}
posix_file_real_impl(int fd, open_flags of, std::atomic<unsigned>* refcount, dev_t device_id,
- uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment)
- : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment) {}
+ uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment, bool nowait_works)
+ : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment, nowait_works) {}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
@@ -235,7 +240,7 @@ class append_challenged_posix_file_impl final : public posix_file_impl, public e
}
}
public:
- append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size);
+ append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works);
~append_challenged_posix_file_impl() override;
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
diff --git a/src/core/file.cc b/src/core/file.cc
index ebf3ff544..29e8e3040 100644
--- a/src/core/file.cc
+++ b/src/core/file.cc
@@ -75,8 +75,9 @@ file_handle::to_file() && {
return file(std::move(*_impl).to_file());
}

-posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size)
+posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
: _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd)
@@ -116,7 +117,8 @@ posix_file_impl::dup() {
_refcount = new std::atomic<unsigned>(1u);
}
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment,
+ _nowait_works);
_refcount->fetch_add(1, std::memory_order_relaxed);
return ret;
}
@@ -125,9 +127,11 @@ posix_file_impl::posix_file_impl(int fd, open_flags f, std::atomic<unsigned>* re
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _refcount(refcount)
, _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd) {
@@ -336,27 +340,27 @@ posix_file_impl::list_directory(std::function<future<> (directory_entry de)> nex

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_write(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_write(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment);
- auto req = internal::io_request::make_writev(_fd, pos, iov);
+ auto req = internal::io_request::make_writev(_fd, pos, iov, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_read(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_read(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_read_dma_alignment);
- auto req = internal::io_request::make_readv(_fd, pos, iov);
+ auto req = internal::io_request::make_readv(_fd, pos, iov, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

@@ -489,8 +493,10 @@ posix_file_impl::read_maybe_eof(uint64_t pos, size_t len, const io_priority_clas
});
}

+static bool blockdev_nowait_works = kernel_uname().whitelisted({"4.13"});
+
blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size) {
+ : posix_file_impl(fd, f, options, device_id, block_size, blockdev_nowait_works) {
}

future<>
@@ -541,8 +547,8 @@ blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_p
}

append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options,
- unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size)
+ unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works)
+ : posix_file_impl(fd, f, options, device_id, block_size, nowait_works)
, _max_size_changing_ops(max_size_changing_ops)
, _fsync_is_exclusive(fsync_is_exclusive) {
auto r = ::lseek(fd, 0, SEEK_END);
@@ -841,7 +847,7 @@ posix_file_handle_impl::~posix_file_handle_impl() {
std::unique_ptr<seastar::file_handle_impl>
posix_file_handle_impl::clone() const {
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
if (_refcount) {
_refcount->fetch_add(1, std::memory_order_relaxed);
}
@@ -851,7 +857,7 @@ posix_file_handle_impl::clone() const {
shared_ptr<file_impl>
posix_file_handle_impl::to_file() && {
auto ret = ::seastar::make_shared<posix_file_real_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
_fd = -1;
_refcount = nullptr;
return ret;
@@ -889,12 +895,13 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
if ((flags & O_ACCMODE) == O_RDONLY || S_ISDIR(st.st_mode)) {
// Directories don't care about block size, so we need not
// query it here. Just provide something reasonable.
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096, false));
}
struct append_support {
bool append_challenged;
unsigned append_concurrency;
bool fsync_is_exclusive;
+ bool nowait_works;
};
struct fs_info {
append_support append_support_;
@@ -911,30 +918,47 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
static auto xc = xfs_concurrency_from_kernel_version();
as.append_concurrency = xc;
as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
break;
case 0x6969: /* NFS */
as.append_challenged = false;
as.append_concurrency = 0;
as.fsync_is_exclusive = false;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
break;
case 0xEF53: /* EXT4 */
as.append_challenged = true;
as.append_concurrency = 0;
as.fsync_is_exclusive = false;
+ as.nowait_works = kernel_uname().whitelisted({"5.5"});
+ break;
+ case 0x9123683E: /* BTRFS */
+ as.append_challenged = true;
+ as.append_concurrency = 0;
+ as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"5.9"});
+ break;
+ case 0x01021994: /* TMPFS */
+ case 0x65735546: /* FUSE */
+ as.append_challenged = false;
+ as.append_concurrency = 999;
+ as.fsync_is_exclusive = false;
+ as.nowait_works = false;
break;
default:
as.append_challenged = true;
as.append_concurrency = 0;
as.fsync_is_exclusive = true;
+ as.nowait_works = kernel_uname().whitelisted({"4.13"});
}
s_fstype[st_dev] = fs_info{as, block_size};
});
return get_fs_info.then([st_dev, fd, flags, options = std::move(options)] () mutable {
auto [as, block_size] = s_fstype[st_dev];
if (!as.append_challenged) {
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), std::move(options), st_dev, block_size));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), std::move(options), st_dev, block_size, as.nowait_works));
}
- return make_ready_future<shared_ptr<file_impl>>(make_shared<append_challenged_posix_file_impl>(fd, open_flags(flags), std::move(options), as.append_concurrency, as.fsync_is_exclusive, st_dev, block_size));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<append_challenged_posix_file_impl>(fd, open_flags(flags), std::move(options), as.append_concurrency, as.fsync_is_exclusive, st_dev, block_size, as.nowait_works));
});
}
});
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 6b5316857..b6871e9f8 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -533,8 +533,9 @@ constexpr std::chrono::milliseconds lowres_clock_impl::_granularity;
constexpr unsigned reactor::max_queues;
constexpr unsigned reactor::max_aio_per_queue;

-// Broken (returns spurious EIO). Cause/fix unknown.
-bool aio_nowait_supported = false;
+// Base version where this works; some filesystems were only fixed later, so
+// this value is mixed in with filesystem-provided values later.
+bool aio_nowait_supported = kernel_uname().whitelisted({"4.13"});

static bool sched_debug() {
return false;
diff --git a/src/core/reactor_backend.cc b/src/core/reactor_backend.cc
index e0c1eea0f..04dc1f40b 100644
--- a/src/core/reactor_backend.cc
+++ b/src/core/reactor_backend.cc
@@ -59,19 +59,19 @@ void prepare_iocb(io_request& req, io_completion* desc, iocb& iocb) {
break;
case io_request::operation::write:
iocb = make_write_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::writev:
iocb = make_writev_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::read:
iocb = make_read_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::readv:
iocb = make_readv_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
default:
seastar_logger.error("Invalid operation for iocb: {}", req.opname());
diff --git a/tests/unit/io_queue_test.cc b/tests/unit/io_queue_test.cc
index 7753a8193..a5e669883 100644
--- a/tests/unit/io_queue_test.cc
+++ b/tests/unit/io_queue_test.cc
@@ -41,7 +41,7 @@ struct fake_file {

static internal::io_request make_write_req(size_t idx, int val) {
int* buf = new int(val);
- return internal::io_request::make_write(0, idx, buf, 1);
+ return internal::io_request::make_write(0, idx, buf, 1, false);
}

void execute_write_req(internal::io_request& rq, io_completion* desc) {
diff --git a/tests/unit/timer_test.cc b/tests/unit/timer_test.cc
index ca698b7eb..7713e60cf 100644
--- a/tests/unit/timer_test.cc
+++ b/tests/unit/timer_test.cc
@@ -79,6 +79,7 @@ struct timer_test {
}

future<> test_timer_cancelling() {
+ print("test timer cancelling\n");
timer<Clock>& t1 = *new timer<Clock>();
t1.set_callback([] { BUG(); });
t1.arm(100ms);
--
2.29.2

Pavel Emelyanov

<xemul@scylladb.com>
unread,
Mar 6, 2021, 10:52:25 PM3/6/21
to Avi Kivity, seastar-dev


сб, 6 мар. 2021 г., 23:06 Avi Kivity <a...@scylladb.com>:
RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.

REQ_NOWAIT?
This is left uninitialized for everything but reads and writes. But since io_request is only created for reads and writes maybe it's not a problem.

+
     explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
         : _op(op)
         , _fd(fd)
@@ -93,9 +95,20 @@ class io_request {
         _size.len = size;
     }

-    explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+    explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
+        : _op(op)
+        , _fd(fd)
+        , _nowait_works(nowait_works)

It's a pity we have to waste 8 bytes for a boolean value. Maybe encode this bit into _op as either explicit bit or by introducing read_nowait, write_nowait? Not in v2, of course, but incrementally.
In his response Dave said that XFS was safe to use with any kernel, why do you still check one?

                         break;
                     case 0x6969: /* NFS */
                         as.append_challenged = false;
                         as.append_concurrency = 0;
                         as.fsync_is_exclusive = false;
+                        as.nowait_works = kernel_uname().whitelisted({"4.13"});

aio_nowait_supported?
aio_nowait_supported here too?
Leftover from testing?

         timer<Clock>& t1 = *new timer<Clock>();
         t1.set_callback([] { BUG(); });
         t1.arm(100ms);
--
2.29.2

--
You received this message because you are subscribed to the Google Groups "seastar-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email to seastar-dev...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/seastar-dev/20210306200630.1816214-1-avi%40scylladb.com.

Avi Kivity

<avi@scylladb.com>
unread,
Mar 7, 2021, 2:51:44 AM3/7/21
to Pavel Emelyanov, seastar-dev
On 3/7/21 5:52 AM, Pavel Emelyanov wrote:


сб, 6 мар. 2021 г., 23:06 Avi Kivity <a...@scylladb.com>:
RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.

REQ_NOWAIT?


REQ_NOWAIT is some internal kernel thing. RWF_NOWAIT is the userspace name (read/write flag, see preadv2(2).


And look what I found there:


       RWF_DSYNC (since Linux 4.7)
              Provide a per-write equivalent of the O_DSYNC open(2) flag.  This flag is meaningful only for pwritev2(), and its effect applies only to the data range written by  the
              system call.


This can be useful for pre-zeroing a file without O_DSYNC, and then writing to it with RWF_DSYNC. However, it won't be so useful while we support 3.10 era kernels.

Yes, for an internal structure I think it's okay.


+
     explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
         : _op(op)
         , _fd(fd)
@@ -93,9 +95,20 @@ class io_request {
         _size.len = size;
     }

-    explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+    explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
+        : _op(op)
+        , _fd(fd)
+        , _nowait_works(nowait_works)

It's a pity we have to waste 8 bytes for a boolean value. Maybe encode this bit into _op as either explicit bit or by introducing read_nowait, write_nowait? Not in v2, of course, but incrementally.


I could put it next to _op, and make _op size be 8 bits. Then we have room for up to 24 more flags if we use bit fields.


(though note that reads and writes take up way more space in the payload, the io_request size is trivial; more interesting is to do late-allocation for reads).


@@ -889,12 +895,13 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
             if ((flags & O_ACCMODE) == O_RDONLY || S_ISDIR(st.st_mode)) {
                 // Directories don't care about block size, so we need not
                 // query it here. Just provide something reasonable.
-                return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096));
+                return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096, false));
             }
             struct append_support {
                 bool append_challenged;
                 unsigned append_concurrency;
                 bool fsync_is_exclusive;
+                bool nowait_works;
             };
             struct fs_info {
                 append_support append_support_;
@@ -911,30 +918,47 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
                         static auto xc = xfs_concurrency_from_kernel_version();
                         as.append_concurrency = xc;
                         as.fsync_is_exclusive = true;
+                        as.nowait_works = kernel_uname().whitelisted({"4.13"});

In his response Dave said that XFS was safe to use with any kernel, why do you still check one?


The whole thing was introduced in 4.13, so best not to try before that.


                         break;
                     case 0x6969: /* NFS */
                         as.append_challenged = false;
                         as.append_concurrency = 0;
                         as.fsync_is_exclusive = false;
+                        as.nowait_works = kernel_uname().whitelisted({"4.13"});

aio_nowait_supported?


Same thing.


For the 4.13 cases I could put "true", and rely on aio_nowait_supported to mask it off for pre-4.13 kernels. That would cause EINVAL failures if someone force-enables nowait, but I guess if someone force-enables nowait on an older kernel, they know what they are doing.

From testing another patch :(

Avi Kivity

<avi@scylladb.com>
unread,
Mar 11, 2021, 10:50:57 AM3/11/21
to seastar-dev@googlegroups.com
Review ping.

Raphael S. Carvalho

<raphaelsc@scylladb.com>
unread,
Mar 11, 2021, 2:55:38 PM3/11/21
to Avi Kivity, seastar-dev
we're starting to accumulate fs info which is copied at all file handles. looks like make_file_impl() cache this fs info, if we move this info to reactor, we could make file handles store a ptr to the info of its underlying fs.
 

Avi Kivity

<avi@scylladb.com>
unread,
Mar 14, 2021, 9:58:36 AM3/14/21
to Raphael S. Carvalho, seastar-dev

Yes. We could start sending a filesystem* instead of devid_t along with all requests, and place this extra info there. But I hope it can be done afterwards.


Raphael S. Carvalho

<raphaelsc@scylladb.com>
unread,
Mar 14, 2021, 10:27:52 AM3/14/21
to Avi Kivity, seastar-dev
I think we can postpone it. definitely not a blocker.

 

Piotr Sarna

<sarna@scylladb.com>
unread,
Mar 15, 2021, 4:01:13 AM3/15/21
to Avi Kivity, seastar-dev@googlegroups.com
Looks good, aside from the leftover print statement mentioned before,
and one trailing space below.

As for kernel versions, I didn't verify manually, so I blindly trust
Dave Chinner on that one.
^ trailing whitespace
> +
> + static io_request make_read(int fd, uint64_t pos, void* address, size_t size, bool nowait_works) {
> + return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size, nowait_works);
> }
>
> - static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
> - return io_request(operation::readv, fd, pos, iov.data(), iov.size());
> + static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
> + return io_request(operation::readv, fd, pos, iov.data(), iov.size(), nowait_works);
> }
>
> static io_request make_recv(int fd, void* address, size_t size, int flags) {
> @@ -222,12 +239,12 @@ class io_request {
> return io_request(operation::sendmsg, fd, flags, msg);
> }
>
> - static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
> - return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
> + static io_request make_write(int fd, uint64_t pos, const void* address, size_t size, bool nowait_works) {
> + return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size, nowait_works);
> }
>
> - static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
> - return io_request(operation::writev, fd, pos, iov.data(), iov.size());
> + static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
> + return io_request(operation::writev, fd, pos, iov.data(), iov.size(), nowait_works);
> } I'll take liberty of fixing a trailing whitespace in `bool nowait_works() const` and queue.^

Avi Kivity

<avi@scylladb.com>
unread,
Mar 15, 2021, 4:59:40 AM3/15/21
to seastar-dev@googlegroups.com
RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.

The kernel problems have now been indentified[1] and so we can
can re-enable it. Since each filesystem was fixed in a different
kernel version, we need to plumb down this information from
posix_file_impl and friends all the way down to the reactor backend,
via io_request. This is annoying but cannot be helped.

Tested via the unit tests and also verified that the nowait paths
are exercised with strace.

[1] https://lore.kernel.org/linux-xfs/20210117213...@dread.disaster.area/
---

v2:
- removed trailing whitespace (can cause stalls)
- removed leftover print from a different patch entirely

include/seastar/core/internal/io_request.hh | 35 +++++++++----
src/core/file-impl.hh | 23 +++++----
src/core/file.cc | 54 +++++++++++++++------
src/core/reactor.cc | 5 +-
src/core/reactor_backend.cc | 8 +--
tests/unit/io_queue_test.cc | 2 +-
6 files changed, 87 insertions(+), 40 deletions(-)

diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
index caa24b6f6f..c05532f8f7 100644
--- a/include/seastar/core/internal/io_request.hh
+++ b/include/seastar/core/internal/io_request.hh
@@ -58,10 +58,12 @@ class io_request {
size_t len;
socklen_t* socklen_ptr;
socklen_t socklen;
} _size;

+ bool _nowait_works;
+
explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
: _op(op)
, _fd(fd)
{
_attr.flags = flags;
@@ -91,13 +93,24 @@ class io_request {
_attr.pos = pos;
_ptr.addr = ptr;
_size.len = size;
}

- explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+ explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
: _op(op)
, _fd(fd)
+ , _nowait_works(nowait_works)
+ {
+ _attr.pos = pos;
+ _ptr.addr = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size, bool nowait_works)
+ : _op(op)
+ , _fd(fd)
+ , _nowait_works(nowait_works)
{
_attr.pos = pos;
_ptr.iovec = ptr;
_size.len = size;
}
@@ -196,16 +209,20 @@ class io_request {

socklen_t* socklen_ptr() const {
return _size.socklen_ptr;
}

- static io_request make_read(int fd, uint64_t pos, void* address, size_t size) {
- return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size);
+ bool nowait_works() const {
+ return _nowait_works;
+ }
+
+ static io_request make_read(int fd, uint64_t pos, void* address, size_t size, bool nowait_works) {
+ return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size, nowait_works);
}

- static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::readv, fd, pos, iov.data(), iov.size());
+ static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::readv, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_recv(int fd, void* address, size_t size, int flags) {
return io_request(operation::recv, fd, flags, reinterpret_cast<char*>(address), size);
}
@@ -220,16 +237,16 @@ class io_request {

static io_request make_sendmsg(int fd, ::msghdr* msg, int flags) {
return io_request(operation::sendmsg, fd, flags, msg);
}

- static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
- return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ static io_request make_write(int fd, uint64_t pos, const void* address, size_t size, bool nowait_works) {
+ return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size, nowait_works);
}

- static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::writev, fd, pos, iov.data(), iov.size());
+ static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::writev, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_fdatasync(int fd) {
return io_request(operation::fdatasync, fd);
}
diff --git a/src/core/file-impl.hh b/src/core/file-impl.hh
index 6016917339..c3c541ab0d 100644
--- a/src/core/file-impl.hh
+++ b/src/core/file-impl.hh
@@ -47,21 +47,24 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
open_flags _open_flags;
uint32_t _memory_dma_alignment;
uint32_t _disk_read_dma_alignment;
uint32_t _disk_write_dma_alignment;
uint32_t _disk_overwrite_dma_alignment;
+ bool _nowait_works;
public:
posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _fd(fd), _refcount(refcount), _device_id(device_id), _open_flags(f)
, _memory_dma_alignment(memory_dma_alignment)
, _disk_read_dma_alignment(disk_read_dma_alignment)
, _disk_write_dma_alignment(disk_write_dma_alignment)
- , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment) {
+ , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment)
+ , _nowait_works(nowait_works) {
}
virtual ~posix_file_handle_impl();
posix_file_handle_impl(const posix_file_handle_impl&) = delete;
posix_file_handle_impl(posix_file_handle_impl&&) = delete;
virtual shared_ptr<file_impl> to_file() && override;
@@ -69,21 +72,23 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
};

class posix_file_impl : public file_impl {
std::atomic<unsigned>* _refcount = nullptr;
dev_t _device_id;
+ bool _nowait_works;
io_queue* _io_queue;
open_flags _open_flags;
public:
int _fd;
posix_file_impl(int fd, open_flags, file_open_options options, dev_t device_id,
- uint32_t block_size);
+ uint32_t block_size, bool nowait_works);
posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment);
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works);
virtual ~posix_file_impl() override;
future<> flush(void) noexcept override;
future<struct stat> stat(void) noexcept override;
future<> truncate(uint64_t length) noexcept override;
future<> discard(uint64_t offset, uint64_t length) noexcept override;
@@ -152,15 +157,15 @@ class posix_file_impl : public file_impl {
future<temporary_buffer<uint8_t>> do_dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept;
};

class posix_file_real_impl final : public posix_file_impl {
public:
- posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size)
- : posix_file_impl(fd, of, std::move(options), device_id, block_size) {}
+ posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
+ : posix_file_impl(fd, of, std::move(options), device_id, block_size, nowait_works) {}
posix_file_real_impl(int fd, open_flags of, std::atomic<unsigned>* refcount, dev_t device_id,
- uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment)
- : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment) {}
+ uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment, bool nowait_works)
+ : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment, nowait_works) {}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept override;
@@ -233,11 +238,11 @@ class append_challenged_posix_file_impl final : public posix_file_impl, public e
} catch (...) {
return make_exception_future<T...>(std::current_exception());
}
}
public:
- append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size);
+ append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works);
~append_challenged_posix_file_impl() override;
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
diff --git a/src/core/file.cc b/src/core/file.cc
index ebf3ff544b..29e8e3040d 100644
--- a/src/core/file.cc
+++ b/src/core/file.cc
@@ -73,12 +73,13 @@ file_handle::to_file() const & {
file
file_handle::to_file() && {
return file(std::move(*_impl).to_file());
}

-posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size)
+posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
: _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd)
{
query_dma_alignment(block_size);
@@ -114,22 +115,25 @@ std::unique_ptr<seastar::file_handle_impl>
posix_file_impl::dup() {
if (!_refcount) {
_refcount = new std::atomic<unsigned>(1u);
}
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment,
+ _nowait_works);
_refcount->fetch_add(1, std::memory_order_relaxed);
return ret;
}

posix_file_impl::posix_file_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _refcount(refcount)
, _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd) {
_memory_dma_alignment = memory_dma_alignment;
_disk_read_dma_alignment = disk_read_dma_alignment;
@@ -334,31 +338,31 @@ posix_file_impl::list_directory(std::function<future<> (directory_entry de)> nex
return ret;
}

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_write(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_write(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment);
- auto req = internal::io_request::make_writev(_fd, pos, iov);
+ auto req = internal::io_request::make_writev(_fd, pos, iov, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_read(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_read(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_read_dma_alignment);
- auto req = internal::io_request::make_readv(_fd, pos, iov);
+ auto req = internal::io_request::make_readv(_fd, pos, iov, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

future<size_t>
posix_file_real_impl::write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept {
@@ -487,12 +491,14 @@ posix_file_impl::read_maybe_eof(uint64_t pos, size_t len, const io_priority_clas
}
}
});
}

+static bool blockdev_nowait_works = kernel_uname().whitelisted({"4.13"});
+
blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size) {
+ : posix_file_impl(fd, f, options, device_id, block_size, blockdev_nowait_works) {
}

future<>
blockdev_file_impl::truncate(uint64_t length) noexcept {
return make_ready_future<>();
@@ -539,12 +545,12 @@ future<temporary_buffer<uint8_t>>
blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept {
return posix_file_impl::do_dma_read_bulk(offset, range_size, pc, intent);
}

append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options,
- unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size)
+ unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works)
+ : posix_file_impl(fd, f, options, device_id, block_size, nowait_works)
, _max_size_changing_ops(max_size_changing_ops)
, _fsync_is_exclusive(fsync_is_exclusive) {
auto r = ::lseek(fd, 0, SEEK_END);
throw_system_error_on(r == -1);
_committed_size = _logical_size = r;
@@ -839,21 +845,21 @@ posix_file_handle_impl::~posix_file_handle_impl() {
}

std::unique_ptr<seastar::file_handle_impl>
posix_file_handle_impl::clone() const {
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
if (_refcount) {
_refcount->fetch_add(1, std::memory_order_relaxed);
}
return ret;
}

shared_ptr<file_impl>
posix_file_handle_impl::to_file() && {
auto ret = ::seastar::make_shared<posix_file_real_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
_fd = -1;
_refcount = nullptr;
return ret;
}

@@ -887,16 +893,17 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
return make_ready_future<shared_ptr<file_impl>>(make_shared<blockdev_file_impl>(fd, open_flags(flags), options, st_dev, block_size));
} else {
if ((flags & O_ACCMODE) == O_RDONLY || S_ISDIR(st.st_mode)) {
// Directories don't care about block size, so we need not
// query it here. Just provide something reasonable.
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096, false));
}
struct append_support {
bool append_challenged;
unsigned append_concurrency;
bool fsync_is_exclusive;
+ bool nowait_works;
};
struct fs_info {
append_support append_support_;
uint32_t block_size;
};
@@ -909,34 +916,51 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
case 0x58465342: /* XFS */
as.append_challenged = true;
index 6b53168579..b6871e9f84 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -531,12 +531,13 @@ std::atomic<manual_clock::rep> manual_clock::_now;
constexpr std::chrono::milliseconds lowres_clock_impl::_granularity;

constexpr unsigned reactor::max_queues;
constexpr unsigned reactor::max_aio_per_queue;

-// Broken (returns spurious EIO). Cause/fix unknown.
-bool aio_nowait_supported = false;
+// Base version where this works; some filesystems were only fixed later, so
+// this value is mixed in with filesystem-provided values later.
+bool aio_nowait_supported = kernel_uname().whitelisted({"4.13"});

static bool sched_debug() {
return false;
}

diff --git a/src/core/reactor_backend.cc b/src/core/reactor_backend.cc
index e0c1eea0fb..04dc1f40bd 100644
--- a/src/core/reactor_backend.cc
+++ b/src/core/reactor_backend.cc
@@ -57,23 +57,23 @@ void prepare_iocb(io_request& req, io_completion* desc, iocb& iocb) {
case io_request::operation::fdatasync:
iocb = make_fdsync_iocb(req.fd());
break;
case io_request::operation::write:
iocb = make_write_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::writev:
iocb = make_writev_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::read:
iocb = make_read_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::readv:
iocb = make_readv_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
default:
seastar_logger.error("Invalid operation for iocb: {}", req.opname());
std::abort();
}
diff --git a/tests/unit/io_queue_test.cc b/tests/unit/io_queue_test.cc
index 7753a8193e..a5e669883e 100644
--- a/tests/unit/io_queue_test.cc
+++ b/tests/unit/io_queue_test.cc
@@ -39,11 +39,11 @@ template <size_t Len>
struct fake_file {
int data[Len] = {};

static internal::io_request make_write_req(size_t idx, int val) {
int* buf = new int(val);
- return internal::io_request::make_write(0, idx, buf, 1);
+ return internal::io_request::make_write(0, idx, buf, 1, false);
}

void execute_write_req(internal::io_request& rq, io_completion* desc) {
data[rq.pos()] = *(reinterpret_cast<int*>(rq.address()));
desc->complete_with(rq.size());
--
2.30.2

Avi Kivity

<avi@scylladb.com>
unread,
Mar 15, 2021, 5:00:08 AM3/15/21
to Piotr Sarna, seastar-dev@googlegroups.com
Ouch, should not have forgotten it. Sent v2.

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 15, 2021, 7:10:35 AM3/15/21
to seastar-dev@googlegroups.com, Avi Kivity
From: Avi Kivity <a...@scylladb.com>
Committer: Piotr Sarna <sa...@scylladb.com>
Branch: master

file, reactor: reinstate RWF_NOWAIT support

RWF_NOWAIT support was disabled in e67572ef68a6cd1f0 ("reactor:
disable nowait aio due to a kernel bug"). It was later re-enabled
but disabled again.

The kernel problems have now been indentified[1] and so we can
can re-enable it. Since each filesystem was fixed in a different
kernel version, we need to plumb down this information from
posix_file_impl and friends all the way down to the reactor backend,
via io_request. This is annoying but cannot be helped.

Tested via the unit tests and also verified that the nowait paths
are exercised with strace.

[1] https://lore.kernel.org/linux-xfs/20210117213...@dread.disaster.area/
Message-Id: <20210315085935...@scylladb.com>

---
diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
--- a/include/seastar/core/internal/io_request.hh
+++ b/include/seastar/core/internal/io_request.hh
@@ -60,6 +60,8 @@ private:
socklen_t socklen;
} _size;

+ bool _nowait_works;
+
explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
: _op(op)
, _fd(fd)
@@ -93,9 +95,20 @@ private:
_size.len = size;
}

- explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+ explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size, bool nowait_works)
: _op(op)
, _fd(fd)
+ , _nowait_works(nowait_works)
+ {
+ _attr.pos = pos;
+ _ptr.addr = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size, bool nowait_works)
+ : _op(op)
+ , _fd(fd)
+ , _nowait_works(nowait_works)
{
_attr.pos = pos;
_ptr.iovec = ptr;
@@ -198,12 +211,16 @@ public:
return _size.socklen_ptr;
}

- static io_request make_read(int fd, uint64_t pos, void* address, size_t size) {
- return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size);
+ bool nowait_works() const {
+ return _nowait_works;
+ }
+
+ static io_request make_read(int fd, uint64_t pos, void* address, size_t size, bool nowait_works) {
+ return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size, nowait_works);
}

- static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::readv, fd, pos, iov.data(), iov.size());
+ static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::readv, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_recv(int fd, void* address, size_t size, int flags) {
@@ -222,12 +239,12 @@ public:
return io_request(operation::sendmsg, fd, flags, msg);
}

- static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
- return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ static io_request make_write(int fd, uint64_t pos, const void* address, size_t size, bool nowait_works) {
+ return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size, nowait_works);
}

- static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
- return io_request(operation::writev, fd, pos, iov.data(), iov.size());
+ static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov, bool nowait_works) {
+ return io_request(operation::writev, fd, pos, iov.data(), iov.size(), nowait_works);
}

static io_request make_fdatasync(int fd) {
diff --git a/src/core/file-impl.hh b/src/core/file-impl.hh
--- a/src/core/file-impl.hh
+++ b/src/core/file-impl.hh
@@ -49,17 +49,20 @@ class posix_file_handle_impl : public seastar::file_handle_impl {
uint32_t _disk_read_dma_alignment;
uint32_t _disk_write_dma_alignment;
uint32_t _disk_overwrite_dma_alignment;
+ bool _nowait_works;
public:
posix_file_handle_impl(int fd, open_flags f, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _fd(fd), _refcount(refcount), _device_id(device_id), _open_flags(f)
, _memory_dma_alignment(memory_dma_alignment)
, _disk_read_dma_alignment(disk_read_dma_alignment)
, _disk_write_dma_alignment(disk_write_dma_alignment)
- , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment) {
+ , _disk_overwrite_dma_alignment(disk_overwrite_dma_alignment)
+ , _nowait_works(nowait_works) {
}
virtual ~posix_file_handle_impl();
posix_file_handle_impl(const posix_file_handle_impl&) = delete;
@@ -71,17 +74,19 @@ public:
class posix_file_impl : public file_impl {
std::atomic<unsigned>* _refcount = nullptr;
dev_t _device_id;
+ bool _nowait_works;
io_queue* _io_queue;
open_flags _open_flags;
public:
int _fd;
posix_file_impl(int fd, open_flags, file_open_options options, dev_t device_id,
- uint32_t block_size);
+ uint32_t block_size, bool nowait_works);
posix_file_impl(int fd, open_flags, std::atomic<unsigned>* refcount, dev_t device_id,
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment);
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works);
virtual ~posix_file_impl() override;
future<> flush(void) noexcept override;
future<struct stat> stat(void) noexcept override;
@@ -154,11 +159,11 @@ protected:

class posix_file_real_impl final : public posix_file_impl {
public:
- posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size)
- : posix_file_impl(fd, of, std::move(options), device_id, block_size) {}
+ posix_file_real_impl(int fd, open_flags of, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
+ : posix_file_impl(fd, of, std::move(options), device_id, block_size, nowait_works) {}
posix_file_real_impl(int fd, open_flags of, std::atomic<unsigned>* refcount, dev_t device_id,
- uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment)
- : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment) {}
+ uint32_t memory_dma_alignment, uint32_t disk_read_dma_alignment, uint32_t disk_write_dma_alignment, uint32_t disk_overwrite_dma_alignment, bool nowait_works)
+ : posix_file_impl(fd, of, refcount, device_id, memory_dma_alignment, disk_read_dma_alignment, disk_write_dma_alignment, disk_overwrite_dma_alignment, nowait_works) {}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
@@ -235,7 +240,7 @@ private:
}
}
public:
- append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size);
+ append_challenged_posix_file_impl(int fd, open_flags, file_open_options options, unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works);
~append_challenged_posix_file_impl() override;
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
diff --git a/src/core/file.cc b/src/core/file.cc
--- a/src/core/file.cc
+++ b/src/core/file.cc
@@ -75,8 +75,9 @@ file_handle::to_file() && {
return file(std::move(*_impl).to_file());
}

-posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size)
+posix_file_impl::posix_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, uint32_t block_size, bool nowait_works)
: _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd)
@@ -116,7 +117,8 @@ posix_file_impl::dup() {
_refcount = new std::atomic<unsigned>(1u);
}
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment,
+ _nowait_works);
_refcount->fetch_add(1, std::memory_order_relaxed);
return ret;
}
@@ -125,9 +127,11 @@ posix_file_impl::posix_file_impl(int fd, open_flags f, std::atomic<unsigned>* re
uint32_t memory_dma_alignment,
uint32_t disk_read_dma_alignment,
uint32_t disk_write_dma_alignment,
- uint32_t disk_overwrite_dma_alignment)
+ uint32_t disk_overwrite_dma_alignment,
+ bool nowait_works)
: _refcount(refcount)
, _device_id(device_id)
+ , _nowait_works(nowait_works)
, _io_queue(&(engine().get_io_queue(_device_id)))
, _open_flags(f)
, _fd(fd) {
@@ -336,27 +340,27 @@ posix_file_impl::list_directory(std::function<future<> (directory_entry de)> nex

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_write(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_write(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment);
- auto req = internal::io_request::make_writev(_fd, pos, iov);
+ auto req = internal::io_request::make_writev(_fd, pos, iov, _nowait_works);
return engine().submit_io_write(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
- auto req = internal::io_request::make_read(_fd, pos, buffer, len);
+ auto req = internal::io_request::make_read(_fd, pos, buffer, len, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent);
}

future<size_t>
posix_file_impl::do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_read_dma_alignment);
- auto req = internal::io_request::make_readv(_fd, pos, iov);
+ auto req = internal::io_request::make_readv(_fd, pos, iov, _nowait_works);
return engine().submit_io_read(_io_queue, io_priority_class, len, std::move(req), intent).finally([iov = std::move(iov)] () {});
}

@@ -489,8 +493,10 @@ posix_file_impl::read_maybe_eof(uint64_t pos, size_t len, const io_priority_clas
});
}

+static bool blockdev_nowait_works = kernel_uname().whitelisted({"4.13"});
+
blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size) {
+ : posix_file_impl(fd, f, options, device_id, block_size, blockdev_nowait_works) {
}

future<>
@@ -541,8 +547,8 @@ blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_p
}

append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options,
- unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, block_size)
+ unsigned max_size_changing_ops, bool fsync_is_exclusive, dev_t device_id, size_t block_size, bool nowait_works)
+ : posix_file_impl(fd, f, options, device_id, block_size, nowait_works)
, _max_size_changing_ops(max_size_changing_ops)
, _fsync_is_exclusive(fsync_is_exclusive) {
auto r = ::lseek(fd, 0, SEEK_END);
@@ -841,7 +847,7 @@ posix_file_handle_impl::~posix_file_handle_impl() {
std::unique_ptr<seastar::file_handle_impl>
posix_file_handle_impl::clone() const {
auto ret = std::make_unique<posix_file_handle_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
if (_refcount) {
_refcount->fetch_add(1, std::memory_order_relaxed);
}
@@ -851,7 +857,7 @@ posix_file_handle_impl::clone() const {
shared_ptr<file_impl>
posix_file_handle_impl::to_file() && {
auto ret = ::seastar::make_shared<posix_file_real_impl>(_fd, _open_flags, _refcount, _device_id,
- _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment);
+ _memory_dma_alignment, _disk_read_dma_alignment, _disk_write_dma_alignment, _disk_overwrite_dma_alignment, _nowait_works);
_fd = -1;
_refcount = nullptr;
return ret;
@@ -889,12 +895,13 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
if ((flags & O_ACCMODE) == O_RDONLY || S_ISDIR(st.st_mode)) {
// Directories don't care about block size, so we need not
// query it here. Just provide something reasonable.
- return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096));
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<posix_file_real_impl>(fd, open_flags(flags), options, st_dev, /* blocksize */ 4096, false));
}
struct append_support {
bool append_challenged;
unsigned append_concurrency;
bool fsync_is_exclusive;
+ bool nowait_works;
};
struct fs_info {
append_support append_support_;
@@ -911,30 +918,47 @@ make_file_impl(int fd, file_open_options options, int flags) noexcept {
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -533,8 +533,9 @@ constexpr std::chrono::milliseconds lowres_clock_impl::_granularity;
constexpr unsigned reactor::max_queues;
constexpr unsigned reactor::max_aio_per_queue;

-// Broken (returns spurious EIO). Cause/fix unknown.
-bool aio_nowait_supported = false;
+// Base version where this works; some filesystems were only fixed later, so
+// this value is mixed in with filesystem-provided values later.
+bool aio_nowait_supported = kernel_uname().whitelisted({"4.13"});

static bool sched_debug() {
return false;
diff --git a/src/core/reactor_backend.cc b/src/core/reactor_backend.cc
--- a/src/core/reactor_backend.cc
+++ b/src/core/reactor_backend.cc
@@ -59,19 +59,19 @@ void prepare_iocb(io_request& req, io_completion* desc, iocb& iocb) {
break;
case io_request::operation::write:
iocb = make_write_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::writev:
iocb = make_writev_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::read:
iocb = make_read_iocb(req.fd(), req.pos(), req.address(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
case io_request::operation::readv:
iocb = make_readv_iocb(req.fd(), req.pos(), req.iov(), req.size());
- set_nowait(iocb, true);
+ set_nowait(iocb, req.nowait_works());
break;
default:
seastar_logger.error("Invalid operation for iocb: {}", req.opname());
diff --git a/tests/unit/io_queue_test.cc b/tests/unit/io_queue_test.cc
--- a/tests/unit/io_queue_test.cc
+++ b/tests/unit/io_queue_test.cc
@@ -41,7 +41,7 @@ struct fake_file {

Reply all
Reply to author
Forward
0 new messages