> diff --git a/cmake/SeastarDependencies.cmake b/cmake/SeastarDependencies.cmake
> index 5141ffda..da7882fd 100644
> --- a/cmake/SeastarDependencies.cmake
> +++ b/cmake/SeastarDependencies.cmake
> @@ -92,6 +92,7 @@ macro (seastar_find_dependencies)
> Concepts
> GnuTLS
> LibUring
> + LibNvme
I hope this is optional? Please test building with the library not present.
> LinuxMembarrier
> Sanitizers
> SourceLocation
> diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
> index 3d43490b..090ba8ef 100644
> --- a/include/seastar/core/internal/io_request.hh
> +++ b/include/seastar/core/internal/io_request.hh
> @@ -35,7 +35,7 @@ namespace internal {
>
> class io_request {
> public:
> - enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel };
> + enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel, append };
How do appends work with concurrency? If I append something, do I have
to wait until a previous append completed?
> private:
> operation _op;
> int _fd;
> @@ -64,6 +64,8 @@ class io_request {
> } _size;
>
> bool _nowait_works;
> + uint32_t _nsid = -1;
> + size_t _block_size = -1;
Can this be moved to one of the unions to reduce size growth?
> +
> using shard_id = unsigned;
> using stream_id = unsigned;
>
> @@ -121,7 +123,7 @@ class io_queue {
> size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
>
> future<size_t> queue_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> - future<size_t> queue_one_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> + future<io_result> queue_one_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> void submit_request(io_desc_read_write* desc, internal::io_request req) noexcept;
> void cancel_request(queued_io_request& req) noexcept;
> void complete_cancelled_request(queued_io_request& req) noexcept;
> diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh
> index db2c2bb4..aee0f918 100644
> --- a/include/seastar/core/reactor.hh
> +++ b/include/seastar/core/reactor.hh
> @@ -170,11 +170,20 @@ class io_intent;
> class disk_config_params;
>
> class io_completion : public kernel_completion {
> +private:
> + size_t result = 0;
> +
Please provide motivation for this change.
>
> # seastar doesn't directly depend on these packages. They are
> diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc
> index ea82551d..a1a12a64 100644
> --- a/src/core/io_queue.cc
> +++ b/src/core/io_queue.cc
> @@ -208,7 +208,7 @@ class io_desc_read_write final : public io_completion {
> const stream_id _stream;
> const io_direction_and_length _dnl;
> const fair_queue_ticket _fq_ticket;
> - promise<size_t> _pr;
> + promise<io_result> _pr;
> iovec_keeper _iovs;
>
> public:
> @@ -237,7 +237,7 @@ class io_desc_read_write final : public io_completion {
> auto now = io_queue::clock_type::now();
> _pclass.on_complete(std::chrono::duration_cast<std::chrono::duration<double>>(now - _ts));
> _ioq.complete_request(*this);
> - _pr.set_value(res);
> + _pr.set_value(std::make_tuple(res, get_result()));
This looks strange. Why are there two results here?
This causes an extra continuation which will have negative performance
impact. So the change to io_result (if we agree it's required) is better
propagated deeper into the I/O stack.
> #endif
>
> #ifdef HAVE_OSV
> #include <osv/newpoll.hh>
> #endif
>
> +static const unsigned char IORING_OP_URING_CMD = 40;
> +
Isn't this available from <liburing.h>?
Should be protected by #ifndef in case it becomes available later.
> namespace seastar {
>
> using namespace std::chrono_literals;
> @@ -1254,7 +1257,9 @@ class reactor_backend_uring final : public reactor_backend {
> }
> auto sqe = be.get_sqe();
> ::io_uring_prep_poll_add(sqe, fd().get(), POLLIN);
> - ::io_uring_sqe_set_data(sqe, static_cast<kernel_completion*>(this));
> + auto arg = new passthru_arg;
This should be limited to the case where nvme passthrough is used.
> + arg->completion = this;
> + ::io_uring_sqe_set_data(sqe, arg);
> _armed = true;
> be._has_pending_submissions = true;
> }
> @@ -1316,10 +1321,134 @@ class reactor_backend_uring final : public reactor_backend {
> auto sqe = get_sqe();
> ::io_uring_prep_poll_add(sqe, fd.fd.get(), events);
> auto ufd = static_cast<uring_pollable_fd_state*>(&fd);
> - ::io_uring_sqe_set_data(sqe, static_cast<kernel_completion*>(ufd->get_desc(events)));
> + auto arg = new passthru_arg;
> + arg->completion = ufd->get_desc(events);
> + ::io_uring_sqe_set_data(sqe, arg);
> _has_pending_submissions = true;
> return ufd->get_completion_future(events);
> }
> + struct block_uring_cmd {
All of this should be protected against a newer liburing.h. Or maybe it
should be disabled unless a sufficiently recent liburing is available.
> + passthru_arg* arg = nullptr;
> + size_t block_size = -1;
> +
> + void complete_with(ssize_t res) override {
> + seastar_logger.error("complete res {}", res);
This seems excessive.
Thanks for your prompt reply.
By the way, I was wondering how to send the newly reflected version of the code. Do I need to send another new patch, or should I reply on the same mailing thread by using the option "git send-email --in-reply-to"?
The convention on this list is to start a thread with new
patches. However, it's better to agree on the direction before.
> +#
> +# Copyright (C) 2022 ScyllaDB
> +#
You should put your own copyright here.
Is it okay to reuse existing copyright?
The code is copyrighted by whoever wrote it or their employer.
ScyllaDB can't claim copyright to this code.
> LinuxMembarrier
> Sanitizers
> SourceLocation
> diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
> index 3d43490b..090ba8ef 100644
> --- a/include/seastar/core/internal/io_request.hh
> +++ b/include/seastar/core/internal/io_request.hh
> @@ -35,7 +35,7 @@ namespace internal {
>
> class io_request {
> public:
> - enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel };
> + enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel, append };
How do appends work with concurrency? If I append something, do I have
to wait until a previous append completed? Multiple appends can be issued concurrently. Users do not have to wait for the completion of a previous append command before submitting another one.
Only if the user wants to ensure that data of the append command is written to ZNS SSD, the user should wait for the completion.
But, how does the device know which command has to be executed
first? The commands may be reordered by Seastar or possibly the
kernel.
> private:
> operation _op;
> int _fd;
> @@ -64,6 +64,8 @@ class io_request {
> } _size;
>
> bool _nowait_works;
> + uint32_t _nsid = -1;
> + size_t _block_size = -1;
Can this be moved to one of the unions to reduce size growth?
No. existing unions (_attr, _ptr, _size) are already used by the append command.
What about the offset?
Why is a new block size needed?
So, how am I supposed to use it? For every write, remember the assigned LBA and use it?
Very strange. Can you point me to an application that uses it?
To complete the append command, the assigned block address is required. This new 'result' field passes the assigned block address back via io_completion.
> +
> using shard_id = unsigned;
> using stream_id = unsigned;
>
> @@ -121,7 +123,7 @@ class io_queue {
> size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
>
> future<size_t> queue_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> - future<size_t> queue_one_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> + future<io_result> queue_one_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
> void submit_request(io_desc_read_write* desc, internal::io_request req) noexcept;
> void cancel_request(queued_io_request& req) noexcept;
> void complete_cancelled_request(queued_io_request& req) noexcept;
> diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh
> index db2c2bb4..aee0f918 100644
> --- a/include/seastar/core/reactor.hh
> +++ b/include/seastar/core/reactor.hh
> @@ -170,11 +170,20 @@ class io_intent;
> class disk_config_params;
>
> class io_completion : public kernel_completion {
> +private:
> + size_t result = 0;
> +
Please provide motivation for this change.
We added a comment in code.
I think it's better to use a subclass. Also, name it
_assigned_lba_address so it's clear what it is. We have plenty of
variables named "result" already.
Ok. I still don't understand how the high-level code can make use
of this lba address.
I don't understand it completely myself since I don't understand how assigned_lba_address can be used.
If it turns out that we can't get rid of it, then yes we need to
return io_result everywhere.
<snip>
Please add an example application (in demos/).
We added and tested demo code that utilizes the append command, plus a read-and-verify step, for the example. (demos/passthru_append_demo.cc)
Thanks, this will help me understand. In addition pointers to
real life applications will be appreciated.
On 07/09/2022 12.23, 신희원 / 학생 / 컴퓨터공학부 wrote:
Thanks for your prompt reply.
By the way, I was wondering how to send the newly reflected version of the code. Do I need to send another new patch, or should I reply on the same mailing thread by using the option "git send-email --in-reply-to"?
The convention on this list is to start a thread with new patches. However, it's better to agree on the direction before.
> +#
> +# Copyright (C) 2022 ScyllaDB
> +#
You should put your own copyright here.
Is it okay to reuse existing copyright?
The code is copyrighted by whoever wrote it or their employer. ScyllaDB can't claim copyright to this code.
> LinuxMembarrier
> Sanitizers
> SourceLocation
> diff --git a/include/seastar/core/internal/io_request.hh b/include/seastar/core/internal/io_request.hh
> index 3d43490b..090ba8ef 100644
> --- a/include/seastar/core/internal/io_request.hh
> +++ b/include/seastar/core/internal/io_request.hh
> @@ -35,7 +35,7 @@ namespace internal {
>
> class io_request {
> public:
> - enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel };
> + enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel, append };
How do appends work with concurrency? If I append something, do I have
to wait until a previous append completed? Multiple appends can be issued concurrently. Users do not have to wait for the completion of a previous append command before submitting another one.
Only if the user wants to ensure that data of the append command is written to ZNS SSD, the user should wait for the completion.
But, how does the device know which command has to be executed first? The commands may be reordered by Seastar or possibly the kernel.
> private:
> operation _op;
> int _fd;
> @@ -64,6 +64,8 @@ class io_request {
> } _size;
>
> bool _nowait_works;
> + uint32_t _nsid = -1;
> + size_t _block_size = -1;
Can this be moved to one of the unions to reduce size growth?
No. existing unions (_attr, _ptr, _size) are already used by the append command.
What about the offset?
Why is a new block size needed?
Very strange. Can you point me to an application that uses it?
If it turns out that we can't get rid of it, then yes we need to return io_result everywhere.
<snip>
Please add an example application (in demos/).
We added and tested demo code that utilizes the append command, plus a read-and-verify step, for the example. (demos/passthru_append_demo.cc)
Thanks, this will help me understand. In addition pointers to real life applications will be appreciated.
I'm beginning to understand it. The resulting zone can't be read
in-order, so if the application wants the ability to read it
in-order, it must keep track of the returned offsets in some index
file.
> private:
> operation _op;
> int _fd;
> @@ -64,6 +64,8 @@ class io_request {
> } _size;
>
> bool _nowait_works;
> + uint32_t _nsid = -1;
> + size_t _block_size = -1;
Can this be moved to one of the unions to reduce size growth?
No. existing unions (_attr, _ptr, _size) are already used by the append command.
What about the offset?
The append command requires the index of the zone where data should be placed. Therefore the user needs an offset for selecting a zone to append to.
So, io_request._attr.pos is used to select the zone?
Let's add another member to the _attr union, so it's clear it's
in use.
Why is a new block size needed?
To prepare the NVMe passthrough command, the start address of the zone and the size of the data should be expressed in terms of the number of blocks. Therefore, the block size is required to translate byte granularity into block granularity.
Ok.
Hard for me to see, I'll comment on the next version of the patch.
A different approach is to have a separate concrete class,
io_desc_zone_append, but to it will be easier to see if it's
better if I see the patch.
Very good, thanks.
It's not ideal, but it's okay as long as it isn't exposed in a
public API.
> private:
> operation _op;
> int _fd;
> @@ -64,6 +64,8 @@ class io_request {
> } _size;
>
> bool _nowait_works;
> + uint32_t _nsid = -1;
> + size_t _block_size = -1;
Can this be moved to one of the unions to reduce size growth?
No. existing unions (_attr, _ptr, _size) are already used by the append command.
What about the offset?
The append command requires the index of the zone where data should be placed. Therefore the user needs an offset for selecting a zone to append to.
So, io_request._attr.pos is used to select the zone?
Let's add another member to the _attr union, so it's clear it's in use.
<snip>
Please add an example application (in demos/).
We added and tested demo code that utilizes the append command, plus a read-and-verify step, for the example. (demos/passthru_append_demo.cc)
Thanks, this will help me understand. In addition pointers to real life applications will be appreciated.
Okay, it will be included in the upcoming patch. (demos/passthru_append_demo.cc)
Thanks, Heewon