[PATCH 2/2] file: implement append command

shw096@snu.ac.kr <shw096@snu.ac.kr>
Aug 29, 2022, 8:17:31 AM
to seastar-dev@googlegroups.com, Heewon Shin, JinyongHa
From: Heewon Shin <shw...@snu.ac.kr>

This commit implements the append command in the file-related
classes. Most of the changes are copies of the write-related functions
and differ only in the return value: append returns an io_result that
contains both the status value and the assigned block address.

Users should be aware that the 'pos' input parameter must be the start
address of a zone. If 'pos' is not aligned with a zone's starting
address, the operation fails with errno EINVAL.

Signed-off-by: Heewon Shin <shw...@snu.ac.kr>
Signed-off-by: JinyongHa <jy20...@samsung.com>
---
include/seastar/core/file.hh | 19 +++++++-
include/seastar/core/io_queue.hh | 2 +
src/core/file-impl.hh | 14 +++++-
src/core/file.cc | 78 ++++++++++++++++++++++++++++++--
src/core/io_queue.cc | 7 +++
5 files changed, 113 insertions(+), 7 deletions(-)

diff --git a/include/seastar/core/file.hh b/include/seastar/core/file.hh
index 28e0cae3..be8a5d1c 100644
--- a/include/seastar/core/file.hh
+++ b/include/seastar/core/file.hh
@@ -38,6 +38,8 @@

namespace seastar {

+using io_result = std::tuple<size_t, ssize_t>;
+
/// \addtogroup fileio-module
/// @{

@@ -129,6 +131,11 @@ class file_impl {
return read_dma(pos, std::move(iov), pc);
}

+ virtual future<io_result> append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent*) {
+ // return error because only blockdev_file_impl can process append
+ return current_exception_as_future<io_result>();
+ }
+
virtual future<> flush(void) = 0;
virtual future<struct stat> stat(void) = 0;
virtual future<> truncate(uint64_t length) = 0;
@@ -150,7 +157,7 @@ class file_impl {
friend class reactor;
};

-future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int oflags) noexcept;
+future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int oflags, std::string name = "") noexcept;

/// \endcond

@@ -335,6 +342,11 @@ class file {
/// read may happen due to end-of-file or an I/O error.
future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept;

+ template <typename CharType>
+ future<io_result> dma_append(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class(), io_intent* intent = nullptr) noexcept {
+ return dma_append_impl(pos, reinterpret_cast<const uint8_t*>(buffer), len, pc, intent);
+ }
+
/// Performs a DMA write from the specified buffer.
///
/// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
@@ -563,6 +575,11 @@ class file {
future<size_t>
dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;

+ // first size_t : result status or written bytes
+ // second size_t : appended block address (bytes)
+ future<io_result>
+ dma_append_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;
+
future<temporary_buffer<uint8_t>>
dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;

diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh
index d2936244..887f6d18 100644
--- a/include/seastar/core/io_queue.hh
+++ b/include/seastar/core/io_queue.hh
@@ -121,6 +121,8 @@ class io_queue {
size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
future<size_t> submit_io_write(const io_priority_class& priority_class,
size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;
+ future<io_result> submit_io_append(const io_priority_class& priority_class,
+ size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs = {}) noexcept;

future<size_t> queue_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
future<io_result> queue_one_request(const io_priority_class& pc, internal::io_direction_and_length dnl, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept;
diff --git a/src/core/file-impl.hh b/src/core/file-impl.hh
index 8534091f..4c0a6a15 100644
--- a/src/core/file-impl.hh
+++ b/src/core/file-impl.hh
@@ -123,6 +123,11 @@ class posix_file_impl : public file_impl {
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override = 0;
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override = 0;
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept override = 0;
+ virtual future<io_result> append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override
+ {
+ // return error because only blockdev_file_impl can process append
+ return current_exception_as_future<io_result>();
+ }

open_flags flags() const {
return _open_flags;
@@ -160,6 +165,7 @@ class posix_file_impl : public file_impl {
future<size_t> do_read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept;
future<size_t> do_read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept;
future<temporary_buffer<uint8_t>> do_dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept;
+ future<io_result> do_append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent, int ng_device_fd, uint32_t nsid, size_t block_size) noexcept;
};

class posix_file_real_impl final : public posix_file_impl {
@@ -276,8 +282,12 @@ class append_challenged_posix_file_impl final : public posix_file_impl, public e
};

class blockdev_file_impl final : public posix_file_impl {
+ int ng_device_fd;
+ uint32_t nsid;
+ size_t block_size;
+
public:
- blockdev_file_impl(int fd, open_flags, file_open_options options, dev_t device_id, size_t block_size);
+ blockdev_file_impl(int fd, open_flags, file_open_options options, dev_t device_id, size_t block_size, int ng_device_fd = -1, uint32_t nsid = -1);
future<> truncate(uint64_t length) noexcept override;
future<> discard(uint64_t offset, uint64_t length) noexcept override;
future<uint64_t> size() noexcept override;
@@ -288,8 +298,10 @@ class blockdev_file_impl final : public posix_file_impl {
using posix_file_impl::write_dma;
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc, io_intent* intent) noexcept override;
+ virtual future<io_result> append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept override;
using posix_file_impl::dma_read_bulk;
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept override;
+ virtual future<> close() noexcept override;
};

}
diff --git a/src/core/file.cc b/src/core/file.cc
index 688e8455..14147842 100644
--- a/src/core/file.cc
+++ b/src/core/file.cc
@@ -33,6 +33,7 @@
#define min min /* prevent xfs.h from defining min() as a macro */
#include <xfs/xfs.h>
#undef min
+#include <libnvme.h>
#include <seastar/core/reactor.hh>
#include <seastar/core/file.hh>
#include <seastar/core/report_exception.hh>
@@ -419,6 +420,12 @@ posix_file_impl::do_write_dma(uint64_t pos, const void* buffer, size_t len, cons
return _io_queue.submit_io_write(io_priority_class, len, std::move(req), intent);
}

+future<io_result>
+posix_file_impl::do_append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class, io_intent* intent, int ng_device_fd, uint32_t nsid, size_t block_size) noexcept {
+ auto req = internal::io_request::make_append(ng_device_fd, pos, buffer, len, _nowait_works, nsid, block_size);
+ return _io_queue.submit_io_append(io_priority_class, len, std::move(req), intent);
+}
+
future<size_t>
posix_file_impl::do_write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& io_priority_class, io_intent* intent) noexcept {
auto len = internal::sanitize_iovecs(iov, _disk_write_dma_alignment);
@@ -579,8 +586,8 @@ static bool blockdev_nowait_works(dev_t device_id) {
return blockdev_gen_nowait_works;
}

-blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size)
- : posix_file_impl(fd, f, options, device_id, blockdev_nowait_works(device_id)) {
+blockdev_file_impl::blockdev_file_impl(int fd, open_flags f, file_open_options options, dev_t device_id, size_t block_size, int ng_device_fd, uint32_t nsid)
+ : posix_file_impl(fd, f, options, device_id, blockdev_nowait_works(device_id)), ng_device_fd(ng_device_fd), nsid(nsid), block_size(block_size) {
// FIXME -- configure file_impl::_..._dma_alignment's from block_size
}

@@ -626,11 +633,31 @@ blockdev_file_impl::read_dma(uint64_t pos, std::vector<iovec> iov, const io_prio
return posix_file_impl::do_read_dma(pos, std::move(iov), pc, intent);
}

+future<io_result>
+blockdev_file_impl::append_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept {
+ // append is available on ng device only
+ if (ng_device_fd != -1) {
+ return posix_file_impl::do_append_dma(pos, buffer, len, pc, intent, ng_device_fd, nsid, block_size);
+ } else {
+ return posix_file_impl::do_write_dma(pos, buffer, len, pc, intent).then([pos] (auto result) {
+ return make_ready_future<io_result>(result, pos);
+ });
+ }
+}
+
future<temporary_buffer<uint8_t>>
blockdev_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc, io_intent* intent) noexcept {
return posix_file_impl::do_dma_read_bulk(offset, range_size, pc, intent);
}

+future<>
+blockdev_file_impl::close() noexcept {
+ if (ng_device_fd != -1) {
+ ::close(ng_device_fd);
+ }
+ return posix_file_impl::close();
+}
+
append_challenged_posix_file_impl::append_challenged_posix_file_impl(int fd, open_flags f, file_open_options options, const fs_info& fsi, dev_t device_id)
: posix_file_impl(fd, f, options, device_id, fsi)
, _max_size_changing_ops(fsi.append_concurrency)
@@ -992,18 +1019,50 @@ xfs_concurrency_from_kernel_version() {
return 0;
}

+static const std::string NVME_STR = "nvme";
+static const std::string NG_STR = "ng";
+
+bool
+is_nvme_device(std::string name) {
+ size_t pos = name.find(NVME_STR);
+ if (pos != std::string::npos) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int
+open_ng_device_fd(std::string nvme_file_path) {
+ std::string ng_file_path = nvme_file_path;
+ size_t pos = ng_file_path.find(NVME_STR);
+ assert(pos != std::string::npos);
+ ng_file_path.replace(pos, NVME_STR.size(), NG_STR);
+ return ::open(ng_file_path.c_str(), O_RDWR);
+}
+
future<shared_ptr<file_impl>>
-make_file_impl(int fd, file_open_options options, int flags) noexcept {
- return engine().fstat(fd).then([fd, options = std::move(options), flags] (struct stat st) mutable {
+make_file_impl(int fd, file_open_options options, int flags, std::string name) noexcept {
+ return engine().fstat(fd).then([fd, options = std::move(options), flags, name = std::move(name)] (struct stat st) mutable {
auto st_dev = st.st_dev;

if (S_ISBLK(st.st_mode)) {
- size_t block_size;
+ size_t block_size = 0;
auto ret = ::ioctl(fd, BLKBSZGET, &block_size);
if (ret == -1) {
return make_exception_future<shared_ptr<file_impl>>(
std::system_error(errno, std::system_category(), "ioctl(BLKBSZGET) failed"));
}
+
+ uint32_t nsid = -1;
+ int ng_device_fd = -1;
+ if (is_nvme_device(name)) {
+ nvme_get_nsid(fd, &nsid);
+ ng_device_fd = open_ng_device_fd(name);
+ if (ng_device_fd != -1) {
+ return make_ready_future<shared_ptr<file_impl>>(make_shared<blockdev_file_impl>(fd, open_flags(flags), options, st.st_rdev, block_size, ng_device_fd ,nsid));
+ }
+ }
return make_ready_future<shared_ptr<file_impl>>(make_shared<blockdev_file_impl>(fd, open_flags(flags), options, st.st_rdev, block_size));
} else {
if (S_ISDIR(st.st_mode)) {
@@ -1221,6 +1280,15 @@ future<size_t> file::dma_write(uint64_t pos, std::vector<iovec> iov, const io_pr
}
}

+future<io_result>
+file::dma_append_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept {
+ try {
+ return _file_impl->append_dma(pos, buffer, len, pc, intent);
+ } catch (...) {
+ return current_exception_as_future<io_result>();
+ }
+}
+
future<size_t>
file::dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc, io_intent* intent) noexcept {
try {
diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc
index a1a12a64..cb672666 100644
--- a/src/core/io_queue.cc
+++ b/src/core/io_queue.cc
@@ -922,6 +922,13 @@ future<size_t> io_queue::submit_io_read(const io_priority_class& pc, size_t len,
return queue_request(pc, io_direction_and_length(io_direction_read, len), std::move(req), intent, std::move(iovs));
}

+future<io_result> io_queue::submit_io_append(const io_priority_class& pc, size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept {
+ auto& r = engine();
+ ++r._io_stats.aio_writes;
+ r._io_stats.aio_write_bytes += len;
+ return queue_one_request(pc, io_direction_and_length(io_direction_write, len), std::move(req), intent, std::move(iovs));
+}
+
future<size_t> io_queue::submit_io_write(const io_priority_class& pc, size_t len, internal::io_request req, io_intent* intent, iovec_keeper iovs) noexcept {
auto& r = engine();
++r._io_stats.aio_writes;
--
2.25.1
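
For readers of the archive, a minimal usage sketch of the interface added by this patch (not part of the patch itself). It assumes a zoned namespace visible as /dev/nvme0n1, a 4096-byte block size/alignment, and that the companion reactor changes route the device name into make_file_impl() so the ng char device is detected:

    // Minimal sketch: append one 4 KiB block to a zone and return the address
    // the device assigned to it. Device path, zone_start offset and the 4096-byte
    // alignment are illustrative assumptions; error handling is elided.
    #include <seastar/core/seastar.hh>
    #include <seastar/core/file.hh>
    #include <seastar/core/aligned_buffer.hh>
    #include <cstring>

    seastar::future<ssize_t> append_one_block(seastar::sstring dev, uint64_t zone_start) {
        return seastar::open_file_dma(dev, seastar::open_flags::rw).then([zone_start] (seastar::file f) {
            auto buf = seastar::allocate_aligned_buffer<char>(4096, 4096);
            std::memset(buf.get(), 'x', 4096);
            auto p = buf.get();
            return f.dma_append(zone_start, p, 4096).then(
                    [f, buf = std::move(buf)] (seastar::io_result r) mutable {
                // io_result is std::tuple<size_t, ssize_t>: bytes written and the
                // block address actually assigned by the device.
                auto [written, assigned_addr] = r;
                (void)written;
                // remember assigned_addr so the data can be read back later
                return f.close().then([assigned_addr] { return assigned_addr; });
            });
        });
    }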

Avi Kivity <avi@scylladb.com>
Sep 1, 2022, 11:41:10 AM
to shw096@snu.ac.kr, seastar-dev@googlegroups.com, JinyongHa

On 29/08/2022 15.17, shw...@snu.ac.kr wrote:
> From: Heewon Shin <shw...@snu.ac.kr>
>
> This commit contains implementation of append command on file-related
> classes. Almost changes are just copies of write-related functions
> except the returned value. The returned value of append is struct
> io_result that contains both the status value and assigned block address.
>
> User should be aware of which the 'pos' input parameter should be a start
> address of a zone. If 'pos' is not aligned with a zone's starting
> address, it will failed with the errno EINVAL.


I wonder if we should introduce a new function here, or instead use the
regular dma_write in a new subclass and make dma_write call the
low-level append. the new zoned_nvme_file::dma_write can take care of
supplying the correct "pos" to the lower layer.


Using dma_write means that wrapping the file with output_stream will
work as expected, so integration with applications becomes much simpler.
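
To make the idea a bit more concrete, here is a rough sketch of what such a wrapper could look like; it is built on the dma_append() from this patch rather than as a real file_impl subclass, and the class name and members are purely illustrative:

    // Illustrative sketch only: dma_write() ignores the caller-supplied offset,
    // appends to the current zone, and records where the device placed the data,
    // so output_stream-style sequential writers keep working unchanged.
    #include <seastar/core/file.hh>
    #include <seastar/core/future.hh>

    class zoned_nvme_file {
        seastar::file _f;
        uint64_t _zone_start;   // byte offset of the start of the zone being filled
    public:
        zoned_nvme_file(seastar::file f, uint64_t zone_start)
            : _f(std::move(f)), _zone_start(zone_start) {}

        // Same shape as file::dma_write(); 'pos' is ignored because the device
        // chooses the placement.
        seastar::future<size_t> dma_write(uint64_t /*pos*/, const char* buffer, size_t len) {
            return _f.dma_append(_zone_start, buffer, len).then([] (seastar::io_result r) {
                auto [written, assigned_addr] = r;
                // A real implementation would record assigned_addr (e.g. in an index)
                // so the data can be located for later reads.
                (void)assigned_addr;
                return written;
            });
        }
    };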

Heewon Shin (student, Dept. of Computer Science and Engineering) <shw096@snu.ac.kr>
Sep 7, 2022, 5:28:40 AM
to Avi Kivity, seastar-dev@googlegroups.com, JinyongHa


On Fri, Sep 2, 2022 at 12:41 AM, Avi Kivity <a...@scylladb.com> wrote:

On 29/08/2022 15.17, shw...@snu.ac.kr wrote:
> From: Heewon Shin <shw...@snu.ac.kr>
>
> This commit contains implementation of append command on file-related
> classes. Almost changes are just copies of write-related functions
> except the returned value. The returned value of append is struct
> io_result that contains both the status value and assigned block address.
>
> User should be aware of which the 'pos' input parameter should be a start
> address of a zone. If 'pos' is not aligned with a zone's starting
> address, it will failed with the errno EINVAL.


I wonder if we should introduce a new function here, or instead use the
regular dma_write in a new subclass and make dma_write call the
low-level append. the new zoned_nvme_file::dma_write can take care of
supplying the correct "pos" to the lower layer.


Using dma_write means that wrapping the file with output_stream will
work as expected, so integration with applications becomes much simpler.
There are a couple of reasons why we added dma_append() as a separate interface.
We expect that some users will still want to place their data at an exact block address via the legacy dma_write(), even when dma_append() is available.
Furthermore, because the block address is only determined after the block I/O completes, it is hard to fold the append command into the implementation of the legacy dma_write(), which is expected to place data at the block address provided by the user.

Avi Kivity <avi@scylladb.com>
Sep 7, 2022, 11:31:44 AM
to Heewon Shin, seastar-dev@googlegroups.com, JinyongHa


On 07/09/2022 12.28, Heewon Shin wrote:


On Fri, Sep 2, 2022 at 12:41 AM, Avi Kivity <a...@scylladb.com> wrote:

On 29/08/2022 15.17, shw...@snu.ac.kr wrote:
> From: Heewon Shin <shw...@snu.ac.kr>
>
> This commit contains implementation of append command on file-related
> classes. Almost changes are just copies of write-related functions
> except the returned value. The returned value of append is struct
> io_result that contains both the status value and assigned block address.
>
> User should be aware of which the 'pos' input parameter should be a start
> address of a zone. If 'pos' is not aligned with a zone's starting
> address, it will failed with the errno EINVAL.


I wonder if we should introduce a new function here, or instead use the
regular dma_write in a new subclass and make dma_write call the
low-level append. the new zoned_nvme_file::dma_write can take care of
supplying the correct "pos" to the lower layer.


Using dma_write means that wrapping the file with output_stream will
work as expected, so integration with applications becomes much simpler.
There are some reasons why we separately added dma_append() a new interface.
We thought that some users still want to place their data to the exact block address via legacy dma_write() although dma_append() is provided.


Isn't that impossible with a zoned device?


Furthermore, because the block address is determined after block IO completion, it is hard to melt the append command into the implementation of legacy dma_write() which is expected to place data to the block address provided by the user.


I thought we'd just ignore the block address (apart from sequencing concurrent write operations generated by output_stream).



Heewon Shin (student, Dept. of Computer Science and Engineering) <shw096@snu.ac.kr>
Sep 16, 2022, 5:12:52 AM
to Avi Kivity, seastar-dev@googlegroups.com, JinyongHa


On Thu, Sep 8, 2022 at 12:31 AM, Avi Kivity <a...@scylladb.com> wrote:


On 07/09/2022 12.28, Heewon Shin wrote:


On Fri, Sep 2, 2022 at 12:41 AM, Avi Kivity <a...@scylladb.com> wrote:

On 29/08/2022 15.17, shw...@snu.ac.kr wrote:
> From: Heewon Shin <shw...@snu.ac.kr>
>
> This commit contains implementation of append command on file-related
> classes. Almost changes are just copies of write-related functions
> except the returned value. The returned value of append is struct
> io_result that contains both the status value and assigned block address.
>
> User should be aware of which the 'pos' input parameter should be a start
> address of a zone. If 'pos' is not aligned with a zone's starting
> address, it will failed with the errno EINVAL.


I wonder if we should introduce a new function here, or instead use the
regular dma_write in a new subclass and make dma_write call the
low-level append. the new zoned_nvme_file::dma_write can take care of
supplying the correct "pos" to the lower layer.


Using dma_write means that wrapping the file with output_stream will
work as expected, so integration with applications becomes much simpler.
There are some reasons why we separately added dma_append() a new interface.
We thought that some users still want to place their data to the exact block address via legacy dma_write() although dma_append() is provided.


Isn't that impossible with a zoned device?

No, it is possible to write data to an exact block address as long as that address does not conflict with the write pointer managed internally by the device.
But it is true that the user has to carefully manage block addresses and the ordering of multiple write requests.


Furthermore, because the block address is determined after block IO completion, it is hard to melt the append command into the implementation of legacy dma_write() which is expected to place data to the block address provided by the user.


I thought we'd just ignore the block address (apart from sequencing concurrent write operations generated by output_stream).

As we mentioned in the "[PATCH 1/2] reactor: add passthru functionality to io_uring backend" mailing thread, the user needs to remember the assigned LBA in order to read the data back from the correct address.
Here is our document on why we need the append command in Ceph Seastore (https://docs.google.com/document/d/1DvADmtqVAoXKSmds8ByqPhVTUoexDQvRlkPvB2xzSfc/edit?usp=sharing).
We have also attached a presentation that motivates the append command (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf).

Thanks
Heewon
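
For completeness, a small illustrative sketch of the read-back step described above, using the interface from this patch series (the 4096-byte length is an assumption; assigned_addr is the byte offset remembered from the append's io_result):

    // Read back the 4 KiB block that a previous dma_append() placed at assigned_addr.
    #include <seastar/core/file.hh>
    #include <seastar/core/temporary_buffer.hh>

    seastar::future<seastar::temporary_buffer<char>>
    read_back(seastar::file f, ssize_t assigned_addr) {
        return f.dma_read<char>(static_cast<uint64_t>(assigned_addr), 4096);
    }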

Avi Kivity <avi@scylladb.com>
Sep 18, 2022, 6:52:09 AM
to Heewon Shin, seastar-dev@googlegroups.com, JinyongHa


On 16/09/2022 12.12, Heewon Shin wrote:


On Thu, Sep 8, 2022 at 12:31 AM, Avi Kivity <a...@scylladb.com> wrote:


On 07/09/2022 12.28, Heewon Shin wrote:


On Fri, Sep 2, 2022 at 12:41 AM, Avi Kivity <a...@scylladb.com> wrote:

On 29/08/2022 15.17, shw...@snu.ac.kr wrote:
> From: Heewon Shin <shw...@snu.ac.kr>
>
> This commit contains implementation of append command on file-related
> classes. Almost changes are just copies of write-related functions
> except the returned value. The returned value of append is struct
> io_result that contains both the status value and assigned block address.
>
> User should be aware of which the 'pos' input parameter should be a start
> address of a zone. If 'pos' is not aligned with a zone's starting
> address, it will failed with the errno EINVAL.


I wonder if we should introduce a new function here, or instead use the
regular dma_write in a new subclass and make dma_write call the
low-level append. the new zoned_nvme_file::dma_write can take care of
supplying the correct "pos" to the lower layer.


Using dma_write means that wrapping the file with output_stream will
work as expected, so integration with applications becomes much simpler.
There are some reasons why we separately added dma_append() a new interface.
We thought that some users still want to place their data to the exact block address via legacy dma_write() although dma_append() is provided.


Isn't that impossible with a zoned device?

No, there is no problem to write data to the exact block address if the block address is not violated with a write pointer managed by device internal.
But it is true that the user should carefully manage block addresses and the sequence of multiple write requests.


Ok.



Furthermore, because the block address is determined after block IO completion, it is hard to melt the append command into the implementation of legacy dma_write() which is expected to place data to the block address provided by the user.


I thought we'd just ignore the block address (apart from sequencing concurrent write operations generated by output_stream).

  As we mentioned on "[PATCH 1/2] reactor: add passthru functionality to io_uring backend" mailing thread, user should remember assigned lba for reading data from correct address.
Here's our document why we need append command in Ceph Seastore (https://docs.google.com/document/d/1DvADmtqVAoXKSmds8ByqPhVTUoexDQvRlkPvB2xzSfc/edit?usp=sharing)


It's protected. I asked for access, but please consider making it public.


And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?


Thanks
Heewon

Avi Kivity <avi@scylladb.com>
Sep 18, 2022, 8:32:24 AM
to Heewon Shin, seastar-dev@googlegroups.com, JinyongHa


On 18/09/2022 13.52, Avi Kivity wrote:



And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?



Having read the specification, I see the number of zones is fixed. But we need APIs to list the available zones and their capacities, so that the application can implement a zone allocator itself.


What is the typical zone size? Is it something on the order of a few megabytes?

Heewon Shin (student, Dept. of Computer Science and Engineering) <shw096@snu.ac.kr>
Sep 19, 2022, 4:53:51 AM
to Avi Kivity, seastar-dev@googlegroups.com, JinyongHa, Jinyong Ha


On Sun, Sep 18, 2022 at 9:32 PM, Avi Kivity <a...@scylladb.com> wrote:


On 18/09/2022 13.52, Avi Kivity wrote:



And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?



Having read the specification, I see the number of zones is fixed. But we need APIs to list the available zones and their capacities, so that the application can implement a zone allocator itself.

We think that zone management is a separate piece of functionality. Users can do it with libzbd (a library for zone management) or ioctl (available in the kernel). Should it be provided at the Seastar library level?


What is the typical zone size? Is it something on the order of a few megabytes?

Typically the zone size depends on the manufacturer, ranging from about 100 MB to 2 GB. The user can obtain it from libzbd or the ioctl mentioned just above.
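
For reference, a blocking sketch of how an application (or a future Seastar API) could list zones, write pointers and capacities with the kernel's BLKREPORTZONE ioctl; the device path, the zone count and the capacity field (kernel headers >= 5.9) are assumptions:

    // Rough, blocking sketch (not Seastar-integrated): report the first zones of a
    // zoned block device and print their geometry. Error handling is minimal.
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/blkzoned.h>
    #include <cstdio>
    #include <cstdlib>

    int main() {
        int fd = ::open("/dev/nvme0n1", O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        unsigned int nr = 16;  // report only the first 16 zones, for brevity
        size_t bytes = sizeof(blk_zone_report) + nr * sizeof(blk_zone);
        auto* report = static_cast<blk_zone_report*>(calloc(1, bytes));
        report->sector = 0;       // start reporting from the beginning of the device
        report->nr_zones = nr;

        if (::ioctl(fd, BLKREPORTZONE, report) < 0) { perror("BLKREPORTZONE"); return 1; }

        for (unsigned int i = 0; i < report->nr_zones; i++) {
            const blk_zone& z = report->zones[i];
            // start/len/wp/capacity are expressed in 512-byte sectors
            printf("zone %u: start=%llu len=%llu wp=%llu cap=%llu cond=%u\n",
                   i, (unsigned long long)z.start, (unsigned long long)z.len,
                   (unsigned long long)z.wp, (unsigned long long)z.capacity, z.cond);
        }
        free(report);
        ::close(fd);
        return 0;
    }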

Avi Kivity <avi@scylladb.com>
Sep 19, 2022, 5:04:38 AM
to Heewon Shin, seastar-dev@googlegroups.com, JinyongHa, Jinyong Ha


On 19/09/2022 11.53, Heewon Shin wrote:


On Sun, Sep 18, 2022 at 9:32 PM, Avi Kivity <a...@scylladb.com> wrote:


On 18/09/2022 13.52, Avi Kivity wrote:



And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?



Having read the specification, I see the number of zones is fixed. But we need APIs to list the available zones and their capacities, so that the application can implement a zone allocator itself.

We think that zone management is separated functionality. Users can do it with libzbd(library for zone management) or IOCTl(available at kernel). Should it be provided by Seastar library level?


ioctl will be a blocking call, no? So users will have to use alien.


It's best to provide it via Seastar so users don't have to glue together stuff from random places.




What is the typical zone size? Is it something on the order of a few megabytes?

Typically, the zone size depends on the manufacturer, about 100MB to 2GB. User can achieve from libzbd or IOCTl mentioned just above


I see, I'm interested in using it for ScyllaDB too.


What happens if a zone is not completely filled? Is the space there wasted, or can it be reused? In ScyllaDB we have a large variety of file sizes.


Heewon Shin (student, Dept. of Computer Science and Engineering) <shw096@snu.ac.kr>
Sep 20, 2022, 4:08:56 AM
to Avi Kivity, seastar-dev@googlegroups.com, JinyongHa, Jinyong Ha


On Mon, Sep 19, 2022 at 6:04 PM, Avi Kivity <a...@scylladb.com> wrote:


On 19/09/2022 11.53, Heewon Shin wrote:


On Sun, Sep 18, 2022 at 9:32 PM, Avi Kivity <a...@scylladb.com> wrote:


On 18/09/2022 13.52, Avi Kivity wrote:



And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?



Having read the specification, I see the number of zones is fixed. But we need APIs to list the available zones and their capacities, so that the application can implement a zone allocator itself.

We think that zone management is separated functionality. Users can do it with libzbd(library for zone management) or IOCTl(available at kernel). Should it be provided by Seastar library level?


ioctl will be a blocking call, no? So users will have to use alien.

 Yes, it will be a blocking call. 

It's best to provide it via Seastar so users don't have to glue together stuff from random places.

Then we need to provide zone management functionality via ioctl. However, can we do this in a separate pull request?



What is the typical zone size? Is it something on the order of a few megabytes?

Typically, the zone size depends on the manufacturer, about 100MB to 2GB. User can achieve from libzbd or IOCTl mentioned just above


I see, I'm interested in using it for ScyllaDB too.


What happens if a zone is not completely filled? Is the space there wasted, or can it be reused? In ScyllaDB we have a large variety of file sizes.

If a zone is not completely filled, it remains in the open state. However, most ZNS devices allow only a limited number of open zones, so the user is responsible for managing the state of the zones.
If the user transitions a zone to the closed state via ioctl, the remaining space in that zone is wasted; otherwise it can be reused.
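
For reference, a blocking sketch of the zone-state ioctls involved (kernel >= 5.5); the device path and zone geometry used here are placeholders:

    // Illustrative sketch: explicitly manage the state of one zone via ioctl.
    // BLKCLOSEZONE releases the device's "open zone" resource for this zone,
    // BLKFINISHZONE marks the zone full (any unwritten capacity is given up),
    // BLKRESETZONE rewinds the write pointer so the zone can be rewritten.
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/blkzoned.h>
    #include <cstdio>

    int main() {
        int fd = ::open("/dev/nvme0n1", O_RDWR);
        if (fd < 0) { perror("open"); return 1; }

        blk_zone_range range;
        range.sector = 0;            // zone start, in 512-byte sectors
        range.nr_sectors = 524288;   // one zone worth of sectors (example: 256 MiB)

        if (::ioctl(fd, BLKCLOSEZONE, &range) < 0) { perror("BLKCLOSEZONE"); }
        // To reclaim the zone entirely instead:
        // if (::ioctl(fd, BLKRESETZONE, &range) < 0) { perror("BLKRESETZONE"); }

        ::close(fd);
        return 0;
    }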


tcha...@gmail.com <tchaikov@gmail.com>
Sep 20, 2022, 11:22:57 AM
to seastar-dev
On Tuesday, September 20, 2022 at 4:08:56 PM UTC+8 Heewon Shin wrote:


On Mon, Sep 19, 2022 at 6:04 PM, Avi Kivity <a...@scylladb.com> wrote:


On 19/09/2022 11.53, Heewon Shin wrote:


On Sun, Sep 18, 2022 at 9:32 PM, Avi Kivity <a...@scylladb.com> wrote:


On 18/09/2022 13.52, Avi Kivity wrote:



And also attached a presentation file that introduces the necessity of append command. (https://www.usenix.org/system/files/vault20_slides_bjorling.pdf)


This is very helpful.


I think we need an API to allocate new zones, no?



Having read the specification, I see the number of zones is fixed. But we need APIs to list the available zones and their capacities, so that the application can implement a zone allocator itself.

We think that zone management is separated functionality. Users can do it with libzbd(library for zone management) or IOCTl(available at kernel). Should it be provided by Seastar library level?


ioctl will be a blocking call, no? So users will have to use alien.

 Yes, it will be a blocking call. 

Probably we can work around this using io_uring? io_uring allows us to send NVME_URING_CMD directly to the NVMe driver, so we should be able to send admin commands to the NVMe driver that way. Functionality-wise, this should be equivalent to what we have using ioctl(), right?

Avi Kivity <avi@scylladb.com>
Sep 25, 2022, 11:42:46 AM
to tcha...@gmail.com, seastar-dev, Heewon Shin, JinyongHa, Jinyong Ha
Do you mean we should expose NVME_CMD_URING to users? This will
eliminate any questions about how the API should look, since users
can just send append via NVME_CMD_URING. After Ceph gains experience
with it, we can think of a better way to expose it to users.

The downside is that Seastar I/O scheduling is bypassed (but append
is likely cheaper than a regular overwrite).

The API could look like

future<uring_cmd_respond>
file::io_uring_command(const void* command, size_t len);

which is then implemented only for block devices.


Perhaps we can pass optional information to tell Seastar how to
classify the request for I/O scheduling purposes.


This can then be used to implement append, administrative requests,
key/value, or whatever the NVMe spec comes up with.


Heewon Shin (student, Dept. of Computer Science and Engineering) <shw096@snu.ac.kr>
Sep 25, 2022, 11:59:03 PM
to Avi Kivity, tcha...@gmail.com, seastar-dev, JinyongHa, Jinyong Ha


On Mon, Sep 26, 2022 at 12:42 AM, Avi Kivity <a...@scylladb.com> wrote:
You're right, both can send admin commands to the NVMe driver.
However, using ioctl, which is exposed as an abstracted interface, is simpler than io_uring, because with io_uring passthrough users have to construct the NVMe command themselves.
Also, for zone management it is better to use the ioctl path that has already been reviewed and verified by the kernel maintainers.



Do you mean we should expose NVME_CMD_URING to users? This will
eliminate any questions about how the API should look like, since users
can just send append via NVME_CMD_URING. After Ceph gains experience
with it, we can think of a better way to expose it to users.

The downside is that Seastar I/O scheduling is bypassed (but, append
likely is cheaper than regular overwrite).

The API could look like

    future<uring_cmd_respond>
    file::io_uring_command(const void* command, size_t len);

which is then implemented only for block devices.


Perhaps we can pass optional information to tell Seastar how to
classify the request for I/O scheduling purposes.


This can then be used to implement append, administrative requests,
key/value, or whatever the NVMe spec comes up with.

We have sent the new patches as a GitHub pull request (https://github.com/scylladb/seastar/pull/1218).
If you have any comments, please give us feedback there.
Thanks,
Heewon Shin

tcha...@gmail.com <tchaikov@gmail.com>
Sep 28, 2022, 10:32:16 AM
to seastar-dev
Yeah, I understand; I just wanted to put all the options on the table. Without https://lore.kernel.org/io-uring/20210127212541...@kernel.dk/t/ we won't have a ready-to-use async ioctl. With the nvme_uring_cmd approach we need to implement a handful of NVMe commands using NVME_URING_CMD, namely constructing an nvme_uring_cmd by setting the opcode and filling in the necessary bits; see https://github.com/torvalds/linux/blob/49c13ed0316d55d73f1c81c66a7e2abd743f9ae6/include/linux/nvme.h#L785-L799.
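
As a concrete illustration of that construction step, a sketch of filling an nvme_uring_cmd for Zone Append (opcode 0x7d, ZSLBA in CDW10/11, 0-based block count in CDW12, per the NVMe ZNS spec); the nsid, buffer, LBA and block size values are placeholders, and len is assumed to be a multiple of block_size:

    // Sketch of the "filling the necessary bits" step for a Zone Append sent via
    // io_uring passthrough (requires kernel headers with struct nvme_uring_cmd).
    #include <linux/nvme_ioctl.h>   // struct nvme_uring_cmd, NVME_URING_CMD_IO
    #include <cstdint>
    #include <cstring>

    nvme_uring_cmd make_zone_append_cmd(uint32_t nsid, uint64_t zslba,
                                        void* buf, uint32_t len, uint32_t block_size) {
        nvme_uring_cmd cmd;
        std::memset(&cmd, 0, sizeof(cmd));
        cmd.opcode = 0x7d;                                       // Zone Append
        cmd.nsid = nsid;
        cmd.addr = reinterpret_cast<uintptr_t>(buf);             // data buffer
        cmd.data_len = len;
        cmd.cdw10 = static_cast<uint32_t>(zslba & 0xffffffff);   // zone start LBA, low
        cmd.cdw11 = static_cast<uint32_t>(zslba >> 32);          // zone start LBA, high
        cmd.cdw12 = len / block_size - 1;                        // logical blocks, 0-based
        return cmd;
        // The struct is then carried in the 128-byte SQE of an IORING_OP_URING_CMD
        // request (cmd_op = NVME_URING_CMD_IO) submitted on the ng char device; the
        // LBA assigned by the device comes back in the command's completion result.
    }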
 



Do you mean we should expose NVME_CMD_URING to users? This will
eliminate any questions about how the API should look like, since users
can just send append via NVME_CMD_URING. After Ceph gains experience
with it, we can think of a better way to expose it to users.

No, I just wanted to point out that, technically, there is another option that would let us do ZNS administration in an async way. Exposing NVME_CMD_URING is the most flexible approach, but it is way too raw, I'd say; people would always have to implement their own higher-level operations on top of the raw NVMe command API.
 

The downside is that Seastar I/O scheduling is bypassed (but, append
likely is cheaper than regular overwrite).

The API could look like

    future<uring_cmd_respond>
    file::io_uring_command(const void* command, size_t len);

which is then implemented only for block devices.


Perhaps we can pass optional information to tell Seastar how to
classify the request for I/O scheduling purposes.


This can then be used to implement append, administrative requests,
key/value, or whatever the NVMe spec comes up with.

We sent the new patches by using git pull-requests. (https://github.com/scylladb/seastar/pull/1218)
If you have any comments, please give us the feedback on this link.

Sure, will continue there.