[PATCH 1/6] block: Add config option to not allow writing to mounted devices

193 views
Skip to first unread message

Jan Kara

unread,
Jul 4, 2023, 8:57:05 AM7/4/23
to linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Writing to mounted devices is dangerous and can lead to filesystem
corruption as well as crashes. Furthermore syzbot comes with more and
more involved examples how to corrupt block device under a mounted
filesystem leading to kernel crashes and reports we can do nothing
about. Add tracking of writers to each block device and a kernel cmdline
argument which controls whether writes to block devices open with
BLK_OPEN_BLOCK_WRITES flag are allowed. We will make filesystems use
this flag for used devices.

Syzbot can use this cmdline argument option to avoid uninteresting
crashes. Also users whose userspace setup does not need writing to
mounted block devices can set this option for hardening.

Link: https://lore.kernel.org/all/60788e5d-5c7c-1142...@linaro.org
Signed-off-by: Jan Kara <ja...@suse.cz>
---
block/Kconfig | 16 ++++++++++
block/bdev.c | 63 ++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 1 +
include/linux/blkdev.h | 3 ++
4 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/block/Kconfig b/block/Kconfig
index 86122e459fe0..8b4fa105b854 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,22 @@ config BLK_DEV_INTEGRITY_T10
select CRC_T10DIF
select CRC64_ROCKSOFT

+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache very likely
+ going to cause filesystem corruption. It is also rather easy to crash
+ the kernel in this way since the filesystem has no practical way of
+ detecting these writes to buffer cache and verifying its metadata
+ integrity. However there are some setups that need this capability
+ like running fsck on read-only mounted root device, modifying some
+ features on mounted ext4 filesystem, and similar. If you say N, the
+ kernel will prevent processes from writing to block devices that are
+ mounted by filesystems which provides some more protection from runaway
+ priviledged processes. If in doubt, say Y. The configuration can be
+ overridden with bdev_allow_write_mounted boot option.
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
diff --git a/block/bdev.c b/block/bdev.c
index 523ea7289834..346e68dbf0bf 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
#include "../fs/internal.h"
#include "blk.h"

+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -744,7 +747,34 @@ void blkdev_put_no_open(struct block_device *bdev)
{
put_device(&bdev->bd_device);
}
-
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = 0;
+}
+
+static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
+{
+ if (!bdev_allow_write_mounted) {
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_BLOCK_WRITES && bdev->bd_writers > 0)
+ return false;
+ }
+ return true;
+}
+
/**
* blkdev_get_by_dev - open a block device by device number
* @dev: device number of block device to open
@@ -787,6 +817,10 @@ struct bdev_handle *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
goto free_handle;

+ /* Blocking writes requires exclusive opener */
+ if (mode & BLK_OPEN_BLOCK_WRITES && !holder)
+ return ERR_PTR(-EINVAL);
+
bdev = blkdev_get_no_open(dev);
if (!bdev) {
ret = -ENXIO;
@@ -814,12 +848,21 @@ struct bdev_handle *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
+ ret = -EBUSY;
+ if (!blkdev_open_compatible(bdev, mode))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto put_module;
+ if (!bdev_allow_write_mounted) {
+ if (mode & BLK_OPEN_BLOCK_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+ }
if (holder) {
bd_finish_claiming(bdev, holder, hops);

@@ -842,6 +885,7 @@ struct bdev_handle *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
disk_unblock_events(disk);
handle->bdev = bdev;
handle->holder = holder;
+ handle->mode = mode;
return handle;
put_module:
module_put(disk->fops->owner);
@@ -914,6 +958,14 @@ void blkdev_put(struct bdev_handle *handle)
sync_blockdev(bdev);

mutex_lock(&disk->open_mutex);
+ if (!bdev_allow_write_mounted) {
+ /* The exclusive opener was blocking writes? Unblock them. */
+ if (handle->mode & BLK_OPEN_BLOCK_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (handle->mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+ }
+
if (handle->holder)
bd_end_claim(bdev, handle->holder);

@@ -1070,3 +1122,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)

blkdev_put_no_open(bdev);
}
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0bad62cca3d0..5bf0d2d458fd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -70,6 +70,7 @@ struct block_device {
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool bd_make_it_fail;
#endif
+ int bd_writers;
/*
* keep this out-of-line as it's both big and not needed in the fast
* path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4ae3647a0322..ca467525e6e4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t;
#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
/* open for "writes" only for ioctls (specialy hack for floppy.c) */
#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
+/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
+#define BLK_OPEN_BLOCK_WRITES ((__force blk_mode_t)(1 << 5))

struct gendisk {
/*
@@ -1474,6 +1476,7 @@ struct blk_holder_ops {
struct bdev_handle {
struct block_device *bdev;
void *holder;
+ blk_mode_t mode;
};

struct bdev_handle *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
--
2.35.3

Jan Kara

unread,
Jul 4, 2023, 8:57:06 AM7/4/23
to linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Ask block layer to not allow other writers to open block device used
for ext4 journal.

Signed-off-by: Jan Kara <ja...@suse.cz>
---
fs/ext4/super.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cbeb8a555fe3..8181285b7549 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1112,8 +1112,9 @@ static struct bdev_handle *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct bdev_handle *handle;

- handle = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &ext4_holder_ops);
+ handle = blkdev_get_by_dev(dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_BLOCK_WRITES,
+ sb, &ext4_holder_ops);
if (IS_ERR(handle))
goto fail;
return handle;
--
2.35.3

Jan Kara

unread,
Jul 4, 2023, 8:57:06 AM7/4/23
to linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Ask block layer to block writes to block devices mounted by filesystems.

Signed-off-by: Jan Kara <ja...@suse.cz>
---
include/linux/blkdev.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca467525e6e4..a1fb90f3887e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1471,7 +1471,8 @@ struct blk_holder_ops {
* as stored in sb->s_flags.
*/
#define sb_open_mode(flags) \
- (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
+ (BLK_OPEN_READ | BLK_OPEN_BLOCK_WRITES | \
+ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))

struct bdev_handle {
struct block_device *bdev;
--
2.35.3

Jan Kara

unread,
Jul 4, 2023, 8:57:06 AM7/4/23
to linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Hello!

This is second version of the patches to add config option to not allow writing
to mounted block devices. For motivation why this is interesting see patch 1/6.
I've been testing the patches more extensively this time and I've found couple
of things that get broken by disallowing writes to mounted block devices:
1) Bind mounts get broken because get_tree_bdev() / mount_bdev() first try to
claim the bdev before searching whether it is already mounted. Patch 6
reworks the mount code to avoid this problem.
2) btrfs mounting is likely having the same problem as 1). It should be fixable
AFAICS but for now I've left it alone until we settle on the rest of the
series.
3) "mount -o loop" gets broken because util-linux keeps the loop device open
read-write when attempting to mount it. Hopefully fixable within util-linux.
4) resize2fs online resizing gets broken because it tries to open the block
device read-write only to call resizing ioctl. Trivial to fix within
e2fsprogs.

Likely there will be other breakage I didn't find yet but overall the breakage
looks minor enough that the option might be useful. Definitely good enough
for syzbot fuzzing and likely good enough for hardening of systems with
more tightened security.

This patch set is based on the patches making blkdev_get_by_*() return
bdev_handle [1].

Changes since v1:
* Added kernel cmdline argument to toggle whether writing to mounted block
devices is allowed or not
* Fixed handling of partitions
* Limit write blocking only to devices open with explicit BLK_OPEN_BLOCK_WRITES
flag

Honza

[1] https://lore.kernel.org/all/202306291652...@suse.cz

Previous versions:
v1: https://lore.kernel.org/all/2023061216161...@suse.cz

Jan Kara

unread,
Jul 4, 2023, 8:57:06 AM7/4/23
to linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov, Jan Kara
When we don't allow opening of mounted block devices for writing, bind
mounting is broken because the bind mount tries to open the block device
before finding the superblock for it already exists. Reorganize the
mounting code to first look whether the superblock for a particular
device is already mounted and open the block device only if it is not.

Signed-off-by: Jan Kara <ja...@suse.cz>
---
fs/super.c | 188 +++++++++++++++++++++++++----------------------------
1 file changed, 89 insertions(+), 99 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index ea135fece772..fdf1e286926e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1228,13 +1228,7 @@ static const struct blk_holder_ops fs_holder_ops = {

static int set_bdev_super(struct super_block *s, void *data)
{
- s->s_bdev_handle = data;
- s->s_bdev = s->s_bdev_handle->bdev;
- s->s_dev = s->s_bdev->bd_dev;
- s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
-
- if (bdev_stable_writes(s->s_bdev))
- s->s_iflags |= SB_I_STABLE_WRITES;
+ s->s_dev = *(dev_t *)data;
return 0;
}

@@ -1246,7 +1240,53 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
return !(s->s_iflags & SB_I_RETIRED) &&
- s->s_bdev == ((struct bdev_handle *)fc->sget_key)->bdev;
+ s->s_dev == *(dev_t *)fc->sget_key;
+}
+
+static int setup_bdev_super(struct super_block *s, int sb_flags,
+ struct fs_context *fc)
+{
+ struct bdev_handle *bdev_handle;
+
+ bdev_handle = blkdev_get_by_dev(s->s_dev, sb_open_mode(sb_flags),
+ s->s_type, &fs_holder_ops);
+ if (IS_ERR(bdev_handle)) {
+ if (fc)
+ errorf(fc, "%s: Can't open blockdev", fc->source);
+ return PTR_ERR(bdev_handle);
+ }
+ spin_lock(&sb_lock);
+ s->s_bdev_handle = bdev_handle;
+ s->s_bdev = bdev_handle->bdev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
+
+ if (bdev_stable_writes(s->s_bdev))
+ s->s_iflags |= SB_I_STABLE_WRITES;
+ spin_unlock(&sb_lock);
+
+ /*
+ * Until SB_BORN flag is set, there can be no active superblock
+ * references and thus no filesystem freezing. get_active_super()
+ * will just loop waiting for SB_BORN so even freeze_bdev() cannot
+ * proceed. It is enough to check bdev was not frozen before we set
+ * s_bdev.
+ */
+ mutex_lock(&s->s_bdev->bd_fsfreeze_mutex);
+ if (s->s_bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
+ if (fc)
+ warnf(fc, "%pg: Can't mount, blockdev is frozen",
+ s->s_bdev);
+ return -EBUSY;
+ }
+ mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
+
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", s->s_bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+ fc->fs_type->name, s->s_id);
+ sb_set_blocksize(s, block_size(s->s_bdev));
+
+ return 0;
}

/**
@@ -1258,75 +1298,51 @@ int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *,
struct fs_context *))
{
- struct bdev_handle *bdev_handle;
- struct block_device *bdev;
+ dev_t dev;
struct super_block *s;
int error = 0;

if (!fc->source)
return invalf(fc, "No source specified");

- bdev_handle = blkdev_get_by_path(fc->source,
- sb_open_mode(fc->sb_flags),
- fc->fs_type, &fs_holder_ops);
- if (IS_ERR(bdev_handle)) {
- errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev_handle);
- }
- bdev = bdev_handle->bdev;
-
- /* Once the superblock is inserted into the list by sget_fc(), s_umount
- * will protect the lockfs code from trying to start a snapshot while
- * we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev_handle);
- return -EBUSY;
+ error = lookup_bdev(fc->source, &dev);
+ if (error) {
+ errorf(fc, "%s: Can't lookup blockdev", fc->source);
+ return error;
}

fc->sb_flags |= SB_NOSEC;
- fc->sget_key = bdev_handle;
+ fc->sget_key = &dev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- blkdev_put(bdev_handle);
+ if (IS_ERR(s))
return PTR_ERR(s);
- }

if (s->s_root) {
/* Don't summarily change the RO/RW state. */
if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
- warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+ warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
deactivate_locked_super(s);
- blkdev_put(bdev_handle);
return -EBUSY;
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires open_mutex and
- * can't be called under s_umount. Drop s_umount temporarily.
- * This is safe as we're holding an active reference.
+ * We drop s_umount here because we need to lookup bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * invalidate_bdev()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
*/
up_write(&s->s_umount);
- blkdev_put(bdev_handle);
+ error = setup_bdev_super(s, fc->sb_flags, fc);
down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fc->fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, fc);
+ if (!error)
+ error = fill_super(s, fc);
if (error) {
deactivate_locked_super(s);
return error;
}

s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
+ s->s_bdev->bd_super = s;
}

BUG_ON(fc->root);
@@ -1337,81 +1353,53 @@ EXPORT_SYMBOL(get_tree_bdev);

static int test_bdev_super(struct super_block *s, void *data)
{
- return !(s->s_iflags & SB_I_RETIRED) &&
- s->s_bdev == ((struct bdev_handle *)data)->bdev;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
- struct bdev_handle *bdev_handle;
- struct block_device *bdev;
struct super_block *s;
int error = 0;
+ dev_t dev;

- bdev_handle = blkdev_get_by_path(dev_name, sb_open_mode(flags),
- fs_type, &fs_holder_ops);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
- bdev = bdev_handle->bdev;
+ error = lookup_bdev(dev_name, &dev);
+ if (error)
+ return ERR_PTR(error);

- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- error = -EBUSY;
- goto error_bdev;
- }
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
- bdev_handle);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ flags |= SB_NOSEC;
+ s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
if (IS_ERR(s))
- goto error_s;
+ return ERR_CAST(s);

if (s->s_root) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
deactivate_locked_super(s);
- error = -EBUSY;
- goto error_bdev;
+ return ERR_PTR(-EBUSY);
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires open_mutex and
- * can't be called under s_umount. Drop s_umount temporarily.
- * This is safe as we're holding an active reference.
+ * We drop s_umount here because we need to lookup bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * invalidate_bdev()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
*/
up_write(&s->s_umount);
- blkdev_put(bdev_handle);
+ error = setup_bdev_super(s, flags, NULL);
down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ if (!error)
+ error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
- goto error;
+ return ERR_PTR(error);
}

s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
+ s->s_bdev->bd_super = s;
}

return dget(s->s_root);
-
-error_s:
- error = PTR_ERR(s);
-error_bdev:
- blkdev_put(bdev_handle);
-error:
- return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);

@@ -1419,10 +1407,12 @@ void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;

- bdev->bd_super = NULL;
generic_shutdown_super(sb);
- sync_blockdev(bdev);
- blkdev_put(sb->s_bdev_handle);
+ if (bdev) {
+ bdev->bd_super = NULL;
+ sync_blockdev(bdev);
+ blkdev_put(sb->s_bdev_handle);
+ }
}

EXPORT_SYMBOL(kill_block_super);
--
2.35.3

Christian Brauner

unread,
Jul 4, 2023, 9:40:48 AM7/4/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 02:56:48PM +0200, Jan Kara wrote:
> Hello!
>
> This is second version of the patches to add config option to not allow writing
> to mounted block devices. For motivation why this is interesting see patch 1/6.
> I've been testing the patches more extensively this time and I've found couple
> of things that get broken by disallowing writes to mounted block devices:
> 1) Bind mounts get broken because get_tree_bdev() / mount_bdev() first try to
> claim the bdev before searching whether it is already mounted. Patch 6
> reworks the mount code to avoid this problem.
> 2) btrfs mounting is likely having the same problem as 1). It should be fixable

It likely would. Note that I've got a series to port btrfs to the new
mount api that I sent out which changes btrfs mounting quite
significantly.

Christian Brauner

unread,
Jul 4, 2023, 9:59:49 AM7/4/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> When we don't allow opening of mounted block devices for writing, bind
> mounting is broken because the bind mount tries to open the block device

Sorry, I'm going to be annoying now...

Afaict, the analysis is misleading but I'm happy to be corrected ofc.
Finding an existing superblock is independent of mounts. get_tree_bdev()
and mount_bdev() are really only interested in finding a matching
superblock independent of whether or not a mount for it already exists.
IOW, if you had two filesystem contexts for the same block device with
different mount options:

T1 T2
fd_fs = fsopen("ext4"); fd_fs = fsopen("ext4");
fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");

// create superblock
fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
// finds superblock of T1 if opts are compatible
fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)

you should have the issue that you're describing. But for neither of
them does a mount already exist as the first mount here would only be
created when:

T1 T2
fsmount(fd_fs); fsmount(fd_fs);

is called at which point the whole superblock issue is already settled.
Afterwards, both mounts of both T1 and T2 refer to the same superblock -
as long as the fs and the mount options support this ofc.

---

Btw, I talked to Karel a while ago. I'd like to add an option like:

FSCONFIG_CMD_CREATE_EXCL

or similar to fsconfig() that fails in scenarios like the one I
described in T1 and T2. IOW, T1 succeeds but T2 would fail even if the
filesystem would consider the sb to be compatible. This way userspace
can be sure that they did indeed succeed in creating the superblock...

Colin Walters

unread,
Jul 4, 2023, 12:00:10 PM7/4/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Theodore Ts'o, syzkaller, Alexander Popov, Eric Biggers, xfs, linux...@vger.kernel.org, Dmitry Vyukov
s/very/is very/

> + going to cause filesystem corruption. It is also rather easy to crash
> + the kernel in this way since the filesystem has no practical way of
> + detecting these writes to buffer cache and verifying its metadata
> + integrity. However there are some setups that need this capability
> + like running fsck on read-only mounted root device, modifying some
> + features on mounted ext4 filesystem, and similar. If you say N, the
> + kernel will prevent processes from writing to block devices that are
> + mounted by filesystems which provides some more protection from runaway
> + priviledged processes. If in doubt, say Y. The configuration can be

s/priviledged/privileged/

> + overridden with bdev_allow_write_mounted boot option.

s/with/with the/

> +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
> +#define BLK_OPEN_BLOCK_WRITES ((__force blk_mode_t)(1 << 5))

Bikeshed but: I think BLK and BLOCK "stutter" here. The doc comment already uses the term "exclusive" so how about BLK_OPEN_EXCLUSIVE ?

Eric Biggers

unread,
Jul 4, 2023, 12:52:45 PM7/4/23
to Colin Walters, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Theodore Ts'o, syzkaller, Alexander Popov, xfs, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 11:56:44AM -0400, Colin Walters wrote:
> > +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
> > +#define BLK_OPEN_BLOCK_WRITES ((__force blk_mode_t)(1 << 5))
>
> Bikeshed but: I think BLK and BLOCK "stutter" here. The doc comment already
> uses the term "exclusive" so how about BLK_OPEN_EXCLUSIVE ?

Yeah, using "block" in two different ways at the same time is confusing.
BLK_OPEN_EXCLUSIVE would probably be good, as would something like
BLK_OPEN_RESTRICT_WRITES.

I can't figure out how to apply this patch series, so I can't really see it in
context though.

- Eric

Eric Biggers

unread,
Jul 4, 2023, 2:44:20 PM7/4/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Does this prevent the underlying storage from being written to? Say if the
mounted block device is /dev/sda1 and someone tries to write to /dev/sda in the
region that contains sda1.

I *think* the answer is no, writes to /dev/sda are still allowed since the goal
is just to prevent writes to the buffer cache of mounted block devices, not
writes to the underlying storage. That is really something that should be
stated explicitly, though.

- Eric

Theodore Ts'o

unread,
Jul 4, 2023, 4:55:38 PM7/4/23
to Eric Biggers, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 11:44:16AM -0700, Eric Biggers wrote:
> Does this prevent the underlying storage from being written to? Say if the
> mounted block device is /dev/sda1 and someone tries to write to /dev/sda in the
> region that contains sda1.
>
> I *think* the answer is no, writes to /dev/sda are still allowed since the goal
> is just to prevent writes to the buffer cache of mounted block devices, not
> writes to the underlying storage. That is really something that should be
> stated explicitly, though.

Well, at the risk of giving the Syzbot developers any ideas, we also
aren't preventing someone from opening the SCSI generic device and
manually sending raw SCSI commands to modify a mounted block device,
and then no doubt they would claim that the kernel config
CONFIG_CHR_DEV_SG is "insecure", and so therefore any kernel that
could support writing CD or DVD's is by definition "insecure" by their
lights...

Which is why talking about security models without having an agreed
upon threat model is really a waste of time...

- Ted

Jan Kara

unread,
Jul 5, 2023, 6:30:40 AM7/5/23
to Eric Biggers, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
You are correct. The answer is "no" because as Ted says, there are many
ways to do that anyway and for a filesystem it is generally not much
different from just corrupted fs image. I'll explicitely mention it in the
config text, that's a good idea.

Honza
--
Jan Kara <ja...@suse.com>
SUSE Labs, CR

Mike Fleetwood

unread,
Jul 5, 2023, 8:46:30 AM7/5/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, 4 Jul 2023 at 13:57, Jan Kara <ja...@suse.cz> wrote:
>
> Hello!
>
> This is second version of the patches to add config option to not allow writing
> to mounted block devices. For motivation why this is interesting see patch 1/6.
> I've been testing the patches more extensively this time and I've found couple
> of things that get broken by disallowing writes to mounted block devices:
> 1) Bind mounts get broken because get_tree_bdev() / mount_bdev() first try to
> claim the bdev before searching whether it is already mounted. Patch 6
> reworks the mount code to avoid this problem.
> 2) btrfs mounting is likely having the same problem as 1). It should be fixable
> AFAICS but for now I've left it alone until we settle on the rest of the
> series.
> 3) "mount -o loop" gets broken because util-linux keeps the loop device open
> read-write when attempting to mount it. Hopefully fixable within util-linux.
> 4) resize2fs online resizing gets broken because it tries to open the block
> device read-write only to call resizing ioctl. Trivial to fix within
> e2fsprogs.
>
> Likely there will be other breakage I didn't find yet but overall the breakage
> looks minor enough that the option might be useful. Definitely good enough
> for syzbot fuzzing and likely good enough for hardening of systems with
> more tightened security.

5) Online e2label will break because it directly writes to the ext2/3/4
superblock while the FS is mounted to set the new label. Ext4 driver
will have to implement the SETFSLABEL ioctl() and e2label will have
to use it, matching what happens for online labelling of btrfs and
xfs.

Thanks,
Mike

Jan Kara

unread,
Jul 5, 2023, 9:00:36 AM7/5/23
to Christian Brauner, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue 04-07-23 15:59:41, Christian Brauner wrote:
> On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > When we don't allow opening of mounted block devices for writing, bind
> > mounting is broken because the bind mount tries to open the block device
>
> Sorry, I'm going to be annoying now...
>
> Afaict, the analysis is misleading but I'm happy to be corrected ofc.

I'm not sure what your objection exactly is. Probably I was imprecise in my
changelog description. What gets broken by not allowing RW open of a
mounted block device is:

mount -t ext4 /dev/sda1 /mnt1
mount -t ext4 /dev/sda1 /mnt2

The second mount should create another mount of the superblock created by
the first mount but before that is done, get_tree_bdev() tries to open the
block device and fails when only patches 1 & 2 are applied. This patch
fixes that.

> Finding an existing superblock is independent of mounts. get_tree_bdev()
> and mount_bdev() are really only interested in finding a matching
> superblock independent of whether or not a mount for it already exists.
> IOW, if you had two filesystem contexts for the same block device with
> different mount options:
>
> T1 T2
> fd_fs = fsopen("ext4"); fd_fs = fsopen("ext4");
> fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");
>
> // create superblock
> fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> // finds superblock of T1 if opts are compatible
> fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
>
> you should have the issue that you're describing.

Correct, this will get broken when not allowing RW open for mounted block
devices as well because the second fsconfig(fd_fs, FSCONFIG_CMD_CREATE,
...) will fail to open the block device in get_tree_bdev(). But again this
patch should fix that.

> But for neither of them does a mount already exist as the first mount
> here would only be created when:
>
> T1 T2
> fsmount(fd_fs); fsmount(fd_fs);
>
> is called at which point the whole superblock issue is already settled.
> Afterwards, both mounts of both T1 and T2 refer to the same superblock -
> as long as the fs and the mount options support this ofc.

I guess the confusion comes from me calling "mount" an operation as
performed by the mount(8) command but which is in fact multiple operations
with the new mount API. Anyway, is the motivation of this patch clearer
now?

Christian Brauner

unread,
Jul 5, 2023, 9:46:26 AM7/5/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Wed, Jul 05, 2023 at 03:00:33PM +0200, Jan Kara wrote:
> On Tue 04-07-23 15:59:41, Christian Brauner wrote:
> > On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > > When we don't allow opening of mounted block devices for writing, bind
> > > mounting is broken because the bind mount tries to open the block device
> >
> > Sorry, I'm going to be annoying now...
> >
> > Afaict, the analysis is misleading but I'm happy to be corrected ofc.
>
> I'm not sure what your objection exactly is. Probably I was imprecise in my
> changelog description. What gets broken by not allowing RW open of a
> mounted block device is:
>
> mount -t ext4 /dev/sda1 /mnt1
> mount -t ext4 /dev/sda1 /mnt2
>
> The second mount should create another mount of the superblock created by
> the first mount but before that is done, get_tree_bdev() tries to open the
> block device and fails when only patches 1 & 2 are applied. This patch
> fixes that.

My objection is that this has nothing to do with mounts but with
superblocks. :) No mount need exist for this issue to appear. And I would
prefer if we keep superblock and mount separate as this leads to unclear
analysis and changelogs.
I'm clear about what you're doing here. I would just like to not have
mounts brought into the changelog. Even before the new mount api what
you were describing was technically a superblock only issue. If someone
reads the changelog I want them to be able to clearly see that this is a
fix for superblocks, not mounts.

Especially, since the code you touch really only has to to with
superblocks.
Let me - non ironically - return the question: Is my own request clearer
now?

Darrick J. Wong

unread,
Jul 5, 2023, 11:12:22 AM7/5/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 02:56:49PM +0200, Jan Kara wrote:
This might be premature at this point, but I wonder if you've given any
consideration to adding a lockdown prohibition as well? e.g.

static inline bool bdev_allow_write_mounted(void)
{
if (security_locked_down(LOCKDOWN_MOUNTED_BDEV) != 0)
return false;

return __bdev_allow_write_mounted;
}

--D

Jan Kara

unread,
Jul 5, 2023, 12:14:04 PM7/5/23
to Christian Brauner, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Yes, I understand now :). Thanks for explanation. I'll rephrase the
changelog to speak about superblocks.

Christoph Hellwig

unread,
Jul 6, 2023, 11:55:28 AM7/6/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> When we don't allow opening of mounted block devices for writing, bind
> mounting is broken because the bind mount tries to open the block device
> before finding the superblock for it already exists. Reorganize the
> mounting code to first look whether the superblock for a particular
> device is already mounted and open the block device only if it is not.

Warning: this might be a rathole.

I really hate how mount_bdev / get_tree_bdev try to deal with multiple
mounts.

The idea to just open the device and work from there just feels very
bogus.

There is really no good reason to have the bdev to find a superblock,
the dev_t does just fine (and in fact I have a patch to remove
the bdev based get_super and just use the dev_t based one all the
time). So I'd really like to actually turn this around and only
open when we need to allocate a new super block. That probably
means tearning sget_fc apart a bit, so it will turn into a fair
amount of work, but I think it's the right thing to do.

Jan Kara

unread,
Jul 6, 2023, 12:12:59 PM7/6/23
to Christoph Hellwig, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Well, this is exactly what this patch does - we use dev_t to lookup the
superblock in sget_fc() and we open the block device only if we cannot find
matching superblock and need to create a new one...

Christian Brauner

unread,
Jul 7, 2023, 3:39:13 AM7/7/23
to Jan Kara, Christoph Hellwig, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Can you do this rework independent of the bdev_handle work that you're
doing so this series doesn't depend on the other work and we can get
the VFS bits merged for this?

Jan Kara

unread,
Jul 7, 2023, 6:48:39 AM7/7/23
to Christian Brauner, Jan Kara, Christoph Hellwig, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Yeah, it should be doable. I'll have a look into it.

Christoph Hellwig

unread,
Jul 7, 2023, 7:30:27 AM7/7/23
to Jan Kara, Christoph Hellwig, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Thu, Jul 06, 2023 at 06:12:55PM +0200, Jan Kara wrote:
> Well, this is exactly what this patch does - we use dev_t to lookup the
> superblock in sget_fc() and we open the block device only if we cannot find
> matching superblock and need to create a new one...

Hah, that's what you get for writing one last mail before getting on
an airplane without reading the patch. This just sounded like a
workaround especially being at the end of the series.

Christoph Hellwig

unread,
Jul 7, 2023, 7:31:13 AM7/7/23
to Christian Brauner, Jan Kara, Christoph Hellwig, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
On Fri, Jul 07, 2023 at 09:39:05AM +0200, Christian Brauner wrote:
> Can you do this rework independent of the bdev_handle work that you're
> doing so this series doesn't depend on the other work and we can get
> the VFS bits merged for this?

It really should be before it. I have a few other things related to
it, so if Jan doesn't mind I'd be happy to take it over and post a
series with a version of this and some sget and get_super rework.

Jan Kara

unread,
Jul 7, 2023, 8:28:34 AM7/7/23
to Christoph Hellwig, Christian Brauner, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
OK, sure go ahead. I'll be on vacation with my family in the coming weeks
anyway so I won't have time to seriously dwelve into this. So I'll just see
what's the situation once I return.

Jan Kara

unread,
Aug 14, 2023, 12:39:54 PM8/14/23
to Mike Fleetwood, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, Eric Biggers, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Thanks, added to the description.

Jan Kara

unread,
Aug 14, 2023, 12:41:06 PM8/14/23
to Eric Biggers, Colin Walters, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Theodore Ts'o, syzkaller, Alexander Popov, xfs, linux...@vger.kernel.org, Dmitry Vyukov
On Tue 04-07-23 09:52:40, Eric Biggers wrote:
> On Tue, Jul 04, 2023 at 11:56:44AM -0400, Colin Walters wrote:
> > > +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
> > > +#define BLK_OPEN_BLOCK_WRITES ((__force blk_mode_t)(1 << 5))
> >
> > Bikeshed but: I think BLK and BLOCK "stutter" here. The doc comment already
> > uses the term "exclusive" so how about BLK_OPEN_EXCLUSIVE ?
>
> Yeah, using "block" in two different ways at the same time is confusing.
> BLK_OPEN_EXCLUSIVE would probably be good, as would something like
> BLK_OPEN_RESTRICT_WRITES.

BLK_OPEN_RESTRICT_WRITES sounds good to me. I'll rename the flag.

Jan Kara

unread,
Aug 14, 2023, 12:43:25 PM8/14/23
to Colin Walters, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Theodore Ts'o, syzkaller, Alexander Popov, Eric Biggers, xfs, linux...@vger.kernel.org, Dmitry Vyukov
Thanks for the language fixes!

> > +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
> > +#define BLK_OPEN_BLOCK_WRITES ((__force blk_mode_t)(1 << 5))
>
> Bikeshed but: I think BLK and BLOCK "stutter" here. The doc comment
> already uses the term "exclusive" so how about BLK_OPEN_EXCLUSIVE ?

Well, we already have exclusive opens of block devices which are different
(they are exclusive only wrt other exclusive opens) so BLK_OPEN_EXCLUSIVE
will be really confusing. But BLK_OPEN_RESTRICT_WRITES sounds good to me.

Eric Biggers

unread,
Aug 22, 2023, 1:35:28 AM8/22/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi Jan,

On Tue, Jul 04, 2023 at 02:56:49PM +0200, Jan Kara wrote:
> Writing to mounted devices is dangerous and can lead to filesystem
> corruption as well as crashes. Furthermore syzbot comes with more and
> more involved examples how to corrupt block device under a mounted
> filesystem leading to kernel crashes and reports we can do nothing
> about. Add tracking of writers to each block device and a kernel cmdline
> argument which controls whether writes to block devices open with
> BLK_OPEN_BLOCK_WRITES flag are allowed. We will make filesystems use
> this flag for used devices.
>
> Syzbot can use this cmdline argument option to avoid uninteresting
> crashes. Also users whose userspace setup does not need writing to
> mounted block devices can set this option for hardening.
>
> Link: https://lore.kernel.org/all/60788e5d-5c7c-1142...@linaro.org
> Signed-off-by: Jan Kara <ja...@suse.cz>

Can you make it clear that the important thing this patch prevents is writes to
the block device's buffer cache, not writes to the underlying storage? It's
super important not to confuse the two cases.

Related to this topic, I wonder if there is any value in providing an option
that would allow O_DIRECT writes but forbid buffered writes? Would that be
useful for any of the known use cases for writing to mounted block devices?

- Eric

Jan Kara

unread,
Aug 22, 2023, 6:12:01 AM8/22/23
to Eric Biggers, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi Eric!

On Mon 21-08-23 22:35:23, Eric Biggers wrote:
> On Tue, Jul 04, 2023 at 02:56:49PM +0200, Jan Kara wrote:
> > Writing to mounted devices is dangerous and can lead to filesystem
> > corruption as well as crashes. Furthermore syzbot comes with more and
> > more involved examples how to corrupt block device under a mounted
> > filesystem leading to kernel crashes and reports we can do nothing
> > about. Add tracking of writers to each block device and a kernel cmdline
> > argument which controls whether writes to block devices open with
> > BLK_OPEN_BLOCK_WRITES flag are allowed. We will make filesystems use
> > this flag for used devices.
> >
> > Syzbot can use this cmdline argument option to avoid uninteresting
> > crashes. Also users whose userspace setup does not need writing to
> > mounted block devices can set this option for hardening.
> >
> > Link: https://lore.kernel.org/all/60788e5d-5c7c-1142...@linaro.org
> > Signed-off-by: Jan Kara <ja...@suse.cz>
>
> Can you make it clear that the important thing this patch prevents is
> writes to the block device's buffer cache, not writes to the underlying
> storage? It's super important not to confuse the two cases.

Right, I've already updated the description of the help text in the kconfig
to explicitely explain that this does not prevent underlying device content
from being modified, it just prevents writes the the block device itself.
But I guess I can also explain this (with a bit more technical details) in
the changelog. Good idea.

> Related to this topic, I wonder if there is any value in providing an option
> that would allow O_DIRECT writes but forbid buffered writes? Would that be
> useful for any of the known use cases for writing to mounted block devices?

I'm not sure how useful that would be but it would be certainly rather
difficult to implement. The problem is we can currently fallback from
direct to buffered IO as we see fit, also we need to invalidate page cache
while doing direct IO which can fail etc. So it will be a rather nasty can
of worms to open...

Aleksandr Nogikh

unread,
Oct 19, 2023, 5:17:13 AM10/19/23
to Jan Kara, Eric Biggers, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi Jan,

Thank you for the series!

Have you already had a chance to push an updated version of it?
I tried to search LKML, but didn't find anything.

Or did you decide to put it off until later?

--
Aleksandr

Jan Kara

unread,
Oct 24, 2023, 7:10:19 AM10/24/23
to Aleksandr Nogikh, Jan Kara, Eric Biggers, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi!

On Thu 19-10-23 11:16:55, Aleksandr Nogikh wrote:
> Thank you for the series!
>
> Have you already had a chance to push an updated version of it?
> I tried to search LKML, but didn't find anything.
>
> Or did you decide to put it off until later?

So there is preliminary series sitting in VFS tree that changes how block
devices are open. There are some conflicts with btrfs tree and bcachefs
merge that complicate all this (plus there was quite some churn in VFS
itself due to changing rules how block devices are open) so I didn't push
out the series that actually forbids opening of mounted block devices
because that would cause a "merge from hell" issues. I plan to push out the
remaining patches once the merge window closes and all the dependencies are
hopefully in a stable state. Maybe I can push out the series earlier based
on linux-next so that people can have a look at the current state.

Aleksandr Nogikh

unread,
Oct 27, 2023, 8:06:14 AM10/27/23
to Jan Kara, Eric Biggers, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
I see, thanks for sharing the details!

We'll set CONFIG_BLK_DEV_WRITE_MOUNTED=n on syzbot once the series is
in linux-next.

Jan Kara

unread,
Nov 1, 2023, 1:43:29 PM11/1/23
to Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Jan Kara, Darrick J . Wong
Ask block layer to not allow other writers to open block devices used
for xfs log and realtime devices.

Reviewed-by: Darrick J. Wong <djw...@kernel.org>
Signed-off-by: Jan Kara <ja...@suse.cz>
---
fs/xfs/xfs_super.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 84107d162e41..2f29a6810e6a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -365,8 +365,9 @@ xfs_blkdev_get(
{
int error = 0;

- *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
+ *handlep = bdev_open_by_path(name,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*handlep)) {
error = PTR_ERR(*handlep);
*handlep = NULL;
--
2.35.3

Jan Kara

unread,
Nov 1, 2023, 1:43:29 PM11/1/23
to Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Hello!

This is the third version of the patches to add config option to not allow
writing to mounted block devices. The new API for block device opening has been
merged so hopefully this patchset can progress towards being merged. We face
some issues with necessary btrfs changes (review bandwidth) so this series is
modified to enable restricting of writes for all other filesystems. Once btrfs
can merge necessary device scanning changes, enabling the support for
restricting writes for it is trivial.

For motivation why restricting writes to mounted block devices is interesting
see patch 3/7. I've been testing the patches more extensively and I've found
couple of things that get broken by disallowing writes to mounted block
devices:

1) "mount -o loop" gets broken because util-linux keeps the loop device open
read-write when attempting to mount it. Hopefully fixable within util-linux.
2) resize2fs online resizing gets broken because it tries to open the block
device read-write only to call resizing ioctl. Trivial to fix within
e2fsprogs.
3) Online e2label will break because it directly writes to the ext2/3/4
superblock while the FS is mounted to set the new label. Ext4 driver
will have to implement the SETFSLABEL ioctl() and e2label will have
to use it, matching what happens for online labelling of btrfs and
xfs.

Likely there will be other breakage I didn't find yet but overall the breakage
looks minor enough that the option might be useful. Definitely good enough
for syzbot fuzzing and likely good enough for hardening of systems with
more tightened security.

This patch set is based on the VFS tree as of yesterday.

Changes since v2:
* Rebased on top of current VFS tree
* Added missing conversion of bcachefs to new bdev opening API
* Added patch to drop old bdev opening API
* Dropped support for restricting writes to btrfs to avoid patch dependencies
and unblock merging of the patches

Changes since v1:
* Added kernel cmdline argument to toggle whether writing to mounted block
devices is allowed or not
* Fixed handling of partitions
* Limit write blocking only to devices open with explicit BLK_OPEN_BLOCK_WRITES
flag

Honza

Previous versions:
Link: https://lore.kernel.org/all/2023061216161...@suse.cz #v1
Link: https://lore.kernel.org/all/2023070412272...@suse.cz #v2

Jan Kara

unread,
Nov 1, 2023, 1:43:29 PM11/1/23
to Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Writing to mounted devices is dangerous and can lead to filesystem
corruption as well as crashes. Furthermore syzbot comes with more and
more involved examples how to corrupt block device under a mounted
filesystem leading to kernel crashes and reports we can do nothing
about. Add tracking of writers to each block device and a kernel cmdline
argument which controls whether other writeable opens to block devices
open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
filesystems use this flag for used devices.

Note that this effectively only prevents modification of the particular
block device's page cache by other writers. The actual device content
can still be modified by other means - e.g. by issuing direct scsi
commands, by doing writes through devices lower in the storage stack
(e.g. in case loop devices, DM, or MD are involved) etc. But blocking
direct modifications of the block device page cache is enough to give
filesystems a chance to perform data validation when loading data from
the underlying storage and thus prevent kernel crashes.

Syzbot can use this cmdline argument option to avoid uninteresting
crashes. Also users whose userspace setup does not need writing to
mounted block devices can set this option for hardening.

Link: https://lore.kernel.org/all/60788e5d-5c7c-1142...@linaro.org
Signed-off-by: Jan Kara <ja...@suse.cz>
---
block/Kconfig | 20 +++++++++++++
block/bdev.c | 62 ++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 1 +
include/linux/blkdev.h | 2 ++
4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0d93..ca04b657e058 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
select CRC_T10DIF
select CRC64_ROCKSOFT

+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache is very
+ likely going to cause filesystem corruption. It is also rather easy to
+ crash the kernel in this way since the filesystem has no practical way
+ of detecting these writes to buffer cache and verifying its metadata
+ integrity. However there are some setups that need this capability
+ like running fsck on read-only mounted root device, modifying some
+ features on mounted ext4 filesystem, and similar. If you say N, the
+ kernel will prevent processes from writing to block devices that are
+ mounted by filesystems which provides some more protection from runaway
+ privileged processes and generally makes it much harder to crash
+ filesystem drivers. Note however that this does not prevent
+ underlying device(s) from being modified by other means, e.g. by
+ directly submitting SCSI commands or through access to lower layers of
+ storage stack. If in doubt, say Y. The configuration can be overridden
+ with the bdev_allow_write_mounted boot option.
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
diff --git a/block/bdev.c b/block/bdev.c
index 3f27939e02c6..d75dd7dd2b31 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
#include "../fs/internal.h"
#include "blk.h"

+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -730,7 +733,34 @@ void blkdev_put_no_open(struct block_device *bdev)
{
put_device(&bdev->bd_device);
}
-
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = 0;
+}
+
+static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
+{
+ if (!bdev_allow_write_mounted) {
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+ return false;
+ }
+ return true;
+}
+
/**
* bdev_open_by_dev - open a block device by device number
* @dev: device number of block device to open
@@ -773,6 +803,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
goto free_handle;

+ /* Blocking writes requires exclusive opener */
+ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
+ return ERR_PTR(-EINVAL);
+
bdev = blkdev_get_no_open(dev);
if (!bdev) {
ret = -ENXIO;
@@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
+ ret = -EBUSY;
+ if (!blkdev_open_compatible(bdev, mode))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto put_module;
+ if (!bdev_allow_write_mounted) {
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+ }
if (holder) {
bd_finish_claiming(bdev, holder, hops);

@@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
sync_blockdev(bdev);

mutex_lock(&disk->open_mutex);
+ if (!bdev_allow_write_mounted) {
+ /* The exclusive opener was blocking writes? Unblock them. */
+ if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (handle->mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+ }
+
if (handle->holder)
bd_end_claim(bdev, handle->holder);

@@ -1069,3 +1120,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)

blkdev_put_no_open(bdev);
}
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 749203277fee..52e264d5a830 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,6 +66,7 @@ struct block_device {
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool bd_make_it_fail;
#endif
+ int bd_writers;
/*
* keep this out-of-line as it's both big and not needed in the fast
* path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7afc10315dd5..0e0c0186aa32 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t;
#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
/* open for "writes" only for ioctls (specialy hack for floppy.c) */
#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
+/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
+#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5))

struct gendisk {
/*
--
2.35.3

Jan Kara

unread,
Nov 1, 2023, 1:43:30 PM11/1/23
to Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Jan Kara
Ask block layer to block writes to block devices mounted by filesystems.

Signed-off-by: Jan Kara <ja...@suse.cz>
---
include/linux/blkdev.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e0c0186aa32..9f6c3373f9fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1494,7 +1494,8 @@ extern const struct blk_holder_ops fs_holder_ops;
* as stored in sb->s_flags.
*/
#define sb_open_mode(flags) \
- (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
+ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
+ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))

struct bdev_handle {
struct block_device *bdev;
--
2.35.3

Jan Kara

unread,
Nov 1, 2023, 1:43:30 PM11/1/23
to Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Jan Kara, Christoph Hellwig
blkdev_get_by_*() and blkdev_put() functions are now unused. Remove
them.

Acked-by: Christoph Hellwig <h...@lst.de>
Signed-off-by: Jan Kara <ja...@suse.cz>
---
block/bdev.c | 94 ++++++++++++++----------------------------
include/linux/blkdev.h | 5 ---
2 files changed, 30 insertions(+), 69 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 4a502fb9b814..3f27939e02c6 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -732,7 +732,7 @@ void blkdev_put_no_open(struct block_device *bdev)
}

/**
- * blkdev_get_by_dev - open a block device by device number
+ * bdev_open_by_dev - open a block device by device number
* @dev: device number of block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -744,32 +744,40 @@ void blkdev_put_no_open(struct block_device *bdev)
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
+ * number. Everything else should use bdev_open_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
{
- bool unblock_events = true;
+ struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+ GFP_KERNEL);
struct block_device *bdev;
+ bool unblock_events = true;
struct gendisk *disk;
int ret;

+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
- return ERR_PTR(ret);
+ goto free_handle;

bdev = blkdev_get_no_open(dev);
- if (!bdev)
- return ERR_PTR(-ENXIO);
+ if (!bdev) {
+ ret = -ENXIO;
+ goto free_handle;
+ }
disk = bdev->bd_disk;

if (holder) {
@@ -818,7 +826,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,

if (unblock_events)
disk_unblock_events(disk);
- return bdev;
+ handle->bdev = bdev;
+ handle->holder = holder;
+ handle->mode = mode;
+ return handle;
put_module:
module_put(disk->fops->owner);
abort_claiming:
@@ -828,34 +839,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
+free_handle:
+ kfree(handle);
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
-{
- struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
- struct block_device *bdev;
-
- if (!handle)
- return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (IS_ERR(bdev)) {
- kfree(handle);
- return ERR_CAST(bdev);
- }
- handle->bdev = bdev;
- handle->holder = holder;
- if (holder)
- mode |= BLK_OPEN_EXCL;
- handle->mode = mode;
- return handle;
-}
EXPORT_SYMBOL(bdev_open_by_dev);

/**
- * blkdev_get_by_path - open a block device by name
+ * bdev_open_by_path - open a block device by name
* @path: path to the block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -869,29 +860,9 @@ EXPORT_SYMBOL(bdev_open_by_dev);
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
- void *holder, const struct blk_holder_ops *hops)
-{
- struct block_device *bdev;
- dev_t dev;
- int error;
-
- error = lookup_bdev(path, &dev);
- if (error)
- return ERR_PTR(error);
-
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, holder);
- return ERR_PTR(-EACCES);
- }
-
- return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
-
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
void *holder, const struct blk_holder_ops *hops)
{
@@ -914,8 +885,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
}
EXPORT_SYMBOL(bdev_open_by_path);

-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
{
+ struct block_device *bdev = handle->bdev;
struct gendisk *disk = bdev->bd_disk;

/*
@@ -929,8 +901,8 @@ void blkdev_put(struct block_device *bdev, void *holder)
sync_blockdev(bdev);

mutex_lock(&disk->open_mutex);
- if (holder)
- bd_end_claim(bdev, holder);
+ if (handle->holder)
+ bd_end_claim(bdev, handle->holder);

/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -947,12 +919,6 @@ void blkdev_put(struct block_device *bdev, void *holder)

module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-void bdev_release(struct bdev_handle *handle)
-{
- blkdev_put(handle->bdev, handle->holder);
kfree(handle);
}
EXPORT_SYMBOL(bdev_release);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index abf71cce785c..7afc10315dd5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1500,10 +1500,6 @@ struct bdev_handle {
blk_mode_t mode;
};

-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops);
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
- void *holder, const struct blk_holder_ops *hops);
struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops);
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
@@ -1511,7 +1507,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
const struct blk_holder_ops *hops);
void bd_abort_claiming(struct block_device *bdev, void *holder);
-void blkdev_put(struct block_device *bdev, void *holder);
void bdev_release(struct bdev_handle *handle);

/* just for blk-cgroup, don't use elsewhere */
--
2.35.3

Christian Brauner

unread,
Nov 6, 2023, 9:11:03 AM11/6/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, Christoph Hellwig
On Wed, Nov 01, 2023 at 06:43:07PM +0100, Jan Kara wrote:
> blkdev_get_by_*() and blkdev_put() functions are now unused. Remove
> them.
>
> Acked-by: Christoph Hellwig <h...@lst.de>
> Signed-off-by: Jan Kara <ja...@suse.cz>
> ---

Yes, very nice!
Reviewed-by: Christian Brauner <bra...@kernel.org>

Christian Brauner

unread,
Nov 6, 2023, 9:32:38 AM11/6/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov
On Wed, Nov 01, 2023 at 06:43:10PM +0100, Jan Kara wrote:
> Ask block layer to block writes to block devices mounted by filesystems.
>
> Signed-off-by: Jan Kara <ja...@suse.cz>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <bra...@kernel.org>

Christian Brauner

unread,
Nov 6, 2023, 9:48:04 AM11/6/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov
On Wed, Nov 01, 2023 at 06:43:08PM +0100, Jan Kara wrote:
> Writing to mounted devices is dangerous and can lead to filesystem
> corruption as well as crashes. Furthermore syzbot comes with more and
> more involved examples how to corrupt block device under a mounted
> filesystem leading to kernel crashes and reports we can do nothing
> about. Add tracking of writers to each block device and a kernel cmdline
> argument which controls whether other writeable opens to block devices
> open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
> filesystems use this flag for used devices.
>
> Note that this effectively only prevents modification of the particular
> block device's page cache by other writers. The actual device content
> can still be modified by other means - e.g. by issuing direct scsi
> commands, by doing writes through devices lower in the storage stack
> (e.g. in case loop devices, DM, or MD are involved) etc. But blocking
> direct modifications of the block device page cache is enough to give
> filesystems a chance to perform data validation when loading data from
> the underlying storage and thus prevent kernel crashes.
>
> Syzbot can use this cmdline argument option to avoid uninteresting
> crashes. Also users whose userspace setup does not need writing to
> mounted block devices can set this option for hardening.
>
> Link: https://lore.kernel.org/all/60788e5d-5c7c-1142...@linaro.org
> Signed-off-by: Jan Kara <ja...@suse.cz>
> ---

A few minor tweaks I would do in-tree. Please see below.
I know it's mostly stylistic that's why I would do it so there's no
resend dance for non-technical reasons.

> block/Kconfig | 20 +++++++++++++
> block/bdev.c | 62 ++++++++++++++++++++++++++++++++++++++-
> include/linux/blk_types.h | 1 +
> include/linux/blkdev.h | 2 ++
> 4 files changed, 84 insertions(+), 1 deletion(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index f1364d1c0d93..ca04b657e058 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
> select CRC_T10DIF
> select CRC64_ROCKSOFT
>
> +config BLK_DEV_WRITE_MOUNTED
> + bool "Allow writing to mounted block devices"
> + default y

Let's hope that this can become the default one day.

> +static void bdev_unblock_writes(struct block_device *bdev)
> +{
> + bdev->bd_writers = 0;
> +}
> +
> +static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)

I would like to mirror our may_{open,create}() routines here and call
this:

bdev_may_open()

This is a well-known vfs pattern and also easy to understand for block
devs as well.

> @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> goto abort_claiming;
> if (!try_module_get(disk->fops->owner))
> goto abort_claiming;
> + ret = -EBUSY;
> + if (!blkdev_open_compatible(bdev, mode))
> + goto abort_claiming;
> if (bdev_is_partition(bdev))
> ret = blkdev_get_part(bdev, mode);
> else
> ret = blkdev_get_whole(bdev, mode);
> if (ret)
> goto put_module;
> + if (!bdev_allow_write_mounted) {
> + if (mode & BLK_OPEN_RESTRICT_WRITES)
> + bdev_block_writes(bdev);
> + else if (mode & BLK_OPEN_WRITE)
> + bdev->bd_writers++;
> + }

I would like to move this to a tiny helper for clarity:

static void bdev_claim_write_access(struct block_device *bdev)
{
if (!bdev_allow_write_mounted)
return;

/* Claim exclusive or shared write access to the block device. */
if (mode & BLK_OPEN_RESTRICT_WRITES)
bdev_block_writes(bdev);
else if (mode & BLK_OPEN_WRITE)
bdev->bd_writers++;
}

> if (holder) {
> bd_finish_claiming(bdev, holder, hops);
>
> @@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
> sync_blockdev(bdev);
>
> mutex_lock(&disk->open_mutex);
> + if (!bdev_allow_write_mounted) {
> + /* The exclusive opener was blocking writes? Unblock them. */
> + if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> + bdev_unblock_writes(bdev);
> + else if (handle->mode & BLK_OPEN_WRITE)
> + bdev->bd_writers--;
> + }

static void bdev_yield_write_access(struct block_device *bdev)
{
if (!bdev_allow_write_mounted)
return;

/* Yield exclusive or shared write access. */
if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
bdev_unblock_writes(bdev);
else if (handle->mode & BLK_OPEN_WRITE)
bdev->bd_writers--;
}

Jan Kara

unread,
Nov 6, 2023, 10:18:31 AM11/6/23
to Christian Brauner, Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov
Whatever works best for you. I agree with the changes but please see my
comments below for some fixes.

> > block/Kconfig | 20 +++++++++++++
> > block/bdev.c | 62 ++++++++++++++++++++++++++++++++++++++-
> > include/linux/blk_types.h | 1 +
> > include/linux/blkdev.h | 2 ++
> > 4 files changed, 84 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/Kconfig b/block/Kconfig
> > index f1364d1c0d93..ca04b657e058 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
> > select CRC_T10DIF
> > select CRC64_ROCKSOFT
> >
> > +config BLK_DEV_WRITE_MOUNTED
> > + bool "Allow writing to mounted block devices"
> > + default y
>
> Let's hope that this can become the default one day.

Yes, I'd hope as well but we need some tooling work (util-linux, e2fsprogs)
before that can happen.

> > +static void bdev_unblock_writes(struct block_device *bdev)
> > +{
> > + bdev->bd_writers = 0;
> > +}
> > +
> > +static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
>
> I would like to mirror our may_{open,create}() routines here and call
> this:
>
> bdev_may_open()
>
> This is a well-known vfs pattern and also easy to understand for block
> devs as well.

Sure.

> > @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> > goto abort_claiming;
> > if (!try_module_get(disk->fops->owner))
> > goto abort_claiming;
> > + ret = -EBUSY;
> > + if (!blkdev_open_compatible(bdev, mode))
> > + goto abort_claiming;
> > if (bdev_is_partition(bdev))
> > ret = blkdev_get_part(bdev, mode);
> > else
> > ret = blkdev_get_whole(bdev, mode);
> > if (ret)
> > goto put_module;
> > + if (!bdev_allow_write_mounted) {
> > + if (mode & BLK_OPEN_RESTRICT_WRITES)
> > + bdev_block_writes(bdev);
> > + else if (mode & BLK_OPEN_WRITE)
> > + bdev->bd_writers++;
> > + }
>
> I would like to move this to a tiny helper for clarity:
>
> static void bdev_claim_write_access(struct block_device *bdev)
> {
> if (!bdev_allow_write_mounted)

This should be the other way around.

> return;
>
> /* Claim exclusive or shared write access to the block device. */
> if (mode & BLK_OPEN_RESTRICT_WRITES)
> bdev_block_writes(bdev);
> else if (mode & BLK_OPEN_WRITE)
> bdev->bd_writers++;
> }
>
> > if (holder) {
> > bd_finish_claiming(bdev, holder, hops);
> >
> > @@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
> > sync_blockdev(bdev);
> >
> > mutex_lock(&disk->open_mutex);
> > + if (!bdev_allow_write_mounted) {
> > + /* The exclusive opener was blocking writes? Unblock them. */
> > + if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> > + bdev_unblock_writes(bdev);
> > + else if (handle->mode & BLK_OPEN_WRITE)
> > + bdev->bd_writers--;
> > + }
>
> static void bdev_yield_write_access(struct block_device *bdev)
> {
> if (!bdev_allow_write_mounted)

And this as well.

> return;
>
> /* Yield exclusive or shared write access. */
> if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> bdev_unblock_writes(bdev);
> else if (handle->mode & BLK_OPEN_WRITE)
> bdev->bd_writers--;
> }

Christian Brauner

unread,
Nov 6, 2023, 10:57:36 AM11/6/23
to Jan Kara, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov
> > Let's hope that this can become the default one day.
>
> Yes, I'd hope as well but we need some tooling work (util-linux, e2fsprogs)
> before that can happen.

Yeah, absolutely. I'm moderately confident we'll have fairly quick
adoption of this though.

Jens Axboe

unread,
Nov 7, 2023, 10:32:41 AM11/7/23
to Jan Kara, Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov
For the series:

Reviewed-by: Jens Axboe <ax...@kernel.dk>

--
Jens Axboe

Jan Kara

unread,
Nov 8, 2023, 5:10:18 AM11/8/23
to Aleksandr Nogikh, Jan Kara, Eric Biggers, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi!
So patches are now in VFS tree [1] so they should be in linux-next as well.
You should be able to start using the config option for syzbot runs :)

Honza

[1] https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs.super

Aleksandr Nogikh

unread,
Nov 8, 2023, 1:25:13 PM11/8/23
to Jan Kara, Eric Biggers, linux-...@vger.kernel.org, linux...@vger.kernel.org, Christoph Hellwig, Christian Brauner, Jens Axboe, Kees Cook, Ted Tso, syzkaller, Alexander Popov, linu...@vger.kernel.org, linux...@vger.kernel.org, Dmitry Vyukov
Hi!

Thanks for letting me know!

I've sent a PR with new syzbot configs:
https://github.com/google/syzkaller/pull/4324

--
Aleksandr

Li Lingfeng

unread,
Dec 20, 2023, 3:32:26 AM12/20/23
to Jan Kara, Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, yangerkun, yukuai (C), zhangyi (F)
Hi, Jan

When a partition device is mounted, I think maybe it's better to block
writes on the whole device at same time.

Allowing the whole device to be opened for writing when mounting a
partition device, did you have any special considerations before?

Thanks.

Jan Kara

unread,
Dec 21, 2023, 7:11:14 AM12/21/23
to Li Lingfeng, Jan Kara, Christian Brauner, linux-...@vger.kernel.org, linux...@vger.kernel.org, Jens Axboe, Christoph Hellwig, Kees Cook, syzkaller, Alexander Popov, linu...@vger.kernel.org, Dmitry Vyukov, yangerkun, yukuai (C), zhangyi (F)
> When a partition device is mounted, I think maybe it's better to block
> writes on the whole device at same time.
>
> Allowing the whole device to be opened for writing when mounting a partition
> device, did you have any special considerations before?

Yes, we were considering this. But the truth is that:

a) It is *very* hard to stop all the possibilities of corrupting data on
the device - e.g. with device mapper / loop device / malicious partition
table you can construct many block devices pointing to the same storage,
you can use e.g. SG_IO to corrupt on disk data etc. So special-casing
partitions is providing little additional benefit.

b) It is difficult to then correctly handle valid cases of multiple
writeably mounted partitions on the same device - you'd need to track used
block numbers for each device which gets difficult in presence of device
mapper etc.

c) To stop filesystem crashes, it is enough to stop modifications of buffer
cache of that one block device. Because filesystems have to validate data
they are loading into buffer cache anyway to handle faulty device, fs
corruption etc.

Honza
Reply all
Reply to author
Forward
0 new messages