[PATCH] Add the #vars device to export kernel variables

10 views
Skip to first unread message

Barret Rhoden

unread,
Nov 24, 2015, 5:02:22 PM11/24/15
to aka...@googlegroups.com
With #vars, you can specify certain global variables, such as num_cores, to
be exposed to userspace. If you want, you can:

$ bind -a \#vars /dev
$ cat /dev/num_cores

For debugging, you can add entries to vars_dir[] to temporarily track
certain variables. In the future, we could try adding support for
dynamically adding entries.

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/drivers/dev/Kbuild | 1 +
kern/drivers/dev/Kconfig | 7 ++
kern/drivers/dev/vars.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 191 insertions(+)
create mode 100644 kern/drivers/dev/vars.c

diff --git a/kern/drivers/dev/Kbuild b/kern/drivers/dev/Kbuild
index 57fd9368fb60..ee5e7a74fe21 100644
--- a/kern/drivers/dev/Kbuild
+++ b/kern/drivers/dev/Kbuild
@@ -12,5 +12,6 @@ obj-y += proc.o
obj-$(CONFIG_REGRESS) += regress.o
obj-y += root.o
obj-y += srv.o
+obj-$(CONFIG_VARS) += vars.o

obj-$(CONFIG_NIX) += nix.o
diff --git a/kern/drivers/dev/Kconfig b/kern/drivers/dev/Kconfig
index eb737b8b11cb..85aac58f1a3f 100644
--- a/kern/drivers/dev/Kconfig
+++ b/kern/drivers/dev/Kconfig
@@ -4,3 +4,10 @@ config REGRESS
help
The regression test device allows you to push commands to monitor()
for testing. Defaults to 'y' for now.
+
+config VARS
+ bool "#vars kernel variable exporter"
+ default y
+ help
+ The #vars device exports read access to select kernel variables. The
+ list of variables is statically declared in vars.c
diff --git a/kern/drivers/dev/vars.c b/kern/drivers/dev/vars.c
new file mode 100644
index 000000000000..573e3ac1c1d9
--- /dev/null
+++ b/kern/drivers/dev/vars.c
@@ -0,0 +1,183 @@
+/* Copyright (c) 2015 Google Inc
+ * Barret Rhoden <br...@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * #vars device, exports read access to select kernel global variables. These
+ * variables are statically determined (i.e. set in the code below).
+ *
+ * To add a variable, add a VARS_ENTRY to vars_dir. If the variable is a number
+ * and you want it in hex, "or" VARS_HEX into the type. e.g:
+ *
+ * VARS_ENTRY(foobar, type_int | VARS_HEX),
+ *
+ * If we add write support to vars, those will also be flags for 'type.'
+ *
+ * Another thing we can consider doing is implementing create() to add variables
+ * on the fly. We can easily get the address (symbol table), but not the type,
+ * unless we get debugging info. We could consider a CTL command to allow the
+ * user to change the type, though that might overload write() if we also allow
+ * setting variables. */
+
+#include <ns.h>
+#include <kmalloc.h>
+#include <kref.h>
+#include <atomic.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <sys/queue.h>
+#include <fdtap.h>
+#include <syscall.h>
+
+struct dev vars_devtab;
+
+static char *devname(void)
+{
+ return vars_devtab.name;
+}
+
+enum {
+ type_s8 = 1,
+ type_s16,
+ type_s32,
+ type_s64,
+ type_u8,
+ type_u16,
+ type_u32,
+ type_u64,
+
+ type_bool,
+ type_char,
+ type_short,
+ type_int,
+ type_long,
+ type_ushort,
+ type_uint,
+ type_ulong,
+ type_ntstr, /* null-terminated string */
+
+ VARS_HEX = (1 << 31),
+};
+
+/* The qid.path is the addr of the variable. The qid.vers is the type. Or-in
+ * any VARS_* flags you want to the type. "." has no type or addr. */
+#define VARS_ENTRY(name, type) {#name, \
+ {(uint64_t)&(name), (type), QTFILE}, \
+ sizeof((name)), \
+ 0444}
+
+static struct dirtab vars_dir[] = {
+ {".", {0, 0, QTDIR}, 0, DMDIR | 0555},
+ VARS_ENTRY(num_cores, type_int),
+};
+
+static struct chan *vars_attach(char *spec)
+{
+ struct chan *c;
+
+ c = devattach(devname(), spec);
+ mkqid(&c->qid, 0, 0, QTDIR);
+ return c;
+}
+
+static struct walkqid *vars_walk(struct chan *c, struct chan *nc, char **name,
+ int nname)
+{
+ return devwalk(c, nc, name, nname, vars_dir, ARRAY_SIZE(vars_dir), devgen);
+}
+
+static int vars_stat(struct chan *c, uint8_t *db, int n)
+{
+ return devstat(c, db, n, vars_dir, ARRAY_SIZE(vars_dir), devgen);
+}
+
+static struct chan *vars_open(struct chan *c, int omode)
+{
+ return devopen(c, omode, vars_dir, ARRAY_SIZE(vars_dir), devgen);
+}
+
+static void vars_close(struct chan *c)
+{
+}
+
+static long vars_read(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+ char tmp[128]; /* big enough for any number and most strings */
+ size_t size = sizeof(tmp);
+ char *fmt_str_s32 = c->qid.vers & VARS_HEX ? "0x%x" : "%d";
+ char *fmt_str_s64 = c->qid.vers & VARS_HEX ? "0x%lx" : "%ld";
+ char *fmt_str_u32 = c->qid.vers & VARS_HEX ? "0x%x" : "%u";
+ char *fmt_str_u64 = c->qid.vers & VARS_HEX ? "0x%lx" : "%lu";
+ int type = c->qid.vers & ~VARS_HEX;
+
+ switch (type) {
+ case 0:
+ return devdirread(c, ubuf, n, vars_dir, ARRAY_SIZE(vars_dir), devgen);
+ case type_bool:
+ case type_u8:
+ size = snprintf(tmp, size, fmt_str_u32, *(uint8_t*)c->qid.path);
+ break;
+ case type_u16:
+ size = snprintf(tmp, size, fmt_str_u32, *(uint16_t*)c->qid.path);
+ break;
+ case type_uint:
+ case type_u32:
+ size = snprintf(tmp, size, fmt_str_u32, *(uint32_t*)c->qid.path);
+ break;
+ case type_ulong:
+ case type_u64:
+ size = snprintf(tmp, size, fmt_str_u64, *(uint64_t*)c->qid.path);
+ break;
+ case type_s8:
+ size = snprintf(tmp, size, fmt_str_s32, *(int8_t*)c->qid.path);
+ break;
+ case type_s16:
+ size = snprintf(tmp, size, fmt_str_s32, *(int16_t*)c->qid.path);
+ break;
+ case type_int:
+ case type_s32:
+ size = snprintf(tmp, size, fmt_str_s32, *(int32_t*)c->qid.path);
+ break;
+ case type_long:
+ case type_s64:
+ size = snprintf(tmp, size, fmt_str_s64, *(int64_t*)c->qid.path);
+ break;
+ case type_char:
+ size = snprintf(tmp, size, "%c", *(char*)c->qid.path);
+ break;
+ case type_ntstr:
+ size = snprintf(tmp, size, "%s", *(char**)c->qid.path);
+ break;
+ default:
+ panic("Unknown #%s type %d", devname(), type);
+ }
+ return readmem(offset, ubuf, n, tmp, size + 1);
+}
+
+static long vars_write(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+ error(EFAIL, "Can't write to a #%s file", devname());
+}
+
+struct dev vars_devtab __devtab = {
+ .name = "vars",
+ .reset = devreset,
+ .init = devinit,
+ .shutdown = devshutdown,
+ .attach = vars_attach,
+ .walk = vars_walk,
+ .stat = vars_stat,
+ .open = vars_open,
+ .create = devcreate,
+ .close = vars_close,
+ .read = vars_read,
+ .bread = devbread,
+ .write = vars_write,
+ .bwrite = devbwrite,
+ .remove = devremove,
+ .wstat = devwstat,
+ .power = devpower,
+ .chaninfo = devchaninfo,
+ .tapfd = 0,
+};
--
2.6.0.rc2.230.g3dd15c0

Kevin Klues

unread,
Nov 24, 2015, 5:11:04 PM11/24/15
to aka...@googlegroups.com
I'm curious if we don't want to expose the topology in the same way
linux does in /proc so libraries like libnuma can compile and just
work. I see the usefulness of this for variables in general, but I'm
not sure num_cores should be here (or maybe it's also here in addition
to /proc).
> --
> You received this message because you are subscribed to the Google Groups "Akaros" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to akaros+un...@googlegroups.com.
> To post to this group, send email to aka...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.



--
~Kevin

Davide Libenzi

unread,
Nov 24, 2015, 5:15:21 PM11/24/15
to Akaros
This would have been a bit nicer IMO, if modules wanting to export variables, won't need to hack into another file.
The macro, used locally in the module source files, would populate a given section with structs like:

struct var_export_entry {
    const char *name;
    void *addr;
    unsigned long flags;
};

Then the init code of the var file would just read those entries and populate the var namespace.



barret rhoden

unread,
Nov 25, 2015, 8:10:55 AM11/25/15
to aka...@googlegroups.com
On 2015-11-24 at 14:10 Kevin Klues wrote:
> I'm curious if we don't want to expose the topology in the same way
> linux does in /proc so libraries like libnuma can compile and just
> work. I see the usefulness of this for variables in general, but I'm
> not sure num_cores should be here (or maybe it's also here in addition
> to /proc).

I imagine we'll have a #topology device or something that people can
mount at /proc or wherever. Doing something with a hierarchy in #vars
would make it a lot more complicated.

I put num_cores in here both as an example and since someone had
mentioned exporting it the other day (i think).

Barret

Kevin Klues

unread,
Nov 25, 2015, 8:18:19 AM11/25/15
to aka...@googlegroups.com
I agree. It would be nice if we could export variables from the same
file they are declared in rather than spreading them across multiple
files.

barret rhoden

unread,
Nov 25, 2015, 8:35:01 AM11/25/15
to aka...@googlegroups.com
On 2015-11-24 at 14:15 'Davide Libenzi' via Akaros wrote:
> This would have been a bit nicer IMO, if modules wanting to export
> variables, won't need to hack into another file.
> The macro, used locally in the module source files, would populate a
> given section with structs like:
>
> struct var_export_entry {
> const char *name;
> void *addr;
> unsigned long flags;
> };
>
> Then the init code of the var file would just read those entries and
> populate the var namespace.

Nice idea; I think I can add that easily...

Davide Libenzi

unread,
Nov 25, 2015, 10:59:56 AM11/25/15
to Akaros
I would also have the macro take the name explicitly, instead of stringifying the var name.
This allows to use some namespacing (ala sysctl - foo.moo - given that all the variables are within the same ns), to avoid possible conflicts.


ron minnich

unread,
Nov 25, 2015, 11:26:08 AM11/25/15
to Akaros
I like Davide's idea of doing the name explicitly. At that point however I'm not sure I see the advantage of the macro 

VARS_ENTRY(foobar, type_int | VARS_HEX),

I'd also rather see a format string, not a fixed type, for controlling printing. Since we have Plan 9 style arbitrary formats, e.g. %F is a 9p fcall we can print, formats give us more room to shoot ourselves in the foot. What fun!

We also have a standard directory-describing struct, Dirtab, which with one extension we could use.

Hence this proposal:
static Dirtab mystuff[] __vartab = {
{"boot", {(uint64_t)&boot, 0, 0}, 0, 0555, "%x",},
};

OK, I admit, it's less convenient. And it requires we amend the Dirtab to include a void *aux (for the format). But it gives us arbitrary formats and you don't have to export all those enums (e.g. VARS_HEX) to every potential user.

But once you do the first entry, adding entries is easy (I've done it) and generating entries via a trivial Go program that scans the ELF for symbols of interest is really easy (ditto).

I still prefer the code over the macro because I can see precisely what is going on in the code. If someone ever changes the macro for any reason, I will tend to ass-u-me that the macro has not changed, and that can lead to hard to find errors. 

I agree that the macro can be a good defense against changes, but we're pretty good at spatch nowadays and that works well too for global language oriented structural changes.

Either way, this is really good stuff and I agree with Davide and Barret about their improvements.

ron


Message has been deleted

Kevin Klues

unread,
Nov 25, 2015, 1:26:35 PM11/25/15
to aka...@googlegroups.com
> Hence this proposal:
> static Dirtab mystuff[] __vartab = {
> {"boot", {(uint64_t)&boot, 0, 0}, 0, 0555, "%x",},
> };

I don't see how this let's you split the declaration of variables
across files (as Davide's Macros does). It looks to me like you would
have to edit the table in a single file for any variables you wanted
to include in it.

Kevin Klues

unread,
Nov 25, 2015, 1:27:33 PM11/25/15
to aka...@googlegroups.com
Though I do like the ability to specify a format string via something like "%x"

Kevin Klues

unread,
Nov 25, 2015, 1:30:19 PM11/25/15
to aka...@googlegroups.com
To reiterate from Davide's example (which looks quite nice to me):

https://gist.github.com/dlibenzi/08a22e95c21c50f25d38
--
~Kevin

Davide Libenzi

unread,
Nov 25, 2015, 1:30:42 PM11/25/15
to Akaros
You can either have var_write_hex_ functions, or make the section struct more complex by adding an extra fmt string.


ron minnich

unread,
Nov 25, 2015, 2:01:35 PM11/25/15
to aka...@googlegroups.com
this could be in any file. That's the point of the __vartab tag. It's the standard "gather it up at link time" technique. You can have these pretty much anywhere and they'll all show up when you
ls '#vars'


It's a good deal simpler than the other things seen.

ron 

Kevin Klues

unread,
Nov 25, 2015, 2:05:03 PM11/25/15
to aka...@googlegroups.com
mmm, I see. I'm torn between the two; they seem equivalent to me.
Probably comes down to a matter of preference.
> --
> You received this message because you are subscribed to the Google Groups
> "Akaros" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to akaros+un...@googlegroups.com.
> To post to this group, send email to aka...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.



--
~Kevin

ron minnich

unread,
Nov 25, 2015, 2:12:05 PM11/25/15
to aka...@googlegroups.com
On Wed, Nov 25, 2015 at 11:05 AM Kevin Klues <klu...@gmail.com> wrote:
mmm, I see. I'm torn between the two; they seem equivalent to me.
Probably comes down to a matter of preference.


They're basically equivalent with one major difference: my proposal fits in with the convention of a fixed inter-component interface based on Dev structs, building on our current model; as does Barrett's.

I'd prefer it if we could stick to this model. I'm a bit uncomfortable with the (to me) complex macros of the other approach, much less the fact that it introduces a new special-purpose struct and interface. It widens the interfaces and I'd rather we not do that.

Just my $.002 (discounted opinion rate applies).

ron 

Davide Libenzi

unread,
Nov 25, 2015, 2:23:35 PM11/25/15
to Akaros
Structs, which you can change the format of, in a single point (the macro), instead of having to go in every file where you explicitly hand coded them ☺
Also, once you add write capability, with support of custom read/write functions (besides the trivial "%x" printf specifier), you are not far at all as complexity.
Moreover, IMHO, a macro is MUCH more clear than:

{"boot", {(uint64_t)&boot, 0, 0}, 0, 0555, "%x",},

Also, I am not sure what the __vartab is going to do. If put them all in one section, you still need the section marker parsing.


--

Kevin Klues

unread,
Nov 25, 2015, 2:27:17 PM11/25/15
to aka...@googlegroups.com
I assume __vartab just turns into __attribute__((__section__(".__vartab")))
--
~Kevin

Barret Rhoden

unread,
Nov 25, 2015, 5:38:40 PM11/25/15
to aka...@googlegroups.com
On 2015-11-24 at 17:02 Barret Rhoden <br...@cs.berkeley.edu> wrote:
> With #vars, you can specify certain global variables, such as
> num_cores, to be exposed to userspace.

I opted for a mix of some of the ideas in this thread. Here's the
basics:

* #vars device, exports read access to select kernel variables. These
* variables are statically set.
*
* To add a variable, add a DEVVARS_ENTRY(name, format) somewhere in the kernel.
* The format is a string consisting of two characters, using a modified version
* of QEMU's formatting rules (ignoring count): [data_format][size]
*
* data_format is:
* x (hex)
* d (decimal)
* u (unsigned)
* o (octal)
* c (char) does not need a size
* s (string) does not need a size
* size is:
* b (8 bits)
* h (16 bits)
* w (32 bits)
* g (64 bits)
*
* e.g. DEVVARS_ENTRY(num_cores, "dw");

I'll play around with adding files before sending the revised patches
out.

Barret

Barret Rhoden

unread,
Nov 30, 2015, 4:18:34 PM11/30/15
to aka...@googlegroups.com
Version 2. Users can create variables from any C file in the kernel with
DEVVARS_ENTRY. Trusted users can create entries by name from an open or create
call.

I needed to do a few preparatory commits for this, notably dealing with a bug
in BSS zeroing.

------------
You can also find these patches at:
g...@github.com:brho/akaros.git
FROM: 2fa42319139e master
TO: 535ec6030b9a vars

And view them at:
https://github.com/brho/akaros/compare/2fa42319139e...535ec6030b9a



Barret Rhoden (6):
Add kmalloc_array() interface
Memset the BSS explicitly
Remove the edata symbol
Add the #vars device to export kernel variables
Add an assert for UTEST that takes a format string
Add a test for devvars

kern/arch/riscv/kernel.ld | 4 +-
kern/arch/x86/kernel64.ld | 4 +-
kern/drivers/dev/Kbuild | 1 +
kern/drivers/dev/Kconfig | 15 ++
kern/drivers/dev/vars.c | 487 +++++++++++++++++++++++++++++++++++++++++++++
kern/include/kmalloc.h | 5 +-
kern/include/ns.h | 10 +-
kern/src/init.c | 4 +-
kern/src/kreallocarray.c | 9 +
kern/src/monitor.c | 3 +-
user/utest/devvars.c | 125 ++++++++++++
user/utest/include/utest.h | 13 ++
12 files changed, 667 insertions(+), 13 deletions(-)
create mode 100644 kern/drivers/dev/vars.c
create mode 100644 user/utest/devvars.c

--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:35 PM11/30/15
to aka...@googlegroups.com
Previously, we were assuming that everything from the end of .data to the
end of the kernel image was .bss. That is definitely not the case, which
becomes apparent if you try to add an attribute for another section and
then you wonder why said section is zeroed at runtime (but not in the
object!).

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/arch/riscv/kernel.ld | 2 ++
kern/arch/x86/kernel64.ld | 2 ++
kern/src/init.c | 4 ++--
3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/kern/arch/riscv/kernel.ld b/kern/arch/riscv/kernel.ld
index 126b8bf7a5b8..c2c92057bc60 100644
--- a/kern/arch/riscv/kernel.ld
+++ b/kern/arch/riscv/kernel.ld
@@ -53,8 +53,10 @@ SECTIONS
PROVIDE(edata = .);

.bss : {
+ PROVIDE(__start_bss = .);
*(.bss)
*(.sbss)
+ PROVIDE(__stop_bss = .);
}

PROVIDE(end = .);
diff --git a/kern/arch/x86/kernel64.ld b/kern/arch/x86/kernel64.ld
index 06562979378e..f37a095d7e50 100644
--- a/kern/arch/x86/kernel64.ld
+++ b/kern/arch/x86/kernel64.ld
@@ -56,8 +56,10 @@ SECTIONS
PROVIDE(edata = .);

.bss : {
+ PROVIDE(__start_bss = .);
*(.bss)
*(COMMON)
+ PROVIDE(__stop_bss = .);
}

PROVIDE(end = .);
diff --git a/kern/src/init.c b/kern/src/init.c
index 67502c688050..5b2e9d2c65d0 100644
--- a/kern/src/init.c
+++ b/kern/src/init.c
@@ -119,9 +119,9 @@ static void extract_multiboot_cmdline(struct multiboot_info *mbi)

void kernel_init(multiboot_info_t *mboot_info)
{
- extern char edata[], end[];
+ extern char __start_bss[], __stop_bss[];

- memset(edata, 0, end - edata);
+ memset(__start_bss, 0, __stop_bss - __start_bss);
/* mboot_info is a physical address. while some arches currently have the
* lower memory mapped, everyone should have it mapped at kernbase by now.
* also, it might be in 'free' memory, so once we start dynamically using
--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:35 PM11/30/15
to aka...@googlegroups.com
One thing to be careful of is that these functions can fail and return 0,
regardless of your kmalloc flags.

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/include/kmalloc.h | 5 +++--
kern/src/kreallocarray.c | 9 +++++++++
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/kern/include/kmalloc.h b/kern/include/kmalloc.h
index 268cf2985ddb..26f23c1754c0 100644
--- a/kern/include/kmalloc.h
+++ b/kern/include/kmalloc.h
@@ -16,8 +16,9 @@
#define KMALLOC_LARGEST KMALLOC_SMALLEST << NUM_KMALLOC_CACHES

void kmalloc_init(void);
-void* kmalloc(size_t size, int flags);
-void* kzmalloc(size_t size, int flags);
+void *kmalloc(size_t size, int flags);
+void *kmalloc_array(size_t nmemb, size_t size, int flags);
+void *kzmalloc(size_t size, int flags);
void *kmalloc_align(size_t size, int flags, size_t align);
void *kzmalloc_align(size_t size, int flags, size_t align);
void *krealloc(void *buf, size_t size, int flags);
diff --git a/kern/src/kreallocarray.c b/kern/src/kreallocarray.c
index d099721e0406..505159917317 100644
--- a/kern/src/kreallocarray.c
+++ b/kern/src/kreallocarray.c
@@ -34,3 +34,12 @@ void *kreallocarray(void *optr, size_t nmemb, size_t size, int flags)
}
return krealloc(optr, size * nmemb, flags);
}
+
+void *kmalloc_array(size_t nmemb, size_t size, int flags)
+{
+ if (((nmemb >= MUL_NO_OVERFLOW) || (size >= MUL_NO_OVERFLOW)) &&
+ (nmemb > 0) && ((SIZE_MAX / nmemb) < size)) {
+ return NULL;
+ }
+ return kmalloc(size * nmemb, flags);
+}
--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:36 PM11/30/15
to aka...@googlegroups.com
It is mostly useless and probably confusing. I'm not even sure if the
linker had to put it directly after .data, or if it could put it wherever
it wanted before the .bss, since it was not inside the .data section.

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/arch/riscv/kernel.ld | 2 --
kern/arch/x86/kernel64.ld | 2 --
kern/include/ns.h | 3 ---
kern/src/monitor.c | 3 +--
4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/kern/arch/riscv/kernel.ld b/kern/arch/riscv/kernel.ld
index c2c92057bc60..bfc87964880f 100644
--- a/kern/arch/riscv/kernel.ld
+++ b/kern/arch/riscv/kernel.ld
@@ -50,8 +50,6 @@ SECTIONS
*(.sdata)
}

- PROVIDE(edata = .);
-
.bss : {
PROVIDE(__start_bss = .);
*(.bss)
diff --git a/kern/arch/x86/kernel64.ld b/kern/arch/x86/kernel64.ld
index f37a095d7e50..d56eef300052 100644
--- a/kern/arch/x86/kernel64.ld
+++ b/kern/arch/x86/kernel64.ld
@@ -53,8 +53,6 @@ SECTIONS
*(.data)
}

- PROVIDE(edata = .);
-
.bss : {
PROVIDE(__start_bss = .);
*(.bss)
diff --git a/kern/include/ns.h b/kern/include/ns.h
index b55d7075b241..166158472114 100644
--- a/kern/include/ns.h
+++ b/kern/include/ns.h
@@ -70,9 +70,6 @@ static inline uint32_t getcallerpc(void *v)
return 0;
}

-extern char etext[];
-extern char edata[];
-extern char end[];
extern int getfields(char *unused_char_p_t, char **unused_char_pp_t,
int unused_int, int, char *);
extern int tokenize(char *unused_char_p_t, char **unused_char_pp_t, int);
diff --git a/kern/src/monitor.c b/kern/src/monitor.c
index 048ef3b59900..2b2a075a9b57 100644
--- a/kern/src/monitor.c
+++ b/kern/src/monitor.c
@@ -93,12 +93,11 @@ int mon_ps(int argc, char** argv, struct hw_trapframe *hw_tf)

int mon_kerninfo(int argc, char **argv, struct hw_trapframe *hw_tf)
{
- extern char _start[], etext[], edata[], end[];
+ extern char _start[], etext[], end[];

cprintf("Special kernel symbols:\n");
cprintf(" _start %016x (virt) %016x (phys)\n", _start, (uintptr_t)(_start - KERNBASE));
cprintf(" etext %016x (virt) %016x (phys)\n", etext, (uintptr_t)(etext - KERNBASE));
- cprintf(" edata %016x (virt) %016x (phys)\n", edata, (uintptr_t)(edata - KERNBASE));
cprintf(" end %016x (virt) %016x (phys)\n", end, (uintptr_t)(end - KERNBASE));
cprintf("Kernel executable memory footprint: %dKB\n",
(uint32_t)(end-_start+1023)/1024);
--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:37 PM11/30/15
to aka...@googlegroups.com
UT_ASSERT_M() just prints the message verbatim. Its va_args are cleanup
routines. UT_ASSERT_FMT() takes a format string, and its va_args is the
values for the format string.

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
user/utest/include/utest.h | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/user/utest/include/utest.h b/user/utest/include/utest.h
index 646e492441ce..ceddcb662692 100644
--- a/user/utest/include/utest.h
+++ b/user/utest/include/utest.h
@@ -32,6 +32,19 @@ __BEGIN_DECLS
} \
} while (0)

+
+/* If 'test' fails, Sets an assert message, which can be a format string, and
+ * returns false. */
+#define UT_ASSERT_FMT(message, test, ...) \
+ do { \
+ if (!(test)) { \
+ char fmt[] = "Assertion failure in %s() at %s:%d: " #message; \
+ sprintf(utest_msg, fmt, __func__, __FILE__, __LINE__, \
+ ##__VA_ARGS__); \
+ return false; \
+ } \
+ } while (0)
+
/*
* Structs and macros for registering test cases.
*/
--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:37 PM11/30/15
to aka...@googlegroups.com
With #vars, you can specify certain variables, such as num_cores, to be
exposed to userspace. If you want, you can:

$ bind -a \#vars /dev
$ cat /dev/num_cores!dw

For debugging, you can add entries with the DEVVARS_ENTRY(name, format)
macro. 'format' is two chars, the data_format and the data_size, using
Qemu's notation.

Privileged users (i.e. anyone) can add new entries to #vars, which
internally will do a lookup in the symbol table and trust the format
string. This is a little dangerous.

Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/drivers/dev/Kbuild | 1 +
kern/drivers/dev/Kconfig | 6 +
kern/drivers/dev/vars.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++
kern/include/ns.h | 7 +
4 files changed, 455 insertions(+)
create mode 100644 kern/drivers/dev/vars.c

diff --git a/kern/drivers/dev/Kbuild b/kern/drivers/dev/Kbuild
index 57fd9368fb60..400a9b7d4c7f 100644
--- a/kern/drivers/dev/Kbuild
+++ b/kern/drivers/dev/Kbuild
@@ -12,5 +12,6 @@ obj-y += proc.o
obj-$(CONFIG_REGRESS) += regress.o
obj-y += root.o
obj-y += srv.o
+obj-$(CONFIG_DEVVARS) += vars.o

obj-$(CONFIG_NIX) += nix.o
diff --git a/kern/drivers/dev/Kconfig b/kern/drivers/dev/Kconfig
index eb737b8b11cb..d01c6dd563f6 100644
--- a/kern/drivers/dev/Kconfig
+++ b/kern/drivers/dev/Kconfig
@@ -4,3 +4,9 @@ config REGRESS
help
The regression test device allows you to push commands to monitor()
for testing. Defaults to 'y' for now.
+
+config DEVVARS
+ bool "#vars kernel variable exporter"
+ default y
+ help
+ The #vars device exports read access to select kernel variables.
diff --git a/kern/drivers/dev/vars.c b/kern/drivers/dev/vars.c
new file mode 100644
index 000000000000..bf9d68553212
--- /dev/null
+++ b/kern/drivers/dev/vars.c
@@ -0,0 +1,441 @@
+/* Copyright (c) 2015 Google Inc
+ * Barret Rhoden <br...@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * #vars device, exports read access to select kernel variables. These
+ * variables are statically set.
+ *
+ * To add a variable, add a DEVVARS_ENTRY(name, format) somewhere in the kernel.
+ * The format is a string consisting of two characters, using a modified version
+ * of QEMU's formatting rules (ignoring count): [data_format][size]
+ *
+ * data_format is:
+ * x (hex)
+ * d (decimal)
+ * u (unsigned)
+ * o (octal)
+ * c (char) does not need a size
+ * s (string) does not need a size
+ * size is:
+ * b (8 bits)
+ * h (16 bits)
+ * w (32 bits)
+ * g (64 bits)
+ *
+ * e.g. DEVVARS_ENTRY(num_cores, "dw");
+static struct dirtab *vars_dir;
+static size_t nr_vars;
+static qlock_t vars_lock;
+
+struct dirtab __attribute__((__section__("devvars")))
+ __devvars_dot = {".", {0, 0, QTDIR}, 0, DMDIR | 0555};
+
+DEVVARS_ENTRY(num_cores, "dw");
+
+static bool var_is_valid(struct dirtab *dir)
+{
+ return dir->qid.vers != -1;
+}
+
+/* Careful with this. c->name->s is the full path, at least sometimes. */
+static struct dirtab *find_var_by_name(const char *name)
+{
+ for (size_t i = 0; i < nr_vars; i++)
+ if (!strcmp(vars_dir[i].name, name))
+ return &vars_dir[i];
+ return 0;
+}
+
+static void vars_init(void)
+{
+ /* If you name a section without a '.', GCC will create start and stop
+ * symbols, e.g. __start_SECTION */
+ extern struct dirtab __start_devvars;
+ extern struct dirtab __stop_devvars;
+ struct dirtab *dot, temp;
+
+ nr_vars = &__stop_devvars - &__start_devvars;
+ vars_dir = kmalloc_array(nr_vars, sizeof(struct dirtab), KMALLOC_WAIT);
+ if (!vars_dir)
+ error(ENOMEM, "kmalloc_array failed, nr_vars was %p", nr_vars);
+ memcpy(vars_dir, &__start_devvars, nr_vars * sizeof(struct dirtab));
+ /* "." needs to be the first entry in a devtable. It might already be
+ * first, but we can do the swap regardless. */
+ temp = vars_dir[0];
+ dot = find_var_by_name(".");
+ assert(dot);
+ vars_dir[0] = *dot;
+ *dot = temp;
+ qlock_init(&vars_lock);
+}
+
+static struct chan *vars_attach(char *spec)
+{
+ struct chan *c;
+
+ c = devattach(devname(), spec);
+ mkqid(&c->qid, 0, 0, QTDIR);
+ return c;
+}
+
+static struct walkqid *vars_walk(struct chan *c, struct chan *nc, char **name,
+ int nname)
+{
+ ERRSTACK(1);
+ struct walkqid *ret;
+
+ qlock(&vars_lock);
+ if (waserror()) {
+ qunlock(&vars_lock);
+ nexterror();
+ }
+ ret = devwalk(c, nc, name, nname, vars_dir, nr_vars, devgen);
+ poperror();
+ qunlock(&vars_lock);
+ return ret;
+}
+
+static int vars_stat(struct chan *c, uint8_t *db, int n)
+{
+ ERRSTACK(1);
+ int ret;
+
+ qlock(&vars_lock);
+ if (waserror()) {
+ qunlock(&vars_lock);
+ nexterror();
+ }
+ ret = devstat(c, db, n, vars_dir, nr_vars, devgen);
+ poperror();
+ qunlock(&vars_lock);
+ return ret;
+}
+
+static struct chan *vars_open(struct chan *c, int omode)
+{
+ ERRSTACK(1);
+ struct chan *ret;
+
+ qlock(&vars_lock);
+ if (waserror()) {
+ qunlock(&vars_lock);
+ nexterror();
+ }
+ ret = devopen(c, omode, vars_dir, nr_vars, devgen);
+ poperror();
+ qunlock(&vars_lock);
+ return ret;
+}
+
+static void vars_close(struct chan *c)
+{
+}
+
+static struct dirtab *find_free_var(void)
+{
+ for (size_t i = 0; i < nr_vars; i++)
+ if (!var_is_valid(&vars_dir[i]))
+ return &vars_dir[i];
+ return 0;
+}
+
+/* We ignore the perm - they are all hard-coded in the dirtab */
+static void vars_create(struct chan *c, char *name, int omode, uint32_t perm)
+{
+ struct dirtab *new_slot;
+ uintptr_t addr;
+ char *bang;
+ size_t size;
+
+ /* TODO: check that the user is privileged */
+ bang = strchr(name, '!');
+ if (!bang)
+ error(EINVAL, "Var %s has no ! in its format string", name);
+ *bang = 0;
+ addr = get_symbol_addr(name);
+ *bang = '!';
+ if (!addr)
+ error(EINVAL, "Could not find symbol for %s", name);
+ bang++;
+ /* Note that we don't check the symbol type against the format. We're
+ * trusting the user here. o/w we'd need dwarf support. */
+ switch (*bang) {
+ case 'c':
+ size = sizeof(char);
+ break;
+ case 's':
+ size = sizeof(char*);
+ break;
+ case 'd':
+ case 'x':
+ case 'u':
+ case 'o':
+ bang++;
+ switch (*bang) {
+ case 'b':
+ size = sizeof(uint8_t);
+ break;
+ case 'h':
+ size = sizeof(uint16_t);
+ break;
+ case 'w':
+ size = sizeof(uint32_t);
+ break;
+ case 'g':
+ size = sizeof(uint64_t);
+ break;
+ default:
+ error(EINVAL, "Bad var size '%c'", *bang);
+ }
+ break;
+ default:
+ error(EINVAL, "Unknown var data_format '%c'", *bang);
+ }
+ bang++;
+ if (*bang)
+ error(EINVAL, "Extra chars for var %s", name);
+
+ qlock(&vars_lock);
+ new_slot = find_free_var();
+ if (!new_slot) {
+ vars_dir = kreallocarray(vars_dir, nr_vars * 2, sizeof(struct dirtab),
+ KMALLOC_WAIT);
+ if (!vars_dir)
+ error(ENOMEM, "krealloc_array failed, nr_vars was %p", nr_vars);
+ memset(vars_dir + nr_vars, 0, nr_vars * sizeof(struct dirtab));
+ for (size_t i = nr_vars; i < nr_vars * 2; i++)
+ vars_dir[i].qid.vers = -1;
+ new_slot = vars_dir + nr_vars;
+ nr_vars *= 2;
+ }
+ strncpy(new_slot->name, name, sizeof(new_slot->name));
+ new_slot->qid.path = addr;
+ new_slot->qid.vers = 0;
+ new_slot->qid.type = QTFILE;
+ new_slot->length = size;
+ new_slot->perm = 0444;
+ c->qid = new_slot->qid; /* need to update c with its new qid */
+ qunlock(&vars_lock);
+ c->mode = openmode(omode);
+}
+
+static const char *get_integer_fmt(char data_fmt, char data_size)
+{
+ switch (data_fmt) {
+ case 'x':
+ switch (data_size) {
+ case 'b':
+ case 'h':
+ case 'w':
+ return "0x%x";
+ case 'g':
+ return "0x%lx";
+ }
+ case 'd':
+ switch (data_size) {
+ case 'b':
+ case 'h':
+ case 'w':
+ return "%d";
+ case 'g':
+ return "%ld";
+ }
+ case 'u':
+ switch (data_size) {
+ case 'b':
+ case 'h':
+ case 'w':
+ return "%u";
+ case 'g':
+ return "%lu";
+ }
+ case 'o':
+ switch (data_size) {
+ case 'b':
+ case 'h':
+ case 'w':
+ return "0%o";
+ case 'g':
+ return "0%lo";
+ }
+ }
+ return 0;
+}
+
+static long vars_read(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+ ERRSTACK(1);
+ char tmp[128]; /* big enough for any number and most strings */
+ size_t size = sizeof(tmp);
+ char data_size, data_fmt, *fmt;
+ const char *fmt_int;
+ bool is_signed = FALSE;
+ long ret;
+
+ qlock(&vars_lock);
+ if (waserror()) {
+ qunlock(&vars_lock);
+ nexterror();
+ }
+
+ if (c->qid.type == QTDIR) {
+ ret = devdirread(c, ubuf, n, vars_dir, nr_vars, devgen);
+ poperror();
+ qunlock(&vars_lock);
+ return ret;
+ }
+
+ /* These checks are mostly for the static variables. They are a
+ * double-check for the user-provided vars. */
+ fmt = strchr(c->name->s, '!');
+ if (!fmt)
+ error(EINVAL, "var %s has no ! in its format string", c->name->s);
+ fmt++;
+ data_fmt = *fmt;
+ if (!data_fmt)
+ error(EINVAL, "var %s has no data_format", c->name->s);
+
+ switch (data_fmt) {
+ case 'c':
+ size = snprintf(tmp, size, "%c", *(char*)c->qid.path);
+ break;
+ case 's':
+ size = snprintf(tmp, size, "%s", *(char**)c->qid.path);
+ break;
+ case 'd':
+ is_signed = TRUE;
+ /* fall through */
+ case 'x':
+ case 'u':
+ case 'o':
+ fmt++;
+ data_size = *fmt;
+ if (!data_size)
+ error(EINVAL, "var %s has no size", c->name->s);
+ fmt_int = get_integer_fmt(data_fmt, data_size);
+ if (!fmt_int)
+ error(EINVAL, "#%s was unable to get an int fmt for %s",
+ devname(), c->name->s);
+ switch (data_size) {
+ case 'b':
+ if (is_signed)
+ size = snprintf(tmp, size, fmt_int, *(int8_t*)c->qid.path);
+ else
+ size = snprintf(tmp, size, fmt_int, *(uint8_t*)c->qid.path);
+ break;
+ case 'h':
+ if (is_signed)
+ size = snprintf(tmp, size, fmt_int, *(int16_t*)c->qid.path);
+ else
+ size = snprintf(tmp, size, fmt_int, *(uint16_t*)c->qid.path);
+ break;
+ case 'w':
+ if (is_signed)
+ size = snprintf(tmp, size, fmt_int, *(int32_t*)c->qid.path);
+ else
+ size = snprintf(tmp, size, fmt_int, *(uint32_t*)c->qid.path);
+ break;
+ case 'g':
+ if (is_signed)
+ size = snprintf(tmp, size, fmt_int, *(int64_t*)c->qid.path);
+ else
+ size = snprintf(tmp, size, fmt_int, *(uint64_t*)c->qid.path);
+ break;
+ default:
+ error(EINVAL, "Bad #%s size %c", devname(), data_size);
+ }
+ break;
+ default:
+ error(EINVAL, "Unknown #%s data_format %c", devname(), data_fmt);
+ }
+ fmt++;
+ if (*fmt)
+ error(EINVAL, "Extra characters after var %s", c->name->s);
+ ret = readmem(offset, ubuf, n, tmp, size + 1);
+ poperror();
+ qunlock(&vars_lock);
+ return ret;
+}
+
+static long vars_write(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+ error(EFAIL, "Can't write to a #%s file", devname());
+}
+
+/* remove is interesting. we mark the qid in the dirtab as -1, which is a
+ * signal to devgen that it is an invalid entry. someone could already have
+ * done a walk (before we qlocked) and grabbed the qid before it was -1. as far
+ * as they are concerned, they have a valid entry, since "the qid is the file"
+ * for devvars. (i.e. the chan gets a copy of the entire file, which fits into
+ * the qid). */
+static void vars_remove(struct chan *c)
+{
+ ERRSTACK(1);
+ struct dirtab *dir;
+ char *dir_name;
+
+ /* chan's name may have multiple elements in the path; get the last one. */
+ dir_name = strrchr(c->name->s, '/');
+ dir_name = dir_name ? dir_name + 1 : c->name->s;
+
+ qlock(&vars_lock);
+ if (waserror()) {
+ qunlock(&vars_lock);
+ nexterror();
+ }
+ dir = find_var_by_name(dir_name);
+ if (!dir || dir->qid.vers == -1)
+ error(ENOENT, "Failed to remove %s, was it already removed?",
+ c->name->s);
+ dir->qid.vers = -1;
+ poperror();
+ qunlock(&vars_lock);
+}
+
+struct dev vars_devtab __devtab = {
+ .name = "vars",
+ .reset = devreset,
+ .init = vars_init,
+ .shutdown = devshutdown,
+ .attach = vars_attach,
+ .walk = vars_walk,
+ .stat = vars_stat,
+ .open = vars_open,
+ .create = vars_create,
+ .close = vars_close,
+ .read = vars_read,
+ .bread = devbread,
+ .write = vars_write,
+ .bwrite = devbwrite,
+ .remove = vars_remove,
+ .wstat = devwstat,
+ .power = devpower,
+ .chaninfo = devchaninfo,
+ .tapfd = 0,
+};
diff --git a/kern/include/ns.h b/kern/include/ns.h
index 166158472114..c24f4c87a934 100644
--- a/kern/include/ns.h
+++ b/kern/include/ns.h
@@ -1000,3 +1000,10 @@ extern unsigned int qiomaxatomic;

/* special sections */
#define __devtab __attribute__((__section__(".devtab")))
+
+#define DEVVARS_ENTRY(name, fmt) \
+struct dirtab __attribute__((__section__("devvars"))) __devvars_##name = \
+ {#name "!" fmt, \
+ {(uint64_t)&(name), 0, QTFILE}, \
+ sizeof((name)), \
+ 0444}
--
2.6.0.rc2.230.g3dd15c0

Barret Rhoden

unread,
Nov 30, 2015, 4:18:38 PM11/30/15
to aka...@googlegroups.com
Signed-off-by: Barret Rhoden <br...@cs.berkeley.edu>
---
kern/drivers/dev/Kconfig | 9 ++++
kern/drivers/dev/vars.c | 46 +++++++++++++++++
user/utest/devvars.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 180 insertions(+)
create mode 100644 user/utest/devvars.c

diff --git a/kern/drivers/dev/Kconfig b/kern/drivers/dev/Kconfig
index d01c6dd563f6..6a1ee1f39f89 100644
--- a/kern/drivers/dev/Kconfig
+++ b/kern/drivers/dev/Kconfig
@@ -10,3 +10,12 @@ config DEVVARS
default y
help
The #vars device exports read access to select kernel variables.
+
+config DEVVARS_TEST
+ bool "#vars test files"
+ depends on DEVVARS
+ default n
+ help
+ Have #vars include a collection of test files that devvars utest uses.
+ Say 'y' if you plan to use the utest, at the expense of having a
+ cluttered #vars.
diff --git a/kern/drivers/dev/vars.c b/kern/drivers/dev/vars.c
index bf9d68553212..a295629ed1c0 100644
--- a/kern/drivers/dev/vars.c
+++ b/kern/drivers/dev/vars.c
@@ -439,3 +439,49 @@ struct dev vars_devtab __devtab = {
.chaninfo = devchaninfo,
.tapfd = 0,
};
+
+/* The utest needs these variables exported */
+#ifdef CONFIG_DEVVARS_TEST
+
+static char *s = "string";
+static char c = 'x';
+static uint8_t u8 = 8;
+static uint16_t u16 = 16;
+static uint32_t u32 = 32;
+static uint64_t u64 = 64;
+static uint8_t d8 = -8;
+static uint16_t d16 = -16;
+static uint32_t d32 = -32;
+static uint64_t d64 = -64;
+static uint8_t x8 = 0x8;
+static uint16_t x16 = 0x16;
+static uint32_t x32 = 0x32;
+static uint64_t x64 = 0x64;
+static uint8_t o8 = 01;
+static uint16_t o16 = 016;
+static uint32_t o32 = 032;
+static uint64_t o64 = 064;
+
+/* For testing creation. There is no ENTRY for this. */
+char *devvars_foobar = "foobar";
+
+DEVVARS_ENTRY(s, "s");
+DEVVARS_ENTRY(c, "c");
+DEVVARS_ENTRY(u8, "ub");
+DEVVARS_ENTRY(u16, "uh");
+DEVVARS_ENTRY(u32, "uw");
+DEVVARS_ENTRY(u64, "ug");
+DEVVARS_ENTRY(d8, "db");
+DEVVARS_ENTRY(d16, "dh");
+DEVVARS_ENTRY(d32, "dw");
+DEVVARS_ENTRY(d64, "dg");
+DEVVARS_ENTRY(x8, "xb");
+DEVVARS_ENTRY(x16, "xh");
+DEVVARS_ENTRY(x32, "xw");
+DEVVARS_ENTRY(x64, "xg");
+DEVVARS_ENTRY(o8, "ob");
+DEVVARS_ENTRY(o16, "oh");
+DEVVARS_ENTRY(o32, "ow");
+DEVVARS_ENTRY(o64, "og");
+
+#endif /* CONFIG_DEVVARS_TEST */
diff --git a/user/utest/devvars.c b/user/utest/devvars.c
new file mode 100644
index 000000000000..1806454a6eab
--- /dev/null
+++ b/user/utest/devvars.c
@@ -0,0 +1,125 @@
+/* Copyright (c) 2015 Google Inc
+ * Barret Rhoden <br...@cs.berkeley.edu>
+ * See LICENSE for details. */
+
+#include <utest/utest.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <parlib/net.h>
+
+TEST_SUITE("DEVVARS");
+
+static bool test_read_var(const char *name, const char *val)
+{
+ char buf[128];
+ int fd, ret;
+
+ ret = snprintf(buf, sizeof(buf), "#vars/%s", name);
+ if (snprintf_overflow(ret, buf, sizeof(buf)))
+ UT_ASSERT_FMT("snprintf failed!", FALSE);
+ fd = open(buf, O_READ);
+ UT_ASSERT_FMT("Could not open vars file %s, check CONFIG_DEVVARS_TEST",
+ fd >= 0, buf);
+ ret = read(fd, buf, sizeof(buf));
+ UT_ASSERT_FMT("Could not read vars file %s", ret > 0, buf);
+ UT_ASSERT_FMT("Value differs, got %s, wanted %s", strcmp(buf, val) == 0,
+ buf, val);
+ return TRUE;
+}
+
+bool test_devvars_fmt(void)
+{
+ if (!test_read_var("s!s", "string"))
+ return FALSE;
+ if (!test_read_var("c!c", "x"))
+ return FALSE;
+ if (!test_read_var("u8!ub", "8"))
+ return FALSE;
+ if (!test_read_var("u16!uh", "16"))
+ return FALSE;
+ if (!test_read_var("u32!uw", "32"))
+ return FALSE;
+ if (!test_read_var("u64!ug", "64"))
+ return FALSE;
+ if (!test_read_var("d8!db", "-8"))
+ return FALSE;
+ if (!test_read_var("d16!dh", "-16"))
+ return FALSE;
+ if (!test_read_var("d32!dw", "-32"))
+ return FALSE;
+ if (!test_read_var("d64!dg", "-64"))
+ return FALSE;
+ if (!test_read_var("x8!xb", "0x8"))
+ return FALSE;
+ if (!test_read_var("x16!xh", "0x16"))
+ return FALSE;
+ if (!test_read_var("x32!xw", "0x32"))
+ return FALSE;
+ if (!test_read_var("x64!xg", "0x64"))
+ return FALSE;
+ if (!test_read_var("o8!ob", "01"))
+ return FALSE;
+ if (!test_read_var("o16!oh", "016"))
+ return FALSE;
+ if (!test_read_var("o32!ow", "032"))
+ return FALSE;
+ if (!test_read_var("o64!og", "064"))
+ return FALSE;
+ return TRUE;
+}
+
+static bool test_new_var(const char *name, const char *val)
+{
+ char buf[128];
+ char path[128];
+ int fd, ret;
+
+ ret = snprintf(path, sizeof(path), "#vars/%s", name);
+ if (snprintf_overflow(ret, path, sizeof(path)))
+ UT_ASSERT_FMT("snprintf failed!", FALSE);
+ fd = open(path, O_READ | O_CREATE, S_IRUSR);
+ UT_ASSERT_FMT("Could not open vars file %s, check CONFIG_DEVVARS_TEST",
+ fd >= 0, path);
+ ret = read(fd, buf, sizeof(buf));
+ UT_ASSERT_FMT("Could not read vars file %s", ret > 0, path);
+ UT_ASSERT_FMT("Value differs, got %s, wanted %s", strcmp(buf, val) == 0,
+ buf, val);
+ ret = unlink(path);
+ UT_ASSERT_FMT("Could not remove %s", ret == 0, path);
+ return TRUE;
+}
+
+bool test_devvars_newfile(void)
+{
+ if (!test_new_var("devvars_foobar!s", "foobar"))
+ return FALSE;
+ return TRUE;
+}
+
+/* Make sure test_read_var() knows how to fail */
+bool test_devvars_test(void)
+{
+ UT_ASSERT_FMT("Opened when it shouldn't have",
+ !test_read_var("NO_SUCH_FILE!xw", "0x32"));
+ UT_ASSERT_FMT("Got the wrong value but thought it was fine",
+ !test_read_var("x32!xw", "0xdeadbeef"));
+ return TRUE;
+}
+
+struct utest utests[] = {
+ UTEST_REG(devvars_fmt),
+ UTEST_REG(devvars_newfile),
+ UTEST_REG(devvars_test),
+};
+int num_utests = sizeof(utests) / sizeof(struct utest);
+
+int main(int argc, char *argv[])
+{
+ char **whitelist = &argv[1];
+ int whitelist_len = argc - 1;
+
+ RUN_TEST_SUITE(utests, num_utests, whitelist, whitelist_len);
+}
--
2.6.0.rc2.230.g3dd15c0

Davide Libenzi

unread,
Nov 30, 2015, 9:49:29 PM11/30/15
to Akaros
The MUL_NO_OVERFLOW thing makes little sense in the CPUs we are targeting.
A couple of mis-predicted branches are more expensive than a DIV (~20 cycles in today Intel).
Much faster:

typedef unsigned __int128 uint128_t;

{
     uint128_t n = (uint128_t) nmemb * (uint128_t) size;

    if (n > SIZE_MAX) foo();

}


--------------------
typedef unsigned __int128 u128;

static const unsigned long ulmax = ~0UL;

unsigned long oc(unsigned long a, unsigned long b)
{
    u128 m = (u128) a * (u128) b;

    if (m > ulmax)
        m = 0;

    return (unsigned long) m;
}

----
oc:
    movq    %rdi, %rax
    mulq    %rsi
    movq    %rax, %r9
    xorl    %eax, %eax
    testq   %rdx, %rdx
    cmove   %r9, %rax
    ret







--
2.6.0.rc2.230.g3dd15c0

Dan Cross

unread,
Nov 30, 2015, 10:38:02 PM11/30/15
to aka...@googlegroups.com
Numbers of it didn't happen.

Dan Cross

unread,
Nov 30, 2015, 10:41:25 PM11/30/15
to aka...@googlegroups.com
Also, bear in mind that this is right before we run the memory allocator. Shaving a few cycles off here is kind of like chipping a pebble off of an asteroid.

Davide Libenzi

unread,
Nov 30, 2015, 10:43:56 PM11/30/15
to Akaros
Oh, I would not have made any comment, had I not seen the silly optimization in place. Likely 1993 era 😀
No optimization in place, meant "we do not care at this point", which I agree in this case.
But the code was suggesting "we do care", hence a much better way to do it.

Dan Cross

unread,
Nov 30, 2015, 10:46:53 PM11/30/15
to aka...@googlegroups.com
I don't think that's an optimization; it's an attempt to avoid UB. In that sense, it's explicitly an anti-optimization....

Davide Libenzi

unread,
Nov 30, 2015, 10:48:03 PM11/30/15
to Akaros
No, the MUL checks are in place explicitly to try to avoid every time into the DIV.
I feel my hair growing back, thinking at the times that code made sense ☺

Dan Cross

unread,
Nov 30, 2015, 10:50:22 PM11/30/15
to aka...@googlegroups.com
Ah; yeah, perhaps. But bear in mind that this code is meant to be portable. 128-bit multiplication may be quite slow on RISC-V, for instance (though I don't know).

The OpenBSD people know what they're doing when it comes to this kind of thing....

Davide Libenzi

unread,
Nov 30, 2015, 11:53:23 PM11/30/15
to Akaros
For Open/NetBSD, it might make sense, as they run even is 13bit, nobody-heard-of, CPUs.
We don't run on anything which is not GCC, *and* 64bit, so caring about Arduino being possibly slower on that code does not make much sense.

Davide Libenzi

unread,
Dec 1, 2015, 12:54:29 AM12/1/15
to Akaros
On Mon, Nov 30, 2015 at 1:18 PM, Barret Rhoden <br...@cs.berkeley.edu> wrote:
IMHO it doesn't makes a lot sense the dynamic add/remove, given that we do not support dynload modules.
Makes code quite a bit simpler (no need for locking), and a few functions can go.
What would a possible use case be for adding variables at runtime?
They either exist as globals, or, if they come up as dynamic things (say a TCP connection), they better be handled by the module which creates them, within its own namespace.




+struct dirtab __attribute__((__section__("devvars")))
+              __devvars_dot = {".", {0, 0, QTDIR}, 0, DMDIR | 0555};

Do you really need this? Instead of adding, finding, ad swapping, you can just alloc an array +1 size, and create the proper entry at [0].
I know why you added it. Without that, with zero entries in the section, the __start_/__stop_ symbols are not defined and link fails☺


+
+/* Careful with this.  c->name->s is the full path, at least sometimes. */
+static struct dirtab *find_var_by_name(const char *name)
+{
+       for (size_t i = 0; i < nr_vars; i++)
+               if (!strcmp(vars_dir[i].name, name))
+                       return &vars_dir[i];
+       return 0;
+}

This could go, with the "." thing added at runtime, and no dynamic add/remove of vars.

 
+
+static void vars_init(void)
+{
+       /* If you name a section without a '.', GCC will create start and stop
+        * symbols, e.g. __start_SECTION */
+       extern struct dirtab __start_devvars;
+       extern struct dirtab __stop_devvars;
+       struct dirtab *dot, temp;
+
+       nr_vars = &__stop_devvars - &__start_devvars;
+       vars_dir = kmalloc_array(nr_vars, sizeof(struct dirtab), KMALLOC_WAIT);
+       if (!vars_dir)
+               error(ENOMEM, "kmalloc_array failed, nr_vars was %p", nr_vars);

What's the story? KMALLOC_WAIT can or cannot return NULL? ☺
I know, this is _array(), but I IMO here we could just kmalloc(a * b), and lose the NULL check.
If we are overflowing the address space with a*b as dirtabs at init, I don't see where the kernel data and code can be stored 😉

I see we don't support write, which would be pretty handy with /proc-like configurations.


Dan Cross

unread,
Dec 1, 2015, 5:27:11 AM12/1/15
to aka...@googlegroups.com

...but without taking a profile, how do you know its *actually* slower? Looking at assembly output can be misleading.

barret rhoden

unread,
Dec 1, 2015, 9:01:55 AM12/1/15
to aka...@googlegroups.com
On 2015-11-30 at 19:43 'Davide Libenzi' via Akaros wrote:
> Oh, I would not have made any comment, had I not seen the silly
> optimization in place. Likely 1993 era 😀
> No optimization in place, meant "we do not care at this point", which
> I agree in this case.
> But the code was suggesting "we do care", hence a much better way to
> do it.

I'm actually fine with just using:

/* We wraparound if UINT_MAX < a * b, which is also UINT_MAX / a < b. */
static inline bool mult_will_overflow_u64(uint64_t a, uint64_t b)
{
if (!a)
return FALSE;
return (uint64_t)(-1) / a < b;
}

which is already in the kernel.

I went with the existing approach, since that's what Dan brought in for
krealloc array recently, but would be happy to use either or.

My slight preference would be to just use a helper to detect overflow
(whether it's mine, Dan's from BSD, or yours that uses u128s), but I
went with a low-churn, let's not argue about it approach.

For now, I'm going to leave this as is.

If someone wants to investigate the best way to do this in a follow-up
commit and fix it for all uses of overflow detection, then that'd be
great.

Ideally, we'd have a helper that can take any type and tell us if the
mult will overflow. The alloc arrays will then both use that helper.
The helper would also provide the opportunity for arch-specific
includes. It also gets the enum and whatnot out of the .c file. I
think that covers everyone's points from this thread.

I put it on the big TODO list on the wiki:
https://github.com/brho/akaros/wiki#overflowing-multiplication

Barret

barret rhoden

unread,
Dec 1, 2015, 9:19:37 AM12/1/15
to aka...@googlegroups.com
On 2015-11-30 at 21:54 'Davide Libenzi' via Akaros wrote:
> IMHO it doesn't makes a lot sense the dynamic add/remove, given that
> we do not support dynload modules.
> Makes code quite a bit simpler (no need for locking), and a few
> functions can go.

I agree that it definitely is simpler without dynamically adding and
removing.

> What would a possible use case be for adding variables at runtime?
> They either exist as globals, or, if they come up as dynamic things
> (say a TCP connection), they better be handled by the module which
> creates them, within its own namespace.

The dynamic addition isn't for modules - it's for debugging. If you
know a variable exists and you want to see its value, this is an easy
way to get access to the variable without rebuilding the kernel with a
DEVVARS_ENTRY().

> +struct dirtab __attribute__((__section__("devvars")))
> > + __devvars_dot = {".", {0, 0, QTDIR}, 0, DMDIR |
> > 0555};
>
>
> Do you really need this? Instead of adding, finding, ad swapping, you
> can just alloc an array +1 size, and create the proper entry at [0].
> I know why you added it. Without that, with zero entries in the
> section, the __start_/__stop_ symbols are not defined and link fails☺

Yes, it's nice to have the symbols present. =)

However, I do want the "." entry to be in the devtab too. I guess I
could build the entry at runtime, but that didn't seem worth it, and
semantically it didn't make as much sense to me. Also, when this code
was static-only, I didn't even use any kmallocs, and simply used the
devvars section for the entire devtab.

> > + vars_dir = kmalloc_array(nr_vars, sizeof(struct dirtab),
> > KMALLOC_WAIT);
> > + if (!vars_dir)
> > + error(ENOMEM, "kmalloc_array failed, nr_vars was
> > %p", nr_vars);
> >
>
> What's the story? KMALLOC_WAIT can or cannot return NULL? ☺
> I know, this is _array(), but I IMO here we could just kmalloc(a *
> b), and lose the NULL check.

kmalloc(x, KMALLOC_WAIT) cannot return NULL, currently. It sounds like
you want to change that, but that can be a subject for discussion when
we fix the memory allocator.

I originally did just a kmalloc, but checkpatch yelled at me and
suggested I use a kmalloc_array(). So I added that.

> If we are overflowing the address space with a*b as dirtabs at init, I
> don't see where the kernel data and code can be stored 😉

You could store it in the cloud! =)

> I see we don't support write, which would be pretty handy
> with /proc-like configurations.

I considered that too dangerous for now. Creating variables is also
dangerous, but a little less. We can always add write support in the
future.

Barret

Davide Libenzi

unread,
Dec 1, 2015, 9:20:59 AM12/1/15
to Akaros
On Tue, Dec 1, 2015 at 6:01 AM, barret rhoden <br...@cs.berkeley.edu> wrote:
On 2015-11-30 at 19:43 'Davide Libenzi' via Akaros wrote:
> Oh, I would not have made any comment, had I not seen the silly
> optimization in place. Likely 1993 era 😀
> No optimization in place, meant "we do not care at this point", which
> I agree in this case.
> But the code was suggesting "we do care", hence a much better way to
> do it.

I'm actually fine with just using:

/* We wraparound if UINT_MAX < a * b, which is also UINT_MAX / a < b. */
static inline bool mult_will_overflow_u64(uint64_t a, uint64_t b)
{
    if (!a)
        return FALSE;
    return (uint64_t)(-1) / a < b;
}

That looks good to me, modulo maybe an unlikely(!a) 😀
We are dealing with an OS which does not have to deal with the past. And the future has DIV becoming faster more than slower (today is about 20 cycles IIRC, on modern x86_64 CPUs).
The mulq (128bit) approach is likely faster (mulq faster than DIV, and no branches), but we are splitting hairs here, in a place were there is no need to, and both approaches are more readable than the current code.


Davide Libenzi

unread,
Dec 1, 2015, 9:59:36 AM12/1/15
to Akaros
On Tue, Dec 1, 2015 at 6:19 AM, barret rhoden <br...@cs.berkeley.edu> wrote:
On 2015-11-30 at 21:54 'Davide Libenzi' via Akaros wrote:
> IMHO it doesn't makes a lot sense the dynamic add/remove, given that
> we do not support dynload modules.
> Makes code quite a bit simpler (no need for locking), and a few
> functions can go.

I agree that it definitely is simpler without dynamically adding and
removing.

> What would a possible use case be for adding variables at runtime?
> They either exist as globals, or, if they come up as dynamic things
> (say a TCP connection), they better be handled by the module which
> creates them, within its own namespace.

The dynamic addition isn't for modules - it's for debugging.  If you
know a variable exists and you want to see its value, this is an easy
way to get access to the variable without rebuilding the kernel with a
DEVVARS_ENTRY().

It seems a lot of extra code for debugging ☺
Also, if I am debugging, I could just use the macro in my module.
Yes, I would have to rebuild the kernel, but it's not that much of a wait.
I would have personally chosen the simpler code given the Pros that come with it, but anyway.


 

> +struct dirtab __attribute__((__section__("devvars")))
> > +              __devvars_dot = {".", {0, 0, QTDIR}, 0, DMDIR |
> > 0555};
>
>
> Do you really need this? Instead of adding, finding, ad swapping, you
> can just alloc an array +1 size, and create the proper entry at [0].
> I know why you added it. Without that, with zero entries in the
> section, the __start_/__stop_ symbols are not defined and link fails☺

Yes, it's nice to have the symbols present.  =)

Since I am currently using dummies in other code, can you check if something like this pleases the linker, without screwing up with section size constraints (multiple of object size):

#define INIT_SECTION(sect) char __attribute__((__section__( #sect ))) __init_section_##sect[0]

A better alternative would be weaks for __start_/__stop_. This generates no linker errors, even though foo is never defined:
extern int __attribute__((weak)) foo[];

int main(void)
{
    if (foo)
        return 0;

    return 1;
}
Look at what GCC (and linker missing relocation) does with rax/eax ☺

00000000004004ed <main>:
  4004ed:       55                      push   %rbp
  4004ee:       48 89 e5                mov    %rsp,%rbp
  4004f1:       b8 00 00 00 00          mov    $0x0,%eax
  4004f6:       48 85 c0                test   %rax,%rax
  4004f9:       74 07                   je     400502 <main+0x15>
  4004fb:       b8 00 00 00 00          mov    $0x0,%eax
  400500:       eb 05                   jmp    400507 <main+0x1a>
  400502:       b8 01 00 00 00          mov    $0x1,%eax
  400507:       5d                      pop    %rbp
  400508:       c3                      retq   
  400509:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)



kmalloc(x, KMALLOC_WAIT) cannot return NULL, currently.  It sounds like
you want to change that, but that can be a subject for discussion when
we fix the memory allocator.

That might be a long patch to fix all the places where we assume WAIT means never NULL.
But I am not against it in principle.
We need to make sure we kill "current" instead of returning NULL, in these cases.
Otherwise we get panics instead of OOM kills.


barret rhoden

unread,
Dec 1, 2015, 10:50:29 AM12/1/15
to aka...@googlegroups.com
On 2015-12-01 at 6:59 'Davide Libenzi' via Akaros wrote:
> Since I am currently using dummies in other code, can you check if
> something like this pleases the linker, without screwing up with
> section size constraints (multiple of object size):
>
> #define INIT_SECTION(sect) char __attribute__((__section__( #sect )))
> __init_section_##sect[0]

My usual approach is to try something and see if the compiler/linker
yell at me, then do an objdump to see if it worked. =)

> A better alternative would be weaks for __start_/__stop_. This
> generates no linker errors, even though foo is never defined:
>
> extern int __attribute__((weak)) foo[];

I like this approach.

> int main(void)
> {
> if (foo)
> return 0;
>
> return 1;
> }
>
> Look at what GCC (and linker missing relocation) does with rax/eax ☺
>
>
> 00000000004004ed <main>:
> 4004ed: 55 push %rbp
> 4004ee: 48 89 e5 mov %rsp,%rbp
> 4004f1: b8 00 00 00 00 mov $0x0,%eax
> 4004f6: 48 85 c0 test %rax,%rax
> 4004f9: 74 07 je 400502 <main+0x15>
> 4004fb: b8 00 00 00 00 mov $0x0,%eax
> 400500: eb 05 jmp 400507 <main+0x1a>
> 400502: b8 01 00 00 00 mov $0x1,%eax
> 400507: 5d pop %rbp
> 400508: c3 retq
> 400509: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)

That 0x0 gets overriden at link time, so it seems to work.

foo.c:

extern int __attribute__((weak)) foo[];
int main(void)
{
if (foo)
return 0;
return 1;
}

bar.c:
int foo[5];

$ gcc -c foo.c -o foo.o
$ objdump -d foo.o

foo.o: file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <main>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: b8 00 00 00 00 mov $0x0,%eax
9: 48 85 c0 test %rax,%rax
c: 74 07 je 15 <main+0x15>
e: b8 00 00 00 00 mov $0x0,%eax
13: eb 05 jmp 1a <main+0x1a>
15: b8 01 00 00 00 mov $0x1,%eax
1a: 5d pop %rbp
1b: c3 retq

Still has $0x0 for the instruction at byte 0x4.

$ objdump -x foo.o

foo.o: file format elf64-x86-64
foo.o
architecture: i386:x86-64, flags 0x00000011:
HAS_RELOC, HAS_SYMS
start address 0x0000000000000000

Sections:
Idx Name Size VMA LMA File off Algn
0 .text 0000001c 0000000000000000 0000000000000000 00000040 2**0
CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
1 .data 00000000 0000000000000000 0000000000000000 0000005c 2**0
CONTENTS, ALLOC, LOAD, DATA
2 .bss 00000000 0000000000000000 0000000000000000 0000005c 2**0
ALLOC
3 .comment 0000002b 0000000000000000 0000000000000000 0000005c 2**0
CONTENTS, READONLY
4 .note.GNU-stack 00000000 0000000000000000 0000000000000000 00000087 2**0
CONTENTS, READONLY
5 .eh_frame 00000038 0000000000000000 0000000000000000 00000088 2**3
CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
SYMBOL TABLE:
0000000000000000 l df *ABS* 0000000000000000 foo.c
0000000000000000 l d .text 0000000000000000 .text
0000000000000000 l d .data 0000000000000000 .data
0000000000000000 l d .bss 0000000000000000 .bss
0000000000000000 l d .note.GNU-stack 0000000000000000 .note.GNU-stack
0000000000000000 l d .eh_frame 0000000000000000 .eh_frame
0000000000000000 l d .comment 0000000000000000 .comment
0000000000000000 g F .text 000000000000001c main
0000000000000000 w *UND* 0000000000000000 foo


RELOCATION RECORDS FOR [.text]:
OFFSET TYPE VALUE
0000000000000005 R_X86_64_32 foo


RELOCATION RECORDS FOR [.eh_frame]:
OFFSET TYPE VALUE
0000000000000020 R_X86_64_PC32 .text

It has a relocation record at offset 0x5, which is where the $0x0 is in
the instruction mov $0x0, %eax.

$ gcc -c bar.c -o bar.o
$ gcc -o foobar foo.o bar.o
$ objdump -d foobar
(edited to just show main)

0000000000400526 <main>:
400526: 55 push %rbp
400527: 48 89 e5 mov %rsp,%rbp
40052a: b8 50 10 60 00 mov $0x601050,%eax
40052f: 48 85 c0 test %rax,%rax
400532: 74 07 je 40053b <main+0x15>
400534: b8 00 00 00 00 mov $0x0,%eax
400539: eb 05 jmp 400540 <main+0x1a>
40053b: b8 01 00 00 00 mov $0x1,%eax
400540: 5d pop %rbp
400541: c3 retq

Barret

Davide Libenzi

unread,
Dec 1, 2015, 11:20:31 AM12/1/15
to Akaros
On Tue, Dec 1, 2015 at 7:50 AM, barret rhoden <br...@cs.berkeley.edu> wrote:
> A better alternative would be weaks for __start_/__stop_. This
> generates no linker errors, even though foo is never defined:
>
> extern int __attribute__((weak)) foo[];

I like this approach.

OK, switched the new percpu code, and the old ex-table, to use weaks, and drop the fake entries added to please the linker.
It's working fine.


Barret Rhoden

unread,
Dec 1, 2015, 2:06:13 PM12/1/15
to aka...@googlegroups.com
On 2015-12-01 at 08:20 "'Davide Libenzi' via Akaros"
<aka...@googlegroups.com> wrote:
> OK, switched the new percpu code, and the old ex-table, to use weaks,
> and drop the fake entries added to please the linker.
> It's working fine.

Sweet.



Barret Rhoden

unread,
Dec 2, 2015, 3:00:00 PM12/2/15
to aka...@googlegroups.com
On 2015-11-30 at 16:18 Barret Rhoden <br...@cs.berkeley.edu> wrote:
> Version 2. Users can create variables from any C file in the kernel
> with DEVVARS_ENTRY. Trusted users can create entries by name from an
> open or create call.

Merged to master at 2fa42319139e..535ec6030b9a (from, to]

You can see the entire diff with 'git diff' or at
https://github.com/brho/akaros/compare/2fa42319139e...535ec6030b9a

Reply all
Reply to author
Forward
0 new messages